Coverage for PanACoTA/annotate_module/annotation_functions.py: 100%

262 statements  

coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

#!/usr/bin/env python3
# coding: utf-8

# ###############################################################################
# This file is part of PanACOTA.
#
# Authors: Amandine Perrin
# Copyright © 2018-2020 Institut Pasteur (Paris).
# See the COPYRIGHT file for details.
#
# PanACOTA is a software providing tools for large scale bacterial comparative
# genomics. From a set of complete and/or draft genomes, you can:
#    - Do a quality control of your strains, to eliminate poor quality
#      genomes, which would not give any information for the comparative study
#    - Uniformly annotate all genomes
#    - Do a Pan-genome
#    - Do a Core or Persistent genome
#    - Align all Core/Persistent families
#    - Infer a phylogenetic tree from the Core/Persistent families
#
# PanACOTA is free software: you can redistribute it and/or modify it under the
# terms of the Affero GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License
# for more details.
#
# You should have received a copy of the Affero GNU General Public License
# along with PanACOTA (COPYING file).
# If not, see <https://www.gnu.org/licenses/>.
# ###############################################################################

"""
Functions to run the annotation step with prokka or prodigal only, according to user request.

@author gem
April 2017
"""

import os
import sys
import shutil
import glob
import logging
import logging.handlers  # needed for logging.handlers.QueueHandler used in the worker functions
import subprocess
import shlex
import multiprocessing
import progressbar
import threading

import PanACoTA.utils as utils

logger = logging.getLogger('annotate.run_annotation_all')

def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False,
                       small=False, quiet=False):
    """
    For each genome in genomes, run prokka (or only prodigal) to annotate the genome.

    Parameters
    ----------
    genomes : dict
        {genome: [gembase_name, path_to_origfile, path_split_gembase, gsize, nbcont, L90]}
    threads : int
        max number of threads that can be used
    force : bool
        if False, do not override prokka/prodigal outdir and result dir if they exist. If
        True, rerun prokka and override existing results, for all genomes.
    annot_folder : str
        folder where prokka/prodigal results must be written: for each genome,
        a directory <genome_name>-prokkaRes or <genome_name>-prodigalRes will be created
        in this folder, and all the results of prokka/prodigal for the genome
        will be written inside
    fgn : str
        name (key in genomes dict) of the first genome, which will be used for prodigal training
    prodigal_only : bool
        True if only prodigal must run, False if prokka must run
    small : bool
        True -> use the '-p meta' option with prodigal, and do not use training
    quiet : bool
        True if nothing must be written to stderr/stdout, False otherwise

    Returns
    -------
    dict
        {genome: boolean} -> True if prokka/prodigal ran well for this genome, False otherwise
    """
    # Update information according to the annotation software used, and write message
    if prodigal_only:
        message = "Annotating all genomes with prodigal"
        run_annot = run_prodigal
        main_logger = logging.getLogger("annotate.prodigal")
    else:
        message = "Annotating all genomes with prokka"
        run_annot = run_prokka
        main_logger = logging.getLogger("annotate.prokka")
    main_logger.info(message)
    # Get total number of genomes to annotate, used to show annotation progress
    nbgen = len(genomes)
    bar = None
    # If user did not ask for quiet, create progressbar
    if not quiet:
        # Create progress bar
        widgets = ['Annotation: ', progressbar.Bar(marker='█', left='', right=''),
                   ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen,
                                      term_width=79).start()
    # Get resource availability:
    # - number of threads used by prokka/prodigal (cores_annot)
    # - how many genomes can be annotated at the same time (pool_size)
    # - prodigal does not run with several threads: with prodigal, always cores_annot == 1
    #   and pool_size == threads
    gpath_train = ""  # by default, no training genome
    if prodigal_only:
        cores_annot = 1
        pool_size = threads
        # If prodigal, train on the first genome
        # fgn is a key of genomes, genomes[fgn] = [_, _, annote_file, _, _, _]
        gtrain = genomes[fgn][2]
        # If there is a problem, gpath_train will be empty. This is checked while trying
        # to run prodigal, because we also need to check that genomes are not simply
        # already annotated
        if not small:
            gpath_train = prodigal_train(gtrain, annot_folder)
        else:
            gpath_train = "small option"
    elif threads <= 3:
        # 3 threads or fewer: run prokka on genomes one by one, with all threads
        cores_annot = threads
        pool_size = 1
    else:
        # use multiprocessing
        # if there are more threads than genomes, use as many threads as possible per genome
        if len(genomes) <= threads:
            cores_annot = int(threads / len(genomes))
        # otherwise, use 2 threads per genome (and nb_genome/2 genomes at the same time)
        else:
            cores_annot = 2
        pool_size = int(threads / cores_annot)
    # Create pool with a given size (= number of tasks launched in parallel)
    pool = multiprocessing.Pool(pool_size)
    # Create a Queue to put logs from processes, and handle them afterwards from a single thread
    m = multiprocessing.Manager()
    q = m.Queue()
    # {genome: [gembase_name, path_to_origfile, path_to_annotate_file, gsize, nbcont, L90]}
    # arguments: gpath, prok_folder, threads, name, force, nbcont, gpath_train (prodigal only), q
    arguments = [(genomes[g][2], annot_folder, cores_annot, genomes[g][0],
                  force, genomes[g][4], gpath_train, q)
                 for g in sorted(genomes)]
    try:
        # Start pool (run 'run_annot' on each set of arguments)
        final = pool.map_async(run_annot, arguments, chunksize=1)
        # Close pool: no more data will be put on this pool
        pool.close()
        # Listen for logs in processes
        lp = threading.Thread(target=utils.logger_thread, args=(q,))
        lp.start()
        if not quiet:
            while True:
                # final is ready when all pool elements are done
                if final.ready():
                    break
                # If not done, get number of genomes left
                remaining = final._number_left
                # Add this to start progressbar with 0% instead of N/A%
                if remaining == nbgen:
                    bar.update(0.0000001)
                else:
                    # Update progress bar
                    bar.update(nbgen - remaining)
            # End progress bar
            bar.finish()
        pool.join()
        # Put None to tell 'q' that everything is finished, so it can be stopped and joined
        q.put(None)
        # join lp (tell it to stop once all logs from the processes are handled,
        # which is the case here)
        lp.join()
        final = final.get()
    # # If user stops the program (ctrl+C), end it
    # except KeyboardInterrupt as ki:
    #     print("error")
    #     for worker in pool._pool:
    #         print(worker.is_alive())
    #     pool.join()
    #     print("closed")
    #     pool.terminate()
    #     print("--------------terminate ok----------------")
    #     lp.join()
    #     print("thread stopped")
    #     # run_event.clear()
    #     # lp.terminate()
    #     # print("--------------JOIN--------------")
    #     # pool.terminate()
    #     main_logger.error("Process killed by CTRL+C")
    #     return "coucou"
    # If an error occurs, terminate pool, write error and exit
    except Exception as excp:  # pragma: no cover
        pool.terminate()
        main_logger.error(excp)
        sys.exit(1)
    final = {genome: res for genome, res in zip(sorted(genomes), final)}
    return final
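For orientation, here is a minimal sketch of how run_annotation_all might be called. The genome names, paths and numeric values are hypothetical; only the layout of the genomes dict ([gembase_name, path_to_origfile, path_to_annotate_file, gsize, nbcont, L90]) and the call signature come from the function above.

    genomes = {
        "genome1.fasta": ["ESCO.0423.00001", "db/genome1.fasta",
                          "tmp_files/genome1.fasta", 4600000, 3, 1],
        "genome2.fasta": ["ESCO.0423.00002", "db/genome2.fasta",
                          "tmp_files/genome2.fasta", 5000000, 10, 4],
    }
    # Annotate everything with prodigal only, training on genome1.fasta
    results = run_annotation_all(genomes, threads=8, force=False,
                                 annot_folder="tmp_files", fgn="genome1.fasta",
                                 prodigal_only=True)
    # Keep only the genomes whose annotation succeeded
    annotated = {g: info for g, info in genomes.items() if results[g]}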

def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    First, train prodigal on the first genome ('gpath'), and write the result to
    '<genome>.trn', a file which will then be used for the annotation of all the
    other sequences.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of the train file (will be used to annotate all next genomes).
        If there was a problem, returns an empty string.
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)  # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn")  # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                    "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"  # path/to/genome-prodigal-train.log
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = f"prodigal -i {gpath} -t {gpath_train}"
    error = f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}."
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    # utils.run_cmd may return the int 1 (instead of a completed process) when the command
    # could not be run; otherwise, check that the process exited with code 0.
    if ret != 1 and ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End training on {gpath}")
        return gpath_train
    else:
        return ""
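On its own, the training step is a single call; a minimal usage sketch with a hypothetical path:

    train_file = prodigal_train("tmp_files/genome1.fasta", "tmp_files")
    if train_file:
        # e.g. "tmp_files/genome1.fasta.trn", to be passed to prodigal with '-t'
        print(f"training file ready: {train_file}")
    else:
        # empty string: training failed, details are in tmp_files/*.trn-prodigal-train.log.err
        print("prodigal training failed")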

def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores prokka can use
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * gpath_train: prodigal training file (or "small option"); only used by prodigal,
          ignored here
        * q: queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    fnull = open(os.devnull, 'w')
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    # If result dir already exists, check whether we can use it for the next step or not
    if os.path.isdir(prok_dir) and not force:
        logger.warning(f"Prokka results folder {prok_dir} already exists.")
        ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
        # If everything is ok in the result dir, do not rerun prokka:
        # use those results for the next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prokka did not run again. "
                       "Formatting step will use already generated results of "
                       f"Prokka in {prok_dir}. If you want to re-run Prokka, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option to rerun Prokka for all genomes.")
            logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
        # If missing files, or other problems in result dir: error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           "({}). Please check it, or remove it to "
                           "re-annotate.".format(prok_dir))
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # If result dir exists but user wants to force, remove this result dir
    elif os.path.isdir(prok_dir) and force:
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but is removed because "
                       "--force option was used.")
    # Now that we checked and solved those cases:
    # - outdir exists and no force: problems or not, we returned the appropriate boolean
    # - outdir exists and force: this outdir was removed
    # So here, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = f"Error while trying to run prokka on {name} from {gpath}"
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    prokf = open(prok_logfile, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    prokf.close()
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok
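run_prokka is normally driven by the pool in run_annotation_all, but it can be exercised on a single genome as long as a queue is provided for its log records. A minimal sketch, with hypothetical paths and names:

    import multiprocessing
    import threading

    import PanACoTA.utils as utils

    m = multiprocessing.Manager()
    q = m.Queue()
    # Handle the log records that the worker puts on the queue
    lp = threading.Thread(target=utils.logger_thread, args=(q,))
    lp.start()
    # (gpath, prok_folder, cores_annot, name, force, nbcont, gpath_train, q);
    # the 7th slot is only meaningful for run_prodigal and is ignored here
    args = ("tmp_files/genome1.fasta", "tmp_files", 4, "ESCO.0423.00001", False, 3, "", q)
    ok = run_prokka(args)
    q.put(None)  # tell the logger thread to stop
    lp.join()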

def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * cores_annot: how many cores prodigal can use
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prodigal results
        * gpath_train: path to the prodigal training file, or "small option" if contigs
          are too small (< 20000bp): in that case, the '-p meta' option is used instead
        * q: queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocesses
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal results are in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # The training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If the training file does not exist
    # and the prodigal result directory does not exist either, return False:
    # we cannot annotate with nothing.
    # This happens if there was a problem while training.
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
               "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for the next step? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything is ok in the result dir, do not rerun prodigal:
        # use those results for the next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                       "Formatting step will use already generated results of "
                       "Prodigal in {}. If you want to re-run Prodigal, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option.".format(prodigal_dir))

            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir: error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    else:
        # We are sure the prodigal result dir does not exist yet, because either:
        # - it never existed
        # - it was removed because user asked to force
        # - it existed, but then we already left the function above
        #   (whether its files were ok or not), so we never reach this line
        # So, create prodigal_dir (not automatically created by prodigal)
        os.makedirs(prodigal_dir)

    # prodigal_dir is empty and ready to receive prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, stderr and stdout files, and error to write if problem.
    error = f"Error while trying to run prodigal. See {prodigal_logfile_err}."
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)

    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    if ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        return True
    else:
        return False
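Compared to the prokka worker, the only difference in the argument tuple is the 7th slot, which selects between training mode and metagenome mode. A small sketch (hypothetical paths, reusing a queue q set up as in the run_prokka sketch above):

    # Training mode: annotate with 'prodigal ... -t tmp_files/genome1.fasta.trn -q'
    args_trained = ("tmp_files/genome2.fasta", "tmp_files", 1, "ESCO.0423.00002",
                    False, 10, "tmp_files/genome1.fasta.trn", q)
    # Small contigs (< 20000bp): annotate with 'prodigal ... -p meta -q'
    args_small = ("tmp_files/genome2.fasta", "tmp_files", 1, "ESCO.0423.00002",
                  False, 10, "small option", q)
    ok = run_prodigal(args_trained)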

def check_prokka(outdir, logf, name, gpath, nbcont, logger):
    """
    Prokka writes everything to stderr, and always returns a non-zero return code. So, we
    check if it ran well by checking the content of the output directory.
    This function is also used when prokka files already exist (prokka was run previously), to
    check if everything is ok before going to the next step.

    Parameters
    ----------
    outdir : str
        output directory, where all files are written by prokka
    logf : str
        prokka/prodigal logfile, containing stderr of prokka
    name : str
        genome name in gembase format
    gpath : str
        path to fasta file given as input for prokka
    nbcont : int
        number of contigs in fasta file given to prokka
    logger : logging.Logger
        logger object to get logs

    Returns
    -------
    bool
        True if everything went well, False otherwise
    """
    missing_file = False
    problem = False
    if not os.path.isdir(outdir):
        logger.error(("Previous annotation could not run properly. "
                      "Look at {} for more information.").format(logf))
        missing_file = True
    else:
        oriname = os.path.basename(gpath)
        fnafile = glob.glob(os.path.join(outdir, "*.fna"))
        tblfile = glob.glob(os.path.join(outdir, "*.tbl"))
        faafile = glob.glob(os.path.join(outdir, "*.faa"))
        ffnfile = glob.glob(os.path.join(outdir, "*.ffn"))
        gfffile = glob.glob(os.path.join(outdir, "*.gff"))
        if len(fnafile) == 0:
            logger.error("{} {}: no .fna file".format(name, oriname))
            missing_file = True
        elif len(fnafile) > 1:
            logger.error("{} {}: several .fna files".format(name, oriname))
            missing_file = True
        if len(tblfile) == 0:
            logger.error("{} {}: no .tbl file".format(name, oriname))
            missing_file = True
        elif len(tblfile) > 1:
            logger.error("{} {}: several .tbl files".format(name, oriname))
            missing_file = True
        if len(faafile) == 0:
            logger.error("{} {}: no .faa file".format(name, oriname))
            missing_file = True
        elif len(faafile) > 1:
            logger.error("{} {}: several .faa files".format(name, oriname))
            missing_file = True
        if len(ffnfile) == 0:
            logger.error("{} {}: no .ffn file".format(name, oriname))
            missing_file = True
        elif len(ffnfile) > 1:
            logger.error("{} {}: several .ffn files".format(name, oriname))
            missing_file = True
        if len(gfffile) == 0:
            logger.error("{} {}: no .gff file".format(name, oriname))
            missing_file = True
        elif len(gfffile) > 1:
            logger.error("{} {}: several .gff files".format(name, oriname))
            missing_file = True
        if not missing_file:
            tblfile = tblfile[0]
            faafile = faafile[0]
            ffnfile = ffnfile[0]
            fnbcont, tnb_cds, nb_gene = count_tbl(tblfile)
            faaprot = count_headers(faafile)
            ffngene = count_headers(ffnfile)
            if nbcont != fnbcont:
                logger.error(("{} {}: no matching number of contigs; "
                              "nbcontig={}; in tbl={}").format(name, oriname, nbcont, fnbcont))
                problem = True
            if tnb_cds != faaprot:
                logger.error(("{} {}: no matching number of proteins between tbl and faa; "
                              "faa={}; in tbl={}").format(name, oriname, faaprot, tnb_cds))
                problem = True
    return not problem and not missing_file

def check_prodigal(gpath, name, prodigal_dir, logger):
    """
    When the prodigal result folder already exists, check that the expected output files exist.
    We cannot check all of their content, but at least check that they are present and not empty.

    Parameters
    ----------
    gpath : str
        path to fasta file given as input for prodigal
    name : str
        genome name in gembase format
    prodigal_dir : str
        output directory, where all files are written by prodigal
    logger : logging.Logger
        logger object to get logs

    Returns
    -------
    bool
        True if everything went well, False otherwise
    """
    oriname = os.path.basename(gpath)
    faafile = glob.glob(os.path.join(prodigal_dir, "*.faa"))
    ffnfile = glob.glob(os.path.join(prodigal_dir, "*.ffn"))
    gfffile = glob.glob(os.path.join(prodigal_dir, "*.gff"))
    missing_file = False

    if len(faafile) != 1:
        logger.error("{} {}: no or several .faa file(s)".format(name, oriname))
        missing_file = True
    if len(ffnfile) != 1:
        logger.error("{} {}: no or several .ffn file(s)".format(name, oriname))
        missing_file = True
    if len(gfffile) != 1:
        logger.error("{} {}: no or several .gff file(s)".format(name, oriname))
        missing_file = True

    # If we have all result files, check that they are not empty
    if not missing_file:
        if (os.path.getsize(faafile[0]) == 0 or os.path.getsize(ffnfile[0]) == 0
                or os.path.getsize(gfffile[0]) == 0):
            logger.error("Genome {} (from {}): At least one of your Prodigal result files "
                         "is empty.".format(name, oriname))
            return False
    return not missing_file
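A minimal sketch of reusing this check on its own, for instance to validate results left by a previous run (paths and names are hypothetical):

    import logging

    check_logger = logging.getLogger("annotate.check")
    ok = check_prodigal("db/genome1.fasta", "ESCO.0423.00001",
                        "tmp_files/genome1.fasta-prodigalRes", check_logger)
    if not ok:
        print("Prodigal results are incomplete; re-run the annotation with --force")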

def count_tbl(tblfile):
    """
    Count the different features found in the tbl file:

    - number of contigs
    - number of proteins (CDS)
    - number of genes (locus_tag)
    - CRISPR arrays (repeat_region) are ignored

    Parameters
    ----------
    tblfile : str
        tbl file generated by prokka

    Returns
    -------
    (nbcont, nb_cds, nb_gene)
        information on the features found in the tbl file.
    """
    nbcont = 0
    nb_cds = 0
    nb_gene = 0
    # nb_crispr = 0
    with open(tblfile) as tblf:
        for line in tblf:
            if line.startswith(">"):
                nbcont += 1
            if "CDS" in line:
                nb_cds += 1
            if "locus_tag" in line:
                nb_gene += 1
            # if "repeat_region" in line or (len(line.split()) == 3 and "CRISPR" in line):
            #     nb_crispr += 1
    return nbcont, nb_cds, nb_gene  # , nb_crispr
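To make the counted fields concrete, here is a toy excerpt in the spirit of prokka's .tbl output (an NCBI feature table; contig names, locus tags and coordinates are made up), together with the counts count_tbl returns for it:

    import os
    import tempfile

    lines = [
        ">Feature contig_1",
        "1\t1200\tCDS",
        "\t\t\tlocus_tag\tESCO_00001",
        "1500\t2100\tCDS",
        "\t\t\tlocus_tag\tESCO_00002",
        ">Feature contig_2",
        "100\t400\tCDS",
        "\t\t\tlocus_tag\tESCO_00003",
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".tbl", delete=False) as tmp:
        tmp.write("\n".join(lines) + "\n")
    nbcont, nb_cds, nb_gene = count_tbl(tmp.name)
    assert (nbcont, nb_cds, nb_gene) == (2, 3, 3)  # 2 contigs, 3 CDS, 3 locus_tag lines
    os.remove(tmp.name)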

def count_headers(seqfile):
    """
    Count how many sequences there are in the given multi-fasta file.

    Parameters
    ----------
    seqfile : str
        file containing sequences in multi-fasta format

    Returns
    -------
    int
        number of headers (sequences) in the given multi-fasta file
    """
    nbseq = 0
    with open(seqfile) as faaf:
        for line in faaf:
            if line.startswith(">"):
                nbseq += 1
    return nbseq