Coverage for PanACoTA/annotate_module/annotation_functions.py: 100%

262 statements  

coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

#!/usr/bin/env python3
# coding: utf-8

# ###############################################################################
# This file is part of PanACOTA.
#
# Authors: Amandine Perrin
# Copyright © 2018-2020 Institut Pasteur (Paris).
# See the COPYRIGHT file for details.
#
# PanACOTA is a software providing tools for large scale bacterial comparative
# genomics. From a set of complete and/or draft genomes, you can:
#    - Do a quality control of your strains, to eliminate poor quality
#      genomes, which would not give any information for the comparative study
#    - Uniformly annotate all genomes
#    - Do a Pan-genome
#    - Do a Core or Persistent genome
#    - Align all Core/Persistent families
#    - Infer a phylogenetic tree from the Core/Persistent families
#
# PanACOTA is free software: you can redistribute it and/or modify it under the
# terms of the Affero GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License
# for more details.
#
# You should have received a copy of the Affero GNU General Public License
# along with PanACOTA (COPYING file).
# If not, see <https://www.gnu.org/licenses/>.
# ###############################################################################

"""
Functions to run the annotation step with prokka or prodigal only, according to user request.

@author gem
April 2017
"""

import os
import sys
import shutil
import glob
import logging
import logging.handlers  # needed for logging.handlers.QueueHandler used in the worker functions
import subprocess
import shlex
import multiprocessing
import progressbar
import threading

import PanACoTA.utils as utils

logger = logging.getLogger('annotate.run_annotation_all')

def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False,
                       small=False, quiet=False):
    """
    For each genome in genomes, run prokka (or only prodigal) to annotate the genome.

    Parameters
    ----------
    genomes : dict
        {genome: [gembase_name, path_to_origfile, path_split_gembase, gsize, nbcont, L90]}
    threads : int
        max number of threads that can be used
    force : bool
        if False, do not override prokka/prodigal outdir and result dir if they exist. If
        True, rerun prokka and override existing results, for all genomes.
    annot_folder : str
        folder where prokka/prodigal results must be written: for each genome,
        a directory <genome_name>-prokkaRes or <genome_name>-prodigalRes will be created
        in this folder, and all the results of prokka/prodigal for the genome
        will be written inside
    fgn : str
        name (key in genomes dict) of the first genome, which will be used for prodigal training
    prodigal_only : bool
        True if only prodigal must run, False if prokka must run
    small : bool
        True -> use the '-p meta' option with prodigal, and do not use training
    quiet : bool
        True if nothing must be written to stderr/stdout, False otherwise

    Returns
    -------
    dict
        {genome: boolean} -> True if prokka/prodigal ran well for this genome, False otherwise
    """
    # Update information according to the annotation software used, and write message
    if prodigal_only:
        message = "Annotating all genomes with prodigal"
        run_annot = run_prodigal
        main_logger = logging.getLogger("annotate.prodigal")
    else:
        message = "Annotating all genomes with prokka"
        run_annot = run_prokka
        main_logger = logging.getLogger("annotate.prokka")
    main_logger.info(message)
    # Get total number of genomes to annotate, used to show annotation progress
    nbgen = len(genomes)
    bar = None
    # If user did not ask for quiet, create progressbar
    if not quiet:
        # Create progress bar
        widgets = ['Annotation: ', progressbar.Bar(marker='█', left='', right=''),
                   ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen,
                                      term_width=79).start()
    # Get resource availability:
    # - number of threads used by prokka/prodigal (cores_annot)
    # - how many genomes can be annotated at the same time (pool_size)
    # - prodigal does not run with several threads: with prodigal, always cores_annot == 1
    #   and pool_size == threads
    gpath_train = ""  # by default, no training genome
    if prodigal_only:
        cores_annot = 1
        pool_size = threads
        # If prodigal, train on the first genome
        # fgn is a key of genomes, genomes[fgn] = [_, _, annote_file, _, _, _]
        gtrain = genomes[fgn][2]
        # If there is a problem, gpath_train will be empty. This is checked while trying
        # to run prodigal, because we also need to check that genomes are not simply
        # already annotated
        if not small:
            gpath_train = prodigal_train(gtrain, annot_folder)
        else:
            gpath_train = "small option"
    elif threads <= 3:
        # 3 threads or fewer: run prokka on genomes one by one, with all threads
        cores_annot = threads
        pool_size = 1
    else:
        # use multiprocessing
        # if there are more threads than genomes, use as many threads as possible per genome
        if len(genomes) <= threads:
            cores_annot = int(threads / len(genomes))
        # otherwise, use 2 threads per genome (and nb_genome/2 genomes at the same time)
        else:
            cores_annot = 2
        pool_size = int(threads / cores_annot)
    # Create pool with a given size (= number of tasks launched in parallel)
    pool = multiprocessing.Pool(pool_size)
    # Create a Queue to put logs from processes, and handle them afterwards from a single thread
    m = multiprocessing.Manager()
    q = m.Queue()
    # {genome: [gembase_name, path_to_origfile, path_to_annotate_file, gsize, nbcont, L90]}
    # arguments: gpath, prok_folder, threads, name, force, nbcont, gpath_train (prodigal only), q
    arguments = [(genomes[g][2], annot_folder, cores_annot, genomes[g][0],
                  force, genomes[g][4], gpath_train, q)
                 for g in sorted(genomes)]
    try:
        # Start pool (run 'run_annot' on each set of arguments)
        final = pool.map_async(run_annot, arguments, chunksize=1)
        # Close pool: no more data will be put on this pool
        pool.close()
        # Listen for logs in processes
        lp = threading.Thread(target=utils.logger_thread, args=(q,))
        lp.start()
        if not quiet:
            while True:
                # final is ready when all pool elements are done
                if final.ready():
                    break
                # If not done, get number of genomes left
                remaining = final._number_left
                # Add this to start progressbar with 0% instead of N/A%
                if remaining == nbgen:
                    bar.update(0.0000001)
                else:
                    # Update progress bar
                    bar.update(nbgen - remaining)
            # End progress bar
            bar.finish()
        pool.join()
        # Put None to tell 'q' that everything is finished, so it can be stopped and joined
        q.put(None)
        # join lp (tell it to stop once all logs from the processes are handled,
        # which is the case here)
        lp.join()
        final = final.get()
    # # If user stops the program (ctrl+C), end it
    # except KeyboardInterrupt as ki:
    #     print("error")
    #     for worker in pool._pool:
    #         print(worker.is_alive())
    #     pool.join()
    #     print("closed")
    #     pool.terminate()
    #     print("--------------terminate ok----------------")
    #     lp.join()
    #     print("thread stopped")
    #     # run_event.clear()
    #     # lp.terminate()
    #     # print("--------------JOIN--------------")
    #     # pool.terminate()
    #     main_logger.error("Process killed by CTRL+C")
    #     return "coucou"
    # If an error occurs, terminate pool, write error and exit
    except Exception as excp:  # pragma: no cover
        pool.terminate()
        main_logger.error(excp)
        sys.exit(1)
    final = {genome: res for genome, res in zip(sorted(genomes), final)}
    return final
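For orientation, here is a minimal sketch of how run_annotation_all might be called. The genome names, paths and numeric values are hypothetical; only the layout of the genomes dict ([gembase_name, path_to_origfile, path_to_annotate_file, gsize, nbcont, L90]) and the call signature come from the function above.

    genomes = {
        "genome1.fasta": ["ESCO.0423.00001", "db/genome1.fasta",
                          "tmp_files/genome1.fasta", 4600000, 3, 1],
        "genome2.fasta": ["ESCO.0423.00002", "db/genome2.fasta",
                          "tmp_files/genome2.fasta", 5000000, 10, 4],
    }
    # Annotate everything with prodigal only, training on genome1.fasta
    results = run_annotation_all(genomes, threads=8, force=False,
                                 annot_folder="tmp_files", fgn="genome1.fasta",
                                 prodigal_only=True)
    # Keep only the genomes whose annotation succeeded
    annotated = {g: info for g, info in genomes.items() if results[g]}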

def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    First, train prodigal on the first genome ('gpath'), and write the result to
    '<genome>.trn', a file which will then be used for the annotation of all the
    other sequences.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of the train file (will be used to annotate all next genomes).
        If there was a problem, returns an empty string.
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)  # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn")  # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                    "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"  # path/to/genome-prodigal-train.log
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = f"prodigal -i {gpath} -t {gpath_train}"
    error = f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}."
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    # utils.run_cmd may return the int 1 (instead of a completed process) when the command
    # could not be run; otherwise, check that the process exited with code 0.
    if ret != 1 and ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End training on {gpath}")
        return gpath_train
    else:
        return ""
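On its own, the training step is a single call; a minimal usage sketch with a hypothetical path:

    train_file = prodigal_train("tmp_files/genome1.fasta", "tmp_files")
    if train_file:
        # e.g. "tmp_files/genome1.fasta.trn", to be passed to prodigal with '-t'
        print(f"training file ready: {train_file}")
    else:
        # empty string: training failed, details are in tmp_files/*.trn-prodigal-train.log.err
        print("prodigal training failed")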

def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores prokka can use
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * gpath_train: prodigal training file (or "small option"); only used by prodigal,
          ignored here
        * q: queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    fnull = open(os.devnull, 'w')
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    # If result dir already exists, check whether we can use it for the next step or not
    if os.path.isdir(prok_dir) and not force:
        logger.warning(f"Prokka results folder {prok_dir} already exists.")
        ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
        # If everything is ok in the result dir, do not rerun prokka:
        # use those results for the next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prokka did not run again. "
                       "Formatting step will use already generated results of "
                       f"Prokka in {prok_dir}. If you want to re-run Prokka, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option to rerun Prokka for all genomes.")
            logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
        # If missing files, or other problems in result dir: error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           "({}). Please check it, or remove it to "
                           "re-annotate.".format(prok_dir))
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # If result dir exists but user wants to force, remove this result dir
    elif os.path.isdir(prok_dir) and force:
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but is removed because "
                       "--force option was used.")
    # Now that we checked and solved those cases:
    # - outdir exists and no force: problems or not, we returned the appropriate boolean
    # - outdir exists and force: this outdir was removed
    # So here, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = f"Error while trying to run prokka on {name} from {gpath}"
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    prokf = open(prok_logfile, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    prokf.close()
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok
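run_prokka is normally driven by the pool in run_annotation_all, but it can be exercised on a single genome as long as a queue is provided for its log records. A minimal sketch, with hypothetical paths and names:

    import multiprocessing
    import threading

    import PanACoTA.utils as utils

    m = multiprocessing.Manager()
    q = m.Queue()
    # Handle the log records that the worker puts on the queue
    lp = threading.Thread(target=utils.logger_thread, args=(q,))
    lp.start()
    # (gpath, prok_folder, cores_annot, name, force, nbcont, gpath_train, q);
    # the 7th slot is only meaningful for run_prodigal and is ignored here
    args = ("tmp_files/genome1.fasta", "tmp_files", 4, "ESCO.0423.00001", False, 3, "", q)
    ok = run_prokka(args)
    q.put(None)  # tell the logger thread to stop
    lp.join()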

def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * cores_annot: how many cores prodigal can use
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prodigal results
        * gpath_train: path to the prodigal training file, or "small option" if contigs
          are too small (< 20000bp): in that case, the '-p meta' option is used instead
        * q: queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocesses
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal results are in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # The training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If the training file does not exist
    # and the prodigal result directory does not exist either, return False:
    # we cannot annotate with nothing.
    # This happens if there was a problem while training.
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
               "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for the next step? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything is ok in the result dir, do not rerun prodigal:
        # use those results for the next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                       "Formatting step will use already generated results of "
                       "Prodigal in {}. If you want to re-run Prodigal, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option.".format(prodigal_dir))

            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir: error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    else:
        # We are sure the prodigal result dir does not exist yet, because either:
        # - it never existed
        # - it was removed because user asked to force
        # - it existed, but then we already left the function above
        #   (whether its files were ok or not), so we never reach this line
        # So, create prodigal_dir (not automatically created by prodigal)
        os.makedirs(prodigal_dir)

    # prodigal_dir is empty and ready to receive prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, stderr and stdout files, and error to write if problem.
    error = f"Error while trying to run prodigal. See {prodigal_logfile_err}."
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)

    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    if ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        return True
    else:
        return False
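Compared to the prokka worker, the only difference in the argument tuple is the 7th slot, which selects between training mode and metagenome mode. A small sketch (hypothetical paths, reusing a queue q set up as in the run_prokka sketch above):

    # Training mode: annotate with 'prodigal ... -t tmp_files/genome1.fasta.trn -q'
    args_trained = ("tmp_files/genome2.fasta", "tmp_files", 1, "ESCO.0423.00002",
                    False, 10, "tmp_files/genome1.fasta.trn", q)
    # Small contigs (< 20000bp): annotate with 'prodigal ... -p meta -q'
    args_small = ("tmp_files/genome2.fasta", "tmp_files", 1, "ESCO.0423.00002",
                  False, 10, "small option", q)
    ok = run_prodigal(args_trained)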

def check_prokka(outdir, logf, name, gpath, nbcont, logger):
    """
    Prokka writes everything to stderr, and always returns a non-zero return code. So, we
    check if it ran well by checking the content of the output directory.
    This function is also used when prokka files already exist (prokka was run previously), to
    check if everything is ok before going to the next step.

    Parameters
    ----------
    outdir : str
        output directory, where all files are written by prokka
    logf : str
        prokka/prodigal logfile, containing stderr of prokka
    name : str
        genome name in gembase format
    gpath : str
        path to fasta file given as input for prokka
    nbcont : int
        number of contigs in fasta file given to prokka
    logger : logging.Logger
        logger object to get logs

    Returns
    -------
    bool
        True if everything went well, False otherwise
    """
    missing_file = False
    problem = False
    if not os.path.isdir(outdir):
        logger.error(("Previous annotation could not run properly. "
                      "Look at {} for more information.").format(logf))
        missing_file = True
    else:
        oriname = os.path.basename(gpath)
        fnafile = glob.glob(os.path.join(outdir, "*.fna"))
        tblfile = glob.glob(os.path.join(outdir, "*.tbl"))
        faafile = glob.glob(os.path.join(outdir, "*.faa"))
        ffnfile = glob.glob(os.path.join(outdir, "*.ffn"))
        gfffile = glob.glob(os.path.join(outdir, "*.gff"))
        if len(fnafile) == 0:
            logger.error("{} {}: no .fna file".format(name, oriname))
            missing_file = True
        elif len(fnafile) > 1:
            logger.error("{} {}: several .fna files".format(name, oriname))
            missing_file = True
        if len(tblfile) == 0:
            logger.error("{} {}: no .tbl file".format(name, oriname))
            missing_file = True
        elif len(tblfile) > 1:
            logger.error("{} {}: several .tbl files".format(name, oriname))
            missing_file = True
        if len(faafile) == 0:
            logger.error("{} {}: no .faa file".format(name, oriname))
            missing_file = True
        elif len(faafile) > 1:
            logger.error("{} {}: several .faa files".format(name, oriname))
            missing_file = True
        if len(ffnfile) == 0:
            logger.error("{} {}: no .ffn file".format(name, oriname))
            missing_file = True
        elif len(ffnfile) > 1:
            logger.error("{} {}: several .ffn files".format(name, oriname))
            missing_file = True
        if len(gfffile) == 0:
            logger.error("{} {}: no .gff file".format(name, oriname))
            missing_file = True
        elif len(gfffile) > 1:
            logger.error("{} {}: several .gff files".format(name, oriname))
            missing_file = True
        if not missing_file:
            tblfile = tblfile[0]
            faafile = faafile[0]
            ffnfile = ffnfile[0]
            fnbcont, tnb_cds, nb_gene = count_tbl(tblfile)
            faaprot = count_headers(faafile)
            ffngene = count_headers(ffnfile)
            if nbcont != fnbcont:
                logger.error(("{} {}: no matching number of contigs; "
                              "nbcontig={}; in tbl={}").format(name, oriname, nbcont, fnbcont))
                problem = True
            if tnb_cds != faaprot:
                logger.error(("{} {}: no matching number of proteins between tbl and faa; "
                              "faa={}; in tbl={}").format(name, oriname, faaprot, tnb_cds))
                problem = True
    return not problem and not missing_file

def check_prodigal(gpath, name, prodigal_dir, logger):
    """
    When the prodigal result folder already exists, check that the expected output files exist.
    We cannot check all of their content, but at least check that they are present and not empty.

    Parameters
    ----------
    gpath : str
        path to fasta file given as input for prodigal
    name : str
        genome name in gembase format
    prodigal_dir : str
        output directory, where all files are written by prodigal
    logger : logging.Logger
        logger object to get logs

    Returns
    -------
    bool
        True if everything went well, False otherwise
    """
    oriname = os.path.basename(gpath)
    faafile = glob.glob(os.path.join(prodigal_dir, "*.faa"))
    ffnfile = glob.glob(os.path.join(prodigal_dir, "*.ffn"))
    gfffile = glob.glob(os.path.join(prodigal_dir, "*.gff"))
    missing_file = False

    if len(faafile) != 1:
        logger.error("{} {}: no or several .faa file(s)".format(name, oriname))
        missing_file = True
    if len(ffnfile) != 1:
        logger.error("{} {}: no or several .ffn file(s)".format(name, oriname))
        missing_file = True
    if len(gfffile) != 1:
        logger.error("{} {}: no or several .gff file(s)".format(name, oriname))
        missing_file = True

    # If we have all result files, check that they are not empty
    if not missing_file:
        if (os.path.getsize(faafile[0]) == 0 or os.path.getsize(ffnfile[0]) == 0
                or os.path.getsize(gfffile[0]) == 0):
            logger.error("Genome {} (from {}): At least one of your Prodigal result files "
                         "is empty.".format(name, oriname))
            return False
    return not missing_file
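A minimal sketch of reusing this check on its own, for instance to validate results left by a previous run (paths and names are hypothetical):

    import logging

    check_logger = logging.getLogger("annotate.check")
    ok = check_prodigal("db/genome1.fasta", "ESCO.0423.00001",
                        "tmp_files/genome1.fasta-prodigalRes", check_logger)
    if not ok:
        print("Prodigal results are incomplete; re-run the annotation with --force")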

def count_tbl(tblfile):
    """
    Count the different features found in the tbl file:

    - number of contigs
    - number of proteins (CDS)
    - number of genes (locus_tag)
    - CRISPR arrays (repeat_region) are ignored

    Parameters
    ----------
    tblfile : str
        tbl file generated by prokka

    Returns
    -------
    (nbcont, nb_cds, nb_gene)
        information on the features found in the tbl file.
    """
    nbcont = 0
    nb_cds = 0
    nb_gene = 0
    # nb_crispr = 0
    with open(tblfile) as tblf:
        for line in tblf:
            if line.startswith(">"):
                nbcont += 1
            if "CDS" in line:
                nb_cds += 1
            if "locus_tag" in line:
                nb_gene += 1
            # if "repeat_region" in line or (len(line.split()) == 3 and "CRISPR" in line):
            #     nb_crispr += 1
    return nbcont, nb_cds, nb_gene  # , nb_crispr
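To make the counted fields concrete, here is a toy excerpt in the spirit of prokka's .tbl output (an NCBI feature table; contig names, locus tags and coordinates are made up), together with the counts count_tbl returns for it:

    import os
    import tempfile

    lines = [
        ">Feature contig_1",
        "1\t1200\tCDS",
        "\t\t\tlocus_tag\tESCO_00001",
        "1500\t2100\tCDS",
        "\t\t\tlocus_tag\tESCO_00002",
        ">Feature contig_2",
        "100\t400\tCDS",
        "\t\t\tlocus_tag\tESCO_00003",
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".tbl", delete=False) as tmp:
        tmp.write("\n".join(lines) + "\n")
    nbcont, nb_cds, nb_gene = count_tbl(tmp.name)
    assert (nbcont, nb_cds, nb_gene) == (2, 3, 3)  # 2 contigs, 3 CDS, 3 locus_tag lines
    os.remove(tmp.name)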

def count_headers(seqfile):
    """
    Count how many sequences there are in the given multi-fasta file.

    Parameters
    ----------
    seqfile : str
        file containing sequences in multi-fasta format

    Returns
    -------
    int
        number of headers (sequences) in the given multi-fasta file
    """
    nbseq = 0
    with open(seqfile) as faaf:
        for line in faaf:
            if line.startswith(">"):
                nbseq += 1
    return nbseq