Coverage for PanACoTA/annotate_module/annotation_functions.py: 100%
262 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to deal with prokka or prodigal only, according to user request
40@author gem
41April 2017
42"""
44import os
45import sys
46import shutil
47import glob
48import logging
49import subprocess
50import shlex
51import multiprocessing
52import progressbar
53import threading
55import PanACoTA.utils as utils
# Module-level logger: a child of the 'annotate' logger configured by the caller.
logger = logging.getLogger('annotate.run_annotation_all')
def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False,
                       small=False, quiet=False):
    """
    For each genome in genomes, run prokka (or only prodigal) to annotate the genome.

    Parameters
    ----------
    genomes : dict
        {genome: [gembase_name, path_to_origfile, path_split_gembase, gsize, nbcont, L90]}
    threads : int
        max number of threads that can be used
    force : bool
        if False, do not override prokka/prodigal outdir and result dir if they exist. If\
        True, rerun prokka and override existing results, for all genomes.
    annot_folder : str
        folder where prokka/prodigal results must be written: for each genome,
        a directory <genome_name>-prokkaRes or <genome_name>-prodigalRes> will be created
        in this folder, and all the results
        of prokka/prodigal for the genome will be written inside
    fgn : str
        name (key in genomes dict) of the first genome, which will be used for prodigal training
    prodigal_only : bool
        True if only prodigal must run, False if prokka must run
    small : bool
        True -> use -p meta option with prodigal. Do not use training
    quiet : bool
        True if nothing must be written to stderr/stdout, False otherwise

    Returns
    -------
    dict
        {genome: boolean} -> with True if prokka/prodigal ran well, False otherwise.
    """
    # Update information according to annotation soft used and write message
    if prodigal_only:
        message = "Annotating all genomes with prodigal"
        run_annot = run_prodigal
        main_logger = logging.getLogger("annotate.prodigal")
    else:
        message = "Annotating all genomes with prokka"
        run_annot = run_prokka
        main_logger = logging.getLogger("annotate.prokka")
    main_logger.info(message)
    # Get total number of genomes to annotate, used to show annotation progress
    nbgen = len(genomes)
    bar = None
    # If user did not ask for quiet, create progressbar
    if not quiet:
        # Create progress bar
        widgets = ['Annotation: ', progressbar.Bar(marker='█', left='', right=''),
                   ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen,
                                      term_width=79).start()
    # Get resource availability:
    # - number of threads used by prokka/prodigal (cores_annot)
    # - how many genomes can be annotated at the same time (pool_size)
    # - prodigal does not run with several threads: with prodigal, always cores_annot == 1
    #   and pool_size == threads
    gpath_train = ""  # by default, no training genome
    if prodigal_only:
        cores_annot = 1
        pool_size = threads
        # If prodigal, train on the first genome
        # fgn is key of genomes, genomes[fgn] = [_,_,annote_file,_,_,_]
        gtrain = genomes[fgn][2]
        # If problem, gpath_train will be empty, but this will be checked while
        # trying to run prodigal, because we also need to check that genomes are not simply
        # already annotated
        if not small:
            gpath_train = prodigal_train(gtrain, annot_folder)
        else:
            gpath_train = "small option"
    elif threads <= 3:
        # less than 3 threads: run prokka 1 by 1 with all threads
        cores_annot = threads
        pool_size = 1
    else:
        # use multiprocessing
        # if there are more threads than genomes, use as many threads as possible per genome
        if len(genomes) <= threads:
            cores_annot = int(threads / len(genomes))
        # otherwise, use 2 threads per genome (and nb_genome/2 genomes at the same time)
        else:
            cores_annot = 2
        pool_size = int(threads / cores_annot)
    # Create pool with a given size (=number of tasks to be launched in parallel)
    pool = multiprocessing.Pool(pool_size)
    # Create a Queue to put logs from processes, and handle them after from a single thread
    m = multiprocessing.Manager()
    q = m.Queue()
    # {genome: [gembase_name, path_to_origfile, path_toannotate_file, gsize, nbcont, L90]}
    # arguments: gpath, prok_folder, threads, name, force, nbcont, small(for prodigal), q
    arguments = [(genomes[g][2], annot_folder, cores_annot, genomes[g][0],
                  force, genomes[g][4], gpath_train, q)
                 for g in sorted(genomes)]
    try:
        # Start pool (run 'run_annot' on each set of arguments)
        final = pool.map_async(run_annot, arguments, chunksize=1)
        # Close pool: no more data will be put on this pool
        pool.close()
        # Listen for logs in processes
        lp = threading.Thread(target=utils.logger_thread, args=(q,))
        lp.start()
        if not quiet:
            while True:
                # Final is ready when all pool elements are done
                if final.ready():
                    break
                # If not done, get number of genomes left
                # NOTE(review): _number_left is a private attribute of AsyncResult and
                # may change between Python versions — confirm when upgrading Python.
                remaining = final._number_left
                # Add this to start progressbar with 0% instead of N/A%
                if remaining == nbgen:
                    bar.update(0.0000001)
                else:
                    # Update progress bar
                    bar.update(nbgen - remaining)
            # End progress bar
            bar.finish()
        pool.join()
        # Put None to tell 'q' that everything is finished. It can be stopped and joined.
        q.put(None)
        # join lp (tell to stop once all log processes are done, which is the case here)
        lp.join()
        final = final.get()
    # If an error occurs, terminate pool, write error and exit
    except Exception as excp:  # pragma: no cover
        pool.terminate()
        main_logger.error(excp)
        sys.exit(1)
    final = {genome: res for genome, res in zip(sorted(genomes), final)}
    return final
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    First, train prodigal on the first genome ('gpath'), and write the result to
    '<genome>.trn', the file which will be used for the annotation of all next sequences.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of train file (will be used to annotate all next genomes)
        If problem, returns empty string
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)  # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn")  # path/to/prodiRes/genome.fasta.trn
    # Reuse an existing training file instead of recomputing it
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                    "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"  # path/to/genome-prodigal-train.log
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = (f"prodigal -i {gpath} -t {gpath_train}")
    error = (f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}.")
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    # Context managers guarantee both log handles are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    # 'ret != 1' guards the attribute access: run_cmd presumably returns the int 1
    # when the command could not be launched at all — TODO confirm in utils.run_cmd
    if ret != 1 and ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End training on {gpath}")
        return gpath_train
    else:
        return ""
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    # Explicit import: 'logging.handlers' is a submodule and is not guaranteed to be
    # loaded by 'import logging' alone (this function runs in a worker process).
    import logging.handlers
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process: forward every record to the main process through 'q'
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")
    # Define prokka directory and logfile, and check their existence
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    # If result dir already exists, check if we can use it for next step or not
    if os.path.isdir(prok_dir) and not force:
        logger.warning(f"Prokka results folder {prok_dir} already exists.")
        ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
        # If everything ok in the result dir, do not rerun prokka,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prokka did not run again, "
                       "formatting step used already generated results of "
                       f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option if you want to rerun prokka for all genomes.")
            logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           "({}). Please check it, or remove it to "
                           "re-annotate.".format(prok_dir))
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # If result dir exists but user wants to force, remove this result dir
    elif os.path.isdir(prok_dir) and force:
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")
    # Now that we checked and solved those cases:
    # - outdir exists (problems or not, we returned appropriate boolean)
    # - if outdir exists but force, remove this outdir.
    # So, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = (f"Error while trying to run prokka on {name} from {gpath}")
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    # Context manager closes the prokka log even if run_cmd raises
    with open(prok_logfile, "w") as prokf:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * cores_annot: how many cores can use prodigal
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prodigal results
        * gpath_train: path to the training file, or "small option" to run with '-p meta'
          (contigs too small, <20000bp)
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    # Explicit import: 'logging.handlers' is a submodule and is not guaranteed to be
    # loaded by 'import logging' alone (this function runs in a worker process).
    import logging.handlers
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocess
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # Training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If training file does not exist
    # and prodigal result directory neither, return False:
    # we cannot annotate using nothing.
    # Happens if there was a problem while training
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                   "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                       "Formatting step will use already generated results of "
                       "Prodigal in {}. If you want to re-run Prodigal, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option.".format(prodigal_dir))

            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # We are sure prodigal result dir does not exist yet; prodigal does not create it
    # itself, so make it now.
    os.makedirs(prodigal_dir)

    # Prodigal_directory is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd and error to write if problem.
    error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)
    # Context managers close both log files even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    if ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        return True
    else:
        return False
460def check_prokka(outdir, logf, name, gpath, nbcont, logger):
461 """
462 Prokka writes everything to stderr, and always returns a non-zero return code. So, we
463 check if it ran well by checking the content of output directory.
464 This function is also used when prokka files already exist (prokka was run previously), to
465 check if everything is ok before going to next step.
467 Parameters
468 ----------
469 outdir : str
470 output directory, where all files are written by prokka
471 logf : str
472 prokka/prodigal logfile, containing stderr of prokka
473 name : str
474 genome name in gembase format
475 gpath : str
476 path to fasta file given as input for prokka
477 nbcont : int
478 number of contigs in fasta file given to prokka
479 logger : logging.Logger
480 logger object to get logs
482 Returns
483 -------
484 bool
485 True if everything went well, False otherwise
486 """
487 missing_file = False
488 problem = False
489 if not os.path.isdir(outdir):
490 logger.error(("Previous annotation could not run properly. "
491 "Look at {} for more information.").format(logf))
492 missing_file = True
493 else:
494 oriname = os.path.basename(gpath)
495 fnafile = glob.glob(os.path.join(outdir, "*.fna"))
496 tblfile = glob.glob(os.path.join(outdir, "*.tbl"))
497 faafile = glob.glob(os.path.join(outdir, "*.faa"))
498 ffnfile = glob.glob(os.path.join(outdir, "*.ffn"))
499 gfffile = glob.glob(os.path.join(outdir, "*.gff"))
500 if len(fnafile) == 0:
501 logger.error("{} {}: no .fna file".format(name, oriname))
502 missing_file = True
503 elif len(fnafile) > 1:
504 logger.error("{} {}: several .fna files".format(name, oriname))
505 missing_file = True
506 if len(tblfile) == 0:
507 logger.error("{} {}: no .tbl file".format(name, oriname))
508 missing_file = True
509 elif len(tblfile) > 1:
510 logger.error("{} {}: several .tbl files".format(name, oriname))
511 missing_file = True
512 if len(faafile) == 0:
513 logger.error("{} {}: no .faa file".format(name, oriname))
514 missing_file = True
515 elif len(faafile) > 1:
516 logger.error("{} {}: several .faa files".format(name, oriname))
517 missing_file = True
518 if len(ffnfile) == 0:
519 logger.error("{} {}: no .ffn file".format(name, oriname))
520 missing_file = True
521 elif len(ffnfile) > 1:
522 logger.error("{} {}: several .ffn files".format(name, oriname))
523 missing_file = True
524 if len(gfffile) == 0:
525 logger.error("{} {}: no .gff file".format(name, oriname))
526 missing_file = True
527 elif len(gfffile) > 1:
528 logger.error("{} {}: several .gff files".format(name, oriname))
529 missing_file = True
530 if not missing_file:
531 tblfile = tblfile[0]
532 faafile = faafile[0]
533 ffnfile = ffnfile[0]
534 fnbcont, tnb_cds, nb_gene = count_tbl(tblfile)
535 faaprot = count_headers(faafile)
536 ffngene = count_headers(ffnfile)
537 if nbcont != fnbcont:
538 logger.error(("{} {}: no matching number of contigs; "
539 "nbcontig={}; in tbl ={}").format(name, oriname, nbcont, fnbcont))
540 problem = True
541 if tnb_cds != faaprot:
542 logger.error(("{} {}: no matching number of proteins between tbl and faa; "
543 "faa={}; in tbl ={}").format(name, oriname, faaprot, tnb_cds))
544 problem = True
545 return not problem and not missing_file
548def check_prodigal(gpath, name, prodigal_dir, logger):
549 """
550 When prodigal result folder already exists, check that the ouput files exist.
551 We cannot check all content, but check that they are present.
553 Parameters
554 ----------
555 gpath : str
556 path to fasta file given as input for prokka
557 name : str
558 genome name in gembase format
559 prodigal_dir : str
560 output directory, where all files are written by prodigal
561 logger : logging.Logger
562 logger object to get logs
564 Returns
565 -------
566 bool
567 True if everything went well, False otherwise
568 """
569 oriname = os.path.basename(gpath)
570 faafile = glob.glob(os.path.join(prodigal_dir, "*.faa"))
571 ffnfile = glob.glob(os.path.join(prodigal_dir, "*.ffn"))
572 gfffile = glob.glob(os.path.join(prodigal_dir, "*.gff"))
573 missing_file = False
575 if len(faafile) != 1:
576 logger.error("{} {}: no or several .faa file(s)".format(name, oriname))
577 missing_file = True
578 if len(ffnfile) != 1:
579 logger.error("{} {}: no or several .ffn file(s)".format(name, oriname))
580 missing_file = True
581 if len(gfffile) != 1:
582 logger.error("{} {}: no or several .gff file(s)".format(name, oriname))
583 missing_file = True
585 # If we have all result files, check they are not empty
586 if not missing_file:
587 if (os.path.getsize(faafile[0]) == 0 or os.path.getsize(ffnfile[0]) == 0
588 or os.path.getsize(gfffile[0]) == 0):
589 logger.error("Genome {} (from {}): At least one of your Prodigal result file "
590 "is empty.".format(name, oriname))
591 return False
592 return not missing_file
595def count_tbl(tblfile):
596 """
597 Count the different features found in the tbl file:
599 - number of contigs
600 - number of proteins (CDS)
601 - number of genes (locus_tag)
602 - number of CRISPR arrays (repeat_region) -> ignore crisprs
604 Parameters
605 ----------
606 tblfile : str
607 tbl file generated by prokka
609 Returns
610 -------
611 (nbcont, nb_cds, nb_gene, nb_crispr)
612 information on features found in the tbl file.
613 """
614 nbcont = 0
615 nb_cds = 0
616 nb_gene = 0
617 # nb_crispr = 0
618 with open(tblfile) as tblf:
619 for line in tblf:
620 if line.startswith(">"):
621 nbcont += 1
622 if "CDS" in line:
623 nb_cds += 1
624 if "locus_tag" in line:
625 nb_gene += 1
626 # if "repeat_region" in line or (len(line.split()) == 3 and "CRISPR" in line):
627 # nb_crispr += 1
628 return nbcont, nb_cds, nb_gene #, nb_crispr
631def count_headers(seqfile):
632 """
633 Count how many sequences there are in the given file
635 Parameters
636 ----------
637 seqfile : str
638 file containing a sequence in multi-fasta format
640 Returns
641 -------
642 int
643 number of contigs in the given multi-fasta file
644 """
645 nbseq = 0
646 with open(seqfile) as faaf:
647 for line in faaf:
648 if line.startswith(">"):
649 nbseq += 1
650 return nbseq