Coverage for PanACoTA/utils.py: 100%
465 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Util functions and classes.
40@author gem
41April 2017
42"""
44import os
45import sys
46import re
47import glob
48import subprocess
49import shutil
50import shlex
51import progressbar
53# Logging
54import logging
55from logging.handlers import RotatingFileHandler
56from colorlog import ColoredFormatter
58try:
59 import cPickle as pickle
60except:
61 try:
62 import _pickle as pickle
63 except: # pragma: no cover
64 import pickle
67def init_logger(logfile_base, level, name, log_details=False, verbose=0, quiet=False):
68 """
69 Create logger and its handlers, and set them to the given level
71 level hierarchy: ``CRITICAL > ERROR > WARNING > INFO > DETAILS > DEBUG``
73 Messages from all levels are written in 'logfile'.log
75 Messages for levels less than WARNING (only INFO and DEBUG) written to stdout
77 Messages for levels equal or higher than WARNING written to stderr
79 Messages for levels equal or higher than WARNING written in `logfile`.log.err
82 Parameters
83 ----------
84 logfile_base : str
85 base of filename to use for logs. Will add '.log', '.log.details' and '.log.err' for\
86 the 3 log files created
87 level : int
88 minimum level that must be considered.
89 name : str or None
90 if we need to name the logger (used for tests)
91 log_details : bool
92 if True, force creation of .log.details file. Otherwise, just create
93 it if needed according to level
95 verbose : int
96 be more verbose:
97 default (0): info in stdout, error and more in stderr ;
98 info and more in *.log ; warning and more in *.log.err
99 1 = add warnings in stderr
100 2 = like 1 + add details to stdout (by default only INFO) + add details to *.log.details
101 >15: add debug to stdout and create *.log.debug with all levels
102 quiet : bool
103 True if nothing must be sent to stdout/stderr, False otherwise
104 """
105 import time
107 # time when soft is launched
108 time_start = time.strftime("_%y-%m-%d_%H-%m-%S")
110 # create logger
111 logger = logging.getLogger(name)
113 # Determine logfile names
114 logfile = logfile_base + ".log"
115 if os.path.isfile(logfile):
116 logfile = logfile_base + "-" + time_start + ".log"
117 errfile = logfile_base + ".log.err"
118 if os.path.isfile(errfile):
119 errfile = logfile_base + "-" + time_start + ".log.err"
120 detailfile = logfile_base + ".log.details"
121 if os.path.isfile(detailfile):
122 detailfile = logfile_base + "-" + time_start + ".log.details"
123 debugfile = logfile_base + ".log.debug"
124 if os.path.isfile(debugfile):
125 debugfile = logfile_base + "-" + time_start + ".log.debug"
127 # Create a new logging level: details (between info and debug)
128 # Used to add details to the log file, but not to stdout, while still having
129 # the possibility to put debug messages, used only for development.
130 def details(self, message, *args, **kws):
131 """
132 Define a new log level: details
133 """
134 if self.isEnabledFor(logging.DETAIL):
135 self._log(logging.DETAIL, message, args, **kws)
136 logging.addLevelName(detail_lvl(), "DETAIL")
137 logging.DETAIL = detail_lvl()
138 logging.Logger.details = details
140 # set level of logger
141 logger.setLevel(logging.DEBUG)
143 # create formatter for log messages:
144 # "timestamp :: level :: message"
145 # (add :: %(name)s to add the logger name)
146 my_format = '[%(asctime)s] :: %(levelname)s :: %(message)s'
147 formatter_file = logging.Formatter(my_format, '%Y-%m-%d %H:%M:%S')
148 my_format_stream = '%(log_color)s * [%(asctime)s] : %(levelname)s %(reset)s %(message)s'
149 formatter_stream = ColoredFormatter(my_format_stream, datefmt='%Y-%m-%d %H:%M:%S',
150 log_colors={'DEBUG' : 'cyan',
151 'INFO' : 'green',
152 'DETAIL' : 'cyan',
153 'WARNING' : 'yellow',
154 'ERROR' : 'red',
155 'CRITICAL' : 'red',
156 })
160 # Create handler 1: writing to 'logfile'. mode 'write', max size = 1Mo.
161 # If logfile is 1Mo, it is renamed to logfile.1, and next logs are still
162 # written to logfile. Then, logfile.1 is renamed to logfile.2, logfile to
163 # logfile.1 etc. We allow maximum 5 log files.
164 # logfile contains everything from INFO level (INFO, WARNING, ERROR)
165 logfile_handler = RotatingFileHandler(logfile, 'w', 10000000, 5)
166 # set level to the same as the logger level
167 logfile_handler.setLevel(logging.INFO)
168 logfile_handler.setFormatter(formatter_file) # add formatter
169 logger.addHandler(logfile_handler) # add handler to logger
171 # Create handler 2: errfile. Write only warnings and errors
172 errfile_handler = RotatingFileHandler(errfile, 'w', 10000000, 5)
173 errfile_handler.setLevel(logging.WARNING)
174 errfile_handler.setFormatter(formatter_file) # add formatter
175 logger.addHandler(errfile_handler) # add handler to logger
178 # Create handler 3: detailsfile. Write everything to this file, except debug
179 # Create it only if:
180 # - level is <= info (for modules which have no details, so detailsfile is the same as
181 # logfile)
182 # - details==True force creation of detailsfile
183 # - quiet==True nothing in stdout, put all log files so that user can check
184 if level < logging.INFO or quiet or log_details:
185 detfile_handler = RotatingFileHandler(detailfile, 'w', 10000000, 5)
186 detfile_handler.setLevel(logging.DETAIL)
187 detfile_handler.setFormatter(formatter_file) # add formatter
188 logger.addHandler(detfile_handler) # add handler to logger
190 # Formats for detailed log files
191 my_format_debug = '[%(asctime)s] :: %(levelname)s (from %(name)s logger) :: %(message)s'
192 formatter_file_debug = logging.Formatter(my_format_debug, '%Y-%m-%d %H:%M:%S')
194 # Create handler 4: debug file. Write everything
195 if level < logging.DETAIL:
196 debugfile_handler = RotatingFileHandler(debugfile, 'w', 10000000, 5)
197 debugfile_handler.setLevel(logging.DEBUG)
198 debugfile_handler.setFormatter(formatter_file_debug) # add formatter
199 logger.addHandler(debugfile_handler) # add handler to logger
201 # If not quiet, add handlers for stdout and stderr
202 if not quiet:
203 # Create handler 4: write to stdout
204 stream_handler = logging.StreamHandler(sys.stdout)
205 # By default, write everything
206 stream_handler.setLevel(logging.DEBUG)
207 # BUT: don't write messages >= WARNING (warning, error, critical)
208 stream_handler.addFilter(LessThanFilter(logging.WARNING))
209 # if not verbose (level 0 or 1): only put info in stdout (remove details and debug)
210 if verbose < 2:
211 stream_handler.addFilter(NoLevelFilter(logging.DETAIL))
212 stream_handler.addFilter(NoLevelFilter(logging.DEBUG))
213 stream_handler.setFormatter(formatter_stream)
214 logger.addHandler(stream_handler) # add handler to logger
216 # Create handler 5: write to stderr
217 err_handler = logging.StreamHandler(sys.stderr)
219 if verbose > 0:
220 err_handler.setLevel(logging.WARNING) # write all messages >= WARNING
221 else:
222 err_handler.setLevel(logging.ERROR) # write all messages >= ERROR
223 err_handler.setFormatter(formatter_stream)
224 logger.addHandler(err_handler) # add handler to logger
225 logger.propagate = False
226 return logfile, logger
229class LessThanFilter(logging.Filter):
230 """
231 When using log, when a level is set to a handler, it is a minimum level. All
232 levels higher than it will be printed. If you want to print only until
233 a given level (no levels higher than the specified one), use this class like this:
234 handler.addFilter(LessThanFilter(level))
235 """
237 def __init__(self, level):
238 self._level = level
239 logging.Filter.__init__(self)
241 def filter(self, rec):
242 """
243 Function to decide if given log has to be logged or not, according to its level
245 Parameters
246 ----------
247 rec : current record handled by logger
249 Returns
250 -------
251 bool
252 True if level of current log is less than the defined limit, False otherwise
253 """
254 return rec.levelno < self._level
257class NoLevelFilter(logging.Filter):
258 """
259 When using log, specify a given level that must not be taken into account by the handler.
260 This is used for the stdout handler. We want to print, by default,
261 DEBUG (for development use) and INFO levels, but not DETAILS level (which is between
262 DEBUG and INFO). We want to print DETAIL only if verbose option was set
263 """
265 def __init__(self, level):
266 self._level = level
267 logging.Filter.__init__(self)
269 def filter(self, rec):
270 """
271 Function to decide if given log has to be logged or not, according to its level
273 Parameters
274 ----------
275 rec : current record handled by logger
277 Returns
278 -------
279 bool
280 True if level of current log is different from forbidden level, False if it is the same
281 """
282 return rec.levelno != self._level
285def check_installed(cmd):
286 """
287 Check if the command 'cmd' is in $PATH and can then be executed
289 Parameters
290 ----------
291 cmd : str
292 command to run
294 Returns
295 -------
296 bool
297 True if installed, False otherwise
298 """
299 torun = "which " + cmd
300 trying = subprocess.Popen(shlex.split(torun), stdout=subprocess.PIPE)
301 out, _ = trying.communicate()
302 if trying.returncode == 0:
303 if os.path.isfile(out.strip()):
304 return True
305 return False
308def run_cmd(cmd, error, eof=False, **kwargs):
309 """
310 Run the given command line. If the return code is not 0, print error message.
311 if eof (exit on fail) is True, exit program if error code is not 0.
313 Parameters
314 ----------
315 cmd : str
316 command to run
317 error : str
318 error message to print if error while running command
319 eof : bool
320 True: exit program if command failed, False: do not exit even if command fails
321 kwargs : Object
322 Can provide a logger, stdout and/or stderr streams
324 Returns
325 -------
326 subprocess.Popen
327 returns object of subprocess call (has attributes returncode, pid, communicate etc.)
329 """
330 if "logger" not in kwargs:
331 logger = logging.getLogger("utils.run_cmd")
332 else:
333 logger = kwargs["logger"]
334 if "stdout" not in kwargs:
335 kwargs["stdout"] = None
336 if "stderr" not in kwargs:
337 kwargs["stderr"] = None
338 try:
339 call = subprocess.Popen(shlex.split(cmd), stdout=kwargs["stdout"],
340 stderr=kwargs["stderr"])
341 call.wait()
342 retcode = call.returncode
343 except OSError:
344 logger.error(f"error: command '>{cmd}' is not possible.")
345 if eof:
346 sys.exit(1)
347 else:
348 return 1
349 if retcode != 0:
350 logger.error(error)
351 if eof:
352 sys.exit(retcode)
353 return call
356def plot_distr(values, limit, title, text, logger):
357 """
358 Plot histogram of given 'values', and add a vertical line corresponding to the chosen
359 'limit' and return the mpl figure
361 Parameters
362 ----------
363 values : list
364 list of values
365 limit : int
366 limit for which a vertical line must be drawn
367 title : str
368 Title to give to plot
369 text : str
370 text to write near the vertical line representing the limit
371 logger : logging.Logger
372 logger object to write log information
374 Returns
375 -------
376 matplotlib.figure.Figure
377 figure generated
378 """
379 import math
380 import numpy as np
381 import matplotlib
382 matplotlib.use('AGG')
383 from matplotlib import pyplot as plt
384 plt.close("all")
385 fig = plt.figure(figsize=(10, 7))
386 ax = fig.add_subplot(1, 1, 1)
387 max_x = max(values)
388 # if too many values, group them to have less bins in the histogram.
389 # Put 'group_values' values in each bin ->
390 # if less than 300 values, 1 value per bin, otherwise more values per bin
391 group_values = int(max_x / 300) + 1
392 dec_ax = math.exp(0.001 * max_x) - 1
393 dec_text = 3 * dec_ax
394 bins = np.arange(0, max_x + 2 * group_values, group_values) - 0.5
395 ax.hist(values, bins=bins, edgecolor="black", color="blue")
396 ax.set_xlim(0.5, max_x + 0.5 * group_values)
397 ax.axvline(x=limit + 0.5 * group_values + dec_ax, color="r")
398 ax.text(x=limit + 0.5 * group_values + dec_text, y=plt.ylim()[1] / 2,
399 s=text + " " + str(limit), color="r", rotation=90)
400 ax.set_title(title)
401 return fig
404def write_warning_skipped(skipped, do_format=False, prodigal_only=False, logfile=""):
405 """
406 At the end of the script, write a warning to the user with the names of the genomes
407 which had problems with prokka.
409 Parameters
410 ----------
411 skipped : list
412 list of genomes with problems
413 do_format : bool
414 if False, genomes were not skipped because of format step, but before that.\
415 if True, they were skipped because of format
416 prodigal_only : bool
417 if False: used prokka to annotate
418 if True: used prodigal to annotate
419 """
420 if not prodigal_only:
421 soft = "prokka"
422 else:
423 soft = "prodigal"
424 logger = logging.getLogger("utils")
425 list_to_write = "\n".join(["\t- " + genome for genome in skipped])
426 if not do_format:
427 logger.info(f"WARNING: Some genomes could not be annotated. See {soft}")
428 logger.warning(f"{soft} had problems while annotating some genomes, or "
429 "did not find any gene. Hence, they are not formatted, and absent "
430 "from your output database. Please look at the "
431 "current error log "
432 "(<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err) to get more "
433 "information on the problems. Here are those "
434 f"genomes:\n{list_to_write}")
435 else:
436 logger.info(f"WARNING: Some genomes could not be formatted. See {logfile}")
437 logger.warning((f"Some genomes were annotated by {soft}, but could not be formatted, "
438 "and are hence absent from your output database. Please look at "
439 "'<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err' and "
440 ".details files to get more information about why they could not be "
441 f"formatted.\n{list_to_write}"))
444def write_genomes_info(genomes, kept_genomes, list_file, res_path, qc=False):
445 """
446 Write the list of genomes discarded to a file (qc=False), so that users can
447 keep a trace of them, with their information (nb contigs, L90 etc.)
449 If qc=True, we stop after QC.
450 -> Write the list of genomes that would be kept for annotation with all
451 their information (L90, size, #contig)
454 Parameters
455 ----------
456 genomes : dict
457 {genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file,
458 genome_size, nb_contigs, L90]}
459 kept_genomes : list
460 list of genomes kept
461 list_file : str
462 path to input file containing the list of genomes
463 res_path : str
464 folder where results must be saved
465 qc : bool
466 * True: called only if QC only. Name this file info-genomes-<list_file>.txt to put
467 information on genomes that would be annotated if not QC only
468 * otherwise (False), called in any case. Name this file discarded-<list_file>.txt
469 and write all discarded genomes, whether sequences kept are next annotated or not
470 => columns: orig_name, to_annotate, gsize, nb_conts, L90
471 """
472 logger = logging.getLogger("utils")
473 # number of genomes discarded
474 nb_disc = len(genomes) - len(kept_genomes)
475 # Log number of genomes discarded.
476 if not qc and nb_disc < 2:
477 logger.info(f"{nb_disc} genome was discarded.")
478 elif not qc:
479 logger.info(f"{nb_disc} genomes were discarded.")
480 # Get input list file name (without path)
481 _, name_lst = os.path.split(list_file)
482 # if not QC, write discarded genomes to a file "discarded-[list_file].lst"
483 if not qc:
484 outdisc = os.path.join(res_path,
485 "discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
486 logger.info("Writing discarded genomes to {}".format(outdisc))
487 # if QC, there is no 'discarded genome', just write information on all analyzed genomes
488 else:
489 outdisc = os.path.join(res_path,
490 "ALL-GENOMES-info-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
491 logger.info("Writing information on genomes in {}".format(outdisc))
492 with open(outdisc, "w") as outdf:
493 outdf.write("\t".join(["orig_name", "to_annotate", "gsize", "nb_conts", "L90"]) + "\n")
494 for genome, values in genomes.items():
495 if genome in kept_genomes:
496 continue
497 _, _, to_annotate, gsize, nbcont, l90 = [str(x) for x in values]
498 to_annotate_file = os.path.basename(to_annotate)
499 outdf.write("\t".join([genome, to_annotate_file, gsize, nbcont, l90]) + "\n")
502def write_lstinfo(list_file, genomes, outdir):
503 """
504 Write lstinfo file, with following columns:
505 gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90
507 Parameters
508 ----------
509 list_file : str
510 input file containing the list of genomes
511 genomes : dict
512 {genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]}
513 outdir : str
514 folder where results must be saved
516 """
517 _, name_lst = os.path.split(list_file)
518 outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
519 with open(outlst, "w") as outf:
520 outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize",
521 "nb_conts", "L90"]) + "\n")
522 for genome, values in sorted(genomes.items(), key=sort_genomes_byname_l90_nbcont):
523 gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values]
524 outf.write("\t".join([gembase, genome, to_annote, gsize, nbcont, l90]) + "\n")
525 return outlst
528def sort_genomes_by_name(x):
529 """
530 order by:
532 - species
533 - in each species, by strain number
535 Parameters
536 ----------
537 x : tuple or str
538 [genome_orig, [gembase, path, gsize, nbcont, L90]] with gembase = species.date.strain
540 Returns
541 -------
542 str
543 variable to take into account for sorting. If format is ESCO.1512.00001 return\
544 ESCO and 00001. Otherwise, just return x itself (sort by alphabetical order)
545 """
546 # get gembase name
547 if isinstance(x, tuple):
548 x = x[1][0]
550 # if format is ESCO.1512.00001 sort by ESCO, then 00001
551 if "." in x and len(x.split(".")) == 3:
552 return x.split(".")[0], int(x.split(".")[-1])
553 # if format is not like this, just return alphabetical order
554 return x,
557def sort_genomes_byname_l90_nbcont(x):
558 """
559 Sort all genomes with the following criteria:
561 - sort by species (x[1][0] is species.date)
562 - for each species, sort by l90
563 - for same l90, sort by nb contigs
565 Parameters
566 ----------
567 x : [[]]
568 [genome_name, [species.date, path, path_to_seq, gsize, nbcont, L90]]
570 Returns
571 -------
572 tuple
573 information on species, l90 and nb_contigs
574 """
575 return x[1][0].split(".")[0], x[1][-1], x[1][-2]
578def sort_genomes_l90_nbcont(x):
579 """
580 Sort all genomes with the following criteria:
582 - for each strain, sort by l90
583 - for same l90, sort by nb contigs
585 Parameters
586 ----------
587 x : [[]]
588 [genome_name, [species.date, path, gsize, nbcont, L90]]
590 Returns
591 -------
592 tuple
593 information on l90 and nb_contigs
594 """
595 return x[1][-1], x[1][-2]
598def sort_proteins(x):
599 """
600 order by:
602 - species
603 - in each species, strain number
604 - in each species and strain number, by protein number
606 Parameters
607 ----------
608 x : str
609 species.date.strain.contig_protnum
611 Returns
612 -------
613 str
614 variable to take into account for sorting. If format is ESCO.1512.00001.i0002_12124,\
615 return ESCO, 00001 and 12124. If not, it must be something_00001:\
616 return something and 00001.
617 """
618 try:
619 # if format is ESCO.1512.00001.i0002_12124, sort by ESCO, then 00001, then 12124
620 if "." in x and len(x.split(".")) >= 3:
621 return x.split(".")[0], int(x.split(".")[2].split("_")[0]), int(x.split("_")[-1])
622 # if format is not like this, it must be something_00001:
623 # sort by 'something' and then 00001
624 return "_".join(x.split("_")[:-1]), int(x.split("_")[-1])
625 except (IndexError, ValueError):
626 logger = logging.getLogger("utils")
627 logger.error(("ERROR: Protein {} does not have the required format. "
628 "It must contain, at least <alpha-num_only>_<num_only>, and at best "
629 "<name>.<date>.<strain_num>.<contig_info>_<prot_num>. "
630 "Please change its name.").format(x))
631 sys.exit(1)
634def read_genomes(list_file, name, date, dbpath, tmp_path, logger):
635 """
636 Read list of genomes, and return them.
637 If a genome has a name, also return it. Otherwise, return the name given by user.
639 Check that the given genome file exists in dbpath. Otherwise, put an error message,
640 and ignore this file.
642 Parameters
643 ----------
644 list_file : str
645 input file containing the list of genomes
646 name : str
647 Default species name
648 date : str
649 Default date
650 dbpath : str
651 path to folder containing original genome files
652 tmp_path : str
653 path to folder which will contain the genome files to use before annotation, if\
654 needed to change them from original file (for example, merging several contig files\
655 in one file, split at each stretch of 5 'N', etc.).
657 Returns
658 -------
659 dict
660 {genome: spegenus.date} spegenus.date = name.date
661 """
662 logger.info("Reading genomes")
663 genomes = {}
664 # Check that given list file exists
665 if not os.path.isfile(list_file):
666 logger.error(("ERROR: Your list file '{}' does not exist. "
667 "Please provide a list file.\n Ending program.").format(list_file))
668 sys.exit(1)
669 # List file exists: open it and read it
670 with open(list_file, "r") as lff:
671 for line in lff:
672 line = line.strip()
673 # empty line: go to the next one
674 if line == "":
675 continue
676 # If separator ::, look for species and/or date
677 if "::" in line:
678 # genomes_inf = genome filename(s) separated by space
679 # name_inf = <name>.<date>
680 genomes_inf, name_inf = line.split("::")
681 genomes_inf = genomes_inf.strip()
682 cur_name, cur_date = read_info(name_inf, name, date, genomes_inf)
683 # If no separator '::', no information on name and date of genome: use default ones
684 # Line only contains genome name (filename in given db_path)
685 else:
686 genomes_inf = line.strip()
687 cur_name = name
688 cur_date = date
689 # If several file names, check that each one exists, and concatenate the existing files
690 genomes_inf = genomes_inf.split()
691 if len(genomes_inf) > 1:
692 to_concat = []
693 for file in genomes_inf:
694 if os.path.isfile(os.path.join(dbpath, file)):
695 to_concat.append(file)
696 else:
697 logger.warning(("{} genome file does not exist. Its file will be "
698 "ignored when concatenating {}").format(file, genomes_inf))
699 # If there are files to concatenate, concatenate them
700 if to_concat:
701 genome_name = to_concat[0] + "-all.fna"
702 concat_file = os.path.join(tmp_path, genome_name)
703 to_concat = [os.path.join(dbpath, gname) for gname in to_concat]
704 # Put all genomes listed in 'to_concat' into a same file named 'concat_file'
705 cat(to_concat, concat_file)
706 else:
707 # No genome file listed exists. No sequence = genome ignored
708 logger.warning(("None of the genome files in {} exist. "
709 "This genome will be ignored.").format(genomes_inf))
710 genome_name = ""
711 # If only 1 sequence file, check that it exists, and take its name
712 else:
713 # Genome file given does not exist
714 if not os.path.isfile(os.path.join(dbpath, genomes_inf[0])):
715 logger.warning(("{} genome file does not exist. "
716 "It will be ignored.").format(genomes_inf[0]))
717 genome_name = ""
718 # Genome file exists: get genome_name (from information in first field)
719 else:
720 genome_name = genomes_inf[0]
721 # If there is a genome file (concatenated from several, or already existing), get the full name (with date)
722 if genome_name != "":
723 genomes[genome_name] = [cur_name + "." + cur_date]
724 return genomes
727def read_genomes_info(list_file, name, date=None, logger=None):
728 """
729 Read a lstinfo file containing the list of genomes with information (L90, genome size etc.).
730 1 line per genome, 4 required columns (Others will just be ignored):
731 to_annotate gsize nb_conts L90
733 Check that the given genome file (to_annotate column) exists.
735 Parameters
736 ----------
737 list_file : str
738 input file containing information on genomes (to_annotate, size, L90, nb_contigs)
739 name : str
740 Default species name
741 date : str
742 Default date
743 logger : logging.Logger
744 logger object to write log information
747 Returns
748 -------
749 dict
750 genomes = {genome:
751 [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
752 }
753 """
754 if not logger:
755 logger = logging.getLogger("prepare.utils")
756 logger.info(f"Reading given information on your genomes in {list_file}")
757 genomes = {}
758 if name and date:
759 spegenus = f"{name}.{date}"
760 column_order = {} # Put the number of column corresponding to each field
761 if not os.path.isfile(list_file):
762 logger.error(f"ERROR: The info file {list_file} that you gave does not exist. "
763 "Please provide the right path/name for this file.\nEnding program.")
764 sys.exit(1)
765 message_no_header = (f"ERROR: It seems that your info file {list_file} does not have a "
766 "header, or this header does not have, at least, the required "
767 "columns tab separated: to_annotate, gsize nb_conts and L90 (in any "
768 "order).\nEnding program.")
769 with open(list_file, "r") as lff:
770 for line in lff:
771 line = line.strip()
772 # Ignore empty lines
773 if line == "":
774 continue
775 # Header line: Just get column number corresponding to each field
776 if "to_annotate" in line:
777 column_headers = line.split("\t")
778 column_order = {header:num for num,header in enumerate(column_headers)}
779 found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"]
780 if head in column_order]
781 if len(found) != 4:
782 logger.error(message_no_header)
783 sys.exit(1)
784 # If no header found, error message and exit
785 if not column_order:
786 logger.error(message_no_header)
787 sys.exit(1)
788 # Get all information on the given genome
789 # line.strip().split() -> all given information.
790 # So, at least to_annotate, gsize, nbcont, L90
791 # Get numeric information
792 try:
793 infos = line.strip().split()
794 # Get genome name with its path to db_dir
795 gpath = infos[column_order["to_annotate"]]
796 gfile = os.path.basename(gpath)
797 gname = os.path.splitext(gfile)[0]
798 gsize = int(infos[column_order["gsize"]])
799 gl90 = int(infos[column_order["L90"]])
800 gcont = int(infos[column_order["nb_conts"]])
801 # If invalid values, warning message and ignore genome
802 except ValueError:
803 logger.warning(f"For genome {gname}, at least one of your columns 'gsize', "
804 "'nb_conts' or 'L90' contains a non numeric value. "
805 "This genome will be ignored.")
806 continue
807 # If no value for at least 1 field, warning message and ignore genome
808 except IndexError:
809 logger.error(f"ERROR: Check that all fields of {list_file} are filled in each "
810 "line (can be 'NA')")
811 sys.exit(1)
812 # Could we find genome file?
813 # Check if genome file exists in db_path.
814 if not os.path.isfile(gpath):
815 logger.warning(f"{gpath} genome file does not exist. This genome will be ignored.")
816 continue
817 # cur genome information to save:
818 # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]
819 if name and date:
820 genomes[gpath] = [spegenus, gpath, gpath, gsize, gcont, gl90]
821 # If called from prepare, no need to rename genomes
822 else:
823 gfile = os.path.basename(gpath)
824 gname = os.path.splitext(gfile)[0]
825 genomes[gfile] = [gname, gpath, gpath, gsize, gcont, gl90]
826 if len(genomes) > 0:
827 logger.info(("Found {} genomes in total").format(len(genomes)))
828 else:
829 logger.error(f"No genome listed in {list_file} was found.")
830 sys.exit(1)
831 return genomes
834def read_info(name_inf, name, date, genomes_inf):
835 """
836 From the given information in 'name_inf', check if there is a name (and if its
837 format is ok) and if there is a date (and if its format is ok).
838 If no name (resp. no date), return default name (resp. default date).
840 Parameters
841 ----------
842 name_inf : str
843 information on current genome, which could contain a species name and a date
844 name : str
845 default species name
846 date : str
847 default date
848 genomes_inf : str
849 current genome filename. Used to complete information when there is a warning (species\
850 name or date given not in the right format...)
852 Returns
853 -------
854 (cur_name, cur_date) : tuple
855 with:
857 - curname: name to use for this genome (can be the default one, or the one read from\
858 'name_inf'
859 - curdate: date to use for this genome (default or read from 'name_inf')
860 """
861 logger = logging.getLogger("utils")
862 # information on current genome, which could contain a species name and a date: split to get
863 # name and date
864 name_inf = name_inf.strip().split(".")
865 # if only 1 field provided (means only name is provided).
866 # (Otherwise, if only date is provided, it is with '.DATE', so
867 # len(name_inf) = 2 with name_inf[0] = "")
868 if len(name_inf) == 1:
869 # No genome name: use default one
870 if name_inf[0] == "":
871 cur_name = name
872 # Genome name, and in right format: use it
873 elif check_format(name_inf[0]):
874 cur_name = name_inf[0]
875 # Genome name but wrong format: warning message and use default one
876 else:
877 logger.warning(("Invalid name {} given for genome {}. Only put "
878 "4 alphanumeric characters in your date and name. "
879 "For this genome, the default name ({}) will be "
880 "used.").format(name_inf[0], genomes_inf, name))
881 cur_name = name
882 # In any case, no given date: use default date
883 cur_date = date
885 # More than 2 informations (more than 2 fields separated by a '.'):
886 # - either user put more information
887 # - either user put a '.' inside the name or date field: wrong format
888 # -> warning message and use default date and name.
889 elif len(name_inf) > 2:
890 logger.warning(("Invalid name/date given for genome {}. Only put "
891 "4 alphanumeric characters in your date and name. For "
892 "this genome, the default name ({}) and date ({}) will "
893 "be used.").format(genomes_inf, name, date))
894 cur_name = name
895 cur_date = date
896 # information on name and date given (can be empty)
897 else:
898 cur_name, cur_date = name_inf
899 # name given empty -> use default name
900 if cur_name == "":
901 cur_name = name
902 # date given empty -> use default date
903 if cur_date == "":
904 cur_date = date
905 # name given not empty but wrong format -> warning message and default name
906 if not check_format(cur_name):
907 logger.warning(("Invalid name {} given for genome {}. Only put "
908 "4 alphanumeric characters in your date and name. "
909 "For this genome, the default name ({}) "
910 "will be used.").format(cur_name, genomes_inf, name))
911 cur_name = name
912 # date given not empty but wrong format -> warning message and default date
913 if not check_format(cur_date):
914 logger.warning(("Invalid date {} given for genome {}. Only put "
915 "4 alphanumeric characters in your date and name. "
916 "For this genome, the default date ({}) "
917 "will be used.").format(cur_date, genomes_inf, date))
918 cur_date = date
919 return cur_name, cur_date
922def cat(list_files, output, title=None):
923 """
924 Equivalent of 'cat' unix command.
926 Concatenate all files in 'list_files' and save result in 'output' folder.
927 Concat using shutil.copyfileobj, in order to copy by chunks, to
928 avoid memory problems if files are big.
930 Parameters
931 ----------
932 list_files : list
933 list of filenames to concatenate
934 output : str
935 output filename, where all concatenated files will be written
936 title : str or None
937 if you want to show a progressbar while concatenating files, add a title for this
938 progressbar here. If no title, nothing will be shown during concatenation.
940 """
941 bar = None
942 curnum = None
943 if title:
944 nbfiles = len(list_files)
945 widgets = [title + ': ', progressbar.Bar(marker='█', left='', right='', fill=' '),
946 ' ', progressbar.Counter(), f"/{nbfiles}" ' (',
947 progressbar.Percentage(), ") - ", progressbar.Timer()]
948 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfiles, term_width=79).start()
949 curnum = 1
950 with open(output, "w") as outf:
951 for file in list_files:
952 if title:
953 bar.update(curnum)
954 curnum += 1
955 with open(file, "r") as inf:
956 shutil.copyfileobj(inf, outf)
957 if title:
958 bar.finish()
961def grep(filein, pattern, counts=False):
962 """
963 Equivalent of 'grep' unix command
965 By default, returns all the lines containing the given pattern.
966 If counts = True, returns the number of lines containing the pattern.
968 Parameters
969 ----------
970 filein : str
971 path to the file in which pattern must be searched
972 pattern : str
973 pattern to search
974 counts : bool
975 True if you want to count how many lines have the pattern and return this number,\
976 False if you want to return all lines containing the pattern.
978 Returns
979 -------
980 list or int
981 list of lines if counts=False; number of lines if counts=True
982 """
983 num = 0
984 lines = []
985 with open(filein, "r") as inf:
986 for line in inf:
987 if re.search(pattern, line):
988 lines.append(line.strip())
989 num += 1
990 if counts:
991 return num
992 else:
993 return lines
996def count(filein, get="lines"):
997 """
998 Similar to 'wc' unix command.
1000 Count the number of what is given in 'get'. It can be:
1002 - lines (default)
1003 - words
1005 Parameters
1006 ----------
1007 filein : str
1008 path to the file for which we want to count lines or words
1009 get : ["lines", "words"]
1010 either lines to count the number of lines in the file, or words to count the number\
1011 of words.
1013 Returns
1014 -------
1015 int
1016 Number of lines or words according to value of 'get' parameter.
1017 """
1018 gets = ["lines", "words"]
1019 if get not in gets:
1020 logger = logging.getLogger("utils")
1021 logger.error("Choose what you want to count among {}.".format(gets))
1022 sys.exit(1)
1023 num = 0
1024 with open(filein, "r") as inf:
1025 for line in inf:
1026 if get == "lines":
1027 num += 1
1028 elif get == "words":
1029 num += len(line.split())
1030 return num
1033def check_format(info):
1034 """
1035 Check that the given information (can be the genomes name or the date) is in the right
1036 format: it should have 4 characters, all alphanumeric.
1038 Parameters
1039 ----------
1040 info : str
1041 information to check
1043 Returns
1044 -------
1045 bool
1046 True if right format, False otherwise
1047 """
1048 if len(info) != 4:
1049 return False
1050 return info.isalnum()
1053def check_out_dirs(resdir):
1054 """
1055 Check that there is no file in:
1057 - resdir/LSTINFO
1058 - resdir/Genes
1059 - resdir/Proteins
1060 - resdir/Replicons
1061 - resdir/gff3
1063 Parameters
1064 ----------
1065 resdir : str
1066 path to result directory
1068 """
1069 logger = logging.getLogger("utils")
1070 if glob.glob(os.path.join(resdir, "LSTINFO", "*.lst")):
1071 logger.error("ERROR: Your output directory already has .lst files in the "
1072 "LSTINFO folder. Provide another result directory, or remove the "
1073 "files in this one.\nEnding program.")
1074 sys.exit(1)
1075 if glob.glob(os.path.join(resdir, "Proteins", "*.prt")):
1076 logger.error("ERROR: Your output directory already has .prt files in the "
1077 "Proteins folder. Provide another result directory, or remove the "
1078 "files in this one.\nEnding program.")
1079 sys.exit(1)
1080 if glob.glob(os.path.join(resdir, "Genes", "*.gen")):
1081 logger.error("ERROR: Your output directory already has .gen files in the "
1082 "Genes folder. Provide another result directory, or remove the "
1083 "files in this one.\nEnding program.")
1084 sys.exit(1)
1085 if glob.glob(os.path.join(resdir, "Replicons", "*.fna")):
1086 logger.error("ERROR: Your output directory already has .fna files in the "
1087 "Replicons folder. Provide another result directory, or remove the "
1088 "files in this one.\nEnding program.")
1089 sys.exit(1)
1090 if glob.glob(os.path.join(resdir, "gff3", "*.gff")):
1091 logger.error("ERROR: Your output directory already has .gff files in the "
1092 "gff3 folder. Provide another result directory, or remove the "
1093 "files in this one.\nEnding program.")
1094 sys.exit(1)
1097def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger):
1098 """
1099 For the given genome (sequence in gpath), rename all its contigs
1100 with the new name: 'gembase_name', and save the output sequence in outfile.
1102 For each contig renamed, save its new name as well as its size. This will be used to generate
1103 gff files
1105 Parameters
1106 ----------
1107 gembase_name : str
1108 genome name to use (species.date.strain)
1109 gpath : str
1110 path to the genome sequence
1111 outfile : str
1112 path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
1114 Returns
1115 -------
1116 tuple
1117 - Dict of all contigs with their original and new name: (list of str)
1118 {>orig_name: >new_name}
1119 - Dict of all contigs with their size: (list of str)
1120 {"new_name': 'size1"}
1121 """
1122 # Initialize variables
1124 # Contig number
1125 contig_num = 1
1126 # contig size
1127 cont_size = 0
1128 # List of contigs (str) [<name>\t<orig_name>]
1129 contigs = {}
1130 # List of contigs (str) with their sizes [<name>\t<size>]
1131 sizes = {}
1132 # Name of previous contig (to put to contigs, as we need to wait for the next
1133 # contig to know the size of the previous one)
1134 prev_cont = ""
1135 prev_orig_name = ""
1136 # sequence of previous contig
1137 seq = ""
1139 # Read input sequence given to prodigal, and open file where sequences with new
1140 # headers must be written.
1141 with open(gpath, "r") as gpf, open(outfile, "w") as grf:
1142 for line in gpf:
1143 # When we find a new header line, convert its name to gembase format, and write it
1144 # to output replicon file
1145 if line.startswith(">") :
1146 # If not first contig (contigs not empty):
1147 # - add its name as well as its size to contigs list
1148 # - add its name with its original name to
1149 # - write header ("<contig name> <size>") to replicon file
1150 if prev_cont:
1151 cont = " ".join([prev_cont, str(cont_size)]) + "\n"
1152 prevcont_nohead = prev_cont.split(">")[1]
1153 prev_orig_name_nohead = prev_orig_name.split(">")[1]
1154 if prev_orig_name_nohead:
1155 if prev_orig_name_nohead in contigs:
1156 logger.error(f"several contigs have the same name "
1157 f"{prev_orig_name_nohead} in {gpath}.")
1158 return False, False
1159 sizes[prevcont_nohead] = cont_size
1160 contigs[prev_orig_name_nohead] = prevcont_nohead
1161 grf.write(cont)
1162 grf.write(seq)
1163 prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)
1164 # keep only first string of contig
1165 prev_orig_name = line.strip().split()[0]
1166 contig_num += 1
1167 cont_size = 0
1168 seq = ""
1169 # Sequence line: write it as is in replicon file, and add its size to cont_size.
1170 else:
1171 seq += line
1172 cont_size += len(line.strip())
1173 # Write last contig, if there is one (if gpath not empty)
1174 if prev_cont:
1175 cont = " ".join([prev_cont, str(cont_size)]) + "\n"
1176 prevcont_nohead = "".join(prev_cont.split(">")[1:])
1177 prev_orig_name_nohead = prev_orig_name.split(">")[1]
1178 if prev_orig_name_nohead:
1179 if prev_orig_name_nohead in contigs:
1180 logger.error(f"several contigs have the same name {prev_orig_name_nohead} "
1181 f"in {gpath}.")
1182 return False, False
1183 contigs[prev_orig_name_nohead] = prevcont_nohead
1184 sizes[prevcont_nohead] = cont_size
1185 grf.write(cont)
1186 grf.write(seq)
1187 if not contigs:
1188 logger.error(f"Your genome {gpath} does not contain any sequence, "
1189 "or is not in fasta format.")
1190 return contigs, sizes
1191# Add test with empty gpath
1192# Add test with non fasta gpath
1195def logger_thread(q):
1196 """
1197 Queue listener used in a thread to handle the logs put to a QueueHandler
1198 by several processes (multiprocessing.pool.map_async for example)
1200 Parameters
1201 ----------
1202 q : multiprocessing.managers.AutoProxy[Queue]
1203 queue to listen
1205 """
1206 while True:
1207 record = q.get()
1208 if record is None:
1209 break
1210 logger = logging.getLogger(record.name)
1211 logger.handle(record)
1214def detail_lvl():
1215 """
1216 Get the int level corresponding to "DETAIL"
1218 Returns
1219 -------
1220 int
1221 int corresponding to the level "DETAIL"
1222 """
1223 return 15
1226def save_bin(objects, fileout):
1227 """
1228 Save python 'objects' in a binary file called 'fileout'
1230 Parameters
1231 ----------
1232 objects : Object
1233 python object to save
1234 fileout : str
1235 path to binary file where objects must be saved
1237 """
1238 with open(fileout, "wb") as binf:
1239 pickle.dump(objects, binf)
1242def load_bin(binfile):
1243 """
1244 Unpickle python objects from the binary file 'binfile'
1246 Parameters
1247 ----------
1248 binfile : str
1249 path to binary file containing python object
1251 Returns
1252 -------
1253 Object
1254 The python objects unpickled
1256 """
1257 with open(binfile, "rb") as binf:
1258 objects = pickle.load(binf)
1259 return objects
1262def write_list(list_names, fileout):
1263 """
1264 Write the given list of strings to a file, 1 per line
1265 """
1266 with open(fileout, "w") as fo:
1267 for genome in list_names:
1268 fo.write(str(genome) + "\n")
1271def list_to_str(list, sep='\t'):
1272 """
1273 Return a string corresponding to the given list, with all elements separated
1274 by a space. Used to write a list into a file. Ex::
1276 [1, 2, "toto"] -> "1 2 toto"
1278 Parameters
1279 ----------
1280 list : list
1281 list of elements that we would like to write
1282 sep : str
1283 Separator to use between the different elements
1285 Returns
1286 -------
1287 str
1288 the string to write
1289 """
1290 list_write = [str(l) for l in list]
1291 return sep.join(list_write) + "\n"
1294def remove(infile):
1295 """
1296 Remove the given file if it exists
1298 Parameters
1299 ----------
1300 infile : str
1301 path to file to remove
1303 """
1304 if os.path.isfile(infile):
1305 os.remove(infile)
1308def thread_progressbar(widgets, stop):
1309 """
1310 Thread running an "inifite" progress bar, while the main thread is working.
1311 Once this progressbar has to stop, we send a signal.
1313 Parameters
1314 ----------
1315 widgets : list
1316 list of widgets to put in the progressbar
1317 stop : function
1318 function returning False when thread can run, True when it has to stop.
1319 """
1320 if widgets:
1321 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)
1322 while True:
1323 bar.update()
1324 if stop():
1325 print()
1326 break