Coverage for PanACoTA/subcommands/align.py: 100%
59 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37align is a subcommand of PanACoTA
40@author gem
41June 2017
42"""
44import os
45import sys
def main_from_parse(args):
    """
    Unpack the parsed command-line options and forward them to ``main``.

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line. Besides the
        options declared in ``build_parser``, it must carry an ``argv`` attribute
        (list of str): ``build_parser`` does not define it, so the caller is expected
        to attach it before calling this function.
    """
    # Rebuild the command line as text; main() only writes it to the log
    command = "PanACoTA " + ' '.join(args.argv)
    main(cmd=command,
         corepers=args.corepers,
         list_genomes=args.list_genomes,
         dname=args.dataset_name,
         dbpath=args.dbpath,
         outdir=args.outdir,
         prot_ali=args.prot_ali,
         threads=args.threads,
         force=args.force,
         verbose=args.verbose,
         quiet=args.quiet)
def main(cmd, corepers, list_genomes, dname, dbpath, outdir, prot_ali, threads, force, verbose=0,
         quiet=False):
    """
    Align the families of a core/persistent genome, then post-process the alignments.

    Steps: check that mafft is available, prepare the output directory and the logger,
    split the families per genome, extract the sequences, align every family and
    finally post-process the alignments.

    Parameters
    ----------
    cmd : str
        command line used to launch the program. Only written to the log.
    corepers : str
        File containing persistent genome families
    list_genomes : str
        File containing the list of all genomes in the dataset. Only first column is
        considered.
    dname : str
        Dataset name, used to name output files (including the log file)
    dbpath : str
        path to the directory containing 'Proteins' and 'Genes' folders
    outdir : str
        path to the directory where output files must be saved. Created if missing.
    prot_ali : bool
        Also give aa alignment of concatenation of persistent proteins
    threads : int
        Max number of threads to use
    force : bool
        If True and ``outdir`` already exists, it is removed entirely before running.
    verbose : int
        verbosity, used to choose the minimum level handled by the logger:

        - 0 or 1: INFO
        - from 2 to 14: 15 (the 'detail' level)
        - >=15: DEBUG

        The value is also forwarded to ``utils.init_logger``.
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    value returned by ``post_align.post_alignment``
        presumably the path to the final alignment file -- TODO confirm in post_align

    Notes
    -----
    Exits with status 1 if mafft is not found, or if at least one family alignment
    failed.
    """
    # Needed packages are imported here rather than at module level
    import logging
    import shutil
    from PanACoTA import utils
    from PanACoTA.align_module import pan_to_pergenome as p2g
    from PanACoTA.align_module import get_seqs as gseqs
    from PanACoTA.align_module import alignment as ali
    from PanACoTA.align_module import post_align as post
    from PanACoTA import __version__ as version

    # mafft is the aligner used by this module: nothing can be done without it
    if not utils.check_installed("mafft"):  # pragma: no cover
        print("mafft is not installed. 'PanACoTA align' cannot run.")
        sys.exit(1)

    # With 'force', restart from an empty output directory
    if force and os.path.isdir(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir, exist_ok=True)

    # 'level' is the minimum level that the logger will consider.
    # The three conditions are mutually exclusive, hence written as a single chain.
    if verbose >= 15:
        # write everything
        level = logging.DEBUG
    elif verbose >= 2:
        # ignore only debug messages
        level = 15  # int corresponding to detail level
    elif verbose <= 1:
        # ignore details and debug, start from info
        level = logging.INFO

    # Base name of the log files, located in the output directory
    log_prefix = os.path.join(outdir, "PanACoTA-align_" + dname)
    utils.init_logger(log_prefix, level, 'align', log_details=True, verbose=verbose,
                      quiet=quiet)
    logger = logging.getLogger("align")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Split the persistent families per genome
    per_genome = p2g.get_per_genome(corepers, list_genomes, dname, outdir)
    all_genomes, aldir, listdir, fam_nums = per_genome

    # generate required files
    gseqs.get_all_seqs(all_genomes, dname, dbpath, listdir, aldir, fam_nums, quiet)

    # Align all families. Alignment files are all prefixed by <aldir>/<dname>
    prefix = os.path.join(aldir, dname)
    nb_genomes = len(all_genomes)
    aligned_ok = ali.align_all_families(prefix, fam_nums, nb_genomes, dname, quiet, threads)
    if not aligned_ok:
        failure = ("At least one alignment did not run well. See detailed log file for "
                   "more information. Program will stop here, alignments won't be "
                   "grouped by genome.")
        logger.error(failure)
        sys.exit(1)

    # post-process alignment files
    align_file = post.post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali,
                                     quiet)
    logger.info("END")
    return align_file
def build_parser(parser):
    """
    Register all command-line options of the 'align' subcommand on the given parser.

    Options are organised in three groups: 'Required arguments', 'Optional arguments'
    and 'Others' (verbosity, quiet mode and help).

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments. It is modified
        in place and nothing is returned. As '-h/--help' is registered here, the parser
        is expected to have been created with ``add_help=False``.
    """
    # Only the project-specific argument checker is needed here: all options are added
    # through methods of the parser received as argument.
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-c", dest="corepers", required=True,
                          help="Core or persistent genome whose families must be aligned.")
    required.add_argument("-l", dest="list_genomes", required=True,
                          help=("File containing the list of all the genomes you want "
                                "to align from their core/persistent families. "
                                "1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. The final alignment file will contain "
                                "1 alignment per genome in this file."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be aligned (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the alignment file."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing the directories 'Proteins' "
                                "and 'Genes', created by 'PanACoTA annotate'."))
    required.add_argument("-o", dest="outdir", required=True,
                          help="Output directory, where all results must be saved ")

    optional = parser.add_argument_group('Optional arguments')
    # NOTE(review): thread_num presumably validates/converts the number of threads
    # (see help text about 0 meaning "all threads") -- confirm in utils_argparse
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))
    optional.add_argument("-F", "--force", dest="force", action="store_true",
                          help=("Force run: Add this option if you want to redo all alignments "
                                "for all families, even if their result file already exists. "
                                "Without this option, if an alignment file already exists, "
                                "it will be used for the next step. If you want to redo only "
                                "a given alignment, just delete its file, without using "
                                "this option."))
    optional.add_argument("-P", dest="prot_ali", default=False, action="store_true",
                          help=("Add this option if you also need the aa alignment of the concatenation of "
                                "all persistent proteins. "
                                "By default, PanACoTA only gives the nucleic alignment."))

    helper = parser.add_argument_group('Others')
    # '-v' can be repeated: the number of occurrences is stored in 'verbose'
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")
def parse(parser, argu):
    """
    Run the given parser on a list of command-line tokens.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        the parser used
    argu : [str]
        command-line given by user, to parse using parser

    Returns
    -------
    argparse.Namespace
        Parsed arguments
    """
    parsed = parser.parse_args(argu)
    return parsed
# Entry point used when this module is run directly as a script
if __name__ == '__main__':
    import argparse

    # add_help=False because build_parser registers its own '-h/--help' option
    myparser = argparse.ArgumentParser(description="Align Core/Persistent families",
                                       add_help=False)
    build_parser(myparser)
    OPTIONS = parse(myparser, sys.argv[1:])
    # main_from_parse rebuilds the logged command line from 'argv', which is not an
    # option declared by build_parser: attach it here, otherwise the call below fails
    # with AttributeError. 'align' is prepended so that the logged command reads
    # "PanACoTA align <options>".
    OPTIONS.argv = ["align"] + sys.argv[1:]
    main_from_parse(OPTIONS)