Coverage for PanACoTA/subcommands/corepers.py: 100%
87 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37corepers is a subcommand of PanACoTA
39Generate a core genome (families containing 1 member in all genomes of the dataset)
40or a persistent genome (families with a given % of genomes having exactly 1 member).
41You can also allow:
43- mixed families: exactly 1 member in the given percentage of genomes, but the other genomes
44 can contain 0 or several members
45- multi families: allow several members in any genome.
48@author gem
49June 2017
50"""
52import os
54import sys
57def main_from_parse(args):
58 """
59 Call main function from the arguments given by parser
61 Parameters
62 ----------
63 args : argparse.Namespace
64 result of argparse parsing of all arguments in command line
65 """
66 cmd = "PanACoTA " + ' '.join(args.argv)
67 main(cmd, args.pangenome, args.tol, args.multi, args.mixed, args.outputdir,
68 args.lstinfo_file, args.floor, args.verbose, args.quiet)
71def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor, verbose, quiet):
72 """
73 Read pangenome and deduce Persistent genome according to the user criteria
75 Parameters
76 ----------
77 pangenome : str
78 file containing pangenome
79 tol : float
80 min % of genomes present in a family to consider it as persistent (between 0 and 1)
81 multi : bool
82 True if multigenic families are allowed, False otherwise
83 mixed : bool
84 True if mixed families are allowed, False otherwise
85 outputdir : str or None
86 Specific directory for the generated persistent genome. If not given, pangenome directory is used.
87 lstinfo_file : str
88 list of genomes to include in the core/persistent genome. If not given, include all genomes of pan
89 floor : bool
90 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False
91 verbose : int
92 verbosity:
93 - defaut 0 : stdout contains INFO, stderr contains ERROR.
94 - 1: stdout contains INFO, stderr contains WARNING and ERROR
95 - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
96 - >=15: Add DEBUG in stdout
97 quiet : bool
98 True if nothing must be sent to stdout/stderr, False otherwise
99 """
100 # import needed packages
101 import logging
102 from PanACoTA import utils
103 from PanACoTA import utils_pangenome as utilsp
104 import PanACoTA.corepers_module.persistent_functions as pers
105 from PanACoTA import __version__ as version
107 # get pangenome name info
108 _, base_pan = os.path.split(pangenome)
109 if lstinfo_file:
110 _, base_lst = os.path.split(lstinfo_file)
111 else:
112 base_lst = "all"
113 # Define output filename
114 output_name = f"PersGenome_{base_pan}-{base_lst}_"
115 if floor:
116 output_name += "F"
117 output_name += str(tol)
118 if multi:
119 output_name += "-multi.lst"
120 elif mixed:
121 output_name += "-mixed.lst"
122 else:
123 output_name += ".lst"
124 # Define output directory and filename path
125 if not os.path.isdir(outputdir):
126 os.makedirs(outputdir)
127 outputfile = os.path.join(outputdir, output_name)
128 logfile_base = os.path.join(outputdir, "PanACoTA-corepers")
129 # level is the minimum level that will be considered.
130 # for verbose = 0 or 1, ignore details and debug, start from info
131 if verbose <= 1:
132 level = logging.INFO
133 # for verbose = 2, ignore only debug
134 if verbose >= 2 and verbose < 15:
135 level = 15 # int corresponding to detail level
136 # for verbose >= 15, write everything
137 if verbose >= 15:
138 level = logging.DEBUG
139 utils.init_logger(logfile_base, level, 'corepers', verbose=verbose, quiet=quiet)
140 logger = logging.getLogger("corepers")
141 logger.info(f'PanACoTA version {version}')
142 logger.info("Command used\n \t > " + cmd)
144 logger.info(get_info(tol, multi, mixed, floor))
146 # Read pangenome
147 fams_by_strain, families, all_strains = utilsp.read_pangenome(pangenome, logger)
148 # If list of genomes given, get subset of previous dicts, including only the genomes aksed
149 if lstinfo_file:
150 fams_by_strain, families, all_strains = pers.get_subset_genomes(fams_by_strain, families, lstinfo_file)
151 # Generate persistent genome
152 fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol, multi, mixed, floor)
153 # Write persistent genome to file
154 pers.write_persistent(fams, outputfile)
155 logger.info("Persistent genome step done.")
156 return outputfile
159def get_info(tol, multi, mixed, floor):
160 """
161 Get a string corresponding to the information that will be given to logger.
163 Parameters
164 ----------
165 tol : float
166 min % of genomes present in a family to consider it as persistent (between 0 and 1)
167 multi : bool
168 True if multigenic families are allowed, False otherwise
169 mixed : bool
170 True if mixed families are allowed, False otherwise
171 floor : bool
172 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False
174 Returns
175 -------
176 str
177 Information to give to logger
178 """
179 if tol == 1:
180 return "Will generate a CoreGenome."
181 else:
182 if floor:
183 floorstr = "floor"
184 else:
185 floorstr = "ceil"
186 toprint = (f"Will generate a Persistent genome with member(s) in at least {100*tol}"
187 f"% of all genomes in each family.\n")
188 if multi:
189 toprint += ("Multigenic families are allowed (several members in "
190 "any genome of a family).")
191 elif mixed:
192 toprint += ("Mixed families are allowed. To be considered as persistent, "
193 f"a family must have exactly 1 member in {tol*100}% of the genomes, "
194 f"but in the remaining {round((1-tol)*100,3)}% genomes, there can be 0, 1 or "
195 "several members.")
196 else:
197 toprint += ("To be considered as persistent, a family must contain exactly 1 member "
198 f"in at least {tol*100}% of all genomes. The other genomes are absent from the "
199 "family.")
200 return toprint
203def build_parser(parser):
204 """
205 Method to create a parser for command-line options
207 Parameters
208 ----------
209 parser : argparse.ArgumentParser
210 parser to configure in order to extract command-line arguments
211 """
212 import argparse
213 from PanACoTA import utils_argparse
215 # Create command-line parser for all options and arguments to give
216 required = parser.add_argument_group('Required arguments')
217 required.add_argument("-p", dest="pangenome", required=True,
218 help="PanGenome file (1 line per family, first column is fam number)")
219 required.add_argument("-o", dest="outputdir", required=True,
220 help=("Specify the output directory for your core/persistent genome."),
221 default=".")
222 optional = parser.add_argument_group('Optional arguments')
223 optional.add_argument("-t", "--tol", dest="tol", default=1, type=utils_argparse.percentage,
224 help=("min %% of genomes having at least 1 member in a family to "
225 "consider the family as persistent (between 0 and 1, "
226 "default is 1 = 100%% of genomes = Core genome)."
227 "By default, the minimum number of genomes will be "
228 "ceil('tol'*N) (N being the total number of genomes). If "
229 "you want to use floor('tol'*N) instead, add the '-F' option."))
230 optional.add_argument("-M", dest="multi", action='store_true',
231 help=("Add this option if you allow several members in any genome "
232 "of a family. By default, only 1 (or 0 if tol<1) member "
233 "per genome are allowed in all genomes. If you want to allow "
234 "exactly 1 member in 'tol'%% of the genomes, and 0, 1 "
235 "or several members in the '1-tol'%% left, use the option -X "
236 "instead of this one: -M and -X options are not compatible."))
237 optional.add_argument("-X", dest="mixed", action='store_true',
238 help="Add this option if you want to allow families having several "
239 "members only in '1-tol'%% of the genomes. In the other genomes, "
240 "only 1 member exactly is allowed. This option is not compatible "
241 "with -M (which is allowing multigenic families: having several "
242 "members in any number of genomes).")
243 optional.add_argument("-F", dest="floor", action="store_true",
244 help="When you specify the '-tol' option, with a number lower "
245 "than 1, you can add this option to use floor('tol'*N) "
246 "as a minimum number of genomes instead of ceil('tol'*N) "
247 "which is the default behavior.")
248 optional.add_argument("-l", dest="lstinfo_file",
249 help=("By default, the core/persistent genome will include all genomes "
250 "found in the given pangenome file. If you want to do a core/persistent "
251 "genome on a subset of those genomes, give a file with this "
252 "list of genomes. This file must have 1 line per genome, only the first column "
253 "(genome name without extension) will be used."))
255 helper = parser.add_argument_group('Others')
256 helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
257 help="Increase verbosity in stdout/stderr.")
258 helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
259 help=("Do not display anything to stdout/stderr. log files will "
260 "still be created."))
261 helper.add_argument("-h", "--help", dest="help", action="help",
262 help="show this help message and exit")
265def check_args(parser, args):
266 """
267 Check that arguments given to parser are as expected.
269 Parameters
270 ----------
271 parser : argparse.ArgumentParser
272 The parser used to parse command-line
273 args : argparse.Namespace
274 Parsed arguments
276 Returns
277 -------
278 argparse.Namespace or None
279 The arguments parsed, updated according to some rules. Exit program
280 with error message if error occurs with arguments given.
281 """
282 if args.multi and args.mixed:
283 parser.error("-M and -X options cannot be activated together. Choose if you want to:\n"
284 "- allow several members in any number of genomes of a family (-M)\n"
285 "- allow several members in only '1-tol'% of the genomes of a family "
286 "(other 'tol'% genomes must have exactly 1 member) (-X)")
287 if args.mixed and args.tol == 1:
288 parser.error("You are asking for mixed families, while asking for 100% of the genomes of "
289 "a family to have exactly one member, which is not compatible. Do you want "
290 "to \n- lower the percentage of genomes required to have exactly "
291 "1 member (-t tol)\n- not allow mixed families (remove -X option)")
292 if args.floor and args.tol == 1:
293 parser.error("You are asking to use floor('tol'*N) as a minimum number of genomes "
294 "present in a family, but with 'tol'=1: the minimum number of genomes "
295 "will always be equal to N, using floor or the default ceil! Either "
296 "use a 'tol' lower than 1, or remove the '-F' option.")
297 return args
300def parse(parser, argu):
301 """
302 Parse arguments given to parser
304 Parameters
305 ----------
306 parser : argparse.ArgumentParser
307 the parser used
308 argu : [str]
309 command-line given by user, to parse using parser
311 Returns
312 -------
313 argparse.Namespace
314 Parsed arguments
315 """
316 args = parser.parse_args(argu)
317 return check_args(parser, args)
320if __name__ == '__main__':
321 import argparse
322 myparser = argparse.ArgumentParser(description="Compute core or persistent genome",
323 add_help=False)
324 build_parser(myparser)
325 OPTIONS = parse(myparser, sys.argv[1:])
326 main_from_parse(OPTIONS)