Coverage for PanACoTA/subcommands/pangenome.py: 100%
51 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37pangenome is a subcommand of PanACoTA
40@author gem
41May 2017
42"""
44import sys
45import os
def main_from_parse(args):
    """
    Unpack the parsed command-line arguments and forward them to ``main``.

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line. It must
        also carry an ``argv`` attribute (list of str) holding the raw
        command-line tokens, which are used to rebuild the command written
        to the log.
    """
    # Rebuild the command line as the user typed it, for logging purposes.
    command_line = f"PanACoTA {' '.join(args.argv)}"
    main(
        command_line,
        args.lstinfo_file,
        args.dataset_name,
        args.dbpath,
        args.min_id,
        args.outdir,
        args.clust_mode,
        args.spedir,
        args.threads,
        args.outfile,
        args.verbose,
        args.quiet,
    )
def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
         verbose=0, quiet=False):
    """
    Main method, doing all steps:

    - concatenate all protein files
    - create database as ffindex
    - cluster all proteins
    - convert to pangenome file
    - creating summary and matrix of pangenome

    Parameters
    ----------
    cmd : str
        command line used to launch the program. It is only written to the log.
    lstinfo : str
        file with name of genomes to consider for pan in the first column, without extension.
        Other columns are ignored. The first column header must be 'gembase_name'
    name : str
        name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
    dbpath : str
        path to the folder containing all protein files (files called as the name of genome
        given in lstinfo + ".prt")
    min_id : float
        Minimum percentage of identity between 2 proteins to put them in the same family
    outdir : str
        path to folder which will contain pangenome results and tmp files. It is created
        if it does not exist yet.
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    spe_dir : str or None
        path to the folder where concatenated bank of proteins must be saved.
        None to use the same folder as protein files
    threads : int
        Max number of threads to use
    outfile : str or None
        Name of the pangenome. None to use the default name
    verbose : int
        verbosity, used to choose the minimum logging level:

        - <= 1: INFO
        - from 2 to 14: the 'detail' level given by ``utils.detail_lvl()``
        - >= 15: DEBUG

        It is also forwarded to ``utils.init_logger``, which decides what is sent
        to stdout/stderr.
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    panfile
        second element returned by ``mmf.run_all_pangenome`` (the pangenome file)
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA.pangenome_module import protein_seq_functions as protf
    from PanACoTA.pangenome_module import mmseqs_functions as mmf
    from PanACoTA.pangenome_module import post_treatment as pt
    from PanACoTA import __version__ as version

    # test if mmseqs is installed and in the path
    if not utils.check_installed("mmseqs"):  # pragma: no cover
        print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
        sys.exit(1)

    os.makedirs(outdir, exist_ok=True)

    # level is the minimum level that will be considered.
    # A single if/elif/else chain guarantees that 'level' is always bound,
    # whatever the value of 'verbose'.
    if verbose <= 1:
        # for verbose = 0 or 1, ignore details and debug, start from info
        level = logging.INFO
    elif verbose < 15:
        # for verbose = 2 (up to 14), ignore only debug
        level = utils.detail_lvl()  # int corresponding to detail level
    else:
        # for verbose >= 15, write everything
        level = logging.DEBUG

    # name logfile, add timestamp if already existing
    logfile_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
    utils.init_logger(logfile_base, level, '', verbose=verbose, quiet=quiet, log_details=True)
    logger = logging.getLogger("pangenome")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Build bank with all proteins to include in the pangenome
    prt_path = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
    # Do pangenome
    families, panfile = mmf.run_all_pangenome(min_id, clust_mode, outdir,
                                              prt_path, threads, outfile, quiet)
    # Create matrix pan_quali, pan_quanti and summary file
    pt.post_treat(families, panfile)
    logger.info("DONE")
    return panfile
def build_parser(parser):
    """
    Method to create a parser for command-line options

    Arguments are registered in three groups: 'Required arguments',
    'Optional arguments' and 'Others'. The parser is configured in place;
    nothing is returned.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments
    """
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-l", dest="lstinfo_file", required=True,
                          help=("File containing the list of all genomes to include in "
                                "the pan-genome, 1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. All proteins of all these genomes will be "
                                "concatenated in a file called <dataset_name>.All.prt. The "
                                "column header must be 'gembase_name'."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be clustered (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the protein databank, as well as the "
                                "pangenome files."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing all protein files corresponding "
                                "to the genomes of the dataset (output directory 'Proteins' "
                                "of 'PanACoTA annotate' module)."))
    required.add_argument("-o", dest="outdir", required=True,
                          help=("Output directory, where all results must be saved "
                                "(including tmp folder)"))

    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument("-i", dest="min_id", type=utils_argparse.perc_id, default=0.8,
                          help=("Minimum sequence identity to be considered in the same "
                                "cluster (float between 0 and 1). Default is 0.8."))
    optional.add_argument("-f", dest="outfile",
                          help=("Use this option if you want to give the name of the pangenome "
                                "output file (without path). Otherwise, by default, it is called "
                                "PanGenome-mmseq_<given_dataset_name>.All.prt_<"
                                "information_on_parameters>.lst"))
    optional.add_argument("-c", dest="clust_mode", type=int, choices=[0, 1, 2], default=1,
                          help=("Choose the clustering mode: 0 for 'set cover', 1 for "
                                "'single-linkage', 2 for 'CD-Hit'. Default "
                                "is 'single-linkage' (1)"))
    optional.add_argument("-s", dest="spedir",
                          help=("use this option if you want to save the concatenated protein "
                                "databank in another directory than the one containing all "
                                "individual protein files ('Proteins' folder)."))
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))

    helper = parser.add_argument_group('Others')
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    # Help is registered by hand so that it is listed in the 'Others' group
    # (the script entry point of this module creates its parser with add_help=False).
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")
def parse(parser, argu):
    """
    Run the given parser on a list of command-line tokens.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        Parser to use to parse command-line arguments
    argu : [str]
        command-line given, as a list of tokens (program name excluded)

    Returns
    -------
    argparse.Namespace
        The arguments parsed, exactly as returned by ``parser.parse_args``.
        No extra check or update is done here: invalid arguments are
        reported by argparse itself.
    """
    return parser.parse_args(argu)
if __name__ == '__main__':
    # Script entry point: build a dedicated parser, read the arguments given
    # after the program name, and launch the pangenome computation.
    import argparse

    # add_help=False: build_parser registers its own -h/--help option.
    cli_parser = argparse.ArgumentParser(description="Compute pan-genome", add_help=False)
    build_parser(cli_parser)
    cli_options = parse(cli_parser, sys.argv[1:])
    main_from_parse(cli_options)