Coverage for PanACoTA/subcommands/pangenome.py: 100%

51 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37pangenome is a subcommand of PanACoTA 

38 

39 

40@author gem 

41May 2017 

42""" 

43 

44import sys 

45import os 

46 

47 

def main_from_parse(args):
    """
    Call main function from the arguments given by parser

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line. It must provide
        the attributes lstinfo_file, dataset_name, dbpath, min_id, outdir, clust_mode,
        spedir, threads, outfile, verbose and quiet. It may also provide ``argv``
        (list of str): the command-line words used to rebuild the command that is logged.
        When ``argv`` is missing, ``sys.argv[1:]`` is used instead.
    """
    # The parser configured by build_parser has no 'argv' option, so a namespace coming
    # straight from parse() (as in the __main__ section of this file) does not carry it.
    # Fall back to the real command line instead of failing with AttributeError.
    argv = getattr(args, "argv", None)
    if argv is None:
        argv = sys.argv[1:]
    cmd = "PanACoTA " + ' '.join(argv)
    main(cmd, args.lstinfo_file, args.dataset_name, args.dbpath, args.min_id, args.outdir,
         args.clust_mode, args.spedir, args.threads, args.outfile, args.verbose,
         args.quiet)

61 

62 

def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
         verbose=0, quiet=False):
    """
    Main method, doing all steps:

    - concatenate all protein files
    - create database as ffindex
    - cluster all proteins
    - convert to pangenome file
    - creating summary and matrix of pangenome

    Parameters
    ----------
    cmd : str
        command line used to launch the program. It is only written to the log.
    lstinfo : str
        file with name of genomes to consider for pan in the first column, without extension.
        Other columns are ignored. The first column header must be 'gembase_name'
    name : str
        name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
    dbpath : str
        path to the folder containing all protein files (files called as the name of genome
        given in lstinfo + ".prt")
    min_id : float
        Minimum percentage of identity between 2 proteins to put them in the same family
    outdir : str
        path to folder which will contain pangenome results and tmp files.
        Created if it does not exist yet.
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    spe_dir : str or None
        path to the folder where concatenated bank of proteins must be saved.
        None to use the same folder as protein files
    threads : int
        Max number of threads to use
    outfile : str or None
        Name of the pangenome. None to use the default name
    verbose : int
        verbosity:
        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    panfile
        second value returned by ``mmseqs_functions.run_all_pangenome``
        (presumably the path to the pangenome file -- to confirm against that function)
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA.pangenome_module import protein_seq_functions as protf
    from PanACoTA.pangenome_module import mmseqs_functions as mmf
    from PanACoTA.pangenome_module import post_treatment as pt
    from PanACoTA import __version__ as version

    # test if mmseqs is installed and in the path
    if not utils.check_installed("mmseqs"):  # pragma: no cover
        print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
        sys.exit(1)

    os.makedirs(outdir, exist_ok=True)
    # level is the minimum level that will be considered.
    # A single if/elif/else chain guarantees that 'level' is always defined, whatever the
    # value of 'verbose' (three independent tests could all fail for a non-integer value).
    if verbose >= 15:
        # for verbose >= 15, write everything
        level = logging.DEBUG
    elif verbose >= 2:
        # for 2 <= verbose < 15, ignore only debug
        level = utils.detail_lvl()  # int corresponding to detail level
    else:
        # for verbose = 0 or 1, ignore details and debug, start from info
        level = logging.INFO
    # base name of the log file(s), inside the output directory
    logfile_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
    utils.init_logger(logfile_base, level, '', verbose=verbose, quiet=quiet, log_details=True)
    logger = logging.getLogger("pangenome")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Build bank with all proteins to include in the pangenome
    prt_path = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
    # Do pangenome
    families, panfile = mmf.run_all_pangenome(min_id, clust_mode, outdir,
                                              prt_path, threads, outfile, quiet)
    # Create matrix pan_quali, pan_quanti and summary file
    pt.post_treat(families, panfile)
    logger.info("DONE")
    return panfile

146 

147 

def build_parser(parser):
    """
    Method to create a parser for command-line options

    Adds three argument groups to the given parser: 'Required arguments'
    (-l, -n, -d, -o), 'Optional arguments' (-i, -f, -c, -s, --threads) and
    'Others' (-v, -q, -h). The parser is modified in place; nothing is returned.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments.
        As '-h/--help' is added here, it is expected to have been created
        with ``add_help=False``.
    """
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    # Note the trailing space after "module.": adjacent string literals are concatenated
    # as they are, so every fragment must carry its own separating space.
    required.add_argument("-l", dest="lstinfo_file", required=True,
                          help=("File containing the list of all genomes to include in "
                                "the pan-genome, 1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. All proteins of all these genomes will be "
                                "concatenated in a file called <dataset_name>.All.prt. The "
                                "column header must be 'gembase_name'."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be clustered (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the protein databank, as well as the "
                                "pangenome files."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing all protein files corresponding "
                                "to the genomes of the dataset (output directory 'Proteins' "
                                "of 'PanACoTA annotate' module)."))
    required.add_argument("-o", dest="outdir", required=True,
                          help=("Output directory, where all results must be saved "
                                "(including tmp folder)"))

    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument("-i", dest="min_id", type=utils_argparse.perc_id, default=0.8,
                          help=("Minimum sequence identity to be considered in the same "
                                "cluster (float between 0 and 1). Default is 0.8."))
    optional.add_argument("-f", dest="outfile",
                          help=("Use this option if you want to give the name of the pangenome "
                                "output file (without path). Otherwise, by default, it is called "
                                "PanGenome-mmseq_<given_dataset_name>.All.prt_<"
                                "information_on_parameters>.lst"))
    optional.add_argument("-c", dest="clust_mode", type=int, choices=[0, 1, 2], default=1,
                          help=("Choose the clustering mode: 0 for 'set cover', 1 for "
                                "'single-linkage', 2 for 'CD-Hit'. Default "
                                "is 'single-linkage' (1)"))
    optional.add_argument("-s", dest="spedir",
                          help=("use this option if you want to save the concatenated protein "
                                "databank in another directory than the one containing all "
                                "individual protein files ('Proteins' folder)."))
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))

    helper = parser.add_argument_group('Others')
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")

214 

215 

def parse(parser, argu):
    """
    Run the given parser on a list of command-line words

    Parameters
    ----------
    parser : argparse.ArgumentParser
        Already configured parser, used to interpret the arguments
    argu : [str]
        Command-line words to parse (program name excluded)

    Returns
    -------
    argparse.Namespace
        The parsed arguments, exactly as returned by ``parser.parse_args``.
        If the arguments are invalid, argparse itself reports the error and
        exits the program.
    """
    # No post-processing is needed for this subcommand: hand back the namespace as is.
    return parser.parse_args(argu)

235 

236 

if __name__ == '__main__':
    import argparse

    # Standalone execution of the subcommand: create a bare parser (the help option is
    # added by build_parser, hence add_help=False), configure it, parse the command line
    # without the program name and run the pangenome step with the result.
    cli_parser = argparse.ArgumentParser(description="Compute pan-genome", add_help=False)
    build_parser(cli_parser)
    main_from_parse(parse(cli_parser, sys.argv[1:]))