Coverage for PanACoTA/subcommands/corepers.py: 100%

87 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37corepers is a subcommand of PanACoTA 

38 

39Generate a core genome (families containing 1 member in all genomes of the dataset) 

40or a persistent genome (families with a given % of genomes having exactly 1 member). 

41You can also allow: 

42 

43- mixed families: exactly 1 member in the given percentage of genomes, but the other genomes 

44 can contain 0 or several members 

45- multi families: allow several members in any genome. 

46 

47 

48@author gem 

49June 2017 

50""" 

51 

52import os 

53 

54import sys 

55 

56 

57def main_from_parse(args): 

58 """ 

59 Call main function from the arguments given by parser 

60 

61 Parameters 

62 ---------- 

63 args : argparse.Namespace 

64 result of argparse parsing of all arguments in command line 

65 """ 

66 cmd = "PanACoTA " + ' '.join(args.argv) 

67 main(cmd, args.pangenome, args.tol, args.multi, args.mixed, args.outputdir, 

68 args.lstinfo_file, args.floor, args.verbose, args.quiet) 

69 

70 

71def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor, verbose, quiet): 

72 """ 

73 Read pangenome and deduce Persistent genome according to the user criteria 

74 

75 Parameters 

76 ---------- 

77 pangenome : str 

78 file containing pangenome 

79 tol : float 

80 min % of genomes present in a family to consider it as persistent (between 0 and 1) 

81 multi : bool 

82 True if multigenic families are allowed, False otherwise 

83 mixed : bool 

84 True if mixed families are allowed, False otherwise 

85 outputdir : str or None 

86 Specific directory for the generated persistent genome. If not given, pangenome directory is used. 

87 lstinfo_file : str 

88 list of genomes to include in the core/persistent genome. If not given, include all genomes of pan 

89 floor : bool 

90 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False 

91 verbose : int 

92 verbosity: 

93 - defaut 0 : stdout contains INFO, stderr contains ERROR. 

94 - 1: stdout contains INFO, stderr contains WARNING and ERROR 

95 - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR 

96 - >=15: Add DEBUG in stdout 

97 quiet : bool 

98 True if nothing must be sent to stdout/stderr, False otherwise 

99 """ 

100 # import needed packages 

101 import logging 

102 from PanACoTA import utils 

103 from PanACoTA import utils_pangenome as utilsp 

104 import PanACoTA.corepers_module.persistent_functions as pers 

105 from PanACoTA import __version__ as version 

106 

107 # get pangenome name info 

108 _, base_pan = os.path.split(pangenome) 

109 if lstinfo_file: 

110 _, base_lst = os.path.split(lstinfo_file) 

111 else: 

112 base_lst = "all" 

113 # Define output filename 

114 output_name = f"PersGenome_{base_pan}-{base_lst}_" 

115 if floor: 

116 output_name += "F" 

117 output_name += str(tol) 

118 if multi: 

119 output_name += "-multi.lst" 

120 elif mixed: 

121 output_name += "-mixed.lst" 

122 else: 

123 output_name += ".lst" 

124 # Define output directory and filename path 

125 if not os.path.isdir(outputdir): 

126 os.makedirs(outputdir) 

127 outputfile = os.path.join(outputdir, output_name) 

128 logfile_base = os.path.join(outputdir, "PanACoTA-corepers") 

129 # level is the minimum level that will be considered. 

130 # for verbose = 0 or 1, ignore details and debug, start from info 

131 if verbose <= 1: 

132 level = logging.INFO 

133 # for verbose = 2, ignore only debug 

134 if verbose >= 2 and verbose < 15: 

135 level = 15 # int corresponding to detail level 

136 # for verbose >= 15, write everything 

137 if verbose >= 15: 

138 level = logging.DEBUG 

139 utils.init_logger(logfile_base, level, 'corepers', verbose=verbose, quiet=quiet) 

140 logger = logging.getLogger("corepers") 

141 logger.info(f'PanACoTA version {version}') 

142 logger.info("Command used\n \t > " + cmd) 

143 

144 logger.info(get_info(tol, multi, mixed, floor)) 

145 

146 # Read pangenome 

147 fams_by_strain, families, all_strains = utilsp.read_pangenome(pangenome, logger) 

148 # If list of genomes given, get subset of previous dicts, including only the genomes aksed 

149 if lstinfo_file: 

150 fams_by_strain, families, all_strains = pers.get_subset_genomes(fams_by_strain, families, lstinfo_file) 

151 # Generate persistent genome 

152 fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol, multi, mixed, floor) 

153 # Write persistent genome to file 

154 pers.write_persistent(fams, outputfile) 

155 logger.info("Persistent genome step done.") 

156 return outputfile 

157 

158 

159def get_info(tol, multi, mixed, floor): 

160 """ 

161 Get a string corresponding to the information that will be given to logger. 

162 

163 Parameters 

164 ---------- 

165 tol : float 

166 min % of genomes present in a family to consider it as persistent (between 0 and 1) 

167 multi : bool 

168 True if multigenic families are allowed, False otherwise 

169 mixed : bool 

170 True if mixed families are allowed, False otherwise 

171 floor : bool 

172 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False 

173 

174 Returns 

175 ------- 

176 str 

177 Information to give to logger 

178 """ 

179 if tol == 1: 

180 return "Will generate a CoreGenome." 

181 else: 

182 if floor: 

183 floorstr = "floor" 

184 else: 

185 floorstr = "ceil" 

186 toprint = (f"Will generate a Persistent genome with member(s) in at least {100*tol}" 

187 f"% of all genomes in each family.\n") 

188 if multi: 

189 toprint += ("Multigenic families are allowed (several members in " 

190 "any genome of a family).") 

191 elif mixed: 

192 toprint += ("Mixed families are allowed. To be considered as persistent, " 

193 f"a family must have exactly 1 member in {tol*100}% of the genomes, " 

194 f"but in the remaining {round((1-tol)*100,3)}% genomes, there can be 0, 1 or " 

195 "several members.") 

196 else: 

197 toprint += ("To be considered as persistent, a family must contain exactly 1 member " 

198 f"in at least {tol*100}% of all genomes. The other genomes are absent from the " 

199 "family.") 

200 return toprint 

201 

202 

203def build_parser(parser): 

204 """ 

205 Method to create a parser for command-line options 

206 

207 Parameters 

208 ---------- 

209 parser : argparse.ArgumentParser 

210 parser to configure in order to extract command-line arguments 

211 """ 

212 import argparse 

213 from PanACoTA import utils_argparse 

214 

215 # Create command-line parser for all options and arguments to give 

216 required = parser.add_argument_group('Required arguments') 

217 required.add_argument("-p", dest="pangenome", required=True, 

218 help="PanGenome file (1 line per family, first column is fam number)") 

219 required.add_argument("-o", dest="outputdir", required=True, 

220 help=("Specify the output directory for your core/persistent genome."), 

221 default=".") 

222 optional = parser.add_argument_group('Optional arguments') 

223 optional.add_argument("-t", "--tol", dest="tol", default=1, type=utils_argparse.percentage, 

224 help=("min %% of genomes having at least 1 member in a family to " 

225 "consider the family as persistent (between 0 and 1, " 

226 "default is 1 = 100%% of genomes = Core genome)." 

227 "By default, the minimum number of genomes will be " 

228 "ceil('tol'*N) (N being the total number of genomes). If " 

229 "you want to use floor('tol'*N) instead, add the '-F' option.")) 

230 optional.add_argument("-M", dest="multi", action='store_true', 

231 help=("Add this option if you allow several members in any genome " 

232 "of a family. By default, only 1 (or 0 if tol<1) member " 

233 "per genome are allowed in all genomes. If you want to allow " 

234 "exactly 1 member in 'tol'%% of the genomes, and 0, 1 " 

235 "or several members in the '1-tol'%% left, use the option -X " 

236 "instead of this one: -M and -X options are not compatible.")) 

237 optional.add_argument("-X", dest="mixed", action='store_true', 

238 help="Add this option if you want to allow families having several " 

239 "members only in '1-tol'%% of the genomes. In the other genomes, " 

240 "only 1 member exactly is allowed. This option is not compatible " 

241 "with -M (which is allowing multigenic families: having several " 

242 "members in any number of genomes).") 

243 optional.add_argument("-F", dest="floor", action="store_true", 

244 help="When you specify the '-tol' option, with a number lower " 

245 "than 1, you can add this option to use floor('tol'*N) " 

246 "as a minimum number of genomes instead of ceil('tol'*N) " 

247 "which is the default behavior.") 

248 optional.add_argument("-l", dest="lstinfo_file", 

249 help=("By default, the core/persistent genome will include all genomes " 

250 "found in the given pangenome file. If you want to do a core/persistent " 

251 "genome on a subset of those genomes, give a file with this " 

252 "list of genomes. This file must have 1 line per genome, only the first column " 

253 "(genome name without extension) will be used.")) 

254 

255 helper = parser.add_argument_group('Others') 

256 helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0, 

257 help="Increase verbosity in stdout/stderr.") 

258 helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False, 

259 help=("Do not display anything to stdout/stderr. log files will " 

260 "still be created.")) 

261 helper.add_argument("-h", "--help", dest="help", action="help", 

262 help="show this help message and exit") 

263 

264 

265def check_args(parser, args): 

266 """ 

267 Check that arguments given to parser are as expected. 

268 

269 Parameters 

270 ---------- 

271 parser : argparse.ArgumentParser 

272 The parser used to parse command-line 

273 args : argparse.Namespace 

274 Parsed arguments 

275 

276 Returns 

277 ------- 

278 argparse.Namespace or None 

279 The arguments parsed, updated according to some rules. Exit program 

280 with error message if error occurs with arguments given. 

281 """ 

282 if args.multi and args.mixed: 

283 parser.error("-M and -X options cannot be activated together. Choose if you want to:\n" 

284 "- allow several members in any number of genomes of a family (-M)\n" 

285 "- allow several members in only '1-tol'% of the genomes of a family " 

286 "(other 'tol'% genomes must have exactly 1 member) (-X)") 

287 if args.mixed and args.tol == 1: 

288 parser.error("You are asking for mixed families, while asking for 100% of the genomes of " 

289 "a family to have exactly one member, which is not compatible. Do you want " 

290 "to \n- lower the percentage of genomes required to have exactly " 

291 "1 member (-t tol)\n- not allow mixed families (remove -X option)") 

292 if args.floor and args.tol == 1: 

293 parser.error("You are asking to use floor('tol'*N) as a minimum number of genomes " 

294 "present in a family, but with 'tol'=1: the minimum number of genomes " 

295 "will always be equal to N, using floor or the default ceil! Either " 

296 "use a 'tol' lower than 1, or remove the '-F' option.") 

297 return args 

298 

299 

300def parse(parser, argu): 

301 """ 

302 Parse arguments given to parser 

303 

304 Parameters 

305 ---------- 

306 parser : argparse.ArgumentParser 

307 the parser used 

308 argu : [str] 

309 command-line given by user, to parse using parser 

310 

311 Returns 

312 ------- 

313 argparse.Namespace 

314 Parsed arguments 

315 """ 

316 args = parser.parse_args(argu) 

317 return check_args(parser, args) 

318 

319 

320if __name__ == '__main__': 

321 import argparse 

322 myparser = argparse.ArgumentParser(description="Compute core or persistent genome", 

323 add_help=False) 

324 build_parser(myparser) 

325 OPTIONS = parse(myparser, sys.argv[1:]) 

326 main_from_parse(OPTIONS)