Coverage for PanACoTA/subcommands/corepers.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37corepers is a subcommand of PanACoTA

39Generate a core genome (families containing 1 member in all genomes of the dataset)

40or a persistent genome (families with a given % of genomes having exactly 1 member).

41You can also allow:

43- mixed families: exactly 1 member in the given percentage of genomes, but the other genomes

44 can contain 0 or several members

45- multi families: allow several members in any genome.

48@author gem

49June 2017

50"""

52import os

54import sys

57def main_from_parse(args):

58 """

59 Call main function from the arguments given by parser

61 Parameters

62 ----------

63 args : argparse.Namespace

64 result of argparse parsing of all arguments in command line

65 """

66 cmd = "PanACoTA " + ' '.join(args.argv)

67 main(cmd, args.pangenome, args.tol, args.multi, args.mixed, args.outputdir,

68 args.lstinfo_file, args.floor, args.verbose, args.quiet)

71def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor, verbose, quiet):

72 """

73 Read pangenome and deduce Persistent genome according to the user criteria

75 Parameters

76 ----------

77 pangenome : str

78 file containing pangenome

79 tol : float

80 min % of genomes present in a family to consider it as persistent (between 0 and 1)

81 multi : bool

82 True if multigenic families are allowed, False otherwise

83 mixed : bool

84 True if mixed families are allowed, False otherwise

85 outputdir : str or None

86 Specific directory for the generated persistent genome. If not given, pangenome directory is used.

87 lstinfo_file : str

88 list of genomes to include in the core/persistent genome. If not given, include all genomes of pan

89 floor : bool

90 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False

91 verbose : int

92 verbosity:

93 - defaut 0 : stdout contains INFO, stderr contains ERROR.

94 - 1: stdout contains INFO, stderr contains WARNING and ERROR

95 - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR

96 - >=15: Add DEBUG in stdout

97 quiet : bool

98 True if nothing must be sent to stdout/stderr, False otherwise

99 """

100 # import needed packages

101 import logging

102 from PanACoTA import utils

103 from PanACoTA import utils_pangenome as utilsp

104 import PanACoTA.corepers_module.persistent_functions as pers

105 from PanACoTA import __version__ as version

106

107 # get pangenome name info

108 _, base_pan = os.path.split(pangenome)

109 if lstinfo_file:

110 _, base_lst = os.path.split(lstinfo_file)

111 else:

112 base_lst = "all"

113 # Define output filename

114 output_name = f"PersGenome_{base_pan}-{base_lst}_"

115 if floor:

116 output_name += "F"

117 output_name += str(tol)

118 if multi:

119 output_name += "-multi.lst"

120 elif mixed:

121 output_name += "-mixed.lst"

122 else:

123 output_name += ".lst"

124 # Define output directory and filename path

125 if not os.path.isdir(outputdir):

126 os.makedirs(outputdir)

127 outputfile = os.path.join(outputdir, output_name)

128 logfile_base = os.path.join(outputdir, "PanACoTA-corepers")

129 # level is the minimum level that will be considered.

130 # for verbose = 0 or 1, ignore details and debug, start from info

131 if verbose <= 1:

132 level = logging.INFO

133 # for verbose = 2, ignore only debug

134 if verbose >= 2 and verbose < 15:

135 level = 15 # int corresponding to detail level

136 # for verbose >= 15, write everything

137 if verbose >= 15:

138 level = logging.DEBUG

139 utils.init_logger(logfile_base, level, 'corepers', verbose=verbose, quiet=quiet)

140 logger = logging.getLogger("corepers")

141 logger.info(f'PanACoTA version {version}')

142 logger.info("Command used\n \t > " + cmd)

143

144 logger.info(get_info(tol, multi, mixed, floor))

145

146 # Read pangenome

147 fams_by_strain, families, all_strains = utilsp.read_pangenome(pangenome, logger)

148 # If list of genomes given, get subset of previous dicts, including only the genomes aksed

149 if lstinfo_file:

150 fams_by_strain, families, all_strains = pers.get_subset_genomes(fams_by_strain, families, lstinfo_file)

151 # Generate persistent genome

152 fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol, multi, mixed, floor)

153 # Write persistent genome to file

154 pers.write_persistent(fams, outputfile)

155 logger.info("Persistent genome step done.")

156 return outputfile

157

158

159def get_info(tol, multi, mixed, floor):

160 """

161 Get a string corresponding to the information that will be given to logger.

162

163 Parameters

164 ----------

165 tol : float

166 min % of genomes present in a family to consider it as persistent (between 0 and 1)

167 multi : bool

168 True if multigenic families are allowed, False otherwise

169 mixed : bool

170 True if mixed families are allowed, False otherwise

171 floor : bool

172 Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False

173

174 Returns

175 -------

176 str

177 Information to give to logger

178 """

179 if tol == 1:

180 return "Will generate a CoreGenome."

181 else:

182 if floor:

183 floorstr = "floor"

184 else:

185 floorstr = "ceil"

186 toprint = (f"Will generate a Persistent genome with member(s) in at least {100*tol}"

187 f"% of all genomes in each family.\n")

188 if multi:

189 toprint += ("Multigenic families are allowed (several members in "

190 "any genome of a family).")

191 elif mixed:

192 toprint += ("Mixed families are allowed. To be considered as persistent, "

193 f"a family must have exactly 1 member in {tol*100}% of the genomes, "

194 f"but in the remaining {round((1-tol)*100,3)}% genomes, there can be 0, 1 or "

195 "several members.")

196 else:

197 toprint += ("To be considered as persistent, a family must contain exactly 1 member "

198 f"in at least {tol*100}% of all genomes. The other genomes are absent from the "

199 "family.")

200 return toprint

201

202

203def build_parser(parser):

204 """

205 Method to create a parser for command-line options

206

207 Parameters

208 ----------

209 parser : argparse.ArgumentParser

210 parser to configure in order to extract command-line arguments

211 """

212 import argparse

213 from PanACoTA import utils_argparse

214

215 # Create command-line parser for all options and arguments to give

216 required = parser.add_argument_group('Required arguments')

217 required.add_argument("-p", dest="pangenome", required=True,

218 help="PanGenome file (1 line per family, first column is fam number)")

219 required.add_argument("-o", dest="outputdir", required=True,

220 help=("Specify the output directory for your core/persistent genome."),

221 default=".")

222 optional = parser.add_argument_group('Optional arguments')

223 optional.add_argument("-t", "--tol", dest="tol", default=1, type=utils_argparse.percentage,

224 help=("min %% of genomes having at least 1 member in a family to "

225 "consider the family as persistent (between 0 and 1, "

226 "default is 1 = 100%% of genomes = Core genome)."

227 "By default, the minimum number of genomes will be "

228 "ceil('tol'*N) (N being the total number of genomes). If "

229 "you want to use floor('tol'*N) instead, add the '-F' option."))

230 optional.add_argument("-M", dest="multi", action='store_true',

231 help=("Add this option if you allow several members in any genome "

232 "of a family. By default, only 1 (or 0 if tol<1) member "

233 "per genome are allowed in all genomes. If you want to allow "

234 "exactly 1 member in 'tol'%% of the genomes, and 0, 1 "

235 "or several members in the '1-tol'%% left, use the option -X "

236 "instead of this one: -M and -X options are not compatible."))

237 optional.add_argument("-X", dest="mixed", action='store_true',

238 help="Add this option if you want to allow families having several "

239 "members only in '1-tol'%% of the genomes. In the other genomes, "

240 "only 1 member exactly is allowed. This option is not compatible "

241 "with -M (which is allowing multigenic families: having several "

242 "members in any number of genomes).")

243 optional.add_argument("-F", dest="floor", action="store_true",

244 help="When you specify the '-tol' option, with a number lower "

245 "than 1, you can add this option to use floor('tol'*N) "

246 "as a minimum number of genomes instead of ceil('tol'*N) "

247 "which is the default behavior.")

248 optional.add_argument("-l", dest="lstinfo_file",

249 help=("By default, the core/persistent genome will include all genomes "

250 "found in the given pangenome file. If you want to do a core/persistent "

251 "genome on a subset of those genomes, give a file with this "

252 "list of genomes. This file must have 1 line per genome, only the first column "

253 "(genome name without extension) will be used."))

254

255 helper = parser.add_argument_group('Others')

256 helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,

257 help="Increase verbosity in stdout/stderr.")

258 helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,

259 help=("Do not display anything to stdout/stderr. log files will "

260 "still be created."))

261 helper.add_argument("-h", "--help", dest="help", action="help",

262 help="show this help message and exit")

263

264

265def check_args(parser, args):

266 """

267 Check that arguments given to parser are as expected.

268

269 Parameters

270 ----------

271 parser : argparse.ArgumentParser

272 The parser used to parse command-line

273 args : argparse.Namespace

274 Parsed arguments

275

276 Returns

277 -------

278 argparse.Namespace or None

279 The arguments parsed, updated according to some rules. Exit program

280 with error message if error occurs with arguments given.

281 """

282 if args.multi and args.mixed:

283 parser.error("-M and -X options cannot be activated together. Choose if you want to:\n"

284 "- allow several members in any number of genomes of a family (-M)\n"

285 "- allow several members in only '1-tol'% of the genomes of a family "

286 "(other 'tol'% genomes must have exactly 1 member) (-X)")

287 if args.mixed and args.tol == 1:

288 parser.error("You are asking for mixed families, while asking for 100% of the genomes of "

289 "a family to have exactly one member, which is not compatible. Do you want "

290 "to \n- lower the percentage of genomes required to have exactly "

291 "1 member (-t tol)\n- not allow mixed families (remove -X option)")

292 if args.floor and args.tol == 1:

293 parser.error("You are asking to use floor('tol'*N) as a minimum number of genomes "

294 "present in a family, but with 'tol'=1: the minimum number of genomes "

295 "will always be equal to N, using floor or the default ceil! Either "

296 "use a 'tol' lower than 1, or remove the '-F' option.")

297 return args

298

299

300def parse(parser, argu):

301 """

302 Parse arguments given to parser

303

304 Parameters

305 ----------

306 parser : argparse.ArgumentParser

307 the parser used

308 argu : [str]

309 command-line given by user, to parse using parser

310

311 Returns

312 -------

313 argparse.Namespace

314 Parsed arguments

315 """

316 args = parser.parse_args(argu)

317 return check_args(parser, args)

318

319

320if __name__ == '__main__':

321 import argparse

322 myparser = argparse.ArgumentParser(description="Compute core or persistent genome",

323 add_help=False)

324 build_parser(myparser)

325 OPTIONS = parse(myparser, sys.argv[1:])

326 main_from_parse(OPTIONS)