Coverage for PanACoTA/subcommands/pangenome.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37pangenome is a subcommand of PanACoTA

40@author gem

41May 2017

42"""

44import sys

45import os

def main_from_parse(args):
    """
    Call main function from the arguments given by parser

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line. The optional
        ``argv`` attribute (list of str) is the command line to report in the log;
        when it is absent or None, ``sys.argv[1:]`` is used instead.
    """
    # 'argv' is not an option declared by build_parser: it must be attached to the
    # namespace by the caller. When this module is run directly as a script nobody
    # does that, so fall back on the real command line instead of raising
    # AttributeError.
    argv = getattr(args, "argv", None)
    if argv is None:
        argv = sys.argv[1:]
    cmd = "PanACoTA " + ' '.join(argv)
    main(cmd, args.lstinfo_file, args.dataset_name, args.dbpath, args.min_id, args.outdir,
         args.clust_mode, args.spedir, args.threads, args.outfile, args.verbose,
         args.quiet)

def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
         verbose=0, quiet=False):
    """
    Run every step of the pangenome construction:

    - concatenate all protein files
    - create database as ffindex
    - cluster all proteins
    - convert to pangenome file
    - creating summary and matrix of pangenome

    Parameters
    ----------
    cmd : str
        command line used to launch the program; only written to the log
    lstinfo : str
        file with name of genomes to consider for pan in the first column, without extension.
        Other columns are ignored. The first column header must be 'gembase_name'
    name : str
        name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
    dbpath : str
        path to the folder containing all protein files (files called as the name of genome
        given in lstinfo + ".prt")
    min_id : float
        Minimum percentage of identity between 2 proteins to put them in the same family
    outdir : str
        path to folder which will contain pangenome results and tmp files.
        Created if it does not exist yet.
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    spe_dir : str or None
        path to the folder where concatenated bank of proteins must be saved.
        None to use the same folder as protein files
    threads : int
        Max number of threads to use
    outfile : str or None
        Name of the pangenome. None to use the default name
    verbose : int
        verbosity:

        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    panfile
        second value returned by ``mmseqs_functions.run_all_pangenome``
        (presumably the path to the pangenome file -- confirm in that module)
    """
    # Project imports are kept local to the function, as in the rest of this module
    import logging
    from PanACoTA import utils
    from PanACoTA.pangenome_module import protein_seq_functions as protf
    from PanACoTA.pangenome_module import mmseqs_functions as mmf
    from PanACoTA.pangenome_module import post_treatment as pt
    from PanACoTA import __version__ as version

    # Nothing can be done without mmseqs available in the path
    if not utils.check_installed("mmseqs"):  # pragma: no cover
        print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
        sys.exit(1)

    os.makedirs(outdir, exist_ok=True)

    # 'level' is the lowest log level which will be kept, from most to least verbose:
    if verbose >= 15:
        # keep everything
        level = logging.DEBUG
    elif verbose >= 2:
        # drop debug only
        level = utils.detail_lvl()  # int corresponding to detail level
    elif verbose <= 1:
        # drop details and debug, start from info
        level = logging.INFO

    # Log file is written in outdir; the dataset name is part of its base name
    log_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
    utils.init_logger(log_base, level, '', verbose=verbose, quiet=quiet, log_details=True)
    log = logging.getLogger("pangenome")
    log.info(f'PanACoTA version {version}')
    log.info("Command used\n \t > " + cmd)

    # Step 1: bank with all proteins to include in the pangenome
    prt_bank = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
    # Step 2: clustering, giving the families and the pangenome file
    fams, panfile = mmf.run_all_pangenome(min_id, clust_mode, outdir,
                                          prt_bank, threads, outfile, quiet)
    # Step 3: matrices pan_quali, pan_quanti and summary file
    pt.post_treat(fams, panfile)
    log.info("DONE")
    return panfile

146

147

def build_parser(parser):
    """
    Method to create a parser for command-line options

    Three argument groups are added to the given parser: 'Required arguments'
    (-l, -n, -d, -o), 'Optional arguments' (-i, -f, -c, -s, --threads) and
    'Others' (-v, -q, -h).

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments.
        It is modified in place; nothing is returned.
    """
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-l", dest="lstinfo_file", required=True,
                          help=("File containing the list of all genomes to include in "
                                "the pan-genome, 1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. All proteins of all these genomes will be "
                                "concatenated in a file called <dataset_name>.All.prt. The "
                                "column header must be 'gembase_name'."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be clustered (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the protein databank, as well as the "
                                "pangenome files."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing all protein files corresponding "
                                "to the genomes of the dataset (output directory 'Proteins' "
                                "of 'PanACoTA annotate' module)."))
    required.add_argument("-o", dest="outdir", required=True,
                          help=("Output directory, where all results must be saved "
                                "(including tmp folder)"))

    optional = parser.add_argument_group('Optional arguments')
    # Value checking of -i and --threads is delegated to the utils_argparse type functions
    optional.add_argument("-i", dest="min_id", type=utils_argparse.perc_id, default=0.8,
                          help=("Minimum sequence identity to be considered in the same "
                                "cluster (float between 0 and 1). Default is 0.8."))
    optional.add_argument("-f", dest="outfile",
                          help=("Use this option if you want to give the name of the pangenome "
                                "output file (without path). Otherwise, by default, it is called "
                                "PanGenome-mmseq_<given_dataset_name>.All.prt_<"
                                "information_on_parameters>.lst"))
    optional.add_argument("-c", dest="clust_mode", type=int, choices=[0, 1, 2], default=1,
                          help=("Choose the clustering mode: 0 for 'set cover', 1 for "
                                "'single-linkage', 2 for 'CD-Hit'. Default "
                                "is 'single-linkage' (1)"))
    optional.add_argument("-s", dest="spedir",
                          help=("use this option if you want to save the concatenated protein "
                                "databank in another directory than the one containing all "
                                "individual protein files ('Proteins' folder)."))
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))

    helper = parser.add_argument_group('Others')
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    # Declared explicitly so that help ends up in the 'Others' group; this conflicts
    # with argparse's automatic -h/--help, so the parser must be created with
    # add_help=False
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")

214

215

def parse(parser, argu):
    """
    Run the given parser on a list of command-line arguments

    Parameters
    ----------
    parser : argparse.ArgumentParser
        Parser used to read the command-line arguments
    argu : [str]
        command-line arguments to parse

    Returns
    -------
    argparse.Namespace
        The value returned by ``parser.parse_args(argu)``, without any
        post-processing. Handling of invalid arguments is left to the parser.
    """
    return parser.parse_args(argu)

235

236

if __name__ == '__main__':
    # Direct execution of this module: build a standalone parser for the pangenome
    # options, read the command line and launch the analysis.
    import argparse

    # add_help=False because build_parser declares its own -h/--help option
    cli_parser = argparse.ArgumentParser(description="Compute pan-genome", add_help=False)
    build_parser(cli_parser)
    main_from_parse(parse(cli_parser, sys.argv[1:]))