Coverage for PanACoTA/subcommands/align.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37align is a subcommand of PanACoTA

40@author gem

41June 2017

42"""

44import os

45import sys

def main_from_parse(args):
    """
    Run the 'align' subcommand from an already parsed command line.

    The command line is rebuilt (as "PanACoTA " followed by the space-joined
    ``args.argv``) and passed, together with every option, to ``main``.
    Nothing is returned.

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line.
        It must also carry an ``argv`` attribute (iterable of str). That attribute
        is not created by ``build_parser`` in this file, so the caller is expected
        to set it before calling this function.
    """
    command_line = "PanACoTA " + ' '.join(args.argv)
    # Options, in the exact positional order expected by main() after 'cmd'
    forwarded = (args.corepers, args.list_genomes, args.dataset_name, args.dbpath,
                 args.outdir, args.prot_ali, args.threads, args.force, args.verbose,
                 args.quiet)
    main(command_line, *forwarded)

def main(cmd, corepers, list_genomes, dname, dbpath, outdir, prot_ali, threads, force, verbose=0,
         quiet=False):
    """
    Align given core/persistent genome families

    Steps, in order: check that mafft is available, prepare the output directory,
    initialize the 'align' logger, split the families per genome, extract the
    sequences, align all families, then post-process the alignments.
    The program exits with status 1 if mafft is not installed, or if
    the alignment step reports a failure.

    Parameters
    ----------
    cmd : str
        Command line used to launch the program. Only written to the log.
    corepers : str
        File containing persistent genome families
    list_genomes : str
        File containing the list of all genomes in the dataset. Only first column is
        considered.
    dname : str
        Dataset name, used to name output files
    dbpath : str
        path to the directory containing 'Proteins' and 'Genes' folders
    outdir : str
        path to the directory where output files must be saved
    prot_ali : bool
        Also give aa alignment of concatenation of persistent proteins
    threads : int
        Max number of threads to use
    force : bool
        If True and ``outdir`` already exists, the whole directory is removed
        before being re-created, so that everything is rerun.
    verbose : int
        verbosity, converted here to the minimum level given to the logger:

        - <= 1: logging.INFO
        - from 2 to 14: 15 (the 'detail' level)
        - >= 15: logging.DEBUG

        The value is also forwarded as is to ``utils.init_logger``, which decides
        what is sent to stdout/stderr.
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    object
        The value returned by ``post_align.post_alignment`` (presumably the path to
        the final alignment file -- to confirm in post_align).
    """
    # Needed packages are imported here, when the subcommand is actually run
    import logging
    import shutil
    from PanACoTA import utils
    from PanACoTA.align_module import pan_to_pergenome as p2g
    from PanACoTA.align_module import get_seqs as gseqs
    from PanACoTA.align_module import alignment as ali
    from PanACoTA.align_module import post_align as post
    from PanACoTA import __version__ as version

    # mafft is required by this subcommand: stop right away if it cannot be found
    if not utils.check_installed("mafft"):  # pragma: no cover
        print("mafft is not installed. 'PanACoTA align' cannot run.")
        sys.exit(1)

    # With force, start again from an empty output directory
    wipe_outdir = force and os.path.isdir(outdir)
    if wipe_outdir:
        shutil.rmtree(outdir)
    os.makedirs(outdir, exist_ok=True)

    # log_level is the minimum level that will be considered by the logger.
    if verbose <= 1:
        # verbose = 0 or 1: ignore details and debug, start from info
        log_level = logging.INFO
    elif 2 <= verbose < 15:
        # ignore only debug: 15 is the int corresponding to the detail level
        log_level = 15
    elif verbose >= 15:
        # write everything
        log_level = logging.DEBUG

    # Base name of the log file(s), located in the output directory
    log_base = os.path.join(outdir, "PanACoTA-align_" + dname)
    utils.init_logger(log_base, log_level, 'align', log_details=True, verbose=verbose,
                      quiet=quiet)
    log = logging.getLogger("align")
    log.info(f'PanACoTA version {version}')
    log.info("Command used\n \t > " + cmd)

    # Convert the families file to per-genome information
    genomes, align_dir, list_dir, families = p2g.get_per_genome(corepers, list_genomes,
                                                                dname, outdir)
    # generate required files
    gseqs.get_all_seqs(genomes, dname, dbpath, list_dir, align_dir, families, quiet)
    ali_prefix = os.path.join(align_dir, dname)

    # Align all families
    aligned_ok = ali.align_all_families(ali_prefix, families, len(genomes), dname, quiet,
                                        threads)
    if not aligned_ok:
        log.error(("At least one alignment did not run well. See detailed log file for "
                   "more information. Program will stop here, alignments won't be "
                   "grouped by genome."))
        sys.exit(1)

    # post-process alignment files
    final_alignment = post.post_alignment(families, genomes, ali_prefix, outdir, dname,
                                          prot_ali, quiet)
    log.info("END")
    return final_alignment

150

151

def build_parser(parser):
    """
    Method to create a parser for command-line options

    Adds three argument groups to ``parser``:

    - 'Required arguments': -c (corepers), -l (list_genomes), -n (dataset_name),
      -d (dbpath), -o (outdir)
    - 'Optional arguments': --threads, -F/--force, -P (prot_ali)
    - 'Others': -v/--verbose, -q/--quiet, -h/--help

    As -h/--help is registered here, ``parser`` is expected to have been created
    with ``add_help=False`` (argparse refuses conflicting option strings).

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments
    """
    # Only utils_argparse is needed here: the parser itself is given by the caller
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-c", dest="corepers", required=True,
                          help="Core or persistent genome whose families must be aligned.")
    required.add_argument("-l", dest="list_genomes", required=True,
                          help=("File containing the list of all the genomes you want "
                                "to align from their core/persistent families. "
                                "1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. The final alignment file will contain "
                                "1 alignment per genome in this file."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be aligned (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the alignment file."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing the directories 'Proteins' "
                                "and 'Genes', created by 'PanACoTA annotate'."))
    required.add_argument("-o", dest="outdir", required=True,
                          help="Output directory, where all results must be saved ")

    optional = parser.add_argument_group('Optional arguments')
    # The given value is converted/validated by utils_argparse.thread_num
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))
    optional.add_argument("-F", "--force", dest="force", action="store_true",
                          help=("Force run: Add this option if you want to redo all alignments "
                                "for all families, even if their result file already exists. "
                                "Without this option, if an alignment file already exists, "
                                "it will be used for the next step. If you want to redo only "
                                "a given alignment, just delete its file, without using "
                                "this option."))
    optional.add_argument("-P", dest="prot_ali", default=False, action="store_true",
                          help=("Add this option if you also need the aa alignment of the concatenation of "
                                "all persistent proteins. "
                                "By default, PanACoTA only gives the nucleic alignment."))

    helper = parser.add_argument_group('Others')
    # -v can be repeated: the number of occurrences is the verbosity level
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")

212

213

def parse(parser, argu):
    """
    Parse the given command line with the given parser

    Parameters
    ----------
    parser : argparse.ArgumentParser
        the parser used (already configured with all expected arguments)
    argu : [str]
        command-line given by user, to parse using parser

    Returns
    -------
    argparse.Namespace
        Parsed arguments, as returned by ``parser.parse_args``
    """
    parsed_args = parser.parse_args(argu)
    return parsed_args

231

232

if __name__ == '__main__':
    import argparse

    # Script entry point: build the parser, parse the command line and run 'align'.
    # add_help=False because build_parser registers its own -h/--help option.
    myparser = argparse.ArgumentParser(description="Align Core/Persistent families",
                                       add_help=False)

    build_parser(myparser)
    OPTIONS = parse(myparser, sys.argv[1:])
    # main_from_parse reads 'OPTIONS.argv' to rebuild the command line written in the
    # log, but no argument of the parser defines it: set it here, otherwise running
    # this module as a script fails with an AttributeError.
    # 'align' is prepended so that the logged command reads "PanACoTA align ...".
    OPTIONS.argv = ["align"] + sys.argv[1:]
    main_from_parse(OPTIONS)