Coverage for PanACoTA/subcommands/align.py: 100%

59 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37align is a subcommand of PanACoTA 

38 

39 

40@author gem 

41June 2017 

42""" 

43 

44import os 

45import sys 

46 

47 

def main_from_parse(args):
    """
    Call main function from the arguments given by parser

    Parameters
    ----------
    args : argparse.Namespace
        result of argparse parsing of all arguments in command line. If it has an
        ``argv`` attribute (list of str), that list is used to rebuild the command line
        written to the log; otherwise ``sys.argv[1:]`` is used instead.
    """
    # The parser configured by build_parser() defines no 'argv' option, so a namespace
    # coming straight from parse() (as in the ``__main__`` guard of this module) has no
    # such attribute. Fall back to the process arguments rather than raising
    # AttributeError before anything has run.
    argv = getattr(args, "argv", None)
    if argv is None:
        argv = sys.argv[1:]
    cmd = "PanACoTA " + ' '.join(argv)
    main(cmd, args.corepers, args.list_genomes, args.dataset_name, args.dbpath,
         args.outdir, args.prot_ali, args.threads, args.force, args.verbose, args.quiet)

60 

61 

def main(cmd, corepers, list_genomes, dname, dbpath, outdir, prot_ali, threads, force, verbose=0,
         quiet=False):
    """
    Align the given core/persistent genome families, then post-process the alignments

    Parameters
    ----------
    cmd : str
        Command line used to start the program. It is only written to the log.
    corepers : str
        File containing persistent genome families
    list_genomes : str
        File containing the list of all genomes in the dataset. Only first column is
        considered.
    dname : str
        Dataset name, used to name output files
    dbpath : str
        path to the directory containing 'Proteins' and 'Genes' folders
    outdir : str
        path to the directory where output files must be saved. It is created if missing,
        and deleted beforehand when `force` is True.
    prot_ali : bool
        Also give aa alignment of concatenation of persistent proteins
    threads : int
        Max number of threads to use
    force : bool
        Remove existing output files and rerun everything if True.
    verbose : int
        verbosity:

        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout

        The minimum level given to the logger is INFO for 0 and 1, 15 (detail) from 2 to 14,
        and DEBUG from 15.
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    object
        value returned by ``post_align.post_alignment`` (presumably the path to the final
        alignment file -- to be confirmed against that module)

    Notes
    -----
    Exits with status 1 if mafft is not available, or if the alignment step reports
    a failure.
    """
    # Packages are imported here, when the subcommand is actually run
    import logging
    import shutil
    from PanACoTA import utils
    from PanACoTA.align_module import pan_to_pergenome as p2g
    from PanACoTA.align_module import get_seqs as gseqs
    from PanACoTA.align_module import alignment as ali
    from PanACoTA.align_module import post_align as post
    from PanACoTA import __version__ as version

    # mafft must be installed and in the path: nothing can be done without it
    mafft_found = utils.check_installed("mafft")
    if not mafft_found:  # pragma: no cover
        print("mafft is not installed. 'PanACoTA align' cannot run.")
        sys.exit(1)

    # With 'force', start again from an empty output directory
    if force and os.path.isdir(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir, exist_ok=True)

    # Minimum level handled by the logger, deduced from the verbosity:
    #  - verbose >= 15: everything, debug included
    #  - 2 <= verbose < 15: details, but no debug
    #  - verbose <= 1: info and above only
    if verbose >= 15:
        level = logging.DEBUG
    elif verbose >= 2:
        level = 15  # int corresponding to detail level
    elif verbose <= 1:
        level = logging.INFO

    # Base name of the log files, passed to utils.init_logger
    logfile_base = os.path.join(outdir, "PanACoTA-align_" + dname)
    utils.init_logger(logfile_base, level, 'align', log_details=True, verbose=verbose, quiet=quiet)
    logger = logging.getLogger("align")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Step 1: per-genome information from the core/persistent families
    per_genome = p2g.get_per_genome(corepers, list_genomes, dname, outdir)
    genomes, align_dir, list_dir, families = per_genome

    # Step 2: generate the files required by the alignment step
    gseqs.get_all_seqs(genomes, dname, dbpath, list_dir, align_dir, families, quiet)
    prefix = os.path.join(align_dir, dname)

    # Step 3: align all families; stop here if the step reports a failure
    nb_genomes = len(genomes)
    all_aligned = ali.align_all_families(prefix, families, nb_genomes, dname, quiet, threads)
    if not all_aligned:
        failure = ("At least one alignment did not run well. See detailed log file for "
                   "more information. Program will stop here, alignments won't be "
                   "grouped by genome.")
        logger.error(failure)
        sys.exit(1)

    # Step 4: post-process alignment files
    align_file = post.post_alignment(families, genomes, prefix, outdir, dname, prot_ali, quiet)
    logger.info("END")
    return align_file

150 

151 

def build_parser(parser):
    """
    Method to create a parser for command-line options

    The options are added to `parser` in place, in three groups ('Required arguments',
    'Optional arguments' and 'Others'); nothing is returned.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments. A "-h/--help"
        option is added here, so the parser must not already define one
        (the ``__main__`` guard of this module creates it with ``add_help=False``).
    """
    # Only utils_argparse is needed here: the parser object is received as an argument,
    # so neither argparse nor multiprocessing has to be imported by this function.
    from PanACoTA import utils_argparse

    # Create command-line parser for all options and arguments to give
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-c", dest="corepers", required=True,
                          help="Core or persistent genome whose families must be aligned.")
    required.add_argument("-l", dest="list_genomes", required=True,
                          help=("File containing the list of all the genomes you want "
                                "to align from their core/persistent families. "
                                "1 genome per line: it can be the "
                                "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module. "
                                "Here, only the first column (genome name without extension) "
                                "will be used. The final alignment file will contain "
                                "1 alignment per genome in this file."))
    required.add_argument("-n", dest="dataset_name", required=True,
                          help=("Name of the dataset which will be aligned (for example, "
                                "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                "be used to name the alignment file."))
    required.add_argument("-d", dest="dbpath", required=True,
                          help=("Path to the folder containing the directories 'Proteins' "
                                "and 'Genes', created by 'PanACoTA annotate'."))
    required.add_argument("-o", dest="outdir", required=True,
                          help="Output directory, where all results must be saved ")

    optional = parser.add_argument_group('Optional arguments')
    # The value is converted/validated by utils_argparse.thread_num (defined outside this file)
    optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                          help=("add this option if you want to parallelize on several threads. "
                                "Indicate on how many threads you want to parallelize. "
                                "By default, it uses 1 thread. Put 0 if you want to use "
                                "all threads of your computer."))
    optional.add_argument("-F", "--force", dest="force", action="store_true",
                          help=("Force run: Add this option if you want to redo all alignments "
                                "for all families, even if their result file already exists. "
                                "Without this option, if an alignment file already exists, "
                                "it will be used for the next step. If you want to redo only "
                                "a given alignment, just delete its file, without using "
                                "this option."))
    optional.add_argument("-P", dest="prot_ali", default=False, action="store_true",
                          help=("Add this option if you also need the aa alignment of the concatenation of "
                                "all persistent proteins. "
                                "By default, PanACoTA only gives the nucleic alignment."))
    helper = parser.add_argument_group('Others')
    # "-v" is counted: repeating it increases the verbosity value
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                        help="Increase verbosity in stdout/stderr.")
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")

212 

213 

def parse(parser, argu):
    """
    Run the given parser on the given command-line arguments

    Parameters
    ----------
    parser : argparse.ArgumentParser
        the parser used
    argu : [str]
        command-line given by user, to parse using parser

    Returns
    -------
    argparse.Namespace
        Parsed arguments
    """
    parsed_args = parser.parse_args(argu)
    return parsed_args

231 

232 

if __name__ == '__main__':
    # Entry point when this file is executed as a script: build the parser of the
    # 'align' subcommand, parse the arguments of the process, and run the subcommand.
    import argparse

    # add_help is disabled because build_parser() adds its own "-h/--help" option
    myparser = argparse.ArgumentParser(add_help=False,
                                       description="Align Core/Persistent families")
    build_parser(myparser)
    main_from_parse(parse(myparser, sys.argv[1:]))