Coverage for PanACoTA/align_module/post

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Concatenate all alignment files of all families. Then,

38group alignments by genome.

40@author: GEM, Institut Pasteur

41March 2017

42"""

44import os

45import sys

46import logging

47import progressbar

48import multiprocessing

49from PanACoTA import utils

51logger = logging.getLogger("align.post")

54def post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali, quiet):

55 """

56 After the alignment of all proteins by family:

58 - concatenate all alignment files

59 - group the alignment by genome

61 Parameters

62 ----------

63 fam_nums : []

64 list of family numbers

65 all_genomes : []

66 list of all genomes in dataset

67 prefix : str

68 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files easily)

69 outdir : str

70 path to output directory, containing Aldir and Listdir, and that will also contain Treedir

71 dname : str

72 name of dataset (used to name concat and grouped files, as well as tree folder)

73 prot_ali : bool

74 true: also give concatenated alignment in aa

75 quiet : bool

76 True if nothing must be sent to sdtout/stderr, False otherwise

77 """

78 all_alns_nucl, status_nucl = concat_alignments(fam_nums, prefix, "nucl", quiet)

79 treedir = os.path.join(outdir, "Phylo-" + dname)

80 os.makedirs(treedir, exist_ok=True)

81 outfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")

82 res_nucl = launch_group_by_genome(all_genomes, all_alns_nucl, status_nucl, outfile_nucl, dname, "nucleic", quiet)

83 if not res_nucl:

84 utils.remove(all_alns_nucl)

85 utils.remove(outfile_nucl)

86 logger.error("An error occurred. We could not group DNA alignments by genome.")

87 sys.exit(1)

88 if prot_ali:

89 all_alns_aa, status_aa = concat_alignments(fam_nums, prefix, "aa", quiet)

90 outfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")

91 res_aa = launch_group_by_genome(all_genomes, all_alns_aa, status_aa, outfile_aa, dname, "protein", quiet)

92 if not res_aa:

93 utils.remove(all_alns_aa)

94 utils.remove(outfile_aa)

95 logger.error("An error occurred. We could not group protein alignments by genome.")

96 return outfile_nucl

99def concat_alignments(fam_nums, prefix, ali_type, quiet):

100 """

101 Concatenate all family alignment files to a unique file

102

103 Parameters

104 ----------

105 fam_nums : []

106 list of family numbers

107 prefix : str

108 path to ``aldir/<name of dataset>-[mafft-align or mafft-prt2nuc]``

109 (used to get extraction, alignment and btr files easily)

110 ali_type : str

111 aa or nucl

112 quiet : bool

113 True if nothing must be sent to sdtout/stderr, False otherwise

114

115 Returns

116 -------

117 tuple

118 (output, str) with:

119

120 - output: path to file containing concatenation of all alignments

121 - str: "OK" if concatenation file already exists, "Done" if just did concatenation

122 """

123 if ali_type == "aa":

124 info = "mafft-align"

125 elif ali_type == "nucl":

126 info = "mafft-prt2nuc"

127 else:

128 logger.error(f"Not possible to concatenate '{ali_type}' type of alignments.")

129 sys.exit(1)

130 output = f"{prefix}-complete.{ali_type}.cat.aln"

131 if os.path.isfile(output):

132 logger.info(f"{ali_type} alignments already concatenated")

133 logger.warning(f"{ali_type} alignments already concatenated in {output}. Program will use "

134 "it for next steps. If you want to redo it, remove it before "

135 "running.")

136 return output, "OK"

137 logger.info(f"Concatenating all {ali_type} alignment files")

138 list_files = [f"{prefix}-{info}.{num_fam}.aln" for num_fam in fam_nums]

139 # Check that all files exist

140 for f in list_files:

141 if not os.path.isfile(f):

142 logger.error(f"The alignment file {f} does not exist. Please check the families you "

143 "want, and their corresponding alignment files")

144 sys.exit(1)

145 if quiet:

146 utils.cat(list_files, output)

147 else:

148 utils.cat(list_files, output, title="Concatenation")

149 return output, "Done"

150

151

152def launch_group_by_genome(all_genomes, all_alns, status, outfile, dname, type_ali, quiet):

153 """

154 Function calling group_by_genome in a pool, while giving information to user

155 (time elapsed)

156

157 Parameters

158 ----------

159 all_genomes : []

160 list of all genomes in the dataset

161 all_alns : str

162 path to file containing all alignments concatenated

163 status : str

164 "OK" if concatenation file already existed before running, "Done" if just did concatenation

165 outfile : str

166 file containing all families align by genome

167 dname : str

168 name of dataset

169 type_ali : str

170 nucleic or protein

171 quiet : bool

172 True if nothing must be sent to sdtout/stderr, False otherwise

173

174 Returns

175 -------

176 bool

177 - True if everything went well or was already done

178 - False if error occurred in at least one step

179 """

180 # Status = Done means that we just did the concatenation. So, if grouped by genome

181 # file already exists, remove it.

182 if status == "Done":

183 if os.path.isfile(outfile):

184 utils.remove(outfile)

185 # Status was not 'Done' (it was 'ok', concat file already existed). And by_genome file

186 # also already exists. Warn user

187 if os.path.isfile(outfile):

188 logger.info(f"{type_ali} alignments already grouped by genome")

189 logger.warning((f"{type_ali} alignments already grouped by genome in {outfile}. "

190 "Program will end. "))

191 return True

192 logger.info(f"Grouping {type_ali} alignments per genome")

193 bar = None

194 if not quiet:

195 widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),

196 " - ", progressbar.Timer()]

197 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)

198 pool = multiprocessing.Pool(1)

199 args = [all_genomes, all_alns, outfile]

200 final = pool.map_async(group_by_genome, [args], chunksize=1)

201 pool.close()

202 if not quiet:

203 while True:

204 if final.ready():

205 break

206 bar.update()

207 bar.finish()

208 pool.join()

209 return False not in final.get()

210

211

212def group_by_genome(args):

213 """

214 From the alignment file 'all_alns' containing all proteins, group the alignments of

215 proteins by their genome (listed in 'all_genomes'), and save the result

216 in 'treedir'

217

218 Parameters

219 ----------

220 args : tuple

221 - all_genomes: list of all genomes

222 - all_alns: path to file containing all alignments concatenated

223 - outfile: path to file which will contain alignments grouped by genome

224

225 Returns

226 -------

227 bool

228 - True if everything went well

229 - False if problem when trying to group by genomes

230 """

231 all_genomes, all_alns, outfile = args

232 sequences = read_alignments(all_alns, all_genomes)

233 if not sequences:

234 return False

235 write_groups(outfile, sequences)

236 return True

237

238

239def read_alignments(all_alns, all_genomes):

240 """

241 Read alignment file, and assign each sequence to a genome

242

243 Parameters

244 ----------

245 all_alns : str

246 path to file containing all alignments concatenated

247 all_genomes : []

248 list of all genomes

249

250 Returns

251 -------

252 dict or None

253 - {genome_name: [list of sequences for this genome]}

254 - None if problem with a protein for which we don't find the genome

255 """

256 sequences = {} # name: [ordered list of sequences]

257 genome = None

258 seq = ""

259 with open(all_alns, 'r') as alnf:

260 for line in alnf:

261 if line.startswith(">"):

262 # If new header, write previous protein name/sequence to 'sequences'

263 if genome and seq:

264 sequences[genome].append(seq)

265 seq = ""

266 # Get new genome header

267 genome = get_genome(line, all_genomes)

268 if not genome:

269 return None

270 if genome not in sequences:

271 sequences[genome] = []

272 else:

273 seq += line.strip()

274 if genome and seq:

275 sequences[genome].append(seq)

276 per_genome = [len(seq) for seq in sequences.values()]

277 if len(set(per_genome)) != 1:

278 logger.error("Problems occurred while grouping alignments by genome: all genomes "

279 "do not have the same number of sequences. Check that each protein "

280 "name contains the name of the genome from which it comes.")

281 return None

282 logger.log(utils.detail_lvl(), f"{per_genome[0]} sequences found per genome")

283 return sequences

284

285

286def write_groups(outfile, sequences):

287 """

288 Writing alignments per genome to output file.

289

290 Parameters

291 ----------

292 outfile : str

293 path to file that will contain alignments grouped by genome

294 sequences : dict

295 {genome_name: [list of sequences (DNA, prot...) for this genome]}

296 """

297 logger.log(utils.detail_lvl(), "Writing alignments per genome")

298 with open(outfile, "w") as outf:

299 for genome in sorted(sequences, key=utils.sort_genomes_by_name):

300 # write header for genome

301 outf.write(">" + genome + "\n")

302 # Write all sequences

303 outf.write("".join(sequences[genome]) + "\n")

304

305

306def get_genome(header, all_genomes):

307 """

308 Find to which genome belongs 'header'

309

310 Parameters

311 ----------

312 header : str

313 header read in alignment file

314 all_genomes : []

315 list of all genomes

316

317 Returns

318 -------

319 str or None

320 name of genome from which the header is

321 None if no genome found

322 """

323 # Name of protein is not always genome-name_num

324 # Ex: in gembase complete DB: >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002

325 # So, genome name cannot be deduced directly from header. But it is always included in header

326 header = header.split(">")[1].split()[0]

327

328 for genome in all_genomes:

329 if header.startswith(genome):

330 # header should start with the genome name. Nothing before it.

331 # Ex: >86KG_12345 is from genome 86KG. >6KG_12345 is from genome 6KG, not 86KG

332 return genome

333 logger.error((f"Protein {header} does not correspond to any genome name "

334 f"given... {all_genomes}"))

335 return None

Coverage for PanACoTA/align_module/post_align.py: 100%

119 statements