Coverage for PanACoTA/align_module/post_align.py: 100%
119 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Concatenate all alignment files of all families. Then,
38group alignments by genome.
40@author: GEM, Institut Pasteur
41March 2017
42"""
44import os
45import sys
46import logging
47import progressbar
48import multiprocessing
49from PanACoTA import utils
51logger = logging.getLogger("align.post")
54def post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali, quiet):
55 """
56 After the alignment of all proteins by family:
58 - concatenate all alignment files
59 - group the alignment by genome
61 Parameters
62 ----------
63 fam_nums : []
64 list of family numbers
65 all_genomes : []
66 list of all genomes in dataset
67 prefix : str
68 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files easily)
69 outdir : str
70 path to output directory, containing Aldir and Listdir, and that will also contain Treedir
71 dname : str
72 name of dataset (used to name concat and grouped files, as well as tree folder)
73 prot_ali : bool
74 true: also give concatenated alignment in aa
75 quiet : bool
76 True if nothing must be sent to sdtout/stderr, False otherwise
77 """
78 all_alns_nucl, status_nucl = concat_alignments(fam_nums, prefix, "nucl", quiet)
79 treedir = os.path.join(outdir, "Phylo-" + dname)
80 os.makedirs(treedir, exist_ok=True)
81 outfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
82 res_nucl = launch_group_by_genome(all_genomes, all_alns_nucl, status_nucl, outfile_nucl, dname, "nucleic", quiet)
83 if not res_nucl:
84 utils.remove(all_alns_nucl)
85 utils.remove(outfile_nucl)
86 logger.error("An error occurred. We could not group DNA alignments by genome.")
87 sys.exit(1)
88 if prot_ali:
89 all_alns_aa, status_aa = concat_alignments(fam_nums, prefix, "aa", quiet)
90 outfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
91 res_aa = launch_group_by_genome(all_genomes, all_alns_aa, status_aa, outfile_aa, dname, "protein", quiet)
92 if not res_aa:
93 utils.remove(all_alns_aa)
94 utils.remove(outfile_aa)
95 logger.error("An error occurred. We could not group protein alignments by genome.")
96 return outfile_nucl
99def concat_alignments(fam_nums, prefix, ali_type, quiet):
100 """
101 Concatenate all family alignment files to a unique file
103 Parameters
104 ----------
105 fam_nums : []
106 list of family numbers
107 prefix : str
108 path to ``aldir/<name of dataset>-[mafft-align or mafft-prt2nuc]``
109 (used to get extraction, alignment and btr files easily)
110 ali_type : str
111 aa or nucl
112 quiet : bool
113 True if nothing must be sent to sdtout/stderr, False otherwise
115 Returns
116 -------
117 tuple
118 (output, str) with:
120 - output: path to file containing concatenation of all alignments
121 - str: "OK" if concatenation file already exists, "Done" if just did concatenation
122 """
123 if ali_type == "aa":
124 info = "mafft-align"
125 elif ali_type == "nucl":
126 info = "mafft-prt2nuc"
127 else:
128 logger.error(f"Not possible to concatenate '{ali_type}' type of alignments.")
129 sys.exit(1)
130 output = f"{prefix}-complete.{ali_type}.cat.aln"
131 if os.path.isfile(output):
132 logger.info(f"{ali_type} alignments already concatenated")
133 logger.warning(f"{ali_type} alignments already concatenated in {output}. Program will use "
134 "it for next steps. If you want to redo it, remove it before "
135 "running.")
136 return output, "OK"
137 logger.info(f"Concatenating all {ali_type} alignment files")
138 list_files = [f"{prefix}-{info}.{num_fam}.aln" for num_fam in fam_nums]
139 # Check that all files exist
140 for f in list_files:
141 if not os.path.isfile(f):
142 logger.error(f"The alignment file {f} does not exist. Please check the families you "
143 "want, and their corresponding alignment files")
144 sys.exit(1)
145 if quiet:
146 utils.cat(list_files, output)
147 else:
148 utils.cat(list_files, output, title="Concatenation")
149 return output, "Done"
152def launch_group_by_genome(all_genomes, all_alns, status, outfile, dname, type_ali, quiet):
153 """
154 Function calling group_by_genome in a pool, while giving information to user
155 (time elapsed)
157 Parameters
158 ----------
159 all_genomes : []
160 list of all genomes in the dataset
161 all_alns : str
162 path to file containing all alignments concatenated
163 status : str
164 "OK" if concatenation file already existed before running, "Done" if just did concatenation
165 outfile : str
166 file containing all families align by genome
167 dname : str
168 name of dataset
169 type_ali : str
170 nucleic or protein
171 quiet : bool
172 True if nothing must be sent to sdtout/stderr, False otherwise
174 Returns
175 -------
176 bool
177 - True if everything went well or was already done
178 - False if error occurred in at least one step
179 """
180 # Status = Done means that we just did the concatenation. So, if grouped by genome
181 # file already exists, remove it.
182 if status == "Done":
183 if os.path.isfile(outfile):
184 utils.remove(outfile)
185 # Status was not 'Done' (it was 'ok', concat file already existed). And by_genome file
186 # also already exists. Warn user
187 if os.path.isfile(outfile):
188 logger.info(f"{type_ali} alignments already grouped by genome")
189 logger.warning((f"{type_ali} alignments already grouped by genome in {outfile}. "
190 "Program will end. "))
191 return True
192 logger.info(f"Grouping {type_ali} alignments per genome")
193 bar = None
194 if not quiet:
195 widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),
196 " - ", progressbar.Timer()]
197 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)
198 pool = multiprocessing.Pool(1)
199 args = [all_genomes, all_alns, outfile]
200 final = pool.map_async(group_by_genome, [args], chunksize=1)
201 pool.close()
202 if not quiet:
203 while True:
204 if final.ready():
205 break
206 bar.update()
207 bar.finish()
208 pool.join()
209 return False not in final.get()
212def group_by_genome(args):
213 """
214 From the alignment file 'all_alns' containing all proteins, group the alignments of
215 proteins by their genome (listed in 'all_genomes'), and save the result
216 in 'treedir'
218 Parameters
219 ----------
220 args : tuple
221 - all_genomes: list of all genomes
222 - all_alns: path to file containing all alignments concatenated
223 - outfile: path to file which will contain alignments grouped by genome
225 Returns
226 -------
227 bool
228 - True if everything went well
229 - False if problem when trying to group by genomes
230 """
231 all_genomes, all_alns, outfile = args
232 sequences = read_alignments(all_alns, all_genomes)
233 if not sequences:
234 return False
235 write_groups(outfile, sequences)
236 return True
239def read_alignments(all_alns, all_genomes):
240 """
241 Read alignment file, and assign each sequence to a genome
243 Parameters
244 ----------
245 all_alns : str
246 path to file containing all alignments concatenated
247 all_genomes : []
248 list of all genomes
250 Returns
251 -------
252 dict or None
253 - {genome_name: [list of sequences for this genome]}
254 - None if problem with a protein for which we don't find the genome
255 """
256 sequences = {} # name: [ordered list of sequences]
257 genome = None
258 seq = ""
259 with open(all_alns, 'r') as alnf:
260 for line in alnf:
261 if line.startswith(">"):
262 # If new header, write previous protein name/sequence to 'sequences'
263 if genome and seq:
264 sequences[genome].append(seq)
265 seq = ""
266 # Get new genome header
267 genome = get_genome(line, all_genomes)
268 if not genome:
269 return None
270 if genome not in sequences:
271 sequences[genome] = []
272 else:
273 seq += line.strip()
274 if genome and seq:
275 sequences[genome].append(seq)
276 per_genome = [len(seq) for seq in sequences.values()]
277 if len(set(per_genome)) != 1:
278 logger.error("Problems occurred while grouping alignments by genome: all genomes "
279 "do not have the same number of sequences. Check that each protein "
280 "name contains the name of the genome from which it comes.")
281 return None
282 logger.log(utils.detail_lvl(), f"{per_genome[0]} sequences found per genome")
283 return sequences
286def write_groups(outfile, sequences):
287 """
288 Writing alignments per genome to output file.
290 Parameters
291 ----------
292 outfile : str
293 path to file that will contain alignments grouped by genome
294 sequences : dict
295 {genome_name: [list of sequences (DNA, prot...) for this genome]}
296 """
297 logger.log(utils.detail_lvl(), "Writing alignments per genome")
298 with open(outfile, "w") as outf:
299 for genome in sorted(sequences, key=utils.sort_genomes_by_name):
300 # write header for genome
301 outf.write(">" + genome + "\n")
302 # Write all sequences
303 outf.write("".join(sequences[genome]) + "\n")
306def get_genome(header, all_genomes):
307 """
308 Find to which genome belongs 'header'
310 Parameters
311 ----------
312 header : str
313 header read in alignment file
314 all_genomes : []
315 list of all genomes
317 Returns
318 -------
319 str or None
320 name of genome from which the header is
321 None if no genome found
322 """
323 # Name of protein is not always genome-name_num
324 # Ex: in gembase complete DB: >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002
325 # So, genome name cannot be deduced directly from header. But it is always included in header
326 header = header.split(">")[1].split()[0]
328 for genome in all_genomes:
329 if header.startswith(genome):
330 # header should start with the genome name. Nothing before it.
331 # Ex: >86KG_12345 is from genome 86KG. >6KG_12345 is from genome 6KG, not 86KG
332 return genome
333 logger.error((f"Protein {header} does not correspond to any genome name "
334 f"given... {all_genomes}"))
335 return None