Coverage for PanACoTA/align_module/alignment.py: 100%
223 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37For a given family:
39- align its proteins using mafft
40- back translate to nucleotides
41- add "-" of same size as alignment for genomes not having members in the family
43@author: GEM, Institut Pasteur
44March 2017
45"""
47import os
48import sys
49import logging
50import multiprocessing
51import progressbar
52import threading
54from PanACoTA import utils
56main_logger = logging.getLogger("align.alignment")
59def align_all_families(prefix, all_fams, ngenomes, dname, quiet, threads):
60 """
61 For each family:
63 - align all its proteins with mafft
64 - back-translate to nucleotides
65 - add missing genomes
67 Parameters
68 ----------
69 prefix : str
70 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files
71 easily)
72 all_fams : []
73 list of all family numbers
74 ngenomes : int
75 total number of genomes in dataset
76 dname : str
77 name of dataset (used to name concat and grouped files, as well as tree folder)
78 quiet : bool
79 True if nothing must be written in stdout/stderr, False otherwise
80 threads : int
81 max number of threads that can be used by mafft
83 Returns
84 -------
85 bool
86 True if everything went well, False if there was a problem in at least 1 family.
87 """
88 main_logger.info(("Starting alignment of all families: protein alignment, "
89 "back-translation to nucleotides, and add missing genomes in the family"))
90 nbfam = len(all_fams)
91 bar = None
92 if not quiet:
93 # Create progressbar
94 widgets = ['Alignment: ', progressbar.Bar(marker='█', left='', right='', fill=' '),
95 ' ', progressbar.Counter(), "/{}".format(nbfam), ' (',
96 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
97 ]
98 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfam,
99 term_width=79).start()
100 final = []
101 if threads == 1:
102 update_bar = 1
103 for num_fam in all_fams:
104 f = handle_family_1thread((prefix, num_fam, ngenomes))
105 final.append(f)
106 bar.update(update_bar)
107 update_bar+=1
109 else:
110 pool = multiprocessing.Pool(threads)
112 # Create a Queue to put logs from processes, and handle them after from a single thread
113 m = multiprocessing.Manager()
114 q = m.Queue()
115 # arguments : (gpath, cores_prokka, name, force, nbcont, q) for each genome
116 arguments = [(prefix, num_fam, ngenomes, q) for num_fam in all_fams]
117 try:
118 final = pool.map_async(handle_family, arguments, chunksize=1)
119 pool.close()
120 # Listen for logs in processes
121 lp = threading.Thread(target=utils.logger_thread, args=(q,))
122 lp.start()
123 if not quiet:
124 while True:
125 if final.ready():
126 break
127 remaining = final._number_left
128 bar.update(nbfam - remaining)
129 bar.finish()
130 pool.join()
131 q.put(None)
132 lp.join()
133 final = final.get()
134 # If an error occurs (or user kills with keybord), terminate pool and exit
135 except Exception as excp: # pragma: no cover
136 pool.terminate()
137 main_logger.error(excp)
138 sys.exit(1)
139 # We re-aligned (or added missing genomes) at least one family
140 # -> remove concatenated files and groupby files (if they exist)
141 if set(final) != {"OK"}:
142 aldir = os.path.split(prefix)[0]
143 concat_nucl = os.path.join(aldir, f"{dname}-complete.nucl.cat.aln")
144 concat_aa = os.path.join(aldir, f"{dname}-complete.aa.cat.aln")
145 outdir = os.path.split(aldir)[0]
146 treedir = os.path.join(outdir, "Phylo-" + dname)
147 grpfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
148 grpfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
149 utils.remove(concat_nucl)
150 utils.remove(concat_aa)
151 utils.remove(grpfile_nucl)
152 utils.remove(grpfile_aa)
153 return False not in final
156def handle_family_1thread(args):
157 """
158 For the given family:
160 - align its proteins with mafft
161 - back-translate to nucleotides
162 - add missing genomes
164 Parameters
165 ----------
166 args : ()
167 (prefix, num_fam, ngenomes, q) with:
169 - prefix: path to ``aldir/<name of dataset>``
170 - num_fam: the current family number
171 - ngenomes: the total number of genomes in dataset
173 Returns
174 -------
175 bool
176 - "OK" if the files were not re-created, and have the expected format. This is used by
177 ``align_all_families`` function, to know if something was regenerated, or if everything
178 already existed with the expected format. If something was redone and concat/group files
179 exist, it removes them.
180 - False if any problem (extractions, alignment, btr, add missing genomes...)
181 - True if just generated all files, and everything is ok
182 """
183 prefix, num_fam, ngenomes = args
184 logger = logging.getLogger('align.align_family')
185 # Get file names
186 prt_file = f"{prefix}-current.{num_fam}.prt"
187 gen_file = f"{prefix}-current.{num_fam}.gen"
188 miss_file = f"{prefix}-current.{num_fam}.miss.lst"
189 mafft_file = f"{prefix}-mafft-align.{num_fam}.aln"
190 btr_file = f"{prefix}-mafft-prt2nuc.{num_fam}.aln"
191 # Align all sequences for given family
192 status1 = family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,
193 num_fam, ngenomes, logger)
194 # status1 is:
195 # - False if problem with extractions, alignment or backtranslation -> return False
196 # - 'nb_seqs' = number of sequences aligned if everything went well (extractions and
197 # alignment ok, btr created without problem)
198 # - "OK" if extractions and alignments went well, and btr already exists and is ok
199 # If it returned true or the , Add missing genomes
200 # If 'OK' or nb_seq, add missing genomes
201 if status1:
202 added_aa = add_missing_genomes(mafft_file, "protein", miss_file, num_fam, ngenomes, status1, logger)
203 added_nucl = add_missing_genomes(btr_file, "back-translated", miss_file, num_fam, ngenomes, status1, logger)
204 # 1 of them false: return false
205 # both are "OK": return OK (no need to remove concatenated and grouped files)
206 # 1 of them true (not "OK"): return true
207 if not added_nucl or not added_aa:
208 return False
209 elif added_aa == "OK" and added_nucl == "OK":
210 return "OK"
211 else:
212 return True
213 # Else, return False (there was a problem while trying to align, in any step)
214 return False
217def handle_family(args):
218 """
219 For the given family:
221 - align its proteins with mafft
222 - back-translate to nucleotides
223 - add missing genomes
225 Parameters
226 ----------
227 args : ()
228 (prefix, num_fam, ngenomes, q) with:
230 - prefix: path to ``aldir/<name of dataset>``
231 - num_fam: the current family number
232 - ngenomes: the total number of genomes in dataset
233 - q: a queue, which will be used by logger to put logs while in other process
235 Returns
236 -------
237 bool
238 - "OK" if the files were not re-created, and have the expected format. This is used by
239 ``align_all_families`` function, to know if something was regenerated, or if everything
240 already existed with the expected format. If something was redone and concat/group files
241 exist, it removes them.
242 - False if any problem (extractions, alignment, btr, add missing genomes...)
243 - True if just generated all files, and everything is ok
244 """
245 prefix, num_fam, ngenomes, q = args
246 qh = logging.handlers.QueueHandler(q)
247 root = logging.getLogger()
248 root.setLevel(logging.DEBUG)
249 root.handlers = []
250 logging.addLevelName(utils.detail_lvl(), "DETAIL")
251 root.addHandler(qh)
252 logger = logging.getLogger('align.align_family')
253 return handle_family_1thread((prefix, num_fam, ngenomes))
256def add_missing_genomes(align_file, ali_type, miss_file, num_fam, ngenomes, status1, logger):
257 """
258 Once all family proteins are aligned, and back-translated to nucleotides,
259 add missing genomes for the family to the alignment with '-'.
260 (Add missing genomes to both mafft alignment and back-translated alignment)
262 Parameters
263 ----------
264 align_file : str
265 path to file containing alignments (proteins if from mafft output,
266 or nucleic sequences if after backtranslating them)
267 ali_type : str
268 protein or backtranslated
269 miss_file : str
270 path to file containing the list of missing genomes in this family
271 num_fam : int
272 family number
273 ngenomes : int
274 total number of genomes in dataset
275 status1 : bool or str
276 - "OK" if we did not redo the alignments as they already were as expected. In that case,
277 if missing genomes are already present, just add a warning message saying that we
278 used the already existing btr file.
279 - True if we just did the alignments and backtranslate. So no warning message needed.
280 - False if problem with extraction, alignment or backtranslation (will never happen as
281 this function is not called if status1 == False)
282 logger : logging.Logger
283 the logger, having a queue Handler, to give logs to the main logger
285 Returns
286 -------
287 bool or str
288 - "OK" if btr file was not recreated, and already has the right number of sequences,
289 and all with the same length.
290 - False if problem in btr file alignment, so missing genomes not added
291 - True if alignment + adding missing genomes is ok. Can happen if there is no missing
292 genome for this family (in that case, btr generated already has the right number of
293 sequences), or if we just added the missing genomes.
295 """
296 # btr_file should always exist.
297 # Sometimes it comes from previous step ('missing genomes' are missing)
298 # Sometimes it comes from a previous run (all genomes should be here)
299 status = check_add_missing(align_file, num_fam, ngenomes, logger, prev=True)
300 # If btr_file has the correct number of sequences, all the same length, return True
301 if status is True:
302 if status1 == "OK":
303 logger.warning(f"{ali_type} alignment already done for family {num_fam}. The program will use "
304 "it for next steps")
305 return "OK"
306 else:
307 return True
308 # If btr_files has problem in alignment (not all sequences with same size)
309 elif status is False:
310 return False
311 # All sequences have same length but some genomes are missing -> Add missing genomes
312 # status is length of sequence (if it was True or False, it already ended this function)
313 logger.log(utils.detail_lvl(), f"Adding missing genomes for family {num_fam} in {ali_type} alignment.")
314 len_aln = status
315 with open(miss_file, "r") as missf, open(align_file, "a") as alif:
316 for genome in missf:
317 genome = genome.strip()
318 toadd = ">" + genome + "\n" + "-" * len_aln + "\n"
319 alif.write(toadd)
320 # check_add_missing called with prev=False :
321 # output is True if all ok, or False if problems. Cannot be sequence length (as it can be with prev=True)
322 ret = check_add_missing(align_file, num_fam, ngenomes, logger, prev=False)
323 return ret
326def check_add_missing(btr_file, num_fam, ngenomes, logger, prev):
327 """
328 Check back-translated alignment while missing genomes have been added
330 Parameters
331 ----------
332 btr_file : str
333 path to file containing back-translated alignment
334 num_fam : int
335 current family number
336 ngenomes : int
337 total number of genomes in dataset
338 logger : logging.Logger
339 logger with queueHandler to give logs to main logger
340 prev : bool
341 True if we are checking alignments before adding missing genomes (in case it comes
342 from a previous run and is already done, or in case there are no missing genomes for
343 this family, so nothing to do to btr file). In that case, do not write error message
344 if the number of sequences does not correspond to the total number of genomes.
345 False if we just added missing genomes. In that case, nb sequences should be equal to
346 the total number of genomes. If not, write error message.
348 Returns
349 -------
350 bool or int
351 - True if right number of sequences in btr file and all the same length (everything is ok)
352 - False if problem in alignment (sequences do not all have the same size), or same size but not all
353 genomes while not being from previous run (prev=False)
354 - alignment length if sequences aligned all have the same length, but missing
355 genomes are not added yet (so, will have to add lines with this number of '-')
356 """
357 res = check_lens(btr_file, num_fam, logger)
358 # If all sequences have the same length, res = (length of a sequence, number of sequences in btr file)
359 if res:
360 len_aln, nb = res
361 # otherwise, return False: problem while aligning or back-translating
362 else:
363 return False
364 # If number of sequences in btr file is not the same as the total number of genomes, either:
365 # - btr comes from a previous run, but was not complete (prev)
366 # - we just created btr file, and there are missing genomes in this family.
367 # return sequence length, so that we add missing genomes
368 if nb != ngenomes:
369 # prev = true: this function was called at the beggining of 'add_missing_genomes',
370 # so it's ok if there are not all sequences, they will be added just after. We
371 # just checked in case this is a file already complete from a previous run, or
372 # in case there is no missing genome in this family.
373 # prev = false: called after adding missing genomes. So, error message, as it should
374 # contain all expected genomes.
375 if not prev:
376 logger.error((f"ERROR: family {num_fam} contains {nb} genomes in total "
377 f"instead of the {ngenomes} genomes in input.\n"))
378 return False
379 # If not already has all sequences, return sequence length to add the missing ones
380 return len_aln
381 # Everything ok (nb genomes ok and all same length)
382 return True
385def family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,
386 num_fam, ngenomes, logger):
387 """
388 From a given family, align all its proteins with mafft, back-translate
389 to nucleotides, and add missing genomes in this family.
391 Parameters
392 ----------
393 prt_file : str
394 path to file containing proteins extracted
395 gen_file : str
396 path to file containing genes extracted
397 miss_file : str
398 path to file containing list of genomes missing
399 mafft_file : str
400 path to file which will contain the protein alignment
401 btr_file : str
402 path to file which will contain the nucleotide alignment back-translated from protein
403 alignment
404 num_fam : int
405 current family number
406 ngenomes : int
407 total number of genomes in dataset
408 logger : logging.Logger
409 logger with queueHandler to give logs to main logger
411 Returns
412 -------
413 bool or str
414 - False if problem with extractions or with alignment or with backtranslation
415 - 'nb_seqs' = number of sequences aligned if everything went well (extractions and
416 alignment ok, btr created without problem)
417 - "OK" if extractions and alignments went well, and btr already exists and is ok
418 """
419 # Check number of proteins extracted
420 # =check that nb_prt (or nb_gen, which should be the same) + nb_miss = nb_genomes
421 # returns number of genomes extracted (so, excludes missing genomes for the family)
422 nbfprt = check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger)
423 nbfal = None
424 # If problem with extractions (0 proteins extracted), remove mafft and btr files if they exist, so that they will
425 # be regenerated
426 if not nbfprt:
427 utils.remove(mafft_file)
428 utils.remove(btr_file)
429 return False
430 # If mafft file already exists, check the number of proteins aligned corresponds to number of
431 # proteins extracted. If not, remove mafft and btr files.
432 if os.path.isfile(mafft_file):
433 # There can be nbfprt (number of proteins extracted)
434 # or nb_genomes (proteins extracted + missing added with '-')
435 nbfal1 = check_nb_seqs(mafft_file, nbfprt, logger, "")
436 nbfal2 = check_nb_seqs(mafft_file, ngenomes, logger, "")
437 # if nbfal1: missing genomes have not been added yet. Save this value for later
438 if nbfal1:
439 nbfal = nbfal1
440 # if nbfal2: missing genomes already there, save for later
441 elif nbfal2:
442 nbfal = nbfal2
443 # If not any of those 2 numbers: error
444 else:
445 message = (f"fam {num_fam}: Will redo alignment, because found a different number of proteins "
446 f"extracted in {prt_file} ({nbfprt}) and proteins aligned in "
447 f"existing {mafft_file}")
448 logger.error(message)
449 os.remove(mafft_file)
450 utils.remove(btr_file)
451 # If mafft file does not exist (removed because problem in its alignment, or just not generated
452 # yet), remove btr (will be regenerated), and do alignment with mafft
453 if not os.path.isfile(mafft_file):
454 utils.remove(btr_file) # remove if exists...
455 nbfal = mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger)
456 # If problem with alignment, return False
457 if not nbfal:
458 return False
459 # If btr file already exists, means that it was already done before, and not removed because
460 # extractions and mafft files are ok. So, return True, saying that btr file is done,
461 # next step will be to check it, add missing genomes etc.
462 if os.path.isfile(btr_file):
463 message = (f"fam {num_fam}: Will redo back-translation, because found a different number of "
464 f"proteins aligned in {mafft_file} ({nbfal}) and genes back-translated in "
465 f"existing {btr_file}")
466 # btr file contains either nbfal entries (number of proteins extracted) if it was not completed
467 # with missing genomes, or ngenomes if it was completed. If it is not the case, remove it
468 # (will be regenerated)
469 res = check_nb_seqs(btr_file, [nbfal, ngenomes], logger, message)
470 if not res:
471 utils.remove(btr_file)
472 else:
473 return "OK"
474 # If btr file does not exist (removed because problem with mafft generated before,
475 # or just not generated yet), do back-translation, and return:
476 # - number of sequences back-translated if it went well,
477 # - False otherwise
478 return back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger)
481def check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger):
482 """
483 Check that extractions went well for the given family:
485 - check number of proteins and genes extracted compared to the
486 number of genomes
488 Parameters
489 ----------
490 num_fam : int
491 current family number
492 miss_file : str
493 path to file containing the list of genomes missing for the current family
494 prt_file : str
495 path to file containing all proteins extracted
496 gen_file : str
497 path to file containing all genes extracted
498 ngenomes : int
499 total number of genomes in dataset
500 logger : logging.Logger
501 logger with queueHandler to give logs to main logger
503 Returns
504 -------
505 bool or int
506 False if any problem (nbmiss+prt != nbgenomes or nbmiss+gen != nbgenomes). If no
507 problem, returns the number of proteins/genes extracted
508 """
509 logger.log(utils.detail_lvl(), f"Checking extractions for family {num_fam}")
511 # Check that extractions went well
512 nbmiss = utils.count(miss_file)
513 # If files with proteins extracted do not even exist, close with error
514 # (they should have been created at the previous step)
515 if not os.path.isfile(gen_file):
516 logger.error(f"fam {num_fam}: no file with genes extracted "
517 f"('{gen_file}'). Cannot align.")
518 sys.exit(1)
519 if not os.path.isfile(prt_file):
520 logger.error(f"fam {num_fam}: no file with proteins extracted "
521 f"('{prt_file}'). Cannot align.")
522 sys.exit(1)
523 nbfprt = utils.grep(prt_file, "^>", counts=True)
524 nbfgen = utils.grep(gen_file, "^>", counts=True)
525 if nbmiss + nbfprt != ngenomes:
526 logger.error(("fam {}: wrong sum of missing genomes ({}) and prt extracted ({}) for {} "
527 "genomes in the dataset.").format(num_fam, nbmiss, nbfprt, ngenomes))
528 return False
529 if nbmiss + nbfgen != ngenomes:
530 logger.error(("fam {}: wrong sum of missing genomes ({}) and gen extracted ({}) for {} "
531 "genomes in the dataset.").format(num_fam, nbmiss, nbfgen, ngenomes))
532 return False
533 return nbfprt
536def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
537 """
538 Align all proteins of the given family with mafft
540 Parameters
541 ----------
542 num_fam : int
543 current family number
544 prt_file : str
545 path to file containing all proteins extracted
546 mafft_file : str
547 path to file which will contain proteins alignment
548 nbfprt : int
549 number of proteins extracted in prt file
550 logger : logging.Logger
551 logger with queueHandler to give logs to main logger
553 Returns
554 -------
555 bool
556 True if no problem (alignment ok, same number of proteins extracted and aligned),
557 False otherwise
558 """
559 logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
560 cmd = f"mafft --auto --amino {prt_file}"
561 error = f"Problem while trying to align fam {num_fam}"
562 stdout = open(mafft_file, "w")
563 stderr = open(mafft_file + ".log", "w")
564 logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
565 ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)
566 stdout.close()
567 if not isinstance(ret, int):
568 ret = ret.returncode
569 if ret != 0:
570 os.remove(mafft_file)
571 return False
572 message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
573 f"aligned in {mafft_file}")
574 return check_nb_seqs(mafft_file, nbfprt, logger, message)
577def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
578 """
579 Backtranslate protein alignment to nucleotides
581 Parameters
582 ----------
583 num_fam : int
584 current family number. Used for log messages
585 mafft_file : str
586 path to file containing protein alignments by mafft
587 gen_file : str
588 path to file containing all sequences, not aligned, in nucleotides. It is used to
589 convert the alignment in proteins into a nucleotide alignment
590 btr_file : str
591 path to the file that will contain the nucleotide alignment
592 nbfal : int
593 number of sequences aligned for the family by mafft
594 logger : logging.Logger
595 logger with queueHandler to give logs to main logger
597 Returns
598 -------
599 bool
600 - False if problem (back-translation, different number of families...)
601 - number of sequences in btr file if everything went well
602 """
603 logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
604 curpath = os.path.dirname(os.path.abspath(__file__))
605 awk_script = os.path.join(curpath, "prt2codon.awk")
606 cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
607 stdout = open(btr_file, "w")
608 error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"
609 ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)
610 stdout.close()
611 if not isinstance(ret, int):
612 ret = ret.returncode
613 if ret != 0:
614 os.remove(btr_file)
615 return False
616 message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
617 f"back-translated in {btr_file}")
618 # Check number of sequences in btr file, and return True/False according to it
619 # It should contain the same number of sequences as the mafft file.
620 return check_nb_seqs(mafft_file, nbfal, logger, message)
623def check_nb_seqs(alnfile, nbfal, logger, message=""):
624 """
625 Check the number of sequences in the given alignment file
627 Parameters
628 ----------
629 alnfile : str
630 path to alignment file
631 nbfal : int or []
632 expected number of sequences, or list of expected numbers of sequences
633 logger : logging.Logger
634 logger with queueHandler to give logs to main logger
635 message : str
636 message to write when sequence numbers do not match
638 Returns
639 -------
640 bool or int
641 - False if not same number of sequences
642 - nbseqs in align file if found among values in 'nbfal'
643 """
644 nbseqs = utils.grep(alnfile, "^>", counts=True)
645 if isinstance(nbfal, int):
646 nbfal = [nbfal]
647 for num in nbfal:
648 if nbseqs == num:
649 return nbseqs
650 if message:
651 logger.error(f"{message} ({nbseqs})")
652 return False
655def check_lens(aln_file, num_fam, logger):
656 """
657 In the given alignment file, check that all sequences have the same length.
658 If there is no problem, it returns the length of alignment, and the number
659 of sequences aligned.
661 Parameters
662 ----------
663 aln_file : str
664 path to the alignment file to check
665 num_fam : int
666 current family number. Used for log message if problem
667 logger : logging.Logger
668 logger having a queue Handler to give logs to the main logger in the main process
670 Returns
671 -------
672 bool or tuple
673 False if there is a problem in the alignment (sequences do not all have the
674 same length).
675 If they all have the same length, returns this length and the number of sequences
676 """
677 nb_gen = 0
678 all_sums = set()
679 with open(aln_file, "r") as btrf:
680 cur_sum = 0
681 start = True
682 for line in btrf:
683 if line.startswith(">"):
684 nb_gen += 1
685 if not start:
686 all_sums.add(cur_sum)
687 cur_sum = 0
688 else:
689 start = False
690 cur_sum += len(line.strip())
691 all_sums.add(cur_sum)
692 if len(all_sums) > 1:
693 logger.error(f"Nucleic alignments for family {num_fam} (in {aln_file}) do not all have the same "
694 f"length. Lengths found are: {all_sums}\n")
695 return False
696 # Return sequence length and number of sequences in alignment file
697 return list(all_sums)[0], nb_gen