Coverage for PanACoTA/align_module/alignment.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37For a given family:

39- align its proteins using mafft

40- back translate to nucleotides

41- add "-" of same size as alignment for genomes not having members in the family

43@author: GEM, Institut Pasteur

44March 2017

45"""

47import os

48import sys

49import logging

50import multiprocessing

51import progressbar

52import threading

54from PanACoTA import utils

56main_logger = logging.getLogger("align.alignment")

59def align_all_families(prefix, all_fams, ngenomes, dname, quiet, threads):

60 """

61 For each family:

63 - align all its proteins with mafft

64 - back-translate to nucleotides

65 - add missing genomes

67 Parameters

68 ----------

69 prefix : str

70 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files

71 easily)

72 all_fams : []

73 list of all family numbers

74 ngenomes : int

75 total number of genomes in dataset

76 dname : str

77 name of dataset (used to name concat and grouped files, as well as tree folder)

78 quiet : bool

79 True if nothing must be written in stdout/stderr, False otherwise

80 threads : int

81 max number of threads that can be used by mafft

83 Returns

84 -------

85 bool

86 True if everything went well, False if there was a problem in at least 1 family.

87 """

88 main_logger.info(("Starting alignment of all families: protein alignment, "

89 "back-translation to nucleotides, and add missing genomes in the family"))

90 nbfam = len(all_fams)

91 bar = None

92 if not quiet:

93 # Create progressbar

94 widgets = ['Alignment: ', progressbar.Bar(marker='█', left='', right='', fill=' '),

95 ' ', progressbar.Counter(), "/{}".format(nbfam), ' (',

96 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '

97 ]

98 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfam,

99 term_width=79).start()

100 final = []

101 if threads == 1:

102 update_bar = 1

103 for num_fam in all_fams:

104 f = handle_family_1thread((prefix, num_fam, ngenomes))

105 final.append(f)

106 bar.update(update_bar)

107 update_bar+=1

108

109 else:

110 pool = multiprocessing.Pool(threads)

111

112 # Create a Queue to put logs from processes, and handle them after from a single thread

113 m = multiprocessing.Manager()

114 q = m.Queue()

115 # arguments : (gpath, cores_prokka, name, force, nbcont, q) for each genome

116 arguments = [(prefix, num_fam, ngenomes, q) for num_fam in all_fams]

117 try:

118 final = pool.map_async(handle_family, arguments, chunksize=1)

119 pool.close()

120 # Listen for logs in processes

121 lp = threading.Thread(target=utils.logger_thread, args=(q,))

122 lp.start()

123 if not quiet:

124 while True:

125 if final.ready():

126 break

127 remaining = final._number_left

128 bar.update(nbfam - remaining)

129 bar.finish()

130 pool.join()

131 q.put(None)

132 lp.join()

133 final = final.get()

134 # If an error occurs (or user kills with keybord), terminate pool and exit

135 except Exception as excp: # pragma: no cover

136 pool.terminate()

137 main_logger.error(excp)

138 sys.exit(1)

139 # We re-aligned (or added missing genomes) at least one family

140 # -> remove concatenated files and groupby files (if they exist)

141 if set(final) != {"OK"}:

142 aldir = os.path.split(prefix)[0]

143 concat_nucl = os.path.join(aldir, f"{dname}-complete.nucl.cat.aln")

144 concat_aa = os.path.join(aldir, f"{dname}-complete.aa.cat.aln")

145 outdir = os.path.split(aldir)[0]

146 treedir = os.path.join(outdir, "Phylo-" + dname)

147 grpfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")

148 grpfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")

149 utils.remove(concat_nucl)

150 utils.remove(concat_aa)

151 utils.remove(grpfile_nucl)

152 utils.remove(grpfile_aa)

153 return False not in final

154

155

156def handle_family_1thread(args):

157 """

158 For the given family:

159

160 - align its proteins with mafft

161 - back-translate to nucleotides

162 - add missing genomes

163

164 Parameters

165 ----------

166 args : ()

167 (prefix, num_fam, ngenomes, q) with:

168

169 - prefix: path to ``aldir/<name of dataset>``

170 - num_fam: the current family number

171 - ngenomes: the total number of genomes in dataset

172

173 Returns

174 -------

175 bool

176 - "OK" if the files were not re-created, and have the expected format. This is used by

177 ``align_all_families`` function, to know if something was regenerated, or if everything

178 already existed with the expected format. If something was redone and concat/group files

179 exist, it removes them.

180 - False if any problem (extractions, alignment, btr, add missing genomes...)

181 - True if just generated all files, and everything is ok

182 """

183 prefix, num_fam, ngenomes = args

184 logger = logging.getLogger('align.align_family')

185 # Get file names

186 prt_file = f"{prefix}-current.{num_fam}.prt"

187 gen_file = f"{prefix}-current.{num_fam}.gen"

188 miss_file = f"{prefix}-current.{num_fam}.miss.lst"

189 mafft_file = f"{prefix}-mafft-align.{num_fam}.aln"

190 btr_file = f"{prefix}-mafft-prt2nuc.{num_fam}.aln"

191 # Align all sequences for given family

192 status1 = family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,

193 num_fam, ngenomes, logger)

194 # status1 is:

195 # - False if problem with extractions, alignment or backtranslation -> return False

196 # - 'nb_seqs' = number of sequences aligned if everything went well (extractions and

197 # alignment ok, btr created without problem)

198 # - "OK" if extractions and alignments went well, and btr already exists and is ok

199 # If it returned true or the , Add missing genomes

200 # If 'OK' or nb_seq, add missing genomes

201 if status1:

202 added_aa = add_missing_genomes(mafft_file, "protein", miss_file, num_fam, ngenomes, status1, logger)

203 added_nucl = add_missing_genomes(btr_file, "back-translated", miss_file, num_fam, ngenomes, status1, logger)

204 # 1 of them false: return false

205 # both are "OK": return OK (no need to remove concatenated and grouped files)

206 # 1 of them true (not "OK"): return true

207 if not added_nucl or not added_aa:

208 return False

209 elif added_aa == "OK" and added_nucl == "OK":

210 return "OK"

211 else:

212 return True

213 # Else, return False (there was a problem while trying to align, in any step)

214 return False

215

216

217def handle_family(args):

218 """

219 For the given family:

220

221 - align its proteins with mafft

222 - back-translate to nucleotides

223 - add missing genomes

224

225 Parameters

226 ----------

227 args : ()

228 (prefix, num_fam, ngenomes, q) with:

229

230 - prefix: path to ``aldir/<name of dataset>``

231 - num_fam: the current family number

232 - ngenomes: the total number of genomes in dataset

233 - q: a queue, which will be used by logger to put logs while in other process

234

235 Returns

236 -------

237 bool

238 - "OK" if the files were not re-created, and have the expected format. This is used by

239 ``align_all_families`` function, to know if something was regenerated, or if everything

240 already existed with the expected format. If something was redone and concat/group files

241 exist, it removes them.

242 - False if any problem (extractions, alignment, btr, add missing genomes...)

243 - True if just generated all files, and everything is ok

244 """

245 prefix, num_fam, ngenomes, q = args

246 qh = logging.handlers.QueueHandler(q)

247 root = logging.getLogger()

248 root.setLevel(logging.DEBUG)

249 root.handlers = []

250 logging.addLevelName(utils.detail_lvl(), "DETAIL")

251 root.addHandler(qh)

252 logger = logging.getLogger('align.align_family')

253 return handle_family_1thread((prefix, num_fam, ngenomes))

254

255

256def add_missing_genomes(align_file, ali_type, miss_file, num_fam, ngenomes, status1, logger):

257 """

258 Once all family proteins are aligned, and back-translated to nucleotides,

259 add missing genomes for the family to the alignment with '-'.

260 (Add missing genomes to both mafft alignment and back-translated alignment)

261

262 Parameters

263 ----------

264 align_file : str

265 path to file containing alignments (proteins if from mafft output,

266 or nucleic sequences if after backtranslating them)

267 ali_type : str

268 protein or backtranslated

269 miss_file : str

270 path to file containing the list of missing genomes in this family

271 num_fam : int

272 family number

273 ngenomes : int

274 total number of genomes in dataset

275 status1 : bool or str

276 - "OK" if we did not redo the alignments as they already were as expected. In that case,

277 if missing genomes are already present, just add a warning message saying that we

278 used the already existing btr file.

279 - True if we just did the alignments and backtranslate. So no warning message needed.

280 - False if problem with extraction, alignment or backtranslation (will never happen as

281 this function is not called if status1 == False)

282 logger : logging.Logger

283 the logger, having a queue Handler, to give logs to the main logger

284

285 Returns

286 -------

287 bool or str

288 - "OK" if btr file was not recreated, and already has the right number of sequences,

289 and all with the same length.

290 - False if problem in btr file alignment, so missing genomes not added

291 - True if alignment + adding missing genomes is ok. Can happen if there is no missing

292 genome for this family (in that case, btr generated already has the right number of

293 sequences), or if we just added the missing genomes.

294

295 """

296 # btr_file should always exist.

297 # Sometimes it comes from previous step ('missing genomes' are missing)

298 # Sometimes it comes from a previous run (all genomes should be here)

299 status = check_add_missing(align_file, num_fam, ngenomes, logger, prev=True)

300 # If btr_file has the correct number of sequences, all the same length, return True

301 if status is True:

302 if status1 == "OK":

303 logger.warning(f"{ali_type} alignment already done for family {num_fam}. The program will use "

304 "it for next steps")

305 return "OK"

306 else:

307 return True

308 # If btr_files has problem in alignment (not all sequences with same size)

309 elif status is False:

310 return False

311 # All sequences have same length but some genomes are missing -> Add missing genomes

312 # status is length of sequence (if it was True or False, it already ended this function)

313 logger.log(utils.detail_lvl(), f"Adding missing genomes for family {num_fam} in {ali_type} alignment.")

314 len_aln = status

315 with open(miss_file, "r") as missf, open(align_file, "a") as alif:

316 for genome in missf:

317 genome = genome.strip()

318 toadd = ">" + genome + "\n" + "-" * len_aln + "\n"

319 alif.write(toadd)

320 # check_add_missing called with prev=False :

321 # output is True if all ok, or False if problems. Cannot be sequence length (as it can be with prev=True)

322 ret = check_add_missing(align_file, num_fam, ngenomes, logger, prev=False)

323 return ret

324

325

326def check_add_missing(btr_file, num_fam, ngenomes, logger, prev):

327 """

328 Check back-translated alignment while missing genomes have been added

329

330 Parameters

331 ----------

332 btr_file : str

333 path to file containing back-translated alignment

334 num_fam : int

335 current family number

336 ngenomes : int

337 total number of genomes in dataset

338 logger : logging.Logger

339 logger with queueHandler to give logs to main logger

340 prev : bool

341 True if we are checking alignments before adding missing genomes (in case it comes

342 from a previous run and is already done, or in case there are no missing genomes for

343 this family, so nothing to do to btr file). In that case, do not write error message

344 if the number of sequences does not correspond to the total number of genomes.

345 False if we just added missing genomes. In that case, nb sequences should be equal to

346 the total number of genomes. If not, write error message.

347

348 Returns

349 -------

350 bool or int

351 - True if right number of sequences in btr file and all the same length (everything is ok)

352 - False if problem in alignment (sequences do not all have the same size), or same size but not all

353 genomes while not being from previous run (prev=False)

354 - alignment length if sequences aligned all have the same length, but missing

355 genomes are not added yet (so, will have to add lines with this number of '-')

356 """

357 res = check_lens(btr_file, num_fam, logger)

358 # If all sequences have the same length, res = (length of a sequence, number of sequences in btr file)

359 if res:

360 len_aln, nb = res

361 # otherwise, return False: problem while aligning or back-translating

362 else:

363 return False

364 # If number of sequences in btr file is not the same as the total number of genomes, either:

365 # - btr comes from a previous run, but was not complete (prev)

366 # - we just created btr file, and there are missing genomes in this family.

367 # return sequence length, so that we add missing genomes

368 if nb != ngenomes:

369 # prev = true: this function was called at the beggining of 'add_missing_genomes',

370 # so it's ok if there are not all sequences, they will be added just after. We

371 # just checked in case this is a file already complete from a previous run, or

372 # in case there is no missing genome in this family.

373 # prev = false: called after adding missing genomes. So, error message, as it should

374 # contain all expected genomes.

375 if not prev:

376 logger.error((f"ERROR: family {num_fam} contains {nb} genomes in total "

377 f"instead of the {ngenomes} genomes in input.\n"))

378 return False

379 # If not already has all sequences, return sequence length to add the missing ones

380 return len_aln

381 # Everything ok (nb genomes ok and all same length)

382 return True

383

384

385def family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,

386 num_fam, ngenomes, logger):

387 """

388 From a given family, align all its proteins with mafft, back-translate

389 to nucleotides, and add missing genomes in this family.

390

391 Parameters

392 ----------

393 prt_file : str

394 path to file containing proteins extracted

395 gen_file : str

396 path to file containing genes extracted

397 miss_file : str

398 path to file containing list of genomes missing

399 mafft_file : str

400 path to file which will contain the protein alignment

401 btr_file : str

402 path to file which will contain the nucleotide alignment back-translated from protein

403 alignment

404 num_fam : int

405 current family number

406 ngenomes : int

407 total number of genomes in dataset

408 logger : logging.Logger

409 logger with queueHandler to give logs to main logger

410

411 Returns

412 -------

413 bool or str

414 - False if problem with extractions or with alignment or with backtranslation

415 - 'nb_seqs' = number of sequences aligned if everything went well (extractions and

416 alignment ok, btr created without problem)

417 - "OK" if extractions and alignments went well, and btr already exists and is ok

418 """

419 # Check number of proteins extracted

420 # =check that nb_prt (or nb_gen, which should be the same) + nb_miss = nb_genomes

421 # returns number of genomes extracted (so, excludes missing genomes for the family)

422 nbfprt = check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger)

423 nbfal = None

424 # If problem with extractions (0 proteins extracted), remove mafft and btr files if they exist, so that they will

425 # be regenerated

426 if not nbfprt:

427 utils.remove(mafft_file)

428 utils.remove(btr_file)

429 return False

430 # If mafft file already exists, check the number of proteins aligned corresponds to number of

431 # proteins extracted. If not, remove mafft and btr files.

432 if os.path.isfile(mafft_file):

433 # There can be nbfprt (number of proteins extracted)

434 # or nb_genomes (proteins extracted + missing added with '-')

435 nbfal1 = check_nb_seqs(mafft_file, nbfprt, logger, "")

436 nbfal2 = check_nb_seqs(mafft_file, ngenomes, logger, "")

437 # if nbfal1: missing genomes have not been added yet. Save this value for later

438 if nbfal1:

439 nbfal = nbfal1

440 # if nbfal2: missing genomes already there, save for later

441 elif nbfal2:

442 nbfal = nbfal2

443 # If not any of those 2 numbers: error

444 else:

445 message = (f"fam {num_fam}: Will redo alignment, because found a different number of proteins "

446 f"extracted in {prt_file} ({nbfprt}) and proteins aligned in "

447 f"existing {mafft_file}")

448 logger.error(message)

449 os.remove(mafft_file)

450 utils.remove(btr_file)

451 # If mafft file does not exist (removed because problem in its alignment, or just not generated

452 # yet), remove btr (will be regenerated), and do alignment with mafft

453 if not os.path.isfile(mafft_file):

454 utils.remove(btr_file) # remove if exists...

455 nbfal = mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger)

456 # If problem with alignment, return False

457 if not nbfal:

458 return False

459 # If btr file already exists, means that it was already done before, and not removed because

460 # extractions and mafft files are ok. So, return True, saying that btr file is done,

461 # next step will be to check it, add missing genomes etc.

462 if os.path.isfile(btr_file):

463 message = (f"fam {num_fam}: Will redo back-translation, because found a different number of "

464 f"proteins aligned in {mafft_file} ({nbfal}) and genes back-translated in "

465 f"existing {btr_file}")

466 # btr file contains either nbfal entries (number of proteins extracted) if it was not completed

467 # with missing genomes, or ngenomes if it was completed. If it is not the case, remove it

468 # (will be regenerated)

469 res = check_nb_seqs(btr_file, [nbfal, ngenomes], logger, message)

470 if not res:

471 utils.remove(btr_file)

472 else:

473 return "OK"

474 # If btr file does not exist (removed because problem with mafft generated before,

475 # or just not generated yet), do back-translation, and return:

476 # - number of sequences back-translated if it went well,

477 # - False otherwise

478 return back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger)

479

480

481def check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger):

482 """

483 Check that extractions went well for the given family:

484

485 - check number of proteins and genes extracted compared to the

486 number of genomes

487

488 Parameters

489 ----------

490 num_fam : int

491 current family number

492 miss_file : str

493 path to file containing the list of genomes missing for the current family

494 prt_file : str

495 path to file containing all proteins extracted

496 gen_file : str

497 path to file containing all genes extracted

498 ngenomes : int

499 total number of genomes in dataset

500 logger : logging.Logger

501 logger with queueHandler to give logs to main logger

502

503 Returns

504 -------

505 bool or int

506 False if any problem (nbmiss+prt != nbgenomes or nbmiss+gen != nbgenomes). If no

507 problem, returns the number of proteins/genes extracted

508 """

509 logger.log(utils.detail_lvl(), f"Checking extractions for family {num_fam}")

510

511 # Check that extractions went well

512 nbmiss = utils.count(miss_file)

513 # If files with proteins extracted do not even exist, close with error

514 # (they should have been created at the previous step)

515 if not os.path.isfile(gen_file):

516 logger.error(f"fam {num_fam}: no file with genes extracted "

517 f"('{gen_file}'). Cannot align.")

518 sys.exit(1)

519 if not os.path.isfile(prt_file):

520 logger.error(f"fam {num_fam}: no file with proteins extracted "

521 f"('{prt_file}'). Cannot align.")

522 sys.exit(1)

523 nbfprt = utils.grep(prt_file, "^>", counts=True)

524 nbfgen = utils.grep(gen_file, "^>", counts=True)

525 if nbmiss + nbfprt != ngenomes:

526 logger.error(("fam {}: wrong sum of missing genomes ({}) and prt extracted ({}) for {} "

527 "genomes in the dataset.").format(num_fam, nbmiss, nbfprt, ngenomes))

528 return False

529 if nbmiss + nbfgen != ngenomes:

530 logger.error(("fam {}: wrong sum of missing genomes ({}) and gen extracted ({}) for {} "

531 "genomes in the dataset.").format(num_fam, nbmiss, nbfgen, ngenomes))

532 return False

533 return nbfprt

534

535

536def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):

537 """

538 Align all proteins of the given family with mafft

539

540 Parameters

541 ----------

542 num_fam : int

543 current family number

544 prt_file : str

545 path to file containing all proteins extracted

546 mafft_file : str

547 path to file which will contain proteins alignment

548 nbfprt : int

549 number of proteins extracted in prt file

550 logger : logging.Logger

551 logger with queueHandler to give logs to main logger

552

553 Returns

554 -------

555 bool

556 True if no problem (alignment ok, same number of proteins extracted and aligned),

557 False otherwise

558 """

559 logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")

560 cmd = f"mafft --auto --amino {prt_file}"

561 error = f"Problem while trying to align fam {num_fam}"

562 stdout = open(mafft_file, "w")

563 stderr = open(mafft_file + ".log", "w")

564 logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")

565 ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)

566 stdout.close()

567 if not isinstance(ret, int):

568 ret = ret.returncode

569 if ret != 0:

570 os.remove(mafft_file)

571 return False

572 message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "

573 f"aligned in {mafft_file}")

574 return check_nb_seqs(mafft_file, nbfprt, logger, message)

575

576

577def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):

578 """

579 Backtranslate protein alignment to nucleotides

580

581 Parameters

582 ----------

583 num_fam : int

584 current family number. Used for log messages

585 mafft_file : str

586 path to file containing protein alignments by mafft

587 gen_file : str

588 path to file containing all sequences, not aligned, in nucleotides. It is used to

589 convert the alignment in proteins into a nucleotide alignment

590 btr_file : str

591 path to the file that will contain the nucleotide alignment

592 nbfal : int

593 number of sequences aligned for the family by mafft

594 logger : logging.Logger

595 logger with queueHandler to give logs to main logger

596

597 Returns

598 -------

599 bool

600 - False if problem (back-translation, different number of families...)

601 - number of sequences in btr file if everything went well

602 """

603 logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")

604 curpath = os.path.dirname(os.path.abspath(__file__))

605 awk_script = os.path.join(curpath, "prt2codon.awk")

606 cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"

607 stdout = open(btr_file, "w")

608 error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"

609 ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)

610 stdout.close()

611 if not isinstance(ret, int):

612 ret = ret.returncode

613 if ret != 0:

614 os.remove(btr_file)

615 return False

616 message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "

617 f"back-translated in {btr_file}")

618 # Check number of sequences in btr file, and return True/False according to it

619 # It should contain the same number of sequences as the mafft file.

620 return check_nb_seqs(mafft_file, nbfal, logger, message)

621

622

623def check_nb_seqs(alnfile, nbfal, logger, message=""):

624 """

625 Check the number of sequences in the given alignment file

626

627 Parameters

628 ----------

629 alnfile : str

630 path to alignment file

631 nbfal : int or []

632 expected number of sequences, or list of expected numbers of sequences

633 logger : logging.Logger

634 logger with queueHandler to give logs to main logger

635 message : str

636 message to write when sequence numbers do not match

637

638 Returns

639 -------

640 bool or int

641 - False if not same number of sequences

642 - nbseqs in align file if found among values in 'nbfal'

643 """

644 nbseqs = utils.grep(alnfile, "^>", counts=True)

645 if isinstance(nbfal, int):

646 nbfal = [nbfal]

647 for num in nbfal:

648 if nbseqs == num:

649 return nbseqs

650 if message:

651 logger.error(f"{message} ({nbseqs})")

652 return False

653

654

655def check_lens(aln_file, num_fam, logger):

656 """

657 In the given alignment file, check that all sequences have the same length.

658 If there is no problem, it returns the length of alignment, and the number

659 of sequences aligned.

660

661 Parameters

662 ----------

663 aln_file : str

664 path to the alignment file to check

665 num_fam : int

666 current family number. Used for log message if problem

667 logger : logging.Logger

668 logger having a queue Handler to give logs to the main logger in the main process

669

670 Returns

671 -------

672 bool or tuple

673 False if there is a problem in the alignment (sequences do not all have the

674 same length).

675 If they all have the same length, returns this length and the number of sequences

676 """

677 nb_gen = 0

678 all_sums = set()

679 with open(aln_file, "r") as btrf:

680 cur_sum = 0

681 start = True

682 for line in btrf:

683 if line.startswith(">"):

684 nb_gen += 1

685 if not start:

686 all_sums.add(cur_sum)

687 cur_sum = 0

688 else:

689 start = False

690 cur_sum += len(line.strip())

691 all_sums.add(cur_sum)

692 if len(all_sums) > 1:

693 logger.error(f"Nucleic alignments for family {num_fam} (in {aln_file}) do not all have the same "

694 f"length. Lengths found are: {all_sums}\n")

695 return False

696 # Return sequence length and number of sequences in alignment file

697 return list(all_sums)[0], nb_gen