Coverage for PanACoTA/align_module/alignment.py: 100%

223 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37For a given family: 

38 

39- align its proteins using mafft 

40- back translate to nucleotides 

41- add "-" of same size as alignment for genomes not having members in the family 

42 

43@author: GEM, Institut Pasteur 

44March 2017 

45""" 

46 

47import os 

48import sys 

49import logging 

50import multiprocessing 

51import progressbar 

52import threading 

53 

54from PanACoTA import utils 

55 

56main_logger = logging.getLogger("align.alignment") 

57 

58 

59def align_all_families(prefix, all_fams, ngenomes, dname, quiet, threads): 

60 """ 

61 For each family: 

62 

63 - align all its proteins with mafft 

64 - back-translate to nucleotides 

65 - add missing genomes 

66 

67 Parameters 

68 ---------- 

69 prefix : str 

70 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files 

71 easily) 

72 all_fams : [] 

73 list of all family numbers 

74 ngenomes : int 

75 total number of genomes in dataset 

76 dname : str 

77 name of dataset (used to name concat and grouped files, as well as tree folder) 

78 quiet : bool 

79 True if nothing must be written in stdout/stderr, False otherwise 

80 threads : int 

81 max number of threads that can be used by mafft 

82 

83 Returns 

84 ------- 

85 bool 

86 True if everything went well, False if there was a problem in at least 1 family. 

87 """ 

88 main_logger.info(("Starting alignment of all families: protein alignment, " 

89 "back-translation to nucleotides, and add missing genomes in the family")) 

90 nbfam = len(all_fams) 

91 bar = None 

92 if not quiet: 

93 # Create progressbar 

94 widgets = ['Alignment: ', progressbar.Bar(marker='█', left='', right='', fill=' '), 

95 ' ', progressbar.Counter(), "/{}".format(nbfam), ' (', 

96 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - ' 

97 ] 

98 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfam, 

99 term_width=79).start() 

100 final = [] 

101 if threads == 1: 

102 update_bar = 1 

103 for num_fam in all_fams: 

104 f = handle_family_1thread((prefix, num_fam, ngenomes)) 

105 final.append(f) 

106 bar.update(update_bar) 

107 update_bar+=1 

108 

109 else: 

110 pool = multiprocessing.Pool(threads) 

111 

112 # Create a Queue to put logs from processes, and handle them after from a single thread 

113 m = multiprocessing.Manager() 

114 q = m.Queue() 

115 # arguments : (gpath, cores_prokka, name, force, nbcont, q) for each genome 

116 arguments = [(prefix, num_fam, ngenomes, q) for num_fam in all_fams] 

117 try: 

118 final = pool.map_async(handle_family, arguments, chunksize=1) 

119 pool.close() 

120 # Listen for logs in processes 

121 lp = threading.Thread(target=utils.logger_thread, args=(q,)) 

122 lp.start() 

123 if not quiet: 

124 while True: 

125 if final.ready(): 

126 break 

127 remaining = final._number_left 

128 bar.update(nbfam - remaining) 

129 bar.finish() 

130 pool.join() 

131 q.put(None) 

132 lp.join() 

133 final = final.get() 

134 # If an error occurs (or user kills with keybord), terminate pool and exit 

135 except Exception as excp: # pragma: no cover 

136 pool.terminate() 

137 main_logger.error(excp) 

138 sys.exit(1) 

139 # We re-aligned (or added missing genomes) at least one family  

140 # -> remove concatenated files and groupby files (if they exist) 

141 if set(final) != {"OK"}: 

142 aldir = os.path.split(prefix)[0] 

143 concat_nucl = os.path.join(aldir, f"{dname}-complete.nucl.cat.aln") 

144 concat_aa = os.path.join(aldir, f"{dname}-complete.aa.cat.aln") 

145 outdir = os.path.split(aldir)[0] 

146 treedir = os.path.join(outdir, "Phylo-" + dname) 

147 grpfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln") 

148 grpfile_aa = os.path.join(treedir, dname + ".aa.grp.aln") 

149 utils.remove(concat_nucl) 

150 utils.remove(concat_aa) 

151 utils.remove(grpfile_nucl) 

152 utils.remove(grpfile_aa) 

153 return False not in final 

154 

155 

156def handle_family_1thread(args): 

157 """ 

158 For the given family: 

159 

160 - align its proteins with mafft 

161 - back-translate to nucleotides 

162 - add missing genomes 

163 

164 Parameters 

165 ---------- 

166 args : () 

167 (prefix, num_fam, ngenomes, q) with: 

168 

169 - prefix: path to ``aldir/<name of dataset>`` 

170 - num_fam: the current family number 

171 - ngenomes: the total number of genomes in dataset 

172 

173 Returns 

174 ------- 

175 bool 

176 - "OK" if the files were not re-created, and have the expected format. This is used by 

177 ``align_all_families`` function, to know if something was regenerated, or if everything 

178 already existed with the expected format. If something was redone and concat/group files 

179 exist, it removes them. 

180 - False if any problem (extractions, alignment, btr, add missing genomes...) 

181 - True if just generated all files, and everything is ok 

182 """ 

183 prefix, num_fam, ngenomes = args 

184 logger = logging.getLogger('align.align_family') 

185 # Get file names 

186 prt_file = f"{prefix}-current.{num_fam}.prt" 

187 gen_file = f"{prefix}-current.{num_fam}.gen" 

188 miss_file = f"{prefix}-current.{num_fam}.miss.lst" 

189 mafft_file = f"{prefix}-mafft-align.{num_fam}.aln" 

190 btr_file = f"{prefix}-mafft-prt2nuc.{num_fam}.aln" 

191 # Align all sequences for given family 

192 status1 = family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file, 

193 num_fam, ngenomes, logger) 

194 # status1 is: 

195 # - False if problem with extractions, alignment or backtranslation -> return False 

196 # - 'nb_seqs' = number of sequences aligned if everything went well (extractions and 

197 # alignment ok, btr created without problem) 

198 # - "OK" if extractions and alignments went well, and btr already exists and is ok 

199 # If it returned true or the , Add missing genomes 

200 # If 'OK' or nb_seq, add missing genomes 

201 if status1: 

202 added_aa = add_missing_genomes(mafft_file, "protein", miss_file, num_fam, ngenomes, status1, logger) 

203 added_nucl = add_missing_genomes(btr_file, "back-translated", miss_file, num_fam, ngenomes, status1, logger) 

204 # 1 of them false: return false 

205 # both are "OK": return OK (no need to remove concatenated and grouped files) 

206 # 1 of them true (not "OK"): return true 

207 if not added_nucl or not added_aa: 

208 return False 

209 elif added_aa == "OK" and added_nucl == "OK": 

210 return "OK" 

211 else: 

212 return True 

213 # Else, return False (there was a problem while trying to align, in any step) 

214 return False 

215 

216 

217def handle_family(args): 

218 """ 

219 For the given family: 

220 

221 - align its proteins with mafft 

222 - back-translate to nucleotides 

223 - add missing genomes 

224 

225 Parameters 

226 ---------- 

227 args : () 

228 (prefix, num_fam, ngenomes, q) with: 

229 

230 - prefix: path to ``aldir/<name of dataset>`` 

231 - num_fam: the current family number 

232 - ngenomes: the total number of genomes in dataset 

233 - q: a queue, which will be used by logger to put logs while in other process 

234 

235 Returns 

236 ------- 

237 bool 

238 - "OK" if the files were not re-created, and have the expected format. This is used by 

239 ``align_all_families`` function, to know if something was regenerated, or if everything 

240 already existed with the expected format. If something was redone and concat/group files 

241 exist, it removes them. 

242 - False if any problem (extractions, alignment, btr, add missing genomes...) 

243 - True if just generated all files, and everything is ok 

244 """ 

245 prefix, num_fam, ngenomes, q = args 

246 qh = logging.handlers.QueueHandler(q) 

247 root = logging.getLogger() 

248 root.setLevel(logging.DEBUG) 

249 root.handlers = [] 

250 logging.addLevelName(utils.detail_lvl(), "DETAIL") 

251 root.addHandler(qh) 

252 logger = logging.getLogger('align.align_family') 

253 return handle_family_1thread((prefix, num_fam, ngenomes)) 

254 

255 

256def add_missing_genomes(align_file, ali_type, miss_file, num_fam, ngenomes, status1, logger): 

257 """ 

258 Once all family proteins are aligned, and back-translated to nucleotides, 

259 add missing genomes for the family to the alignment with '-'. 

260 (Add missing genomes to both mafft alignment and back-translated alignment) 

261 

262 Parameters 

263 ---------- 

264 align_file : str 

265 path to file containing alignments (proteins if from mafft output,  

266 or nucleic sequences if after backtranslating them) 

267 ali_type : str 

268 protein or backtranslated 

269 miss_file : str 

270 path to file containing the list of missing genomes in this family 

271 num_fam : int 

272 family number 

273 ngenomes : int 

274 total number of genomes in dataset  

275 status1 : bool or str 

276 - "OK" if we did not redo the alignments as they already were as expected. In that case, 

277 if missing genomes are already present, just add a warning message saying that we 

278 used the already existing btr file. 

279 - True if we just did the alignments and backtranslate. So no warning message needed. 

280 - False if problem with extraction, alignment or backtranslation (will never happen as 

281 this function is not called if status1 == False) 

282 logger : logging.Logger 

283 the logger, having a queue Handler, to give logs to the main logger 

284 

285 Returns 

286 ------- 

287 bool or str 

288 - "OK" if btr file was not recreated, and already has the right number of sequences, 

289 and all with the same length. 

290 - False if problem in btr file alignment, so missing genomes not added  

291 - True if alignment + adding missing genomes is ok. Can happen if there is no missing 

292 genome for this family (in that case, btr generated already has the right number of 

293 sequences), or if we just added the missing genomes. 

294 

295 """ 

296 # btr_file should always exist. 

297 # Sometimes it comes from previous step ('missing genomes' are missing) 

298 # Sometimes it comes from a previous run (all genomes should be here) 

299 status = check_add_missing(align_file, num_fam, ngenomes, logger, prev=True) 

300 # If btr_file has the correct number of sequences, all the same length, return True 

301 if status is True: 

302 if status1 == "OK": 

303 logger.warning(f"{ali_type} alignment already done for family {num_fam}. The program will use " 

304 "it for next steps") 

305 return "OK" 

306 else: 

307 return True 

308 # If btr_files has problem in alignment (not all sequences with same size) 

309 elif status is False: 

310 return False 

311 # All sequences have same length but some genomes are missing -> Add missing genomes 

312 # status is length of sequence (if it was True or False, it already ended this function) 

313 logger.log(utils.detail_lvl(), f"Adding missing genomes for family {num_fam} in {ali_type} alignment.") 

314 len_aln = status 

315 with open(miss_file, "r") as missf, open(align_file, "a") as alif: 

316 for genome in missf: 

317 genome = genome.strip() 

318 toadd = ">" + genome + "\n" + "-" * len_aln + "\n" 

319 alif.write(toadd) 

320 # check_add_missing called with prev=False : 

321 # output is True if all ok, or False if problems. Cannot be sequence length (as it can be with prev=True) 

322 ret = check_add_missing(align_file, num_fam, ngenomes, logger, prev=False) 

323 return ret 

324 

325 

326def check_add_missing(btr_file, num_fam, ngenomes, logger, prev): 

327 """ 

328 Check back-translated alignment while missing genomes have been added 

329 

330 Parameters 

331 ---------- 

332 btr_file : str 

333 path to file containing back-translated alignment 

334 num_fam : int 

335 current family number 

336 ngenomes : int 

337 total number of genomes in dataset 

338 logger : logging.Logger 

339 logger with queueHandler to give logs to main logger 

340 prev : bool 

341 True if we are checking alignments before adding missing genomes (in case it comes 

342 from a previous run and is already done, or in case there are no missing genomes for 

343 this family, so nothing to do to btr file). In that case, do not write error message 

344 if the number of sequences does not correspond to the total number of genomes. 

345 False if we just added missing genomes. In that case, nb sequences should be equal to 

346 the total number of genomes. If not, write error message. 

347 

348 Returns 

349 ------- 

350 bool or int 

351 - True if right number of sequences in btr file and all the same length (everything is ok) 

352 - False if problem in alignment (sequences do not all have the same size), or same size but not all 

353 genomes while not being from previous run (prev=False) 

354 - alignment length if sequences aligned all have the same length, but missing 

355 genomes are not added yet (so, will have to add lines with this number of '-') 

356 """ 

357 res = check_lens(btr_file, num_fam, logger) 

358 # If all sequences have the same length, res = (length of a sequence, number of sequences in btr file) 

359 if res: 

360 len_aln, nb = res 

361 # otherwise, return False: problem while aligning or back-translating 

362 else: 

363 return False 

364 # If number of sequences in btr file is not the same as the total number of genomes, either: 

365 # - btr comes from a previous run, but was not complete (prev) 

366 # - we just created btr file, and there are missing genomes in this family. 

367 # return sequence length, so that we add missing genomes 

368 if nb != ngenomes: 

369 # prev = true: this function was called at the beggining of 'add_missing_genomes', 

370 # so it's ok if there are not all sequences, they will be added just after. We 

371 # just checked in case this is a file already complete from a previous run, or  

372 # in case there is no missing genome in this family. 

373 # prev = false: called after adding missing genomes. So, error message, as it should 

374 # contain all expected genomes. 

375 if not prev: 

376 logger.error((f"ERROR: family {num_fam} contains {nb} genomes in total " 

377 f"instead of the {ngenomes} genomes in input.\n")) 

378 return False 

379 # If not already has all sequences, return sequence length to add the missing ones 

380 return len_aln 

381 # Everything ok (nb genomes ok and all same length) 

382 return True 

383 

384 

385def family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file, 

386 num_fam, ngenomes, logger): 

387 """ 

388 From a given family, align all its proteins with mafft, back-translate 

389 to nucleotides, and add missing genomes in this family. 

390 

391 Parameters 

392 ---------- 

393 prt_file : str 

394 path to file containing proteins extracted 

395 gen_file : str 

396 path to file containing genes extracted 

397 miss_file : str 

398 path to file containing list of genomes missing 

399 mafft_file : str 

400 path to file which will contain the protein alignment 

401 btr_file : str 

402 path to file which will contain the nucleotide alignment back-translated from protein 

403 alignment 

404 num_fam : int 

405 current family number 

406 ngenomes : int 

407 total number of genomes in dataset 

408 logger : logging.Logger 

409 logger with queueHandler to give logs to main logger 

410 

411 Returns 

412 ------- 

413 bool or str 

414 - False if problem with extractions or with alignment or with backtranslation 

415 - 'nb_seqs' = number of sequences aligned if everything went well (extractions and 

416 alignment ok, btr created without problem) 

417 - "OK" if extractions and alignments went well, and btr already exists and is ok 

418 """ 

419 # Check number of proteins extracted 

420 # =check that nb_prt (or nb_gen, which should be the same) + nb_miss = nb_genomes 

421 # returns number of genomes extracted (so, excludes missing genomes for the family) 

422 nbfprt = check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger) 

423 nbfal = None 

424 # If problem with extractions (0 proteins extracted), remove mafft and btr files if they exist, so that they will 

425 # be regenerated 

426 if not nbfprt: 

427 utils.remove(mafft_file) 

428 utils.remove(btr_file) 

429 return False 

430 # If mafft file already exists, check the number of proteins aligned corresponds to number of 

431 # proteins extracted. If not, remove mafft and btr files. 

432 if os.path.isfile(mafft_file): 

433 # There can be nbfprt (number of proteins extracted)  

434 # or nb_genomes (proteins extracted + missing added with '-') 

435 nbfal1 = check_nb_seqs(mafft_file, nbfprt, logger, "") 

436 nbfal2 = check_nb_seqs(mafft_file, ngenomes, logger, "") 

437 # if nbfal1: missing genomes have not been added yet. Save this value for later 

438 if nbfal1: 

439 nbfal = nbfal1 

440 # if nbfal2: missing genomes already there, save for later 

441 elif nbfal2: 

442 nbfal = nbfal2 

443 # If not any of those 2 numbers: error 

444 else: 

445 message = (f"fam {num_fam}: Will redo alignment, because found a different number of proteins " 

446 f"extracted in {prt_file} ({nbfprt}) and proteins aligned in " 

447 f"existing {mafft_file}") 

448 logger.error(message) 

449 os.remove(mafft_file) 

450 utils.remove(btr_file) 

451 # If mafft file does not exist (removed because problem in its alignment, or just not generated 

452 # yet), remove btr (will be regenerated), and do alignment with mafft 

453 if not os.path.isfile(mafft_file): 

454 utils.remove(btr_file) # remove if exists... 

455 nbfal = mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger) 

456 # If problem with alignment, return False 

457 if not nbfal: 

458 return False 

459 # If btr file already exists, means that it was already done before, and not removed because 

460 # extractions and mafft files are ok. So, return True, saying that btr file is done, 

461 # next step will be to check it, add missing genomes etc. 

462 if os.path.isfile(btr_file): 

463 message = (f"fam {num_fam}: Will redo back-translation, because found a different number of " 

464 f"proteins aligned in {mafft_file} ({nbfal}) and genes back-translated in " 

465 f"existing {btr_file}") 

466 # btr file contains either nbfal entries (number of proteins extracted) if it was not completed  

467 # with missing genomes, or ngenomes if it was completed. If it is not the case, remove it 

468 # (will be regenerated) 

469 res = check_nb_seqs(btr_file, [nbfal, ngenomes], logger, message) 

470 if not res: 

471 utils.remove(btr_file) 

472 else: 

473 return "OK" 

474 # If btr file does not exist (removed because problem with mafft generated before, 

475 # or just not generated yet), do back-translation, and return: 

476 # - number of sequences back-translated if it went well, 

477 # - False otherwise 

478 return back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger) 

479 

480 

481def check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger): 

482 """ 

483 Check that extractions went well for the given family: 

484 

485 - check number of proteins and genes extracted compared to the 

486 number of genomes 

487 

488 Parameters 

489 ---------- 

490 num_fam : int 

491 current family number 

492 miss_file : str 

493 path to file containing the list of genomes missing for the current family 

494 prt_file : str 

495 path to file containing all proteins extracted 

496 gen_file : str 

497 path to file containing all genes extracted 

498 ngenomes : int 

499 total number of genomes in dataset 

500 logger : logging.Logger 

501 logger with queueHandler to give logs to main logger 

502 

503 Returns 

504 ------- 

505 bool or int 

506 False if any problem (nbmiss+prt != nbgenomes or nbmiss+gen != nbgenomes). If no 

507 problem, returns the number of proteins/genes extracted 

508 """ 

509 logger.log(utils.detail_lvl(), f"Checking extractions for family {num_fam}") 

510 

511 # Check that extractions went well 

512 nbmiss = utils.count(miss_file) 

513 # If files with proteins extracted do not even exist, close with error 

514 # (they should have been created at the previous step) 

515 if not os.path.isfile(gen_file): 

516 logger.error(f"fam {num_fam}: no file with genes extracted " 

517 f"('{gen_file}'). Cannot align.") 

518 sys.exit(1) 

519 if not os.path.isfile(prt_file): 

520 logger.error(f"fam {num_fam}: no file with proteins extracted " 

521 f"('{prt_file}'). Cannot align.") 

522 sys.exit(1) 

523 nbfprt = utils.grep(prt_file, "^>", counts=True) 

524 nbfgen = utils.grep(gen_file, "^>", counts=True) 

525 if nbmiss + nbfprt != ngenomes: 

526 logger.error(("fam {}: wrong sum of missing genomes ({}) and prt extracted ({}) for {} " 

527 "genomes in the dataset.").format(num_fam, nbmiss, nbfprt, ngenomes)) 

528 return False 

529 if nbmiss + nbfgen != ngenomes: 

530 logger.error(("fam {}: wrong sum of missing genomes ({}) and gen extracted ({}) for {} " 

531 "genomes in the dataset.").format(num_fam, nbmiss, nbfgen, ngenomes)) 

532 return False 

533 return nbfprt 

534 

535 

536def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger): 

537 """ 

538 Align all proteins of the given family with mafft 

539 

540 Parameters 

541 ---------- 

542 num_fam : int 

543 current family number 

544 prt_file : str 

545 path to file containing all proteins extracted 

546 mafft_file : str 

547 path to file which will contain proteins alignment 

548 nbfprt : int 

549 number of proteins extracted in prt file 

550 logger : logging.Logger 

551 logger with queueHandler to give logs to main logger 

552 

553 Returns 

554 ------- 

555 bool 

556 True if no problem (alignment ok, same number of proteins extracted and aligned), 

557 False otherwise 

558 """ 

559 logger.log(utils.detail_lvl(), f"Aligning family {num_fam}") 

560 cmd = f"mafft --auto --amino {prt_file}" 

561 error = f"Problem while trying to align fam {num_fam}" 

562 stdout = open(mafft_file, "w") 

563 stderr = open(mafft_file + ".log", "w") 

564 logger.log(utils.detail_lvl(), f"Mafft command: {cmd}") 

565 ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger) 

566 stdout.close() 

567 if not isinstance(ret, int): 

568 ret = ret.returncode 

569 if ret != 0: 

570 os.remove(mafft_file) 

571 return False 

572 message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins " 

573 f"aligned in {mafft_file}") 

574 return check_nb_seqs(mafft_file, nbfprt, logger, message) 

575 

576 

577def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger): 

578 """ 

579 Backtranslate protein alignment to nucleotides 

580 

581 Parameters 

582 ---------- 

583 num_fam : int 

584 current family number. Used for log messages 

585 mafft_file : str 

586 path to file containing protein alignments by mafft 

587 gen_file : str 

588 path to file containing all sequences, not aligned, in nucleotides. It is used to 

589 convert the alignment in proteins into a nucleotide alignment 

590 btr_file : str 

591 path to the file that will contain the nucleotide alignment 

592 nbfal : int 

593 number of sequences aligned for the family by mafft 

594 logger : logging.Logger 

595 logger with queueHandler to give logs to main logger 

596 

597 Returns 

598 ------- 

599 bool 

600 - False if problem (back-translation, different number of families...) 

601 - number of sequences in btr file if everything went well 

602 """ 

603 logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}") 

604 curpath = os.path.dirname(os.path.abspath(__file__)) 

605 awk_script = os.path.join(curpath, "prt2codon.awk") 

606 cmd = f"awk -f {awk_script} {mafft_file} {gen_file}" 

607 stdout = open(btr_file, "w") 

608 error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment" 

609 ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger) 

610 stdout.close() 

611 if not isinstance(ret, int): 

612 ret = ret.returncode 

613 if ret != 0: 

614 os.remove(btr_file) 

615 return False 

616 message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes " 

617 f"back-translated in {btr_file}") 

618 # Check number of sequences in btr file, and return True/False according to it 

619 # It should contain the same number of sequences as the mafft file. 

620 return check_nb_seqs(mafft_file, nbfal, logger, message) 

621 

622 

623def check_nb_seqs(alnfile, nbfal, logger, message=""): 

624 """ 

625 Check the number of sequences in the given alignment file 

626 

627 Parameters 

628 ---------- 

629 alnfile : str 

630 path to alignment file 

631 nbfal : int or [] 

632 expected number of sequences, or list of expected numbers of sequences 

633 logger : logging.Logger 

634 logger with queueHandler to give logs to main logger 

635 message : str 

636 message to write when sequence numbers do not match 

637 

638 Returns 

639 ------- 

640 bool or int 

641 - False if not same number of sequences 

642 - nbseqs in align file if found among values in 'nbfal' 

643 """ 

644 nbseqs = utils.grep(alnfile, "^>", counts=True) 

645 if isinstance(nbfal, int): 

646 nbfal = [nbfal] 

647 for num in nbfal: 

648 if nbseqs == num: 

649 return nbseqs 

650 if message: 

651 logger.error(f"{message} ({nbseqs})") 

652 return False 

653 

654 

655def check_lens(aln_file, num_fam, logger): 

656 """ 

657 In the given alignment file, check that all sequences have the same length. 

658 If there is no problem, it returns the length of alignment, and the number 

659 of sequences aligned. 

660 

661 Parameters 

662 ---------- 

663 aln_file : str 

664 path to the alignment file to check 

665 num_fam : int 

666 current family number. Used for log message if problem 

667 logger : logging.Logger 

668 logger having a queue Handler to give logs to the main logger in the main process 

669 

670 Returns 

671 ------- 

672 bool or tuple 

673 False if there is a problem in the alignment (sequences do not all have the 

674 same length). 

675 If they all have the same length, returns this length and the number of sequences 

676 """ 

677 nb_gen = 0 

678 all_sums = set() 

679 with open(aln_file, "r") as btrf: 

680 cur_sum = 0 

681 start = True 

682 for line in btrf: 

683 if line.startswith(">"): 

684 nb_gen += 1 

685 if not start: 

686 all_sums.add(cur_sum) 

687 cur_sum = 0 

688 else: 

689 start = False 

690 cur_sum += len(line.strip()) 

691 all_sums.add(cur_sum) 

692 if len(all_sums) > 1: 

693 logger.error(f"Nucleic alignments for family {num_fam} (in {aln_file}) do not all have the same " 

694 f"length. Lengths found are: {all_sums}\n") 

695 return False 

696 # Return sequence length and number of sequences in alignment file 

697 return list(all_sums)[0], nb_gen