Coverage for PanACoTA/utils.py: 100%

465 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Util functions and classes. 

38 

39 

40@author gem 

41April 2017 

42""" 

43 

44import os 

45import sys 

46import re 

47import glob 

48import subprocess 

49import shutil 

50import shlex 

51import progressbar 

52 

53# Logging 

54import logging 

55from logging.handlers import RotatingFileHandler 

56from colorlog import ColoredFormatter 

57 

58try: 

59 import cPickle as pickle 

60except: 

61 try: 

62 import _pickle as pickle 

63 except: # pragma: no cover 

64 import pickle 

65 

66 

67def init_logger(logfile_base, level, name, log_details=False, verbose=0, quiet=False): 

68 """ 

69 Create logger and its handlers, and set them to the given level 

70 

71 level hierarchy: ``CRITICAL > ERROR > WARNING > INFO > DETAILS > DEBUG`` 

72 

73 Messages from all levels are written in 'logfile'.log 

74 

75 Messages for levels less than WARNING (only INFO and DEBUG) written to stdout 

76 

77 Messages for levels equal or higher than WARNING written to stderr 

78 

79 Messages for levels equal or higher than WARNING written in `logfile`.log.err 

80 

81 

82 Parameters 

83 ---------- 

84 logfile_base : str 

85 base of filename to use for logs. Will add '.log', '.log.details' and '.log.err' for\ 

86 the 3 log files created 

87 level : int 

88 minimum level that must be considered. 

89 name : str or None 

90 if we need to name the logger (used for tests) 

91 log_details : bool 

92 if True, force creation of .log.details file. Otherwise, just create 

93 it if needed according to level 

94 

95 verbose : int 

96 be more verbose: 

97 default (0): info in stdout, error and more in stderr ; 

98 info and more in *.log ; warning and more in *.log.err 

99 1 = add warnings in stderr 

100 2 = like 1 + add details to stdout (by default only INFO) + add details to *.log.details 

101 >15: add debug to stdout and create *.log.debug with all levels 

102 quiet : bool 

103 True if nothing must be sent to stdout/stderr, False otherwise 

104 """ 

105 import time 

106 

107 # time when soft is launched 

108 time_start = time.strftime("_%y-%m-%d_%H-%m-%S") 

109 

110 # create logger 

111 logger = logging.getLogger(name) 

112 

113 # Determine logfile names 

114 logfile = logfile_base + ".log" 

115 if os.path.isfile(logfile): 

116 logfile = logfile_base + "-" + time_start + ".log" 

117 errfile = logfile_base + ".log.err" 

118 if os.path.isfile(errfile): 

119 errfile = logfile_base + "-" + time_start + ".log.err" 

120 detailfile = logfile_base + ".log.details" 

121 if os.path.isfile(detailfile): 

122 detailfile = logfile_base + "-" + time_start + ".log.details" 

123 debugfile = logfile_base + ".log.debug" 

124 if os.path.isfile(debugfile): 

125 debugfile = logfile_base + "-" + time_start + ".log.debug" 

126 

127 # Create a new logging level: details (between info and debug) 

128 # Used to add details to the log file, but not to stdout, while still having 

129 # the possibility to put debug messages, used only for development. 

130 def details(self, message, *args, **kws): 

131 """ 

132 Define a new log level: details 

133 """ 

134 if self.isEnabledFor(logging.DETAIL): 

135 self._log(logging.DETAIL, message, args, **kws) 

136 logging.addLevelName(detail_lvl(), "DETAIL") 

137 logging.DETAIL = detail_lvl() 

138 logging.Logger.details = details 

139 

140 # set level of logger 

141 logger.setLevel(logging.DEBUG) 

142 

143 # create formatter for log messages: 

144 # "timestamp :: level :: message" 

145 # (add :: %(name)s to add the logger name) 

146 my_format = '[%(asctime)s] :: %(levelname)s :: %(message)s' 

147 formatter_file = logging.Formatter(my_format, '%Y-%m-%d %H:%M:%S') 

148 my_format_stream = '%(log_color)s * [%(asctime)s] : %(levelname)s %(reset)s %(message)s' 

149 formatter_stream = ColoredFormatter(my_format_stream, datefmt='%Y-%m-%d %H:%M:%S', 

150 log_colors={'DEBUG' : 'cyan', 

151 'INFO' : 'green', 

152 'DETAIL' : 'cyan', 

153 'WARNING' : 'yellow', 

154 'ERROR' : 'red', 

155 'CRITICAL' : 'red', 

156 }) 

157 

158 

159 

160 # Create handler 1: writing to 'logfile'. mode 'write', max size = 1Mo. 

161 # If logfile is 1Mo, it is renamed to logfile.1, and next logs are still 

162 # written to logfile. Then, logfile.1 is renamed to logfile.2, logfile to 

163 # logfile.1 etc. We allow maximum 5 log files. 

164 # logfile contains everything from INFO level (INFO, WARNING, ERROR) 

165 logfile_handler = RotatingFileHandler(logfile, 'w', 10000000, 5) 

166 # set level to the same as the logger level 

167 logfile_handler.setLevel(logging.INFO) 

168 logfile_handler.setFormatter(formatter_file) # add formatter 

169 logger.addHandler(logfile_handler) # add handler to logger 

170 

171 # Create handler 2: errfile. Write only warnings and errors 

172 errfile_handler = RotatingFileHandler(errfile, 'w', 10000000, 5) 

173 errfile_handler.setLevel(logging.WARNING) 

174 errfile_handler.setFormatter(formatter_file) # add formatter 

175 logger.addHandler(errfile_handler) # add handler to logger 

176 

177 

178 # Create handler 3: detailsfile. Write everything to this file, except debug 

179 # Create it only if: 

180 # - level is <= info (for modules which have no details, so detailsfile is the same as 

181 # logfile) 

182 # - details==True force creation of detailsfile 

183 # - quiet==True nothing in stdout, put all log files so that user can check 

184 if level < logging.INFO or quiet or log_details: 

185 detfile_handler = RotatingFileHandler(detailfile, 'w', 10000000, 5) 

186 detfile_handler.setLevel(logging.DETAIL) 

187 detfile_handler.setFormatter(formatter_file) # add formatter 

188 logger.addHandler(detfile_handler) # add handler to logger 

189 

190 # Formats for detailed log files 

191 my_format_debug = '[%(asctime)s] :: %(levelname)s (from %(name)s logger) :: %(message)s' 

192 formatter_file_debug = logging.Formatter(my_format_debug, '%Y-%m-%d %H:%M:%S') 

193 

194 # Create handler 4: debug file. Write everything 

195 if level < logging.DETAIL: 

196 debugfile_handler = RotatingFileHandler(debugfile, 'w', 10000000, 5) 

197 debugfile_handler.setLevel(logging.DEBUG) 

198 debugfile_handler.setFormatter(formatter_file_debug) # add formatter 

199 logger.addHandler(debugfile_handler) # add handler to logger 

200 

201 # If not quiet, add handlers for stdout and stderr 

202 if not quiet: 

203 # Create handler 4: write to stdout 

204 stream_handler = logging.StreamHandler(sys.stdout) 

205 # By default, write everything 

206 stream_handler.setLevel(logging.DEBUG) 

207 # BUT: don't write messages >= WARNING (warning, error, critical) 

208 stream_handler.addFilter(LessThanFilter(logging.WARNING)) 

209 # if not verbose (level 0 or 1): only put info in stdout (remove details and debug) 

210 if verbose < 2: 

211 stream_handler.addFilter(NoLevelFilter(logging.DETAIL)) 

212 stream_handler.addFilter(NoLevelFilter(logging.DEBUG)) 

213 stream_handler.setFormatter(formatter_stream) 

214 logger.addHandler(stream_handler) # add handler to logger 

215 

216 # Create handler 5: write to stderr 

217 err_handler = logging.StreamHandler(sys.stderr) 

218 

219 if verbose > 0: 

220 err_handler.setLevel(logging.WARNING) # write all messages >= WARNING 

221 else: 

222 err_handler.setLevel(logging.ERROR) # write all messages >= ERROR 

223 err_handler.setFormatter(formatter_stream) 

224 logger.addHandler(err_handler) # add handler to logger 

225 logger.propagate = False 

226 return logfile, logger 

227 

228 

229class LessThanFilter(logging.Filter): 

230 """ 

231 When using log, when a level is set to a handler, it is a minimum level. All 

232 levels higher than it will be printed. If you want to print only until 

233 a given level (no levels higher than the specified one), use this class like this: 

234 handler.addFilter(LessThanFilter(level)) 

235 """ 

236 

237 def __init__(self, level): 

238 self._level = level 

239 logging.Filter.__init__(self) 

240 

241 def filter(self, rec): 

242 """ 

243 Function to decide if given log has to be logged or not, according to its level 

244 

245 Parameters 

246 ---------- 

247 rec : current record handled by logger 

248 

249 Returns 

250 ------- 

251 bool 

252 True if level of current log is less than the defined limit, False otherwise 

253 """ 

254 return rec.levelno < self._level 

255 

256 

257class NoLevelFilter(logging.Filter): 

258 """ 

259 When using log, specify a given level that must not be taken into account by the handler. 

260 This is used for the stdout handler. We want to print, by default, 

261 DEBUG (for development use) and INFO levels, but not DETAILS level (which is between 

262 DEBUG and INFO). We want to print DETAIL only if verbose option was set 

263 """ 

264 

265 def __init__(self, level): 

266 self._level = level 

267 logging.Filter.__init__(self) 

268 

269 def filter(self, rec): 

270 """ 

271 Function to decide if given log has to be logged or not, according to its level 

272 

273 Parameters 

274 ---------- 

275 rec : current record handled by logger 

276 

277 Returns 

278 ------- 

279 bool 

280 True if level of current log is different from forbidden level, False if it is the same 

281 """ 

282 return rec.levelno != self._level 

283 

284 

285def check_installed(cmd): 

286 """ 

287 Check if the command 'cmd' is in $PATH and can then be executed 

288 

289 Parameters 

290 ---------- 

291 cmd : str 

292 command to run 

293 

294 Returns 

295 ------- 

296 bool 

297 True if installed, False otherwise 

298 """ 

299 torun = "which " + cmd 

300 trying = subprocess.Popen(shlex.split(torun), stdout=subprocess.PIPE) 

301 out, _ = trying.communicate() 

302 if trying.returncode == 0: 

303 if os.path.isfile(out.strip()): 

304 return True 

305 return False 

306 

307 

308def run_cmd(cmd, error, eof=False, **kwargs): 

309 """ 

310 Run the given command line. If the return code is not 0, print error message. 

311 if eof (exit on fail) is True, exit program if error code is not 0. 

312 

313 Parameters 

314 ---------- 

315 cmd : str 

316 command to run 

317 error : str 

318 error message to print if error while running command 

319 eof : bool 

320 True: exit program if command failed, False: do not exit even if command fails 

321 kwargs : Object 

322 Can provide a logger, stdout and/or stderr streams 

323 

324 Returns 

325 ------- 

326 subprocess.Popen 

327 returns object of subprocess call (has attributes returncode, pid, communicate etc.) 

328 

329 """ 

330 if "logger" not in kwargs: 

331 logger = logging.getLogger("utils.run_cmd") 

332 else: 

333 logger = kwargs["logger"] 

334 if "stdout" not in kwargs: 

335 kwargs["stdout"] = None 

336 if "stderr" not in kwargs: 

337 kwargs["stderr"] = None 

338 try: 

339 call = subprocess.Popen(shlex.split(cmd), stdout=kwargs["stdout"], 

340 stderr=kwargs["stderr"]) 

341 call.wait() 

342 retcode = call.returncode 

343 except OSError: 

344 logger.error(f"error: command '>{cmd}' is not possible.") 

345 if eof: 

346 sys.exit(1) 

347 else: 

348 return 1 

349 if retcode != 0: 

350 logger.error(error) 

351 if eof: 

352 sys.exit(retcode) 

353 return call 

354 

355 

356def plot_distr(values, limit, title, text, logger): 

357 """ 

358 Plot histogram of given 'values', and add a vertical line corresponding to the chosen 

359 'limit' and return the mpl figure 

360 

361 Parameters 

362 ---------- 

363 values : list 

364 list of values 

365 limit : int 

366 limit for which a vertical line must be drawn 

367 title : str 

368 Title to give to plot 

369 text : str 

370 text to write near the vertical line representing the limit 

371 logger : logging.Logger 

372 logger object to write log information 

373 

374 Returns 

375 ------- 

376 matplotlib.figure.Figure 

377 figure generated 

378 """ 

379 import math 

380 import numpy as np 

381 import matplotlib 

382 matplotlib.use('AGG') 

383 from matplotlib import pyplot as plt 

384 plt.close("all") 

385 fig = plt.figure(figsize=(10, 7)) 

386 ax = fig.add_subplot(1, 1, 1) 

387 max_x = max(values) 

388 # if too many values, group them to have less bins in the histogram. 

389 # Put 'group_values' values in each bin -> 

390 # if less than 300 values, 1 value per bin, otherwise more values per bin 

391 group_values = int(max_x / 300) + 1 

392 dec_ax = math.exp(0.001 * max_x) - 1 

393 dec_text = 3 * dec_ax 

394 bins = np.arange(0, max_x + 2 * group_values, group_values) - 0.5 

395 ax.hist(values, bins=bins, edgecolor="black", color="blue") 

396 ax.set_xlim(0.5, max_x + 0.5 * group_values) 

397 ax.axvline(x=limit + 0.5 * group_values + dec_ax, color="r") 

398 ax.text(x=limit + 0.5 * group_values + dec_text, y=plt.ylim()[1] / 2, 

399 s=text + " " + str(limit), color="r", rotation=90) 

400 ax.set_title(title) 

401 return fig 

402 

403 

404def write_warning_skipped(skipped, do_format=False, prodigal_only=False, logfile=""): 

405 """ 

406 At the end of the script, write a warning to the user with the names of the genomes 

407 which had problems with prokka. 

408 

409 Parameters 

410 ---------- 

411 skipped : list 

412 list of genomes with problems 

413 do_format : bool 

414 if False, genomes were not skipped because of format step, but before that.\ 

415 if True, they were skipped because of format 

416 prodigal_only : bool 

417 if False: used prokka to annotate 

418 if True: used prodigal to annotate 

419 """ 

420 if not prodigal_only: 

421 soft = "prokka" 

422 else: 

423 soft = "prodigal" 

424 logger = logging.getLogger("utils") 

425 list_to_write = "\n".join(["\t- " + genome for genome in skipped]) 

426 if not do_format: 

427 logger.info(f"WARNING: Some genomes could not be annotated. See {soft}") 

428 logger.warning(f"{soft} had problems while annotating some genomes, or " 

429 "did not find any gene. Hence, they are not formatted, and absent " 

430 "from your output database. Please look at the " 

431 "current error log " 

432 "(<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err) to get more " 

433 "information on the problems. Here are those " 

434 f"genomes:\n{list_to_write}") 

435 else: 

436 logger.info(f"WARNING: Some genomes could not be formatted. See {logfile}") 

437 logger.warning((f"Some genomes were annotated by {soft}, but could not be formatted, " 

438 "and are hence absent from your output database. Please look at " 

439 "'<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err' and " 

440 ".details files to get more information about why they could not be " 

441 f"formatted.\n{list_to_write}")) 

442 

443 

444def write_genomes_info(genomes, kept_genomes, list_file, res_path, qc=False): 

445 """ 

446 Write the list of genomes discarded to a file (qc=False), so that users can 

447 keep a trace of them, with their information (nb contigs, L90 etc.) 

448 

449 If qc=True, we stop after QC. 

450 -> Write the list of genomes that would be kept for annotation with all 

451 their information (L90, size, #contig) 

452 

453 

454 Parameters 

455 ---------- 

456 genomes : dict 

457 {genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file, 

458 genome_size, nb_contigs, L90]} 

459 kept_genomes : list 

460 list of genomes kept 

461 list_file : str 

462 path to input file containing the list of genomes 

463 res_path : str 

464 folder where results must be saved 

465 qc : bool 

466 * True: called only if QC only. Name this file info-genomes-<list_file>.txt to put 

467 information on genomes that would be annotated if not QC only 

468 * otherwise (False), called in any case. Name this file discarded-<list_file>.txt 

469 and write all discarded genomes, whether sequences kept are next annotated or not 

470 => columns: orig_name, to_annotate, gsize, nb_conts, L90 

471 """ 

472 logger = logging.getLogger("utils") 

473 # number of genomes discarded 

474 nb_disc = len(genomes) - len(kept_genomes) 

475 # Log number of genomes discarded. 

476 if not qc and nb_disc < 2: 

477 logger.info(f"{nb_disc} genome was discarded.") 

478 elif not qc: 

479 logger.info(f"{nb_disc} genomes were discarded.") 

480 # Get input list file name (without path) 

481 _, name_lst = os.path.split(list_file) 

482 # if not QC, write discarded genomes to a file "discarded-[list_file].lst" 

483 if not qc: 

484 outdisc = os.path.join(res_path, 

485 "discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst") 

486 logger.info("Writing discarded genomes to {}".format(outdisc)) 

487 # if QC, there is no 'discarded genome', just write information on all analyzed genomes 

488 else: 

489 outdisc = os.path.join(res_path, 

490 "ALL-GENOMES-info-" + ".".join(name_lst.split(".")[:-1]) + ".lst") 

491 logger.info("Writing information on genomes in {}".format(outdisc)) 

492 with open(outdisc, "w") as outdf: 

493 outdf.write("\t".join(["orig_name", "to_annotate", "gsize", "nb_conts", "L90"]) + "\n") 

494 for genome, values in genomes.items(): 

495 if genome in kept_genomes: 

496 continue 

497 _, _, to_annotate, gsize, nbcont, l90 = [str(x) for x in values] 

498 to_annotate_file = os.path.basename(to_annotate) 

499 outdf.write("\t".join([genome, to_annotate_file, gsize, nbcont, l90]) + "\n") 

500 

501 

502def write_lstinfo(list_file, genomes, outdir): 

503 """ 

504 Write lstinfo file, with following columns: 

505 gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90 

506 

507 Parameters 

508 ---------- 

509 list_file : str 

510 input file containing the list of genomes 

511 genomes : dict 

512 {genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]} 

513 outdir : str 

514 folder where results must be saved 

515 

516 """ 

517 _, name_lst = os.path.split(list_file) 

518 outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst") 

519 with open(outlst, "w") as outf: 

520 outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize", 

521 "nb_conts", "L90"]) + "\n") 

522 for genome, values in sorted(genomes.items(), key=sort_genomes_byname_l90_nbcont): 

523 gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values] 

524 outf.write("\t".join([gembase, genome, to_annote, gsize, nbcont, l90]) + "\n") 

525 return outlst 

526 

527 

528def sort_genomes_by_name(x): 

529 """ 

530 order by: 

531 

532 - species 

533 - in each species, by strain number 

534 

535 Parameters 

536 ---------- 

537 x : tuple or str 

538 [genome_orig, [gembase, path, gsize, nbcont, L90]] with gembase = species.date.strain 

539 

540 Returns 

541 ------- 

542 str 

543 variable to take into account for sorting. If format is ESCO.1512.00001 return\ 

544 ESCO and 00001. Otherwise, just return x itself (sort by alphabetical order) 

545 """ 

546 # get gembase name 

547 if isinstance(x, tuple): 

548 x = x[1][0] 

549 

550 # if format is ESCO.1512.00001 sort by ESCO, then 00001 

551 if "." in x and len(x.split(".")) == 3: 

552 return x.split(".")[0], int(x.split(".")[-1]) 

553 # if format is not like this, just return alphabetical order 

554 return x, 

555 

556 

557def sort_genomes_byname_l90_nbcont(x): 

558 """ 

559 Sort all genomes with the following criteria: 

560 

561 - sort by species (x[1][0] is species.date) 

562 - for each species, sort by l90 

563 - for same l90, sort by nb contigs 

564 

565 Parameters 

566 ---------- 

567 x : [[]] 

568 [genome_name, [species.date, path, path_to_seq, gsize, nbcont, L90]] 

569 

570 Returns 

571 ------- 

572 tuple 

573 information on species, l90 and nb_contigs 

574 """ 

575 return x[1][0].split(".")[0], x[1][-1], x[1][-2] 

576 

577 

578def sort_genomes_l90_nbcont(x): 

579 """ 

580 Sort all genomes with the following criteria: 

581 

582 - for each strain, sort by l90 

583 - for same l90, sort by nb contigs 

584 

585 Parameters 

586 ---------- 

587 x : [[]] 

588 [genome_name, [species.date, path, gsize, nbcont, L90]] 

589 

590 Returns 

591 ------- 

592 tuple 

593 information on l90 and nb_contigs 

594 """ 

595 return x[1][-1], x[1][-2] 

596 

597 

598def sort_proteins(x): 

599 """ 

600 order by: 

601 

602 - species 

603 - in each species, strain number 

604 - in each species and strain number, by protein number 

605 

606 Parameters 

607 ---------- 

608 x : str 

609 species.date.strain.contig_protnum 

610 

611 Returns 

612 ------- 

613 str 

614 variable to take into account for sorting. If format is ESCO.1512.00001.i0002_12124,\ 

615 return ESCO, 00001 and 12124. If not, it must be something_00001:\ 

616 return something and 00001. 

617 """ 

618 try: 

619 # if format is ESCO.1512.00001.i0002_12124, sort by ESCO, then 00001, then 12124 

620 if "." in x and len(x.split(".")) >= 3: 

621 return x.split(".")[0], int(x.split(".")[2].split("_")[0]), int(x.split("_")[-1]) 

622 # if format is not like this, it must be something_00001: 

623 # sort by 'something' and then 00001 

624 return "_".join(x.split("_")[:-1]), int(x.split("_")[-1]) 

625 except (IndexError, ValueError): 

626 logger = logging.getLogger("utils") 

627 logger.error(("ERROR: Protein {} does not have the required format. " 

628 "It must contain, at least <alpha-num_only>_<num_only>, and at best " 

629 "<name>.<date>.<strain_num>.<contig_info>_<prot_num>. " 

630 "Please change its name.").format(x)) 

631 sys.exit(1) 

632 

633 

634def read_genomes(list_file, name, date, dbpath, tmp_path, logger): 

635 """ 

636 Read list of genomes, and return them. 

637 If a genome has a name, also return it. Otherwise, return the name given by user. 

638 

639 Check that the given genome file exists in dbpath. Otherwise, put an error message, 

640 and ignore this file. 

641 

642 Parameters 

643 ---------- 

644 list_file : str 

645 input file containing the list of genomes 

646 name : str 

647 Default species name 

648 date : str 

649 Default date 

650 dbpath : str 

651 path to folder containing original genome files 

652 tmp_path : str 

653 path to folder which will contain the genome files to use before annotation, if\ 

654 needed to change them from original file (for example, merging several contig files\ 

655 in one file, split at each stretch of 5 'N', etc.). 

656 

657 Returns 

658 ------- 

659 dict 

660 {genome: spegenus.date} spegenus.date = name.date 

661 """ 

662 logger.info("Reading genomes") 

663 genomes = {} 

664 # Check that given list file exists 

665 if not os.path.isfile(list_file): 

666 logger.error(("ERROR: Your list file '{}' does not exist. " 

667 "Please provide a list file.\n Ending program.").format(list_file)) 

668 sys.exit(1) 

669 # List file exists: open it and read it 

670 with open(list_file, "r") as lff: 

671 for line in lff: 

672 line = line.strip() 

673 # empty line: go to the next one 

674 if line == "": 

675 continue 

676 # If separator ::, look for species and/or date 

677 if "::" in line: 

678 # genomes_inf = genome filename(s) separated by space 

679 # name_inf = <name>.<date> 

680 genomes_inf, name_inf = line.split("::") 

681 genomes_inf = genomes_inf.strip() 

682 cur_name, cur_date = read_info(name_inf, name, date, genomes_inf) 

683 # If no separator '::', no information on name and date of genome: use default ones 

684 # Line only contains genome name (filename in given db_path) 

685 else: 

686 genomes_inf = line.strip() 

687 cur_name = name 

688 cur_date = date 

689 # If several file names, check that each one exists, and concatenate the existing files 

690 genomes_inf = genomes_inf.split() 

691 if len(genomes_inf) > 1: 

692 to_concat = [] 

693 for file in genomes_inf: 

694 if os.path.isfile(os.path.join(dbpath, file)): 

695 to_concat.append(file) 

696 else: 

697 logger.warning(("{} genome file does not exist. Its file will be " 

698 "ignored when concatenating {}").format(file, genomes_inf)) 

699 # If there are files to concatenate, concatenate them 

700 if to_concat: 

701 genome_name = to_concat[0] + "-all.fna" 

702 concat_file = os.path.join(tmp_path, genome_name) 

703 to_concat = [os.path.join(dbpath, gname) for gname in to_concat] 

704 # Put all genomes listed in 'to_concat' into a same file named 'concat_file' 

705 cat(to_concat, concat_file) 

706 else: 

707 # No genome file listed exists. No sequence = genome ignored 

708 logger.warning(("None of the genome files in {} exist. " 

709 "This genome will be ignored.").format(genomes_inf)) 

710 genome_name = "" 

711 # If only 1 sequence file, check that it exists, and take its name 

712 else: 

713 # Genome file given does not exist 

714 if not os.path.isfile(os.path.join(dbpath, genomes_inf[0])): 

715 logger.warning(("{} genome file does not exist. " 

716 "It will be ignored.").format(genomes_inf[0])) 

717 genome_name = "" 

718 # Genome file exists: get genome_name (from information in first field) 

719 else: 

720 genome_name = genomes_inf[0] 

721 # If there is a genome file (concatenated from several, or already existing), get the full name (with date) 

722 if genome_name != "": 

723 genomes[genome_name] = [cur_name + "." + cur_date] 

724 return genomes 

725 

726 

727def read_genomes_info(list_file, name, date=None, logger=None): 

728 """ 

729 Read a lstinfo file containing the list of genomes with information (L90, genome size etc.). 

730 1 line per genome, 4 required columns (Others will just be ignored): 

731 to_annotate gsize nb_conts L90 

732 

733 Check that the given genome file (to_annotate column) exists. 

734 

735 Parameters 

736 ---------- 

737 list_file : str 

738 input file containing information on genomes (to_annotate, size, L90, nb_contigs) 

739 name : str 

740 Default species name 

741 date : str 

742 Default date 

743 logger : logging.Logger 

744 logger object to write log information 

745 

746 

747 Returns 

748 ------- 

749 dict 

750 genomes = {genome: 

751 [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90] 

752 } 

753 """ 

754 if not logger: 

755 logger = logging.getLogger("prepare.utils") 

756 logger.info(f"Reading given information on your genomes in {list_file}") 

757 genomes = {} 

758 if name and date: 

759 spegenus = f"{name}.{date}" 

760 column_order = {} # Put the number of column corresponding to each field 

761 if not os.path.isfile(list_file): 

762 logger.error(f"ERROR: The info file {list_file} that you gave does not exist. " 

763 "Please provide the right path/name for this file.\nEnding program.") 

764 sys.exit(1) 

765 message_no_header = (f"ERROR: It seems that your info file {list_file} does not have a " 

766 "header, or this header does not have, at least, the required " 

767 "columns tab separated: to_annotate, gsize nb_conts and L90 (in any " 

768 "order).\nEnding program.") 

769 with open(list_file, "r") as lff: 

770 for line in lff: 

771 line = line.strip() 

772 # Ignore empty lines 

773 if line == "": 

774 continue 

775 # Header line: Just get column number corresponding to each field 

776 if "to_annotate" in line: 

777 column_headers = line.split("\t") 

778 column_order = {header:num for num,header in enumerate(column_headers)} 

779 found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"] 

780 if head in column_order] 

781 if len(found) != 4: 

782 logger.error(message_no_header) 

783 sys.exit(1) 

784 # If no header found, error message and exit 

785 if not column_order: 

786 logger.error(message_no_header) 

787 sys.exit(1) 

788 # Get all information on the given genome 

789 # line.strip().split() -> all given information. 

790 # So, at least to_annotate, gsize, nbcont, L90 

791 # Get numeric information 

792 try: 

793 infos = line.strip().split() 

794 # Get genome name with its path to db_dir 

795 gpath = infos[column_order["to_annotate"]] 

796 gfile = os.path.basename(gpath) 

797 gname = os.path.splitext(gfile)[0] 

798 gsize = int(infos[column_order["gsize"]]) 

799 gl90 = int(infos[column_order["L90"]]) 

800 gcont = int(infos[column_order["nb_conts"]]) 

801 # If invalid values, warning message and ignore genome 

802 except ValueError: 

803 logger.warning(f"For genome {gname}, at least one of your columns 'gsize', " 

804 "'nb_conts' or 'L90' contains a non numeric value. " 

805 "This genome will be ignored.") 

806 continue 

807 # If no value for at least 1 field, warning message and ignore genome 

808 except IndexError: 

809 logger.error(f"ERROR: Check that all fields of {list_file} are filled in each " 

810 "line (can be 'NA')") 

811 sys.exit(1) 

812 # Could we find genome file? 

813 # Check if genome file exists in db_path. 

814 if not os.path.isfile(gpath): 

815 logger.warning(f"{gpath} genome file does not exist. This genome will be ignored.") 

816 continue 

817 # cur genome information to save: 

818 # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90] 

819 if name and date: 

820 genomes[gpath] = [spegenus, gpath, gpath, gsize, gcont, gl90] 

821 # If called from prepare, no need to rename genomes 

822 else: 

823 gfile = os.path.basename(gpath) 

824 gname = os.path.splitext(gfile)[0] 

825 genomes[gfile] = [gname, gpath, gpath, gsize, gcont, gl90] 

826 if len(genomes) > 0: 

827 logger.info(("Found {} genomes in total").format(len(genomes))) 

828 else: 

829 logger.error(f"No genome listed in {list_file} was found.") 

830 sys.exit(1) 

831 return genomes 

832 

833 

834def read_info(name_inf, name, date, genomes_inf): 

835 """ 

836 From the given information in 'name_inf', check if there is a name (and if its 

837 format is ok) and if there is a date (and if its format is ok). 

838 If no name (resp. no date), return default name (resp. default date). 

839 

840 Parameters 

841 ---------- 

842 name_inf : str 

843 information on current genome, which could contain a species name and a date 

844 name : str 

845 default species name 

846 date : str 

847 default date 

848 genomes_inf : str 

849 current genome filename. Used to complete information when there is a warning (species\ 

850 name or date given not in the right format...) 

851 

852 Returns 

853 ------- 

854 (cur_name, cur_date) : tuple 

855 with: 

856 

857 - curname: name to use for this genome (can be the default one, or the one read from\ 

858 'name_inf' 

859 - curdate: date to use for this genome (default or read from 'name_inf') 

860 """ 

861 logger = logging.getLogger("utils") 

862 # information on current genome, which could contain a species name and a date: split to get 

863 # name and date 

864 name_inf = name_inf.strip().split(".") 

865 # if only 1 field provided (means only name is provided). 

866 # (Otherwise, if only date is provided, it is with '.DATE', so 

867 # len(name_inf) = 2 with name_inf[0] = "") 

868 if len(name_inf) == 1: 

869 # No genome name: use default one 

870 if name_inf[0] == "": 

871 cur_name = name 

872 # Genome name, and in right format: use it 

873 elif check_format(name_inf[0]): 

874 cur_name = name_inf[0] 

875 # Genome name but wrong format: warning message and use default one 

876 else: 

877 logger.warning(("Invalid name {} given for genome {}. Only put " 

878 "4 alphanumeric characters in your date and name. " 

879 "For this genome, the default name ({}) will be " 

880 "used.").format(name_inf[0], genomes_inf, name)) 

881 cur_name = name 

882 # In any case, no given date: use default date 

883 cur_date = date 

884 

885 # More than 2 informations (more than 2 fields separated by a '.'): 

886 # - either user put more information 

887 # - either user put a '.' inside the name or date field: wrong format 

888 # -> warning message and use default date and name. 

889 elif len(name_inf) > 2: 

890 logger.warning(("Invalid name/date given for genome {}. Only put " 

891 "4 alphanumeric characters in your date and name. For " 

892 "this genome, the default name ({}) and date ({}) will " 

893 "be used.").format(genomes_inf, name, date)) 

894 cur_name = name 

895 cur_date = date 

896 # information on name and date given (can be empty) 

897 else: 

898 cur_name, cur_date = name_inf 

899 # name given empty -> use default name 

900 if cur_name == "": 

901 cur_name = name 

902 # date given empty -> use default date 

903 if cur_date == "": 

904 cur_date = date 

905 # name given not empty but wrong format -> warning message and default name 

906 if not check_format(cur_name): 

907 logger.warning(("Invalid name {} given for genome {}. Only put " 

908 "4 alphanumeric characters in your date and name. " 

909 "For this genome, the default name ({}) " 

910 "will be used.").format(cur_name, genomes_inf, name)) 

911 cur_name = name 

912 # date given not empty but wrong format -> warning message and default date 

913 if not check_format(cur_date): 

914 logger.warning(("Invalid date {} given for genome {}. Only put " 

915 "4 alphanumeric characters in your date and name. " 

916 "For this genome, the default date ({}) " 

917 "will be used.").format(cur_date, genomes_inf, date)) 

918 cur_date = date 

919 return cur_name, cur_date 

920 

921 

922def cat(list_files, output, title=None): 

923 """ 

924 Equivalent of 'cat' unix command. 

925 

926 Concatenate all files in 'list_files' and save result in 'output' folder. 

927 Concat using shutil.copyfileobj, in order to copy by chunks, to 

928 avoid memory problems if files are big. 

929 

930 Parameters 

931 ---------- 

932 list_files : list 

933 list of filenames to concatenate 

934 output : str 

935 output filename, where all concatenated files will be written 

936 title : str or None 

937 if you want to show a progressbar while concatenating files, add a title for this 

938 progressbar here. If no title, nothing will be shown during concatenation. 

939 

940 """ 

941 bar = None 

942 curnum = None 

943 if title: 

944 nbfiles = len(list_files) 

945 widgets = [title + ': ', progressbar.Bar(marker='█', left='', right='', fill=' '), 

946 ' ', progressbar.Counter(), f"/{nbfiles}" ' (', 

947 progressbar.Percentage(), ") - ", progressbar.Timer()] 

948 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfiles, term_width=79).start() 

949 curnum = 1 

950 with open(output, "w") as outf: 

951 for file in list_files: 

952 if title: 

953 bar.update(curnum) 

954 curnum += 1 

955 with open(file, "r") as inf: 

956 shutil.copyfileobj(inf, outf) 

957 if title: 

958 bar.finish() 

959 

960 

961def grep(filein, pattern, counts=False): 

962 """ 

963 Equivalent of 'grep' unix command 

964 

965 By default, returns all the lines containing the given pattern. 

966 If counts = True, returns the number of lines containing the pattern. 

967 

968 Parameters 

969 ---------- 

970 filein : str 

971 path to the file in which pattern must be searched 

972 pattern : str 

973 pattern to search 

974 counts : bool 

975 True if you want to count how many lines have the pattern and return this number,\ 

976 False if you want to return all lines containing the pattern. 

977 

978 Returns 

979 ------- 

980 list or int 

981 list of lines if counts=False; number of lines if counts=True 

982 """ 

983 num = 0 

984 lines = [] 

985 with open(filein, "r") as inf: 

986 for line in inf: 

987 if re.search(pattern, line): 

988 lines.append(line.strip()) 

989 num += 1 

990 if counts: 

991 return num 

992 else: 

993 return lines 

994 

995 

996def count(filein, get="lines"): 

997 """ 

998 Similar to 'wc' unix command. 

999 

1000 Count the number of what is given in 'get'. It can be: 

1001 

1002 - lines (default) 

1003 - words 

1004 

1005 Parameters 

1006 ---------- 

1007 filein : str 

1008 path to the file for which we want to count lines or words 

1009 get : ["lines", "words"] 

1010 either lines to count the number of lines in the file, or words to count the number\ 

1011 of words. 

1012 

1013 Returns 

1014 ------- 

1015 int 

1016 Number of lines or words according to value of 'get' parameter. 

1017 """ 

1018 gets = ["lines", "words"] 

1019 if get not in gets: 

1020 logger = logging.getLogger("utils") 

1021 logger.error("Choose what you want to count among {}.".format(gets)) 

1022 sys.exit(1) 

1023 num = 0 

1024 with open(filein, "r") as inf: 

1025 for line in inf: 

1026 if get == "lines": 

1027 num += 1 

1028 elif get == "words": 

1029 num += len(line.split()) 

1030 return num 

1031 

1032 

1033def check_format(info): 

1034 """ 

1035 Check that the given information (can be the genomes name or the date) is in the right 

1036 format: it should have 4 characters, all alphanumeric. 

1037 

1038 Parameters 

1039 ---------- 

1040 info : str 

1041 information to check 

1042 

1043 Returns 

1044 ------- 

1045 bool 

1046 True if right format, False otherwise 

1047 """ 

1048 if len(info) != 4: 

1049 return False 

1050 return info.isalnum() 

1051 

1052 

1053def check_out_dirs(resdir): 

1054 """ 

1055 Check that there is no file in: 

1056 

1057 - resdir/LSTINFO 

1058 - resdir/Genes 

1059 - resdir/Proteins 

1060 - resdir/Replicons 

1061 - resdir/gff3 

1062 

1063 Parameters 

1064 ---------- 

1065 resdir : str 

1066 path to result directory 

1067 

1068 """ 

1069 logger = logging.getLogger("utils") 

1070 if glob.glob(os.path.join(resdir, "LSTINFO", "*.lst")): 

1071 logger.error("ERROR: Your output directory already has .lst files in the " 

1072 "LSTINFO folder. Provide another result directory, or remove the " 

1073 "files in this one.\nEnding program.") 

1074 sys.exit(1) 

1075 if glob.glob(os.path.join(resdir, "Proteins", "*.prt")): 

1076 logger.error("ERROR: Your output directory already has .prt files in the " 

1077 "Proteins folder. Provide another result directory, or remove the " 

1078 "files in this one.\nEnding program.") 

1079 sys.exit(1) 

1080 if glob.glob(os.path.join(resdir, "Genes", "*.gen")): 

1081 logger.error("ERROR: Your output directory already has .gen files in the " 

1082 "Genes folder. Provide another result directory, or remove the " 

1083 "files in this one.\nEnding program.") 

1084 sys.exit(1) 

1085 if glob.glob(os.path.join(resdir, "Replicons", "*.fna")): 

1086 logger.error("ERROR: Your output directory already has .fna files in the " 

1087 "Replicons folder. Provide another result directory, or remove the " 

1088 "files in this one.\nEnding program.") 

1089 sys.exit(1) 

1090 if glob.glob(os.path.join(resdir, "gff3", "*.gff")): 

1091 logger.error("ERROR: Your output directory already has .gff files in the " 

1092 "gff3 folder. Provide another result directory, or remove the " 

1093 "files in this one.\nEnding program.") 

1094 sys.exit(1) 

1095 

1096 

1097def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger): 

1098 """ 

1099 For the given genome (sequence in gpath), rename all its contigs 

1100 with the new name: 'gembase_name', and save the output sequence in outfile. 

1101 

1102 For each contig renamed, save its new name as well as its size. This will be used to generate 

1103 gff files 

1104 

1105 Parameters 

1106 ---------- 

1107 gembase_name : str 

1108 genome name to use (species.date.strain) 

1109 gpath : str 

1110 path to the genome sequence 

1111 outfile : str 

1112 path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers 

1113 

1114 Returns 

1115 ------- 

1116 tuple 

1117 - Dict of all contigs with their original and new name: (list of str) 

1118 {>orig_name: >new_name} 

1119 - Dict of all contigs with their size: (list of str) 

1120 {"new_name': 'size1"} 

1121 """ 

1122 # Initialize variables 

1123 

1124 # Contig number 

1125 contig_num = 1 

1126 # contig size 

1127 cont_size = 0 

1128 # List of contigs (str) [<name>\t<orig_name>] 

1129 contigs = {} 

1130 # List of contigs (str) with their sizes [<name>\t<size>] 

1131 sizes = {} 

1132 # Name of previous contig (to put to contigs, as we need to wait for the next 

1133 # contig to know the size of the previous one) 

1134 prev_cont = "" 

1135 prev_orig_name = "" 

1136 # sequence of previous contig 

1137 seq = "" 

1138 

1139 # Read input sequence given to prodigal, and open file where sequences with new 

1140 # headers must be written. 

1141 with open(gpath, "r") as gpf, open(outfile, "w") as grf: 

1142 for line in gpf: 

1143 # When we find a new header line, convert its name to gembase format, and write it 

1144 # to output replicon file 

1145 if line.startswith(">") : 

1146 # If not first contig (contigs not empty): 

1147 # - add its name as well as its size to contigs list 

1148 # - add its name with its original name to 

1149 # - write header ("<contig name> <size>") to replicon file 

1150 if prev_cont: 

1151 cont = " ".join([prev_cont, str(cont_size)]) + "\n" 

1152 prevcont_nohead = prev_cont.split(">")[1] 

1153 prev_orig_name_nohead = prev_orig_name.split(">")[1] 

1154 if prev_orig_name_nohead: 

1155 if prev_orig_name_nohead in contigs: 

1156 logger.error(f"several contigs have the same name " 

1157 f"{prev_orig_name_nohead} in {gpath}.") 

1158 return False, False 

1159 sizes[prevcont_nohead] = cont_size 

1160 contigs[prev_orig_name_nohead] = prevcont_nohead 

1161 grf.write(cont) 

1162 grf.write(seq) 

1163 prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4) 

1164 # keep only first string of contig 

1165 prev_orig_name = line.strip().split()[0] 

1166 contig_num += 1 

1167 cont_size = 0 

1168 seq = "" 

1169 # Sequence line: write it as is in replicon file, and add its size to cont_size. 

1170 else: 

1171 seq += line 

1172 cont_size += len(line.strip()) 

1173 # Write last contig, if there is one (if gpath not empty) 

1174 if prev_cont: 

1175 cont = " ".join([prev_cont, str(cont_size)]) + "\n" 

1176 prevcont_nohead = "".join(prev_cont.split(">")[1:]) 

1177 prev_orig_name_nohead = prev_orig_name.split(">")[1] 

1178 if prev_orig_name_nohead: 

1179 if prev_orig_name_nohead in contigs: 

1180 logger.error(f"several contigs have the same name {prev_orig_name_nohead} " 

1181 f"in {gpath}.") 

1182 return False, False 

1183 contigs[prev_orig_name_nohead] = prevcont_nohead 

1184 sizes[prevcont_nohead] = cont_size 

1185 grf.write(cont) 

1186 grf.write(seq) 

1187 if not contigs: 

1188 logger.error(f"Your genome {gpath} does not contain any sequence, " 

1189 "or is not in fasta format.") 

1190 return contigs, sizes 

1191# Add test with empty gpath 

1192# Add test with non fasta gpath 

1193 

1194 

1195def logger_thread(q): 

1196 """ 

1197 Queue listener used in a thread to handle the logs put to a QueueHandler 

1198 by several processes (multiprocessing.pool.map_async for example) 

1199 

1200 Parameters 

1201 ---------- 

1202 q : multiprocessing.managers.AutoProxy[Queue] 

1203 queue to listen 

1204 

1205 """ 

1206 while True: 

1207 record = q.get() 

1208 if record is None: 

1209 break 

1210 logger = logging.getLogger(record.name) 

1211 logger.handle(record) 

1212 

1213 

1214def detail_lvl(): 

1215 """ 

1216 Get the int level corresponding to "DETAIL" 

1217 

1218 Returns 

1219 ------- 

1220 int 

1221 int corresponding to the level "DETAIL" 

1222 """ 

1223 return 15 

1224 

1225 

1226def save_bin(objects, fileout): 

1227 """ 

1228 Save python 'objects' in a binary file called 'fileout' 

1229 

1230 Parameters 

1231 ---------- 

1232 objects : Object 

1233 python object to save 

1234 fileout : str 

1235 path to binary file where objects must be saved 

1236 

1237 """ 

1238 with open(fileout, "wb") as binf: 

1239 pickle.dump(objects, binf) 

1240 

1241 

1242def load_bin(binfile): 

1243 """ 

1244 Unpickle python objects from the binary file 'binfile' 

1245 

1246 Parameters 

1247 ---------- 

1248 binfile : str 

1249 path to binary file containing python object 

1250 

1251 Returns 

1252 ------- 

1253 Object 

1254 The python objects unpickled 

1255 

1256 """ 

1257 with open(binfile, "rb") as binf: 

1258 objects = pickle.load(binf) 

1259 return objects 

1260 

1261 

1262def write_list(list_names, fileout): 

1263 """ 

1264 Write the given list of strings to a file, 1 per line 

1265 """ 

1266 with open(fileout, "w") as fo: 

1267 for genome in list_names: 

1268 fo.write(str(genome) + "\n") 

1269 

1270 

1271def list_to_str(list, sep='\t'): 

1272 """ 

1273 Return a string corresponding to the given list, with all elements separated 

1274 by a space. Used to write a list into a file. Ex:: 

1275 

1276 [1, 2, "toto"] -> "1 2 toto" 

1277 

1278 Parameters 

1279 ---------- 

1280 list : list 

1281 list of elements that we would like to write 

1282 sep : str 

1283 Separator to use between the different elements 

1284 

1285 Returns 

1286 ------- 

1287 str 

1288 the string to write 

1289 """ 

1290 list_write = [str(l) for l in list] 

1291 return sep.join(list_write) + "\n" 

1292 

1293 

1294def remove(infile): 

1295 """ 

1296 Remove the given file if it exists 

1297 

1298 Parameters 

1299 ---------- 

1300 infile : str 

1301 path to file to remove 

1302 

1303 """ 

1304 if os.path.isfile(infile): 

1305 os.remove(infile) 

1306 

1307 

1308def thread_progressbar(widgets, stop): 

1309 """ 

1310 Thread running an "inifite" progress bar, while the main thread is working. 

1311 Once this progressbar has to stop, we send a signal. 

1312 

1313 Parameters 

1314 ---------- 

1315 widgets : list 

1316 list of widgets to put in the progressbar 

1317 stop : function 

1318 function returning False when thread can run, True when it has to stop. 

1319 """ 

1320 if widgets: 

1321 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50) 

1322 while True: 

1323 bar.update() 

1324 if stop(): 

1325 print() 

1326 break