Coverage for PanACoTA/utils.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Util functions and classes.

40@author gem

41April 2017

42"""

44import os

45import sys

46import re

47import glob

48import subprocess

49import shutil

50import shlex

51import progressbar

53# Logging

54import logging

55from logging.handlers import RotatingFileHandler

56from colorlog import ColoredFormatter

58try:

59 import cPickle as pickle

60except:

61 try:

62 import _pickle as pickle

63 except: # pragma: no cover

64 import pickle

67def init_logger(logfile_base, level, name, log_details=False, verbose=0, quiet=False):

68 """

69 Create logger and its handlers, and set them to the given level

71 level hierarchy: ``CRITICAL > ERROR > WARNING > INFO > DETAILS > DEBUG``

73 Messages from all levels are written in 'logfile'.log

75 Messages for levels less than WARNING (only INFO and DEBUG) written to stdout

77 Messages for levels equal or higher than WARNING written to stderr

79 Messages for levels equal or higher than WARNING written in `logfile`.log.err

82 Parameters

83 ----------

84 logfile_base : str

85 base of filename to use for logs. Will add '.log', '.log.details' and '.log.err' for\

86 the 3 log files created

87 level : int

88 minimum level that must be considered.

89 name : str or None

90 if we need to name the logger (used for tests)

91 log_details : bool

92 if True, force creation of .log.details file. Otherwise, just create

93 it if needed according to level

95 verbose : int

96 be more verbose:

97 default (0): info in stdout, error and more in stderr ;

98 info and more in *.log ; warning and more in *.log.err

99 1 = add warnings in stderr

100 2 = like 1 + add details to stdout (by default only INFO) + add details to *.log.details

101 >15: add debug to stdout and create *.log.debug with all levels

102 quiet : bool

103 True if nothing must be sent to stdout/stderr, False otherwise

104 """

105 import time

106

107 # time when soft is launched

108 time_start = time.strftime("_%y-%m-%d_%H-%m-%S")

109

110 # create logger

111 logger = logging.getLogger(name)

112

113 # Determine logfile names

114 logfile = logfile_base + ".log"

115 if os.path.isfile(logfile):

116 logfile = logfile_base + "-" + time_start + ".log"

117 errfile = logfile_base + ".log.err"

118 if os.path.isfile(errfile):

119 errfile = logfile_base + "-" + time_start + ".log.err"

120 detailfile = logfile_base + ".log.details"

121 if os.path.isfile(detailfile):

122 detailfile = logfile_base + "-" + time_start + ".log.details"

123 debugfile = logfile_base + ".log.debug"

124 if os.path.isfile(debugfile):

125 debugfile = logfile_base + "-" + time_start + ".log.debug"

126

127 # Create a new logging level: details (between info and debug)

128 # Used to add details to the log file, but not to stdout, while still having

129 # the possibility to put debug messages, used only for development.

130 def details(self, message, *args, **kws):

131 """

132 Define a new log level: details

133 """

134 if self.isEnabledFor(logging.DETAIL):

135 self._log(logging.DETAIL, message, args, **kws)

136 logging.addLevelName(detail_lvl(), "DETAIL")

137 logging.DETAIL = detail_lvl()

138 logging.Logger.details = details

139

140 # set level of logger

141 logger.setLevel(logging.DEBUG)

142

143 # create formatter for log messages:

144 # "timestamp :: level :: message"

145 # (add :: %(name)s to add the logger name)

146 my_format = '[%(asctime)s] :: %(levelname)s :: %(message)s'

147 formatter_file = logging.Formatter(my_format, '%Y-%m-%d %H:%M:%S')

148 my_format_stream = '%(log_color)s * [%(asctime)s] : %(levelname)s %(reset)s %(message)s'

149 formatter_stream = ColoredFormatter(my_format_stream, datefmt='%Y-%m-%d %H:%M:%S',

150 log_colors={'DEBUG' : 'cyan',

151 'INFO' : 'green',

152 'DETAIL' : 'cyan',

153 'WARNING' : 'yellow',

154 'ERROR' : 'red',

155 'CRITICAL' : 'red',

156 })

157

158

159

160 # Create handler 1: writing to 'logfile'. mode 'write', max size = 1Mo.

161 # If logfile is 1Mo, it is renamed to logfile.1, and next logs are still

162 # written to logfile. Then, logfile.1 is renamed to logfile.2, logfile to

163 # logfile.1 etc. We allow maximum 5 log files.

164 # logfile contains everything from INFO level (INFO, WARNING, ERROR)

165 logfile_handler = RotatingFileHandler(logfile, 'w', 10000000, 5)

166 # set level to the same as the logger level

167 logfile_handler.setLevel(logging.INFO)

168 logfile_handler.setFormatter(formatter_file) # add formatter

169 logger.addHandler(logfile_handler) # add handler to logger

170

171 # Create handler 2: errfile. Write only warnings and errors

172 errfile_handler = RotatingFileHandler(errfile, 'w', 10000000, 5)

173 errfile_handler.setLevel(logging.WARNING)

174 errfile_handler.setFormatter(formatter_file) # add formatter

175 logger.addHandler(errfile_handler) # add handler to logger

176

177

178 # Create handler 3: detailsfile. Write everything to this file, except debug

179 # Create it only if:

180 # - level is <= info (for modules which have no details, so detailsfile is the same as

181 # logfile)

182 # - details==True force creation of detailsfile

183 # - quiet==True nothing in stdout, put all log files so that user can check

184 if level < logging.INFO or quiet or log_details:

185 detfile_handler = RotatingFileHandler(detailfile, 'w', 10000000, 5)

186 detfile_handler.setLevel(logging.DETAIL)

187 detfile_handler.setFormatter(formatter_file) # add formatter

188 logger.addHandler(detfile_handler) # add handler to logger

189

190 # Formats for detailed log files

191 my_format_debug = '[%(asctime)s] :: %(levelname)s (from %(name)s logger) :: %(message)s'

192 formatter_file_debug = logging.Formatter(my_format_debug, '%Y-%m-%d %H:%M:%S')

193

194 # Create handler 4: debug file. Write everything

195 if level < logging.DETAIL:

196 debugfile_handler = RotatingFileHandler(debugfile, 'w', 10000000, 5)

197 debugfile_handler.setLevel(logging.DEBUG)

198 debugfile_handler.setFormatter(formatter_file_debug) # add formatter

199 logger.addHandler(debugfile_handler) # add handler to logger

200

201 # If not quiet, add handlers for stdout and stderr

202 if not quiet:

203 # Create handler 4: write to stdout

204 stream_handler = logging.StreamHandler(sys.stdout)

205 # By default, write everything

206 stream_handler.setLevel(logging.DEBUG)

207 # BUT: don't write messages >= WARNING (warning, error, critical)

208 stream_handler.addFilter(LessThanFilter(logging.WARNING))

209 # if not verbose (level 0 or 1): only put info in stdout (remove details and debug)

210 if verbose < 2:

211 stream_handler.addFilter(NoLevelFilter(logging.DETAIL))

212 stream_handler.addFilter(NoLevelFilter(logging.DEBUG))

213 stream_handler.setFormatter(formatter_stream)

214 logger.addHandler(stream_handler) # add handler to logger

215

216 # Create handler 5: write to stderr

217 err_handler = logging.StreamHandler(sys.stderr)

218

219 if verbose > 0:

220 err_handler.setLevel(logging.WARNING) # write all messages >= WARNING

221 else:

222 err_handler.setLevel(logging.ERROR) # write all messages >= ERROR

223 err_handler.setFormatter(formatter_stream)

224 logger.addHandler(err_handler) # add handler to logger

225 logger.propagate = False

226 return logfile, logger

227

228

229class LessThanFilter(logging.Filter):

230 """

231 When using log, when a level is set to a handler, it is a minimum level. All

232 levels higher than it will be printed. If you want to print only until

233 a given level (no levels higher than the specified one), use this class like this:

234 handler.addFilter(LessThanFilter(level))

235 """

236

237 def __init__(self, level):

238 self._level = level

239 logging.Filter.__init__(self)

240

241 def filter(self, rec):

242 """

243 Function to decide if given log has to be logged or not, according to its level

244

245 Parameters

246 ----------

247 rec : current record handled by logger

248

249 Returns

250 -------

251 bool

252 True if level of current log is less than the defined limit, False otherwise

253 """

254 return rec.levelno < self._level

255

256

257class NoLevelFilter(logging.Filter):

258 """

259 When using log, specify a given level that must not be taken into account by the handler.

260 This is used for the stdout handler. We want to print, by default,

261 DEBUG (for development use) and INFO levels, but not DETAILS level (which is between

262 DEBUG and INFO). We want to print DETAIL only if verbose option was set

263 """

264

265 def __init__(self, level):

266 self._level = level

267 logging.Filter.__init__(self)

268

269 def filter(self, rec):

270 """

271 Function to decide if given log has to be logged or not, according to its level

272

273 Parameters

274 ----------

275 rec : current record handled by logger

276

277 Returns

278 -------

279 bool

280 True if level of current log is different from forbidden level, False if it is the same

281 """

282 return rec.levelno != self._level

283

284

285def check_installed(cmd):

286 """

287 Check if the command 'cmd' is in $PATH and can then be executed

288

289 Parameters

290 ----------

291 cmd : str

292 command to run

293

294 Returns

295 -------

296 bool

297 True if installed, False otherwise

298 """

299 torun = "which " + cmd

300 trying = subprocess.Popen(shlex.split(torun), stdout=subprocess.PIPE)

301 out, _ = trying.communicate()

302 if trying.returncode == 0:

303 if os.path.isfile(out.strip()):

304 return True

305 return False

306

307

308def run_cmd(cmd, error, eof=False, **kwargs):

309 """

310 Run the given command line. If the return code is not 0, print error message.

311 if eof (exit on fail) is True, exit program if error code is not 0.

312

313 Parameters

314 ----------

315 cmd : str

316 command to run

317 error : str

318 error message to print if error while running command

319 eof : bool

320 True: exit program if command failed, False: do not exit even if command fails

321 kwargs : Object

322 Can provide a logger, stdout and/or stderr streams

323

324 Returns

325 -------

326 subprocess.Popen

327 returns object of subprocess call (has attributes returncode, pid, communicate etc.)

328

329 """

330 if "logger" not in kwargs:

331 logger = logging.getLogger("utils.run_cmd")

332 else:

333 logger = kwargs["logger"]

334 if "stdout" not in kwargs:

335 kwargs["stdout"] = None

336 if "stderr" not in kwargs:

337 kwargs["stderr"] = None

338 try:

339 call = subprocess.Popen(shlex.split(cmd), stdout=kwargs["stdout"],

340 stderr=kwargs["stderr"])

341 call.wait()

342 retcode = call.returncode

343 except OSError:

344 logger.error(f"error: command '>{cmd}' is not possible.")

345 if eof:

346 sys.exit(1)

347 else:

348 return 1

349 if retcode != 0:

350 logger.error(error)

351 if eof:

352 sys.exit(retcode)

353 return call

354

355

356def plot_distr(values, limit, title, text, logger):

357 """

358 Plot histogram of given 'values', and add a vertical line corresponding to the chosen

359 'limit' and return the mpl figure

360

361 Parameters

362 ----------

363 values : list

364 list of values

365 limit : int

366 limit for which a vertical line must be drawn

367 title : str

368 Title to give to plot

369 text : str

370 text to write near the vertical line representing the limit

371 logger : logging.Logger

372 logger object to write log information

373

374 Returns

375 -------

376 matplotlib.figure.Figure

377 figure generated

378 """

379 import math

380 import numpy as np

381 import matplotlib

382 matplotlib.use('AGG')

383 from matplotlib import pyplot as plt

384 plt.close("all")

385 fig = plt.figure(figsize=(10, 7))

386 ax = fig.add_subplot(1, 1, 1)

387 max_x = max(values)

388 # if too many values, group them to have less bins in the histogram.

389 # Put 'group_values' values in each bin ->

390 # if less than 300 values, 1 value per bin, otherwise more values per bin

391 group_values = int(max_x / 300) + 1

392 dec_ax = math.exp(0.001 * max_x) - 1

393 dec_text = 3 * dec_ax

394 bins = np.arange(0, max_x + 2 * group_values, group_values) - 0.5

395 ax.hist(values, bins=bins, edgecolor="black", color="blue")

396 ax.set_xlim(0.5, max_x + 0.5 * group_values)

397 ax.axvline(x=limit + 0.5 * group_values + dec_ax, color="r")

398 ax.text(x=limit + 0.5 * group_values + dec_text, y=plt.ylim()[1] / 2,

399 s=text + " " + str(limit), color="r", rotation=90)

400 ax.set_title(title)

401 return fig

402

403

404def write_warning_skipped(skipped, do_format=False, prodigal_only=False, logfile=""):

405 """

406 At the end of the script, write a warning to the user with the names of the genomes

407 which had problems with prokka.

408

409 Parameters

410 ----------

411 skipped : list

412 list of genomes with problems

413 do_format : bool

414 if False, genomes were not skipped because of format step, but before that.\

415 if True, they were skipped because of format

416 prodigal_only : bool

417 if False: used prokka to annotate

418 if True: used prodigal to annotate

419 """

420 if not prodigal_only:

421 soft = "prokka"

422 else:

423 soft = "prodigal"

424 logger = logging.getLogger("utils")

425 list_to_write = "\n".join(["\t- " + genome for genome in skipped])

426 if not do_format:

427 logger.info(f"WARNING: Some genomes could not be annotated. See {soft}")

428 logger.warning(f"{soft} had problems while annotating some genomes, or "

429 "did not find any gene. Hence, they are not formatted, and absent "

430 "from your output database. Please look at the "

431 "current error log "

432 "(<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err) to get more "

433 "information on the problems. Here are those "

434 f"genomes:\n{list_to_write}")

435 else:

436 logger.info(f"WARNING: Some genomes could not be formatted. See {logfile}")

437 logger.warning((f"Some genomes were annotated by {soft}, but could not be formatted, "

438 "and are hence absent from your output database. Please look at "

439 "'<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err' and "

440 ".details files to get more information about why they could not be "

441 f"formatted.\n{list_to_write}"))

442

443

444def write_genomes_info(genomes, kept_genomes, list_file, res_path, qc=False):

445 """

446 Write the list of genomes discarded to a file (qc=False), so that users can

447 keep a trace of them, with their information (nb contigs, L90 etc.)

448

449 If qc=True, we stop after QC.

450 -> Write the list of genomes that would be kept for annotation with all

451 their information (L90, size, #contig)

452

453

454 Parameters

455 ----------

456 genomes : dict

457 {genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file,

458 genome_size, nb_contigs, L90]}

459 kept_genomes : list

460 list of genomes kept

461 list_file : str

462 path to input file containing the list of genomes

463 res_path : str

464 folder where results must be saved

465 qc : bool

466 * True: called only if QC only. Name this file info-genomes-<list_file>.txt to put

467 information on genomes that would be annotated if not QC only

468 * otherwise (False), called in any case. Name this file discarded-<list_file>.txt

469 and write all discarded genomes, whether sequences kept are next annotated or not

470 => columns: orig_name, to_annotate, gsize, nb_conts, L90

471 """

472 logger = logging.getLogger("utils")

473 # number of genomes discarded

474 nb_disc = len(genomes) - len(kept_genomes)

475 # Log number of genomes discarded.

476 if not qc and nb_disc < 2:

477 logger.info(f"{nb_disc} genome was discarded.")

478 elif not qc:

479 logger.info(f"{nb_disc} genomes were discarded.")

480 # Get input list file name (without path)

481 _, name_lst = os.path.split(list_file)

482 # if not QC, write discarded genomes to a file "discarded-[list_file].lst"

483 if not qc:

484 outdisc = os.path.join(res_path,

485 "discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst")

486 logger.info("Writing discarded genomes to {}".format(outdisc))

487 # if QC, there is no 'discarded genome', just write information on all analyzed genomes

488 else:

489 outdisc = os.path.join(res_path,

490 "ALL-GENOMES-info-" + ".".join(name_lst.split(".")[:-1]) + ".lst")

491 logger.info("Writing information on genomes in {}".format(outdisc))

492 with open(outdisc, "w") as outdf:

493 outdf.write("\t".join(["orig_name", "to_annotate", "gsize", "nb_conts", "L90"]) + "\n")

494 for genome, values in genomes.items():

495 if genome in kept_genomes:

496 continue

497 _, _, to_annotate, gsize, nbcont, l90 = [str(x) for x in values]

498 to_annotate_file = os.path.basename(to_annotate)

499 outdf.write("\t".join([genome, to_annotate_file, gsize, nbcont, l90]) + "\n")

500

501

502def write_lstinfo(list_file, genomes, outdir):

503 """

504 Write lstinfo file, with following columns:

505 gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90

506

507 Parameters

508 ----------

509 list_file : str

510 input file containing the list of genomes

511 genomes : dict

512 {genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]}

513 outdir : str

514 folder where results must be saved

515

516 """

517 _, name_lst = os.path.split(list_file)

518 outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst")

519 with open(outlst, "w") as outf:

520 outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize",

521 "nb_conts", "L90"]) + "\n")

522 for genome, values in sorted(genomes.items(), key=sort_genomes_byname_l90_nbcont):

523 gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values]

524 outf.write("\t".join([gembase, genome, to_annote, gsize, nbcont, l90]) + "\n")

525 return outlst

526

527

528def sort_genomes_by_name(x):

529 """

530 order by:

531

532 - species

533 - in each species, by strain number

534

535 Parameters

536 ----------

537 x : tuple or str

538 [genome_orig, [gembase, path, gsize, nbcont, L90]] with gembase = species.date.strain

539

540 Returns

541 -------

542 str

543 variable to take into account for sorting. If format is ESCO.1512.00001 return\

544 ESCO and 00001. Otherwise, just return x itself (sort by alphabetical order)

545 """

546 # get gembase name

547 if isinstance(x, tuple):

548 x = x[1][0]

549

550 # if format is ESCO.1512.00001 sort by ESCO, then 00001

551 if "." in x and len(x.split(".")) == 3:

552 return x.split(".")[0], int(x.split(".")[-1])

553 # if format is not like this, just return alphabetical order

554 return x,

555

556

557def sort_genomes_byname_l90_nbcont(x):

558 """

559 Sort all genomes with the following criteria:

560

561 - sort by species (x[1][0] is species.date)

562 - for each species, sort by l90

563 - for same l90, sort by nb contigs

564

565 Parameters

566 ----------

567 x : [[]]

568 [genome_name, [species.date, path, path_to_seq, gsize, nbcont, L90]]

569

570 Returns

571 -------

572 tuple

573 information on species, l90 and nb_contigs

574 """

575 return x[1][0].split(".")[0], x[1][-1], x[1][-2]

576

577

578def sort_genomes_l90_nbcont(x):

579 """

580 Sort all genomes with the following criteria:

581

582 - for each strain, sort by l90

583 - for same l90, sort by nb contigs

584

585 Parameters

586 ----------

587 x : [[]]

588 [genome_name, [species.date, path, gsize, nbcont, L90]]

589

590 Returns

591 -------

592 tuple

593 information on l90 and nb_contigs

594 """

595 return x[1][-1], x[1][-2]

596

597

598def sort_proteins(x):

599 """

600 order by:

601

602 - species

603 - in each species, strain number

604 - in each species and strain number, by protein number

605

606 Parameters

607 ----------

608 x : str

609 species.date.strain.contig_protnum

610

611 Returns

612 -------

613 str

614 variable to take into account for sorting. If format is ESCO.1512.00001.i0002_12124,\

615 return ESCO, 00001 and 12124. If not, it must be something_00001:\

616 return something and 00001.

617 """

618 try:

619 # if format is ESCO.1512.00001.i0002_12124, sort by ESCO, then 00001, then 12124

620 if "." in x and len(x.split(".")) >= 3:

621 return x.split(".")[0], int(x.split(".")[2].split("_")[0]), int(x.split("_")[-1])

622 # if format is not like this, it must be something_00001:

623 # sort by 'something' and then 00001

624 return "_".join(x.split("_")[:-1]), int(x.split("_")[-1])

625 except (IndexError, ValueError):

626 logger = logging.getLogger("utils")

627 logger.error(("ERROR: Protein {} does not have the required format. "

628 "It must contain, at least <alpha-num_only>_<num_only>, and at best "

629 "<name>.<date>.<strain_num>.<contig_info>_<prot_num>. "

630 "Please change its name.").format(x))

631 sys.exit(1)

632

633

634def read_genomes(list_file, name, date, dbpath, tmp_path, logger):

635 """

636 Read list of genomes, and return them.

637 If a genome has a name, also return it. Otherwise, return the name given by user.

638

639 Check that the given genome file exists in dbpath. Otherwise, put an error message,

640 and ignore this file.

641

642 Parameters

643 ----------

644 list_file : str

645 input file containing the list of genomes

646 name : str

647 Default species name

648 date : str

649 Default date

650 dbpath : str

651 path to folder containing original genome files

652 tmp_path : str

653 path to folder which will contain the genome files to use before annotation, if\

654 needed to change them from original file (for example, merging several contig files\

655 in one file, split at each stretch of 5 'N', etc.).

656

657 Returns

658 -------

659 dict

660 {genome: spegenus.date} spegenus.date = name.date

661 """

662 logger.info("Reading genomes")

663 genomes = {}

664 # Check that given list file exists

665 if not os.path.isfile(list_file):

666 logger.error(("ERROR: Your list file '{}' does not exist. "

667 "Please provide a list file.\n Ending program.").format(list_file))

668 sys.exit(1)

669 # List file exists: open it and read it

670 with open(list_file, "r") as lff:

671 for line in lff:

672 line = line.strip()

673 # empty line: go to the next one

674 if line == "":

675 continue

676 # If separator ::, look for species and/or date

677 if "::" in line:

678 # genomes_inf = genome filename(s) separated by space

679 # name_inf = <name>.<date>

680 genomes_inf, name_inf = line.split("::")

681 genomes_inf = genomes_inf.strip()

682 cur_name, cur_date = read_info(name_inf, name, date, genomes_inf)

683 # If no separator '::', no information on name and date of genome: use default ones

684 # Line only contains genome name (filename in given db_path)

685 else:

686 genomes_inf = line.strip()

687 cur_name = name

688 cur_date = date

689 # If several file names, check that each one exists, and concatenate the existing files

690 genomes_inf = genomes_inf.split()

691 if len(genomes_inf) > 1:

692 to_concat = []

693 for file in genomes_inf:

694 if os.path.isfile(os.path.join(dbpath, file)):

695 to_concat.append(file)

696 else:

697 logger.warning(("{} genome file does not exist. Its file will be "

698 "ignored when concatenating {}").format(file, genomes_inf))

699 # If there are files to concatenate, concatenate them

700 if to_concat:

701 genome_name = to_concat[0] + "-all.fna"

702 concat_file = os.path.join(tmp_path, genome_name)

703 to_concat = [os.path.join(dbpath, gname) for gname in to_concat]

704 # Put all genomes listed in 'to_concat' into a same file named 'concat_file'

705 cat(to_concat, concat_file)

706 else:

707 # No genome file listed exists. No sequence = genome ignored

708 logger.warning(("None of the genome files in {} exist. "

709 "This genome will be ignored.").format(genomes_inf))

710 genome_name = ""

711 # If only 1 sequence file, check that it exists, and take its name

712 else:

713 # Genome file given does not exist

714 if not os.path.isfile(os.path.join(dbpath, genomes_inf[0])):

715 logger.warning(("{} genome file does not exist. "

716 "It will be ignored.").format(genomes_inf[0]))

717 genome_name = ""

718 # Genome file exists: get genome_name (from information in first field)

719 else:

720 genome_name = genomes_inf[0]

721 # If there is a genome file (concatenated from several, or already existing), get the full name (with date)

722 if genome_name != "":

723 genomes[genome_name] = [cur_name + "." + cur_date]

724 return genomes

725

726

727def read_genomes_info(list_file, name, date=None, logger=None):

728 """

729 Read a lstinfo file containing the list of genomes with information (L90, genome size etc.).

730 1 line per genome, 4 required columns (Others will just be ignored):

731 to_annotate gsize nb_conts L90

732

733 Check that the given genome file (to_annotate column) exists.

734

735 Parameters

736 ----------

737 list_file : str

738 input file containing information on genomes (to_annotate, size, L90, nb_contigs)

739 name : str

740 Default species name

741 date : str

742 Default date

743 logger : logging.Logger

744 logger object to write log information

745

746

747 Returns

748 -------

749 dict

750 genomes = {genome:

751 [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]

752 }

753 """

754 if not logger:

755 logger = logging.getLogger("prepare.utils")

756 logger.info(f"Reading given information on your genomes in {list_file}")

757 genomes = {}

758 if name and date:

759 spegenus = f"{name}.{date}"

760 column_order = {} # Put the number of column corresponding to each field

761 if not os.path.isfile(list_file):

762 logger.error(f"ERROR: The info file {list_file} that you gave does not exist. "

763 "Please provide the right path/name for this file.\nEnding program.")

764 sys.exit(1)

765 message_no_header = (f"ERROR: It seems that your info file {list_file} does not have a "

766 "header, or this header does not have, at least, the required "

767 "columns tab separated: to_annotate, gsize nb_conts and L90 (in any "

768 "order).\nEnding program.")

769 with open(list_file, "r") as lff:

770 for line in lff:

771 line = line.strip()

772 # Ignore empty lines

773 if line == "":

774 continue

775 # Header line: Just get column number corresponding to each field

776 if "to_annotate" in line:

777 column_headers = line.split("\t")

778 column_order = {header:num for num,header in enumerate(column_headers)}

779 found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"]

780 if head in column_order]

781 if len(found) != 4:

782 logger.error(message_no_header)

783 sys.exit(1)

784 # If no header found, error message and exit

785 if not column_order:

786 logger.error(message_no_header)

787 sys.exit(1)

788 # Get all information on the given genome

789 # line.strip().split() -> all given information.

790 # So, at least to_annotate, gsize, nbcont, L90

791 # Get numeric information

792 try:

793 infos = line.strip().split()

794 # Get genome name with its path to db_dir

795 gpath = infos[column_order["to_annotate"]]

796 gfile = os.path.basename(gpath)

797 gname = os.path.splitext(gfile)[0]

798 gsize = int(infos[column_order["gsize"]])

799 gl90 = int(infos[column_order["L90"]])

800 gcont = int(infos[column_order["nb_conts"]])

801 # If invalid values, warning message and ignore genome

802 except ValueError:

803 logger.warning(f"For genome {gname}, at least one of your columns 'gsize', "

804 "'nb_conts' or 'L90' contains a non numeric value. "

805 "This genome will be ignored.")

806 continue

807 # If no value for at least 1 field, warning message and ignore genome

808 except IndexError:

809 logger.error(f"ERROR: Check that all fields of {list_file} are filled in each "

810 "line (can be 'NA')")

811 sys.exit(1)

812 # Could we find genome file?

813 # Check if genome file exists in db_path.

814 if not os.path.isfile(gpath):

815 logger.warning(f"{gpath} genome file does not exist. This genome will be ignored.")

816 continue

817 # cur genome information to save:

818 # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]

819 if name and date:

820 genomes[gpath] = [spegenus, gpath, gpath, gsize, gcont, gl90]

821 # If called from prepare, no need to rename genomes

822 else:

823 gfile = os.path.basename(gpath)

824 gname = os.path.splitext(gfile)[0]

825 genomes[gfile] = [gname, gpath, gpath, gsize, gcont, gl90]

826 if len(genomes) > 0:

827 logger.info(("Found {} genomes in total").format(len(genomes)))

828 else:

829 logger.error(f"No genome listed in {list_file} was found.")

830 sys.exit(1)

831 return genomes

832

833

834def read_info(name_inf, name, date, genomes_inf):

835 """

836 From the given information in 'name_inf', check if there is a name (and if its

837 format is ok) and if there is a date (and if its format is ok).

838 If no name (resp. no date), return default name (resp. default date).

839

840 Parameters

841 ----------

842 name_inf : str

843 information on current genome, which could contain a species name and a date

844 name : str

845 default species name

846 date : str

847 default date

848 genomes_inf : str

849 current genome filename. Used to complete information when there is a warning (species\

850 name or date given not in the right format...)

851

852 Returns

853 -------

854 (cur_name, cur_date) : tuple

855 with:

856

857 - curname: name to use for this genome (can be the default one, or the one read from\

858 'name_inf'

859 - curdate: date to use for this genome (default or read from 'name_inf')

860 """

861 logger = logging.getLogger("utils")

862 # information on current genome, which could contain a species name and a date: split to get

863 # name and date

864 name_inf = name_inf.strip().split(".")

865 # if only 1 field provided (means only name is provided).

866 # (Otherwise, if only date is provided, it is with '.DATE', so

867 # len(name_inf) = 2 with name_inf[0] = "")

868 if len(name_inf) == 1:

869 # No genome name: use default one

870 if name_inf[0] == "":

871 cur_name = name

872 # Genome name, and in right format: use it

873 elif check_format(name_inf[0]):

874 cur_name = name_inf[0]

875 # Genome name but wrong format: warning message and use default one

876 else:

877 logger.warning(("Invalid name {} given for genome {}. Only put "

878 "4 alphanumeric characters in your date and name. "

879 "For this genome, the default name ({}) will be "

880 "used.").format(name_inf[0], genomes_inf, name))

881 cur_name = name

882 # In any case, no given date: use default date

883 cur_date = date

884

885 # More than 2 informations (more than 2 fields separated by a '.'):

886 # - either user put more information

887 # - either user put a '.' inside the name or date field: wrong format

888 # -> warning message and use default date and name.

889 elif len(name_inf) > 2:

890 logger.warning(("Invalid name/date given for genome {}. Only put "

891 "4 alphanumeric characters in your date and name. For "

892 "this genome, the default name ({}) and date ({}) will "

893 "be used.").format(genomes_inf, name, date))

894 cur_name = name

895 cur_date = date

896 # information on name and date given (can be empty)

897 else:

898 cur_name, cur_date = name_inf

899 # name given empty -> use default name

900 if cur_name == "":

901 cur_name = name

902 # date given empty -> use default date

903 if cur_date == "":

904 cur_date = date

905 # name given not empty but wrong format -> warning message and default name

906 if not check_format(cur_name):

907 logger.warning(("Invalid name {} given for genome {}. Only put "

908 "4 alphanumeric characters in your date and name. "

909 "For this genome, the default name ({}) "

910 "will be used.").format(cur_name, genomes_inf, name))

911 cur_name = name

912 # date given not empty but wrong format -> warning message and default date

913 if not check_format(cur_date):

914 logger.warning(("Invalid date {} given for genome {}. Only put "

915 "4 alphanumeric characters in your date and name. "

916 "For this genome, the default date ({}) "

917 "will be used.").format(cur_date, genomes_inf, date))

918 cur_date = date

919 return cur_name, cur_date

920

921

922def cat(list_files, output, title=None):

923 """

924 Equivalent of 'cat' unix command.

925

926 Concatenate all files in 'list_files' and save result in 'output' folder.

927 Concat using shutil.copyfileobj, in order to copy by chunks, to

928 avoid memory problems if files are big.

929

930 Parameters

931 ----------

932 list_files : list

933 list of filenames to concatenate

934 output : str

935 output filename, where all concatenated files will be written

936 title : str or None

937 if you want to show a progressbar while concatenating files, add a title for this

938 progressbar here. If no title, nothing will be shown during concatenation.

939

940 """

941 bar = None

942 curnum = None

943 if title:

944 nbfiles = len(list_files)

945 widgets = [title + ': ', progressbar.Bar(marker='█', left='', right='', fill=' '),

946 ' ', progressbar.Counter(), f"/{nbfiles}" ' (',

947 progressbar.Percentage(), ") - ", progressbar.Timer()]

948 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfiles, term_width=79).start()

949 curnum = 1

950 with open(output, "w") as outf:

951 for file in list_files:

952 if title:

953 bar.update(curnum)

954 curnum += 1

955 with open(file, "r") as inf:

956 shutil.copyfileobj(inf, outf)

957 if title:

958 bar.finish()

959

960

961def grep(filein, pattern, counts=False):

962 """

963 Equivalent of 'grep' unix command

964

965 By default, returns all the lines containing the given pattern.

966 If counts = True, returns the number of lines containing the pattern.

967

968 Parameters

969 ----------

970 filein : str

971 path to the file in which pattern must be searched

972 pattern : str

973 pattern to search

974 counts : bool

975 True if you want to count how many lines have the pattern and return this number,\

976 False if you want to return all lines containing the pattern.

977

978 Returns

979 -------

980 list or int

981 list of lines if counts=False; number of lines if counts=True

982 """

983 num = 0

984 lines = []

985 with open(filein, "r") as inf:

986 for line in inf:

987 if re.search(pattern, line):

988 lines.append(line.strip())

989 num += 1

990 if counts:

991 return num

992 else:

993 return lines

994

995

996def count(filein, get="lines"):

997 """

998 Similar to 'wc' unix command.

999

1000 Count the number of what is given in 'get'. It can be:

1001

1002 - lines (default)

1003 - words

1004

1005 Parameters

1006 ----------

1007 filein : str

1008 path to the file for which we want to count lines or words

1009 get : ["lines", "words"]

1010 either lines to count the number of lines in the file, or words to count the number\

1011 of words.

1012

1013 Returns

1014 -------

1015 int

1016 Number of lines or words according to value of 'get' parameter.

1017 """

1018 gets = ["lines", "words"]

1019 if get not in gets:

1020 logger = logging.getLogger("utils")

1021 logger.error("Choose what you want to count among {}.".format(gets))

1022 sys.exit(1)

1023 num = 0

1024 with open(filein, "r") as inf:

1025 for line in inf:

1026 if get == "lines":

1027 num += 1

1028 elif get == "words":

1029 num += len(line.split())

1030 return num

1031

1032

1033def check_format(info):

1034 """

1035 Check that the given information (can be the genomes name or the date) is in the right

1036 format: it should have 4 characters, all alphanumeric.

1037

1038 Parameters

1039 ----------

1040 info : str

1041 information to check

1042

1043 Returns

1044 -------

1045 bool

1046 True if right format, False otherwise

1047 """

1048 if len(info) != 4:

1049 return False

1050 return info.isalnum()

1051

1052

1053def check_out_dirs(resdir):

1054 """

1055 Check that there is no file in:

1056

1057 - resdir/LSTINFO

1058 - resdir/Genes

1059 - resdir/Proteins

1060 - resdir/Replicons

1061 - resdir/gff3

1062

1063 Parameters

1064 ----------

1065 resdir : str

1066 path to result directory

1067

1068 """

1069 logger = logging.getLogger("utils")

1070 if glob.glob(os.path.join(resdir, "LSTINFO", "*.lst")):

1071 logger.error("ERROR: Your output directory already has .lst files in the "

1072 "LSTINFO folder. Provide another result directory, or remove the "

1073 "files in this one.\nEnding program.")

1074 sys.exit(1)

1075 if glob.glob(os.path.join(resdir, "Proteins", "*.prt")):

1076 logger.error("ERROR: Your output directory already has .prt files in the "

1077 "Proteins folder. Provide another result directory, or remove the "

1078 "files in this one.\nEnding program.")

1079 sys.exit(1)

1080 if glob.glob(os.path.join(resdir, "Genes", "*.gen")):

1081 logger.error("ERROR: Your output directory already has .gen files in the "

1082 "Genes folder. Provide another result directory, or remove the "

1083 "files in this one.\nEnding program.")

1084 sys.exit(1)

1085 if glob.glob(os.path.join(resdir, "Replicons", "*.fna")):

1086 logger.error("ERROR: Your output directory already has .fna files in the "

1087 "Replicons folder. Provide another result directory, or remove the "

1088 "files in this one.\nEnding program.")

1089 sys.exit(1)

1090 if glob.glob(os.path.join(resdir, "gff3", "*.gff")):

1091 logger.error("ERROR: Your output directory already has .gff files in the "

1092 "gff3 folder. Provide another result directory, or remove the "

1093 "files in this one.\nEnding program.")

1094 sys.exit(1)

1095

1096

1097def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger):

1098 """

1099 For the given genome (sequence in gpath), rename all its contigs

1100 with the new name: 'gembase_name', and save the output sequence in outfile.

1101

1102 For each contig renamed, save its new name as well as its size. This will be used to generate

1103 gff files

1104

1105 Parameters

1106 ----------

1107 gembase_name : str

1108 genome name to use (species.date.strain)

1109 gpath : str

1110 path to the genome sequence

1111 outfile : str

1112 path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers

1113

1114 Returns

1115 -------

1116 tuple

1117 - Dict of all contigs with their original and new name: (list of str)

1118 {>orig_name: >new_name}

1119 - Dict of all contigs with their size: (list of str)

1120 {"new_name': 'size1"}

1121 """

1122 # Initialize variables

1123

1124 # Contig number

1125 contig_num = 1

1126 # contig size

1127 cont_size = 0

1128 # List of contigs (str) [<name>\t<orig_name>]

1129 contigs = {}

1130 # List of contigs (str) with their sizes [<name>\t<size>]

1131 sizes = {}

1132 # Name of previous contig (to put to contigs, as we need to wait for the next

1133 # contig to know the size of the previous one)

1134 prev_cont = ""

1135 prev_orig_name = ""

1136 # sequence of previous contig

1137 seq = ""

1138

1139 # Read input sequence given to prodigal, and open file where sequences with new

1140 # headers must be written.

1141 with open(gpath, "r") as gpf, open(outfile, "w") as grf:

1142 for line in gpf:

1143 # When we find a new header line, convert its name to gembase format, and write it

1144 # to output replicon file

1145 if line.startswith(">") :

1146 # If not first contig (contigs not empty):

1147 # - add its name as well as its size to contigs list

1148 # - add its name with its original name to

1149 # - write header ("<contig name> <size>") to replicon file

1150 if prev_cont:

1151 cont = " ".join([prev_cont, str(cont_size)]) + "\n"

1152 prevcont_nohead = prev_cont.split(">")[1]

1153 prev_orig_name_nohead = prev_orig_name.split(">")[1]

1154 if prev_orig_name_nohead:

1155 if prev_orig_name_nohead in contigs:

1156 logger.error(f"several contigs have the same name "

1157 f"{prev_orig_name_nohead} in {gpath}.")

1158 return False, False

1159 sizes[prevcont_nohead] = cont_size

1160 contigs[prev_orig_name_nohead] = prevcont_nohead

1161 grf.write(cont)

1162 grf.write(seq)

1163 prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)

1164 # keep only first string of contig

1165 prev_orig_name = line.strip().split()[0]

1166 contig_num += 1

1167 cont_size = 0

1168 seq = ""

1169 # Sequence line: write it as is in replicon file, and add its size to cont_size.

1170 else:

1171 seq += line

1172 cont_size += len(line.strip())

1173 # Write last contig, if there is one (if gpath not empty)

1174 if prev_cont:

1175 cont = " ".join([prev_cont, str(cont_size)]) + "\n"

1176 prevcont_nohead = "".join(prev_cont.split(">")[1:])

1177 prev_orig_name_nohead = prev_orig_name.split(">")[1]

1178 if prev_orig_name_nohead:

1179 if prev_orig_name_nohead in contigs:

1180 logger.error(f"several contigs have the same name {prev_orig_name_nohead} "

1181 f"in {gpath}.")

1182 return False, False

1183 contigs[prev_orig_name_nohead] = prevcont_nohead

1184 sizes[prevcont_nohead] = cont_size

1185 grf.write(cont)

1186 grf.write(seq)

1187 if not contigs:

1188 logger.error(f"Your genome {gpath} does not contain any sequence, "

1189 "or is not in fasta format.")

1190 return contigs, sizes

1191# Add test with empty gpath

1192# Add test with non fasta gpath

1193

1194

1195def logger_thread(q):

1196 """

1197 Queue listener used in a thread to handle the logs put to a QueueHandler

1198 by several processes (multiprocessing.pool.map_async for example)

1199

1200 Parameters

1201 ----------

1202 q : multiprocessing.managers.AutoProxy[Queue]

1203 queue to listen

1204

1205 """

1206 while True:

1207 record = q.get()

1208 if record is None:

1209 break

1210 logger = logging.getLogger(record.name)

1211 logger.handle(record)

1212

1213

1214def detail_lvl():

1215 """

1216 Get the int level corresponding to "DETAIL"

1217

1218 Returns

1219 -------

1220 int

1221 int corresponding to the level "DETAIL"

1222 """

1223 return 15

1224

1225

1226def save_bin(objects, fileout):

1227 """

1228 Save python 'objects' in a binary file called 'fileout'

1229

1230 Parameters

1231 ----------

1232 objects : Object

1233 python object to save

1234 fileout : str

1235 path to binary file where objects must be saved

1236

1237 """

1238 with open(fileout, "wb") as binf:

1239 pickle.dump(objects, binf)

1240

1241

1242def load_bin(binfile):

1243 """

1244 Unpickle python objects from the binary file 'binfile'

1245

1246 Parameters

1247 ----------

1248 binfile : str

1249 path to binary file containing python object

1250

1251 Returns

1252 -------

1253 Object

1254 The python objects unpickled

1255

1256 """

1257 with open(binfile, "rb") as binf:

1258 objects = pickle.load(binf)

1259 return objects

1260

1261

1262def write_list(list_names, fileout):

1263 """

1264 Write the given list of strings to a file, 1 per line

1265 """

1266 with open(fileout, "w") as fo:

1267 for genome in list_names:

1268 fo.write(str(genome) + "\n")

1269

1270

1271def list_to_str(list, sep='\t'):

1272 """

1273 Return a string corresponding to the given list, with all elements separated

1274 by a space. Used to write a list into a file. Ex::

1275

1276 [1, 2, "toto"] -> "1 2 toto"

1277

1278 Parameters

1279 ----------

1280 list : list

1281 list of elements that we would like to write

1282 sep : str

1283 Separator to use between the different elements

1284

1285 Returns

1286 -------

1287 str

1288 the string to write

1289 """

1290 list_write = [str(l) for l in list]

1291 return sep.join(list_write) + "\n"

1292

1293

1294def remove(infile):

1295 """

1296 Remove the given file if it exists

1297

1298 Parameters

1299 ----------

1300 infile : str

1301 path to file to remove

1302

1303 """

1304 if os.path.isfile(infile):

1305 os.remove(infile)

1306

1307

1308def thread_progressbar(widgets, stop):

1309 """

1310 Thread running an "inifite" progress bar, while the main thread is working.

1311 Once this progressbar has to stop, we send a signal.

1312

1313 Parameters

1314 ----------

1315 widgets : list

1316 list of widgets to put in the progressbar

1317 stop : function

1318 function returning False when thread can run, True when it has to stop.

1319 """

1320 if widgets:

1321 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)

1322 while True:

1323 bar.update()

1324 if stop():

1325 print()

1326 break