Coverage for PanACoTA/pangenome_module/protein_seq_functions.py

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to build a bank of all proteins to include in the pangenome

39@author gem

40April 2017

41"""

42from PanACoTA import utils

43from PanACoTA import utils_pangenome as utilsp

44import logging

45import os

# Module-level logger registered under the name 'pangenome.bank'; build_prt_bank
# uses it for its warning/info messages and hands it to utilsp.read_lstinfo.
47logger = logging.getLogger('pangenome.bank')

def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
    """
    Concatenate the protein files of every genome listed in lstinfo into one bank.

    If the bank file already exists, it is not rebuilt: a warning is logged and
    its path is returned as is.

    Parameters
    ----------
    lstinfo : str
        file with 1 line per genome; only the 1st column (genome name without
        extension) is considered here
    dbpath : str
        folder containing the protein file of each genome, named
        `<genome_name>.prt`
    name : str
        dataset name; the bank is written to `<outdir>/<name>.All.prt`
    spedir : str or None
        directory where the bank must be saved (created if needed). If empty or
        None, the bank is saved in `dbpath`.
    quiet : bool
        True if nothing must be written in stdout/stderr, False otherwise

    Returns
    -------
    str
        name (with path) of the protein databank (generated or already existing)
    """
    # Select the output folder; a specific one is created on demand.
    if spedir:
        os.makedirs(spedir, exist_ok=True)
        target_dir = spedir
    else:
        target_dir = dbpath
    bank_path = os.path.join(target_dir, name + ".All.prt")

    # Re-use a bank left by a previous run instead of rebuilding it.
    if os.path.isfile(bank_path):
        logger.warning(f"Protein bank {bank_path} already exists. "
                       "It will be used by mmseqs.")
        return bank_path

    logger.info(f"Building bank with all proteins to {bank_path}")
    genome_names = utilsp.read_lstinfo(lstinfo, logger)
    prt_files = [os.path.join(dbpath, genome + ".prt") for genome in genome_names]

    # The title is only passed to utils.cat when not in quiet mode.
    cat_options = {} if quiet else {"title": "Building bank"}
    utils.cat(prt_files, bank_path, **cat_options)
    return bank_path

Coverage for PanACoTA/pangenome_module/protein_seq_functions.py: 100%

22 statements