Coverage for PanACoTA/pangenome_module/protein_seq_functions.py: 100%
22 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to build a bank of all proteins to include in the pangenome
39@author gem
40April 2017
41"""
42from PanACoTA import utils
43from PanACoTA import utils_pangenome as utilsp
44import logging
45import os
# Module-level logger shared by the functions below; named 'pangenome.bank'.
47logger = logging.getLogger('pangenome.bank')
def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
    """
    Concatenate the protein files of all genomes listed in lstinfo into one bank.

    If the bank file already exists, it is left untouched and its path is
    returned directly (a warning is logged).

    Parameters
    ----------
    lstinfo : str
        file with 1 line per genome; only the 1st column is considered here,
        as the genome name without extension
    dbpath : str
        Proteins folder. Each genome is expected to have its own protein file
        there, called `<genome_name>.prt`.
    name : str
        dataset name, used to name the output databank: <outdir>/<name>.All.prt
    spedir : str or None
        Directory where the bank must be saved. If empty/None, the bank is
        saved in `dbpath`. The directory is created if it does not exist.
    quiet : bool
        True if nothing must be written in stdout/stderr, False otherwise

    Returns
    -------
    str
        name (with path) of the protein databank (newly built or pre-existing)
    """
    # Output goes to dbpath unless a specific directory was requested.
    bank_dir = dbpath
    if spedir:
        os.makedirs(spedir, exist_ok=True)
        bank_dir = spedir
    bank_path = os.path.join(bank_dir, name + ".All.prt")

    if os.path.isfile(bank_path):
        # Never overwrite an existing bank: reuse it as is.
        logger.warning(f"Protein bank {bank_path} already exists. It will be used by mmseqs.")
    else:
        logger.info(f"Building bank with all proteins to {bank_path}")
        # One protein file per genome, always looked up in dbpath.
        prt_files = []
        for genome in utilsp.read_lstinfo(lstinfo, logger):
            prt_files.append(os.path.join(dbpath, genome + ".prt"))
        # The title is only given to utils.cat when not in quiet mode.
        cat_options = {} if quiet else {"title": "Building bank"}
        utils.cat(prt_files, bank_path, **cat_options)
    return bank_path