Coverage for PanACoTA/pangenome_module/protein_seq_functions.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to build a bank of all proteins to include in the pangenome 

38 

39@author gem 

40April 2017 

41""" 

42from PanACoTA import utils 

43from PanACoTA import utils_pangenome as utilsp 

44import logging 

45import os 

46 

47logger = logging.getLogger('pangenome.bank') 

48 

49 

def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
    """
    Concatenate the per-genome protein files into a single protein bank.

    If the bank file already exists, it is kept as is: a warning is logged and
    its path is returned without reading `lstinfo` or rewriting anything.

    Parameters
    ----------
    lstinfo : str
        file with 1 line per genome; only the 1st column is used here, as the
        genome name without extension
    dbpath : str
        folder containing the protein file of each genome, each one being
        named `<genome_name>.prt`
    name : str
        dataset name, used to name the output bank: `<outdir>/<name>.All.prt`
    spedir : str or None
        folder where the bank must be saved. If empty or None, the bank is
        saved in `dbpath`. Otherwise this folder is created when missing.
    quiet : bool
        True to call the concatenation without a title, False to pass it
        the "Building bank" title

    Returns
    -------
    str
        path to the protein bank (either the existing one or the new one)
    """
    # A custom output folder is created on demand; dbpath is used as is.
    if spedir:
        os.makedirs(spedir, exist_ok=True)
    target_dir = spedir or dbpath
    bank_path = os.path.join(target_dir, name + ".All.prt")

    # Never overwrite an existing bank: reuse it.
    if os.path.isfile(bank_path):
        logger.warning((f"Protein bank {bank_path} already exists. "
                        "It will be used by mmseqs."))
        return bank_path

    logger.info(f"Building bank with all proteins to {bank_path}")

    # Input protein files are always looked up in dbpath, even when the
    # bank itself is written to spedir.
    prt_files = []
    for genome in utilsp.read_lstinfo(lstinfo, logger):
        prt_files.append(os.path.join(dbpath, genome + ".prt"))

    # The title is only given to utils.cat when not in quiet mode.
    cat_options = {} if quiet else {"title": "Building bank"}
    utils.cat(prt_files, bank_path, **cat_options)
    return bank_path