Coverage for PanACoTA/utils_pangenome.py: 100%
74 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions used to deal with pangenome file
39@author gem
40April 2017
41"""
42import logging
43import os
44import sys
45from PanACoTA import utils
47logger = logging.getLogger("utils.pan")
50def read_pangenome(pangenome, logger, families=None):
51 """
52 Read pangenome information
54 Read pangenome according to what is available. First, check if python objects are available,
55 then if not, search for the binary file, and if not, read the text file.
57 Parameters
58 ----------
59 pangenome : str
60 path to pangenome file
61 families : dict or None
62 {num: [members]} if families are given. If not (must read them from binary file if exists\
63 or pangenome file otherwise), None.
65 Returns
66 -------
67 (fams_by_strain, families, all_strains) : tuple
68 with:
70 - fams_by_strain: {fam_num: {strain: [members]}}
71 - families: {fam_num: [all members]}
72 - all_strains: list of all genome names
73 """
74 if families:
75 fams_by_strain, all_strains = get_fams_info(families, logger)
76 if not os.path.isfile(pangenome + ".bin"):
77 logger.details("Saving all information to a binary file for later use")
78 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin")
79 elif os.path.isfile(pangenome + ".bin"):
80 logger.info("Retrieving info from binary file")
81 fams_by_strain, families, all_strains = utils.load_bin(pangenome + ".bin")
82 else:
83 fams_by_strain, families, all_strains = read_pan_file(pangenome, logger)
84 logger.info("Saving all information to a binary file for later use")
85 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin")
86 return fams_by_strain, families, all_strains
89def get_fams_info(families, logger):
90 """
91 From all families as list of members, get more information:
93 - all strains found, sorted by species name
94 - for each family, sort members by strain
96 Parameters
97 ----------
98 families : dict
99 {num: [members]}
100 logger : logging.Logger
101 logger object to write log information
103 Returns
104 -------
105 (fams_by_strain, sorted_all_strains) : tuple
106 with:
108 - fams_by_strain: {fam_num: {strain: [members], strain: [members]}}
109 - sorted_all_strains: list of all strains found, sorted by species
110 """
111 logger.info("Retrieving information from pan families")
112 fams_by_strain = {}
113 all_strains = set()
114 for num, fam in families.items():
115 fams_by_strain[num] = {}
116 for gene in fam:
117 read_gene(gene, num, fams_by_strain, all_strains)
118 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name)
119 return fams_by_strain, sort_all_strains
122def read_pan_file(filein, logger):
123 """
124 Read PanGenome file in 'filein', and put it into Python objects
126 Parameters
127 ----------
128 filein : str
129 path to pangenome file
130 logger :
132 Returns
133 -------
134 (fams_by_strain, families, sort_all_strains) : tuple
135 with:
137 - fams_by_strain: {fam_num: {strain: [members]}}
138 - families: {fam_num: [all members]}
139 - sort_all_strains: list of all genome names, sorted by species name
140 """
141 logger.info("Reading and getting information from pangenome file")
142 fams_by_strain = {}
143 families = {}
144 all_strains = set()
145 nfam = 0
146 with open(filein, 'r') as coref:
147 for line in coref:
148 genes = line.strip().split()
149 fam_num = genes[0]
150 fams_by_strain[fam_num] = {}
151 genes_ok = genes[1:]
152 for gene in genes_ok:
153 read_gene(gene, fam_num, fams_by_strain, all_strains)
154 families[fam_num] = genes_ok
155 nfam += 1
156 if not families or all_strains == {''}:
157 logger.error("Error in pangenome file. No family found.")
158 sys.exit(1)
159 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name)
160 return fams_by_strain, families, sort_all_strains
163def read_gene(gene, num, fams_by_strain, all_strains):
164 """
165 Read information from a given gene name, and save it to appropriate dicts
167 Parameters
168 ----------
169 gene : str
170 gene name (species.date.strain.contig_number
171 num : str
172 num of family from which the given gene is
173 fams_by_strain : dict
174 {fam_num: {strain: [members]}}
175 all_strains : set
176 set of all strains
178 """
179 # if format is ESCO.1512.00001.i001_12313 genome name is ESCO.1512.00001
180 if "." in gene and len(gene.split(".")) >= 3:
181 strain = ".".join(gene.split("_")[0].split(".")[:3])
182 # otherwise, genename is everything before the last "_"
183 else:
184 strain = "_".join(gene.split("_")[:-1])
185 if strain in fams_by_strain[num]:
186 fams_by_strain[num][strain].append(gene)
187 else:
188 fams_by_strain[num][strain] = [gene]
189 if strain not in all_strains:
190 all_strains.add(strain)
193def read_lstinfo(lstinfo, logger):
194 """
195 Read lstinfo file and return list of genomes
197 Parameters
198 ----------
199 lstinfo : str
200 File containing the list of all genomes to include in the pan-genome,
201 1 genome per line. Here, only the first column will be used.
203 Returns
204 -------
205 list
206 list of genomes
207 """
208 genomes = []
209 if not os.path.isfile(lstinfo):
210 logger.error(f"{lstinfo} file not found.")
211 sys.exit(1)
212 with open(lstinfo) as lstf:
213 for line in lstf:
214 # skip header
215 if "_name" in line:
216 continue
217 genome = line.split()[0].strip()
218 genomes.append(genome)
219 if genomes == []:
220 logger.error(f"No genome found in {lstinfo} file.")
221 sys.exit(1)
222 return genomes