Coverage for PanACoTA/corepers_module/persistent_functions.py: 100%
70 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to generate a persistent genome from a pangenome.
39@author gem
40April 2017
41"""
42import logging
43import math
45from PanACoTA import utils
46from PanACoTA import utils_pangenome as utilsp
48logger = logging.getLogger("corepers.pers")
51def get_subset_genomes(fam_by_strain, fam_all_members, list_file):
52 """
53 If the user gives a list of genomes, which is a subset of all genomes in the pangenome,
54 just keep them in fam_by_strain and fam_all_members dicts. This will give the pangenome of those genomes.
55 They will then be handled by the get_pers function to get corresponding core/persistent genome
57 Parameters
58 ----------
59 fam_by_strain : dict
60 {fam_num: {genome1: [members], genome2: [members]}, fam_num2: {genome1: [members]}}
61 fam_all_members : dict
62 {fam_num: [all members]}
63 list_file : str
64 name of file containing all genome names
66 Return
67 ------
68 tuple
70 sub_fbs = {fam_num: {genome: [members]}} -> with only genomes in list_genomes
71 sub_fam = {fam_num: [members]} -> with only members from genomes in list_genomes
72 """
73 logger.info(f"Getting subset of pangenome for genomes in {list_file}.")
74 list_genomes = utilsp.read_lstinfo(list_file, logger)
75 sub_fbs = {}
76 sub_fam = {}
77 for fam_num, family in fam_by_strain.items():
78 kept = {genome:members for genome, members in family.items() if genome in list_genomes}
79 if kept != {}:
80 sub_fbs[fam_num] = kept
81 sub_fam[fam_num] = [member for member in fam_all_members[fam_num] if is_in_subset(member, list_genomes)]
82 return sub_fbs, sub_fam, list_genomes
85def is_in_subset(member, list_genomes):
86 """
87 From a list of members, keep only those in the given list of genomes
89 Parameters
90 ----------
91 members : str
92 protein name
93 list_genomes : str
94 filename containing list of genomes
96 Return
97 ------
98 bool
99 True if member is in list_genomes, False otherwise
100 """
101 # if format is gembase (ex: ESCO.1512.00001.i0002_12124), genome name is 3 first . separated fields
102 if "." in member and len(member.split(".")) >= 3:
103 genome = ".".join(member.split(".")[:3])
104 return genome in list_genomes
105 else:
106 # if format is not like this, it must be something_00001 (genomeName_proteinNumber):
107 return member.split("_")[0] in list_genomes
111def get_pers(fam_by_strain, fam_all_members, nb_strains, tol=1, multi=False, mixed=False,
112 floor=False):
113 """
114 From the list of families, get the Pers Genome families, that are families having at least
115 tol% of 'nb_strain' members.
117 Parameters
118 ----------
119 fam_by_strain : dict
120 {fam_num: {genome1: [members], genome2: [members]}, fam_num2: {genome1: [members]}}
121 fam_all_members : dict
122 {fam_num: [all members]}
123 nb_strains : int
124 total number of strains/genomes in dataset
125 tol : float
126 min percentage of different genomes present in a family
127 ex: if tol=50%, and there are 8 genomes. If a family contains 3 genomes, it is not
128 persistent. If it contains 7 genomes, it can be persistent (depends on multi and
129 mixed parameters)
130 multi : bool
131 True if multiple genes from the same genome/strain in a family are tolerated. -> a family is considered as multi-persistent if it has members from at least 'tol%' genomes
132 False otherwise
133 mixed : bool
134 True if mixed families are allowed (mixed family = exactly 1 member per genome
135 for at least tol% of the genomes, 0 or several members allowed for other (1-tol)% genomes)
136 floor : bool
137 Use a minimum number of genomes containing a gene to consider the family
138 persistent equal to: floor(nb_strains*tol) genomes if True, ceil(nb_strains*tol)
139 if False.
141 Returns
142 -------
143 dict
144 {fam_num: [list of members]} for persistent families
145 """
146 logger.info("Generating Persistent genome of a dataset "
147 f"containing {nb_strains} genomes")
148 pers = {} # {fam_num: {strain1: [genes from strain1], strain2: [genes from strain2]}}
149 fams = {} # {fam_num: [list of members]}
150 if floor:
151 min_members = math.floor(tol * nb_strains)
152 else:
153 min_members = math.ceil(tol * nb_strains)
154 for fam_num, family in fam_by_strain.items():
155 # If enough strains and multi accepted, or multi not accepted but 1 member per
156 # strain, add family to core
157 if mixed:
158 if mixed_family(family, min_members):
159 pers[fam_num] = family
160 fams[fam_num] = fam_all_members[fam_num]
161 else:
162 if len(family) >= min_members and (multi or uniq_members(family)):
163 pers[fam_num] = family
164 fams[fam_num] = fam_all_members[fam_num]
165 # coregenome computed
166 if tol == 1 and not multi and not mixed:
167 logger.info(f"The core genome contains {len(pers)} families, each one having "
168 f"exactly {int(min_members)} members, from the {nb_strains} different genomes.")
169 # multi persistent genome with multigenic families allowed
170 elif multi:
171 logger.info(f"The persistent genome contains {len(pers)} families with members present "
172 f"in at least {min_members} different genomes ({tol*100}% of the total number of "
173 "genomes).")
174 # mixed persistent genome, tol% families with exactly 1 member from each genome,
175 # multigenic families allowed for the '1-tol'% remaining families
176 elif mixed:
177 logger.info(f"The persistent genome contains {len(pers)} families, "
178 f"each one having exactly 1 member from at least {tol*100}% of the genomes ({min_members} "
179 f"genomes). In the remaining {round((1-tol)*100,3)}% genomes, there can be 0, 1 or "
180 "several members.")
181 # Strict persistent genome. tol% families with exactly one member in each genome
182 else:
183 logger.info(f"The persistent genome contains {len(pers)} families, each one having "
184 f"exactly 1 member from at least {tol*100}% of the {nb_strains} "
185 f"different genomes (that is {min_members} genomes). The other genomes are absent from "
186 "the family.")
187 return fams
190def mixed_family(family, thres):
191 """
192 1 family = several genomes (genome=strain), each containing x members
193 Returns True if at least 'thres' genomes of the family have exactly 1 member.
195 Parameters
196 ----------
197 family : dict
198 {strain1: [members in strain1]}
199 thres : float
200 minimum number of genomes which must have exactly 1 member
202 Returns
203 -------
204 bool
205 """
206 nb_uniq = 0
207 for genome, members in family.items():
208 if nb_uniq < thres:
209 if len(members) == 0:
210 logger.warning(f"Problem, no members for {genome}!")
211 if len(members) == 1:
212 nb_uniq += 1
213 else:
214 return True
215 return nb_uniq >= thres
218def uniq_members(family, num=1):
219 """
220 Returns True if, in the family, each genome has no more than 'num' member(s),
221 False otherwise (multigenic family)
223 Parameters
224 ----------
225 family : dict
226 {strain1: [members in strain1], strain2: [members in strain2]}
227 num : int
228 max number of members allowed in each genome to return True
230 Returns
231 -------
232 bool
233 """
234 for genome, members in family.items():
235 if len(members) == 0:
236 logger.warning(f"Problem, no members for {genome}!")
237 if len(members) > num:
238 return False
239 return True
242def write_persistent(fams, outfile):
243 """
244 Write persistent families into output file
246 Parameters
247 ----------
248 fams : dict
249 {num_fam: [members]}
250 outfile : str
251 output file to write all families
252 """
253 with open(outfile, "w") as outf:
254 # Order by family number
255 for num_fam in sorted(fams, key=lambda x: int(x)):
256 outf.write(str(num_fam)) # Write family num
257 fam = fams[num_fam] # Get list of members of the family
258 for mem in sorted(fam, key=utils.sort_proteins):
259 outf.write(" " + mem)
260 outf.write("\n")