Coverage for PanACoTA/corepers_module/persistent_functions.py: 100%

70 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to generate a persistent genome from a pangenome. 

38 

39@author gem 

40April 2017 

41""" 

42import logging 

43import math 

44 

45from PanACoTA import utils 

46from PanACoTA import utils_pangenome as utilsp 

47 

48logger = logging.getLogger("corepers.pers") 

49 

50 

51def get_subset_genomes(fam_by_strain, fam_all_members, list_file): 

52 """ 

53 If the user gives a list of genomes, which is a subset of all genomes in the pangenome,  

54 just keep them in fam_by_strain and fam_all_members dicts. This will give the pangenome of those genomes. 

55 They will then be handled by the get_pers function to get corresponding core/persistent genome 

56 

57 Parameters 

58 ---------- 

59 fam_by_strain : dict 

60 {fam_num: {genome1: [members], genome2: [members]}, fam_num2: {genome1: [members]}} 

61 fam_all_members : dict 

62 {fam_num: [all members]} 

63 list_file : str 

64 name of file containing all genome names 

65 

66 Return 

67 ------ 

68 tuple 

69 

70 sub_fbs = {fam_num: {genome: [members]}} -> with only genomes in list_genomes 

71 sub_fam = {fam_num: [members]} -> with only members from genomes in list_genomes 

72 """ 

73 logger.info(f"Getting subset of pangenome for genomes in {list_file}.") 

74 list_genomes = utilsp.read_lstinfo(list_file, logger) 

75 sub_fbs = {} 

76 sub_fam = {} 

77 for fam_num, family in fam_by_strain.items(): 

78 kept = {genome:members for genome, members in family.items() if genome in list_genomes} 

79 if kept != {}: 

80 sub_fbs[fam_num] = kept 

81 sub_fam[fam_num] = [member for member in fam_all_members[fam_num] if is_in_subset(member, list_genomes)] 

82 return sub_fbs, sub_fam, list_genomes 

83 

84 

85def is_in_subset(member, list_genomes): 

86 """ 

87 From a list of members, keep only those in the given list of genomes 

88 

89 Parameters 

90 ---------- 

91 members : str 

92 protein name 

93 list_genomes : str 

94 filename containing list of genomes 

95 

96 Return 

97 ------ 

98 bool 

99 True if member is in list_genomes, False otherwise 

100 """ 

101 # if format is gembase (ex: ESCO.1512.00001.i0002_12124), genome name is 3 first . separated fields 

102 if "." in member and len(member.split(".")) >= 3: 

103 genome = ".".join(member.split(".")[:3]) 

104 return genome in list_genomes 

105 else: 

106 # if format is not like this, it must be something_00001 (genomeName_proteinNumber): 

107 return member.split("_")[0] in list_genomes 

108 

109 

110 

111def get_pers(fam_by_strain, fam_all_members, nb_strains, tol=1, multi=False, mixed=False, 

112 floor=False): 

113 """ 

114 From the list of families, get the Pers Genome families, that are families having at least 

115 tol% of 'nb_strain' members. 

116 

117 Parameters 

118 ---------- 

119 fam_by_strain : dict 

120 {fam_num: {genome1: [members], genome2: [members]}, fam_num2: {genome1: [members]}} 

121 fam_all_members : dict 

122 {fam_num: [all members]} 

123 nb_strains : int 

124 total number of strains/genomes in dataset 

125 tol : float 

126 min percentage of different genomes present in a family 

127 ex: if tol=50%, and there are 8 genomes. If a family contains 3 genomes, it is not 

128 persistent. If it contains 7 genomes, it can be persistent (depends on multi and 

129 mixed parameters) 

130 multi : bool 

131 True if multiple genes from the same genome/strain in a family are tolerated. -> a family is considered as multi-persistent if it has members from at least 'tol%' genomes 

132 False otherwise 

133 mixed : bool 

134 True if mixed families are allowed (mixed family = exactly 1 member per genome 

135 for at least tol% of the genomes, 0 or several members allowed for other (1-tol)% genomes) 

136 floor : bool 

137 Use a minimum number of genomes containing a gene to consider the family 

138 persistent equal to: floor(nb_strains*tol) genomes if True, ceil(nb_strains*tol) 

139 if False. 

140 

141 Returns 

142 ------- 

143 dict 

144 {fam_num: [list of members]} for persistent families 

145 """ 

146 logger.info("Generating Persistent genome of a dataset " 

147 f"containing {nb_strains} genomes") 

148 pers = {} # {fam_num: {strain1: [genes from strain1], strain2: [genes from strain2]}} 

149 fams = {} # {fam_num: [list of members]} 

150 if floor: 

151 min_members = math.floor(tol * nb_strains) 

152 else: 

153 min_members = math.ceil(tol * nb_strains) 

154 for fam_num, family in fam_by_strain.items(): 

155 # If enough strains and multi accepted, or multi not accepted but 1 member per 

156 # strain, add family to core 

157 if mixed: 

158 if mixed_family(family, min_members): 

159 pers[fam_num] = family 

160 fams[fam_num] = fam_all_members[fam_num] 

161 else: 

162 if len(family) >= min_members and (multi or uniq_members(family)): 

163 pers[fam_num] = family 

164 fams[fam_num] = fam_all_members[fam_num] 

165 # coregenome computed 

166 if tol == 1 and not multi and not mixed: 

167 logger.info(f"The core genome contains {len(pers)} families, each one having " 

168 f"exactly {int(min_members)} members, from the {nb_strains} different genomes.") 

169 # multi persistent genome with multigenic families allowed 

170 elif multi: 

171 logger.info(f"The persistent genome contains {len(pers)} families with members present " 

172 f"in at least {min_members} different genomes ({tol*100}% of the total number of " 

173 "genomes).") 

174 # mixed persistent genome, tol% families with exactly 1 member from each genome, 

175 # multigenic families allowed for the '1-tol'% remaining families 

176 elif mixed: 

177 logger.info(f"The persistent genome contains {len(pers)} families, " 

178 f"each one having exactly 1 member from at least {tol*100}% of the genomes ({min_members} " 

179 f"genomes). In the remaining {round((1-tol)*100,3)}% genomes, there can be 0, 1 or " 

180 "several members.") 

181 # Strict persistent genome. tol% families with exactly one member in each genome 

182 else: 

183 logger.info(f"The persistent genome contains {len(pers)} families, each one having " 

184 f"exactly 1 member from at least {tol*100}% of the {nb_strains} " 

185 f"different genomes (that is {min_members} genomes). The other genomes are absent from " 

186 "the family.") 

187 return fams 

188 

189 

190def mixed_family(family, thres): 

191 """ 

192 1 family = several genomes (genome=strain), each containing x members 

193 Returns True if at least 'thres' genomes of the family have exactly 1 member. 

194 

195 Parameters 

196 ---------- 

197 family : dict 

198 {strain1: [members in strain1]} 

199 thres : float 

200 minimum number of genomes which must have exactly 1 member 

201 

202 Returns 

203 ------- 

204 bool 

205 """ 

206 nb_uniq = 0 

207 for genome, members in family.items(): 

208 if nb_uniq < thres: 

209 if len(members) == 0: 

210 logger.warning(f"Problem, no members for {genome}!") 

211 if len(members) == 1: 

212 nb_uniq += 1 

213 else: 

214 return True 

215 return nb_uniq >= thres 

216 

217 

218def uniq_members(family, num=1): 

219 """ 

220 Returns True if, in the family, each genome has no more than 'num' member(s), 

221 False otherwise (multigenic family) 

222 

223 Parameters 

224 ---------- 

225 family : dict 

226 {strain1: [members in strain1], strain2: [members in strain2]} 

227 num : int 

228 max number of members allowed in each genome to return True 

229 

230 Returns 

231 ------- 

232 bool 

233 """ 

234 for genome, members in family.items(): 

235 if len(members) == 0: 

236 logger.warning(f"Problem, no members for {genome}!") 

237 if len(members) > num: 

238 return False 

239 return True 

240 

241 

242def write_persistent(fams, outfile): 

243 """ 

244 Write persistent families into output file 

245 

246 Parameters 

247 ---------- 

248 fams : dict 

249 {num_fam: [members]} 

250 outfile : str 

251 output file to write all families 

252 """ 

253 with open(outfile, "w") as outf: 

254 # Order by family number 

255 for num_fam in sorted(fams, key=lambda x: int(x)): 

256 outf.write(str(num_fam)) # Write family num 

257 fam = fams[num_fam] # Get list of members of the family 

258 for mem in sorted(fam, key=utils.sort_proteins): 

259 outf.write(" " + mem) 

260 outf.write("\n")