Coverage for PanACoTA/align_module/pan_to_pergenome.py: 100%

86 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37From the Persistent Genome file, group all persistent proteins per genome, in order to be 

38able to extract them faster after. 

39 

40Input: 

41 

42- Persistent genome (1 line per family, 1st column is family number, others are all members) 

43- list of all genomes (1 genome per line, only first column is considered) 

44- output directory 

45- dname: the name of the dataset, used to name output files 

46 

47Output: 

48 

49- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_gen_<genome>.txt``: list of all 

50 genes from the 'genome' which correspond to persistent proteins. 2 columns: 

51 the first one is the protein name, the second is the filename to which it must be extracted 

52 (corresponding to the family in the persistent genome). 

53- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_prt_<genome>.txt``: same as 

54 previous file but with the list of proteins to extract instead of genes. 

55- for each family: ``<outdir>/Align-<dname>/<dname>-current.<fam_num>.miss.lst``: list of 

56 genomes which do not have members in family 'fam_num'. 

57 

58@author GEM 

59November 2016 

60""" 

61 

62import os 

63import sys 

64import logging 

65 

66logger = logging.getLogger("align.pan_to_pergenome") 

67 

68 

69def get_per_genome(persgen, list_gen, dname, outdir): 

70 """ 

71 From persistent genome and list of all genomes, sort persistent proteins by genome 

72 

73 For each genome, write all persistent proteins to a file, with the family from which they 

74 are, in order to extract those proteins after. 

75 For each family, also save the names of genomes which do not have any member. This will 

76 be used to complete the alignments by stretches of '-'. 

77 

78 Parameters 

79 ---------- 

80 persgen : str 

81 file containing persistent genome 

82 list_gen : str 

83 file containing the list of all genomes 

84 dname : str 

85 name of the dataset 

86 outdir : str 

87 Directory where files must be saved. Will create 2 subfolders: ``Align-<dname>`` 

88 and ``List-<dname>`` 

89 

90 Returns 

91 ------- 

92 (all_genomes, aldir, listdir, families) : tuple 

93 

94 * all_genomes : [] list of all genome names 

95 * aldir : str, path to align directory 

96 * listdir : str, path to List directory 

97 * families : str, list of family numbers 

98 """ 

99 # Define output directories 

100 aldir = os.path.join(outdir, "Align-" + dname) 

101 listdir = os.path.join(outdir, "List-" + dname) 

102 os.makedirs(aldir, exist_ok=True) 

103 os.makedirs(listdir, exist_ok=True) 

104 

105 # Get list of all genomes 

106 all_genomes = get_all_genomes(list_gen) 

107 logger.info(f"Found {len(all_genomes)} genomes.") 

108 

109 # Sort proteins by strain 

110 logger.info("Reading PersGenome and constructing lists of missing genomes in each family.") 

111 all_prots, fam_genomes, several = proteins_per_strain(persgen, all_genomes) 

112 # Write output files 

113 write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes) 

114 write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname) 

115 return all_genomes, aldir, listdir, fam_genomes.keys() 

116 

117 

118def get_all_genomes(list_gen): 

119 """ 

120 Read the file containing the genomes list and get all genome names 

121 

122 Parameters 

123 ---------- 

124 list_gen : str 

125 File containing the list of all genomes 

126 

127 Returns 

128 ------- 

129 list 

130 list of all genome names 

131 

132 """ 

133 all_genomes = [] 

134 with open(list_gen, "r") as lgf: 

135 for line in lgf: 

136 if "gembase" not in line: 

137 genome = line.strip().split()[0] 

138 all_genomes.append(genome) 

139 return all_genomes 

140 

141 

142def proteins_per_strain(persgen, all_genomes): 

143 """ 

144 From the persistentGenome file, get all persistent proteins, and classify them 

145 according to the strain from which they are. 

146 

147 Parameters 

148 ---------- 

149 persgen : str 

150 File containing persistent genome 

151 

152 Returns 

153 ------- 

154 (all_prots, fam_genomes, several) : tuple 

155 

156 * all_prots: dict, {strain: {member: fam_num}} 

157 * fam_genomes: dict, {fam_num: set(genomes having a member in fam)} 

158 * several: dict, {fam_num: set(genomes having several members in fam)} 

159 """ 

160 logger.info("Getting all persistent proteins and classify by strain.") 

161 all_genomes = frozenset(all_genomes) 

162 all_prots = {} # {strain: {member: fam_num}} 

163 fam_genomes = {} # {fam_num: set(genomes having a member in fam)} 

164 several = {} # {fam_num: set(genomes having several members in fam)} 

165 with open(persgen, "r") as pgf: 

166 for line in pgf: 

167 fam_num = line.split()[0] 

168 members = line.strip().split()[1:] 

169 fam_genomes[fam_num] = set() 

170 several[fam_num] = set() 

171 for mem in members: 

172 # if format is ESCO.1512.00001.i0002_12124, strain is 3 first fields 

173 # separated by '.' 

174 if "." in mem and len(mem.split(".")) >= 3: 

175 strain = ".".join(mem.split(".")[:3]) 

176 if strain not in all_genomes: 

177 strain = "_".join(mem.split("_")[:-1]) 

178 # if format is not like this, it must be something_00001: 

179 else: 

180 strain = "_".join(mem.split("_")[:-1]) 

181 # if strain not already in fam_genomes, add it 

182 if strain not in fam_genomes[fam_num]: 

183 fam_genomes[fam_num].add(strain) 

184 # If strain already in fam_genomes, it has several members: add it to several 

185 elif strain not in several[fam_num]: 

186 several[fam_num].add(strain) 

187 if strain not in all_prots: 

188 all_prots[strain] = {} 

189 if mem in all_prots[strain]: 

190 logger.warning((" problem: {} already exists, in family {}. Conflict with " 

191 "family {}.").format(mem, all_prots[strain][mem], fam_num)) 

192 all_prots[strain][mem] = fam_num 

193 return all_prots, fam_genomes, several 

194 

195 

196def write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes): 

197 """ 

198 For each species, write all its persistent proteins into a file, with the 

199 persistent family in which they are. Those files will be used to extract all 

200 proteins. 

201 

202 Parameters 

203 ---------- 

204 all_prots : dict 

205 {strain: {member: fam_num}} 

206 several : dict 

207 {fam_num: [genomes having several members in family]} 

208 listdir : str 

209 directory where lists of proteins per genome must be saved 

210 aldir : str 

211 directory where extracted proteins per family must be saved 

212 dname : str 

213 name of dataset 

214 all_genomes : list 

215 list of all genomes 

216 """ 

217 for strain, member in all_prots.items(): 

218 write_genome_file(listdir, aldir, dname, strain, member, several) 

219 error = [] 

220 for strain in all_genomes: 

221 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt") 

222 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt") 

223 if not os.path.isfile(gegenfile) or not os.path.isfile(geprtfile): 

224 error.append(strain) 

225 if error: 

226 for gen in error: 

227 logger.error(f"There is not any protein for genome {gen} in any family! " 

228 "The program will close, please fix this problem to be able to " 

229 "run the alignments") 

230 sys.exit(1) 

231 

232 

233def write_genome_file(listdir, aldir, dname, strain, member, several): 

234 """ 

235 For a given genome, write all the proteins and genes to extract to its file. 

236 If one of the 2 files (proteins and genes) already exists, overwrite it. 

237 If no file exists -> write them. 

238 If the 2 files exist -> warning saying that we use already existing files. 

239 

240 Parameters 

241 ---------- 

242 listdir : str 

243 path to List directory 

244 aldir : str 

245 path to Align directory 

246 dname : str 

247 name of dataset 

248 strain : str 

249 current genome name 

250 member : dict 

251 {member: fam_num} 

252 several : dict 

253 {fam_num: [genomes having several members in family]} 

254 """ 

255 # If the 2 files exist, use them as they are 

256 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt") 

257 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt") 

258 if os.path.isfile(gegenfile) and os.path.isfile(geprtfile): 

259 logger.warning(f"For genome {strain}, {geprtfile} and {gegenfile} already exist. " 

260 "The program will use them " 

261 "to extract proteins and genes. If you prefer to rewrite them, use " 

262 "option -F (or --force).") 

263 return 

264 

265 # If at least one of the 2 files already exists, overwrite both files 

266 with open(gegenfile, "w") as gegf, open(geprtfile, "w") as gepf: 

267 for mem, fam in member.items(): 

268 if strain not in several[fam]: 

269 genfile = os.path.join(aldir, dname + "-current." + fam + ".gen") 

270 gegf.write(mem + " " + genfile + "\n") 

271 prtfile = os.path.join(aldir, dname + "-current." + fam + ".prt") 

272 gepf.write(mem + " " + prtfile + "\n") 

273 

274 

275def write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname): 

276 """ 

277 For each family, write the names of all genomes which do not have any member in 

278 the family. 

279 

280 Parameters 

281 ---------- 

282 fam_genomes : dict 

283 {fam_num: [genomes having a member in fam]} 

284 several : dict 

285 {fam_num: [genomes having several members in family]} 

286 all_genomes : list 

287 list of all genomes 

288 aldir : str 

289 directory where the lists of missing genomes per family must be saved 

290 dname : str 

291 name of dataset 

292 """ 

293 for fam, genomes in fam_genomes.items(): 

294 # File where missing genomes will be written 

295 missfile = os.path.join(aldir, f"{dname}-current.{fam}.miss.lst") 

296 with open(missfile, "w") as mff: 

297 # missing = missing or several members: 

298 # miss: all genomes - genomes in the family 

299 # several: add to 'miss' genomes with several members in the family 

300 missing = (set(all_genomes) - set(genomes)).union(set(several[fam])) 

301 if missing: 

302 mff.write("\n".join(missing) + "\n")