Coverage for PanACoTA/align_module/pan_to

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37From the Persistent Genome file, group all persistent proteins per genome, in order to be

38able to extract them faster after.

40Input:

42- Persistent genome (1 line per family, 1st column is family number, others are all members)

43- list of all genomes (1 genome per line, only first column is considered)

44- output directory

45- dname: the name of the dataset, used to name output files

47Output:

49- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_gen_<genome>.txt``: list of all

50 genes from the 'genome' which correspond to persistent proteins. 2 columns:

51 the first one is the protein name, the second is the filename to which it must be extracted

52 (corresponding to the family in the persistent genome).

53- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_prt_<genome>.txt``: same as

54 previous file but with the list of proteins to extract instead of genes.

55- for each family: ``<outdir>/Align-<dname>/<dname>-current.<fam_num>.miss.lst``: list of

56 genomes which do not have members in family 'fam_num'.

58@author GEM

59November 2016

60"""

62import os

63import sys

64import logging

66logger = logging.getLogger("align.pan_to_pergenome")

69def get_per_genome(persgen, list_gen, dname, outdir):

70 """

71 From persistent genome and list of all genomes, sort persistent proteins by genome

73 For each genome, write all persistent proteins to a file, with the family from which they

74 are, in order to extract those proteins after.

75 For each family, also save the names of genomes which do not have any member. This will

76 be used to complete the alignments by stretches of '-'.

78 Parameters

79 ----------

80 persgen : str

81 file containing persistent genome

82 list_gen : str

83 file containing the list of all genomes

84 dname : str

85 name of the dataset

86 outdir : str

87 Directory where files must be saved. Will create 2 subfolders: ``Align-<dname>``

88 and ``List-<dname>``

90 Returns

91 -------

92 (all_genomes, aldir, listdir, families) : tuple

94 * all_genomes : [] list of all genome names

95 * aldir : str, path to align directory

96 * listdir : str, path to List directory

97 * families : str, list of family numbers

98 """

99 # Define output directories

100 aldir = os.path.join(outdir, "Align-" + dname)

101 listdir = os.path.join(outdir, "List-" + dname)

102 os.makedirs(aldir, exist_ok=True)

103 os.makedirs(listdir, exist_ok=True)

104

105 # Get list of all genomes

106 all_genomes = get_all_genomes(list_gen)

107 logger.info(f"Found {len(all_genomes)} genomes.")

108

109 # Sort proteins by strain

110 logger.info("Reading PersGenome and constructing lists of missing genomes in each family.")

111 all_prots, fam_genomes, several = proteins_per_strain(persgen, all_genomes)

112 # Write output files

113 write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes)

114 write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname)

115 return all_genomes, aldir, listdir, fam_genomes.keys()

116

117

118def get_all_genomes(list_gen):

119 """

120 Read the file containing the genomes list and get all genome names

121

122 Parameters

123 ----------

124 list_gen : str

125 File containing the list of all genomes

126

127 Returns

128 -------

129 list

130 list of all genome names

131

132 """

133 all_genomes = []

134 with open(list_gen, "r") as lgf:

135 for line in lgf:

136 if "gembase" not in line:

137 genome = line.strip().split()[0]

138 all_genomes.append(genome)

139 return all_genomes

140

141

142def proteins_per_strain(persgen, all_genomes):

143 """

144 From the persistentGenome file, get all persistent proteins, and classify them

145 according to the strain from which they are.

146

147 Parameters

148 ----------

149 persgen : str

150 File containing persistent genome

151

152 Returns

153 -------

154 (all_prots, fam_genomes, several) : tuple

155

156 * all_prots: dict, {strain: {member: fam_num}}

157 * fam_genomes: dict, {fam_num: set(genomes having a member in fam)}

158 * several: dict, {fam_num: set(genomes having several members in fam)}

159 """

160 logger.info("Getting all persistent proteins and classify by strain.")

161 all_genomes = frozenset(all_genomes)

162 all_prots = {} # {strain: {member: fam_num}}

163 fam_genomes = {} # {fam_num: set(genomes having a member in fam)}

164 several = {} # {fam_num: set(genomes having several members in fam)}

165 with open(persgen, "r") as pgf:

166 for line in pgf:

167 fam_num = line.split()[0]

168 members = line.strip().split()[1:]

169 fam_genomes[fam_num] = set()

170 several[fam_num] = set()

171 for mem in members:

172 # if format is ESCO.1512.00001.i0002_12124, strain is 3 first fields

173 # separated by '.'

174 if "." in mem and len(mem.split(".")) >= 3:

175 strain = ".".join(mem.split(".")[:3])

176 if strain not in all_genomes:

177 strain = "_".join(mem.split("_")[:-1])

178 # if format is not like this, it must be something_00001:

179 else:

180 strain = "_".join(mem.split("_")[:-1])

181 # if strain not already in fam_genomes, add it

182 if strain not in fam_genomes[fam_num]:

183 fam_genomes[fam_num].add(strain)

184 # If strain already in fam_genomes, it has several members: add it to several

185 elif strain not in several[fam_num]:

186 several[fam_num].add(strain)

187 if strain not in all_prots:

188 all_prots[strain] = {}

189 if mem in all_prots[strain]:

190 logger.warning((" problem: {} already exists, in family {}. Conflict with "

191 "family {}.").format(mem, all_prots[strain][mem], fam_num))

192 all_prots[strain][mem] = fam_num

193 return all_prots, fam_genomes, several

194

195

196def write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes):

197 """

198 For each species, write all its persistent proteins into a file, with the

199 persistent family in which they are. Those files will be used to extract all

200 proteins.

201

202 Parameters

203 ----------

204 all_prots : dict

205 {strain: {member: fam_num}}

206 several : dict

207 {fam_num: [genomes having several members in family]}

208 listdir : str

209 directory where lists of proteins per genome must be saved

210 aldir : str

211 directory where extracted proteins per family must be saved

212 dname : str

213 name of dataset

214 all_genomes : list

215 list of all genomes

216 """

217 for strain, member in all_prots.items():

218 write_genome_file(listdir, aldir, dname, strain, member, several)

219 error = []

220 for strain in all_genomes:

221 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt")

222 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt")

223 if not os.path.isfile(gegenfile) or not os.path.isfile(geprtfile):

224 error.append(strain)

225 if error:

226 for gen in error:

227 logger.error(f"There is not any protein for genome {gen} in any family! "

228 "The program will close, please fix this problem to be able to "

229 "run the alignments")

230 sys.exit(1)

231

232

233def write_genome_file(listdir, aldir, dname, strain, member, several):

234 """

235 For a given genome, write all the proteins and genes to extract to its file.

236 If one of the 2 files (proteins and genes) already exists, overwrite it.

237 If no file exists -> write them.

238 If the 2 files exist -> warning saying that we use already existing files.

239

240 Parameters

241 ----------

242 listdir : str

243 path to List directory

244 aldir : str

245 path to Align directory

246 dname : str

247 name of dataset

248 strain : str

249 current genome name

250 member : dict

251 {member: fam_num}

252 several : dict

253 {fam_num: [genomes having several members in family]}

254 """

255 # If the 2 files exist, use them as they are

256 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt")

257 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt")

258 if os.path.isfile(gegenfile) and os.path.isfile(geprtfile):

259 logger.warning(f"For genome {strain}, {geprtfile} and {gegenfile} already exist. "

260 "The program will use them "

261 "to extract proteins and genes. If you prefer to rewrite them, use "

262 "option -F (or --force).")

263 return

264

265 # If at least one of the 2 files already exists, overwrite both files

266 with open(gegenfile, "w") as gegf, open(geprtfile, "w") as gepf:

267 for mem, fam in member.items():

268 if strain not in several[fam]:

269 genfile = os.path.join(aldir, dname + "-current." + fam + ".gen")

270 gegf.write(mem + " " + genfile + "\n")

271 prtfile = os.path.join(aldir, dname + "-current." + fam + ".prt")

272 gepf.write(mem + " " + prtfile + "\n")

273

274

275def write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname):

276 """

277 For each family, write the names of all genomes which do not have any member in

278 the family.

279

280 Parameters

281 ----------

282 fam_genomes : dict

283 {fam_num: [genomes having a member in fam]}

284 several : dict

285 {fam_num: [genomes having several members in family]}

286 all_genomes : list

287 list of all genomes

288 aldir : str

289 directory where the lists of missing genomes per family must be saved

290 dname : str

291 name of dataset

292 """

293 for fam, genomes in fam_genomes.items():

294 # File where missing genomes will be written

295 missfile = os.path.join(aldir, f"{dname}-current.{fam}.miss.lst")

296 with open(missfile, "w") as mff:

297 # missing = missing or several members:

298 # miss: all genomes - genomes in the family

299 # several: add to 'miss' genomes with several members in the family

300 missing = (set(all_genomes) - set(genomes)).union(set(several[fam]))

301 if missing:

302 mff.write("\n".join(missing) + "\n")

Coverage for PanACoTA/align_module/pan_to_pergenome.py: 100%

86 statements