Coverage for PanACoTA/align_module/pan_to_pergenome.py: 100%
86 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37From the Persistent Genome file, group all persistent proteins per genome, in order to be
38able to extract them faster after.
40Input:
42- Persistent genome (1 line per family, 1st column is family number, others are all members)
43- list of all genomes (1 genome per line, only first column is considered)
44- output directory
45- dname: the name of the dataset, used to name output files
47Output:
49- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_gen_<genome>.txt``: list of all
50 genes from the 'genome' which correspond to persistent proteins. 2 columns:
51 the first one is the protein name, the second is the filename to which it must be extracted
52 (corresponding to the family in the persistent genome).
53- for each genome: ``<outdir>/List-<dname>/<dname>-getEntry_prt_<genome>.txt``: same as
54 previous file but with the list of proteins to extract instead of genes.
55- for each family: ``<outdir>/Align-<dname>/<dname>-current.<fam_num>.miss.lst``: list of
56 genomes which do not have members in family 'fam_num'.
58@author GEM
59November 2016
60"""
62import os
63import sys
64import logging
66logger = logging.getLogger("align.pan_to_pergenome")
69def get_per_genome(persgen, list_gen, dname, outdir):
70 """
71 From persistent genome and list of all genomes, sort persistent proteins by genome
73 For each genome, write all persistent proteins to a file, with the family from which they
74 are, in order to extract those proteins after.
75 For each family, also save the names of genomes which do not have any member. This will
76 be used to complete the alignments by stretches of '-'.
78 Parameters
79 ----------
80 persgen : str
81 file containing persistent genome
82 list_gen : str
83 file containing the list of all genomes
84 dname : str
85 name of the dataset
86 outdir : str
87 Directory where files must be saved. Will create 2 subfolders: ``Align-<dname>``
88 and ``List-<dname>``
90 Returns
91 -------
92 (all_genomes, aldir, listdir, families) : tuple
94 * all_genomes : [] list of all genome names
95 * aldir : str, path to align directory
96 * listdir : str, path to List directory
97 * families : str, list of family numbers
98 """
99 # Define output directories
100 aldir = os.path.join(outdir, "Align-" + dname)
101 listdir = os.path.join(outdir, "List-" + dname)
102 os.makedirs(aldir, exist_ok=True)
103 os.makedirs(listdir, exist_ok=True)
105 # Get list of all genomes
106 all_genomes = get_all_genomes(list_gen)
107 logger.info(f"Found {len(all_genomes)} genomes.")
109 # Sort proteins by strain
110 logger.info("Reading PersGenome and constructing lists of missing genomes in each family.")
111 all_prots, fam_genomes, several = proteins_per_strain(persgen, all_genomes)
112 # Write output files
113 write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes)
114 write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname)
115 return all_genomes, aldir, listdir, fam_genomes.keys()
118def get_all_genomes(list_gen):
119 """
120 Read the file containing the genomes list and get all genome names
122 Parameters
123 ----------
124 list_gen : str
125 File containing the list of all genomes
127 Returns
128 -------
129 list
130 list of all genome names
132 """
133 all_genomes = []
134 with open(list_gen, "r") as lgf:
135 for line in lgf:
136 if "gembase" not in line:
137 genome = line.strip().split()[0]
138 all_genomes.append(genome)
139 return all_genomes
142def proteins_per_strain(persgen, all_genomes):
143 """
144 From the persistentGenome file, get all persistent proteins, and classify them
145 according to the strain from which they are.
147 Parameters
148 ----------
149 persgen : str
150 File containing persistent genome
152 Returns
153 -------
154 (all_prots, fam_genomes, several) : tuple
156 * all_prots: dict, {strain: {member: fam_num}}
157 * fam_genomes: dict, {fam_num: set(genomes having a member in fam)}
158 * several: dict, {fam_num: set(genomes having several members in fam)}
159 """
160 logger.info("Getting all persistent proteins and classify by strain.")
161 all_genomes = frozenset(all_genomes)
162 all_prots = {} # {strain: {member: fam_num}}
163 fam_genomes = {} # {fam_num: set(genomes having a member in fam)}
164 several = {} # {fam_num: set(genomes having several members in fam)}
165 with open(persgen, "r") as pgf:
166 for line in pgf:
167 fam_num = line.split()[0]
168 members = line.strip().split()[1:]
169 fam_genomes[fam_num] = set()
170 several[fam_num] = set()
171 for mem in members:
172 # if format is ESCO.1512.00001.i0002_12124, strain is 3 first fields
173 # separated by '.'
174 if "." in mem and len(mem.split(".")) >= 3:
175 strain = ".".join(mem.split(".")[:3])
176 if strain not in all_genomes:
177 strain = "_".join(mem.split("_")[:-1])
178 # if format is not like this, it must be something_00001:
179 else:
180 strain = "_".join(mem.split("_")[:-1])
181 # if strain not already in fam_genomes, add it
182 if strain not in fam_genomes[fam_num]:
183 fam_genomes[fam_num].add(strain)
184 # If strain already in fam_genomes, it has several members: add it to several
185 elif strain not in several[fam_num]:
186 several[fam_num].add(strain)
187 if strain not in all_prots:
188 all_prots[strain] = {}
189 if mem in all_prots[strain]:
190 logger.warning((" problem: {} already exists, in family {}. Conflict with "
191 "family {}.").format(mem, all_prots[strain][mem], fam_num))
192 all_prots[strain][mem] = fam_num
193 return all_prots, fam_genomes, several
196def write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes):
197 """
198 For each species, write all its persistent proteins into a file, with the
199 persistent family in which they are. Those files will be used to extract all
200 proteins.
202 Parameters
203 ----------
204 all_prots : dict
205 {strain: {member: fam_num}}
206 several : dict
207 {fam_num: [genomes having several members in family]}
208 listdir : str
209 directory where lists of proteins per genome must be saved
210 aldir : str
211 directory where extracted proteins per family must be saved
212 dname : str
213 name of dataset
214 all_genomes : list
215 list of all genomes
216 """
217 for strain, member in all_prots.items():
218 write_genome_file(listdir, aldir, dname, strain, member, several)
219 error = []
220 for strain in all_genomes:
221 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt")
222 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt")
223 if not os.path.isfile(gegenfile) or not os.path.isfile(geprtfile):
224 error.append(strain)
225 if error:
226 for gen in error:
227 logger.error(f"There is not any protein for genome {gen} in any family! "
228 "The program will close, please fix this problem to be able to "
229 "run the alignments")
230 sys.exit(1)
233def write_genome_file(listdir, aldir, dname, strain, member, several):
234 """
235 For a given genome, write all the proteins and genes to extract to its file.
236 If one of the 2 files (proteins and genes) already exists, overwrite it.
237 If no file exists -> write them.
238 If the 2 files exist -> warning saying that we use already existing files.
240 Parameters
241 ----------
242 listdir : str
243 path to List directory
244 aldir : str
245 path to Align directory
246 dname : str
247 name of dataset
248 strain : str
249 current genome name
250 member : dict
251 {member: fam_num}
252 several : dict
253 {fam_num: [genomes having several members in family]}
254 """
255 # If the 2 files exist, use them as they are
256 gegenfile = os.path.join(listdir, dname + "-getEntry_gen_" + strain + ".txt")
257 geprtfile = os.path.join(listdir, dname + "-getEntry_prt_" + strain + ".txt")
258 if os.path.isfile(gegenfile) and os.path.isfile(geprtfile):
259 logger.warning(f"For genome {strain}, {geprtfile} and {gegenfile} already exist. "
260 "The program will use them "
261 "to extract proteins and genes. If you prefer to rewrite them, use "
262 "option -F (or --force).")
263 return
265 # If at least one of the 2 files already exists, overwrite both files
266 with open(gegenfile, "w") as gegf, open(geprtfile, "w") as gepf:
267 for mem, fam in member.items():
268 if strain not in several[fam]:
269 genfile = os.path.join(aldir, dname + "-current." + fam + ".gen")
270 gegf.write(mem + " " + genfile + "\n")
271 prtfile = os.path.join(aldir, dname + "-current." + fam + ".prt")
272 gepf.write(mem + " " + prtfile + "\n")
275def write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname):
276 """
277 For each family, write the names of all genomes which do not have any member in
278 the family.
280 Parameters
281 ----------
282 fam_genomes : dict
283 {fam_num: [genomes having a member in fam]}
284 several : dict
285 {fam_num: [genomes having several members in family]}
286 all_genomes : list
287 list of all genomes
288 aldir : str
289 directory where the lists of missing genomes per family must be saved
290 dname : str
291 name of dataset
292 """
293 for fam, genomes in fam_genomes.items():
294 # File where missing genomes will be written
295 missfile = os.path.join(aldir, f"{dname}-current.{fam}.miss.lst")
296 with open(missfile, "w") as mff:
297 # missing = missing or several members:
298 # miss: all genomes - genomes in the family
299 # several: add to 'miss' genomes with several members in the family
300 missing = (set(all_genomes) - set(genomes)).union(set(several[fam]))
301 if missing:
302 mff.write("\n".join(missing) + "\n")