Coverage for PanACoTA/align_module/get_seqs.py: 100%
108 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36import sys
37import os
38import logging
39import progressbar
41from PanACoTA import utils
43logger = logging.getLogger("align.extract")
46def get_all_seqs(all_genomes, dname, dbpath, listdir, aldir, all_fams, quiet):
47 """
48 For all genomes, extract its proteins present in a persistent family to the file
49 corresponding to this family.
51 Parameters
52 ----------
53 all_genomes : []
54 list of all genome names
55 dname : str
56 name of dataset
57 dbpath : str
58 path to folder containing 'Proteins' and 'Genes' folders
59 listdir : str
60 path to folder containing the lists of proteins/genes to extract
61 aldir : str
62 path to folder where extracted proteins/genes must be saved
63 all_fams : []
64 list of all family numbers
65 quiet : bool
66 True if nothin must be written to stdout/stderr, False otherwise
67 """
68 # Get list of files not already existing
69 files_todo = check_existing_extract(all_fams, aldir, dname)
70 if len(files_todo) == 0:
71 logger.info(("All extraction files already existing (see detailed log for "
72 "more information)"))
73 logger.warning("All prt and gene files for all families already exist. The program "
74 "will use them for the next step. If you want to re-extract a given "
75 "family, remove its prt and gen extraction files. If you want to "
76 "re-extract all families, use option -F (or --force).")
77 return
78 logger.info("Extracting proteins and genes from all genomes")
79 nbgen = len(all_genomes)
80 bar = None
81 curnum = 1
82 if not quiet:
83 widgets = ['Extraction:',
84 progressbar.Bar(marker='█', left='', right='', fill=' '),
85 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
86 progressbar.Percentage(), ") - ", progressbar.Timer(), ' ',
87 progressbar.ETA()]
88 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()
89 curnum = 1
90 for genome in all_genomes:
91 ge_gen = os.path.join(listdir, dname + "-getEntry_gen_" + genome + ".txt")
92 ge_prt = os.path.join(listdir, dname + "-getEntry_prt_" + genome + ".txt")
93 logger.details(f"Extracting proteins and genes from {genome}")
94 prtdb = os.path.join(dbpath, "Proteins", genome + ".prt")
95 gendb = os.path.join(dbpath, "Genes", genome + ".gen")
96 get_genome_seqs(prtdb, ge_prt, files_todo)
97 get_genome_seqs(gendb, ge_gen, files_todo)
98 if not quiet:
99 bar.update(curnum)
100 curnum += 1
101 if not quiet:
102 bar.finish()
105def check_existing_extract(all_fams, aldir, dname):
106 """
107 For each family, check if its prt and gen extraction file already exist.
108 If both exist, no need to re-extract for those families.
109 If only one or no one exists, put to list to extract.
111 Parameters
112 ----------
113 all_fams : list
114 list of all family numbers
115 aldir : str
116 path to directory where extraction files must be saved
117 dname : str
118 name of the dataset
120 Returns
121 -------
122 []
123 list of files that must be generated (prt and gen files)
124 """
125 extract_fams = []
126 for fam in all_fams:
127 genfile = os.path.join(aldir, "{}-current.{}.gen".format(dname, fam))
128 prtfile = os.path.join(aldir, "{}-current.{}.prt".format(dname, fam))
129 if not os.path.isfile(genfile) or not os.path.isfile(prtfile):
130 # At least 1 file missing: re-extract all proteins and all genes
131 utils.remove(genfile)
132 utils.remove(prtfile)
133 # As we re-extract proteins and genes, redo alignments
134 mafft_file = os.path.join(aldir, "{}-mafft-align.{}.aln".format(dname, fam))
135 btr_file = os.path.join(aldir, "{}-mafft-prt2nuc.{}.aln".format(dname, fam))
136 utils.remove(mafft_file)
137 utils.remove(btr_file)
138 extract_fams.append(genfile)
139 extract_fams.append(prtfile)
140 # If we re-extract at least 1 family, redo the final files (concatenation and group by genome)
141 if len(extract_fams) > 0:
142 concat = os.path.join(aldir, "{}-complete.cat.aln".format(dname))
143 outdir = os.path.split(aldir)[0]
144 treedir = os.path.join(outdir, "Phylo-" + dname)
145 grpfile = os.path.join(treedir, dname + ".grp.aln")
146 utils.remove(concat)
147 utils.remove(grpfile)
148 return extract_fams
151def get_genome_seqs(fasta, tabfile, files_todo, outfile=None):
152 """
153 From a fasta file, extract all sequences given in the tab file.
154 The tab file can contain:
156 - 1 sequence name per line -> all sequences will be extracted to the same file
157 - 1 sequence + 1 filename per line -> each sequence will be extracted in the given file
159 If outfile not given, the tab file must contain 2 columns (1 for the sequence name,
160 1 for its output file). If an outfile is given, only the 1st column of tab file
161 will be considered, and all sequences will be extracted to the given outfile.
163 Parameters
164 ----------
165 fasta : str
166 path to fasta file from which sequences must be extracted
167 tabfile : str
168 path to the tab file containing the names of sequences to extract
169 files_todo : list
170 list of files which must be generated (prt and gen files). Others
171 already exist, so ignore them.
172 outfile : str or None
173 if None, the tab file must contain 2 columns (1 for the sequence name,
174 1 for its output file). If an outfile is given (not None), only the 1st column of tab file
175 will be considered, and all sequences will be extracted to the given outfile.
176 """
177 with open(tabfile, "r") as tabf:
178 to_extract = get_names_to_extract(tabf, outfile)
179 if outfile:
180 if os.path.isfile(outfile):
181 logger.warning("Sequences are already extracted in {}. This will "
182 "be used for next step. If you want "
183 "to re-extract all sequences, use option -F (or "
184 "--force)".format(outfile))
185 return
186 with open(fasta, "r") as fasf, open(outfile, "a") as outf:
187 extract_sequences(to_extract, fasf, outf=outf)
188 else:
189 with open(fasta, "r") as fasf:
190 extract_sequences(to_extract, fasf, files_todo=files_todo)
193def get_names_to_extract(tabf, outfile):
194 """
195 From the tab file, get names of sequences to extract.
197 Parameters
198 ----------
199 tabf : _io.TextIO
200 open file containing names of sequences to extract
201 outfile : str or None
202 if None, the tab file must contain 2 columns (1 for the sequence name,
203 1 for its output file). If an outfile is given (not None), only the 1st column of tab file
204 will be considered, and all sequences will be given 'outfile' as output file
206 Returns
207 -------
208 dict
209 {sequence_to_extract: file_to_which_it_will_be_extracted}
210 """
211 to_extract = {}
212 for line in tabf:
213 if outfile:
214 seq = line.split()[0].strip()
215 out = outfile
216 else:
217 try:
218 seq, out = line.split()[:2]
219 except ValueError:
220 logger.error(("Your file {} does not contain an output filename for {}. "
221 "Please give an output filename for each sequence to extract, "
222 "or give a general output filename where all sequences will "
223 "be extracted.").format(tabf.name, line.strip()))
224 sys.exit(1)
225 to_extract[seq] = out
226 return to_extract
229def extract_sequences(to_extract, fasf, files_todo=None, outf=None):
230 """
231 Extract sequences from an open fasta file 'fasf', and a list of sequences to
232 extract
234 Parameters
235 ----------
236 to_extract : dict or []
237 {sequence_to_extract: file_to_which_it_will_be_extracted} or list of sequences to
238 extract, all in a same outfile (name must be given in 'outf')
239 fasf : _io.TextIO
240 open file containing sequences in multi-fasta format
241 files_todo : list or None
242 list of files which must be generated (prt and gen files). Others
243 already exist, so ignore them.
244 outf : _io.TextIO or None
245 If an outfile is given (not None), and 'to_extract' is a dict, only its keys will be
246 considered, and all these sequences will be extracted to 'outfile' (if 'to_extract' is a
247 list, will extract all sequences of this list). Otherwise, if None,
248 each sequence will be extracted to its corresponding value in 'to_extract'.
249 """
251 # Create optimized index for requests
252 if files_todo is None:
253 files_todo = []
254 files_todo = frozenset(files_todo)
255 if type(to_extract) == list:
256 to_extract = frozenset(to_extract)
257 # State machine variables
258 previous_fp = None
260 for line in fasf:
261 if line[0] == '>':
262 # Close previous file if needed
263 if outf is None and previous_fp is not None:
264 previous_fp.close()
265 previous_fp = None
267 # Extract sequence name
268 last_char = line.find('\t')
269 if last_char == -1:
270 last_char = line.find(' ')
271 if last_char == -1:
272 last_char = len(line)
273 seq = line[1:last_char].strip()
274 # seq = line.strip().split()[0][1:]
276 # Seq is part of sequences to extract
277 if seq in to_extract:
278 # Open the right file
279 previous_fp = outf
280 if previous_fp is None:
281 out = to_extract[seq]
282 if out in files_todo:
283 previous_fp = open(out, "a")
284 else:
285 print(f"Sequence {seq} not written because no output file specified", file=sys.stderr)
287 # Write the line content if the file pointer is opened
288 if previous_fp is not None:
289 previous_fp.write(line)
291 if outf is None and previous_fp is not None:
292 previous_fp.close()