Coverage for PanACoTA/annotate_module/genome_seq_functions.py: 100%
150 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to:
39- analyse a genome (cut at stretch of N if asked, calc L90, nb cont, size...)
40- if genome cut by stretch of N, write the new sequence to new file
41- rename genome contigs and write new sequence to new file
44@author gem
45April 2017
46"""
47import os
48import re
49import sys
50import numpy as np
51import logging
52import progressbar
54from PanACoTA import utils
56logger = logging.getLogger("annotate.gseq_functions")
58def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=False):
59 """
61 Parameters
62 ----------
63 genomes : dict
64 {genome: spegenus.date}
65 dbpath : str
66 path to folder containing genomes
67 tmp_path : str
68 path to put out files
69 nbn : int
70 minimum number of 'N' required to cut into a new contig
71 soft : str
72 soft used (prokka, prodigal, or None if called by prepare module)
73 logger : logging.Logger
74 logger object to write log information. Because this function can be called from
75 prepare module, where sub logger name is different
76 quiet : bool
77 True if nothing must be written to stdout/stderr, False otherwise
79 Returns
80 -------
81 dict
82 {genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
84 """
85 cut = nbn > 0
86 pat = None ## To put pattern with which sequence must be cut
87 if cut:
88 pat = 'N' * nbn + "+"
89 nbgen = len(genomes)
90 bar = None
91 curnum = None
92 if cut:
93 logger.info(("Cutting genomes at each time there are at least {} 'N' in a row, "
94 "and then, calculating genome size, number of contigs and L90.").format(nbn))
95 else:
96 logger.info("Calculating genome size, number of contigs, L90")
97 if not quiet:
98 # Create progressbar
99 widgets = ['Analysis: ', progressbar.Bar(marker='█', left='', right=''),
100 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
101 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - ',
102 progressbar.ETA()
103 ]
104 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()
105 curnum = 1
106 toremove = []
107 # Analyse genomes 1 by 1
108 for genome, name in genomes.items():
109 # If not quiet option, show progress bar
110 if not quiet:
111 bar.update(curnum)
112 curnum += 1
113 # analyse genome, and check everything went well.
114 # exception if binary file
115 try:
116 res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger)
117 except UnicodeDecodeError:
118 logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.")
119 res = False
120 # Problem while analysing genome -> genome ignored
121 if not res:
122 toremove.append(genome)
123 # If there are some genomes to remove (analysis failed), remove them from genomes dict.
124 if toremove:
125 for gen in toremove:
126 del genomes[gen]
127 if not genomes:
128 logger.error(f"No genome was found in the database folder {dbpath}. See logfile "
129 "for more information.")
130 sys.exit(1)
131 if not quiet:
132 bar.finish()
133 return 0
136def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger):
137 """
138 Analyse given genome:
140 - if cut is asked:
142 - cut its contigs at each time that 'pat' is seen
143 - save cut genome in new file
145 - calculate genome size, L90, nb contigs and save it into genomes
147 Parameters
148 ----------
149 genome : str
150 given genome to analyse
151 dbpath : str
152 path to the folder containing the given genome sequence
153 tmp_path : str
154 path to folder where output files must be saved.
155 cut : bool
156 True if contigs must be cut, False otherwise
157 pat : str
158 pattern on which contigs must be cut. ex: "NNNNN"
159 genomes : dict
160 {genome_file: [genome_name]} as input, and will be changed to\
161 {genome_file: [genome_name, path, path_annotate, gsize, nbcont, L90]}
162 soft : str
163 soft used (prokka, prodigal, or None if called by prepare module)
165 Returns
166 -------
167 bool
168 True if genome analysis went well, False otherwise
169 Modifies 'genomes' for the analysed genome: -> {genome_file: [genome_name, path,
170 path_annotate, gsize, nbcont, L90]}
171 """
172 gpath, grespath = get_output_dir(soft, dbpath, tmp_path, genome, cut, pat)
173 if not os.path.exists(gpath):
174 logger.error(f"The file {gpath} does not exist")
175 return False
176 # Open original sequence file
177 with open(gpath, "r") as genf:
178 # If a new file must be created (sequences cut), open it
179 gresf = None
180 if grespath:
181 gresf = open(grespath, "w")
183 # Initialize variables
184 cur_contig_name = "" # header text
185 contig_sizes = {} # {header text: size}
186 cur_seq = "" # sequence
187 num = 1 # Used to get unique contig names
189 # Read each line of original sequence
190 for line in genf:
191 #### NEW CONTIG
192 # Line corresponding to a new contig
193 if line.startswith(">"):
194 # If not first contig, write info to output file (if needed)
195 if cur_seq != "":
196 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes,
197 gresf, num, logger)
198 # If problem while formatting contig, return False -> genome ignored
199 if num == -1:
200 return False
201 # Get contig name for next turn, and reset sequence
202 cur_contig_name = line.strip()
203 # Initialize for next contig
204 cur_seq = ""
205 # #### SEQUENCE LINE
206 # If sequence line, keep it, all in upper case
207 else:
208 # Add this line without \n to sequence (1 sequence per line)
209 cur_seq += line.strip().upper()
211 # LAST CONTIG
212 if cur_contig_name != "":
213 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf,
214 num, logger)
215 if num == -1:
216 return False
217 # GLOBAL INFORMATION
218 nbcont = len(contig_sizes)
219 gsize = sum(contig_sizes.values())
220 if nbcont == 0 or gsize == 0:
221 logger.warning(f"Your file {gpath} does not contain any gene. Please check that you "
222 "really gave a fasta sequence file")
223 if grespath:
224 gresf.close()
225 os.remove(grespath)
226 return False
227 l90 = calc_l90(contig_sizes)
228 # Everything ok for this genome -> complete its list of information in genomes dict
229 # [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]}
230 if grespath:
231 genomes[genome] += [gpath, grespath, gsize, nbcont, l90]
232 else:
233 genomes[genome] += [gpath, gpath, gsize, nbcont, l90]
234 # If we wrote a new sequence file, close it
235 if grespath:
236 gresf.close()
237 return True
240def get_output_dir(soft, dbpath, tmp_path, genome, cut, pat):
241 """
242 Get output file to put sequence cut and/or sequence with shorter contigs (prokka)
244 Parameters
245 ----------
246 soft : str
247 soft used (prokka, prodigal, or None if called by prepare module)
248 dbpath : str
249 path to the folder containing the given genome sequence
250 tmp_path : str
251 path to folder where output files must be saved.
252 genome : str
253 genome name
254 cut : bool
255 True if contigs must be cut, False otherwise
256 pat : str
257 pattern on which contigs must be cut. ex: "NNNNN"
259 Return
260 ------
261 grespath : str
262 path to ouput file. None if no need to create new sequence file
263 """
264 # Path to sequence to analyze
265 gpath = os.path.join(dbpath, genome)
266 # genome file is in dbpath except if it was in several files in dbpath,
267 # in which case it has been concatenated to a file in tmp_path
268 if not os.path.isfile(gpath):
269 gpath = os.path.join(tmp_path, genome)
270 # New file create if needed. If not (prodigal and not cut), empty filename
271 grespath = None
272 # If user asks to cut at each 'pat', need to create a new sequence file,
273 # whatever the annotation soft used
274 if cut:
275 new_file = genome + "_{}-split{}N.fna".format(soft, len(pat) - 1)
276 grespath = os.path.join(tmp_path, new_file)
277 # If no cutl, just keep original sequence, no need to create new file.
278 # Just check that contigs have different names
279 return gpath, grespath
282def format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger):
283 """
284 Format given contig, and save to output file if needed
286 - if cut: cut it and write each subsequence
287 - write new contig just check that contig names are different
289 Parameters
290 ----------
291 cut : bool
292 True if contigs must be cut, False otherwise
293 pat : str
294 pattern on which contigs must be cut. ex: "NNNNN"
295 cur_seq : str
296 current sequence (aa)
297 cur_contig_name : str
298 name of current contig
299 genome : str
300 name of current genome
301 cont_sizes : dict
302 {contig_name : sequence length}
303 gresf : io.TextIOWrappe
304 open file to write new sequence. If we are annotating with prodigal and not cutting,
305 there is no new sequence -> gref is None
306 num : int
307 current contig number
308 logger : logging.Logger
309 logger object to write log information
311 Returns
312 -------
313 bool
314 True if contig has been written without problem, False if any problem
315 """
316 # "CUT" if cut: need to cut at each 'pat' -> write new header + seq in new file
317 if cut:
318 # Cut sequence and write header + sequence to res file
319 num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num)
320 # No cut -> no new file created, but check contig unique names
321 else:
322 if cur_contig_name in contig_sizes.keys():
323 logger.error(f"In genome {genome}, '{cur_contig_name}' contig name is used for "
324 "several contigs. Please put different names for each contig. "
325 "This genome will be ignored.")
326 return -1
327 else:
328 contig_sizes[cur_contig_name] = len(cur_seq)
329 return num
332def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):
333 """
334 Save the contig read just before into dicts and write it to sequence file.
335 Unique ID of contig must be in the first field of header, before the first space
336 (required by prokka)
338 Parameters
339 ----------
340 pat : str
341 pattern to split a contig. None if we do not want to split
342 whole_seq : str
343 sequence of current contig, to save once split according to pat
344 cur_contig_name : str
345 name of current contig to save once split according to pat
346 contig_sizes : dict
347 {name: size} save cur_contig once split according to pat
348 gresf : _io.TextIOWrapper
349 file open in w mode to save the split sequence
350 num : int
351 current contig number.
353 Returns
354 -------
355 int
356 new contig number, after giving number(s) to the current contig
357 """
358 # split contig each time a stretch of at least nbn 'N' is found (pattern pat)
359 if not pat:
360 cont_parts = [whole_seq]
361 else:
362 cont_parts = re.split(pat, whole_seq)
364 # save contig parts
365 for seq in cont_parts:
366 # Only save non empty contigs (with some patterns, it could arrive that
367 # we get empty contigs, if 2 occurrences of the pattern are side by side).
368 if len(seq) == 0:
369 continue
370 new_contig_name = ">{}_{}\n".format(num, cur_contig_name.split(">")[1])
371 contig_sizes[new_contig_name] = len(seq)
372 gresf.write(new_contig_name)
373 gresf.write(seq + "\n")
374 num += 1
375 return num
378def calc_l90(contig_sizes):
379 """
380 Calc L90 of a given genome
382 Parameters
383 ----------
384 contig_sizes : dict
385 {name: size}
387 Returns
388 -------
389 None or int
390 if L90 found, returns L90. Otherwise, returns nothing
391 """
392 gsize = sum(contig_sizes.values())
393 sizes = [contig_sizes[cont] for cont in contig_sizes]
394 cum_sizes = np.cumsum(sorted(sizes, reverse=True))
395 lim = 0.9 * gsize
396 for num, val in enumerate(cum_sizes):
397 if val >= lim:
398 return num + 1
401def rename_all_genomes(genomes):
402 """
403 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step 3)
404 Sort kept genomes by L90 and then nb contigs.
405 For each genome, assign a strain number, and rename all its contigs.
407 Parameters
408 ----------
409 genomes : dict
410 {genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become\
411 {genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end
413 Return
414 ------
415 change 1st field of genomes dict. name -> gembase_name (with strain number)
417 """
418 logger.info(f"Renaming kept genomes according to their quality ({len(genomes)} genomes)")
419 # Keep first genome name to give to prodigal for training
420 first_gname = None
421 # Keep previous genome name (ESCO.0109 -> ESCO)
422 last_name = ""
423 # Keep last strain number
424 last_strain = 0
425 # "SAEN.1015.{}".format(str(last_strain).zfill(5))
426 # Sort genomes by species, L90 and nb_contigs
427 for genome, [name, _, _, _, _, _] in sorted(genomes.items(),
428 key=utils.sort_genomes_byname_l90_nbcont):
429 if not first_gname:
430 first_gname = genome
431 # first genome, or new strain name (ex: ESCO vs EXPL)
432 # -> keep this new name, and add 1 to next strain number
433 if last_name != name.split(".")[0]:
434 last_strain = 1
435 last_name = name.split(".")[0]
436 # same strain name
437 # -> write this new sequence, and go to next one (strain += 1)
438 else:
439 last_strain += 1
440 # Write information to "genomes" dict.
441 gembase_name = ".".join([name, str(last_strain).zfill(5)])
442 genomes[genome][0] = gembase_name
443 return first_gname
446def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):
447 """
448 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step2)
449 Plot distributions of L90 and nbcontig values.
451 Parameters
452 ----------
453 genomes : dict
454 {genome: [name, orig_path, to_annotate_path, size, nbcont, l90]}
455 res_path : str
456 path to put all output files
457 listfile_base : str
458 name of list file
459 l90 : int
460 L90 threshold
461 nbconts : int
462 nb contigs threshold
464 Returns
465 -------
466 (l90_vals, nbcont_vals, dist1, dist2) :
468 - l90_vals : list of l90 values for all genomes
469 - nbcont_vals : list of nbcontigs for all genomes
470 - dist1 : matplotlib figure of distribution of L90 values
471 - dist2 : matplotlib figure of distribution of nb contigs values
473 """
474 logger.info("Generating distribution of L90 and #contigs graphs.")
475 l90_vals = [val for _, (_, _, _, _, _, val) in genomes.items()]
476 outl90 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
477 nbcont_vals = [val for _, (_, _, _, _, val, _) in genomes.items()]
478 outnbcont = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
479 dist1 = utils.plot_distr(l90_vals, l90, "L90 distribution for all genomes",
480 "max L90 =", logger)
481 dist2 = utils.plot_distr(nbcont_vals, nbconts,
482 "Distribution of number of contigs among all genomes",
483 "max #contigs =", logger)
484 dist1.savefig(outl90)
485 dist2.savefig(outnbcont)
486 return l90_vals, nbcont_vals, dist1, dist2