Coverage for PanACoTA/prepare_module/filter_genomes.py: 100%
164 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
3# ###############################################################################
4# This file is part of PanACOTA. #
5# #
6# Authors: Amandine Perrin #
7# Copyright © 2018-2020 Institut Pasteur (Paris). #
8# See the COPYRIGHT file for details. #
9# #
10# PanACOTA is a software providing tools for large scale bacterial comparative #
11# genomics. From a set of complete and/or draft genomes, you can: #
12# - Do a quality control of your strains, to eliminate poor quality #
13# genomes, which would not give any information for the comparative study #
14# - Uniformly annotate all genomes #
15# - Do a Pan-genome #
16# - Do a Core or Persistent genome #
17# - Align all Core/Persistent families #
18# - Infer a phylogenetic tree from the Core/Persistent families #
19# #
20# PanACOTA is free software: you can redistribute it and/or modify it under the #
21# terms of the Affero GNU General Public License as published by the Free #
22# Software Foundation, either version 3 of the License, or (at your option) #
23# any later version. #
24# #
25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
28# for more details. #
29# #
30# You should have received a copy of the Affero GNU General Public License #
31# along with PanACOTA (COPYING file). #
32# If not, see <https://www.gnu.org/licenses/>. #
33# ###############################################################################
35"""
36Functions helping for doing quality control on genomes in order to eliminate
37bad quality sequences, and then run Mash loops in order to discard too close genomes.
39@author gem
40August 2019
41"""
43import os
44import sys
45import glob
46import logging
47import progressbar
48import scipy.sparse
49from scipy.sparse import dok_matrix
51from PanACoTA import utils
52from PanACoTA.annotate_module import genome_seq_functions as gfunc
54logger = logging.getLogger("prepare.filter")
57def check_quality(species_linked, db_path, tmp_dir, max_l90, max_cont, cutn):
58 """
59 Do a quality control of all genomes in db_path
61 Parameters
62 ----------
63 outdir : str
64 directory for all results (refseq downloads, database init etc)
65 species_linked : str
66 given NCBI species with '_' instead of spaces, or NCBI taxID if species
67 name not given
68 dbpath : str
69 directory to 'Database_init' containing all .fna files
70 tmp_dir : str
71 directory where all tmp files must be saved (files cut at each stretch of 'x' N)
72 max_l90 : int
73 max L90 value tolerated to keep a genome
74 max_cont : int
75 Max number of contigs tolerated to keep a genome
76 cutn : int
77 cut at each stretch of this number of 'N'. Don't cut if equal to 0
79 Returns
80 -------
81 genomes : {genome_file: [genome_name, orig_path, path_to_seq_to_annotate, size,
82 nbcont, l90]}
83 no need for small name, we won't annotate genomes. genome_name is the same as filename
84 but without extension
85 """
86 # Check database folder exists
87 if not os.path.isdir(db_path):
88 logger.error(f"{db_path} does not exist.")
89 sys.exit(1)
90 if not os.path.isdir(tmp_dir):
91 logger.error(f"{tmp_dir} does not exist.")
92 sys.exit(1)
93 # Get all genome filenames
94 all_genomes = os.listdir(db_path)
95 if len(all_genomes) == 0:
96 logger.error(f"There is no genome in {db_path}.")
97 sys.exit(1)
98 # Get name of genomes without extension
99 genomes = {g:[os.path.splitext(g)[0]] for g in all_genomes}
100 logger.info("Total number of genomes for {}: {}".format(species_linked, len(all_genomes)))
102 # cut at stretches of 'N' if asked, and get L90, nbcontig, size for all genomes
103 # -> {genome_file: [genome_g, orig_path, to_annotate_path, size, nbcont, l90]}
104 gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, "prepare", logger, quiet=False)
105 return genomes
107def sort_genomes_minhash(genomes, max_l90, max_cont):
108 """
109 Sort genomes:
110 - draft genomes, sorted by L90 and then nb_contigs
112 Parameters
113 ----------
115 genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size,
116 nbcont, l90]}
117 max_l90 : int
118 max L90 value tolerated to keep a genome
119 max_cont : int
120 Max number of contigs tolerated to keep a genome
122 Returns
123 -------
124 sorted_genomes: list of 'genome_file' for all genomes kept (L90 and nbcont ok),
125 ordered by decreasing quality
126 """
127 logger.info("Sorting all {} genomes by quality".format(len(genomes)))
128 sorted_genomes = []
129 # Add all genomes, sorted by L90 and then nb_cont, if L90 <= 100 and nb_cont <= 999
130 nbcomp = 0
131 nb_disc = 0
132 for ginfo in sorted(genomes.items(), key=utils.sort_genomes_l90_nbcont):
133 gname, (_, _, _, _, nb_cont, L90) = ginfo
134 if L90 > max_l90 or nb_cont > max_cont:
135 nb_disc += 1
136 else:
137 sorted_genomes.append(gname)
138 logger.info(f"{len(sorted_genomes)} genomes after quality control ({nb_disc} "
139 "discarded)")
140 return sorted_genomes
143def iterative_mash(sorted_genomes, genomes, outdir, species_linked, min_dist, max_dist,
144 threads, quiet):
145 """
146 Run mash all vs all, to get all pairwise distances.
147 Then, take the first genome of the list, and remove those for which the distance to it
148 is not between 1e-4 and 0.06. Restart with the next genome kept in the list, and so on
149 until the last genome.
151 Parameters
152 ----------
153 sorted_genomes: list
154 list of 'genome_file' for all genomes kept (L90 and nbcont ok)
155 genomes : dict
156 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
157 outdir : str
158 path to directory where all results are saved
159 species_linked : str
160 species name if given, otherwise species taxID
161 min_dist : float
162 lower limit of distance between 2 genomes to keep them
163 max_dist : float
164 max limit of distance between 2 genomes to keep them
165 threads :
166 max number of threads to use
167 quiet : bool
168 True if nothing must be sent to stdout/stderr, False otherwise
170 Returns
171 -------
173 genomes_removed : dict
174 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal)
175 """
176 logger.info("Starting filtering steps according to distance between genomes.")
177 # Run mash all vs all
178 mash_dir = os.path.join(outdir, "mash_files")
179 os.makedirs(mash_dir, exist_ok=True)
180 # List of genome sequences to compare
181 list_reps = os.path.join(mash_dir, f"list-to-sketch-{species_linked}.txt")
182 # Mash logfile
183 mash_log = os.path.join(mash_dir, f"mash-all-{species_linked}.log")
184 # Binary file generated by minhash sketch (index of all sequences)
185 out_msh = os.path.join(mash_dir, f"all-genomes-{species_linked}")
186 # Matrix with pairwise distances between all genomes
187 matrix = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.txt")
188 # Binary file to save matrix of pairwise distances
189 sparse_mat = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.npz")
191 # Sketch genomes
192 sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads)
194 # Compute pairwise distances
195 compare_all(out_msh, matrix, sparse_mat, mash_log, threads)
196 # Iteratively discard genomes
197 # List of genomes to compare to the next ones until a limit value is reached.
198 # genomes ordered by decreasing L90/nbcont (used to pop elements in comparing step)
199 to_try = sorted_genomes[::-1]
200 # Put list of genomes removed by mash comparison, and why
201 # (out of limits distance with which genome)
202 genomes_removed = {} # {genome: [compared_with, dist]}
203 nbgen = len(to_try)
205 # Get link between genome_file (genomes key) and place in sorted_genomes
206 corresp_file = {genome: num for num, genome in enumerate(sorted_genomes)}
207 # Read matrix (from npz file if existing, otherwise from txt file)
208 if os.path.exists(sparse_mat):
209 logger.info(f"Loading matrix contained in {sparse_mat}")
210 # convert matrix returned by load_npz (coo format, as saved) to dok format
211 mat_sp = scipy.sparse.load_npz(sparse_mat).todok()
212 # Read matrix txt file generated by minhash, and save this python object matrix to a npz file.
213 else:
214 logger.info("Reading matrix from txt file generated by Mash.")
215 mat_sp = read_matrix(genomes, sorted_genomes, matrix)
216 # corresp_file = {genome_file : num of genome in sorted_genomes}
217 logger.info("Saving matrix to npz file to be loaded quicker if needed later")
218 # Convert dok_matrix to coo format, as dok format is not allowed by save_npz
219 coo_mat = mat_sp.tocoo()
220 scipy.sparse.save_npz(sparse_mat, coo_mat)
222 # Iteratively discard genomes too close or too far
223 logger.info("Starting iterative discarding steps")
224 if not quiet:
225 widgets = ['Genomes compared: ',
226 progressbar.Bar(marker='█', left='', right='', fill=' '), ' ',
227 progressbar.Counter(), "/{}".format(nbgen), ' ',
228 progressbar.Timer(), ' - '
229 ]
230 bar = progressbar.ProgressBar(widgets=widgets, max_value=len(to_try), term_width=79).start()
231 done = 0
233 while len(to_try) > 1:
234 mash_step(to_try, corresp_file, mat_sp, genomes_removed, min_dist, max_dist)
235 if not quiet:
236 done = nbgen - len(to_try)
237 bar.update(done)
238 if not quiet:
239 bar.finish()
240 logger.info("Final number of genomes in dataset: {}".format(nbgen - len(genomes_removed)))
241 return genomes_removed
244def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads):
245 """
246 Sketch all genomes to a combined archive.
248 Parameters
249 ----------
250 genomes : dict
251 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
252 sorted_genomes: list
253 list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by
254 decreasing quality
255 outdir : str
256 path to directory where all results are saved
257 list_reps : str
258 file with list of genomes to sketch. File will be emptied if it contain something, and
259 filled with the informations from 'genomes'.
260 out_msh : str
261 output of mash
262 mash_log : str
263 mash logfile
264 threads :
265 max number of threads to use
267 Returns
268 -------
270 return value (0 if OK, 1 if error)
272 """
273 # If given outdir does not exist, close it
274 if not os.path.isdir(outdir):
275 logger.error(f"Your output directory '{outdir}' does not exist.")
276 sys.exit(1)
277 # Empty list_reps file
278 open(list_reps, "w").close()
279 # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2]
280 file_paths = [genomes[g][2] for g in sorted_genomes]
281 # Write list of genomes to compare to a file
282 utils.write_list(file_paths, list_reps)
283 # Sketch all genome sequences if not already done
284 if os.path.isfile(out_msh + ".msh"):
285 logger.warning(f"Mash sketch file {out_msh}.msh already exists. PanACoTA will "
286 "use it for next step.")
287 os.remove(list_reps)
288 return 0
289 logger.info("Sketching all genomes...")
290 cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4"
291 logger.details(cmd_sketch)
292 error_sketch = (f"Error while trying to sketch {len(sorted_genomes)} genomes to combined "
293 "archive. Maybe some genome sequences in "
294 "'tmp_files' are missing! Check logfile: "
295 f"{mash_log}")
297 outf = open(mash_log, "w")
298 utils.run_cmd(cmd_sketch, error_sketch, eof=True, stdout=outf, stderr=outf, logger=logger)
299 outf.close()
300 return 0
303def compare_all(out_msh, matrix, npz_matrix, mash_log, threads):
304 """
305 Comparing all pairwise genomes that are already been sketched in the given file.
307 Parameters
308 ----------
309 out_msh : str
310 output of mash
311 matrix : str
312 File to put generated matrix of pairwise distances between all genomes
313 npz_matrix : str
314 matrix of pairwise distances saved in a binary file
315 mash_log : str
316 mash logfile
317 threads :
318 max number of threads to use
320 Returns
321 -------
323 return code
324 """
325 # txt matrix already exists
326 if os.path.isfile(matrix):
327 logger.warning("Matrix file {} already exists. The program will use this distance matrix "
328 "to filter all genomes according to their distances.".format(matrix))
329 return 0
330 # npz matrix already exists
331 if os.path.isfile(npz_matrix):
332 logger.warning("Matrix file {} already exists. The program will use this distance matrix "
333 "to filter all genomes according to their distances.".format(matrix))
334 return 0
335 logger.info("Computing pairwise distances between all genomes")
336 cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh"
337 logger.details(cmd_dist)
338 # Open matfile to write matrix inside
339 matfile = open(matrix, "w")
340 # Open mash log to add log of 'mash dist' to log of 'mash sketch'
341 outf = open(mash_log, "a")
342 error_dist = ("Error while trying to estimate pairwise distances between all genomes. "
343 f"See {mash_log}.")
344 utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf)
345 outf.close()
346 matfile.close()
347 return 0
350def mash_step(to_try, corresp, mat_sp, genomes_removed, min_dist, max_dist):
351 """
352 Prepare a mash run, with a given genome as reference, and others to compare to.
354 Parameters
355 ----------
356 to_try : list
357 list of genome_file (keys of 'genomes') to compare, ordered by decreasing L90/nbcont
358 corresp : dict
359 {genome_file : num_of_genome in sorted_genomes}
360 mat_sp : scipy.sparse.dok.dok_matrix
361 triangle matrix containing pairwise distance comparisons
362 genomes_removed : dict
363 {genome_file: [ref_name, dist]} genome against which 'genome_name' is removed, and
364 corresponding distance (justifying removal)
365 min_dist : float
366 lower limit of distance between 2 genomes to keep them
367 max_dist : float
368 max limit of distance between 2 genomes to keep them
370 Returns
371 -------
373 to_try is updated (reference element and all genomes not compatible with it are removed)
374 genomes_removed is updated
375 return code (0 if no problem)
377 """
378 # Get last element (which is the 'best' genome), and remove it from the list
379 ref_name = to_try.pop()
380 # Line of genome in mat_sp
381 ref_num = corresp[ref_name]
382 # Genomes (ordered by increasing L90/nbcont) to compare to the selected element (ref_name)
383 others = to_try[::-1]
385 # For each genome, compare its distance to reference genome 'ref_name'
386 for gname in others:
387 # Column of genome in mat_sp
388 other_num = corresp[gname]
389 # Get distance between the 2 genomes
390 if ref_num < other_num:
391 dist = mat_sp[ref_num, other_num]
392 else:
393 logger.warning("Should never happen as mat_sp is a triangle matrix!")
394 dist = mat_sp[other_num, ref_num]
395 # If distance not in the limits, remove genome from to_try and add to genomes_removed list
396 if not min_dist <= dist <= max_dist:
397 to_try.remove(gname)
398 genomes_removed[gname] = [ref_name, dist]
399 return 0
402def read_matrix(genomes, sorted_genomes, matrix):
403 """
404 Read the matrix of pairwise distances between all genomes, and save it to a sparse
405 matrix (only upper triangle).
407 Parameters
408 ----------
409 genomes : dict
410 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
411 sorted_genomes: list
412 list of 'genome_file' for all genomes kept (L90 and nbcont ok)
413 matrix : str
414 File containing the matrix of pairwise distances between all genomes
416 Returns
417 -------
419 mat_sp : str
420 python dok_matrix object
421 """
422 if not os.path.isfile(matrix):
423 logger.error(f"Matrix file {matrix} does not exist. We cannot read it "
424 "and do the next steps. Program ending.")
425 sys.exit(1)
427 nbgen = len(sorted_genomes)
428 corresp_abs = {genomes[genome][2]: num for num, genome in enumerate(sorted_genomes)}
429 # Create square matrix with nbgen cols/lines. dok format is a 'Dictionary Of Keys'
430 # -> writes (0, 1) value
431 mat_sp = dok_matrix((nbgen, nbgen), dtype=float)
432 # Write matrix values
433 with open(matrix, "r") as matf:
434 for line in matf:
435 path1, path2, dist = line.split()[:3]
436 num1 = corresp_abs[path1]
437 num2 = corresp_abs[path2]
438 # only in lower triangle (no duplicate)
439 if num1 < num2:
440 mat_sp[num1, num2] = float(dist)
441 else:
442 mat_sp[num2, num1] = float(dist)
443 return mat_sp
446def write_outputfiles(genomes, sorted_genomes, genomes_removed, outdir, gspecies, min_dist, max_dist):
447 """
448 Write the list of genomes kept in a file, 1 genome per line -> will be the input file for
449 annotation and next steps
450 Write discarded genomes to another file, with, for each line:
451 - genome name
452 - problem when compared with which other genome
453 - distance to this other genomes
455 Parameters
456 ----------
457 genomes : dict
458 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
459 sorted_genomes: list
460 list of 'genome_file' for all genomes kept (L90 and nbcont ok)
461 genomes_removed : dict
462 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal)
463 outdir : str
464 directory where those list files must be created
465 gspecies : str
466 species name if given, otherwise species taxID
467 min_dist : float
468 lower limit of distance between 2 genomes to keep them
469 max_dist : float
470 upper limit of distance between 2 genomes to keep them
472 Returns
473 -------
474 return code
475 """
476 if not os.path.isdir(outdir):
477 logger.error(f"The given output directory ({outdir}) does not exist. We cannot "
478 "create output files there")
479 sys.exit(1)
480 list_file = os.path.join(outdir, f"LSTINFO-{gspecies}-filtered-{min_dist}_{max_dist}.txt")
481 kept_genomes = []
482 discard_file = os.path.join(outdir, f"discarded-by-minhash-{gspecies}-{min_dist}_{max_dist}.txt")
484 # Get list of kept genomes and write them in list_file
485 for genome in sorted_genomes:
486 if genome not in genomes_removed:
487 kept_genomes.append(genome)
489 # Write 4 columns in LSTINFO file (genomes kept) :
490 # path to file analyzed, genome size, nbcont and L90
491 with open(list_file, "w") as lf:
492 lf.write('to_annotate\tgsize\tnb_conts\tL90\n')
493 # For each genome in kept_genomes, find required information on this it, using 'genomes'
494 for g in kept_genomes:
495 _, _, analyzed, size, nbcont, l90 = genomes[g]
496 towrite = utils.list_to_str([analyzed, size, nbcont, l90], sep="\t")
497 lf.write(towrite)
499 # Write list of discarded genomes and why they are discarded
500 with open(discard_file, "w") as disf:
501 disf.write("genome_name\tproblem_compared_with\tdist\n")
502 for genome, info in genomes_removed.items():
503 disf.write(utils.list_to_str([genome] + info))
504 logger.info(f"Final list of genomes in the dataset: {list_file}")
505 logger.info(f"List of genomes discarded by minhash steps: {discard_file}")
506 return list_file