Coverage for PanACoTA/prepare_module/filter_genomes.py: 100%

164 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2 

3# ############################################################################### 

4# This file is part of PanACOTA. # 

5# # 

6# Authors: Amandine Perrin # 

7# Copyright © 2018-2020 Institut Pasteur (Paris). # 

8# See the COPYRIGHT file for details. # 

9# # 

10# PanACOTA is a software providing tools for large scale bacterial comparative # 

11# genomics. From a set of complete and/or draft genomes, you can: # 

12# - Do a quality control of your strains, to eliminate poor quality # 

13# genomes, which would not give any information for the comparative study # 

14# - Uniformly annotate all genomes # 

15# - Do a Pan-genome # 

16# - Do a Core or Persistent genome # 

17# - Align all Core/Persistent families # 

18# - Infer a phylogenetic tree from the Core/Persistent families # 

19# # 

20# PanACOTA is free software: you can redistribute it and/or modify it under the # 

21# terms of the Affero GNU General Public License as published by the Free # 

22# Software Foundation, either version 3 of the License, or (at your option) # 

23# any later version. # 

24# # 

25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

28# for more details. # 

29# # 

30# You should have received a copy of the Affero GNU General Public License # 

31# along with PanACOTA (COPYING file). # 

32# If not, see <https://www.gnu.org/licenses/>. # 

33# ############################################################################### 

34 

35""" 

36Functions helping for doing quality control on genomes in order to eliminate 

37bad quality sequences, and then run Mash loops in order to discard too close genomes. 

38 

39@author gem 

40August 2019 

41""" 

42 

43import os 

44import sys 

45import glob 

46import logging 

47import progressbar 

48import scipy.sparse 

49from scipy.sparse import dok_matrix 

50 

51from PanACoTA import utils 

52from PanACoTA.annotate_module import genome_seq_functions as gfunc 

53 

54logger = logging.getLogger("prepare.filter") 

55 

56 

57def check_quality(species_linked, db_path, tmp_dir, max_l90, max_cont, cutn): 

58 """ 

59 Do a quality control of all genomes in db_path 

60 

61 Parameters 

62 ---------- 

63 outdir : str 

64 directory for all results (refseq downloads, database init etc) 

65 species_linked : str 

66 given NCBI species with '_' instead of spaces, or NCBI taxID if species 

67 name not given 

68 dbpath : str 

69 directory to 'Database_init' containing all .fna files 

70 tmp_dir : str 

71 directory where all tmp files must be saved (files cut at each stretch of 'x' N) 

72 max_l90 : int 

73 max L90 value tolerated to keep a genome 

74 max_cont : int 

75 Max number of contigs tolerated to keep a genome 

76 cutn : int 

77 cut at each stretch of this number of 'N'. Don't cut if equal to 0 

78 

79 Returns 

80 ------- 

81 genomes : {genome_file: [genome_name, orig_path, path_to_seq_to_annotate, size, 

82 nbcont, l90]} 

83 no need for small name, we won't annotate genomes. genome_name is the same as filename 

84 but without extension 

85 """ 

86 # Check database folder exists 

87 if not os.path.isdir(db_path): 

88 logger.error(f"{db_path} does not exist.") 

89 sys.exit(1) 

90 if not os.path.isdir(tmp_dir): 

91 logger.error(f"{tmp_dir} does not exist.") 

92 sys.exit(1) 

93 # Get all genome filenames 

94 all_genomes = os.listdir(db_path) 

95 if len(all_genomes) == 0: 

96 logger.error(f"There is no genome in {db_path}.") 

97 sys.exit(1) 

98 # Get name of genomes without extension 

99 genomes = {g:[os.path.splitext(g)[0]] for g in all_genomes} 

100 logger.info("Total number of genomes for {}: {}".format(species_linked, len(all_genomes))) 

101 

102 # cut at stretches of 'N' if asked, and get L90, nbcontig, size for all genomes 

103 # -> {genome_file: [genome_g, orig_path, to_annotate_path, size, nbcont, l90]} 

104 gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, "prepare", logger, quiet=False) 

105 return genomes 

106 

107def sort_genomes_minhash(genomes, max_l90, max_cont): 

108 """ 

109 Sort genomes: 

110 - draft genomes, sorted by L90 and then nb_contigs 

111 

112 Parameters 

113 ---------- 

114 

115 genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, 

116 nbcont, l90]} 

117 max_l90 : int 

118 max L90 value tolerated to keep a genome 

119 max_cont : int 

120 Max number of contigs tolerated to keep a genome 

121 

122 Returns 

123 ------- 

124 sorted_genomes: list of 'genome_file' for all genomes kept (L90 and nbcont ok), 

125 ordered by decreasing quality 

126 """ 

127 logger.info("Sorting all {} genomes by quality".format(len(genomes))) 

128 sorted_genomes = [] 

129 # Add all genomes, sorted by L90 and then nb_cont, if L90 <= 100 and nb_cont <= 999 

130 nbcomp = 0 

131 nb_disc = 0 

132 for ginfo in sorted(genomes.items(), key=utils.sort_genomes_l90_nbcont): 

133 gname, (_, _, _, _, nb_cont, L90) = ginfo 

134 if L90 > max_l90 or nb_cont > max_cont: 

135 nb_disc += 1 

136 else: 

137 sorted_genomes.append(gname) 

138 logger.info(f"{len(sorted_genomes)} genomes after quality control ({nb_disc} " 

139 "discarded)") 

140 return sorted_genomes 

141 

142 

143def iterative_mash(sorted_genomes, genomes, outdir, species_linked, min_dist, max_dist, 

144 threads, quiet): 

145 """ 

146 Run mash all vs all, to get all pairwise distances. 

147 Then, take the first genome of the list, and remove those for which the distance to it 

148 is not between 1e-4 and 0.06. Restart with the next genome kept in the list, and so on 

149 until the last genome. 

150 

151 Parameters 

152 ---------- 

153 sorted_genomes: list 

154 list of 'genome_file' for all genomes kept (L90 and nbcont ok) 

155 genomes : dict 

156 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

157 outdir : str 

158 path to directory where all results are saved 

159 species_linked : str 

160 species name if given, otherwise species taxID 

161 min_dist : float 

162 lower limit of distance between 2 genomes to keep them 

163 max_dist : float 

164 max limit of distance between 2 genomes to keep them 

165 threads : 

166 max number of threads to use 

167 quiet : bool 

168 True if nothing must be sent to stdout/stderr, False otherwise 

169 

170 Returns 

171 ------- 

172 

173 genomes_removed : dict 

174 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal) 

175 """ 

176 logger.info("Starting filtering steps according to distance between genomes.") 

177 # Run mash all vs all 

178 mash_dir = os.path.join(outdir, "mash_files") 

179 os.makedirs(mash_dir, exist_ok=True) 

180 # List of genome sequences to compare 

181 list_reps = os.path.join(mash_dir, f"list-to-sketch-{species_linked}.txt") 

182 # Mash logfile 

183 mash_log = os.path.join(mash_dir, f"mash-all-{species_linked}.log") 

184 # Binary file generated by minhash sketch (index of all sequences) 

185 out_msh = os.path.join(mash_dir, f"all-genomes-{species_linked}") 

186 # Matrix with pairwise distances between all genomes 

187 matrix = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.txt") 

188 # Binary file to save matrix of pairwise distances 

189 sparse_mat = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.npz") 

190 

191 # Sketch genomes 

192 sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads) 

193 

194 # Compute pairwise distances 

195 compare_all(out_msh, matrix, sparse_mat, mash_log, threads) 

196 # Iteratively discard genomes 

197 # List of genomes to compare to the next ones until a limit value is reached. 

198 # genomes ordered by decreasing L90/nbcont (used to pop elements in comparing step) 

199 to_try = sorted_genomes[::-1] 

200 # Put list of genomes removed by mash comparison, and why 

201 # (out of limits distance with which genome) 

202 genomes_removed = {} # {genome: [compared_with, dist]} 

203 nbgen = len(to_try) 

204 

205 # Get link between genome_file (genomes key) and place in sorted_genomes 

206 corresp_file = {genome: num for num, genome in enumerate(sorted_genomes)} 

207 # Read matrix (from npz file if existing, otherwise from txt file) 

208 if os.path.exists(sparse_mat): 

209 logger.info(f"Loading matrix contained in {sparse_mat}") 

210 # convert matrix returned by load_npz (coo format, as saved) to dok format 

211 mat_sp = scipy.sparse.load_npz(sparse_mat).todok() 

212 # Read matrix txt file generated by minhash, and save this python object matrix to a npz file. 

213 else: 

214 logger.info("Reading matrix from txt file generated by Mash.") 

215 mat_sp = read_matrix(genomes, sorted_genomes, matrix) 

216 # corresp_file = {genome_file : num of genome in sorted_genomes} 

217 logger.info("Saving matrix to npz file to be loaded quicker if needed later") 

218 # Convert dok_matrix to coo format, as dok format is not allowed by save_npz 

219 coo_mat = mat_sp.tocoo() 

220 scipy.sparse.save_npz(sparse_mat, coo_mat) 

221 

222 # Iteratively discard genomes too close or too far 

223 logger.info("Starting iterative discarding steps") 

224 if not quiet: 

225 widgets = ['Genomes compared: ', 

226 progressbar.Bar(marker='█', left='', right='', fill=' '), ' ', 

227 progressbar.Counter(), "/{}".format(nbgen), ' ', 

228 progressbar.Timer(), ' - ' 

229 ] 

230 bar = progressbar.ProgressBar(widgets=widgets, max_value=len(to_try), term_width=79).start() 

231 done = 0 

232 

233 while len(to_try) > 1: 

234 mash_step(to_try, corresp_file, mat_sp, genomes_removed, min_dist, max_dist) 

235 if not quiet: 

236 done = nbgen - len(to_try) 

237 bar.update(done) 

238 if not quiet: 

239 bar.finish() 

240 logger.info("Final number of genomes in dataset: {}".format(nbgen - len(genomes_removed))) 

241 return genomes_removed 

242 

243 

244def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads): 

245 """ 

246 Sketch all genomes to a combined archive. 

247 

248 Parameters 

249 ---------- 

250 genomes : dict 

251 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

252 sorted_genomes: list 

253 list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by 

254 decreasing quality 

255 outdir : str 

256 path to directory where all results are saved 

257 list_reps : str 

258 file with list of genomes to sketch. File will be emptied if it contain something, and 

259 filled with the informations from 'genomes'. 

260 out_msh : str 

261 output of mash 

262 mash_log : str 

263 mash logfile 

264 threads : 

265 max number of threads to use 

266 

267 Returns 

268 ------- 

269 

270 return value (0 if OK, 1 if error) 

271 

272 """ 

273 # If given outdir does not exist, close it 

274 if not os.path.isdir(outdir): 

275 logger.error(f"Your output directory '{outdir}' does not exist.") 

276 sys.exit(1) 

277 # Empty list_reps file 

278 open(list_reps, "w").close() 

279 # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2] 

280 file_paths = [genomes[g][2] for g in sorted_genomes] 

281 # Write list of genomes to compare to a file 

282 utils.write_list(file_paths, list_reps) 

283 # Sketch all genome sequences if not already done 

284 if os.path.isfile(out_msh + ".msh"): 

285 logger.warning(f"Mash sketch file {out_msh}.msh already exists. PanACoTA will " 

286 "use it for next step.") 

287 os.remove(list_reps) 

288 return 0 

289 logger.info("Sketching all genomes...") 

290 cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4" 

291 logger.details(cmd_sketch) 

292 error_sketch = (f"Error while trying to sketch {len(sorted_genomes)} genomes to combined " 

293 "archive. Maybe some genome sequences in " 

294 "'tmp_files' are missing! Check logfile: " 

295 f"{mash_log}") 

296 

297 outf = open(mash_log, "w") 

298 utils.run_cmd(cmd_sketch, error_sketch, eof=True, stdout=outf, stderr=outf, logger=logger) 

299 outf.close() 

300 return 0 

301 

302 

303def compare_all(out_msh, matrix, npz_matrix, mash_log, threads): 

304 """ 

305 Comparing all pairwise genomes that are already been sketched in the given file. 

306 

307 Parameters 

308 ---------- 

309 out_msh : str 

310 output of mash 

311 matrix : str 

312 File to put generated matrix of pairwise distances between all genomes 

313 npz_matrix : str 

314 matrix of pairwise distances saved in a binary file 

315 mash_log : str 

316 mash logfile 

317 threads : 

318 max number of threads to use 

319 

320 Returns 

321 ------- 

322 

323 return code 

324 """ 

325 # txt matrix already exists 

326 if os.path.isfile(matrix): 

327 logger.warning("Matrix file {} already exists. The program will use this distance matrix " 

328 "to filter all genomes according to their distances.".format(matrix)) 

329 return 0 

330 # npz matrix already exists 

331 if os.path.isfile(npz_matrix): 

332 logger.warning("Matrix file {} already exists. The program will use this distance matrix " 

333 "to filter all genomes according to their distances.".format(matrix)) 

334 return 0 

335 logger.info("Computing pairwise distances between all genomes") 

336 cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh" 

337 logger.details(cmd_dist) 

338 # Open matfile to write matrix inside 

339 matfile = open(matrix, "w") 

340 # Open mash log to add log of 'mash dist' to log of 'mash sketch' 

341 outf = open(mash_log, "a") 

342 error_dist = ("Error while trying to estimate pairwise distances between all genomes. " 

343 f"See {mash_log}.") 

344 utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf) 

345 outf.close() 

346 matfile.close() 

347 return 0 

348 

349 

350def mash_step(to_try, corresp, mat_sp, genomes_removed, min_dist, max_dist): 

351 """ 

352 Prepare a mash run, with a given genome as reference, and others to compare to. 

353 

354 Parameters 

355 ---------- 

356 to_try : list 

357 list of genome_file (keys of 'genomes') to compare, ordered by decreasing L90/nbcont 

358 corresp : dict 

359 {genome_file : num_of_genome in sorted_genomes} 

360 mat_sp : scipy.sparse.dok.dok_matrix 

361 triangle matrix containing pairwise distance comparisons 

362 genomes_removed : dict 

363 {genome_file: [ref_name, dist]} genome against which 'genome_name' is removed, and 

364 corresponding distance (justifying removal) 

365 min_dist : float 

366 lower limit of distance between 2 genomes to keep them 

367 max_dist : float 

368 max limit of distance between 2 genomes to keep them 

369 

370 Returns 

371 ------- 

372 

373 to_try is updated (reference element and all genomes not compatible with it are removed) 

374 genomes_removed is updated 

375 return code (0 if no problem) 

376 

377 """ 

378 # Get last element (which is the 'best' genome), and remove it from the list 

379 ref_name = to_try.pop() 

380 # Line of genome in mat_sp 

381 ref_num = corresp[ref_name] 

382 # Genomes (ordered by increasing L90/nbcont) to compare to the selected element (ref_name) 

383 others = to_try[::-1] 

384 

385 # For each genome, compare its distance to reference genome 'ref_name' 

386 for gname in others: 

387 # Column of genome in mat_sp 

388 other_num = corresp[gname] 

389 # Get distance between the 2 genomes 

390 if ref_num < other_num: 

391 dist = mat_sp[ref_num, other_num] 

392 else: 

393 logger.warning("Should never happen as mat_sp is a triangle matrix!") 

394 dist = mat_sp[other_num, ref_num] 

395 # If distance not in the limits, remove genome from to_try and add to genomes_removed list 

396 if not min_dist <= dist <= max_dist: 

397 to_try.remove(gname) 

398 genomes_removed[gname] = [ref_name, dist] 

399 return 0 

400 

401 

402def read_matrix(genomes, sorted_genomes, matrix): 

403 """ 

404 Read the matrix of pairwise distances between all genomes, and save it to a sparse 

405 matrix (only upper triangle). 

406 

407 Parameters 

408 ---------- 

409 genomes : dict 

410 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

411 sorted_genomes: list 

412 list of 'genome_file' for all genomes kept (L90 and nbcont ok) 

413 matrix : str 

414 File containing the matrix of pairwise distances between all genomes 

415 

416 Returns 

417 ------- 

418 

419 mat_sp : str 

420 python dok_matrix object 

421 """ 

422 if not os.path.isfile(matrix): 

423 logger.error(f"Matrix file {matrix} does not exist. We cannot read it " 

424 "and do the next steps. Program ending.") 

425 sys.exit(1) 

426 

427 nbgen = len(sorted_genomes) 

428 corresp_abs = {genomes[genome][2]: num for num, genome in enumerate(sorted_genomes)} 

429 # Create square matrix with nbgen cols/lines. dok format is a 'Dictionary Of Keys' 

430 # -> writes (0, 1) value 

431 mat_sp = dok_matrix((nbgen, nbgen), dtype=float) 

432 # Write matrix values 

433 with open(matrix, "r") as matf: 

434 for line in matf: 

435 path1, path2, dist = line.split()[:3] 

436 num1 = corresp_abs[path1] 

437 num2 = corresp_abs[path2] 

438 # only in lower triangle (no duplicate) 

439 if num1 < num2: 

440 mat_sp[num1, num2] = float(dist) 

441 else: 

442 mat_sp[num2, num1] = float(dist) 

443 return mat_sp 

444 

445 

446def write_outputfiles(genomes, sorted_genomes, genomes_removed, outdir, gspecies, min_dist, max_dist): 

447 """ 

448 Write the list of genomes kept in a file, 1 genome per line -> will be the input file for 

449 annotation and next steps 

450 Write discarded genomes to another file, with, for each line: 

451 - genome name 

452 - problem when compared with which other genome 

453 - distance to this other genomes 

454 

455 Parameters 

456 ---------- 

457 genomes : dict 

458 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

459 sorted_genomes: list 

460 list of 'genome_file' for all genomes kept (L90 and nbcont ok) 

461 genomes_removed : dict 

462 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal) 

463 outdir : str 

464 directory where those list files must be created 

465 gspecies : str 

466 species name if given, otherwise species taxID 

467 min_dist : float 

468 lower limit of distance between 2 genomes to keep them 

469 max_dist : float 

470 upper limit of distance between 2 genomes to keep them 

471 

472 Returns 

473 ------- 

474 return code 

475 """ 

476 if not os.path.isdir(outdir): 

477 logger.error(f"The given output directory ({outdir}) does not exist. We cannot " 

478 "create output files there") 

479 sys.exit(1) 

480 list_file = os.path.join(outdir, f"LSTINFO-{gspecies}-filtered-{min_dist}_{max_dist}.txt") 

481 kept_genomes = [] 

482 discard_file = os.path.join(outdir, f"discarded-by-minhash-{gspecies}-{min_dist}_{max_dist}.txt") 

483 

484 # Get list of kept genomes and write them in list_file 

485 for genome in sorted_genomes: 

486 if genome not in genomes_removed: 

487 kept_genomes.append(genome) 

488 

489 # Write 4 columns in LSTINFO file (genomes kept) : 

490 # path to file analyzed, genome size, nbcont and L90 

491 with open(list_file, "w") as lf: 

492 lf.write('to_annotate\tgsize\tnb_conts\tL90\n') 

493 # For each genome in kept_genomes, find required information on this it, using 'genomes' 

494 for g in kept_genomes: 

495 _, _, analyzed, size, nbcont, l90 = genomes[g] 

496 towrite = utils.list_to_str([analyzed, size, nbcont, l90], sep="\t") 

497 lf.write(towrite) 

498 

499 # Write list of discarded genomes and why they are discarded 

500 with open(discard_file, "w") as disf: 

501 disf.write("genome_name\tproblem_compared_with\tdist\n") 

502 for genome, info in genomes_removed.items(): 

503 disf.write(utils.list_to_str([genome] + info)) 

504 logger.info(f"Final list of genomes in the dataset: {list_file}") 

505 logger.info(f"List of genomes discarded by minhash steps: {discard_file}") 

506 return list_file