Coverage for PanACoTA/prepare_module/filter

1#!/usr/bin/env python3

3# ###############################################################################

4# This file is part of PanACOTA. #

5# #

6# Authors: Amandine Perrin #

8# See the COPYRIGHT file for details. #

9# #

10# PanACOTA is a software providing tools for large scale bacterial comparative #

11# genomics. From a set of complete and/or draft genomes, you can: #

12# - Do a quality control of your strains, to eliminate poor quality #

13# genomes, which would not give any information for the comparative study #

14# - Uniformly annotate all genomes #

15# - Do a Pan-genome #

16# - Do a Core or Persistent genome #

17# - Align all Core/Persistent families #

18# - Infer a phylogenetic tree from the Core/Persistent families #

19# #

20# PanACOTA is free software: you can redistribute it and/or modify it under the #

21# terms of the Affero GNU General Public License as published by the Free #

22# Software Foundation, either version 3 of the License, or (at your option) #

23# any later version. #

24# #

25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

28# for more details. #

29# #

30# You should have received a copy of the Affero GNU General Public License #

31# along with PanACOTA (COPYING file). #

32# If not, see <https://www.gnu.org/licenses/>. #

33# ###############################################################################

35"""

36Functions helping for doing quality control on genomes in order to eliminate

37bad quality sequences, and then run Mash loops in order to discard too close genomes.

39@author gem

40August 2019

41"""

43import os

44import sys

45import glob

46import logging

47import progressbar

48import scipy.sparse

49from scipy.sparse import dok_matrix

51from PanACoTA import utils

52from PanACoTA.annotate_module import genome_seq_functions as gfunc

54logger = logging.getLogger("prepare.filter")

57def check_quality(species_linked, db_path, tmp_dir, max_l90, max_cont, cutn):

58 """

59 Do a quality control of all genomes in db_path

61 Parameters

62 ----------

63 outdir : str

64 directory for all results (refseq downloads, database init etc)

65 species_linked : str

66 given NCBI species with '_' instead of spaces, or NCBI taxID if species

67 name not given

68 dbpath : str

69 directory to 'Database_init' containing all .fna files

70 tmp_dir : str

71 directory where all tmp files must be saved (files cut at each stretch of 'x' N)

72 max_l90 : int

73 max L90 value tolerated to keep a genome

74 max_cont : int

75 Max number of contigs tolerated to keep a genome

76 cutn : int

77 cut at each stretch of this number of 'N'. Don't cut if equal to 0

79 Returns

80 -------

81 genomes : {genome_file: [genome_name, orig_path, path_to_seq_to_annotate, size,

82 nbcont, l90]}

83 no need for small name, we won't annotate genomes. genome_name is the same as filename

84 but without extension

85 """

86 # Check database folder exists

87 if not os.path.isdir(db_path):

88 logger.error(f"{db_path} does not exist.")

89 sys.exit(1)

90 if not os.path.isdir(tmp_dir):

91 logger.error(f"{tmp_dir} does not exist.")

92 sys.exit(1)

93 # Get all genome filenames

94 all_genomes = os.listdir(db_path)

95 if len(all_genomes) == 0:

96 logger.error(f"There is no genome in {db_path}.")

97 sys.exit(1)

98 # Get name of genomes without extension

99 genomes = {g:[os.path.splitext(g)[0]] for g in all_genomes}

100 logger.info("Total number of genomes for {}: {}".format(species_linked, len(all_genomes)))

101

102 # cut at stretches of 'N' if asked, and get L90, nbcontig, size for all genomes

103 # -> {genome_file: [genome_g, orig_path, to_annotate_path, size, nbcont, l90]}

104 gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, "prepare", logger, quiet=False)

105 return genomes

106

107def sort_genomes_minhash(genomes, max_l90, max_cont):

108 """

109 Sort genomes:

110 - draft genomes, sorted by L90 and then nb_contigs

111

112 Parameters

113 ----------

114

115 genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size,

116 nbcont, l90]}

117 max_l90 : int

118 max L90 value tolerated to keep a genome

119 max_cont : int

120 Max number of contigs tolerated to keep a genome

121

122 Returns

123 -------

124 sorted_genomes: list of 'genome_file' for all genomes kept (L90 and nbcont ok),

125 ordered by decreasing quality

126 """

127 logger.info("Sorting all {} genomes by quality".format(len(genomes)))

128 sorted_genomes = []

129 # Add all genomes, sorted by L90 and then nb_cont, if L90 <= 100 and nb_cont <= 999

130 nbcomp = 0

131 nb_disc = 0

132 for ginfo in sorted(genomes.items(), key=utils.sort_genomes_l90_nbcont):

133 gname, (_, _, _, _, nb_cont, L90) = ginfo

134 if L90 > max_l90 or nb_cont > max_cont:

135 nb_disc += 1

136 else:

137 sorted_genomes.append(gname)

138 logger.info(f"{len(sorted_genomes)} genomes after quality control ({nb_disc} "

139 "discarded)")

140 return sorted_genomes

141

142

143def iterative_mash(sorted_genomes, genomes, outdir, species_linked, min_dist, max_dist,

144 threads, quiet):

145 """

146 Run mash all vs all, to get all pairwise distances.

147 Then, take the first genome of the list, and remove those for which the distance to it

148 is not between 1e-4 and 0.06. Restart with the next genome kept in the list, and so on

149 until the last genome.

150

151 Parameters

152 ----------

153 sorted_genomes: list

154 list of 'genome_file' for all genomes kept (L90 and nbcont ok)

155 genomes : dict

156 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

157 outdir : str

158 path to directory where all results are saved

159 species_linked : str

160 species name if given, otherwise species taxID

161 min_dist : float

162 lower limit of distance between 2 genomes to keep them

163 max_dist : float

164 max limit of distance between 2 genomes to keep them

165 threads :

166 max number of threads to use

167 quiet : bool

168 True if nothing must be sent to stdout/stderr, False otherwise

169

170 Returns

171 -------

172

173 genomes_removed : dict

174 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal)

175 """

176 logger.info("Starting filtering steps according to distance between genomes.")

177 # Run mash all vs all

178 mash_dir = os.path.join(outdir, "mash_files")

179 os.makedirs(mash_dir, exist_ok=True)

180 # List of genome sequences to compare

181 list_reps = os.path.join(mash_dir, f"list-to-sketch-{species_linked}.txt")

182 # Mash logfile

183 mash_log = os.path.join(mash_dir, f"mash-all-{species_linked}.log")

184 # Binary file generated by minhash sketch (index of all sequences)

185 out_msh = os.path.join(mash_dir, f"all-genomes-{species_linked}")

186 # Matrix with pairwise distances between all genomes

187 matrix = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.txt")

188 # Binary file to save matrix of pairwise distances

189 sparse_mat = os.path.join(mash_dir, f"matrix-all-genomes-{species_linked}.npz")

190

191 # Sketch genomes

192 sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads)

193

194 # Compute pairwise distances

195 compare_all(out_msh, matrix, sparse_mat, mash_log, threads)

196 # Iteratively discard genomes

197 # List of genomes to compare to the next ones until a limit value is reached.

198 # genomes ordered by decreasing L90/nbcont (used to pop elements in comparing step)

199 to_try = sorted_genomes[::-1]

200 # Put list of genomes removed by mash comparison, and why

201 # (out of limits distance with which genome)

202 genomes_removed = {} # {genome: [compared_with, dist]}

203 nbgen = len(to_try)

204

205 # Get link between genome_file (genomes key) and place in sorted_genomes

206 corresp_file = {genome: num for num, genome in enumerate(sorted_genomes)}

207 # Read matrix (from npz file if existing, otherwise from txt file)

208 if os.path.exists(sparse_mat):

209 logger.info(f"Loading matrix contained in {sparse_mat}")

210 # convert matrix returned by load_npz (coo format, as saved) to dok format

211 mat_sp = scipy.sparse.load_npz(sparse_mat).todok()

212 # Read matrix txt file generated by minhash, and save this python object matrix to a npz file.

213 else:

214 logger.info("Reading matrix from txt file generated by Mash.")

215 mat_sp = read_matrix(genomes, sorted_genomes, matrix)

216 # corresp_file = {genome_file : num of genome in sorted_genomes}

217 logger.info("Saving matrix to npz file to be loaded quicker if needed later")

218 # Convert dok_matrix to coo format, as dok format is not allowed by save_npz

219 coo_mat = mat_sp.tocoo()

220 scipy.sparse.save_npz(sparse_mat, coo_mat)

221

222 # Iteratively discard genomes too close or too far

223 logger.info("Starting iterative discarding steps")

224 if not quiet:

225 widgets = ['Genomes compared: ',

226 progressbar.Bar(marker='█', left='', right='', fill=' '), ' ',

227 progressbar.Counter(), "/{}".format(nbgen), ' ',

228 progressbar.Timer(), ' - '

229 ]

230 bar = progressbar.ProgressBar(widgets=widgets, max_value=len(to_try), term_width=79).start()

231 done = 0

232

233 while len(to_try) > 1:

234 mash_step(to_try, corresp_file, mat_sp, genomes_removed, min_dist, max_dist)

235 if not quiet:

236 done = nbgen - len(to_try)

237 bar.update(done)

238 if not quiet:

239 bar.finish()

240 logger.info("Final number of genomes in dataset: {}".format(nbgen - len(genomes_removed)))

241 return genomes_removed

242

243

244def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads):

245 """

246 Sketch all genomes to a combined archive.

247

248 Parameters

249 ----------

250 genomes : dict

251 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

252 sorted_genomes: list

253 list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by

254 decreasing quality

255 outdir : str

256 path to directory where all results are saved

257 list_reps : str

258 file with list of genomes to sketch. File will be emptied if it contain something, and

259 filled with the informations from 'genomes'.

260 out_msh : str

261 output of mash

262 mash_log : str

263 mash logfile

264 threads :

265 max number of threads to use

266

267 Returns

268 -------

269

270 return value (0 if OK, 1 if error)

271

272 """

273 # If given outdir does not exist, close it

274 if not os.path.isdir(outdir):

275 logger.error(f"Your output directory '{outdir}' does not exist.")

276 sys.exit(1)

277 # Empty list_reps file

278 open(list_reps, "w").close()

279 # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2]

280 file_paths = [genomes[g][2] for g in sorted_genomes]

281 # Write list of genomes to compare to a file

282 utils.write_list(file_paths, list_reps)

283 # Sketch all genome sequences if not already done

284 if os.path.isfile(out_msh + ".msh"):

285 logger.warning(f"Mash sketch file {out_msh}.msh already exists. PanACoTA will "

286 "use it for next step.")

287 os.remove(list_reps)

288 return 0

289 logger.info("Sketching all genomes...")

290 cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4"

291 logger.details(cmd_sketch)

292 error_sketch = (f"Error while trying to sketch {len(sorted_genomes)} genomes to combined "

293 "archive. Maybe some genome sequences in "

294 "'tmp_files' are missing! Check logfile: "

295 f"{mash_log}")

296

297 outf = open(mash_log, "w")

298 utils.run_cmd(cmd_sketch, error_sketch, eof=True, stdout=outf, stderr=outf, logger=logger)

299 outf.close()

300 return 0

301

302

303def compare_all(out_msh, matrix, npz_matrix, mash_log, threads):

304 """

305 Comparing all pairwise genomes that are already been sketched in the given file.

306

307 Parameters

308 ----------

309 out_msh : str

310 output of mash

311 matrix : str

312 File to put generated matrix of pairwise distances between all genomes

313 npz_matrix : str

314 matrix of pairwise distances saved in a binary file

315 mash_log : str

316 mash logfile

317 threads :

318 max number of threads to use

319

320 Returns

321 -------

322

323 return code

324 """

325 # txt matrix already exists

326 if os.path.isfile(matrix):

327 logger.warning("Matrix file {} already exists. The program will use this distance matrix "

328 "to filter all genomes according to their distances.".format(matrix))

329 return 0

330 # npz matrix already exists

331 if os.path.isfile(npz_matrix):

332 logger.warning("Matrix file {} already exists. The program will use this distance matrix "

333 "to filter all genomes according to their distances.".format(matrix))

334 return 0

335 logger.info("Computing pairwise distances between all genomes")

336 cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh"

337 logger.details(cmd_dist)

338 # Open matfile to write matrix inside

339 matfile = open(matrix, "w")

340 # Open mash log to add log of 'mash dist' to log of 'mash sketch'

341 outf = open(mash_log, "a")

342 error_dist = ("Error while trying to estimate pairwise distances between all genomes. "

343 f"See {mash_log}.")

344 utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf)

345 outf.close()

346 matfile.close()

347 return 0

348

349

350def mash_step(to_try, corresp, mat_sp, genomes_removed, min_dist, max_dist):

351 """

352 Prepare a mash run, with a given genome as reference, and others to compare to.

353

354 Parameters

355 ----------

356 to_try : list

357 list of genome_file (keys of 'genomes') to compare, ordered by decreasing L90/nbcont

358 corresp : dict

359 {genome_file : num_of_genome in sorted_genomes}

360 mat_sp : scipy.sparse.dok.dok_matrix

361 triangle matrix containing pairwise distance comparisons

362 genomes_removed : dict

363 {genome_file: [ref_name, dist]} genome against which 'genome_name' is removed, and

364 corresponding distance (justifying removal)

365 min_dist : float

366 lower limit of distance between 2 genomes to keep them

367 max_dist : float

368 max limit of distance between 2 genomes to keep them

369

370 Returns

371 -------

372

373 to_try is updated (reference element and all genomes not compatible with it are removed)

374 genomes_removed is updated

375 return code (0 if no problem)

376

377 """

378 # Get last element (which is the 'best' genome), and remove it from the list

379 ref_name = to_try.pop()

380 # Line of genome in mat_sp

381 ref_num = corresp[ref_name]

382 # Genomes (ordered by increasing L90/nbcont) to compare to the selected element (ref_name)

383 others = to_try[::-1]

384

385 # For each genome, compare its distance to reference genome 'ref_name'

386 for gname in others:

387 # Column of genome in mat_sp

388 other_num = corresp[gname]

389 # Get distance between the 2 genomes

390 if ref_num < other_num:

391 dist = mat_sp[ref_num, other_num]

392 else:

393 logger.warning("Should never happen as mat_sp is a triangle matrix!")

394 dist = mat_sp[other_num, ref_num]

395 # If distance not in the limits, remove genome from to_try and add to genomes_removed list

396 if not min_dist <= dist <= max_dist:

397 to_try.remove(gname)

398 genomes_removed[gname] = [ref_name, dist]

399 return 0

400

401

402def read_matrix(genomes, sorted_genomes, matrix):

403 """

404 Read the matrix of pairwise distances between all genomes, and save it to a sparse

405 matrix (only upper triangle).

406

407 Parameters

408 ----------

409 genomes : dict

410 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

411 sorted_genomes: list

412 list of 'genome_file' for all genomes kept (L90 and nbcont ok)

413 matrix : str

414 File containing the matrix of pairwise distances between all genomes

415

416 Returns

417 -------

418

419 mat_sp : str

420 python dok_matrix object

421 """

422 if not os.path.isfile(matrix):

423 logger.error(f"Matrix file {matrix} does not exist. We cannot read it "

424 "and do the next steps. Program ending.")

425 sys.exit(1)

426

427 nbgen = len(sorted_genomes)

428 corresp_abs = {genomes[genome][2]: num for num, genome in enumerate(sorted_genomes)}

429 # Create square matrix with nbgen cols/lines. dok format is a 'Dictionary Of Keys'

430 # -> writes (0, 1) value

431 mat_sp = dok_matrix((nbgen, nbgen), dtype=float)

432 # Write matrix values

433 with open(matrix, "r") as matf:

434 for line in matf:

435 path1, path2, dist = line.split()[:3]

436 num1 = corresp_abs[path1]

437 num2 = corresp_abs[path2]

438 # only in lower triangle (no duplicate)

439 if num1 < num2:

440 mat_sp[num1, num2] = float(dist)

441 else:

442 mat_sp[num2, num1] = float(dist)

443 return mat_sp

444

445

446def write_outputfiles(genomes, sorted_genomes, genomes_removed, outdir, gspecies, min_dist, max_dist):

447 """

448 Write the list of genomes kept in a file, 1 genome per line -> will be the input file for

449 annotation and next steps

450 Write discarded genomes to another file, with, for each line:

451 - genome name

452 - problem when compared with which other genome

453 - distance to this other genomes

454

455 Parameters

456 ----------

457 genomes : dict

458 {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

459 sorted_genomes: list

460 list of 'genome_file' for all genomes kept (L90 and nbcont ok)

461 genomes_removed : dict

462 {genome_name: [ref_name, dist]} genome against which 'genome_name' is removed, and corresponding distance (justifying removal)

463 outdir : str

464 directory where those list files must be created

465 gspecies : str

466 species name if given, otherwise species taxID

467 min_dist : float

468 lower limit of distance between 2 genomes to keep them

469 max_dist : float

470 upper limit of distance between 2 genomes to keep them

471

472 Returns

473 -------

474 return code

475 """

476 if not os.path.isdir(outdir):

477 logger.error(f"The given output directory ({outdir}) does not exist. We cannot "

478 "create output files there")

479 sys.exit(1)

480 list_file = os.path.join(outdir, f"LSTINFO-{gspecies}-filtered-{min_dist}_{max_dist}.txt")

481 kept_genomes = []

482 discard_file = os.path.join(outdir, f"discarded-by-minhash-{gspecies}-{min_dist}_{max_dist}.txt")

483

484 # Get list of kept genomes and write them in list_file

485 for genome in sorted_genomes:

486 if genome not in genomes_removed:

487 kept_genomes.append(genome)

488

489 # Write 4 columns in LSTINFO file (genomes kept) :

490 # path to file analyzed, genome size, nbcont and L90

491 with open(list_file, "w") as lf:

492 lf.write('to_annotate\tgsize\tnb_conts\tL90\n')

493 # For each genome in kept_genomes, find required information on this it, using 'genomes'

494 for g in kept_genomes:

495 _, _, analyzed, size, nbcont, l90 = genomes[g]

496 towrite = utils.list_to_str([analyzed, size, nbcont, l90], sep="\t")

497 lf.write(towrite)

498

499 # Write list of discarded genomes and why they are discarded

500 with open(discard_file, "w") as disf:

501 disf.write("genome_name\tproblem_compared_with\tdist\n")

502 for genome, info in genomes_removed.items():

503 disf.write(utils.list_to_str([genome] + info))

504 logger.info(f"Final list of genomes in the dataset: {list_file}")

505 logger.info(f"List of genomes discarded by minhash steps: {discard_file}")

506 return list_file

Coverage for PanACoTA/prepare_module/filter_genomes.py: 100%

164 statements