Coverage for PanACoTA/annotate_module/genome_seq_functions.py: 100%

150 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to: 

38 

39- analyse a genome (cut at stretch of N if asked, calc L90, nb cont, size...) 

40- if genome cut by stretch of N, write the new sequence to new file 

41- rename genome contigs and write new sequence to new file 

42 

43 

44@author gem 

45April 2017 

46""" 

47import os 

48import re 

49import sys 

50import numpy as np 

51import logging 

52import progressbar 

53 

54from PanACoTA import utils 

55 

56logger = logging.getLogger("annotate.gseq_functions") 

57 

58def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=False): 

59 """ 

60 

61 Parameters 

62 ---------- 

63 genomes : dict 

64 {genome: spegenus.date} 

65 dbpath : str 

66 path to folder containing genomes 

67 tmp_path : str 

68 path to put out files 

69 nbn : int 

70 minimum number of 'N' required to cut into a new contig 

71 soft : str 

72 soft used (prokka, prodigal, or None if called by prepare module) 

73 logger : logging.Logger 

74 logger object to write log information. Because this function can be called from 

75 prepare module, where sub logger name is different 

76 quiet : bool 

77 True if nothing must be written to stdout/stderr, False otherwise 

78 

79 Returns 

80 ------- 

81 dict 

82 {genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

83 

84 """ 

85 cut = nbn > 0 

86 pat = None ## To put pattern with which sequence must be cut 

87 if cut: 

88 pat = 'N' * nbn + "+" 

89 nbgen = len(genomes) 

90 bar = None 

91 curnum = None 

92 if cut: 

93 logger.info(("Cutting genomes at each time there are at least {} 'N' in a row, " 

94 "and then, calculating genome size, number of contigs and L90.").format(nbn)) 

95 else: 

96 logger.info("Calculating genome size, number of contigs, L90") 

97 if not quiet: 

98 # Create progressbar 

99 widgets = ['Analysis: ', progressbar.Bar(marker='█', left='', right=''), 

100 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (', 

101 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - ', 

102 progressbar.ETA() 

103 ] 

104 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start() 

105 curnum = 1 

106 toremove = [] 

107 # Analyse genomes 1 by 1 

108 for genome, name in genomes.items(): 

109 # If not quiet option, show progress bar 

110 if not quiet: 

111 bar.update(curnum) 

112 curnum += 1 

113 # analyse genome, and check everything went well. 

114 # exception if binary file 

115 try: 

116 res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger) 

117 except UnicodeDecodeError: 

118 logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.") 

119 res = False 

120 # Problem while analysing genome -> genome ignored 

121 if not res: 

122 toremove.append(genome) 

123 # If there are some genomes to remove (analysis failed), remove them from genomes dict. 

124 if toremove: 

125 for gen in toremove: 

126 del genomes[gen] 

127 if not genomes: 

128 logger.error(f"No genome was found in the database folder {dbpath}. See logfile " 

129 "for more information.") 

130 sys.exit(1) 

131 if not quiet: 

132 bar.finish() 

133 return 0 

134 

135 

136def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger): 

137 """ 

138 Analyse given genome: 

139 

140 - if cut is asked: 

141 

142 - cut its contigs at each time that 'pat' is seen 

143 - save cut genome in new file 

144 

145 - calculate genome size, L90, nb contigs and save it into genomes 

146 

147 Parameters 

148 ---------- 

149 genome : str 

150 given genome to analyse 

151 dbpath : str 

152 path to the folder containing the given genome sequence 

153 tmp_path : str 

154 path to folder where output files must be saved. 

155 cut : bool 

156 True if contigs must be cut, False otherwise 

157 pat : str 

158 pattern on which contigs must be cut. ex: "NNNNN" 

159 genomes : dict 

160 {genome_file: [genome_name]} as input, and will be changed to\ 

161 {genome_file: [genome_name, path, path_annotate, gsize, nbcont, L90]} 

162 soft : str 

163 soft used (prokka, prodigal, or None if called by prepare module) 

164 

165 Returns 

166 ------- 

167 bool 

168 True if genome analysis went well, False otherwise 

169 Modifies 'genomes' for the analysed genome: -> {genome_file: [genome_name, path, 

170 path_annotate, gsize, nbcont, L90]} 

171 """ 

172 gpath, grespath = get_output_dir(soft, dbpath, tmp_path, genome, cut, pat) 

173 if not os.path.exists(gpath): 

174 logger.error(f"The file {gpath} does not exist") 

175 return False 

176 # Open original sequence file 

177 with open(gpath, "r") as genf: 

178 # If a new file must be created (sequences cut), open it 

179 gresf = None 

180 if grespath: 

181 gresf = open(grespath, "w") 

182 

183 # Initialize variables 

184 cur_contig_name = "" # header text 

185 contig_sizes = {} # {header text: size} 

186 cur_seq = "" # sequence 

187 num = 1 # Used to get unique contig names 

188 

189 # Read each line of original sequence 

190 for line in genf: 

191 #### NEW CONTIG 

192 # Line corresponding to a new contig 

193 if line.startswith(">"): 

194 # If not first contig, write info to output file (if needed) 

195 if cur_seq != "": 

196 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, 

197 gresf, num, logger) 

198 # If problem while formatting contig, return False -> genome ignored 

199 if num == -1: 

200 return False 

201 # Get contig name for next turn, and reset sequence 

202 cur_contig_name = line.strip() 

203 # Initialize for next contig 

204 cur_seq = "" 

205 # #### SEQUENCE LINE 

206 # If sequence line, keep it, all in upper case 

207 else: 

208 # Add this line without \n to sequence (1 sequence per line) 

209 cur_seq += line.strip().upper() 

210 

211 # LAST CONTIG 

212 if cur_contig_name != "": 

213 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, 

214 num, logger) 

215 if num == -1: 

216 return False 

217 # GLOBAL INFORMATION 

218 nbcont = len(contig_sizes) 

219 gsize = sum(contig_sizes.values()) 

220 if nbcont == 0 or gsize == 0: 

221 logger.warning(f"Your file {gpath} does not contain any gene. Please check that you " 

222 "really gave a fasta sequence file") 

223 if grespath: 

224 gresf.close() 

225 os.remove(grespath) 

226 return False 

227 l90 = calc_l90(contig_sizes) 

228 # Everything ok for this genome -> complete its list of information in genomes dict 

229 # [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]} 

230 if grespath: 

231 genomes[genome] += [gpath, grespath, gsize, nbcont, l90] 

232 else: 

233 genomes[genome] += [gpath, gpath, gsize, nbcont, l90] 

234 # If we wrote a new sequence file, close it 

235 if grespath: 

236 gresf.close() 

237 return True 

238 

239 

240def get_output_dir(soft, dbpath, tmp_path, genome, cut, pat): 

241 """ 

242 Get output file to put sequence cut and/or sequence with shorter contigs (prokka) 

243 

244 Parameters 

245 ---------- 

246 soft : str 

247 soft used (prokka, prodigal, or None if called by prepare module) 

248 dbpath : str 

249 path to the folder containing the given genome sequence 

250 tmp_path : str 

251 path to folder where output files must be saved. 

252 genome : str 

253 genome name 

254 cut : bool 

255 True if contigs must be cut, False otherwise 

256 pat : str 

257 pattern on which contigs must be cut. ex: "NNNNN" 

258 

259 Return 

260 ------ 

261 grespath : str 

262 path to ouput file. None if no need to create new sequence file 

263 """ 

264 # Path to sequence to analyze 

265 gpath = os.path.join(dbpath, genome) 

266 # genome file is in dbpath except if it was in several files in dbpath, 

267 # in which case it has been concatenated to a file in tmp_path 

268 if not os.path.isfile(gpath): 

269 gpath = os.path.join(tmp_path, genome) 

270 # New file create if needed. If not (prodigal and not cut), empty filename 

271 grespath = None 

272 # If user asks to cut at each 'pat', need to create a new sequence file, 

273 # whatever the annotation soft used 

274 if cut: 

275 new_file = genome + "_{}-split{}N.fna".format(soft, len(pat) - 1) 

276 grespath = os.path.join(tmp_path, new_file) 

277 # If no cutl, just keep original sequence, no need to create new file. 

278 # Just check that contigs have different names 

279 return gpath, grespath 

280 

281 

282def format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger): 

283 """ 

284 Format given contig, and save to output file if needed 

285 

286 - if cut: cut it and write each subsequence 

287 - write new contig just check that contig names are different 

288 

289 Parameters 

290 ---------- 

291 cut : bool 

292 True if contigs must be cut, False otherwise 

293 pat : str 

294 pattern on which contigs must be cut. ex: "NNNNN" 

295 cur_seq : str 

296 current sequence (aa) 

297 cur_contig_name : str 

298 name of current contig 

299 genome : str 

300 name of current genome 

301 cont_sizes : dict 

302 {contig_name : sequence length} 

303 gresf : io.TextIOWrappe 

304 open file to write new sequence. If we are annotating with prodigal and not cutting, 

305 there is no new sequence -> gref is None 

306 num : int 

307 current contig number 

308 logger : logging.Logger 

309 logger object to write log information 

310 

311 Returns 

312 ------- 

313 bool 

314 True if contig has been written without problem, False if any problem 

315 """ 

316 # "CUT" if cut: need to cut at each 'pat' -> write new header + seq in new file 

317 if cut: 

318 # Cut sequence and write header + sequence to res file 

319 num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num) 

320 # No cut -> no new file created, but check contig unique names 

321 else: 

322 if cur_contig_name in contig_sizes.keys(): 

323 logger.error(f"In genome {genome}, '{cur_contig_name}' contig name is used for " 

324 "several contigs. Please put different names for each contig. " 

325 "This genome will be ignored.") 

326 return -1 

327 else: 

328 contig_sizes[cur_contig_name] = len(cur_seq) 

329 return num 

330 

331 

332def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num): 

333 """ 

334 Save the contig read just before into dicts and write it to sequence file. 

335 Unique ID of contig must be in the first field of header, before the first space 

336 (required by prokka) 

337 

338 Parameters 

339 ---------- 

340 pat : str 

341 pattern to split a contig. None if we do not want to split 

342 whole_seq : str 

343 sequence of current contig, to save once split according to pat 

344 cur_contig_name : str 

345 name of current contig to save once split according to pat 

346 contig_sizes : dict 

347 {name: size} save cur_contig once split according to pat 

348 gresf : _io.TextIOWrapper 

349 file open in w mode to save the split sequence 

350 num : int 

351 current contig number. 

352 

353 Returns 

354 ------- 

355 int 

356 new contig number, after giving number(s) to the current contig 

357 """ 

358 # split contig each time a stretch of at least nbn 'N' is found (pattern pat) 

359 if not pat: 

360 cont_parts = [whole_seq] 

361 else: 

362 cont_parts = re.split(pat, whole_seq) 

363 

364 # save contig parts 

365 for seq in cont_parts: 

366 # Only save non empty contigs (with some patterns, it could arrive that 

367 # we get empty contigs, if 2 occurrences of the pattern are side by side). 

368 if len(seq) == 0: 

369 continue 

370 new_contig_name = ">{}_{}\n".format(num, cur_contig_name.split(">")[1]) 

371 contig_sizes[new_contig_name] = len(seq) 

372 gresf.write(new_contig_name) 

373 gresf.write(seq + "\n") 

374 num += 1 

375 return num 

376 

377 

378def calc_l90(contig_sizes): 

379 """ 

380 Calc L90 of a given genome 

381 

382 Parameters 

383 ---------- 

384 contig_sizes : dict 

385 {name: size} 

386 

387 Returns 

388 ------- 

389 None or int 

390 if L90 found, returns L90. Otherwise, returns nothing 

391 """ 

392 gsize = sum(contig_sizes.values()) 

393 sizes = [contig_sizes[cont] for cont in contig_sizes] 

394 cum_sizes = np.cumsum(sorted(sizes, reverse=True)) 

395 lim = 0.9 * gsize 

396 for num, val in enumerate(cum_sizes): 

397 if val >= lim: 

398 return num + 1 

399 

400 

401def rename_all_genomes(genomes): 

402 """ 

403 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step 3) 

404 Sort kept genomes by L90 and then nb contigs. 

405 For each genome, assign a strain number, and rename all its contigs. 

406 

407 Parameters 

408 ---------- 

409 genomes : dict 

410 {genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become\ 

411 {genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end 

412 

413 Return 

414 ------ 

415 change 1st field of genomes dict. name -> gembase_name (with strain number) 

416 

417 """ 

418 logger.info(f"Renaming kept genomes according to their quality ({len(genomes)} genomes)") 

419 # Keep first genome name to give to prodigal for training 

420 first_gname = None 

421 # Keep previous genome name (ESCO.0109 -> ESCO) 

422 last_name = "" 

423 # Keep last strain number 

424 last_strain = 0 

425 # "SAEN.1015.{}".format(str(last_strain).zfill(5)) 

426 # Sort genomes by species, L90 and nb_contigs 

427 for genome, [name, _, _, _, _, _] in sorted(genomes.items(), 

428 key=utils.sort_genomes_byname_l90_nbcont): 

429 if not first_gname: 

430 first_gname = genome 

431 # first genome, or new strain name (ex: ESCO vs EXPL) 

432 # -> keep this new name, and add 1 to next strain number 

433 if last_name != name.split(".")[0]: 

434 last_strain = 1 

435 last_name = name.split(".")[0] 

436 # same strain name 

437 # -> write this new sequence, and go to next one (strain += 1) 

438 else: 

439 last_strain += 1 

440 # Write information to "genomes" dict. 

441 gembase_name = ".".join([name, str(last_strain).zfill(5)]) 

442 genomes[genome][0] = gembase_name 

443 return first_gname 

444 

445 

446def plot_distributions(genomes, res_path, listfile_base, l90, nbconts): 

447 """ 

448 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step2) 

449 Plot distributions of L90 and nbcontig values. 

450 

451 Parameters 

452 ---------- 

453 genomes : dict 

454 {genome: [name, orig_path, to_annotate_path, size, nbcont, l90]} 

455 res_path : str 

456 path to put all output files 

457 listfile_base : str 

458 name of list file 

459 l90 : int 

460 L90 threshold 

461 nbconts : int 

462 nb contigs threshold 

463 

464 Returns 

465 ------- 

466 (l90_vals, nbcont_vals, dist1, dist2) : 

467 

468 - l90_vals : list of l90 values for all genomes 

469 - nbcont_vals : list of nbcontigs for all genomes 

470 - dist1 : matplotlib figure of distribution of L90 values 

471 - dist2 : matplotlib figure of distribution of nb contigs values 

472 

473 """ 

474 logger.info("Generating distribution of L90 and #contigs graphs.") 

475 l90_vals = [val for _, (_, _, _, _, _, val) in genomes.items()] 

476 outl90 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png") 

477 nbcont_vals = [val for _, (_, _, _, _, val, _) in genomes.items()] 

478 outnbcont = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png") 

479 dist1 = utils.plot_distr(l90_vals, l90, "L90 distribution for all genomes", 

480 "max L90 =", logger) 

481 dist2 = utils.plot_distr(nbcont_vals, nbconts, 

482 "Distribution of number of contigs among all genomes", 

483 "max #contigs =", logger) 

484 dist1.savefig(outl90) 

485 dist2.savefig(outnbcont) 

486 return l90_vals, nbcont_vals, dist1, dist2