Coverage for PanACoTA/annotate_module/genome_seq

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to:

39- analyse a genome (cut at stretch of N if asked, calc L90, nb cont, size...)

40- if genome cut by stretch of N, write the new sequence to new file

41- rename genome contigs and write new sequence to new file

44@author gem

45April 2017

46"""

47import os

48import re

49import sys

50import numpy as np

51import logging

52import progressbar

54from PanACoTA import utils

56logger = logging.getLogger("annotate.gseq_functions")

58def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=False):

59 """

61 Parameters

62 ----------

63 genomes : dict

64 {genome: spegenus.date}

65 dbpath : str

66 path to folder containing genomes

67 tmp_path : str

68 path to put out files

69 nbn : int

70 minimum number of 'N' required to cut into a new contig

71 soft : str

72 soft used (prokka, prodigal, or None if called by prepare module)

73 logger : logging.Logger

74 logger object to write log information. Because this function can be called from

75 prepare module, where sub logger name is different

76 quiet : bool

77 True if nothing must be written to stdout/stderr, False otherwise

79 Returns

80 -------

81 dict

82 {genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

84 """

85 cut = nbn > 0

86 pat = None ## To put pattern with which sequence must be cut

87 if cut:

88 pat = 'N' * nbn + "+"

89 nbgen = len(genomes)

90 bar = None

91 curnum = None

92 if cut:

93 logger.info(("Cutting genomes at each time there are at least {} 'N' in a row, "

94 "and then, calculating genome size, number of contigs and L90.").format(nbn))

95 else:

96 logger.info("Calculating genome size, number of contigs, L90")

97 if not quiet:

98 # Create progressbar

99 widgets = ['Analysis: ', progressbar.Bar(marker='█', left='', right=''),

100 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',

101 progressbar.Percentage(), ') - ', progressbar.Timer(), ' - ',

102 progressbar.ETA()

103 ]

104 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()

105 curnum = 1

106 toremove = []

107 # Analyse genomes 1 by 1

108 for genome, name in genomes.items():

109 # If not quiet option, show progress bar

110 if not quiet:

111 bar.update(curnum)

112 curnum += 1

113 # analyse genome, and check everything went well.

114 # exception if binary file

115 try:

116 res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger)

117 except UnicodeDecodeError:

118 logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.")

119 res = False

120 # Problem while analysing genome -> genome ignored

121 if not res:

122 toremove.append(genome)

123 # If there are some genomes to remove (analysis failed), remove them from genomes dict.

124 if toremove:

125 for gen in toremove:

126 del genomes[gen]

127 if not genomes:

128 logger.error(f"No genome was found in the database folder {dbpath}. See logfile "

129 "for more information.")

130 sys.exit(1)

131 if not quiet:

132 bar.finish()

133 return 0

134

135

136def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger):

137 """

138 Analyse given genome:

139

140 - if cut is asked:

141

142 - cut its contigs at each time that 'pat' is seen

143 - save cut genome in new file

144

145 - calculate genome size, L90, nb contigs and save it into genomes

146

147 Parameters

148 ----------

149 genome : str

150 given genome to analyse

151 dbpath : str

152 path to the folder containing the given genome sequence

153 tmp_path : str

154 path to folder where output files must be saved.

155 cut : bool

156 True if contigs must be cut, False otherwise

157 pat : str

158 pattern on which contigs must be cut. ex: "NNNNN"

159 genomes : dict

160 {genome_file: [genome_name]} as input, and will be changed to\

161 {genome_file: [genome_name, path, path_annotate, gsize, nbcont, L90]}

162 soft : str

163 soft used (prokka, prodigal, or None if called by prepare module)

164

165 Returns

166 -------

167 bool

168 True if genome analysis went well, False otherwise

169 Modifies 'genomes' for the analysed genome: -> {genome_file: [genome_name, path,

170 path_annotate, gsize, nbcont, L90]}

171 """

172 gpath, grespath = get_output_dir(soft, dbpath, tmp_path, genome, cut, pat)

173 if not os.path.exists(gpath):

174 logger.error(f"The file {gpath} does not exist")

175 return False

176 # Open original sequence file

177 with open(gpath, "r") as genf:

178 # If a new file must be created (sequences cut), open it

179 gresf = None

180 if grespath:

181 gresf = open(grespath, "w")

182

183 # Initialize variables

184 cur_contig_name = "" # header text

185 contig_sizes = {} # {header text: size}

186 cur_seq = "" # sequence

187 num = 1 # Used to get unique contig names

188

189 # Read each line of original sequence

190 for line in genf:

191 #### NEW CONTIG

192 # Line corresponding to a new contig

193 if line.startswith(">"):

194 # If not first contig, write info to output file (if needed)

195 if cur_seq != "":

196 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes,

197 gresf, num, logger)

198 # If problem while formatting contig, return False -> genome ignored

199 if num == -1:

200 return False

201 # Get contig name for next turn, and reset sequence

202 cur_contig_name = line.strip()

203 # Initialize for next contig

204 cur_seq = ""

205 # #### SEQUENCE LINE

206 # If sequence line, keep it, all in upper case

207 else:

208 # Add this line without \n to sequence (1 sequence per line)

209 cur_seq += line.strip().upper()

210

211 # LAST CONTIG

212 if cur_contig_name != "":

213 num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf,

214 num, logger)

215 if num == -1:

216 return False

217 # GLOBAL INFORMATION

218 nbcont = len(contig_sizes)

219 gsize = sum(contig_sizes.values())

220 if nbcont == 0 or gsize == 0:

221 logger.warning(f"Your file {gpath} does not contain any gene. Please check that you "

222 "really gave a fasta sequence file")

223 if grespath:

224 gresf.close()

225 os.remove(grespath)

226 return False

227 l90 = calc_l90(contig_sizes)

228 # Everything ok for this genome -> complete its list of information in genomes dict

229 # [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]}

230 if grespath:

231 genomes[genome] += [gpath, grespath, gsize, nbcont, l90]

232 else:

233 genomes[genome] += [gpath, gpath, gsize, nbcont, l90]

234 # If we wrote a new sequence file, close it

235 if grespath:

236 gresf.close()

237 return True

238

239

240def get_output_dir(soft, dbpath, tmp_path, genome, cut, pat):

241 """

242 Get output file to put sequence cut and/or sequence with shorter contigs (prokka)

243

244 Parameters

245 ----------

246 soft : str

247 soft used (prokka, prodigal, or None if called by prepare module)

248 dbpath : str

249 path to the folder containing the given genome sequence

250 tmp_path : str

251 path to folder where output files must be saved.

252 genome : str

253 genome name

254 cut : bool

255 True if contigs must be cut, False otherwise

256 pat : str

257 pattern on which contigs must be cut. ex: "NNNNN"

258

259 Return

260 ------

261 grespath : str

262 path to ouput file. None if no need to create new sequence file

263 """

264 # Path to sequence to analyze

265 gpath = os.path.join(dbpath, genome)

266 # genome file is in dbpath except if it was in several files in dbpath,

267 # in which case it has been concatenated to a file in tmp_path

268 if not os.path.isfile(gpath):

269 gpath = os.path.join(tmp_path, genome)

270 # New file create if needed. If not (prodigal and not cut), empty filename

271 grespath = None

272 # If user asks to cut at each 'pat', need to create a new sequence file,

273 # whatever the annotation soft used

274 if cut:

275 new_file = genome + "_{}-split{}N.fna".format(soft, len(pat) - 1)

276 grespath = os.path.join(tmp_path, new_file)

277 # If no cutl, just keep original sequence, no need to create new file.

278 # Just check that contigs have different names

279 return gpath, grespath

280

281

282def format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger):

283 """

284 Format given contig, and save to output file if needed

285

286 - if cut: cut it and write each subsequence

287 - write new contig just check that contig names are different

288

289 Parameters

290 ----------

291 cut : bool

292 True if contigs must be cut, False otherwise

293 pat : str

294 pattern on which contigs must be cut. ex: "NNNNN"

295 cur_seq : str

296 current sequence (aa)

297 cur_contig_name : str

298 name of current contig

299 genome : str

300 name of current genome

301 cont_sizes : dict

302 {contig_name : sequence length}

303 gresf : io.TextIOWrappe

304 open file to write new sequence. If we are annotating with prodigal and not cutting,

305 there is no new sequence -> gref is None

306 num : int

307 current contig number

308 logger : logging.Logger

309 logger object to write log information

310

311 Returns

312 -------

313 bool

314 True if contig has been written without problem, False if any problem

315 """

316 # "CUT" if cut: need to cut at each 'pat' -> write new header + seq in new file

317 if cut:

318 # Cut sequence and write header + sequence to res file

319 num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num)

320 # No cut -> no new file created, but check contig unique names

321 else:

322 if cur_contig_name in contig_sizes.keys():

323 logger.error(f"In genome {genome}, '{cur_contig_name}' contig name is used for "

324 "several contigs. Please put different names for each contig. "

325 "This genome will be ignored.")

326 return -1

327 else:

328 contig_sizes[cur_contig_name] = len(cur_seq)

329 return num

330

331

332def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):

333 """

334 Save the contig read just before into dicts and write it to sequence file.

335 Unique ID of contig must be in the first field of header, before the first space

336 (required by prokka)

337

338 Parameters

339 ----------

340 pat : str

341 pattern to split a contig. None if we do not want to split

342 whole_seq : str

343 sequence of current contig, to save once split according to pat

344 cur_contig_name : str

345 name of current contig to save once split according to pat

346 contig_sizes : dict

347 {name: size} save cur_contig once split according to pat

348 gresf : _io.TextIOWrapper

349 file open in w mode to save the split sequence

350 num : int

351 current contig number.

352

353 Returns

354 -------

355 int

356 new contig number, after giving number(s) to the current contig

357 """

358 # split contig each time a stretch of at least nbn 'N' is found (pattern pat)

359 if not pat:

360 cont_parts = [whole_seq]

361 else:

362 cont_parts = re.split(pat, whole_seq)

363

364 # save contig parts

365 for seq in cont_parts:

366 # Only save non empty contigs (with some patterns, it could arrive that

367 # we get empty contigs, if 2 occurrences of the pattern are side by side).

368 if len(seq) == 0:

369 continue

370 new_contig_name = ">{}_{}\n".format(num, cur_contig_name.split(">")[1])

371 contig_sizes[new_contig_name] = len(seq)

372 gresf.write(new_contig_name)

373 gresf.write(seq + "\n")

374 num += 1

375 return num

376

377

378def calc_l90(contig_sizes):

379 """

380 Calc L90 of a given genome

381

382 Parameters

383 ----------

384 contig_sizes : dict

385 {name: size}

386

387 Returns

388 -------

389 None or int

390 if L90 found, returns L90. Otherwise, returns nothing

391 """

392 gsize = sum(contig_sizes.values())

393 sizes = [contig_sizes[cont] for cont in contig_sizes]

394 cum_sizes = np.cumsum(sorted(sizes, reverse=True))

395 lim = 0.9 * gsize

396 for num, val in enumerate(cum_sizes):

397 if val >= lim:

398 return num + 1

399

400

401def rename_all_genomes(genomes):

402 """

403 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step 3)

404 Sort kept genomes by L90 and then nb contigs.

405 For each genome, assign a strain number, and rename all its contigs.

406

407 Parameters

408 ----------

409 genomes : dict

410 {genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become\

411 {genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end

412

413 Return

414 ------

415 change 1st field of genomes dict. name -> gembase_name (with strain number)

416

417 """

418 logger.info(f"Renaming kept genomes according to their quality ({len(genomes)} genomes)")

419 # Keep first genome name to give to prodigal for training

420 first_gname = None

421 # Keep previous genome name (ESCO.0109 -> ESCO)

422 last_name = ""

423 # Keep last strain number

424 last_strain = 0

425 # "SAEN.1015.{}".format(str(last_strain).zfill(5))

426 # Sort genomes by species, L90 and nb_contigs

427 for genome, [name, _, _, _, _, _] in sorted(genomes.items(),

428 key=utils.sort_genomes_byname_l90_nbcont):

429 if not first_gname:

430 first_gname = genome

431 # first genome, or new strain name (ex: ESCO vs EXPL)

432 # -> keep this new name, and add 1 to next strain number

433 if last_name != name.split(".")[0]:

434 last_strain = 1

435 last_name = name.split(".")[0]

436 # same strain name

437 # -> write this new sequence, and go to next one (strain += 1)

438 else:

439 last_strain += 1

440 # Write information to "genomes" dict.

441 gembase_name = ".".join([name, str(last_strain).zfill(5)])

442 genomes[genome][0] = gembase_name

443 return first_gname

444

445

446def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):

447 """

448 FUNCTION DIRECTLY CALLED FROM MAIN ANNOTATE MODULE (step2)

449 Plot distributions of L90 and nbcontig values.

450

451 Parameters

452 ----------

453 genomes : dict

454 {genome: [name, orig_path, to_annotate_path, size, nbcont, l90]}

455 res_path : str

456 path to put all output files

457 listfile_base : str

458 name of list file

459 l90 : int

460 L90 threshold

461 nbconts : int

462 nb contigs threshold

463

464 Returns

465 -------

466 (l90_vals, nbcont_vals, dist1, dist2) :

467

468 - l90_vals : list of l90 values for all genomes

469 - nbcont_vals : list of nbcontigs for all genomes

470 - dist1 : matplotlib figure of distribution of L90 values

471 - dist2 : matplotlib figure of distribution of nb contigs values

472

473 """

474 logger.info("Generating distribution of L90 and #contigs graphs.")

475 l90_vals = [val for _, (_, _, _, _, _, val) in genomes.items()]

476 outl90 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")

477 nbcont_vals = [val for _, (_, _, _, _, val, _) in genomes.items()]

478 outnbcont = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")

479 dist1 = utils.plot_distr(l90_vals, l90, "L90 distribution for all genomes",

480 "max L90 =", logger)

481 dist2 = utils.plot_distr(nbcont_vals, nbconts,

482 "Distribution of number of contigs among all genomes",

483 "max #contigs =", logger)

484 dist1.savefig(outl90)

485 dist2.savefig(outnbcont)

486 return l90_vals, nbcont_vals, dist1, dist2

Coverage for PanACoTA/annotate_module/genome_seq_functions.py: 100%

150 statements