Coverage for PanACoTA/annotate_module/format

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to convert prodigal result files to gembase format.

39 * Proteins: multifasta with all CDS in aa

40 * Replicons: (multi)fasta of genome sequences

41 * Genes: multifasta of all genes in nuc

42 * gff3: gff files without sequence

43 * LSTINFO: information on annotation. Columns are: "start end strand type locus

44 gene_name | product | EC_number | inference2" and strain is C (complement) or D (direct).

45 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`

46 For prodigal: "start end strand type locus NA | NA | NA | NA", as there is no

47 functional annotation.

49@author gem

50July 2019

51"""

53import os

54import shutil

55import glob

56import logging

58import PanACoTA.utils as utils

59from PanACoTA.annotate_module import general_format_functions as gfunc

61logger = logging.getLogger("annotate.prodigal_format")

64def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,

65 rep_dir, gff_dir):

66 """

67 Format the given genome, and create its corresponding files in the following folders:

69 - Proteins

70 - Genes

71 - Replicons

72 - LSTINFO

73 - gff

75 Parameters

76 ----------

77 gpath : str

78 path to the genome sequence which was given to prodigal for annotation

79 name : str

80 gembase name of the genome

81 prod_path : str

82 directory where all tmp_files for all sequences are saved (sequences cut at each set

83 of 5N, prodigal results and logs)

84 lst_dir : str

85 path to LSTINFO folder

86 prot_dir : str

87 path to Proteins folder

88 gene_dir : str

89 path to Genes folder

90 rep_dir : str

91 path to Replicons folder

92 gff_dir : str

93 path to gff3 folder

95 Returns

96 -------

97 bool :

98 True if genome was correctly formatted, False otherwise

99 """

100 # Get directory where prodigal results for the current genome are saved

101 prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")

102

103 # Get prodigal result files

104 prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0]

105 gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0]

106 gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0]

107

108 # Define names for generated gembase files

109 res_prot_file = os.path.join(prot_dir, name + ".prt")

110 res_gene_file = os.path.join(gene_dir, name + ".gen")

111 res_lst_file = os.path.join(lst_dir, name + ".lst")

112 res_rep_file = os.path.join(rep_dir, name + ".fna")

113 res_gff_file = os.path.join(gff_dir, name + ".gff")

114

115 # Generate replicon file (same as input sequence but with gembase formatted headers). From

116 # this file, get contig names, to be used to generate gff file

117 contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file, logger)

118 if not contigs:

119 try:

120 os.remove(res_rep_file)

121 os.remove(res_lst_file)

122 os.remove(res_gff_file)

123 os.remove(res_gene_file)

124 os.remove(res_prot_file)

125 except OSError:

126 pass

127 logger.error("Problems while generating Replicon file for {}".format(name))

128 return False

129

130 # First, create .gen and .lst files. If they could not be formatted,

131 # remove those files, and return False with error message

132 ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name)

133 if not ok:

134 try:

135 os.remove(res_rep_file)

136 os.remove(res_gene_file)

137 os.remove(res_lst_file)

138 os.remove(res_gff_file)

139 os.remove(res_prot_file)

140 except OSError:

141 pass

142 logger.error(f"Problems while generating .gen and .lst files for {name}")

143 return False

144

145 # Create gff files.

146 ok = create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes)

147 # If problem while formatting the genome (rep or gff file), remove all

148 # already created files, and return False (genome not formatted) with error message.

149 if not ok:

150 try:

151 os.remove(res_gene_file)

152 os.remove(res_lst_file)

153 os.remove(res_rep_file)

154 os.remove(res_gff_file)

155 os.remove(res_prot_file)

156 except OSError:

157 pass

158 logger.error("Problems while generating .gff (gff3 folder) "

159 "file for {}".format(name))

160 return False

161

162 # Generate .prt files (in Proteins directory)

163 ok = create_prt(prot_file, res_prot_file, res_lst_file)

164 # If problem while formatting prt file, return False, delete all generated

165 # formatted files, and write an error message to user.

166 if not ok:

167 try:

168 os.remove(res_gene_file)

169 os.remove(res_lst_file)

170 os.remove(res_rep_file)

171 os.remove(res_gff_file)

172 os.remove(res_prot_file)

173 except OSError: # pragma: no cover

174 pass

175 logger.error("Problems while generating .prt file (Proteins folder) "

176 "for {}".format(name))

177 return False

178 return ok

179

180

181def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):

182 """

183 Generate .gen file, from sequences contained in .ffn, but changing the

184 headers to match with gembase format.

185 At the same time, generate .lst file, from the information given in prodigal ffn headers

186

187 Parameters

188 ----------

189 contigs : dict

190 {original_contig_name: gembase_contig_name}

191 gen_file : str

192 .ffn file generated by prodigal

193 res_gen_file : str

194 generated .gen file, to write in Genes directory

195 res_lst_file : str

196 generated .lst file to write in LSTINFO directory

197 gpath : str

198 path to the genome given to prodigal. Only used for error message

199 name : str

200 gembase name of the genome to format

201 logger : logging.Logger

202 logger object to put information

203

204 Returns

205 -------

206 bool :

207 True if conversion went well, False otherwise

208 """

209 # Variable which will contain the current gene sequence

210 seq = ""

211 # number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not

212 # re-start at 1 for each new contig)

213 locus_num = 0

214 # contig name of the last gene. To check if we are now in a new contig (-> loc = b) or not

215 prev_cont_name = ""

216 # Previous ontig number: contig number to use in gembase format

217 prev_cont_num = 0

218 contig_num = 0

219 # Keep start, end, strand and informations (prodigal gives information on start_type,

220 # gc_cont etc.) from the previous gene, before overwriting it with information

221 # on the new one

222 prev_start = ""

223 prev_end = ""

224 prev_strand = ""

225 prev_info = ""

226 # Update loc when contig changes ('b' if gene at the border of a contig, 'i' if it is inside)

227 prev_loc = "b"

228 # To start, the first gene is, by definition, at the border of the contig

229 loc = "b"

230 # Open files: .ffn prodigal to read, .gen and .lst gembase to create

231 with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\

232 open(res_lst_file, "w") as r_lst:

233 # Read all lines in ffn file (sequences in nuc. for each gene)

234 for lineffn in ffn:

235 # If it is a sequence, save it and go to next line

236 if not lineffn.startswith(">"):

237 seq += lineffn

238 continue

239 # Otherwise:

240 # - write header of previous sequence to .gen

241 # - write previous sequence (in 'seq') to .gen

242 # - write LSTINFO information to .lst

243 # - update information (new start, end, contig number etc.) for next gene

244 else:

245 # Get information given for the new gene (by .ffn file from prodigal)

246 (gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#")

247 # Get contig number from prodigal gene header: prodigal first part of header is:

248 # <original genome name contig name>_<protein number>

249 contig_name = gname.strip().split("_")

250 if len(contig_name) > 1:

251 contig_name = "_".join(contig_name[:-1])

252 else:

253 contig_name = contig_name[0]

254 # If new contig:

255 # - previous gene was the last of its contig -> prev_loc = "b" ;

256 # - the current gene is the first of its contig (loc = "b")

257 # - we must increment the contig number

258 if contig_name != prev_cont_name:

259 # Check that this contig name is in the list, and get its gembase contig number

260 if contig_name in contigs:

261 contig_num = contigs[contig_name].split(".")[-1]

262 # if not in the list, problem, return false

263 else:

264 logger.error(f"'{contig_name}' found in {gen_file} does not exist in "

265 f"{gpath}.")

266 return False

267 prev_loc = 'b'

268 loc = 'b'

269 # If not new contig. If prev_loc == 'b', previous gene is the first protein

270 # of this contig.

271 # Current gene will be inside the contig (except if new contig for the next gene,

272 # meaning only 1 gene in the contig)

273 else:

274 loc = 'i'

275

276 # If it is not the first gene of the genome, write previous gene information

277 if prev_start != "":

278 # Write line in LSTINFO file, + header and sequence to the gene file

279 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",

280 prev_loc, name, prev_cont_num, "NA", prev_info,

281 "NA", prev_strand, prev_start, prev_end, r_lst)

282 gfunc.write_header(lstline, r_gen)

283 r_gen.write(seq)

284 # -> get new information, save it for the next gene, and go to next line

285 # Strands are 1/-1 in prodigal, while we use D,C -> convert, so that next time

286 # we find a new gene, it writes this before updating for this new gene

287 if int(strand) == 1:

288 strand = "D"

289 else:

290 strand = "C"

291 # Prepare variables for next gene

292 locus_num += 1

293 seq = ""

294 prev_cont_num = contig_num

295 prev_cont_name = contig_name

296 prev_start = start

297 prev_end = end

298 prev_strand = strand

299 prev_loc = loc

300 prev_info = info

301 # Write last gene of the genome (-> loc = 'b'),

302 # Just check that there was at least 1 gene found (prev_start != "").

303 # Otherwise, nothing to write

304 if prev_start != "":

305 prev_loc = "b"

306 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",

307 prev_loc, name, prev_cont_num, "NA", prev_info, "NA",

308 prev_strand, prev_start, prev_end, r_lst)

309 gfunc.write_header(lstline, r_gen)

310 r_gen.write(seq)

311 return True

312

313

314def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes):

315 """

316 Create .gff3 file.

317

318 Format:

319

320 ##gff-version 3

321 ##sequence-region contig1 start end

322 ##sequence-region contig2 start end

323 ...

324 seqid(=contig) source type start end score strand phase attributes

325

326 All fields tab separated.

327 Empty fields contain '.'

328

329 For example:

330 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein

331

332

333 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95;

334

335 Parameters

336 ----------

337 gpath : str

338 path to the genome sequence given to prodigal. Only used for the error message

339 gff_file : str

340 path to gff file generated by prodigal

341 res-gff_file : str

342 path to the gff file that must be created in result database

343 res-lst_file : str

344 path to the lst file that was created in result database in the previous step

345 contigs : dict

346 dict of contig names with their size. ["original_name": "gembase_name"]

347 sizes : dict

348 dict of contig gembase names with their sizes {"gembase_name": size}

349

350 Returns

351 -------

352 bool

353 True if everything went well, False if any problem

354

355 """

356 # open gff generated by prodigal to read it

357 # open file to write new gff file

358 # open lst file to read all information saved from prodigal results

359 with open(gff_file, 'r') as gf, open(res_gff_file, "w") as rgf, open(res_lst_file, "r") as rlf:

360 # Write headers of gff3 file

361 rgf.write("##gff-version 3\n")

362 for ori_name, new_name in contigs.items():

363 # Write the list of contigs, with their size

364 end = sizes[new_name]

365 rgf.write(f"##sequence-region\t{new_name}\t1\t{end}\n")

366

367 # Now, convert each line of prodigal gff to gembase formatted gff line

368 for linegff in gf:

369 # Ignore gff lines starting by #. For the new gff, these are already all written at the

370 # beginning of the file.

371 if linegff.startswith("#"):

372 continue

373

374 # We do not write the sequences

375 if linegff.startswith(">"):

376 break

377

378 # Get all information from prodigal gff line. Strip each of them as trailing whitespace

379 # can be hidden (leading to information from gff considered as different from

380 # information from lst)

381 fields_g = linegff.split("\t")

382 fields_g = [info.strip() for info in fields_g]

383 (contig_name, source, type_g, start_g, end_g,

384 score, strand_g, phase, attributes) = fields_g

385 # Get information given to this same sequence from the lst file

386 # (next lst line corresponds to next gff line without #), as, for each format,

387 # there is 1 line per gene)

388 linelst = rlf.readline()

389 fields_l = linelst.split("\t")

390 fields_l = [info.strip() for info in fields_l]

391 start_l, end_l, strand_l, type_l, locus_l, _, _ = fields_l

392

393

394 # Get gff and ffn filenames to give information to user if error message

395 gff = os.path.basename(gff_file)

396 ffn = ".".join(gff.split(".")[:-1]) + ".ffn"

397 # Path where gff and ffn generated by prodigal are

398 tmp = gpath + "-prodigalRes"

399 # Get gene name given by prodigal to current gene

400 gname = attributes.split("ID=")[1].split(";")[0]

401

402 # Compare information from lst and information from prodigal gff (start,

403 # end and type of feature). They should correspond

404 for (elemg, eleml, label) in zip([start_g, end_g, type_g],

405 [start_l, end_l, type_l],

406 ["start", "end", "type"]):

407 # If 1 element is different (start or end position, or type), print error

408 # message and return False: this genome could not be converted to gff

409 if elemg != eleml:

410 logger.error(f"Files {ffn} and {gff} (in prodigal tmp_files: {tmp}) do not have "

411 f"the same {label} value for gene {gname} ({elemg} in gff, {eleml} in "

412 "ffn))")

413 return False

414

415 # Replace prodigal ID with the new gene name (in gembase format), found in the

416 # corresponding lst line

417 at_split = attributes.split(";")

418 new = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split]

419

420 # Write new line of gff file

421 # Write new contig name

422 cname = contigs[contig_name]

423 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g,

424 phase, ";".join(new)])

425 rgf.write(info + "\n")

426 return True

427

428

429def create_prt(prot_file, res_prot_file, res_lst_file):

430 """

431 Generate .prt file (gembase formatted gene names), from features contained in .lst file generated just before.

432

433 Parameters

434 ----------

435 prot_file : str

436 .faa file generated by prodigal

437 res_prot_file : str

438 output file, to write in Proteins directory

439 res_lst_file : str

440 .lst file to get all gene names in gembase format instead of re-generating them

441 Returns

442 -------

443 bool :

444 True if conversion went well, False otherwise

445 """

446

447 # Open:

448 # - prot file to read gene sequences from prodigal results

449 # - res_prot file to write sequences with gembase headers

450 # - res_lst_file to get gene gembase names and other infos (strand, size...)

451

452 with open(prot_file, "r") as faa, open(res_prot_file, "w") as r_prt,\

453 open(res_lst_file, "r") as r_lst:

454 # Read prt file generated by prodigal

455 for lineprot in faa:

456 # If protein sequence, write it

457 if not lineprot.startswith(">"):

458 r_prt.write(lineprot)

459 continue

460 # If header, replace by gembase header

461 # For that, get next lst line (corresponding to next protein,

462 # as there is 1 protein per line in .lst -> 1 protein per header in .prt)

463 linelst = r_lst.readline().strip()

464 # Try to get info from lstline.

465 # If lstline empty, it means that the current protein

466 # is missing from lst file. We already read the last protein of lst file.

467 if linelst != '':

468 # If not empty, check lst format, return False if not right format

469 try:

470 # If ok, gembase name is in the fifth column of lst file

471 start, end, _, _, gem_name, _, _ = linelst.split("\t")

472 except ValueError:

473 logger.error("Problem in format of lstline ({})".format(linelst))

474 return False

475 else:

476 logger.error("No more protein in lst file. We cannot get information on this "

477 "protein ({})! Check that you do not have more proteins than genes "

478 "in prodigal results".format(lineprot.strip()))

479 return False

480 # Write this gembase name as a new header

481 # Size of protein sequence is the third of gene sequence. Check that it is an int.

482 try :

483 size_gen = (int(end) - int(start) + 1)

484 except ValueError:

485 logger.error("Start and/or end of protein {} position is not a number (start "

486 "= {}; end = {})".format(gem_name, start, end))

487 return False

488 # Find size of protein in number of aa

489 # If number of nucleotides in gene cannot be divided by 3 to get number of amino-acids, there is a problem with this protein: return False to ignore this genome

490 size_prot = size_gen/3

491 if int(size_prot) != size_prot:

492 logger.error("Gene {} has a number of nucleotides ({}) that is not divisible "

493 "by 3.".format(gem_name, size_gen))

494 return False

495 gfunc.write_header(linelst, r_prt)

496 # new_header = "\t".join([gem_name, str(int(size_prot)), product, info])

497 # r_prt.write(">" + new_header + "\n")

498 # Check that there are no more proteins in lst than in this prt file

499 linelst = r_lst.readline()

500 if linelst.strip() != '':

501 gem_name = linelst.strip().split("\t")[4]

502 logger.error("Protein {} is in .lst file but its sequence is not in the protein "

503 "file generated by prodigal.".format(gem_name))

504 return False

505 return True

Coverage for PanACoTA/annotate_module/format_prodigal.py: 100%

180 statements