Coverage for PanACoTA/annotate_module/format

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to:

39- convert prokka tbl file to our tab file

40- convert prokka ffn and faa headers to our format

41- Create the database, with the following folders in the given "res_path":

43 * Proteins: multifasta with all CDS in aa

44 * Replicons: multifasta of genome

45 * Genes: multifasta of all genes in nuc

46 * gff3: gff files without sequence

47 * LSTINFO: information on annotation. Columns are: "start end strand type locus\

48 gene_name | product | EC_number | inference2" with the same types as prokka file,\

49 and strain is C (complement) or D (direct). Locus is:\

50 `<genome_name>.<contig_num><i or b>_<protein_num>`

52@author gem

53April 2019

54"""

56import os

57import glob

58import sys

59import shutil

60import logging

61import PanACoTA.utils as utils

62from PanACoTA.annotate_module import general_format_functions as general

64logger = logging.getLogger("annotate.prokka_format")

67def format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, gene_dir,

68 rep_dir, gff_dir):

69 """

70 Format the given genome, and create its corresponding files in the following folders:

72 - Proteins

73 - Genes

74 - Replicons

75 - LSTINFO

76 - gff

78 Parameters

79 ----------

80 gpath : str

81 path to the genome sequence which was given to prokka for annotation

82 name : str

83 gembase name of the genome

84 prok_path : str

85 directory where prokka folders are saved

86 lst_dir : srt

87 path to LSTINFO folder

88 prot_dir : str

89 path to Proteins folder

90 gene_dir : str

91 path to Genes folder

92 rep_dir : str

93 path to Replicons folder

94 gff_dir : str

95 path to gff3 folder

97 Returns

98 -------

99 bool :

100 True if genome was correctly formatted, False otherwise

101 """

102 prokka_dir = os.path.join(prok_path, os.path.basename(gpath) + "-prokkaRes")

103 # Get needed Prokka result files

104 fna_file = glob.glob(os.path.join(prokka_dir, "*.fna"))[0]

105 prokka_tbl_file = glob.glob(os.path.join(prokka_dir, "*.tbl"))[0]

106 prokka_gff_file = glob.glob(os.path.join(prokka_dir, "*.gff"))[0]

107 prokka_ffn_file = glob.glob(os.path.join(prokka_dir, "*.ffn"))[0]

108 prokka_faa_file = glob.glob(os.path.join(prokka_dir, "*.faa"))[0]

109

110 # Define names for generated gembase files

111 res_lst_file = os.path.join(lst_dir, name + ".lst")

112 res_gff_file = os.path.join(gff_dir, name + ".gff")

113 res_gene_file = os.path.join(gene_dir, name + ".gen")

114 res_prt_file = os.path.join(prot_dir, name + ".prt")

115 res_rep_file = os.path.join(rep_dir, name + ".fna")

116

117

118 # Generate replicon file (same as input sequence but with gembase formatted headers). From

119 # this file, get contig names, to be used to generate gff file

120 contigs, sizes = utils.get_genome_contigs_and_rename(name, fna_file, res_rep_file, logger)

121 if not contigs:

122 try:

123 os.remove(res_rep_file)

124 os.remove(res_lst_file)

125 os.remove(res_gff_file)

126 os.remove(res_gene_file)

127 os.remove(res_prt_file)

128 except OSError:

129 pass

130 logger.error("Problems while generating Replicon file for {}".format(name))

131 return False

132

133 # Convert prokka tbl file to gembase .lst file format

134 ok_tbl = tbl2lst(prokka_tbl_file, res_lst_file, contigs, name, fna_file)

135 if not ok_tbl:

136 try:

137 os.remove(res_rep_file)

138 os.remove(res_lst_file)

139 os.remove(res_gff_file)

140 os.remove(res_gene_file)

141 os.remove(res_prt_file)

142 except OSError:

143 pass

144 logger.error("Problems while generating LSTINFO file for {}".format(name))

145 return False

146 # Create gff3 file for annotations

147 ok_gff = generate_gff(fna_file, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs)

148 if not ok_gff:

149 try:

150 os.remove(res_rep_file)

151 os.remove(res_lst_file)

152 os.remove(res_gff_file)

153 os.remove(res_gene_file)

154 os.remove(res_prt_file)

155 except OSError:

156 pass

157 logger.error("Problems while generating .gff file for {}".format(name))

158 return False

159 # create Genes file (and check no problem occurred with return code)

160 ok_gene = create_gen(prokka_ffn_file, res_lst_file, res_gene_file)

161 # If gene file not created because a problem occurred, return False:

162 # format did not run for this genome

163 if not ok_gene:

164 try:

165 os.remove(res_rep_file)

166 os.remove(res_lst_file)

167 os.remove(res_gff_file)

168 os.remove(res_gene_file)

169 os.remove(res_prt_file)

170 except OSError:

171 pass

172 logger.error("Problems while generating .gen file for {}".format(name))

173 return False

174

175

176 # If gene file was created, create Proteins file

177 ok_prt = create_prt(prokka_faa_file, res_lst_file, res_prt_file)

178 # If protein file not created, return False: format did not run for this genome

179 if not ok_prt:

180 try:

181 os.remove(res_lst_file)

182 os.remove(res_gff_file)

183 os.remove(res_gene_file)

184 os.remove(res_prt_file)

185 os.remove(res_rep_file)

186 # Remove twice to be able to check that when there is a problem while removing files,

187 # it generates the expected error

188 os.remove(res_rep_file)

189 except OSError:

190 pass

191 logger.error("Problems while generating .prt file for {}".format(name))

192 return False

193 return True

194

195

196def tbl2lst(tblfile, lstfile, contigs, genome, gpath):

197 """

198 Read prokka tbl file, and convert it to the lst file.

199

200 * prokka tbl file format::

201

202 >Feature contig_name

203 start end type

204 [EC_number text]

205 [gene text]

206 inference ab initio prediction:Prodigal:2.6

207 [inference text]

208 locus_tag test

209 product text

210

211 where `type` can be CDS, tRNA, rRNA, etc ...

212 lines between [] are not always present

213

214 * lst file format::

215

216 start end strand type locus gene_name | product | EC_number | inference 2 | db_xref

217

218 with the same types as prokka file, and strain is C (complement) or D (direct)

219 locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`

220

221 Parameters

222 ----------

223 tblfile : str

224 name of prokka output tbl file to read

225 lstfile : str

226 name of lst file to generate

227 contigs : dict

228 {original_contig_name: gembase_contig_name}

229 genome : str

230 genome name (gembase format)

231 gpath : str

232 path to the genome given to prodigal. Only used for error message

233 changed_name : bool

234 True if contig names have been changed (cutn != 0) -> contig names end by '_num',

235 False otherwise.

236

237 Returns

238 -------

239 bool :

240 True if genome name used in lstfile and prokka tblfile are the same, False otherwise

241 """

242 # Protein localisation in contig (b = border ; i = inside)

243 cont_loc = "b"

244 prev_cont_loc = "b"

245 # Current contig number. Used to compare with new one, to know if protein is

246 # inside or at the border of a contig

247 prev_cont_num = -1

248 prev_cont_name = ""

249 tbl_cont_name = ""

250 tbl_cont_num = 0 # Contig number in tbl file (<orig_name>_<cont_num>)

251 lst_cont_num = 0 # Exp cont num: new feature means exp_cont_num += 1

252 # prev_cont_num = -1 # Previous contig number seen

253 # Information on current feature. At the beginning, everything empty, no information

254 gene_name = "NA"

255 locus_num = "NA"

256 product = "NA"

257 ecnum = "NA"

258 inf2 = "NA"

259 db_xref = "NA"

260 start = "-1"

261 end = "-1"

262 strand = "D"

263 # Feature type (CDS, tRNA...)

264 feature_type = ""

265

266 # Check that tblfile is not empty

267 if os.stat(tblfile).st_size == 0:

268 logger.error(f"{tblfile} is empty.")

269 return False

270

271 with open(tblfile, "r") as tblf, open(lstfile, "w") as lstf:

272 for line in tblf:

273 elems = line.strip().split("\t")

274 if line.startswith(">Feature"):

275 tbl_cont_name = line.split()[-1]

276 else:

277 if not tbl_cont_name:

278 logger.error(f"Wrong format for {tblfile}.")

279 return False

280 # Get line type, and retrieve info according to it

281 # If it is not the line with start, end, type, there are only 2 elements

282 # in the line:

283 # - information_type information_value

284 if len(elems) == 2:

285 if "locus_tag" in elems[0]:

286 locus_num = elems[-1].split("_")[-1]

287 if "gene" in elems[0]:

288 gene_name = elems[1]

289 if "product" in elems[0]:

290 product = elems[1]

291 if "EC_number" in elems[0]:

292 ecnum = elems[1]

293 if "inference" in elems[0] and "ab initio prediction:Prodigal" not in line:

294 inf2 = elems[1]

295 if "db_xref" in elems[0]:

296 db_xref = elems[1]

297 # Information on start, end, type

298 else:

299 # new gene

300 # if new gene is not on the same contig as previously,

301 # get new contig and loc = 'b'

302 if tbl_cont_name != prev_cont_name:

303 # Check that this contig name is in the list, and get its gembase contig

304 # number

305 if tbl_cont_name in contigs:

306 contig_num = contigs[tbl_cont_name].split(".")[-1]

307 # if not in the list, problem, return false

308 else:

309 logger.error(f"'{tbl_cont_name}' found in {tblfile} does not exist in "

310 f"{gpath}.")

311 return False

312 # Previous loc was 'i' (because we were in the same contig as the

313 # previous one). But now, we know the it was the last gene of

314 # its contig: we change loc to 'b'

315 prev_cont_loc = "b"

316 cont_loc = "b"

317 # Same contig as previously: this gene is inside the contig (cont_loc = "i").

318 # If, in fact, it was the last gene of this contig, it will be changed

319 # when discovering next gene.

320 else:

321 cont_loc = 'i'

322

323 # If not first gene of the contig, write the previous gene to .lst file

324 # The first gene will be written while reading the 2nd gene

325 if start != "-1" and end != "-1" and not crispr:

326 lstline = general.write_gene(feature_type, locus_num,

327 gene_name, product,

328 prev_cont_loc, genome,

329 prev_cont_num, ecnum, inf2,

330 db_xref, strand, start, end, lstf)

331

332 # Get new values for the next gene: start, end, strand and feature type

333 start, end, feature_type = elems

334 crispr = "CRISPR" in feature_type or "repeat_region" in feature_type

335 if crispr:

336 continue

337

338 # Get strain of gene

339 if int(end) < int(start):

340 start, end = end, start

341 strand = "C"

342 else:

343 strand = "D"

344 # Initialize variables for next feature

345 # (except start, end, strand and feature type that we just calculated).

346 # prev_cont_num = exp_cont_num

347 prev_cont_loc = cont_loc

348 prev_cont_num = contig_num

349 prev_cont_name = tbl_cont_name

350 locus_num = "NA"

351 gene_name = "NA"

352 product = "NA"

353 ecnum = "NA"

354 inf2 = "NA"

355 db_xref = "NA"

356

357 # Write last feature

358 if start != -1 and end != -1:

359 prev_cont_loc = "b"

360 general.write_gene(feature_type, locus_num, gene_name, product,

361 prev_cont_loc, genome, prev_cont_num,

362 ecnum, inf2, db_xref, strand, start, end, lstf)

363 return True

364

365

366def generate_gff(gpath, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs):

367 """

368 From the lstinfo file and contig names (retrieved from generation of Replicons files),

369 generate a gff file.

370

371 Format:

372

373 ##gff-version 3

374 ##sequence-region contig1 start end

375 ##sequence-region contig2 start end

376 ...

377 seqid(=contig) source type start end score strand phase attributes

378

379 All fields tab separated.

380 Empty fields contain '.'

381

382 For example:

383 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein

384

385

386 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95;

387

388

389 Parameters

390 ----------

391 gpath : str

392 path to the genome sequence given to prokka. Only used for error message

393 res-gff_file : str

394 path to the gff file that must be created in result database

395 res-lst_file : str

396 path to the lst file that was created in result database in the previous step

397 sizes : list

398 dict of contig names with their size. {"gembase1": "size", "gembase2":"size2" ...]

399 contigs : list

400 dict of contig original and gembase names. {"contig1": "gembase1"...}

401

402 Returns

403 -------

404 bool :

405 True if conversion worked well, False otherwise

406 """

407 # open gff generated by prokka to read it

408 # open file to write new gff file (in gff3 folder)

409 # open lst file (from LSTINFO folder) to read all annotation information saved

410 # from prokka results

411 with open(prokka_gff_file, "r") as prokf, open(res_lst_file, "r") as lstf, \

412 open(res_gff_file, "w") as gfff:

413 # Write headers of gff3 file

414 gfff.write("##gff-version 3\n")

415 # Write all sequences with their size. Order by name in gembase format

416 for old, new in sorted(contigs.items(), key=lambda items:items[1]):

417 end = sizes[new]

418 # Write the list of cobisntigs, with their size

419 gfff.write(f"##sequence-region\t{new}\t{1}\t{end}\n")

420

421 # Now, convert each line of prokka gff to gembase formatted gff line

422 for linegff in prokf:

423 # Ignore gff lines starting by #. For the new gff, these are already all written at the

424 # beginning of the file.

425 if linegff.startswith("#"):

426 continue

427 # We do not write sequences

428 if linegff.startswith('>'):

429 break

430 # Try to extract information on new line

431 try:

432 # Get all information from prokka gff line. Strip each of them as trailing whitespace

433 # can be hidden (leading to information from gff considered as different from

434 # information from lst)

435 fields_g = linegff.strip().split("\t")

436 fields_g = [info.strip() for info in fields_g]

437 # Ignore lines with sequence

438 # if len(fields_g) != 9:

439 # continue

440 (contig_name, source, type_g, start_g, end_g,

441 score, strand_g, phase, attributes) = fields_g

442 # Ignore CRISPR

443 if "CRISPR" in type_g or "repeat_region" in type_g:

444 continue

445 # Get information given to this same sequence from the lst file

446 # (next lst line corresponds to next gff line without #), as, for each format,

447 # there is 1 line per gene)

448 linelst = lstf.readline()

449 fields_l = linelst.split("\t")

450 fields_l = [info.strip() for info in fields_l]

451 start_l, end_l, strand_l, type_l, locus_l, l_gene, l_info = fields_l

452 # Get gff and ffn filenames to give information to user if error message

453 gff = os.path.basename(prokka_gff_file)

454 tbl = gff.replace(".gff", ".tbl")

455 # Path where gff and ffn generated by prodigal are

456 tmp = gpath + "-prokkaRes"

457 # Get gene name given by prodigal to current gene

458 gname = attributes.split("ID=")[1].split(";")[0]

459 # Get locus_tag given by prokka to current feature (should be the same as ID)

460 loc_name = attributes.split("locus_tag=")[1].split(";")[0]

461 if loc_name != gname:

462 logger.error(f"Problem in {gff}: ID={gname} whereas locus_tag={loc_name}.")

463 return False

464 # Compare information from lst and information from prodigal gff (start,

465 # end and type of feature). They should correspond

466 for (elemg, eleml, label) in zip([start_g, end_g, type_g],

467 [start_l, end_l, type_l],

468 ["start", "end", "type"]):

469 # If 1 element is different (start or end position, or type), print error

470 # message and return False: this genome could not be converted to gff

471 if elemg != eleml:

472 logger.error(f"Files {tbl} and {gff} (in prokka tmp_files: {tmp}) "

473 f"do not have the same {label} value for gene {gname} ({elemg} "

474 f"in gff, {eleml} in tbl)")

475 return False

476

477 # Replace prokka ID and locus_tag with the new gene name (in gembase format),

478 # found in the corresponding lst line

479 at_split = attributes.split(";")

480 newID = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split]

481 new = [atr if "locus_tag" not in atr else f'locus_tag={locus_l}' for atr in newID]

482

483 # Write new line of gff file

484 cname = contigs[contig_name]

485 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g,

486 phase, ";".join(new)])

487 gfff.write(info + "\n")

488 except:

489 logger.error(f"Wrong format for {prokka_gff_file}.")

490 return False

491 return True

492

493

494def create_gen(ffnseq, lstfile, genseq):

495 """

496 Generate .gen file, from sequences contained in .ffn, but changing the

497 headers using the information in .lst

498

499 Parameters

500 ----------

501 ffnseq : str

502 .ffn file generated by prokka

503 lstfile : str

504 lstfile converted from prokka tbl file

505 genseq : str

506 output file, to write in Genes directory

507 logger : logging.Logger

508 logger object to put information

509

510 Returns

511 -------

512 bool :

513 True if conversion went well, False otherwise

514 """

515 problem = False

516 write = True # Write next sequence

517 with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:

518 for line_ffn in ffn:

519 # Ignore gene that we do not want to write (should be a crispr)

520 # If line of sequence, write it as is, and go to next line

521 if not line_ffn.startswith(">"):

522 # We just read a seq line. If we can write (write is True), do it and go

523 # to next line

524 # Otherwise, just go to next line

525 if write:

526 gen.write(line_ffn)

527 continue

528 # Try to get gene ID. If does not work, ignore this gene (it may be a

529 # CRISPR, and we ignore them

530 test_gen_id = line_ffn.split()[0].split("_")[-1]

531 if not test_gen_id.isdigit():

532 # Maybe a CRISPR? Or wrong gene name? -> ignore

533 logger.log(utils.detail_lvl(),

534 f"Unknown header format for {line_ffn.strip()}. "

535 "This gene will be ignored in .gen output file.")

536 write = False

537 continue

538 # If ffn contains a gene header, find its information in lst file

539 else:

540 write = True

541 lstline = lst.readline().strip()

542 gen_id = int(test_gen_id)

543 # genID exists, ffn header is for a gene. Check that it corresponds to

544 # information in lst file.

545 id_lst = lstline.split("\t")[4].split("_")[-1]

546 gen_id_lst = int(id_lst)

547 # in lst, find the same gene ID as in ffn (some gene IDs in lst can be absent

548 # from ffn, if prokka do not give their sequence).

549 # As they are ordered by increasing number, go to next lstline until

550 # corresponding gene ID is found. However, if ffn ID > lst ID: ID does not

551 # exist in .lst -> problem.

552 while gen_id > gen_id_lst:

553 lstline = lst.readline().strip()

554 if not lstline:

555 gen_id_lst = "-1"

556 break

557 id_lst = lstline.split("\t")[4].split("_")[-1]

558 gen_id_lst = int(id_lst)

559 # If it found the same gene ID, write info in gene file

560 if gen_id == gen_id_lst:

561 general.write_header(lstline.strip(), gen)

562 # If gene ID of ffn not found, write error message and stop

563 else:

564 logger.error(f"Missing info for gene {line_ffn.strip()} "

565 f"(from {ffnseq}) in {lstfile}. If it is actually present "

566 "in the lst file, check that genes are ordered by increasing number in both lst and ffn files.")

567 return False

568 return True

569

570

571def create_prt(faaseq, lstfile, prtseq):

572 """

573 Generate .prt file, from sequences in .faa, but changing the headers

574 using information in .lst

575

576 **Note:** works if proteins are in increasing order (of number after "_" in their name)

577 in faa and tbl (hence lst) files.

578

579 If a header is not in the right format, or a protein exists in prt file but not in lstfile,

580 conversion is stopped, an error message is output, and prt file is removed.

581

582 Parameters

583 ----------

584 faaseq : str

585 faa file output of prokka

586 lstfile : str

587 lstinfo converted from prokka tab file

588 prtseq : str

589 output file where converted proteins must be saved

590

591 Returns

592 -------

593 bool :

594 True if conversion went well, False otherwise

595 """

596 problem = False

597 with open(faaseq) as faa, open(lstfile) as lst, open(prtseq, "w") as prt:

598 for line in faa:

599 # all header lines must start with PROKKA_<geneID>

600 if line.startswith(">"):

601 try:

602 # get gene ID

603 gen_id = int(line.split()[0].split("_")[-1])

604 except ValueError as err:

605 logger.error(f"Unknown header format {line.strip()} in {faaseq}. "

606 f"Gene ID is not a number.")

607 return False

608 gen_id_lst = 0

609 # get line of lst corresponding to the gene ID

610 lstline = ""

611 while gen_id > gen_id_lst:

612 lstline = lst.readline().strip()

613 id_lst = lstline.split("\t")[4].split("_")[-1]

614 # don't cast to int if info for a crispr

615 if id_lst.isdigit():

616 gen_id_lst = int(id_lst)

617 # check that gen_id is the same as the lst line

618 if gen_id == gen_id_lst:

619 general.write_header(lstline, prt)

620 else:

621 logger.error(f"Missing info for protein {line.strip()} (from {faaseq}) "

622 f"in {lstfile}. If it is actually present "

623 "in the lst file, check that proteins are ordered by increasing "

624 "number in both lst and faa files.")

625 return False

626 # not header: inside sequence, copy it to the .prt file

627 else:

628 prt.write(line)

629 return True

630

631

Coverage for PanACoTA/annotate_module/format_prokka.py: 100%

253 statements