Coverage for PanACoTA/annotate_module/format_prokka.py: 100%
253 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to:
39- convert prokka tbl file to our tab file
40- convert prokka ffn and faa headers to our format
41- Create the database, with the following folders in the given "res_path":
43 * Proteins: multifasta with all CDS in aa
44 * Replicons: multifasta of genome
45 * Genes: multifasta of all genes in nuc
46 * gff3: gff files without sequence
47 * LSTINFO: information on annotation. Columns are: "start end strand type locus\
48 gene_name | product | EC_number | inference2" with the same types as prokka file,\
49 and strain is C (complement) or D (direct). Locus is:\
50 `<genome_name>.<contig_num><i or b>_<protein_num>`
52@author gem
53April 2019
54"""
56import os
57import glob
58import sys
59import shutil
60import logging
61import PanACoTA.utils as utils
62from PanACoTA.annotate_module import general_format_functions as general
64logger = logging.getLogger("annotate.prokka_format")
67def format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, gene_dir,
68 rep_dir, gff_dir):
69 """
70 Format the given genome, and create its corresponding files in the following folders:
72 - Proteins
73 - Genes
74 - Replicons
75 - LSTINFO
76 - gff
78 Parameters
79 ----------
80 gpath : str
81 path to the genome sequence which was given to prokka for annotation
82 name : str
83 gembase name of the genome
84 prok_path : str
85 directory where prokka folders are saved
86 lst_dir : srt
87 path to LSTINFO folder
88 prot_dir : str
89 path to Proteins folder
90 gene_dir : str
91 path to Genes folder
92 rep_dir : str
93 path to Replicons folder
94 gff_dir : str
95 path to gff3 folder
97 Returns
98 -------
99 bool :
100 True if genome was correctly formatted, False otherwise
101 """
102 prokka_dir = os.path.join(prok_path, os.path.basename(gpath) + "-prokkaRes")
103 # Get needed Prokka result files
104 fna_file = glob.glob(os.path.join(prokka_dir, "*.fna"))[0]
105 prokka_tbl_file = glob.glob(os.path.join(prokka_dir, "*.tbl"))[0]
106 prokka_gff_file = glob.glob(os.path.join(prokka_dir, "*.gff"))[0]
107 prokka_ffn_file = glob.glob(os.path.join(prokka_dir, "*.ffn"))[0]
108 prokka_faa_file = glob.glob(os.path.join(prokka_dir, "*.faa"))[0]
110 # Define names for generated gembase files
111 res_lst_file = os.path.join(lst_dir, name + ".lst")
112 res_gff_file = os.path.join(gff_dir, name + ".gff")
113 res_gene_file = os.path.join(gene_dir, name + ".gen")
114 res_prt_file = os.path.join(prot_dir, name + ".prt")
115 res_rep_file = os.path.join(rep_dir, name + ".fna")
118 # Generate replicon file (same as input sequence but with gembase formatted headers). From
119 # this file, get contig names, to be used to generate gff file
120 contigs, sizes = utils.get_genome_contigs_and_rename(name, fna_file, res_rep_file, logger)
121 if not contigs:
122 try:
123 os.remove(res_rep_file)
124 os.remove(res_lst_file)
125 os.remove(res_gff_file)
126 os.remove(res_gene_file)
127 os.remove(res_prt_file)
128 except OSError:
129 pass
130 logger.error("Problems while generating Replicon file for {}".format(name))
131 return False
133 # Convert prokka tbl file to gembase .lst file format
134 ok_tbl = tbl2lst(prokka_tbl_file, res_lst_file, contigs, name, fna_file)
135 if not ok_tbl:
136 try:
137 os.remove(res_rep_file)
138 os.remove(res_lst_file)
139 os.remove(res_gff_file)
140 os.remove(res_gene_file)
141 os.remove(res_prt_file)
142 except OSError:
143 pass
144 logger.error("Problems while generating LSTINFO file for {}".format(name))
145 return False
146 # Create gff3 file for annotations
147 ok_gff = generate_gff(fna_file, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs)
148 if not ok_gff:
149 try:
150 os.remove(res_rep_file)
151 os.remove(res_lst_file)
152 os.remove(res_gff_file)
153 os.remove(res_gene_file)
154 os.remove(res_prt_file)
155 except OSError:
156 pass
157 logger.error("Problems while generating .gff file for {}".format(name))
158 return False
159 # create Genes file (and check no problem occurred with return code)
160 ok_gene = create_gen(prokka_ffn_file, res_lst_file, res_gene_file)
161 # If gene file not created because a problem occurred, return False:
162 # format did not run for this genome
163 if not ok_gene:
164 try:
165 os.remove(res_rep_file)
166 os.remove(res_lst_file)
167 os.remove(res_gff_file)
168 os.remove(res_gene_file)
169 os.remove(res_prt_file)
170 except OSError:
171 pass
172 logger.error("Problems while generating .gen file for {}".format(name))
173 return False
176 # If gene file was created, create Proteins file
177 ok_prt = create_prt(prokka_faa_file, res_lst_file, res_prt_file)
178 # If protein file not created, return False: format did not run for this genome
179 if not ok_prt:
180 try:
181 os.remove(res_lst_file)
182 os.remove(res_gff_file)
183 os.remove(res_gene_file)
184 os.remove(res_prt_file)
185 os.remove(res_rep_file)
186 # Remove twice to be able to check that when there is a problem while removing files,
187 # it generates the expected error
188 os.remove(res_rep_file)
189 except OSError:
190 pass
191 logger.error("Problems while generating .prt file for {}".format(name))
192 return False
193 return True
196def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
197 """
198 Read prokka tbl file, and convert it to the lst file.
200 * prokka tbl file format::
202 >Feature contig_name
203 start end type
204 [EC_number text]
205 [gene text]
206 inference ab initio prediction:Prodigal:2.6
207 [inference text]
208 locus_tag test
209 product text
211 where `type` can be CDS, tRNA, rRNA, etc ...
212 lines between [] are not always present
214 * lst file format::
216 start end strand type locus gene_name | product | EC_number | inference 2 | db_xref
218 with the same types as prokka file, and strain is C (complement) or D (direct)
219 locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`
221 Parameters
222 ----------
223 tblfile : str
224 name of prokka output tbl file to read
225 lstfile : str
226 name of lst file to generate
227 contigs : dict
228 {original_contig_name: gembase_contig_name}
229 genome : str
230 genome name (gembase format)
231 gpath : str
232 path to the genome given to prodigal. Only used for error message
233 changed_name : bool
234 True if contig names have been changed (cutn != 0) -> contig names end by '_num',
235 False otherwise.
237 Returns
238 -------
239 bool :
240 True if genome name used in lstfile and prokka tblfile are the same, False otherwise
241 """
242 # Protein localisation in contig (b = border ; i = inside)
243 cont_loc = "b"
244 prev_cont_loc = "b"
245 # Current contig number. Used to compare with new one, to know if protein is
246 # inside or at the border of a contig
247 prev_cont_num = -1
248 prev_cont_name = ""
249 tbl_cont_name = ""
250 tbl_cont_num = 0 # Contig number in tbl file (<orig_name>_<cont_num>)
251 lst_cont_num = 0 # Exp cont num: new feature means exp_cont_num += 1
252 # prev_cont_num = -1 # Previous contig number seen
253 # Information on current feature. At the beginning, everything empty, no information
254 gene_name = "NA"
255 locus_num = "NA"
256 product = "NA"
257 ecnum = "NA"
258 inf2 = "NA"
259 db_xref = "NA"
260 start = "-1"
261 end = "-1"
262 strand = "D"
263 # Feature type (CDS, tRNA...)
264 feature_type = ""
266 # Check that tblfile is not empty
267 if os.stat(tblfile).st_size == 0:
268 logger.error(f"{tblfile} is empty.")
269 return False
271 with open(tblfile, "r") as tblf, open(lstfile, "w") as lstf:
272 for line in tblf:
273 elems = line.strip().split("\t")
274 if line.startswith(">Feature"):
275 tbl_cont_name = line.split()[-1]
276 else:
277 if not tbl_cont_name:
278 logger.error(f"Wrong format for {tblfile}.")
279 return False
280 # Get line type, and retrieve info according to it
281 # If it is not the line with start, end, type, there are only 2 elements
282 # in the line:
283 # - information_type information_value
284 if len(elems) == 2:
285 if "locus_tag" in elems[0]:
286 locus_num = elems[-1].split("_")[-1]
287 if "gene" in elems[0]:
288 gene_name = elems[1]
289 if "product" in elems[0]:
290 product = elems[1]
291 if "EC_number" in elems[0]:
292 ecnum = elems[1]
293 if "inference" in elems[0] and "ab initio prediction:Prodigal" not in line:
294 inf2 = elems[1]
295 if "db_xref" in elems[0]:
296 db_xref = elems[1]
297 # Information on start, end, type
298 else:
299 # new gene
300 # if new gene is not on the same contig as previously,
301 # get new contig and loc = 'b'
302 if tbl_cont_name != prev_cont_name:
303 # Check that this contig name is in the list, and get its gembase contig
304 # number
305 if tbl_cont_name in contigs:
306 contig_num = contigs[tbl_cont_name].split(".")[-1]
307 # if not in the list, problem, return false
308 else:
309 logger.error(f"'{tbl_cont_name}' found in {tblfile} does not exist in "
310 f"{gpath}.")
311 return False
312 # Previous loc was 'i' (because we were in the same contig as the
313 # previous one). But now, we know the it was the last gene of
314 # its contig: we change loc to 'b'
315 prev_cont_loc = "b"
316 cont_loc = "b"
317 # Same contig as previously: this gene is inside the contig (cont_loc = "i").
318 # If, in fact, it was the last gene of this contig, it will be changed
319 # when discovering next gene.
320 else:
321 cont_loc = 'i'
323 # If not first gene of the contig, write the previous gene to .lst file
324 # The first gene will be written while reading the 2nd gene
325 if start != "-1" and end != "-1" and not crispr:
326 lstline = general.write_gene(feature_type, locus_num,
327 gene_name, product,
328 prev_cont_loc, genome,
329 prev_cont_num, ecnum, inf2,
330 db_xref, strand, start, end, lstf)
332 # Get new values for the next gene: start, end, strand and feature type
333 start, end, feature_type = elems
334 crispr = "CRISPR" in feature_type or "repeat_region" in feature_type
335 if crispr:
336 continue
338 # Get strain of gene
339 if int(end) < int(start):
340 start, end = end, start
341 strand = "C"
342 else:
343 strand = "D"
344 # Initialize variables for next feature
345 # (except start, end, strand and feature type that we just calculated).
346 # prev_cont_num = exp_cont_num
347 prev_cont_loc = cont_loc
348 prev_cont_num = contig_num
349 prev_cont_name = tbl_cont_name
350 locus_num = "NA"
351 gene_name = "NA"
352 product = "NA"
353 ecnum = "NA"
354 inf2 = "NA"
355 db_xref = "NA"
357 # Write last feature
358 if start != -1 and end != -1:
359 prev_cont_loc = "b"
360 general.write_gene(feature_type, locus_num, gene_name, product,
361 prev_cont_loc, genome, prev_cont_num,
362 ecnum, inf2, db_xref, strand, start, end, lstf)
363 return True
366def generate_gff(gpath, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs):
367 """
368 From the lstinfo file and contig names (retrieved from generation of Replicons files),
369 generate a gff file.
371 Format:
373 ##gff-version 3
374 ##sequence-region contig1 start end
375 ##sequence-region contig2 start end
376 ...
377 seqid(=contig) source type start end score strand phase attributes
379 All fields tab separated.
380 Empty fields contain '.'
382 For example:
383 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein
386 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95;
389 Parameters
390 ----------
391 gpath : str
392 path to the genome sequence given to prokka. Only used for error message
393 res-gff_file : str
394 path to the gff file that must be created in result database
395 res-lst_file : str
396 path to the lst file that was created in result database in the previous step
397 sizes : list
398 dict of contig names with their size. {"gembase1": "size", "gembase2":"size2" ...]
399 contigs : list
400 dict of contig original and gembase names. {"contig1": "gembase1"...}
402 Returns
403 -------
404 bool :
405 True if conversion worked well, False otherwise
406 """
407 # open gff generated by prokka to read it
408 # open file to write new gff file (in gff3 folder)
409 # open lst file (from LSTINFO folder) to read all annotation information saved
410 # from prokka results
411 with open(prokka_gff_file, "r") as prokf, open(res_lst_file, "r") as lstf, \
412 open(res_gff_file, "w") as gfff:
413 # Write headers of gff3 file
414 gfff.write("##gff-version 3\n")
415 # Write all sequences with their size. Order by name in gembase format
416 for old, new in sorted(contigs.items(), key=lambda items:items[1]):
417 end = sizes[new]
418 # Write the list of cobisntigs, with their size
419 gfff.write(f"##sequence-region\t{new}\t{1}\t{end}\n")
421 # Now, convert each line of prokka gff to gembase formatted gff line
422 for linegff in prokf:
423 # Ignore gff lines starting by #. For the new gff, these are already all written at the
424 # beginning of the file.
425 if linegff.startswith("#"):
426 continue
427 # We do not write sequences
428 if linegff.startswith('>'):
429 break
430 # Try to extract information on new line
431 try:
432 # Get all information from prokka gff line. Strip each of them as trailing whitespace
433 # can be hidden (leading to information from gff considered as different from
434 # information from lst)
435 fields_g = linegff.strip().split("\t")
436 fields_g = [info.strip() for info in fields_g]
437 # Ignore lines with sequence
438 # if len(fields_g) != 9:
439 # continue
440 (contig_name, source, type_g, start_g, end_g,
441 score, strand_g, phase, attributes) = fields_g
442 # Ignore CRISPR
443 if "CRISPR" in type_g or "repeat_region" in type_g:
444 continue
445 # Get information given to this same sequence from the lst file
446 # (next lst line corresponds to next gff line without #), as, for each format,
447 # there is 1 line per gene)
448 linelst = lstf.readline()
449 fields_l = linelst.split("\t")
450 fields_l = [info.strip() for info in fields_l]
451 start_l, end_l, strand_l, type_l, locus_l, l_gene, l_info = fields_l
452 # Get gff and ffn filenames to give information to user if error message
453 gff = os.path.basename(prokka_gff_file)
454 tbl = gff.replace(".gff", ".tbl")
455 # Path where gff and ffn generated by prodigal are
456 tmp = gpath + "-prokkaRes"
457 # Get gene name given by prodigal to current gene
458 gname = attributes.split("ID=")[1].split(";")[0]
459 # Get locus_tag given by prokka to current feature (should be the same as ID)
460 loc_name = attributes.split("locus_tag=")[1].split(";")[0]
461 if loc_name != gname:
462 logger.error(f"Problem in {gff}: ID={gname} whereas locus_tag={loc_name}.")
463 return False
464 # Compare information from lst and information from prodigal gff (start,
465 # end and type of feature). They should correspond
466 for (elemg, eleml, label) in zip([start_g, end_g, type_g],
467 [start_l, end_l, type_l],
468 ["start", "end", "type"]):
469 # If 1 element is different (start or end position, or type), print error
470 # message and return False: this genome could not be converted to gff
471 if elemg != eleml:
472 logger.error(f"Files {tbl} and {gff} (in prokka tmp_files: {tmp}) "
473 f"do not have the same {label} value for gene {gname} ({elemg} "
474 f"in gff, {eleml} in tbl)")
475 return False
477 # Replace prokka ID and locus_tag with the new gene name (in gembase format),
478 # found in the corresponding lst line
479 at_split = attributes.split(";")
480 newID = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split]
481 new = [atr if "locus_tag" not in atr else f'locus_tag={locus_l}' for atr in newID]
483 # Write new line of gff file
484 cname = contigs[contig_name]
485 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g,
486 phase, ";".join(new)])
487 gfff.write(info + "\n")
488 except:
489 logger.error(f"Wrong format for {prokka_gff_file}.")
490 return False
491 return True
494def create_gen(ffnseq, lstfile, genseq):
495 """
496 Generate .gen file, from sequences contained in .ffn, but changing the
497 headers using the information in .lst
499 Parameters
500 ----------
501 ffnseq : str
502 .ffn file generated by prokka
503 lstfile : str
504 lstfile converted from prokka tbl file
505 genseq : str
506 output file, to write in Genes directory
507 logger : logging.Logger
508 logger object to put information
510 Returns
511 -------
512 bool :
513 True if conversion went well, False otherwise
514 """
515 problem = False
516 write = True # Write next sequence
517 with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
518 for line_ffn in ffn:
519 # Ignore gene that we do not want to write (should be a crispr)
520 # If line of sequence, write it as is, and go to next line
521 if not line_ffn.startswith(">"):
522 # We just read a seq line. If we can write (write is True), do it and go
523 # to next line
524 # Otherwise, just go to next line
525 if write:
526 gen.write(line_ffn)
527 continue
528 # Try to get gene ID. If does not work, ignore this gene (it may be a
529 # CRISPR, and we ignore them
530 test_gen_id = line_ffn.split()[0].split("_")[-1]
531 if not test_gen_id.isdigit():
532 # Maybe a CRISPR? Or wrong gene name? -> ignore
533 logger.log(utils.detail_lvl(),
534 f"Unknown header format for {line_ffn.strip()}. "
535 "This gene will be ignored in .gen output file.")
536 write = False
537 continue
538 # If ffn contains a gene header, find its information in lst file
539 else:
540 write = True
541 lstline = lst.readline().strip()
542 gen_id = int(test_gen_id)
543 # genID exists, ffn header is for a gene. Check that it corresponds to
544 # information in lst file.
545 id_lst = lstline.split("\t")[4].split("_")[-1]
546 gen_id_lst = int(id_lst)
547 # in lst, find the same gene ID as in ffn (some gene IDs in lst can be absent
548 # from ffn, if prokka do not give their sequence).
549 # As they are ordered by increasing number, go to next lstline until
550 # corresponding gene ID is found. However, if ffn ID > lst ID: ID does not
551 # exist in .lst -> problem.
552 while gen_id > gen_id_lst:
553 lstline = lst.readline().strip()
554 if not lstline:
555 gen_id_lst = "-1"
556 break
557 id_lst = lstline.split("\t")[4].split("_")[-1]
558 gen_id_lst = int(id_lst)
559 # If it found the same gene ID, write info in gene file
560 if gen_id == gen_id_lst:
561 general.write_header(lstline.strip(), gen)
562 # If gene ID of ffn not found, write error message and stop
563 else:
564 logger.error(f"Missing info for gene {line_ffn.strip()} "
565 f"(from {ffnseq}) in {lstfile}. If it is actually present "
566 "in the lst file, check that genes are ordered by increasing number in both lst and ffn files.")
567 return False
568 return True
571def create_prt(faaseq, lstfile, prtseq):
572 """
573 Generate .prt file, from sequences in .faa, but changing the headers
574 using information in .lst
576 **Note:** works if proteins are in increasing order (of number after "_" in their name)
577 in faa and tbl (hence lst) files.
579 If a header is not in the right format, or a protein exists in prt file but not in lstfile,
580 conversion is stopped, an error message is output, and prt file is removed.
582 Parameters
583 ----------
584 faaseq : str
585 faa file output of prokka
586 lstfile : str
587 lstinfo converted from prokka tab file
588 prtseq : str
589 output file where converted proteins must be saved
591 Returns
592 -------
593 bool :
594 True if conversion went well, False otherwise
595 """
596 problem = False
597 with open(faaseq) as faa, open(lstfile) as lst, open(prtseq, "w") as prt:
598 for line in faa:
599 # all header lines must start with PROKKA_<geneID>
600 if line.startswith(">"):
601 try:
602 # get gene ID
603 gen_id = int(line.split()[0].split("_")[-1])
604 except ValueError as err:
605 logger.error(f"Unknown header format {line.strip()} in {faaseq}. "
606 f"Gene ID is not a number.")
607 return False
608 gen_id_lst = 0
609 # get line of lst corresponding to the gene ID
610 lstline = ""
611 while gen_id > gen_id_lst:
612 lstline = lst.readline().strip()
613 id_lst = lstline.split("\t")[4].split("_")[-1]
614 # don't cast to int if info for a crispr
615 if id_lst.isdigit():
616 gen_id_lst = int(id_lst)
617 # check that gen_id is the same as the lst line
618 if gen_id == gen_id_lst:
619 general.write_header(lstline, prt)
620 else:
621 logger.error(f"Missing info for protein {line.strip()} (from {faaseq}) "
622 f"in {lstfile}. If it is actually present "
623 "in the lst file, check that proteins are ordered by increasing "
624 "number in both lst and faa files.")
625 return False
626 # not header: inside sequence, copy it to the .prt file
627 else:
628 prt.write(line)
629 return True