Coverage for PanACoTA/annotate_module/format_prodigal.py: 100%
180 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to convert prodigal result files to gembase format.
39 * Proteins: multifasta with all CDS in aa
40 * Replicons: (multi)fasta of genome sequences
41 * Genes: multifasta of all genes in nuc
42 * gff3: gff files without sequence
43 * LSTINFO: information on annotation. Columns are: "start end strand type locus
44 gene_name | product | EC_number | inference2" and strain is C (complement) or D (direct).
45 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`
46 For prodigal: "start end strand type locus NA | NA | NA | NA", as there is no
47 functional annotation.
49@author gem
50July 2019
51"""
53import os
54import shutil
55import glob
56import logging
58import PanACoTA.utils as utils
59from PanACoTA.annotate_module import general_format_functions as gfunc
61logger = logging.getLogger("annotate.prodigal_format")
64def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
65 rep_dir, gff_dir):
66 """
67 Format the given genome, and create its corresponding files in the following folders:
69 - Proteins
70 - Genes
71 - Replicons
72 - LSTINFO
73 - gff
75 Parameters
76 ----------
77 gpath : str
78 path to the genome sequence which was given to prodigal for annotation
79 name : str
80 gembase name of the genome
81 prod_path : str
82 directory where all tmp_files for all sequences are saved (sequences cut at each set
83 of 5N, prodigal results and logs)
84 lst_dir : str
85 path to LSTINFO folder
86 prot_dir : str
87 path to Proteins folder
88 gene_dir : str
89 path to Genes folder
90 rep_dir : str
91 path to Replicons folder
92 gff_dir : str
93 path to gff3 folder
95 Returns
96 -------
97 bool :
98 True if genome was correctly formatted, False otherwise
99 """
100 # Get directory where prodigal results for the current genome are saved
101 prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")
103 # Get prodigal result files
104 prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0]
105 gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0]
106 gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0]
108 # Define names for generated gembase files
109 res_prot_file = os.path.join(prot_dir, name + ".prt")
110 res_gene_file = os.path.join(gene_dir, name + ".gen")
111 res_lst_file = os.path.join(lst_dir, name + ".lst")
112 res_rep_file = os.path.join(rep_dir, name + ".fna")
113 res_gff_file = os.path.join(gff_dir, name + ".gff")
115 # Generate replicon file (same as input sequence but with gembase formatted headers). From
116 # this file, get contig names, to be used to generate gff file
117 contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file, logger)
118 if not contigs:
119 try:
120 os.remove(res_rep_file)
121 os.remove(res_lst_file)
122 os.remove(res_gff_file)
123 os.remove(res_gene_file)
124 os.remove(res_prot_file)
125 except OSError:
126 pass
127 logger.error("Problems while generating Replicon file for {}".format(name))
128 return False
130 # First, create .gen and .lst files. If they could not be formatted,
131 # remove those files, and return False with error message
132 ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name)
133 if not ok:
134 try:
135 os.remove(res_rep_file)
136 os.remove(res_gene_file)
137 os.remove(res_lst_file)
138 os.remove(res_gff_file)
139 os.remove(res_prot_file)
140 except OSError:
141 pass
142 logger.error(f"Problems while generating .gen and .lst files for {name}")
143 return False
145 # Create gff files.
146 ok = create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes)
147 # If problem while formatting the genome (rep or gff file), remove all
148 # already created files, and return False (genome not formatted) with error message.
149 if not ok:
150 try:
151 os.remove(res_gene_file)
152 os.remove(res_lst_file)
153 os.remove(res_rep_file)
154 os.remove(res_gff_file)
155 os.remove(res_prot_file)
156 except OSError:
157 pass
158 logger.error("Problems while generating .gff (gff3 folder) "
159 "file for {}".format(name))
160 return False
162 # Generate .prt files (in Proteins directory)
163 ok = create_prt(prot_file, res_prot_file, res_lst_file)
164 # If problem while formatting prt file, return False, delete all generated
165 # formatted files, and write an error message to user.
166 if not ok:
167 try:
168 os.remove(res_gene_file)
169 os.remove(res_lst_file)
170 os.remove(res_rep_file)
171 os.remove(res_gff_file)
172 os.remove(res_prot_file)
173 except OSError: # pragma: no cover
174 pass
175 logger.error("Problems while generating .prt file (Proteins folder) "
176 "for {}".format(name))
177 return False
178 return ok
181def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
182 """
183 Generate .gen file, from sequences contained in .ffn, but changing the
184 headers to match with gembase format.
185 At the same time, generate .lst file, from the information given in prodigal ffn headers
187 Parameters
188 ----------
189 contigs : dict
190 {original_contig_name: gembase_contig_name}
191 gen_file : str
192 .ffn file generated by prodigal
193 res_gen_file : str
194 generated .gen file, to write in Genes directory
195 res_lst_file : str
196 generated .lst file to write in LSTINFO directory
197 gpath : str
198 path to the genome given to prodigal. Only used for error message
199 name : str
200 gembase name of the genome to format
201 logger : logging.Logger
202 logger object to put information
204 Returns
205 -------
206 bool :
207 True if conversion went well, False otherwise
208 """
209 # Variable which will contain the current gene sequence
210 seq = ""
211 # number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not
212 # re-start at 1 for each new contig)
213 locus_num = 0
214 # contig name of the last gene. To check if we are now in a new contig (-> loc = b) or not
215 prev_cont_name = ""
216 # Previous ontig number: contig number to use in gembase format
217 prev_cont_num = 0
218 contig_num = 0
219 # Keep start, end, strand and informations (prodigal gives information on start_type,
220 # gc_cont etc.) from the previous gene, before overwriting it with information
221 # on the new one
222 prev_start = ""
223 prev_end = ""
224 prev_strand = ""
225 prev_info = ""
226 # Update loc when contig changes ('b' if gene at the border of a contig, 'i' if it is inside)
227 prev_loc = "b"
228 # To start, the first gene is, by definition, at the border of the contig
229 loc = "b"
230 # Open files: .ffn prodigal to read, .gen and .lst gembase to create
231 with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
232 open(res_lst_file, "w") as r_lst:
233 # Read all lines in ffn file (sequences in nuc. for each gene)
234 for lineffn in ffn:
235 # If it is a sequence, save it and go to next line
236 if not lineffn.startswith(">"):
237 seq += lineffn
238 continue
239 # Otherwise:
240 # - write header of previous sequence to .gen
241 # - write previous sequence (in 'seq') to .gen
242 # - write LSTINFO information to .lst
243 # - update information (new start, end, contig number etc.) for next gene
244 else:
245 # Get information given for the new gene (by .ffn file from prodigal)
246 (gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#")
247 # Get contig number from prodigal gene header: prodigal first part of header is:
248 # <original genome name contig name>_<protein number>
249 contig_name = gname.strip().split("_")
250 if len(contig_name) > 1:
251 contig_name = "_".join(contig_name[:-1])
252 else:
253 contig_name = contig_name[0]
254 # If new contig:
255 # - previous gene was the last of its contig -> prev_loc = "b" ;
256 # - the current gene is the first of its contig (loc = "b")
257 # - we must increment the contig number
258 if contig_name != prev_cont_name:
259 # Check that this contig name is in the list, and get its gembase contig number
260 if contig_name in contigs:
261 contig_num = contigs[contig_name].split(".")[-1]
262 # if not in the list, problem, return false
263 else:
264 logger.error(f"'{contig_name}' found in {gen_file} does not exist in "
265 f"{gpath}.")
266 return False
267 prev_loc = 'b'
268 loc = 'b'
269 # If not new contig. If prev_loc == 'b', previous gene is the first protein
270 # of this contig.
271 # Current gene will be inside the contig (except if new contig for the next gene,
272 # meaning only 1 gene in the contig)
273 else:
274 loc = 'i'
276 # If it is not the first gene of the genome, write previous gene information
277 if prev_start != "":
278 # Write line in LSTINFO file, + header and sequence to the gene file
279 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
280 prev_loc, name, prev_cont_num, "NA", prev_info,
281 "NA", prev_strand, prev_start, prev_end, r_lst)
282 gfunc.write_header(lstline, r_gen)
283 r_gen.write(seq)
284 # -> get new information, save it for the next gene, and go to next line
285 # Strands are 1/-1 in prodigal, while we use D,C -> convert, so that next time
286 # we find a new gene, it writes this before updating for this new gene
287 if int(strand) == 1:
288 strand = "D"
289 else:
290 strand = "C"
291 # Prepare variables for next gene
292 locus_num += 1
293 seq = ""
294 prev_cont_num = contig_num
295 prev_cont_name = contig_name
296 prev_start = start
297 prev_end = end
298 prev_strand = strand
299 prev_loc = loc
300 prev_info = info
301 # Write last gene of the genome (-> loc = 'b'),
302 # Just check that there was at least 1 gene found (prev_start != "").
303 # Otherwise, nothing to write
304 if prev_start != "":
305 prev_loc = "b"
306 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
307 prev_loc, name, prev_cont_num, "NA", prev_info, "NA",
308 prev_strand, prev_start, prev_end, r_lst)
309 gfunc.write_header(lstline, r_gen)
310 r_gen.write(seq)
311 return True
314def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes):
315 """
316 Create .gff3 file.
318 Format:
320 ##gff-version 3
321 ##sequence-region contig1 start end
322 ##sequence-region contig2 start end
323 ...
324 seqid(=contig) source type start end score strand phase attributes
326 All fields tab separated.
327 Empty fields contain '.'
329 For example:
330 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein
333 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95;
335 Parameters
336 ----------
337 gpath : str
338 path to the genome sequence given to prodigal. Only used for the error message
339 gff_file : str
340 path to gff file generated by prodigal
341 res-gff_file : str
342 path to the gff file that must be created in result database
343 res-lst_file : str
344 path to the lst file that was created in result database in the previous step
345 contigs : dict
346 dict of contig names with their size. ["original_name": "gembase_name"]
347 sizes : dict
348 dict of contig gembase names with their sizes {"gembase_name": size}
350 Returns
351 -------
352 bool
353 True if everything went well, False if any problem
355 """
356 # open gff generated by prodigal to read it
357 # open file to write new gff file
358 # open lst file to read all information saved from prodigal results
359 with open(gff_file, 'r') as gf, open(res_gff_file, "w") as rgf, open(res_lst_file, "r") as rlf:
360 # Write headers of gff3 file
361 rgf.write("##gff-version 3\n")
362 for ori_name, new_name in contigs.items():
363 # Write the list of contigs, with their size
364 end = sizes[new_name]
365 rgf.write(f"##sequence-region\t{new_name}\t1\t{end}\n")
367 # Now, convert each line of prodigal gff to gembase formatted gff line
368 for linegff in gf:
369 # Ignore gff lines starting by #. For the new gff, these are already all written at the
370 # beginning of the file.
371 if linegff.startswith("#"):
372 continue
374 # We do not write the sequences
375 if linegff.startswith(">"):
376 break
378 # Get all information from prodigal gff line. Strip each of them as trailing whitespace
379 # can be hidden (leading to information from gff considered as different from
380 # information from lst)
381 fields_g = linegff.split("\t")
382 fields_g = [info.strip() for info in fields_g]
383 (contig_name, source, type_g, start_g, end_g,
384 score, strand_g, phase, attributes) = fields_g
385 # Get information given to this same sequence from the lst file
386 # (next lst line corresponds to next gff line without #), as, for each format,
387 # there is 1 line per gene)
388 linelst = rlf.readline()
389 fields_l = linelst.split("\t")
390 fields_l = [info.strip() for info in fields_l]
391 start_l, end_l, strand_l, type_l, locus_l, _, _ = fields_l
394 # Get gff and ffn filenames to give information to user if error message
395 gff = os.path.basename(gff_file)
396 ffn = ".".join(gff.split(".")[:-1]) + ".ffn"
397 # Path where gff and ffn generated by prodigal are
398 tmp = gpath + "-prodigalRes"
399 # Get gene name given by prodigal to current gene
400 gname = attributes.split("ID=")[1].split(";")[0]
402 # Compare information from lst and information from prodigal gff (start,
403 # end and type of feature). They should correspond
404 for (elemg, eleml, label) in zip([start_g, end_g, type_g],
405 [start_l, end_l, type_l],
406 ["start", "end", "type"]):
407 # If 1 element is different (start or end position, or type), print error
408 # message and return False: this genome could not be converted to gff
409 if elemg != eleml:
410 logger.error(f"Files {ffn} and {gff} (in prodigal tmp_files: {tmp}) do not have "
411 f"the same {label} value for gene {gname} ({elemg} in gff, {eleml} in "
412 "ffn))")
413 return False
415 # Replace prodigal ID with the new gene name (in gembase format), found in the
416 # corresponding lst line
417 at_split = attributes.split(";")
418 new = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split]
420 # Write new line of gff file
421 # Write new contig name
422 cname = contigs[contig_name]
423 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g,
424 phase, ";".join(new)])
425 rgf.write(info + "\n")
426 return True
429def create_prt(prot_file, res_prot_file, res_lst_file):
430 """
431 Generate .prt file (gembase formatted gene names), from features contained in .lst file generated just before.
433 Parameters
434 ----------
435 prot_file : str
436 .faa file generated by prodigal
437 res_prot_file : str
438 output file, to write in Proteins directory
439 res_lst_file : str
440 .lst file to get all gene names in gembase format instead of re-generating them
441 Returns
442 -------
443 bool :
444 True if conversion went well, False otherwise
445 """
447 # Open:
448 # - prot file to read gene sequences from prodigal results
449 # - res_prot file to write sequences with gembase headers
450 # - res_lst_file to get gene gembase names and other infos (strand, size...)
452 with open(prot_file, "r") as faa, open(res_prot_file, "w") as r_prt,\
453 open(res_lst_file, "r") as r_lst:
454 # Read prt file generated by prodigal
455 for lineprot in faa:
456 # If protein sequence, write it
457 if not lineprot.startswith(">"):
458 r_prt.write(lineprot)
459 continue
460 # If header, replace by gembase header
461 # For that, get next lst line (corresponding to next protein,
462 # as there is 1 protein per line in .lst -> 1 protein per header in .prt)
463 linelst = r_lst.readline().strip()
464 # Try to get info from lstline.
465 # If lstline empty, it means that the current protein
466 # is missing from lst file. We already read the last protein of lst file.
467 if linelst != '':
468 # If not empty, check lst format, return False if not right format
469 try:
470 # If ok, gembase name is in the fifth column of lst file
471 start, end, _, _, gem_name, _, _ = linelst.split("\t")
472 except ValueError:
473 logger.error("Problem in format of lstline ({})".format(linelst))
474 return False
475 else:
476 logger.error("No more protein in lst file. We cannot get information on this "
477 "protein ({})! Check that you do not have more proteins than genes "
478 "in prodigal results".format(lineprot.strip()))
479 return False
480 # Write this gembase name as a new header
481 # Size of protein sequence is the third of gene sequence. Check that it is an int.
482 try :
483 size_gen = (int(end) - int(start) + 1)
484 except ValueError:
485 logger.error("Start and/or end of protein {} position is not a number (start "
486 "= {}; end = {})".format(gem_name, start, end))
487 return False
488 # Find size of protein in number of aa
489 # If number of nucleotides in gene cannot be divided by 3 to get number of amino-acids, there is a problem with this protein: return False to ignore this genome
490 size_prot = size_gen/3
491 if int(size_prot) != size_prot:
492 logger.error("Gene {} has a number of nucleotides ({}) that is not divisible "
493 "by 3.".format(gem_name, size_gen))
494 return False
495 gfunc.write_header(linelst, r_prt)
496 # new_header = "\t".join([gem_name, str(int(size_prot)), product, info])
497 # r_prt.write(">" + new_header + "\n")
498 # Check that there are no more proteins in lst than in this prt file
499 linelst = r_lst.readline()
500 if linelst.strip() != '':
501 gem_name = linelst.strip().split("\t")[4]
502 logger.error("Protein {} is in .lst file but its sequence is not in the protein "
503 "file generated by prodigal.".format(gem_name))
504 return False
505 return True