Coverage for PanACoTA/annotate_module/general_format_functions.py: 100%
77 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to convert prokka or prodigal results to gembase format:
39 * Proteins: multifasta with all CDS in aa
40 * Replicons: multifasta of genome
41 * Genes: multifasta of all genes in nuc
42 * gff3: gff files without sequence
43 * LSTINFO:
44 - if annotated by prokka: information on annotation. Columns are:
45 "start end strand type locus gene_name | product | EC_number | inference2"
46 with the same types as prokka file, and strain is C (complement) or D (direct).
47 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`
48 - if annotated by prodigal
50@author gem
51May 2019
52"""
54import os
55import logging
56import logging.handlers
57import progressbar
58import multiprocessing
59import threading
60import PanACoTA.utils as utils
61from PanACoTA.annotate_module import format_prokka as fprokka
62from PanACoTA.annotate_module import format_prodigal as fprodigal
65main_logger = logging.getLogger("annotate.geneffunc")
68def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False):
69 """
70 For all genomes which were annotated (by prokka or prodigal), reformat them
71 in order to have, in 'res_path', the following folders:
73 * LSTINFO: containing a .lst file for each genome, with all genes
74 * Replicons: containing all multifasta files
75 * Genes: containing 1 multi-fasta per genome, with all its genes in nuc
76 * Proteins: containing 1 multi-fasta per genome, with all its proteins in aa
77 * gff: containing all gff files
79 Parameters
80 ----------
81 genomes_ok : dict
82 genomes to format (annotation was OK) ->
83 {genome: [name, gpath, to_annot, size, nbcont, l90]}
84 res_path : str
85 path to folder where the 4 directories must be created
86 annot_path : str
87 path to folder containing "<genome_name>-[prokka, prodigal]Res" where all prokka/prodigal
88 results are saved.
89 prodigal_only: True if it was annotated by prodigal, False if annotated by prokka
90 threads : int
91 number of threads to use to while formatting genomes
92 quiet : bool
93 True if nothing must be sent to stderr/stdout, False otherwise
95 Returns
96 -------
97 skipped_format : list
99 list of genomes skipped because they had a problem in format step
100 """
101 main_logger.info("Formatting all genomes")
102 lst_dir = os.path.join(res_path, "LSTINFO")
103 prot_dir = os.path.join(res_path, "Proteins")
104 gene_dir = os.path.join(res_path, "Genes")
105 rep_dir = os.path.join(res_path, "Replicons")
106 gff_dir = os.path.join(res_path, "gff3")
107 os.makedirs(lst_dir, exist_ok=True)
108 os.makedirs(prot_dir, exist_ok=True)
109 os.makedirs(gene_dir, exist_ok=True)
110 os.makedirs(rep_dir, exist_ok=True)
111 os.makedirs(gff_dir, exist_ok=True)
113 # If this function goes until here, it means that there is at least 1 genome to annotate
114 nbgen = len(genomes_ok)
115 bar = None
116 if not quiet:
117 # Create progressbar
118 widgets = ['Formatting genomes: ',
119 progressbar.Bar(marker='█', left='', right=''),
120 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
121 progressbar.Percentage(), ") - ", progressbar.Timer()]
122 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()
123 # Create a Queue to put logs from processes, and handle them after from a single thread
124 m = multiprocessing.Manager()
125 q = m.Queue()
127 # if at least 1 genome ok, try to format it
128 # arguments for 'handle_genome' function:
129 # (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir,
130 # gff_dir, results, prodigal_only, q)
131 params = [(genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir,
132 rep_dir, gff_dir, prodigal_only, q)
133 for genome, (name, _, gpath, _, _, _) in genomes_ok.items()]
135 # Create pool and launch parallel formating steps
136 pool = multiprocessing.Pool(threads)
137 final = pool.map_async(handle_genome, params, chunksize=1)
138 pool.close()
139 lp = threading.Thread(target=utils.logger_thread, args=(q,))
140 lp.start()
141 if not quiet:
142 while True:
143 if final.ready():
144 break
145 remaining = final._number_left
146 bar.update(nbgen - remaining)
147 bar.finish()
148 pool.join()
149 q.put(None)
150 lp.join()
151 res = final.get()
153 # List of genomes for which format step had problems: will be filled after
154 skipped_format = []
155 for output in res:
156 if not output[0]:
157 skipped_format.append(output[1])
158 return skipped_format
161def handle_genome(args):
162 """
163 For a given genome, check if it has been annotated (in results), if annotation
164 (by prokka or prodigal) ran without problems (result = True). In that case,
165 format the genome and get the output to see if everything went ok.
167 Parameters
168 ----------
169 args : tuple
170 (genome, name, gpath, prok_path, lst_dir, prot_dir,\
171 gene_dir, rep_dir, gff_dir, results, q)\
172 with:
174 * genome : original genome name
175 * name : gembase name of the genome
176 * gpath : path to the genome sequence which was given to prokka/prodigal for annotation
177 * annot_path : directory where prokka/prodigal folders are saved
178 * lst_dir : path to 'LSTINFO' folder
179 * prot_dir : path to 'Proteins' folder
180 * gene_dit : path to 'Genes' folder
181 * rep_dir : path to 'Replicons' folder
182 * gff_dir : path to 'gff3' folder
183 * prodigal_only : True if annotated by prodigal, False if annotated by prokka
184 * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess
186 Returns
187 -------
188 (bool, str) :
190 * True if genome was annotated as expected, False otherwise
191 * genome name (used to get info from the pool.map_async)
192 """
193 (genome, name, gpath, annot_path, lst_dir, prot_dir,
194 gene_dir, rep_dir, gff_dir, prodigal_only, q) = args
196 # Define which formatting must be used, given the annotation software
197 if prodigal_only:
198 format_one_genome = fprodigal.format_one_genome
199 else:
200 format_one_genome = fprokka.format_one_genome
201 # Set logger for this process
202 qh = logging.handlers.QueueHandler(q)
203 root = logging.getLogger()
204 root.setLevel(logging.DEBUG)
205 root.handlers = []
206 logging.addLevelName(utils.detail_lvl(), "DETAIL")
207 root.addHandler(qh)
208 logger = logging.getLogger('format.handle_genome')
209 # Handle genome
210 ok_format = format_one_genome(gpath, name, annot_path, lst_dir,
211 prot_dir, gene_dir, rep_dir, gff_dir)
212 return ok_format, genome
215def write_gene(gtype, locus_num, gene_name, product, cont_loc,
216 genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile):
217 """
218 Write given gene to output file
220 Parameters
221 ----------
222 gtype : str
223 type of feature (CDS, tRNA, etc.)
224 locus_num : str
225 number of the locus given by prokka/prodigal
226 gene_name : str
227 gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)
228 product : str
229 found by prokka/Prodigal, "NA" if no product (always the case for prodigal)
230 cont_loc : str
231 'i' if the gene is inside a contig, 'b' if its on the border (first or last gene
232 of the contig)
233 genome : str
234 genome name (spegenus.date.strain_num)
235 cont_num : int
236 contig number
237 ecnum : str
238 EC number, found by prokka, or "NA" if no EC number (-> always for prodigal)
239 inf2 : str
240 more information found by prokka/prodigal, or "NA" if no more information
241 db_xref : str
242 db_xref given by Prokka ("NA" for prodigal)
243 strand : str
244 C (complement) or D (direct)
245 start : str
246 start of gene in the contig
247 end : str
248 end of gene in the contig
249 lstopenfile : _io.TextIOWrapper
250 open file where lstinfo must be written
252 Returns
253 -------
254 str :
255 lstline
256 """
257 locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc,
258 str(locus_num).zfill(5))
259 # If '|' character found in those fields, replace by '_' to avoid problems while parsing
260 more_info = "| {} | {} | {} | {}".format(product.replace("|", "_"),
261 ecnum.replace("|", "_"),
262 inf2.replace("|", "_"),
263 db_xref.replace("|", "_"))
264 lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])
265 lstopenfile.write(lst_line + "\n")
266 return lst_line
269def write_header(lstline, outfile):
270 """
271 write header to output file. Header is generated from the lst line.
273 Parameters
274 ----------
275 lstline : str
276 current line of lst file
277 outfile : _io.TextIOWrapper
278 open file where header must be written
279 """
280 start, end, _, _, name, gene_name, info = lstline.split("\t")
281 size = int(end) - int(start) + 1
282 towrite = " ".join([name, str(size), gene_name, info])
283 outfile.write(">" + towrite + "\n")