Coverage for PanACoTA/annotate_module/general_format_functions.py: 100%

77 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to convert prokka or prodigal results to gembase format: 

38 

39 * Proteins: multifasta with all CDS in aa 

40 * Replicons: multifasta of genome 

41 * Genes: multifasta of all genes in nuc 

42 * gff3: gff files without sequence 

43 * LSTINFO: 

44 - if annotated by prokka: information on annotation. Columns are: 

45 "start end strand type locus gene_name | product | EC_number | inference2" 

46 with the same types as prokka file, and strain is C (complement) or D (direct). 

47 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>` 

48 - if annotated by prodigal 

49 

50@author gem 

51May 2019 

52""" 

53 

54import os 

55import logging 

56import logging.handlers 

57import progressbar 

58import multiprocessing 

59import threading 

60import PanACoTA.utils as utils 

61from PanACoTA.annotate_module import format_prokka as fprokka 

62from PanACoTA.annotate_module import format_prodigal as fprodigal 

63 

64 

65main_logger = logging.getLogger("annotate.geneffunc") 

66 

67 

68def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False): 

69 """ 

70 For all genomes which were annotated (by prokka or prodigal), reformat them 

71 in order to have, in 'res_path', the following folders: 

72 

73 * LSTINFO: containing a .lst file for each genome, with all genes 

74 * Replicons: containing all multifasta files 

75 * Genes: containing 1 multi-fasta per genome, with all its genes in nuc 

76 * Proteins: containing 1 multi-fasta per genome, with all its proteins in aa 

77 * gff: containing all gff files 

78 

79 Parameters 

80 ---------- 

81 genomes_ok : dict 

82 genomes to format (annotation was OK) -> 

83 {genome: [name, gpath, to_annot, size, nbcont, l90]} 

84 res_path : str 

85 path to folder where the 4 directories must be created 

86 annot_path : str 

87 path to folder containing "<genome_name>-[prokka, prodigal]Res" where all prokka/prodigal 

88 results are saved. 

89 prodigal_only: True if it was annotated by prodigal, False if annotated by prokka 

90 threads : int 

91 number of threads to use to while formatting genomes 

92 quiet : bool 

93 True if nothing must be sent to stderr/stdout, False otherwise 

94 

95 Returns 

96 ------- 

97 skipped_format : list 

98 

99 list of genomes skipped because they had a problem in format step 

100 """ 

101 main_logger.info("Formatting all genomes") 

102 lst_dir = os.path.join(res_path, "LSTINFO") 

103 prot_dir = os.path.join(res_path, "Proteins") 

104 gene_dir = os.path.join(res_path, "Genes") 

105 rep_dir = os.path.join(res_path, "Replicons") 

106 gff_dir = os.path.join(res_path, "gff3") 

107 os.makedirs(lst_dir, exist_ok=True) 

108 os.makedirs(prot_dir, exist_ok=True) 

109 os.makedirs(gene_dir, exist_ok=True) 

110 os.makedirs(rep_dir, exist_ok=True) 

111 os.makedirs(gff_dir, exist_ok=True) 

112 

113 # If this function goes until here, it means that there is at least 1 genome to annotate 

114 nbgen = len(genomes_ok) 

115 bar = None 

116 if not quiet: 

117 # Create progressbar 

118 widgets = ['Formatting genomes: ', 

119 progressbar.Bar(marker='█', left='', right=''), 

120 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (', 

121 progressbar.Percentage(), ") - ", progressbar.Timer()] 

122 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start() 

123 # Create a Queue to put logs from processes, and handle them after from a single thread 

124 m = multiprocessing.Manager() 

125 q = m.Queue() 

126 

127 # if at least 1 genome ok, try to format it 

128 # arguments for 'handle_genome' function: 

129 # (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir, 

130 # gff_dir, results, prodigal_only, q) 

131 params = [(genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, 

132 rep_dir, gff_dir, prodigal_only, q) 

133 for genome, (name, _, gpath, _, _, _) in genomes_ok.items()] 

134 

135 # Create pool and launch parallel formating steps 

136 pool = multiprocessing.Pool(threads) 

137 final = pool.map_async(handle_genome, params, chunksize=1) 

138 pool.close() 

139 lp = threading.Thread(target=utils.logger_thread, args=(q,)) 

140 lp.start() 

141 if not quiet: 

142 while True: 

143 if final.ready(): 

144 break 

145 remaining = final._number_left 

146 bar.update(nbgen - remaining) 

147 bar.finish() 

148 pool.join() 

149 q.put(None) 

150 lp.join() 

151 res = final.get() 

152 

153 # List of genomes for which format step had problems: will be filled after 

154 skipped_format = [] 

155 for output in res: 

156 if not output[0]: 

157 skipped_format.append(output[1]) 

158 return skipped_format 

159 

160 

161def handle_genome(args): 

162 """ 

163 For a given genome, check if it has been annotated (in results), if annotation 

164 (by prokka or prodigal) ran without problems (result = True). In that case, 

165 format the genome and get the output to see if everything went ok. 

166 

167 Parameters 

168 ---------- 

169 args : tuple 

170 (genome, name, gpath, prok_path, lst_dir, prot_dir,\ 

171 gene_dir, rep_dir, gff_dir, results, q)\ 

172 with: 

173 

174 * genome : original genome name 

175 * name : gembase name of the genome 

176 * gpath : path to the genome sequence which was given to prokka/prodigal for annotation 

177 * annot_path : directory where prokka/prodigal folders are saved 

178 * lst_dir : path to 'LSTINFO' folder 

179 * prot_dir : path to 'Proteins' folder 

180 * gene_dit : path to 'Genes' folder 

181 * rep_dir : path to 'Replicons' folder 

182 * gff_dir : path to 'gff3' folder 

183 * prodigal_only : True if annotated by prodigal, False if annotated by prokka 

184 * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess 

185 

186 Returns 

187 ------- 

188 (bool, str) : 

189 

190 * True if genome was annotated as expected, False otherwise 

191 * genome name (used to get info from the pool.map_async) 

192 """ 

193 (genome, name, gpath, annot_path, lst_dir, prot_dir, 

194 gene_dir, rep_dir, gff_dir, prodigal_only, q) = args 

195 

196 # Define which formatting must be used, given the annotation software 

197 if prodigal_only: 

198 format_one_genome = fprodigal.format_one_genome 

199 else: 

200 format_one_genome = fprokka.format_one_genome 

201 # Set logger for this process 

202 qh = logging.handlers.QueueHandler(q) 

203 root = logging.getLogger() 

204 root.setLevel(logging.DEBUG) 

205 root.handlers = [] 

206 logging.addLevelName(utils.detail_lvl(), "DETAIL") 

207 root.addHandler(qh) 

208 logger = logging.getLogger('format.handle_genome') 

209 # Handle genome 

210 ok_format = format_one_genome(gpath, name, annot_path, lst_dir, 

211 prot_dir, gene_dir, rep_dir, gff_dir) 

212 return ok_format, genome 

213 

214 

215def write_gene(gtype, locus_num, gene_name, product, cont_loc, 

216 genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile): 

217 """ 

218 Write given gene to output file 

219 

220 Parameters 

221 ---------- 

222 gtype : str 

223 type of feature (CDS, tRNA, etc.) 

224 locus_num : str 

225 number of the locus given by prokka/prodigal 

226 gene_name : str 

227 gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal) 

228 product : str 

229 found by prokka/Prodigal, "NA" if no product (always the case for prodigal) 

230 cont_loc : str 

231 'i' if the gene is inside a contig, 'b' if its on the border (first or last gene 

232 of the contig) 

233 genome : str 

234 genome name (spegenus.date.strain_num) 

235 cont_num : int 

236 contig number 

237 ecnum : str 

238 EC number, found by prokka, or "NA" if no EC number (-> always for prodigal) 

239 inf2 : str 

240 more information found by prokka/prodigal, or "NA" if no more information 

241 db_xref : str 

242 db_xref given by Prokka ("NA" for prodigal) 

243 strand : str 

244 C (complement) or D (direct) 

245 start : str 

246 start of gene in the contig 

247 end : str 

248 end of gene in the contig 

249 lstopenfile : _io.TextIOWrapper 

250 open file where lstinfo must be written 

251 

252 Returns 

253 ------- 

254 str : 

255 lstline 

256 """ 

257 locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc, 

258 str(locus_num).zfill(5)) 

259 # If '|' character found in those fields, replace by '_' to avoid problems while parsing 

260 more_info = "| {} | {} | {} | {}".format(product.replace("|", "_"), 

261 ecnum.replace("|", "_"), 

262 inf2.replace("|", "_"), 

263 db_xref.replace("|", "_")) 

264 lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info]) 

265 lstopenfile.write(lst_line + "\n") 

266 return lst_line 

267 

268 

269def write_header(lstline, outfile): 

270 """ 

271 write header to output file. Header is generated from the lst line. 

272 

273 Parameters 

274 ---------- 

275 lstline : str 

276 current line of lst file 

277 outfile : _io.TextIOWrapper 

278 open file where header must be written 

279 """ 

280 start, end, _, _, name, gene_name, info = lstline.split("\t") 

281 size = int(end) - int(start) + 1 

282 towrite = " ".join([name, str(size), gene_name, info]) 

283 outfile.write(">" + towrite + "\n")