Coverage for PanACoTA/annotate_module/general_format

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to convert prokka or prodigal results to gembase format:

39 * Proteins: multifasta with all CDS in aa

40 * Replicons: multifasta of genome

41 * Genes: multifasta of all genes in nuc

42 * gff3: gff files without sequence

43 * LSTINFO:

44 - if annotated by prokka: information on annotation. Columns are:

45 "start end strand type locus gene_name | product | EC_number | inference2"

46 with the same types as prokka file, and strain is C (complement) or D (direct).

47 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`

48 - if annotated by prodigal

50@author gem

51May 2019

52"""

54import os

55import logging

56import logging.handlers

57import progressbar

58import multiprocessing

59import threading

60import PanACoTA.utils as utils

61from PanACoTA.annotate_module import format_prokka as fprokka

62from PanACoTA.annotate_module import format_prodigal as fprodigal

65main_logger = logging.getLogger("annotate.geneffunc")

68def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False):

69 """

70 For all genomes which were annotated (by prokka or prodigal), reformat them

71 in order to have, in 'res_path', the following folders:

73 * LSTINFO: containing a .lst file for each genome, with all genes

74 * Replicons: containing all multifasta files

75 * Genes: containing 1 multi-fasta per genome, with all its genes in nuc

76 * Proteins: containing 1 multi-fasta per genome, with all its proteins in aa

77 * gff: containing all gff files

79 Parameters

80 ----------

81 genomes_ok : dict

82 genomes to format (annotation was OK) ->

83 {genome: [name, gpath, to_annot, size, nbcont, l90]}

84 res_path : str

85 path to folder where the 4 directories must be created

86 annot_path : str

87 path to folder containing "<genome_name>-[prokka, prodigal]Res" where all prokka/prodigal

88 results are saved.

89 prodigal_only: True if it was annotated by prodigal, False if annotated by prokka

90 threads : int

91 number of threads to use to while formatting genomes

92 quiet : bool

93 True if nothing must be sent to stderr/stdout, False otherwise

95 Returns

96 -------

97 skipped_format : list

99 list of genomes skipped because they had a problem in format step

100 """

101 main_logger.info("Formatting all genomes")

102 lst_dir = os.path.join(res_path, "LSTINFO")

103 prot_dir = os.path.join(res_path, "Proteins")

104 gene_dir = os.path.join(res_path, "Genes")

105 rep_dir = os.path.join(res_path, "Replicons")

106 gff_dir = os.path.join(res_path, "gff3")

107 os.makedirs(lst_dir, exist_ok=True)

108 os.makedirs(prot_dir, exist_ok=True)

109 os.makedirs(gene_dir, exist_ok=True)

110 os.makedirs(rep_dir, exist_ok=True)

111 os.makedirs(gff_dir, exist_ok=True)

112

113 # If this function goes until here, it means that there is at least 1 genome to annotate

114 nbgen = len(genomes_ok)

115 bar = None

116 if not quiet:

117 # Create progressbar

118 widgets = ['Formatting genomes: ',

119 progressbar.Bar(marker='█', left='', right=''),

120 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',

121 progressbar.Percentage(), ") - ", progressbar.Timer()]

122 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()

123 # Create a Queue to put logs from processes, and handle them after from a single thread

124 m = multiprocessing.Manager()

125 q = m.Queue()

126

127 # if at least 1 genome ok, try to format it

128 # arguments for 'handle_genome' function:

129 # (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir,

130 # gff_dir, results, prodigal_only, q)

131 params = [(genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir,

132 rep_dir, gff_dir, prodigal_only, q)

133 for genome, (name, _, gpath, _, _, _) in genomes_ok.items()]

134

135 # Create pool and launch parallel formating steps

136 pool = multiprocessing.Pool(threads)

137 final = pool.map_async(handle_genome, params, chunksize=1)

138 pool.close()

139 lp = threading.Thread(target=utils.logger_thread, args=(q,))

140 lp.start()

141 if not quiet:

142 while True:

143 if final.ready():

144 break

145 remaining = final._number_left

146 bar.update(nbgen - remaining)

147 bar.finish()

148 pool.join()

149 q.put(None)

150 lp.join()

151 res = final.get()

152

153 # List of genomes for which format step had problems: will be filled after

154 skipped_format = []

155 for output in res:

156 if not output[0]:

157 skipped_format.append(output[1])

158 return skipped_format

159

160

161def handle_genome(args):

162 """

163 For a given genome, check if it has been annotated (in results), if annotation

164 (by prokka or prodigal) ran without problems (result = True). In that case,

165 format the genome and get the output to see if everything went ok.

166

167 Parameters

168 ----------

169 args : tuple

170 (genome, name, gpath, prok_path, lst_dir, prot_dir,\

171 gene_dir, rep_dir, gff_dir, results, q)\

172 with:

173

174 * genome : original genome name

175 * name : gembase name of the genome

176 * gpath : path to the genome sequence which was given to prokka/prodigal for annotation

177 * annot_path : directory where prokka/prodigal folders are saved

178 * lst_dir : path to 'LSTINFO' folder

179 * prot_dir : path to 'Proteins' folder

180 * gene_dit : path to 'Genes' folder

181 * rep_dir : path to 'Replicons' folder

182 * gff_dir : path to 'gff3' folder

183 * prodigal_only : True if annotated by prodigal, False if annotated by prokka

184 * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess

185

186 Returns

187 -------

188 (bool, str) :

189

190 * True if genome was annotated as expected, False otherwise

191 * genome name (used to get info from the pool.map_async)

192 """

193 (genome, name, gpath, annot_path, lst_dir, prot_dir,

194 gene_dir, rep_dir, gff_dir, prodigal_only, q) = args

195

196 # Define which formatting must be used, given the annotation software

197 if prodigal_only:

198 format_one_genome = fprodigal.format_one_genome

199 else:

200 format_one_genome = fprokka.format_one_genome

201 # Set logger for this process

202 qh = logging.handlers.QueueHandler(q)

203 root = logging.getLogger()

204 root.setLevel(logging.DEBUG)

205 root.handlers = []

206 logging.addLevelName(utils.detail_lvl(), "DETAIL")

207 root.addHandler(qh)

208 logger = logging.getLogger('format.handle_genome')

209 # Handle genome

210 ok_format = format_one_genome(gpath, name, annot_path, lst_dir,

211 prot_dir, gene_dir, rep_dir, gff_dir)

212 return ok_format, genome

213

214

215def write_gene(gtype, locus_num, gene_name, product, cont_loc,

216 genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile):

217 """

218 Write given gene to output file

219

220 Parameters

221 ----------

222 gtype : str

223 type of feature (CDS, tRNA, etc.)

224 locus_num : str

225 number of the locus given by prokka/prodigal

226 gene_name : str

227 gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)

228 product : str

229 found by prokka/Prodigal, "NA" if no product (always the case for prodigal)

230 cont_loc : str

231 'i' if the gene is inside a contig, 'b' if its on the border (first or last gene

232 of the contig)

233 genome : str

234 genome name (spegenus.date.strain_num)

235 cont_num : int

236 contig number

237 ecnum : str

238 EC number, found by prokka, or "NA" if no EC number (-> always for prodigal)

239 inf2 : str

240 more information found by prokka/prodigal, or "NA" if no more information

241 db_xref : str

242 db_xref given by Prokka ("NA" for prodigal)

243 strand : str

244 C (complement) or D (direct)

245 start : str

246 start of gene in the contig

247 end : str

248 end of gene in the contig

249 lstopenfile : _io.TextIOWrapper

250 open file where lstinfo must be written

251

252 Returns

253 -------

254 str :

255 lstline

256 """

257 locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc,

258 str(locus_num).zfill(5))

259 # If '|' character found in those fields, replace by '_' to avoid problems while parsing

260 more_info = "| {} | {} | {} | {}".format(product.replace("|", "_"),

261 ecnum.replace("|", "_"),

262 inf2.replace("|", "_"),

263 db_xref.replace("|", "_"))

264 lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])

265 lstopenfile.write(lst_line + "\n")

266 return lst_line

267

268

269def write_header(lstline, outfile):

270 """

271 write header to output file. Header is generated from the lst line.

272

273 Parameters

274 ----------

275 lstline : str

276 current line of lst file

277 outfile : _io.TextIOWrapper

278 open file where header must be written

279 """

280 start, end, _, _, name, gene_name, info = lstline.split("\t")

281 size = int(end) - int(start) + 1

282 towrite = " ".join([name, str(size), gene_name, info])

283 outfile.write(">" + towrite + "\n")

Coverage for PanACoTA/annotate_module/general_format_functions.py: 100%

77 statements