Coverage for PanACoTA/annotate_module/format_prodigal.py: 100%

180 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to convert prodigal result files to gembase format. 

38 

39 * Proteins: multifasta with all CDS in aa 

40 * Replicons: (multi)fasta of genome sequences 

41 * Genes: multifasta of all genes in nuc 

42 * gff3: gff files without sequence 

43 * LSTINFO: information on annotation. Columns are: "start end strand type locus 

44 gene_name | product | EC_number | inference2" and strain is C (complement) or D (direct). 

45 Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>` 

46 For prodigal: "start end strand type locus NA | NA | NA | NA", as there is no 

47 functional annotation. 

48 

49@author gem 

50July 2019 

51""" 

52 

53import os 

54import shutil 

55import glob 

56import logging 

57 

58import PanACoTA.utils as utils 

59from PanACoTA.annotate_module import general_format_functions as gfunc 

60 

61logger = logging.getLogger("annotate.prodigal_format") 

62 

63 

64def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, 

65 rep_dir, gff_dir): 

66 """ 

67 Format the given genome, and create its corresponding files in the following folders: 

68 

69 - Proteins 

70 - Genes 

71 - Replicons 

72 - LSTINFO 

73 - gff 

74 

75 Parameters 

76 ---------- 

77 gpath : str 

78 path to the genome sequence which was given to prodigal for annotation 

79 name : str 

80 gembase name of the genome 

81 prod_path : str 

82 directory where all tmp_files for all sequences are saved (sequences cut at each set 

83 of 5N, prodigal results and logs) 

84 lst_dir : str 

85 path to LSTINFO folder 

86 prot_dir : str 

87 path to Proteins folder 

88 gene_dir : str 

89 path to Genes folder 

90 rep_dir : str 

91 path to Replicons folder 

92 gff_dir : str 

93 path to gff3 folder 

94 

95 Returns 

96 ------- 

97 bool : 

98 True if genome was correctly formatted, False otherwise 

99 """ 

100 # Get directory where prodigal results for the current genome are saved 

101 prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes") 

102 

103 # Get prodigal result files 

104 prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0] 

105 gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0] 

106 gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0] 

107 

108 # Define names for generated gembase files 

109 res_prot_file = os.path.join(prot_dir, name + ".prt") 

110 res_gene_file = os.path.join(gene_dir, name + ".gen") 

111 res_lst_file = os.path.join(lst_dir, name + ".lst") 

112 res_rep_file = os.path.join(rep_dir, name + ".fna") 

113 res_gff_file = os.path.join(gff_dir, name + ".gff") 

114 

115 # Generate replicon file (same as input sequence but with gembase formatted headers). From 

116 # this file, get contig names, to be used to generate gff file 

117 contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file, logger) 

118 if not contigs: 

119 try: 

120 os.remove(res_rep_file) 

121 os.remove(res_lst_file) 

122 os.remove(res_gff_file) 

123 os.remove(res_gene_file) 

124 os.remove(res_prot_file) 

125 except OSError: 

126 pass 

127 logger.error("Problems while generating Replicon file for {}".format(name)) 

128 return False 

129 

130 # First, create .gen and .lst files. If they could not be formatted, 

131 # remove those files, and return False with error message 

132 ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name) 

133 if not ok: 

134 try: 

135 os.remove(res_rep_file) 

136 os.remove(res_gene_file) 

137 os.remove(res_lst_file) 

138 os.remove(res_gff_file) 

139 os.remove(res_prot_file) 

140 except OSError: 

141 pass 

142 logger.error(f"Problems while generating .gen and .lst files for {name}") 

143 return False 

144 

145 # Create gff files. 

146 ok = create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes) 

147 # If problem while formatting the genome (rep or gff file), remove all 

148 # already created files, and return False (genome not formatted) with error message. 

149 if not ok: 

150 try: 

151 os.remove(res_gene_file) 

152 os.remove(res_lst_file) 

153 os.remove(res_rep_file) 

154 os.remove(res_gff_file) 

155 os.remove(res_prot_file) 

156 except OSError: 

157 pass 

158 logger.error("Problems while generating .gff (gff3 folder) " 

159 "file for {}".format(name)) 

160 return False 

161 

162 # Generate .prt files (in Proteins directory) 

163 ok = create_prt(prot_file, res_prot_file, res_lst_file) 

164 # If problem while formatting prt file, return False, delete all generated 

165 # formatted files, and write an error message to user. 

166 if not ok: 

167 try: 

168 os.remove(res_gene_file) 

169 os.remove(res_lst_file) 

170 os.remove(res_rep_file) 

171 os.remove(res_gff_file) 

172 os.remove(res_prot_file) 

173 except OSError: # pragma: no cover 

174 pass 

175 logger.error("Problems while generating .prt file (Proteins folder) " 

176 "for {}".format(name)) 

177 return False 

178 return ok 

179 

180 

181def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): 

182 """ 

183 Generate .gen file, from sequences contained in .ffn, but changing the 

184 headers to match with gembase format. 

185 At the same time, generate .lst file, from the information given in prodigal ffn headers 

186 

187 Parameters 

188 ---------- 

189 contigs : dict 

190 {original_contig_name: gembase_contig_name} 

191 gen_file : str 

192 .ffn file generated by prodigal 

193 res_gen_file : str 

194 generated .gen file, to write in Genes directory 

195 res_lst_file : str 

196 generated .lst file to write in LSTINFO directory 

197 gpath : str 

198 path to the genome given to prodigal. Only used for error message 

199 name : str 

200 gembase name of the genome to format 

201 logger : logging.Logger 

202 logger object to put information 

203 

204 Returns 

205 ------- 

206 bool : 

207 True if conversion went well, False otherwise 

208 """ 

209 # Variable which will contain the current gene sequence 

210 seq = "" 

211 # number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not 

212 # re-start at 1 for each new contig) 

213 locus_num = 0 

214 # contig name of the last gene. To check if we are now in a new contig (-> loc = b) or not 

215 prev_cont_name = "" 

216 # Previous ontig number: contig number to use in gembase format 

217 prev_cont_num = 0 

218 contig_num = 0 

219 # Keep start, end, strand and informations (prodigal gives information on start_type, 

220 # gc_cont etc.) from the previous gene, before overwriting it with information 

221 # on the new one 

222 prev_start = "" 

223 prev_end = "" 

224 prev_strand = "" 

225 prev_info = "" 

226 # Update loc when contig changes ('b' if gene at the border of a contig, 'i' if it is inside) 

227 prev_loc = "b" 

228 # To start, the first gene is, by definition, at the border of the contig 

229 loc = "b" 

230 # Open files: .ffn prodigal to read, .gen and .lst gembase to create 

231 with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\ 

232 open(res_lst_file, "w") as r_lst: 

233 # Read all lines in ffn file (sequences in nuc. for each gene) 

234 for lineffn in ffn: 

235 # If it is a sequence, save it and go to next line 

236 if not lineffn.startswith(">"): 

237 seq += lineffn 

238 continue 

239 # Otherwise: 

240 # - write header of previous sequence to .gen 

241 # - write previous sequence (in 'seq') to .gen 

242 # - write LSTINFO information to .lst 

243 # - update information (new start, end, contig number etc.) for next gene 

244 else: 

245 # Get information given for the new gene (by .ffn file from prodigal) 

246 (gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#") 

247 # Get contig number from prodigal gene header: prodigal first part of header is: 

248 # <original genome name contig name>_<protein number> 

249 contig_name = gname.strip().split("_") 

250 if len(contig_name) > 1: 

251 contig_name = "_".join(contig_name[:-1]) 

252 else: 

253 contig_name = contig_name[0] 

254 # If new contig: 

255 # - previous gene was the last of its contig -> prev_loc = "b" ; 

256 # - the current gene is the first of its contig (loc = "b") 

257 # - we must increment the contig number 

258 if contig_name != prev_cont_name: 

259 # Check that this contig name is in the list, and get its gembase contig number 

260 if contig_name in contigs: 

261 contig_num = contigs[contig_name].split(".")[-1] 

262 # if not in the list, problem, return false 

263 else: 

264 logger.error(f"'{contig_name}' found in {gen_file} does not exist in " 

265 f"{gpath}.") 

266 return False 

267 prev_loc = 'b' 

268 loc = 'b' 

269 # If not new contig. If prev_loc == 'b', previous gene is the first protein 

270 # of this contig. 

271 # Current gene will be inside the contig (except if new contig for the next gene, 

272 # meaning only 1 gene in the contig) 

273 else: 

274 loc = 'i' 

275 

276 # If it is not the first gene of the genome, write previous gene information 

277 if prev_start != "": 

278 # Write line in LSTINFO file, + header and sequence to the gene file 

279 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 

280 prev_loc, name, prev_cont_num, "NA", prev_info, 

281 "NA", prev_strand, prev_start, prev_end, r_lst) 

282 gfunc.write_header(lstline, r_gen) 

283 r_gen.write(seq) 

284 # -> get new information, save it for the next gene, and go to next line 

285 # Strands are 1/-1 in prodigal, while we use D,C -> convert, so that next time 

286 # we find a new gene, it writes this before updating for this new gene 

287 if int(strand) == 1: 

288 strand = "D" 

289 else: 

290 strand = "C" 

291 # Prepare variables for next gene 

292 locus_num += 1 

293 seq = "" 

294 prev_cont_num = contig_num 

295 prev_cont_name = contig_name 

296 prev_start = start 

297 prev_end = end 

298 prev_strand = strand 

299 prev_loc = loc 

300 prev_info = info 

301 # Write last gene of the genome (-> loc = 'b'), 

302 # Just check that there was at least 1 gene found (prev_start != ""). 

303 # Otherwise, nothing to write 

304 if prev_start != "": 

305 prev_loc = "b" 

306 lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 

307 prev_loc, name, prev_cont_num, "NA", prev_info, "NA", 

308 prev_strand, prev_start, prev_end, r_lst) 

309 gfunc.write_header(lstline, r_gen) 

310 r_gen.write(seq) 

311 return True 

312 

313 

314def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes): 

315 """ 

316 Create .gff3 file. 

317 

318 Format: 

319 

320 ##gff-version 3 

321 ##sequence-region contig1 start end 

322 ##sequence-region contig2 start end 

323 ... 

324 seqid(=contig) source type start end score strand phase attributes 

325 

326 All fields tab separated. 

327 Empty fields contain '.' 

328 

329 For example: 

330 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein 

331 

332 

333 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95; 

334 

335 Parameters 

336 ---------- 

337 gpath : str 

338 path to the genome sequence given to prodigal. Only used for the error message 

339 gff_file : str 

340 path to gff file generated by prodigal 

341 res-gff_file : str 

342 path to the gff file that must be created in result database 

343 res-lst_file : str 

344 path to the lst file that was created in result database in the previous step 

345 contigs : dict 

346 dict of contig names with their size. ["original_name": "gembase_name"] 

347 sizes : dict 

348 dict of contig gembase names with their sizes {"gembase_name": size} 

349 

350 Returns 

351 ------- 

352 bool 

353 True if everything went well, False if any problem 

354 

355 """ 

356 # open gff generated by prodigal to read it 

357 # open file to write new gff file 

358 # open lst file to read all information saved from prodigal results 

359 with open(gff_file, 'r') as gf, open(res_gff_file, "w") as rgf, open(res_lst_file, "r") as rlf: 

360 # Write headers of gff3 file 

361 rgf.write("##gff-version 3\n") 

362 for ori_name, new_name in contigs.items(): 

363 # Write the list of contigs, with their size 

364 end = sizes[new_name] 

365 rgf.write(f"##sequence-region\t{new_name}\t1\t{end}\n") 

366 

367 # Now, convert each line of prodigal gff to gembase formatted gff line 

368 for linegff in gf: 

369 # Ignore gff lines starting by #. For the new gff, these are already all written at the 

370 # beginning of the file. 

371 if linegff.startswith("#"): 

372 continue 

373 

374 # We do not write the sequences 

375 if linegff.startswith(">"): 

376 break 

377 

378 # Get all information from prodigal gff line. Strip each of them as trailing whitespace 

379 # can be hidden (leading to information from gff considered as different from 

380 # information from lst) 

381 fields_g = linegff.split("\t") 

382 fields_g = [info.strip() for info in fields_g] 

383 (contig_name, source, type_g, start_g, end_g, 

384 score, strand_g, phase, attributes) = fields_g 

385 # Get information given to this same sequence from the lst file 

386 # (next lst line corresponds to next gff line without #), as, for each format, 

387 # there is 1 line per gene) 

388 linelst = rlf.readline() 

389 fields_l = linelst.split("\t") 

390 fields_l = [info.strip() for info in fields_l] 

391 start_l, end_l, strand_l, type_l, locus_l, _, _ = fields_l 

392 

393 

394 # Get gff and ffn filenames to give information to user if error message 

395 gff = os.path.basename(gff_file) 

396 ffn = ".".join(gff.split(".")[:-1]) + ".ffn" 

397 # Path where gff and ffn generated by prodigal are 

398 tmp = gpath + "-prodigalRes" 

399 # Get gene name given by prodigal to current gene 

400 gname = attributes.split("ID=")[1].split(";")[0] 

401 

402 # Compare information from lst and information from prodigal gff (start, 

403 # end and type of feature). They should correspond 

404 for (elemg, eleml, label) in zip([start_g, end_g, type_g], 

405 [start_l, end_l, type_l], 

406 ["start", "end", "type"]): 

407 # If 1 element is different (start or end position, or type), print error 

408 # message and return False: this genome could not be converted to gff 

409 if elemg != eleml: 

410 logger.error(f"Files {ffn} and {gff} (in prodigal tmp_files: {tmp}) do not have " 

411 f"the same {label} value for gene {gname} ({elemg} in gff, {eleml} in " 

412 "ffn))") 

413 return False 

414 

415 # Replace prodigal ID with the new gene name (in gembase format), found in the 

416 # corresponding lst line 

417 at_split = attributes.split(";") 

418 new = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split] 

419 

420 # Write new line of gff file 

421 # Write new contig name 

422 cname = contigs[contig_name] 

423 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g, 

424 phase, ";".join(new)]) 

425 rgf.write(info + "\n") 

426 return True 

427 

428 

429def create_prt(prot_file, res_prot_file, res_lst_file): 

430 """ 

431 Generate .prt file (gembase formatted gene names), from features contained in .lst file generated just before. 

432 

433 Parameters 

434 ---------- 

435 prot_file : str 

436 .faa file generated by prodigal 

437 res_prot_file : str 

438 output file, to write in Proteins directory 

439 res_lst_file : str 

440 .lst file to get all gene names in gembase format instead of re-generating them 

441 Returns 

442 ------- 

443 bool : 

444 True if conversion went well, False otherwise 

445 """ 

446 

447 # Open: 

448 # - prot file to read gene sequences from prodigal results 

449 # - res_prot file to write sequences with gembase headers 

450 # - res_lst_file to get gene gembase names and other infos (strand, size...) 

451 

452 with open(prot_file, "r") as faa, open(res_prot_file, "w") as r_prt,\ 

453 open(res_lst_file, "r") as r_lst: 

454 # Read prt file generated by prodigal 

455 for lineprot in faa: 

456 # If protein sequence, write it 

457 if not lineprot.startswith(">"): 

458 r_prt.write(lineprot) 

459 continue 

460 # If header, replace by gembase header 

461 # For that, get next lst line (corresponding to next protein, 

462 # as there is 1 protein per line in .lst -> 1 protein per header in .prt) 

463 linelst = r_lst.readline().strip() 

464 # Try to get info from lstline. 

465 # If lstline empty, it means that the current protein 

466 # is missing from lst file. We already read the last protein of lst file. 

467 if linelst != '': 

468 # If not empty, check lst format, return False if not right format 

469 try: 

470 # If ok, gembase name is in the fifth column of lst file 

471 start, end, _, _, gem_name, _, _ = linelst.split("\t") 

472 except ValueError: 

473 logger.error("Problem in format of lstline ({})".format(linelst)) 

474 return False 

475 else: 

476 logger.error("No more protein in lst file. We cannot get information on this " 

477 "protein ({})! Check that you do not have more proteins than genes " 

478 "in prodigal results".format(lineprot.strip())) 

479 return False 

480 # Write this gembase name as a new header 

481 # Size of protein sequence is the third of gene sequence. Check that it is an int. 

482 try : 

483 size_gen = (int(end) - int(start) + 1) 

484 except ValueError: 

485 logger.error("Start and/or end of protein {} position is not a number (start " 

486 "= {}; end = {})".format(gem_name, start, end)) 

487 return False 

488 # Find size of protein in number of aa 

489 # If number of nucleotides in gene cannot be divided by 3 to get number of amino-acids, there is a problem with this protein: return False to ignore this genome 

490 size_prot = size_gen/3 

491 if int(size_prot) != size_prot: 

492 logger.error("Gene {} has a number of nucleotides ({}) that is not divisible " 

493 "by 3.".format(gem_name, size_gen)) 

494 return False 

495 gfunc.write_header(linelst, r_prt) 

496 # new_header = "\t".join([gem_name, str(int(size_prot)), product, info]) 

497 # r_prt.write(">" + new_header + "\n") 

498 # Check that there are no more proteins in lst than in this prt file 

499 linelst = r_lst.readline() 

500 if linelst.strip() != '': 

501 gem_name = linelst.strip().split("\t")[4] 

502 logger.error("Protein {} is in .lst file but its sequence is not in the protein " 

503 "file generated by prodigal.".format(gem_name)) 

504 return False 

505 return True