Coverage for PanACoTA/align_module/post_align.py: 100%

119 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Concatenate all alignment files of all families. Then, 

38group alignments by genome. 

39 

40@author: GEM, Institut Pasteur 

41March 2017 

42""" 

43 

44import os 

45import sys 

46import logging 

47import progressbar 

48import multiprocessing 

49from PanACoTA import utils 

50 

51logger = logging.getLogger("align.post") 

52 

53 

54def post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali, quiet): 

55 """ 

56 After the alignment of all proteins by family: 

57 

58 - concatenate all alignment files 

59 - group the alignment by genome 

60 

61 Parameters 

62 ---------- 

63 fam_nums : [] 

64 list of family numbers 

65 all_genomes : [] 

66 list of all genomes in dataset 

67 prefix : str 

68 path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files easily) 

69 outdir : str 

70 path to output directory, containing Aldir and Listdir, and that will also contain Treedir 

71 dname : str 

72 name of dataset (used to name concat and grouped files, as well as tree folder) 

73 prot_ali : bool 

74 true: also give concatenated alignment in aa 

75 quiet : bool 

76 True if nothing must be sent to sdtout/stderr, False otherwise 

77 """ 

78 all_alns_nucl, status_nucl = concat_alignments(fam_nums, prefix, "nucl", quiet) 

79 treedir = os.path.join(outdir, "Phylo-" + dname) 

80 os.makedirs(treedir, exist_ok=True) 

81 outfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln") 

82 res_nucl = launch_group_by_genome(all_genomes, all_alns_nucl, status_nucl, outfile_nucl, dname, "nucleic", quiet) 

83 if not res_nucl: 

84 utils.remove(all_alns_nucl) 

85 utils.remove(outfile_nucl) 

86 logger.error("An error occurred. We could not group DNA alignments by genome.") 

87 sys.exit(1) 

88 if prot_ali: 

89 all_alns_aa, status_aa = concat_alignments(fam_nums, prefix, "aa", quiet) 

90 outfile_aa = os.path.join(treedir, dname + ".aa.grp.aln") 

91 res_aa = launch_group_by_genome(all_genomes, all_alns_aa, status_aa, outfile_aa, dname, "protein", quiet) 

92 if not res_aa: 

93 utils.remove(all_alns_aa) 

94 utils.remove(outfile_aa) 

95 logger.error("An error occurred. We could not group protein alignments by genome.") 

96 return outfile_nucl 

97 

98 

99def concat_alignments(fam_nums, prefix, ali_type, quiet): 

100 """ 

101 Concatenate all family alignment files to a unique file 

102 

103 Parameters 

104 ---------- 

105 fam_nums : [] 

106 list of family numbers 

107 prefix : str 

108 path to ``aldir/<name of dataset>-[mafft-align or mafft-prt2nuc]``  

109 (used to get extraction, alignment and btr files easily) 

110 ali_type : str 

111 aa or nucl 

112 quiet : bool 

113 True if nothing must be sent to sdtout/stderr, False otherwise 

114 

115 Returns 

116 ------- 

117 tuple 

118 (output, str) with: 

119 

120 - output: path to file containing concatenation of all alignments 

121 - str: "OK" if concatenation file already exists, "Done" if just did concatenation 

122 """ 

123 if ali_type == "aa": 

124 info = "mafft-align" 

125 elif ali_type == "nucl": 

126 info = "mafft-prt2nuc" 

127 else: 

128 logger.error(f"Not possible to concatenate '{ali_type}' type of alignments.") 

129 sys.exit(1) 

130 output = f"{prefix}-complete.{ali_type}.cat.aln" 

131 if os.path.isfile(output): 

132 logger.info(f"{ali_type} alignments already concatenated") 

133 logger.warning(f"{ali_type} alignments already concatenated in {output}. Program will use " 

134 "it for next steps. If you want to redo it, remove it before " 

135 "running.") 

136 return output, "OK" 

137 logger.info(f"Concatenating all {ali_type} alignment files") 

138 list_files = [f"{prefix}-{info}.{num_fam}.aln" for num_fam in fam_nums] 

139 # Check that all files exist 

140 for f in list_files: 

141 if not os.path.isfile(f): 

142 logger.error(f"The alignment file {f} does not exist. Please check the families you " 

143 "want, and their corresponding alignment files") 

144 sys.exit(1) 

145 if quiet: 

146 utils.cat(list_files, output) 

147 else: 

148 utils.cat(list_files, output, title="Concatenation") 

149 return output, "Done" 

150 

151 

152def launch_group_by_genome(all_genomes, all_alns, status, outfile, dname, type_ali, quiet): 

153 """ 

154 Function calling group_by_genome in a pool, while giving information to user 

155 (time elapsed) 

156 

157 Parameters 

158 ---------- 

159 all_genomes : [] 

160 list of all genomes in the dataset 

161 all_alns : str 

162 path to file containing all alignments concatenated 

163 status : str 

164 "OK" if concatenation file already existed before running, "Done" if just did concatenation 

165 outfile : str 

166 file containing all families align by genome 

167 dname : str 

168 name of dataset 

169 type_ali : str 

170 nucleic or protein 

171 quiet : bool 

172 True if nothing must be sent to sdtout/stderr, False otherwise 

173 

174 Returns 

175 ------- 

176 bool 

177 - True if everything went well or was already done 

178 - False if error occurred in at least one step 

179 """ 

180 # Status = Done means that we just did the concatenation. So, if grouped by genome 

181 # file already exists, remove it. 

182 if status == "Done": 

183 if os.path.isfile(outfile): 

184 utils.remove(outfile) 

185 # Status was not 'Done' (it was 'ok', concat file already existed). And by_genome file 

186 # also already exists. Warn user 

187 if os.path.isfile(outfile): 

188 logger.info(f"{type_ali} alignments already grouped by genome") 

189 logger.warning((f"{type_ali} alignments already grouped by genome in {outfile}. " 

190 "Program will end. ")) 

191 return True 

192 logger.info(f"Grouping {type_ali} alignments per genome") 

193 bar = None 

194 if not quiet: 

195 widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")), 

196 " - ", progressbar.Timer()] 

197 bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50) 

198 pool = multiprocessing.Pool(1) 

199 args = [all_genomes, all_alns, outfile] 

200 final = pool.map_async(group_by_genome, [args], chunksize=1) 

201 pool.close() 

202 if not quiet: 

203 while True: 

204 if final.ready(): 

205 break 

206 bar.update() 

207 bar.finish() 

208 pool.join() 

209 return False not in final.get() 

210 

211 

212def group_by_genome(args): 

213 """ 

214 From the alignment file 'all_alns' containing all proteins, group the alignments of 

215 proteins by their genome (listed in 'all_genomes'), and save the result 

216 in 'treedir' 

217 

218 Parameters 

219 ---------- 

220 args : tuple 

221 - all_genomes: list of all genomes 

222 - all_alns: path to file containing all alignments concatenated 

223 - outfile: path to file which will contain alignments grouped by genome 

224 

225 Returns 

226 ------- 

227 bool 

228 - True if everything went well 

229 - False if problem when trying to group by genomes 

230 """ 

231 all_genomes, all_alns, outfile = args 

232 sequences = read_alignments(all_alns, all_genomes) 

233 if not sequences: 

234 return False 

235 write_groups(outfile, sequences) 

236 return True 

237 

238 

239def read_alignments(all_alns, all_genomes): 

240 """ 

241 Read alignment file, and assign each sequence to a genome 

242 

243 Parameters 

244 ---------- 

245 all_alns : str 

246 path to file containing all alignments concatenated 

247 all_genomes : [] 

248 list of all genomes 

249 

250 Returns 

251 ------- 

252 dict or None 

253 - {genome_name: [list of sequences for this genome]} 

254 - None if problem with a protein for which we don't find the genome 

255 """ 

256 sequences = {} # name: [ordered list of sequences] 

257 genome = None 

258 seq = "" 

259 with open(all_alns, 'r') as alnf: 

260 for line in alnf: 

261 if line.startswith(">"): 

262 # If new header, write previous protein name/sequence to 'sequences' 

263 if genome and seq: 

264 sequences[genome].append(seq) 

265 seq = "" 

266 # Get new genome header 

267 genome = get_genome(line, all_genomes) 

268 if not genome: 

269 return None 

270 if genome not in sequences: 

271 sequences[genome] = [] 

272 else: 

273 seq += line.strip() 

274 if genome and seq: 

275 sequences[genome].append(seq) 

276 per_genome = [len(seq) for seq in sequences.values()] 

277 if len(set(per_genome)) != 1: 

278 logger.error("Problems occurred while grouping alignments by genome: all genomes " 

279 "do not have the same number of sequences. Check that each protein " 

280 "name contains the name of the genome from which it comes.") 

281 return None 

282 logger.log(utils.detail_lvl(), f"{per_genome[0]} sequences found per genome") 

283 return sequences 

284 

285 

286def write_groups(outfile, sequences): 

287 """ 

288 Writing alignments per genome to output file. 

289 

290 Parameters 

291 ---------- 

292 outfile : str 

293 path to file that will contain alignments grouped by genome 

294 sequences : dict 

295 {genome_name: [list of sequences (DNA, prot...) for this genome]} 

296 """ 

297 logger.log(utils.detail_lvl(), "Writing alignments per genome") 

298 with open(outfile, "w") as outf: 

299 for genome in sorted(sequences, key=utils.sort_genomes_by_name): 

300 # write header for genome 

301 outf.write(">" + genome + "\n") 

302 # Write all sequences 

303 outf.write("".join(sequences[genome]) + "\n") 

304 

305 

306def get_genome(header, all_genomes): 

307 """ 

308 Find to which genome belongs 'header' 

309 

310 Parameters 

311 ---------- 

312 header : str 

313 header read in alignment file 

314 all_genomes : [] 

315 list of all genomes 

316 

317 Returns 

318 ------- 

319 str or None 

320 name of genome from which the header is 

321 None if no genome found 

322 """ 

323 # Name of protein is not always genome-name_num 

324 # Ex: in gembase complete DB: >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002 

325 # So, genome name cannot be deduced directly from header. But it is always included in header 

326 header = header.split(">")[1].split()[0] 

327 

328 for genome in all_genomes: 

329 if header.startswith(genome): 

330 # header should start with the genome name. Nothing before it. 

331 # Ex: >86KG_12345 is from genome 86KG. >6KG_12345 is from genome 6KG, not 86KG 

332 return genome 

333 logger.error((f"Protein {header} does not correspond to any genome name " 

334 f"given... {all_genomes}")) 

335 return None