Coverage for PanACoTA/align_module/get_seqs.py: 100%

108 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36import sys 

37import os 

38import logging 

39import progressbar 

40 

41from PanACoTA import utils 

42 

43logger = logging.getLogger("align.extract") 

44 

45 

46def get_all_seqs(all_genomes, dname, dbpath, listdir, aldir, all_fams, quiet): 

47 """ 

48 For all genomes, extract its proteins present in a persistent family to the file 

49 corresponding to this family. 

50 

51 Parameters 

52 ---------- 

53 all_genomes : [] 

54 list of all genome names 

55 dname : str 

56 name of dataset 

57 dbpath : str 

58 path to folder containing 'Proteins' and 'Genes' folders 

59 listdir : str 

60 path to folder containing the lists of proteins/genes to extract 

61 aldir : str 

62 path to folder where extracted proteins/genes must be saved 

63 all_fams : [] 

64 list of all family numbers 

65 quiet : bool 

66 True if nothin must be written to stdout/stderr, False otherwise 

67 """ 

68 # Get list of files not already existing 

69 files_todo = check_existing_extract(all_fams, aldir, dname) 

70 if len(files_todo) == 0: 

71 logger.info(("All extraction files already existing (see detailed log for " 

72 "more information)")) 

73 logger.warning("All prt and gene files for all families already exist. The program " 

74 "will use them for the next step. If you want to re-extract a given " 

75 "family, remove its prt and gen extraction files. If you want to " 

76 "re-extract all families, use option -F (or --force).") 

77 return 

78 logger.info("Extracting proteins and genes from all genomes") 

79 nbgen = len(all_genomes) 

80 bar = None 

81 curnum = 1 

82 if not quiet: 

83 widgets = ['Extraction:', 

84 progressbar.Bar(marker='█', left='', right='', fill=' '), 

85 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (', 

86 progressbar.Percentage(), ") - ", progressbar.Timer(), ' ', 

87 progressbar.ETA()] 

88 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start() 

89 curnum = 1 

90 for genome in all_genomes: 

91 ge_gen = os.path.join(listdir, dname + "-getEntry_gen_" + genome + ".txt") 

92 ge_prt = os.path.join(listdir, dname + "-getEntry_prt_" + genome + ".txt") 

93 logger.details(f"Extracting proteins and genes from {genome}") 

94 prtdb = os.path.join(dbpath, "Proteins", genome + ".prt") 

95 gendb = os.path.join(dbpath, "Genes", genome + ".gen") 

96 get_genome_seqs(prtdb, ge_prt, files_todo) 

97 get_genome_seqs(gendb, ge_gen, files_todo) 

98 if not quiet: 

99 bar.update(curnum) 

100 curnum += 1 

101 if not quiet: 

102 bar.finish() 

103 

104 

105def check_existing_extract(all_fams, aldir, dname): 

106 """ 

107 For each family, check if its prt and gen extraction file already exist. 

108 If both exist, no need to re-extract for those families. 

109 If only one or no one exists, put to list to extract. 

110 

111 Parameters 

112 ---------- 

113 all_fams : list 

114 list of all family numbers 

115 aldir : str 

116 path to directory where extraction files must be saved 

117 dname : str 

118 name of the dataset 

119 

120 Returns 

121 ------- 

122 [] 

123 list of files that must be generated (prt and gen files) 

124 """ 

125 extract_fams = [] 

126 for fam in all_fams: 

127 genfile = os.path.join(aldir, "{}-current.{}.gen".format(dname, fam)) 

128 prtfile = os.path.join(aldir, "{}-current.{}.prt".format(dname, fam)) 

129 if not os.path.isfile(genfile) or not os.path.isfile(prtfile): 

130 # At least 1 file missing: re-extract all proteins and all genes 

131 utils.remove(genfile) 

132 utils.remove(prtfile) 

133 # As we re-extract proteins and genes, redo alignments 

134 mafft_file = os.path.join(aldir, "{}-mafft-align.{}.aln".format(dname, fam)) 

135 btr_file = os.path.join(aldir, "{}-mafft-prt2nuc.{}.aln".format(dname, fam)) 

136 utils.remove(mafft_file) 

137 utils.remove(btr_file) 

138 extract_fams.append(genfile) 

139 extract_fams.append(prtfile) 

140 # If we re-extract at least 1 family, redo the final files (concatenation and group by genome) 

141 if len(extract_fams) > 0: 

142 concat = os.path.join(aldir, "{}-complete.cat.aln".format(dname)) 

143 outdir = os.path.split(aldir)[0] 

144 treedir = os.path.join(outdir, "Phylo-" + dname) 

145 grpfile = os.path.join(treedir, dname + ".grp.aln") 

146 utils.remove(concat) 

147 utils.remove(grpfile) 

148 return extract_fams 

149 

150 

151def get_genome_seqs(fasta, tabfile, files_todo, outfile=None): 

152 """ 

153 From a fasta file, extract all sequences given in the tab file. 

154 The tab file can contain: 

155 

156 - 1 sequence name per line -> all sequences will be extracted to the same file 

157 - 1 sequence + 1 filename per line -> each sequence will be extracted in the given file 

158 

159 If outfile not given, the tab file must contain 2 columns (1 for the sequence name, 

160 1 for its output file). If an outfile is given, only the 1st column of tab file 

161 will be considered, and all sequences will be extracted to the given outfile. 

162 

163 Parameters 

164 ---------- 

165 fasta : str 

166 path to fasta file from which sequences must be extracted 

167 tabfile : str 

168 path to the tab file containing the names of sequences to extract 

169 files_todo : list 

170 list of files which must be generated (prt and gen files). Others 

171 already exist, so ignore them. 

172 outfile : str or None 

173 if None, the tab file must contain 2 columns (1 for the sequence name, 

174 1 for its output file). If an outfile is given (not None), only the 1st column of tab file 

175 will be considered, and all sequences will be extracted to the given outfile. 

176 """ 

177 with open(tabfile, "r") as tabf: 

178 to_extract = get_names_to_extract(tabf, outfile) 

179 if outfile: 

180 if os.path.isfile(outfile): 

181 logger.warning("Sequences are already extracted in {}. This will " 

182 "be used for next step. If you want " 

183 "to re-extract all sequences, use option -F (or " 

184 "--force)".format(outfile)) 

185 return 

186 with open(fasta, "r") as fasf, open(outfile, "a") as outf: 

187 extract_sequences(to_extract, fasf, outf=outf) 

188 else: 

189 with open(fasta, "r") as fasf: 

190 extract_sequences(to_extract, fasf, files_todo=files_todo) 

191 

192 

193def get_names_to_extract(tabf, outfile): 

194 """ 

195 From the tab file, get names of sequences to extract. 

196 

197 Parameters 

198 ---------- 

199 tabf : _io.TextIO 

200 open file containing names of sequences to extract 

201 outfile : str or None 

202 if None, the tab file must contain 2 columns (1 for the sequence name, 

203 1 for its output file). If an outfile is given (not None), only the 1st column of tab file 

204 will be considered, and all sequences will be given 'outfile' as output file 

205 

206 Returns 

207 ------- 

208 dict 

209 {sequence_to_extract: file_to_which_it_will_be_extracted} 

210 """ 

211 to_extract = {} 

212 for line in tabf: 

213 if outfile: 

214 seq = line.split()[0].strip() 

215 out = outfile 

216 else: 

217 try: 

218 seq, out = line.split()[:2] 

219 except ValueError: 

220 logger.error(("Your file {} does not contain an output filename for {}. " 

221 "Please give an output filename for each sequence to extract, " 

222 "or give a general output filename where all sequences will " 

223 "be extracted.").format(tabf.name, line.strip())) 

224 sys.exit(1) 

225 to_extract[seq] = out 

226 return to_extract 

227 

228 

229def extract_sequences(to_extract, fasf, files_todo=None, outf=None): 

230 """ 

231 Extract sequences from an open fasta file 'fasf', and a list of sequences to 

232 extract 

233 

234 Parameters 

235 ---------- 

236 to_extract : dict or [] 

237 {sequence_to_extract: file_to_which_it_will_be_extracted} or list of sequences to 

238 extract, all in a same outfile (name must be given in 'outf') 

239 fasf : _io.TextIO 

240 open file containing sequences in multi-fasta format 

241 files_todo : list or None 

242 list of files which must be generated (prt and gen files). Others 

243 already exist, so ignore them. 

244 outf : _io.TextIO or None 

245 If an outfile is given (not None), and 'to_extract' is a dict, only its keys will be 

246 considered, and all these sequences will be extracted to 'outfile' (if 'to_extract' is a 

247 list, will extract all sequences of this list). Otherwise, if None, 

248 each sequence will be extracted to its corresponding value in 'to_extract'. 

249 """ 

250 

251 # Create optimized index for requests 

252 if files_todo is None: 

253 files_todo = [] 

254 files_todo = frozenset(files_todo) 

255 if type(to_extract) == list: 

256 to_extract = frozenset(to_extract) 

257 # State machine variables 

258 previous_fp = None 

259 

260 for line in fasf: 

261 if line[0] == '>': 

262 # Close previous file if needed 

263 if outf is None and previous_fp is not None: 

264 previous_fp.close() 

265 previous_fp = None 

266 

267 # Extract sequence name 

268 last_char = line.find('\t') 

269 if last_char == -1: 

270 last_char = line.find(' ') 

271 if last_char == -1: 

272 last_char = len(line) 

273 seq = line[1:last_char].strip() 

274 # seq = line.strip().split()[0][1:] 

275 

276 # Seq is part of sequences to extract 

277 if seq in to_extract: 

278 # Open the right file 

279 previous_fp = outf 

280 if previous_fp is None: 

281 out = to_extract[seq] 

282 if out in files_todo: 

283 previous_fp = open(out, "a") 

284 else: 

285 print(f"Sequence {seq} not written because no output file specified", file=sys.stderr) 

286 

287 # Write the line content if the file pointer is opened 

288 if previous_fp is not None: 

289 previous_fp.write(line) 

290 

291 if outf is None and previous_fp is not None: 

292 previous_fp.close()