Coverage for PanACoTA/align_module/get

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36import sys

37import os

38import logging

39import progressbar

41from PanACoTA import utils

43logger = logging.getLogger("align.extract")

46def get_all_seqs(all_genomes, dname, dbpath, listdir, aldir, all_fams, quiet):

47 """

48 For all genomes, extract its proteins present in a persistent family to the file

49 corresponding to this family.

51 Parameters

52 ----------

53 all_genomes : []

54 list of all genome names

55 dname : str

56 name of dataset

57 dbpath : str

58 path to folder containing 'Proteins' and 'Genes' folders

59 listdir : str

60 path to folder containing the lists of proteins/genes to extract

61 aldir : str

62 path to folder where extracted proteins/genes must be saved

63 all_fams : []

64 list of all family numbers

65 quiet : bool

66 True if nothin must be written to stdout/stderr, False otherwise

67 """

68 # Get list of files not already existing

69 files_todo = check_existing_extract(all_fams, aldir, dname)

70 if len(files_todo) == 0:

71 logger.info(("All extraction files already existing (see detailed log for "

72 "more information)"))

73 logger.warning("All prt and gene files for all families already exist. The program "

74 "will use them for the next step. If you want to re-extract a given "

75 "family, remove its prt and gen extraction files. If you want to "

76 "re-extract all families, use option -F (or --force).")

77 return

78 logger.info("Extracting proteins and genes from all genomes")

79 nbgen = len(all_genomes)

80 bar = None

81 curnum = 1

82 if not quiet:

83 widgets = ['Extraction:',

84 progressbar.Bar(marker='█', left='', right='', fill=' '),

85 ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',

86 progressbar.Percentage(), ") - ", progressbar.Timer(), ' ',

87 progressbar.ETA()]

88 bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()

89 curnum = 1

90 for genome in all_genomes:

91 ge_gen = os.path.join(listdir, dname + "-getEntry_gen_" + genome + ".txt")

92 ge_prt = os.path.join(listdir, dname + "-getEntry_prt_" + genome + ".txt")

93 logger.details(f"Extracting proteins and genes from {genome}")

94 prtdb = os.path.join(dbpath, "Proteins", genome + ".prt")

95 gendb = os.path.join(dbpath, "Genes", genome + ".gen")

96 get_genome_seqs(prtdb, ge_prt, files_todo)

97 get_genome_seqs(gendb, ge_gen, files_todo)

98 if not quiet:

99 bar.update(curnum)

100 curnum += 1

101 if not quiet:

102 bar.finish()

103

104

105def check_existing_extract(all_fams, aldir, dname):

106 """

107 For each family, check if its prt and gen extraction file already exist.

108 If both exist, no need to re-extract for those families.

109 If only one or no one exists, put to list to extract.

110

111 Parameters

112 ----------

113 all_fams : list

114 list of all family numbers

115 aldir : str

116 path to directory where extraction files must be saved

117 dname : str

118 name of the dataset

119

120 Returns

121 -------

122 []

123 list of files that must be generated (prt and gen files)

124 """

125 extract_fams = []

126 for fam in all_fams:

127 genfile = os.path.join(aldir, "{}-current.{}.gen".format(dname, fam))

128 prtfile = os.path.join(aldir, "{}-current.{}.prt".format(dname, fam))

129 if not os.path.isfile(genfile) or not os.path.isfile(prtfile):

130 # At least 1 file missing: re-extract all proteins and all genes

131 utils.remove(genfile)

132 utils.remove(prtfile)

133 # As we re-extract proteins and genes, redo alignments

134 mafft_file = os.path.join(aldir, "{}-mafft-align.{}.aln".format(dname, fam))

135 btr_file = os.path.join(aldir, "{}-mafft-prt2nuc.{}.aln".format(dname, fam))

136 utils.remove(mafft_file)

137 utils.remove(btr_file)

138 extract_fams.append(genfile)

139 extract_fams.append(prtfile)

140 # If we re-extract at least 1 family, redo the final files (concatenation and group by genome)

141 if len(extract_fams) > 0:

142 concat = os.path.join(aldir, "{}-complete.cat.aln".format(dname))

143 outdir = os.path.split(aldir)[0]

144 treedir = os.path.join(outdir, "Phylo-" + dname)

145 grpfile = os.path.join(treedir, dname + ".grp.aln")

146 utils.remove(concat)

147 utils.remove(grpfile)

148 return extract_fams

149

150

151def get_genome_seqs(fasta, tabfile, files_todo, outfile=None):

152 """

153 From a fasta file, extract all sequences given in the tab file.

154 The tab file can contain:

155

156 - 1 sequence name per line -> all sequences will be extracted to the same file

157 - 1 sequence + 1 filename per line -> each sequence will be extracted in the given file

158

159 If outfile not given, the tab file must contain 2 columns (1 for the sequence name,

160 1 for its output file). If an outfile is given, only the 1st column of tab file

161 will be considered, and all sequences will be extracted to the given outfile.

162

163 Parameters

164 ----------

165 fasta : str

166 path to fasta file from which sequences must be extracted

167 tabfile : str

168 path to the tab file containing the names of sequences to extract

169 files_todo : list

170 list of files which must be generated (prt and gen files). Others

171 already exist, so ignore them.

172 outfile : str or None

173 if None, the tab file must contain 2 columns (1 for the sequence name,

174 1 for its output file). If an outfile is given (not None), only the 1st column of tab file

175 will be considered, and all sequences will be extracted to the given outfile.

176 """

177 with open(tabfile, "r") as tabf:

178 to_extract = get_names_to_extract(tabf, outfile)

179 if outfile:

180 if os.path.isfile(outfile):

181 logger.warning("Sequences are already extracted in {}. This will "

182 "be used for next step. If you want "

183 "to re-extract all sequences, use option -F (or "

184 "--force)".format(outfile))

185 return

186 with open(fasta, "r") as fasf, open(outfile, "a") as outf:

187 extract_sequences(to_extract, fasf, outf=outf)

188 else:

189 with open(fasta, "r") as fasf:

190 extract_sequences(to_extract, fasf, files_todo=files_todo)

191

192

193def get_names_to_extract(tabf, outfile):

194 """

195 From the tab file, get names of sequences to extract.

196

197 Parameters

198 ----------

199 tabf : _io.TextIO

200 open file containing names of sequences to extract

201 outfile : str or None

202 if None, the tab file must contain 2 columns (1 for the sequence name,

203 1 for its output file). If an outfile is given (not None), only the 1st column of tab file

204 will be considered, and all sequences will be given 'outfile' as output file

205

206 Returns

207 -------

208 dict

209 {sequence_to_extract: file_to_which_it_will_be_extracted}

210 """

211 to_extract = {}

212 for line in tabf:

213 if outfile:

214 seq = line.split()[0].strip()

215 out = outfile

216 else:

217 try:

218 seq, out = line.split()[:2]

219 except ValueError:

220 logger.error(("Your file {} does not contain an output filename for {}. "

221 "Please give an output filename for each sequence to extract, "

222 "or give a general output filename where all sequences will "

223 "be extracted.").format(tabf.name, line.strip()))

224 sys.exit(1)

225 to_extract[seq] = out

226 return to_extract

227

228

229def extract_sequences(to_extract, fasf, files_todo=None, outf=None):

230 """

231 Extract sequences from an open fasta file 'fasf', and a list of sequences to

232 extract

233

234 Parameters

235 ----------

236 to_extract : dict or []

237 {sequence_to_extract: file_to_which_it_will_be_extracted} or list of sequences to

238 extract, all in a same outfile (name must be given in 'outf')

239 fasf : _io.TextIO

240 open file containing sequences in multi-fasta format

241 files_todo : list or None

242 list of files which must be generated (prt and gen files). Others

243 already exist, so ignore them.

244 outf : _io.TextIO or None

245 If an outfile is given (not None), and 'to_extract' is a dict, only its keys will be

246 considered, and all these sequences will be extracted to 'outfile' (if 'to_extract' is a

247 list, will extract all sequences of this list). Otherwise, if None,

248 each sequence will be extracted to its corresponding value in 'to_extract'.

249 """

250

251 # Create optimized index for requests

252 if files_todo is None:

253 files_todo = []

254 files_todo = frozenset(files_todo)

255 if type(to_extract) == list:

256 to_extract = frozenset(to_extract)

257 # State machine variables

258 previous_fp = None

259

260 for line in fasf:

261 if line[0] == '>':

262 # Close previous file if needed

263 if outf is None and previous_fp is not None:

264 previous_fp.close()

265 previous_fp = None

266

267 # Extract sequence name

268 last_char = line.find('\t')

269 if last_char == -1:

270 last_char = line.find(' ')

271 if last_char == -1:

272 last_char = len(line)

273 seq = line[1:last_char].strip()

274 # seq = line.strip().split()[0][1:]

275

276 # Seq is part of sequences to extract

277 if seq in to_extract:

278 # Open the right file

279 previous_fp = outf

280 if previous_fp is None:

281 out = to_extract[seq]

282 if out in files_todo:

283 previous_fp = open(out, "a")

284 else:

285 print(f"Sequence {seq} not written because no output file specified", file=sys.stderr)

286

287 # Write the line content if the file pointer is opened

288 if previous_fp is not None:

289 previous_fp.write(line)

290

291 if outf is None and previous_fp is not None:

292 previous_fp.close()

Coverage for PanACoTA/align_module/get_seqs.py: 100%

108 statements