Coverage for PanACoTA/prepare_module/download_genomes

1#!/usr/bin/env python3

3# ###############################################################################

4# This file is part of PanACOTA. #

5# #

6# Authors: Amandine Perrin #

8# See the COPYRIGHT file for details. #

9# #

10# PanACOTA is a software providing tools for large scale bacterial comparative #

11# genomics. From a set of complete and/or draft genomes, you can: #

12# - Do a quality control of your strains, to eliminate poor quality #

13# genomes, which would not give any information for the comparative study #

14# - Uniformly annotate all genomes #

15# - Do a Pan-genome #

16# - Do a Core or Persistent genome #

17# - Align all Core/Persistent families #

18# - Infer a phylogenetic tree from the Core/Persistent families #

19# #

20# PanACOTA is free software: you can redistribute it and/or modify it under the #

21# terms of the Affero GNU General Public License as published by the Free #

22# Software Foundation, either version 3 of the License, or (at your option) #

23# any later version. #

24# #

25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

28# for more details. #

29# #

30# You should have received a copy of the Affero GNU General Public License #

31# along with PanACOTA (COPYING file). #

32# If not, see <https://www.gnu.org/licenses/>. #

33# ###############################################################################

35"""

36Functions helping for downloading refseq genomes of a species,

37gunzip them, adding complete genomes...

39@author gem

40August 2017

41"""

42import os

43import logging

44import shutil

45import sys

46import glob

47import urllib.request

48import ncbi_genome_download as ngd

50from PanACoTA import utils

53logger = logging.getLogger("prepare.dds")

56def download_from_ncbi(species_linked, section, ncbi_species_name,

57 ncbi_species_taxid, ncbi_taxid, spe_strains, levels, outdir, threads):

58 """

59 Download ncbi genomes of given species

61 Parameters

62 ----------

63 species_linked : str

64 given NCBI species with '_' instead of spaces, or NCBI taxID if species

65 name not given

66 section : str

67 genbank or only refseq (default = refseq)

68 ncbi_species_name : str or None

69 name of species to download: user given NCBI species. None if

70 no species name given

71 ncbi_species_taxid : int

72 species taxid given in NCBI (-T option)

73 ncbi_taxid : int

74 taxid given in NCBI (-t option)

75 spe_strains : str

76 specific strain name, or comma-separated strain names

77 (or name of a file with one strain name per line)

78 outdir : str

79 Directory where downloaded sequences must be saved

80 threads : int

81 Number f threads to use to download genome sequences

83 Returns

84 -------

85 str :

86 Output filename of downloaded summary

88 """

89 # Name of summary file, with metadata for each strain:

90 sumfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt")

91 abs_sumfile = os.path.abspath(sumfile)

92 # arguments needed to download all genomes of the given species

93 abs_outdir = os.path.abspath(outdir)

94 keyargs = {"section": section, "file_formats": "fasta",

95 "output": abs_outdir,

96 "parallel": threads, "groups": "bacteria",

97 "metadata_table":abs_sumfile}

98 message = f"From {section}: "

100 # Specific strains: downloaded only if compatible with ncbi species/taxids

101 if spe_strains:

102 keyargs["strains"] = spe_strains

103 if os.path.isfile(spe_strains):

104 message += f"Downloading all strains specified in {spe_strains} file"

105 else:

106 message += f"Downloading the following specified strain(s): {spe_strains}"

107 if ncbi_species_name or ncbi_species_taxid or ncbi_taxid:

108 message += ", which also have: "

109 if ncbi_species_name:

110 keyargs["genera"] = ncbi_species_name

111 message += f"\n\t-NCBI species = {ncbi_species_name}"

112 if ncbi_species_taxid:

113 keyargs["species_taxids"] = ncbi_species_taxid

114 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"

115 if ncbi_taxid:

116 keyargs["taxids"] = ncbi_taxid

117 message += f"\n\t-NCBI_taxid = {ncbi_taxid})."

118 # Not downloading specific strains, but a sub-species: must be compatible with species given

119 elif ncbi_taxid:

120 keyargs["taxids"] = ncbi_taxid

121 message += f"Downloading genomes with NCBI_taxid = {ncbi_taxid}"

122 if ncbi_species_name or ncbi_species_taxid:

123 message += ", which also have: "

124 if ncbi_species_name:

125 keyargs["genera"] = ncbi_species_name

126 message += f"\n\t-NCBI species = {ncbi_species_name}"

127 if ncbi_species_taxid:

128 keyargs["species_taxids"] = ncbi_species_taxid

129 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"

130 # Downloading all genomes of a species

131 else:

132 message += "Downloading all genomes of "

133 # If NCBI species given, add it to arguments to download genomes,

134 # and write it to info message

135 if ncbi_species_name:

136 keyargs["genera"] = ncbi_species_name

137 message += f"NCBI species = {ncbi_species_name}"

138 # If NCBI species given, add it to arguments to download genomes,

139 # and write it to info message

140 if ncbi_species_taxid:

141 keyargs["species_taxids"] = ncbi_species_taxid

142 if ncbi_species_name:

143 message += f" (NCBI_species_taxid = {ncbi_species_taxid})."

144 else:

145 message += f"NCBI_species_taxid = {ncbi_species_taxid}"

146

147 # If assembly level(s) given, add it to arguments, and write to info message

148 if levels:

149 keyargs["assembly_levels"] = levels

150 message += f" (Only those assembly levels: {levels}). "

151

152 logger.info(f"Metadata for all genomes will be saved in {sumfile}")

153 logger.info(message)

154

155 # Download genomes

156 max_retries = 15 # If connection to NCBI fails, how many retry downloads must be done

157 error_message = ("No strain correspond to your request. If you are sure there should have "

158 "some, check that you gave valid NCBI taxid and/or "

159 "NCBI species name and/or NCBI strain name. If you gave several, check that "

160 "given taxIDs and names are compatible.")

161 # widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),

162 # " - ", progressbar.Timer()]

163 # bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)

164 try:

165 # Download genomes

166 # ret = None

167 # while True:

168 # if ret:

169 # break

170 # bar.update()

171 ret = ngd.download(**keyargs)

172

173 except: # pragma: no cover

174 # Error message if crash during execution of ncbi_genome_download

175 logger.error(error_message)

176 # bar.finish()

177 sys.exit(1)

178 attempts = 0

179 while ret == 75 and attempts < max_retries: # pragma: no cover

180 # bar.update()

181 attempts += 1

182 logging.error(('Downloading from NCBI failed due to a connection error, '

183 'retrying. Already retried so far: %s'), attempts)

184 ret = ngd.download(**keyargs)

185 # bar.finish()

186 # Message if NGD did not manage to download the genomes (wrong species name/taxid)

187 if ret != 0:

188 # Error message

189 logger.error(error_message)

190 sys.exit(1)

191 nb_gen, db_dir = to_database(outdir, section)

192 return db_dir, nb_gen

193

194

195def to_database(outdir, section):

196 """

197 Move .fna.gz files to 'database_init' folder, and uncompress them.

198

199 Parameters

200 ----------

201 outdir : str

202 directory where all results are (for now, refseq/genbank folders, assembly summary and log

203 section : str

204 refseq (default) or genbank

205

206 Returns

207 -------

208 nb_gen : number of genomes downloaded

209 db_dir : directory where are all fna files downloaded from refseq/genbank

210 """

211 # Copy .gz files in a new folder, and Unzip them in this new folder

212 logger.info("Uncompressing genome files.")

213 # Folder where are .gz files

214 download_dir = os.path.join(outdir, section, "bacteria")

215 # If no folder output/refseq/bacteria: error, no genome found

216 # (or output/genbank/bacteria)

217 if not os.path.exists(download_dir):

218 logger.error(f"The folder containing genomes downloaded from NCBI {section} "

219 f"({download_dir}) does not exist. Check that you really downloaded "

220 "sequences (fna.gz) and that they are in this folder.")

221 sys.exit(1)

222 # If folder output/<refseq or genbank>/bacteria empty: error, no genome found

223 list_downloads = os.listdir(download_dir)

224 if list_downloads == []:

225 logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} "

226 f"({download_dir}) exists but is empty. Check that you really downloaded "

227 "sequences (fna.gz).")

228 sys.exit(1)

229 # Create directory to put uncompressed genomes

230 db_dir = os.path.join(outdir, "Database_init")

231 os.makedirs(db_dir, exist_ok=True)

232 nb_gen = 0

233 # For each subfolder of download dir, move the .gz file it contains (if possible)

234 # to the new database folder

235 for g_folder in os.listdir(download_dir):

236 fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))

237 # No .gz file in folder

238 if len(fasta) == 0:

239 logger.warning("Problem with genome in {}: no compressed fasta file downloaded. "

240 "This genome will be ignored.".format(g_folder))

241 continue

242 # Several gz files in folder

243 elif len(fasta) > 1:

244 logger.warning("Problem with genome in {}: several compressed fasta files found. "

245 "This genome will be ignored.".format(g_folder))

246 continue

247 # Copy gz file to new folder

248 fasta_file = os.path.basename(fasta[0])

249 fasta_out = os.path.join(db_dir, fasta_file)

250 shutil.copy(fasta[0], fasta_out)

251 # Uncompress file copied

252 cmd = f"gunzip {fasta_out} -f"

253 error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored."

254 call = utils.run_cmd(cmd, error)

255 # Problem with uncompressing: genome ignored (remove gz file from new folder)

256 if call.returncode != 0:

257 os.remove(fasta_out)

258 continue

259 nb_gen += 1

260 return nb_gen, db_dir

Coverage for PanACoTA/prepare_module/download_genomes_func.py: 100%

99 statements