Coverage for PanACoTA/prepare_module/download_genomes_func.py: 100%

99 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2 

3# ############################################################################### 

4# This file is part of PanACOTA. # 

5# # 

6# Authors: Amandine Perrin # 

7# Copyright © 2018-2020 Institut Pasteur (Paris). # 

8# See the COPYRIGHT file for details. # 

9# # 

10# PanACOTA is a software providing tools for large scale bacterial comparative # 

11# genomics. From a set of complete and/or draft genomes, you can: # 

12# - Do a quality control of your strains, to eliminate poor quality # 

13# genomes, which would not give any information for the comparative study # 

14# - Uniformly annotate all genomes # 

15# - Do a Pan-genome # 

16# - Do a Core or Persistent genome # 

17# - Align all Core/Persistent families # 

18# - Infer a phylogenetic tree from the Core/Persistent families # 

19# # 

20# PanACOTA is free software: you can redistribute it and/or modify it under the # 

21# terms of the Affero GNU General Public License as published by the Free # 

22# Software Foundation, either version 3 of the License, or (at your option) # 

23# any later version. # 

24# # 

25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

28# for more details. # 

29# # 

30# You should have received a copy of the Affero GNU General Public License # 

31# along with PanACOTA (COPYING file). # 

32# If not, see <https://www.gnu.org/licenses/>. # 

33# ############################################################################### 

34 

35""" 

36Functions helping for downloading refseq genomes of a species, 

37gunzip them, adding complete genomes... 

38 

39@author gem 

40August 2017 

41""" 

42import os 

43import logging 

44import shutil 

45import sys 

46import glob 

47import urllib.request 

48import ncbi_genome_download as ngd 

49 

50from PanACoTA import utils 

51 

52 

53logger = logging.getLogger("prepare.dds") 

54 

55 

56def download_from_ncbi(species_linked, section, ncbi_species_name, 

57 ncbi_species_taxid, ncbi_taxid, spe_strains, levels, outdir, threads): 

58 """ 

59 Download ncbi genomes of given species 

60 

61 Parameters 

62 ---------- 

63 species_linked : str 

64 given NCBI species with '_' instead of spaces, or NCBI taxID if species 

65 name not given 

66 section : str 

67 genbank or only refseq (default = refseq) 

68 ncbi_species_name : str or None 

69 name of species to download: user given NCBI species. None if 

70 no species name given 

71 ncbi_species_taxid : int 

72 species taxid given in NCBI (-T option) 

73 ncbi_taxid : int 

74 taxid given in NCBI (-t option) 

75 spe_strains : str 

76 specific strain name, or comma-separated strain names  

77 (or name of a file with one strain name per line) 

78 outdir : str 

79 Directory where downloaded sequences must be saved 

80 threads : int 

81 Number f threads to use to download genome sequences 

82 

83 Returns 

84 ------- 

85 str : 

86 Output filename of downloaded summary 

87 

88 """ 

89 # Name of summary file, with metadata for each strain: 

90 sumfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt") 

91 abs_sumfile = os.path.abspath(sumfile) 

92 # arguments needed to download all genomes of the given species 

93 abs_outdir = os.path.abspath(outdir) 

94 keyargs = {"section": section, "file_formats": "fasta", 

95 "output": abs_outdir, 

96 "parallel": threads, "groups": "bacteria", 

97 "metadata_table":abs_sumfile} 

98 message = f"From {section}: " 

99 

100 # Specific strains: downloaded only if compatible with ncbi species/taxids 

101 if spe_strains: 

102 keyargs["strains"] = spe_strains 

103 if os.path.isfile(spe_strains): 

104 message += f"Downloading all strains specified in {spe_strains} file" 

105 else: 

106 message += f"Downloading the following specified strain(s): {spe_strains}" 

107 if ncbi_species_name or ncbi_species_taxid or ncbi_taxid: 

108 message += ", which also have: " 

109 if ncbi_species_name: 

110 keyargs["genera"] = ncbi_species_name 

111 message += f"\n\t-NCBI species = {ncbi_species_name}" 

112 if ncbi_species_taxid: 

113 keyargs["species_taxids"] = ncbi_species_taxid 

114 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}" 

115 if ncbi_taxid: 

116 keyargs["taxids"] = ncbi_taxid 

117 message += f"\n\t-NCBI_taxid = {ncbi_taxid})." 

118 # Not downloading specific strains, but a sub-species: must be compatible with species given 

119 elif ncbi_taxid: 

120 keyargs["taxids"] = ncbi_taxid 

121 message += f"Downloading genomes with NCBI_taxid = {ncbi_taxid}" 

122 if ncbi_species_name or ncbi_species_taxid: 

123 message += ", which also have: " 

124 if ncbi_species_name: 

125 keyargs["genera"] = ncbi_species_name 

126 message += f"\n\t-NCBI species = {ncbi_species_name}" 

127 if ncbi_species_taxid: 

128 keyargs["species_taxids"] = ncbi_species_taxid 

129 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}" 

130 # Downloading all genomes of a species 

131 else: 

132 message += "Downloading all genomes of " 

133 # If NCBI species given, add it to arguments to download genomes,  

134 # and write it to info message 

135 if ncbi_species_name: 

136 keyargs["genera"] = ncbi_species_name 

137 message += f"NCBI species = {ncbi_species_name}" 

138 # If NCBI species given, add it to arguments to download genomes,  

139 # and write it to info message 

140 if ncbi_species_taxid: 

141 keyargs["species_taxids"] = ncbi_species_taxid 

142 if ncbi_species_name: 

143 message += f" (NCBI_species_taxid = {ncbi_species_taxid})." 

144 else: 

145 message += f"NCBI_species_taxid = {ncbi_species_taxid}" 

146 

147 # If assembly level(s) given, add it to arguments, and write to info message 

148 if levels: 

149 keyargs["assembly_levels"] = levels 

150 message += f" (Only those assembly levels: {levels}). " 

151 

152 logger.info(f"Metadata for all genomes will be saved in {sumfile}") 

153 logger.info(message) 

154 

155 # Download genomes 

156 max_retries = 15 # If connection to NCBI fails, how many retry downloads must be done 

157 error_message = ("No strain correspond to your request. If you are sure there should have " 

158 "some, check that you gave valid NCBI taxid and/or " 

159 "NCBI species name and/or NCBI strain name. If you gave several, check that " 

160 "given taxIDs and names are compatible.") 

161 # widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")), 

162 # " - ", progressbar.Timer()] 

163 # bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50) 

164 try: 

165 # Download genomes 

166 # ret = None 

167 # while True: 

168 # if ret: 

169 # break 

170 # bar.update() 

171 ret = ngd.download(**keyargs) 

172 

173 except: # pragma: no cover 

174 # Error message if crash during execution of ncbi_genome_download 

175 logger.error(error_message) 

176 # bar.finish() 

177 sys.exit(1) 

178 attempts = 0 

179 while ret == 75 and attempts < max_retries: # pragma: no cover 

180 # bar.update() 

181 attempts += 1 

182 logging.error(('Downloading from NCBI failed due to a connection error, ' 

183 'retrying. Already retried so far: %s'), attempts) 

184 ret = ngd.download(**keyargs) 

185 # bar.finish() 

186 # Message if NGD did not manage to download the genomes (wrong species name/taxid) 

187 if ret != 0: 

188 # Error message 

189 logger.error(error_message) 

190 sys.exit(1) 

191 nb_gen, db_dir = to_database(outdir, section) 

192 return db_dir, nb_gen 

193 

194 

195def to_database(outdir, section): 

196 """ 

197 Move .fna.gz files to 'database_init' folder, and uncompress them. 

198 

199 Parameters 

200 ---------- 

201 outdir : str 

202 directory where all results are (for now, refseq/genbank folders, assembly summary and log 

203 section : str 

204 refseq (default) or genbank 

205 

206 Returns 

207 ------- 

208 nb_gen : number of genomes downloaded 

209 db_dir : directory where are all fna files downloaded from refseq/genbank 

210 """ 

211 # Copy .gz files in a new folder, and Unzip them in this new folder 

212 logger.info("Uncompressing genome files.") 

213 # Folder where are .gz files 

214 download_dir = os.path.join(outdir, section, "bacteria") 

215 # If no folder output/refseq/bacteria: error, no genome found 

216 # (or output/genbank/bacteria) 

217 if not os.path.exists(download_dir): 

218 logger.error(f"The folder containing genomes downloaded from NCBI {section} " 

219 f"({download_dir}) does not exist. Check that you really downloaded " 

220 "sequences (fna.gz) and that they are in this folder.") 

221 sys.exit(1) 

222 # If folder output/<refseq or genbank>/bacteria empty: error, no genome found 

223 list_downloads = os.listdir(download_dir) 

224 if list_downloads == []: 

225 logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} " 

226 f"({download_dir}) exists but is empty. Check that you really downloaded " 

227 "sequences (fna.gz).") 

228 sys.exit(1) 

229 # Create directory to put uncompressed genomes 

230 db_dir = os.path.join(outdir, "Database_init") 

231 os.makedirs(db_dir, exist_ok=True) 

232 nb_gen = 0 

233 # For each subfolder of download dir, move the .gz file it contains (if possible) 

234 # to the new database folder 

235 for g_folder in os.listdir(download_dir): 

236 fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz")) 

237 # No .gz file in folder 

238 if len(fasta) == 0: 

239 logger.warning("Problem with genome in {}: no compressed fasta file downloaded. " 

240 "This genome will be ignored.".format(g_folder)) 

241 continue 

242 # Several gz files in folder 

243 elif len(fasta) > 1: 

244 logger.warning("Problem with genome in {}: several compressed fasta files found. " 

245 "This genome will be ignored.".format(g_folder)) 

246 continue 

247 # Copy gz file to new folder 

248 fasta_file = os.path.basename(fasta[0]) 

249 fasta_out = os.path.join(db_dir, fasta_file) 

250 shutil.copy(fasta[0], fasta_out) 

251 # Uncompress file copied 

252 cmd = f"gunzip {fasta_out} -f" 

253 error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored." 

254 call = utils.run_cmd(cmd, error) 

255 # Problem with uncompressing: genome ignored (remove gz file from new folder) 

256 if call.returncode != 0: 

257 os.remove(fasta_out) 

258 continue 

259 nb_gen += 1 

260 return nb_gen, db_dir