Coverage for PanACoTA/prepare_module/download_genomes_func.py: 100%
99 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
3# ###############################################################################
4# This file is part of PanACOTA. #
5# #
6# Authors: Amandine Perrin #
7# Copyright © 2018-2020 Institut Pasteur (Paris). #
8# See the COPYRIGHT file for details. #
9# #
10# PanACOTA is a software providing tools for large scale bacterial comparative #
11# genomics. From a set of complete and/or draft genomes, you can: #
12# - Do a quality control of your strains, to eliminate poor quality #
13# genomes, which would not give any information for the comparative study #
14# - Uniformly annotate all genomes #
15# - Do a Pan-genome #
16# - Do a Core or Persistent genome #
17# - Align all Core/Persistent families #
18# - Infer a phylogenetic tree from the Core/Persistent families #
19# #
20# PanACOTA is free software: you can redistribute it and/or modify it under the #
21# terms of the Affero GNU General Public License as published by the Free #
22# Software Foundation, either version 3 of the License, or (at your option) #
23# any later version. #
24# #
25# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
26# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
27# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
28# for more details. #
29# #
30# You should have received a copy of the Affero GNU General Public License #
31# along with PanACOTA (COPYING file). #
32# If not, see <https://www.gnu.org/licenses/>. #
33# ###############################################################################
35"""
36Functions helping for downloading refseq genomes of a species,
37gunzip them, adding complete genomes...
39@author gem
40August 2017
41"""
42import os
43import logging
44import shutil
45import sys
46import glob
47import urllib.request
48import ncbi_genome_download as ngd
50from PanACoTA import utils
53logger = logging.getLogger("prepare.dds")
56def download_from_ncbi(species_linked, section, ncbi_species_name,
57 ncbi_species_taxid, ncbi_taxid, spe_strains, levels, outdir, threads):
58 """
59 Download ncbi genomes of given species
61 Parameters
62 ----------
63 species_linked : str
64 given NCBI species with '_' instead of spaces, or NCBI taxID if species
65 name not given
66 section : str
67 genbank or only refseq (default = refseq)
68 ncbi_species_name : str or None
69 name of species to download: user given NCBI species. None if
70 no species name given
71 ncbi_species_taxid : int
72 species taxid given in NCBI (-T option)
73 ncbi_taxid : int
74 taxid given in NCBI (-t option)
75 spe_strains : str
76 specific strain name, or comma-separated strain names
77 (or name of a file with one strain name per line)
78 outdir : str
79 Directory where downloaded sequences must be saved
80 threads : int
81 Number f threads to use to download genome sequences
83 Returns
84 -------
85 str :
86 Output filename of downloaded summary
88 """
89 # Name of summary file, with metadata for each strain:
90 sumfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt")
91 abs_sumfile = os.path.abspath(sumfile)
92 # arguments needed to download all genomes of the given species
93 abs_outdir = os.path.abspath(outdir)
94 keyargs = {"section": section, "file_formats": "fasta",
95 "output": abs_outdir,
96 "parallel": threads, "groups": "bacteria",
97 "metadata_table":abs_sumfile}
98 message = f"From {section}: "
100 # Specific strains: downloaded only if compatible with ncbi species/taxids
101 if spe_strains:
102 keyargs["strains"] = spe_strains
103 if os.path.isfile(spe_strains):
104 message += f"Downloading all strains specified in {spe_strains} file"
105 else:
106 message += f"Downloading the following specified strain(s): {spe_strains}"
107 if ncbi_species_name or ncbi_species_taxid or ncbi_taxid:
108 message += ", which also have: "
109 if ncbi_species_name:
110 keyargs["genera"] = ncbi_species_name
111 message += f"\n\t-NCBI species = {ncbi_species_name}"
112 if ncbi_species_taxid:
113 keyargs["species_taxids"] = ncbi_species_taxid
114 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"
115 if ncbi_taxid:
116 keyargs["taxids"] = ncbi_taxid
117 message += f"\n\t-NCBI_taxid = {ncbi_taxid})."
118 # Not downloading specific strains, but a sub-species: must be compatible with species given
119 elif ncbi_taxid:
120 keyargs["taxids"] = ncbi_taxid
121 message += f"Downloading genomes with NCBI_taxid = {ncbi_taxid}"
122 if ncbi_species_name or ncbi_species_taxid:
123 message += ", which also have: "
124 if ncbi_species_name:
125 keyargs["genera"] = ncbi_species_name
126 message += f"\n\t-NCBI species = {ncbi_species_name}"
127 if ncbi_species_taxid:
128 keyargs["species_taxids"] = ncbi_species_taxid
129 message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"
130 # Downloading all genomes of a species
131 else:
132 message += "Downloading all genomes of "
133 # If NCBI species given, add it to arguments to download genomes,
134 # and write it to info message
135 if ncbi_species_name:
136 keyargs["genera"] = ncbi_species_name
137 message += f"NCBI species = {ncbi_species_name}"
138 # If NCBI species given, add it to arguments to download genomes,
139 # and write it to info message
140 if ncbi_species_taxid:
141 keyargs["species_taxids"] = ncbi_species_taxid
142 if ncbi_species_name:
143 message += f" (NCBI_species_taxid = {ncbi_species_taxid})."
144 else:
145 message += f"NCBI_species_taxid = {ncbi_species_taxid}"
147 # If assembly level(s) given, add it to arguments, and write to info message
148 if levels:
149 keyargs["assembly_levels"] = levels
150 message += f" (Only those assembly levels: {levels}). "
152 logger.info(f"Metadata for all genomes will be saved in {sumfile}")
153 logger.info(message)
155 # Download genomes
156 max_retries = 15 # If connection to NCBI fails, how many retry downloads must be done
157 error_message = ("No strain correspond to your request. If you are sure there should have "
158 "some, check that you gave valid NCBI taxid and/or "
159 "NCBI species name and/or NCBI strain name. If you gave several, check that "
160 "given taxIDs and names are compatible.")
161 # widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),
162 # " - ", progressbar.Timer()]
163 # bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)
164 try:
165 # Download genomes
166 # ret = None
167 # while True:
168 # if ret:
169 # break
170 # bar.update()
171 ret = ngd.download(**keyargs)
173 except: # pragma: no cover
174 # Error message if crash during execution of ncbi_genome_download
175 logger.error(error_message)
176 # bar.finish()
177 sys.exit(1)
178 attempts = 0
179 while ret == 75 and attempts < max_retries: # pragma: no cover
180 # bar.update()
181 attempts += 1
182 logging.error(('Downloading from NCBI failed due to a connection error, '
183 'retrying. Already retried so far: %s'), attempts)
184 ret = ngd.download(**keyargs)
185 # bar.finish()
186 # Message if NGD did not manage to download the genomes (wrong species name/taxid)
187 if ret != 0:
188 # Error message
189 logger.error(error_message)
190 sys.exit(1)
191 nb_gen, db_dir = to_database(outdir, section)
192 return db_dir, nb_gen
195def to_database(outdir, section):
196 """
197 Move .fna.gz files to 'database_init' folder, and uncompress them.
199 Parameters
200 ----------
201 outdir : str
202 directory where all results are (for now, refseq/genbank folders, assembly summary and log
203 section : str
204 refseq (default) or genbank
206 Returns
207 -------
208 nb_gen : number of genomes downloaded
209 db_dir : directory where are all fna files downloaded from refseq/genbank
210 """
211 # Copy .gz files in a new folder, and Unzip them in this new folder
212 logger.info("Uncompressing genome files.")
213 # Folder where are .gz files
214 download_dir = os.path.join(outdir, section, "bacteria")
215 # If no folder output/refseq/bacteria: error, no genome found
216 # (or output/genbank/bacteria)
217 if not os.path.exists(download_dir):
218 logger.error(f"The folder containing genomes downloaded from NCBI {section} "
219 f"({download_dir}) does not exist. Check that you really downloaded "
220 "sequences (fna.gz) and that they are in this folder.")
221 sys.exit(1)
222 # If folder output/<refseq or genbank>/bacteria empty: error, no genome found
223 list_downloads = os.listdir(download_dir)
224 if list_downloads == []:
225 logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} "
226 f"({download_dir}) exists but is empty. Check that you really downloaded "
227 "sequences (fna.gz).")
228 sys.exit(1)
229 # Create directory to put uncompressed genomes
230 db_dir = os.path.join(outdir, "Database_init")
231 os.makedirs(db_dir, exist_ok=True)
232 nb_gen = 0
233 # For each subfolder of download dir, move the .gz file it contains (if possible)
234 # to the new database folder
235 for g_folder in os.listdir(download_dir):
236 fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))
237 # No .gz file in folder
238 if len(fasta) == 0:
239 logger.warning("Problem with genome in {}: no compressed fasta file downloaded. "
240 "This genome will be ignored.".format(g_folder))
241 continue
242 # Several gz files in folder
243 elif len(fasta) > 1:
244 logger.warning("Problem with genome in {}: several compressed fasta files found. "
245 "This genome will be ignored.".format(g_folder))
246 continue
247 # Copy gz file to new folder
248 fasta_file = os.path.basename(fasta[0])
249 fasta_out = os.path.join(db_dir, fasta_file)
250 shutil.copy(fasta[0], fasta_out)
251 # Uncompress file copied
252 cmd = f"gunzip {fasta_out} -f"
253 error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored."
254 call = utils.run_cmd(cmd, error)
255 # Problem with uncompressing: genome ignored (remove gz file from new folder)
256 if call.returncode != 0:
257 os.remove(fasta_out)
258 continue
259 nb_gen += 1
260 return nb_gen, db_dir