Coverage for PanACoTA/subcommands/prepare.py: 99%

146 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Subcommand to prepare a dataset: 

38 

39- Download all genomes of a given species from refseq 

40- Filter them with L90 and number of contigs thresholds 

41- Remove too close/far genomes using Mash 

42 

43 

44@author Amandine PERRIN 

45August 2019 

46""" 

47import os 

48import sys 

49import logging 

50from termcolor import colored 

51 

52from PanACoTA import __version__ as version 

53from PanACoTA import utils 

54from PanACoTA.prepare_module import download_genomes_func as dgf 

55from PanACoTA.prepare_module import filter_genomes as fg 

56 

57 

58def main_from_parse(arguments): 

59 """ 

60 Call main function from the arguments given by parser 

61 

62 Parameters 

63 ---------- 

64 arguments : argparse.Namespace 

65 result of argparse parsing of all arguments in command line 

66 

67 """ 

68 cmd = "PanACoTA " + ' '.join(arguments.argv) 

69 main(cmd, arguments.ncbi_species_name, arguments.ncbi_species_taxid, arguments.ncbi_taxid, arguments.strains, 

70 arguments.levels, arguments.ncbi_section, arguments.outdir, arguments.tmp_dir, arguments.parallel, arguments.norefseq, 

71 arguments.db_dir, arguments.only_mash, 

72 arguments.info_file, arguments.l90, arguments.nbcont, arguments.cutn, arguments.min_dist, 

73 arguments.max_dist, arguments.verbose, arguments.quiet) 

74 

75 

76def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels, ncbi_section, 

77 outdir, tmp_dir, threads, norefseq, db_dir, 

78 only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose, quiet): 

79 """ 

80 Main method, constructing the draft dataset for the given species 

81 

82 verbosity: 

83 - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more, .log.err contains warning and more 

84 - 1: same as 0 + WARNING in stderr 

85 - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details 

86 - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything from info to debug 

87 

88 

89 Parameters 

90 ---------- 

91 cmd : str 

92 command line used to launch this program 

93 ncbi_species_name : str 

94 name of species to download, as given by NCBI 

95 ncbi_species_taxid : int 

96 species taxid given in NCBI 

97 ncbi_taxid : int 

98 NCBI taxid (sub-species) 

99 ncbi_strains : str 

100 specific strains to download 

101 levels: str 

102 Level of assembly to download. Choice between 'all', 'complete', 'chromosome', 

103 'scaffold', 'contig'. Default is 'all' 

104 outdir : str 

105 path to output directory (where created database will be saved). 

106 tmp_dir : str 

107 Path to directory where tmp files are saved (sequences split at each row of 5 'N') 

108 threads : int 

109 max number of threads to use 

110 norefseq : bool 

111 True if user does not want to download again the database 

112 db_dir : str 

113 Name of the folder where already downloaded fasta files are saved. 

114 only_mash : bool 

115 True if user user already has the database and quality of each genome (L90, #contigs etc.) 

116 info_file : str 

117 File containing information on QC if it was already ran before (columns to_annotate, 

118 gsize, nb_conts and L90). 

119 l90 : int 

120 Max L90 allowed to keep a genome 

121 nbcont : int 

122 Max number of contigs allowed to keep a genome 

123 cutn : int 

124 cut at each when there are 'cutn' N in a row. Don't cut if equal to 0 

125 min_dist : int 

126 lower limit of distance between 2 genomes to keep them 

127 max_dist : int 

128 upper limit of distance between 2 genomes to keep them (default is 0.06) 

129 verbose : int 

130 verbosity: 

131 - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more, 

132 .log.err contains warning and more 

133 - 1: same as 0 + WARNING in stderr 

134 - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details 

135 - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything 

136 from info to debug 

137 quiet : bool 

138 True if nothing must be sent to stdout/stderr, False otherwise 

139 """ 

140 

141 # get species name in NCBI format 

142 # -> will be used to name output directory 

143 # -> will be used to download summary file if given species corresponds to NCBI name 

144 if ncbi_species_name: 

145 species_linked = "_".join(ncbi_species_name.split()) 

146 species_linked = "_".join(species_linked.split("/")) 

147 

148 # if species name not given by user, use species taxID (if given) to name output directory 

149 elif ncbi_species_taxid: 

150 species_linked = str(ncbi_species_taxid) 

151 # if species name not species taxid by user, use taxID (if given) to name output directory 

152 elif ncbi_taxid: 

153 species_linked = str(ncbi_taxid) 

154 # If no species nor taxID, get specific strain names 

155 elif ncbi_strains: 

156 if os.path.isfile(ncbi_strains): 

157 species_linked = os.path.basename(ncbi_strains) 

158 species_linked = os.path.splitext(species_linked)[0] 

159 else: 

160 species_linked = "_".join(ncbi_strains.split()) 

161 species_linked = "-".join(species_linked.split("/")) 

162 species_linked = "_and_".join(species_linked.split(",")) 

163 # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA 

164 else: 

165 species_linked = "NA" 

166 # Default outdir is species name if given, or species taxID 

167 if not outdir: 

168 outdir = species_linked 

169 # Default tmp_dir is outdir/tmp_files 

170 if not tmp_dir: 

171 tmp_dir = os.path.join(outdir, "tmp_files") 

172 # directory that will be created by ncbi_genome_download 

173 ncbidir = os.path.join(outdir, ncbi_section, "bacteria") 

174 os.makedirs(outdir, exist_ok=True) 

175 os.makedirs(tmp_dir, exist_ok=True) 

176 

177 # Initialize logger 

178 # set level of logger: level is the minimum level that will be considered. 

179 if verbose <= 1: 

180 level = logging.INFO 

181 # for verbose = 2, ignore only debug 

182 if verbose >= 2 and verbose < 15: 

183 level = utils.detail_lvl() # int corresponding to detail level 

184 # for verbose >= 15, write everything 

185 if verbose >= 15: 

186 level = logging.DEBUG 

187 logfile_base = os.path.join(outdir, "PanACoTA_prepare_{}").format(species_linked) 

188 logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', log_details=True, 

189 verbose=verbose, quiet=quiet) 

190 

191 # Message on what will be done (cmd, cores used) 

192 logger.info(f'PanACoTA version {version}') 

193 logger.info("Command used\n \t > " + cmd) 

194 message = f"'PanACoTA prepare' will run on {threads} " 

195 message += f"cores" if threads>1 else "core" 

196 logger.info(message) 

197 

198 # Start prepare step 

199 # Run more than only mash filter (!only_mash): 

200 # - start from QC and mash (norefseq) 

201 # - start from genome download (!norefseq)) 

202 if not only_mash: 

203 # Not only mash, so a new info file will be created. If the user still gave an info 

204 # file (he will be warned that it will be ignored), rename it with '.bak' 

205 # to avoid erasing it 

206 if info_file and os.path.isfile(info_file): 

207 os.rename(info_file, info_file + ".back") 

208 

209 # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes 

210 # -> if not, error and exit 

211 if norefseq: 

212 logger.warning(f'You asked to skip {ncbi_section} downloads.') 

213 

214 # -> if db_dir given, watch for sequences there. If does not exist, error and exit 

215 # (user gave a directory (even if it does not exist), so we won't look for 

216 # the sequences in other folders) 

217 if db_dir: 

218 if not os.path.exists(db_dir): 

219 logger.error(f"Database folder {db_dir} supposed to contain fasta " 

220 "sequences does not " 

221 "exist. Please give a valid folder, or leave the default " 

222 "directory (no '-d' option).") 

223 sys.exit(1) 

224 # -> If user did not give db_dir, genomes could be in 

225 # outdir/Database_init/<genome_name>.fna 

226 else: 

227 db_dir = os.path.join(outdir, "Database_init") 

228 # If it does not exist, check if default compressed files folder exists. 

229 if not os.path.exists(db_dir): 

230 logger.warning(f"Database folder {db_dir} supposed to contain fasta " 

231 "sequences does not " 

232 "exist. We will check if the download folder (with compressed " 

233 "sequences) exists.") 

234 # -> if not in database_init, genomes must be in 

235 # outdir/refeq/bacteria/<genome_name>.fna.gz. In that case, 

236 # uncompress and add them to Database_init 

237 if not os.path.exists(ncbidir): 

238 logger.error(f"Folder {ncbidir} does not exist. You do not have any " 

239 "genome to analyse. Possible reasons:\n" 

240 "- if you want to rerun analysis in the same folder as " 

241 "sequences were downloaded (my_outdir/Database_init or " 

242 f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' " 

243 "option\n" 

244 "- if you want to rerun analysis and save them in a new " 

245 "output folder called 'new_outdir', make sure you have " 

246 "'-o new_outdir' option, " 

247 "and you specified where the uncompressed sequences to " 

248 "use are ('-d sequence_database_path'). ") 

249 sys.exit(1) 

250 # add genomes from refseq/bacteria folder to Database_init 

251 nb_gen, _ = dgf.to_database(outdir, ncbi_section) 

252 # No sequence: Do all steps -> download, QC, mash filter 

253 else: 

254 # Download all genomes of the given taxID 

255 db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section, ncbi_species_name, ncbi_species_taxid, 

256 ncbi_taxid, ncbi_strains, levels, outdir, threads) 

257 logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded") 

258 

259 # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones 

260 genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn) 

261 

262 # Do only mash filter. Genomes must be already downloaded, and there must be a file with 

263 # all information on these genomes (L90 etc.) 

264 else: 

265 logger.warning('You asked to run only mash steps.') 

266 if not os.path.exists(info_file): # info-file missing -> error and exit 

267 logger.error(f"Your info file {info_file} does not exist. Please provide the " 

268 "right name/path, or remove the '--mash-only option to rerun " 

269 "quality control.") 

270 sys.exit(1) 

271 logger.info(("You want to run only mash steps. Getting information " 

272 "from {}").format(info_file)) 

273 genomes = utils.read_genomes_info(info_file, species_linked, ) 

274 

275 # Run Mash 

276 # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} 

277 # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes) 

278 sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont) 

279 

280 # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90 

281 discQC = f"by-L90_nbcont-{species_linked}.txt" 

282 utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir) 

283 

284 # Remove genomes not corresponding to mash filters 

285 removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked, 

286 min_dist, max_dist, threads, quiet) 

287 # Write list of genomes kept, and list of genomes discarded by mash step 

288 info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir, species_linked, 

289 min_dist, max_dist) 

290 logger.info("End") 

291 return info_file 

292 

293 

294def build_parser(parser): 

295 """ 

296 Method to create a parser for command-line options 

297 

298 Parameters 

299 ---------- 

300 parser : argparse.ArgumentParser 

301 parser to configure in order to extract command-line arguments 

302 """ 

303 import argparse 

304 from PanACoTA import utils_argparse 

305 

306 general = parser.add_argument_group('General arguments') 

307 general.add_argument("-T", dest="ncbi_species_taxid", default="", 

308 help=("Species taxid to download, corresponding to the " 

309 "'species taxid' provided by the NCBI. " 

310 "This will download all sequences of this species and all its sub-species." 

311 "A comma-separated list of species taxids can also be provided. " 

312 "(Ex: -T 573 for Klebsiella pneumoniae)") 

313 ) 

314 general.add_argument("-t", dest="ncbi_taxid", default="", 

315 help=("Taxid to download. " 

316 "This can be the taxid of a sub-species, or of a specific strain. " 

317 "A taxid of a subspecies will download all strains in this subspecies " 

318 "EXCEPT the ones which have a specific taxid." 

319 "A comma-separated list of taxids can also be provided." 

320 "Ex: '-t 72407' will download all 'general' K. pneumoniae subsp. pneumoniae strains, " 

321 "and '-t 1123862' will download the strain K. pneumoniae subsp. pneumoniae Kp13 " 

322 "(not included in -t 72407, as it is a strain of the subspecies with a specific taxid).") 

323 ) 

324 general.add_argument("-S", dest="strains", default="", 

325 help=("List of strains to download. " 

326 "A comma-separated list of strain names is possible, " 

327 "as well as a path to a filename containing one name per line." 

328 "Ex: '-S SB2390, IA565' for Klebsiella pneumoniae SB2390 and Klebsiella pneumoniae IA565 strains" 

329 "Ex: '-S path/to/list.txt' path to file with strain names, one per line.") 

330 ) 

331 general.add_argument("-g", dest="ncbi_species_name", default="", 

332 help=("Species to download, corresponding to the " 

333 "'organism name' provided by the NCBI. Give name between " 

334 "quotes (for example \"escherichia coli\")") 

335 ) 

336 general.add_argument("-s", dest="ncbi_section", default="refseq", choices = ["refseq", "genbank"], 

337 help=("NCBI section to download: all genbank, or only refseq (default)") 

338 ) 

339 general.add_argument("-l", "--assembly_level", dest="levels", default="", 

340 help=("Assembly levels of genomes to download (default: all). " 

341 "Possible levels are: 'all', 'complete', 'chromosome', " 

342 "'scaffold', 'contig'." 

343 "You can also provide a comma-separated list of assembly " 

344 "levels. For ex: 'complete,chromosome'") 

345 ) 

346 general.add_argument("-o", dest="outdir", 

347 help=("Give the path to the directory where you want to save the " 

348 "downloaded database. In the given directory, it will create " 

349 "a folder 'Database_init' containing all fasta " 

350 "files that will be sent to the control procedure, as well as " 

351 "a folder 'refseq' with all original compressed files " 

352 "downloaded from refseq. By default, this output dir name is the " 

353 "ncbi_species name if given, or ncbi_species_taxid or ncbi_taxid otherwise.") 

354 ) 

355 general.add_argument("--tmp", dest="tmp_dir", 

356 help=("Specify where the temporary files (sequence split by rows " 

357 "of 'N', sequence with new contig names etc.) must be saved. " 

358 "By default, it will be saved in your " 

359 "out_dir/tmp_files.") 

360 ) 

361 general.add_argument("--cutn", dest="cutn", type=utils_argparse.positive_int, default=5, 

362 help=("By default, each genome will be cut into new contigs when " 

363 "at least 5 'N' in a row are found in its sequence. " 

364 "If you don't want to " 

365 "cut genomes into new contigs when there are rows of 'N', " 

366 "put 0 to this option. If you want to cut from a different number " 

367 "of 'N' in a row, put this value to this option.") 

368 ) 

369 general.add_argument("--l90", dest="l90", type=int, default=100, 

370 help="Maximum value of L90 allowed to keep a genome. Default is 100.") 

371 general.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999, 

372 help=("Maximum number of contigs allowed to keep a genome. " 

373 "Default is 999.")) 

374 general.add_argument("--min_dist", dest="min_dist", default=1e-4, 

375 type=utils_argparse.mash_dist, 

376 help="By default, genomes whose distance to the reference is not " 

377 "between 1e-4 and 0.06 are discarded. You can specify your own " 

378 "lower limit (instead of 1e-4) with this option.") 

379 general.add_argument("--max_dist", dest="max_dist", default=0.06, 

380 type=utils_argparse.mash_dist, 

381 help="By default, genomes whose distance to the reference is not " 

382 "between 1e-4 and 0.06 are discarded. You can specify your own " 

383 "lower limit (instead of 0.06) with this option.") 

384 general.add_argument("-p", "--threads", dest="parallel", type=utils_argparse.thread_num, 

385 default=1, help=("Run 'N' downloads in parallel (default=1). Put 0 if " 

386 "you want to use all cores of your computer.")) 

387 

388 optional = parser.add_argument_group('Alternatives') 

389 optional.add_argument("--norefseq", dest="norefseq", action="store_true", 

390 help=("If you already downloaded refseq genomes and do not want to " 

391 "check them, add this option to directly go to the next steps:" 

392 "quality control (L90, number of contigs...) and mash filter. " 

393 "Don't forget to specify the db_dir (-d option) where you " 

394 "already have your genome sequences.")) 

395 optional.add_argument("-d", dest="db_dir", 

396 help=("If your already downloaded sequences are not in the default " 

397 "directory (outdir/Database_init), you can specify here the " 

398 "path to those fasta files.")) 

399 optional.add_argument('-M', '--only-mash', dest="only_mash", action="store_true", 

400 help=("Add this option if you already downloaded complete and refseq " 

401 "genomes, and ran quality control (you have, a file " 

402 "containing all genome names, as well as their genome size, " 

403 "number of contigs and L90 values). " 

404 "It will then get information on genomes quality from this " 

405 "file, and run mash steps.")) 

406 optional.add_argument("--info", dest="info_file", 

407 help=("If you already ran the quality control, specify from which " 

408 "file PanACoTA can read this information, in order to proceed " 

409 "to the mash step. This file must contain at " 

410 "least 4 columns, tab separated, with the following headers: " 

411 "'to_annotate', 'gsize', 'nb_conts', 'L90'. Any other column " 

412 "will be ignored.")) 

413 

414 helper = parser.add_argument_group('Others') 

415 helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0, 

416 help="Increase verbosity in stdout/stderr and log files.") 

417 helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False, 

418 help=("Do not display anything to stdout/stderr. log files will " 

419 "still be created.")) 

420 helper.add_argument("-h", "--help", dest="help", action="help", 

421 help="show this help message and exit") 

422 

423 

424def parse(parser, argu): 

425 """ 

426 Parse arguments given to parser 

427 

428 Parameters 

429 ---------- 

430 parser : argparse.ArgumentParser 

431 the parser used 

432 argu : [str] 

433 command-line given by user, to parse using parser 

434 

435 Returns 

436 ------- 

437 argparse.Namespace 

438 Parsed arguments 

439 """ 

440 import argparse 

441 

442 args = parser.parse_args(argu) 

443 return check_args(parser, args) 

444 

445 

446def check_args(parser, args): 

447 """ 

448 Check that arguments given to parser are as expected. 

449 

450 Parameters 

451 ---------- 

452 parser : argparse.ArgumentParser 

453 The parser used to parse command-line 

454 args : argparse.Namespace 

455 Parsed arguments 

456 

457 Returns 

458 ------- 

459 argparse.Namespace or None 

460 The arguments parsed, updated according to some rules. Exit program 

461 with error message if error occurs with arguments given. 

462 

463 """ 

464 # Message if user kept default thresholds for L90 and nbcont. Just to warn him, to be sure 

465 # it was on purpose 

466 def thresholds_message(l90, nbcont): 

467 return (" !! Your genomes will be filtered, and only the ones with 'L90' <= {} " 

468 "and 'number of contigs' < {} will be kept. If you want to change those " 

469 "thresholds, use '--l90' and '--nbcont' " 

470 "options.".format(args.l90, args.nbcont)) 

471 

472 # ERRORS 

473 

474 # We don't want to run only mash, nor only quality control, but don't give a NCBI taxID. 

475 # -> Give at least 1! 

476 if (not args.only_mash and not args.norefseq and 

477 not args.ncbi_species_taxid and not args.ncbi_species_name and 

478 not args.ncbi_taxid and not args.strains): 

479 parser.error("As you did not put the '--norefseq' nor the '-M' option, it means that " 

480 "you want to download refseq (or genbank) genomes. But you did not provide " 

481 "any information, so PanACoTA cannot guess which species you want to " 

482 "download. Specify NCBI_taxid (-t), and/or NCBI species taxid (-T) " 

483 "and/or NCBI_species (-g) and/or NCBI_strain (-S) to download, or add one of " 

484 "the 2 options (--norefseq or -M) if you want to skip the 'download step'.") 

485 

486 # If norefseq, give output directory 

487 # - folder containing Database_init, with all sequences 

488 # - or new folder where you want to put the new results 

489 if args.norefseq and not args.outdir: 

490 parser.error("You must provide an output directory, where your results will be saved.") 

491 

492 # If user wants only mash steps, check that he gave info file, and outdir 

493 if args.only_mash and not args.info_file: 

494 parser.error("If you want to run only Mash filtering steps, please give the " 

495 "info file with the required information (see '--info' option)") 

496 if args.only_mash and not args.outdir: 

497 parser.error("If you want to run only Mash filtering steps, please give the " 

498 "output directory where you want to save your results (see '-o' option)") 

499 

500 # Cannot be verbose and quiet at the same time 

501 if int(args.verbose) > 0 and args.quiet: 

502 parser.error("Choose between a verbose output (-v) or a quiet output (-q)." 

503 " You cannot have both.") 

504 

505 # min_dist must be higher than max_dist 

506 if float(args.min_dist) >= float(args.max_dist): 

507 parser.error(f"min_dist ({args.min_dist}) cannot be higher " 

508 f"than max_dist ({args.max_dist})") 

509 

510 # Check that levels, if given, are among possible ones 

511 possible = ["all", "complete", "chromosome", "scaffold", "contig"] 

512 if args.levels: 

513 for level in args.levels.split(","): 

514 if level not in possible: 

515 parser.error("Please choose between available assembly levels: 'all', 'complete', " 

516 "'chromosome', 'scaffold', 'contig'. If several levels, provide a " 

517 f"comma-separated list. Invalid value: '{args.levels}'") 

518 

519 # WARNINGS 

520 # User did not specify a species name 

521 if not args.ncbi_species_name and not args.outdir: 

522 if args.ncbi_species_taxid: 

523 print(colored("WARNING: you did not provide a species name ('-g species' option) " 

524 "nor an output directory ('-o outdir'). " 

525 "All files will be downloaded in a folder called with the NCBI species " 

526 f"taxid {args.ncbi_species_taxid} instead of the species name.", "yellow")) 

527 elif args.strains: 

528 print(colored("WARNING: you did not provide a species name ('-g species' option) " 

529 "nor a species taxid ('-T spetaxid') nor an output directory ('-o outdir'). " 

530 "All files will be downloaded in a folder called with the specified strains " 

531 f"names {args.strains} instead of the species name.", "yellow")) 

532 else: 

533 print(colored("WARNING: you did not provide a species name ('-g species' option) " 

534 "nor a species taxid ('-T spetaxid') nor an output directory ('-o outdir'). " 

535 "All files will be downloaded in a folder called with the NCBI " 

536 f"taxid {args.ncbi_taxid}.", "yellow")) 

537 # If user wants to cut genomes, warn him to check that it is on purpose (because default is cut at each 5'N') 

538 if args.cutn == 5: 

539 message = (" !! Your genomes will be split when sequence contains at " 

540 "least {}'N' in a row. If you want to change this threshold, use " 

541 "'--cutn n' option (n=0 if you do not want to cut)").format(args.cutn) 

542 print(colored(message, "yellow")) 

543 

544 # Warn user about selection of genomes thresholds 

545 if args.l90 == 100 or args.nbcont == 999: 

546 print(colored(thresholds_message(args.l90, args.nbcont), "yellow")) 

547 

548 # Warn if user gave info file, but does not ask to run only Mash -> info file will be ignored 

549 if (args.info_file and not args.only_mash) or (args.info_file and not args.norefseq): 

550 message = (" !! You gave an info file (--info option), but did not ask to run only Mash " 

551 "step (-M option). Your info file will be ignored (and renamed with '.back' " 

552 "at the end), and another one will " 

553 "be created with the new calculated values.") 

554 print(colored(message, "yellow")) 

555 return args 

556 

557 

558 

559if __name__ == '__main__': 

560 import argparse 

561 from textwrap import dedent 

562 header = ''' 

563 ___ _____ ___ _____ _____ 

564 ( _`\ ( _ )( _`\ (_ _)( _ ) 

565 | |_) ) _ _ ___ | (_) || ( (_) _ | | | (_) | 

566 | ,__/'/'_` )/' _ `\| _ || | _ /'_`\ | | | _ | 

567 | | ( (_| || ( ) || | | || (_( )( (_) )| | | | | | 

568 (_) `\__,_)(_) (_)(_) (_)(____/'`\___/'(_) (_) (_) 

569 

570 

571 Large scale comparative genomics tools 

572 

573 ------------------------------------------- 

574 ''' 

575 my_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 

576 description=dedent(header), add_help=False) 

577 build_parser(my_parser) 

578 OPTIONS = parse(my_parser, sys.argv[1:]) 

579 main_from_parse(OPTIONS)