Coverage for PanACoTA/annotate_module/format_prokka.py: 100%

253 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to: 

38 

39- convert prokka tbl file to our tab file 

40- convert prokka ffn and faa headers to our format 

41- Create the database, with the following folders in the given "res_path": 

42 

43 * Proteins: multifasta with all CDS in aa 

44 * Replicons: multifasta of genome 

45 * Genes: multifasta of all genes in nuc 

46 * gff3: gff files without sequence 

47 * LSTINFO: information on annotation. Columns are: "start end strand type locus\ 

48 gene_name | product | EC_number | inference2" with the same types as prokka file,\ 

49 and strain is C (complement) or D (direct). Locus is:\ 

50 `<genome_name>.<contig_num><i or b>_<protein_num>` 

51 

52@author gem 

53April 2019 

54""" 

55 

56import os 

57import glob 

58import sys 

59import shutil 

60import logging 

61import PanACoTA.utils as utils 

62from PanACoTA.annotate_module import general_format_functions as general 

63 

64logger = logging.getLogger("annotate.prokka_format") 

65 

66 

67def format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, gene_dir, 

68 rep_dir, gff_dir): 

69 """ 

70 Format the given genome, and create its corresponding files in the following folders: 

71 

72 - Proteins 

73 - Genes 

74 - Replicons 

75 - LSTINFO 

76 - gff 

77 

78 Parameters 

79 ---------- 

80 gpath : str 

81 path to the genome sequence which was given to prokka for annotation 

82 name : str 

83 gembase name of the genome 

84 prok_path : str 

85 directory where prokka folders are saved 

86 lst_dir : srt 

87 path to LSTINFO folder 

88 prot_dir : str 

89 path to Proteins folder 

90 gene_dir : str 

91 path to Genes folder 

92 rep_dir : str 

93 path to Replicons folder 

94 gff_dir : str 

95 path to gff3 folder 

96 

97 Returns 

98 ------- 

99 bool : 

100 True if genome was correctly formatted, False otherwise 

101 """ 

102 prokka_dir = os.path.join(prok_path, os.path.basename(gpath) + "-prokkaRes") 

103 # Get needed Prokka result files 

104 fna_file = glob.glob(os.path.join(prokka_dir, "*.fna"))[0] 

105 prokka_tbl_file = glob.glob(os.path.join(prokka_dir, "*.tbl"))[0] 

106 prokka_gff_file = glob.glob(os.path.join(prokka_dir, "*.gff"))[0] 

107 prokka_ffn_file = glob.glob(os.path.join(prokka_dir, "*.ffn"))[0] 

108 prokka_faa_file = glob.glob(os.path.join(prokka_dir, "*.faa"))[0] 

109 

110 # Define names for generated gembase files 

111 res_lst_file = os.path.join(lst_dir, name + ".lst") 

112 res_gff_file = os.path.join(gff_dir, name + ".gff") 

113 res_gene_file = os.path.join(gene_dir, name + ".gen") 

114 res_prt_file = os.path.join(prot_dir, name + ".prt") 

115 res_rep_file = os.path.join(rep_dir, name + ".fna") 

116 

117 

118 # Generate replicon file (same as input sequence but with gembase formatted headers). From 

119 # this file, get contig names, to be used to generate gff file 

120 contigs, sizes = utils.get_genome_contigs_and_rename(name, fna_file, res_rep_file, logger) 

121 if not contigs: 

122 try: 

123 os.remove(res_rep_file) 

124 os.remove(res_lst_file) 

125 os.remove(res_gff_file) 

126 os.remove(res_gene_file) 

127 os.remove(res_prt_file) 

128 except OSError: 

129 pass 

130 logger.error("Problems while generating Replicon file for {}".format(name)) 

131 return False 

132 

133 # Convert prokka tbl file to gembase .lst file format 

134 ok_tbl = tbl2lst(prokka_tbl_file, res_lst_file, contigs, name, fna_file) 

135 if not ok_tbl: 

136 try: 

137 os.remove(res_rep_file) 

138 os.remove(res_lst_file) 

139 os.remove(res_gff_file) 

140 os.remove(res_gene_file) 

141 os.remove(res_prt_file) 

142 except OSError: 

143 pass 

144 logger.error("Problems while generating LSTINFO file for {}".format(name)) 

145 return False 

146 # Create gff3 file for annotations 

147 ok_gff = generate_gff(fna_file, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs) 

148 if not ok_gff: 

149 try: 

150 os.remove(res_rep_file) 

151 os.remove(res_lst_file) 

152 os.remove(res_gff_file) 

153 os.remove(res_gene_file) 

154 os.remove(res_prt_file) 

155 except OSError: 

156 pass 

157 logger.error("Problems while generating .gff file for {}".format(name)) 

158 return False 

159 # create Genes file (and check no problem occurred with return code) 

160 ok_gene = create_gen(prokka_ffn_file, res_lst_file, res_gene_file) 

161 # If gene file not created because a problem occurred, return False: 

162 # format did not run for this genome 

163 if not ok_gene: 

164 try: 

165 os.remove(res_rep_file) 

166 os.remove(res_lst_file) 

167 os.remove(res_gff_file) 

168 os.remove(res_gene_file) 

169 os.remove(res_prt_file) 

170 except OSError: 

171 pass 

172 logger.error("Problems while generating .gen file for {}".format(name)) 

173 return False 

174 

175 

176 # If gene file was created, create Proteins file 

177 ok_prt = create_prt(prokka_faa_file, res_lst_file, res_prt_file) 

178 # If protein file not created, return False: format did not run for this genome 

179 if not ok_prt: 

180 try: 

181 os.remove(res_lst_file) 

182 os.remove(res_gff_file) 

183 os.remove(res_gene_file) 

184 os.remove(res_prt_file) 

185 os.remove(res_rep_file) 

186 # Remove twice to be able to check that when there is a problem while removing files, 

187 # it generates the expected error 

188 os.remove(res_rep_file) 

189 except OSError: 

190 pass 

191 logger.error("Problems while generating .prt file for {}".format(name)) 

192 return False 

193 return True 

194 

195 

196def tbl2lst(tblfile, lstfile, contigs, genome, gpath): 

197 """ 

198 Read prokka tbl file, and convert it to the lst file. 

199 

200 * prokka tbl file format:: 

201 

202 >Feature contig_name 

203 start end type 

204 [EC_number text] 

205 [gene text] 

206 inference ab initio prediction:Prodigal:2.6 

207 [inference text] 

208 locus_tag test 

209 product text 

210 

211 where `type` can be CDS, tRNA, rRNA, etc ... 

212 lines between [] are not always present 

213 

214 * lst file format:: 

215 

216 start end strand type locus gene_name | product | EC_number | inference 2 | db_xref 

217 

218 with the same types as prokka file, and strain is C (complement) or D (direct) 

219 locus is: `<genome_name>.<contig_num><i or b>_<protein_num>` 

220 

221 Parameters 

222 ---------- 

223 tblfile : str 

224 name of prokka output tbl file to read 

225 lstfile : str 

226 name of lst file to generate 

227 contigs : dict 

228 {original_contig_name: gembase_contig_name} 

229 genome : str 

230 genome name (gembase format) 

231 gpath : str 

232 path to the genome given to prodigal. Only used for error message 

233 changed_name : bool 

234 True if contig names have been changed (cutn != 0) -> contig names end by '_num', 

235 False otherwise. 

236 

237 Returns 

238 ------- 

239 bool : 

240 True if genome name used in lstfile and prokka tblfile are the same, False otherwise 

241 """ 

242 # Protein localisation in contig (b = border ; i = inside) 

243 cont_loc = "b" 

244 prev_cont_loc = "b" 

245 # Current contig number. Used to compare with new one, to know if protein is 

246 # inside or at the border of a contig 

247 prev_cont_num = -1 

248 prev_cont_name = "" 

249 tbl_cont_name = "" 

250 tbl_cont_num = 0 # Contig number in tbl file (<orig_name>_<cont_num>) 

251 lst_cont_num = 0 # Exp cont num: new feature means exp_cont_num += 1 

252 # prev_cont_num = -1 # Previous contig number seen 

253 # Information on current feature. At the beginning, everything empty, no information 

254 gene_name = "NA" 

255 locus_num = "NA" 

256 product = "NA" 

257 ecnum = "NA" 

258 inf2 = "NA" 

259 db_xref = "NA" 

260 start = "-1" 

261 end = "-1" 

262 strand = "D" 

263 # Feature type (CDS, tRNA...) 

264 feature_type = "" 

265 

266 # Check that tblfile is not empty 

267 if os.stat(tblfile).st_size == 0: 

268 logger.error(f"{tblfile} is empty.") 

269 return False 

270 

271 with open(tblfile, "r") as tblf, open(lstfile, "w") as lstf: 

272 for line in tblf: 

273 elems = line.strip().split("\t") 

274 if line.startswith(">Feature"): 

275 tbl_cont_name = line.split()[-1] 

276 else: 

277 if not tbl_cont_name: 

278 logger.error(f"Wrong format for {tblfile}.") 

279 return False 

280 # Get line type, and retrieve info according to it 

281 # If it is not the line with start, end, type, there are only 2 elements 

282 # in the line: 

283 # - information_type information_value 

284 if len(elems) == 2: 

285 if "locus_tag" in elems[0]: 

286 locus_num = elems[-1].split("_")[-1] 

287 if "gene" in elems[0]: 

288 gene_name = elems[1] 

289 if "product" in elems[0]: 

290 product = elems[1] 

291 if "EC_number" in elems[0]: 

292 ecnum = elems[1] 

293 if "inference" in elems[0] and "ab initio prediction:Prodigal" not in line: 

294 inf2 = elems[1] 

295 if "db_xref" in elems[0]: 

296 db_xref = elems[1] 

297 # Information on start, end, type 

298 else: 

299 # new gene 

300 # if new gene is not on the same contig as previously, 

301 # get new contig and loc = 'b' 

302 if tbl_cont_name != prev_cont_name: 

303 # Check that this contig name is in the list, and get its gembase contig 

304 # number 

305 if tbl_cont_name in contigs: 

306 contig_num = contigs[tbl_cont_name].split(".")[-1] 

307 # if not in the list, problem, return false 

308 else: 

309 logger.error(f"'{tbl_cont_name}' found in {tblfile} does not exist in " 

310 f"{gpath}.") 

311 return False 

312 # Previous loc was 'i' (because we were in the same contig as the 

313 # previous one). But now, we know the it was the last gene of 

314 # its contig: we change loc to 'b' 

315 prev_cont_loc = "b" 

316 cont_loc = "b" 

317 # Same contig as previously: this gene is inside the contig (cont_loc = "i"). 

318 # If, in fact, it was the last gene of this contig, it will be changed 

319 # when discovering next gene. 

320 else: 

321 cont_loc = 'i' 

322 

323 # If not first gene of the contig, write the previous gene to .lst file 

324 # The first gene will be written while reading the 2nd gene 

325 if start != "-1" and end != "-1" and not crispr: 

326 lstline = general.write_gene(feature_type, locus_num, 

327 gene_name, product, 

328 prev_cont_loc, genome, 

329 prev_cont_num, ecnum, inf2, 

330 db_xref, strand, start, end, lstf) 

331 

332 # Get new values for the next gene: start, end, strand and feature type 

333 start, end, feature_type = elems 

334 crispr = "CRISPR" in feature_type or "repeat_region" in feature_type 

335 if crispr: 

336 continue 

337 

338 # Get strain of gene 

339 if int(end) < int(start): 

340 start, end = end, start 

341 strand = "C" 

342 else: 

343 strand = "D" 

344 # Initialize variables for next feature 

345 # (except start, end, strand and feature type that we just calculated). 

346 # prev_cont_num = exp_cont_num 

347 prev_cont_loc = cont_loc 

348 prev_cont_num = contig_num 

349 prev_cont_name = tbl_cont_name 

350 locus_num = "NA" 

351 gene_name = "NA" 

352 product = "NA" 

353 ecnum = "NA" 

354 inf2 = "NA" 

355 db_xref = "NA" 

356 

357 # Write last feature 

358 if start != -1 and end != -1: 

359 prev_cont_loc = "b" 

360 general.write_gene(feature_type, locus_num, gene_name, product, 

361 prev_cont_loc, genome, prev_cont_num, 

362 ecnum, inf2, db_xref, strand, start, end, lstf) 

363 return True 

364 

365 

366def generate_gff(gpath, prokka_gff_file, res_gff_file, res_lst_file, sizes, contigs): 

367 """ 

368 From the lstinfo file and contig names (retrieved from generation of Replicons files), 

369 generate a gff file. 

370 

371 Format: 

372 

373 ##gff-version 3 

374 ##sequence-region contig1 start end 

375 ##sequence-region contig2 start end 

376 ... 

377 seqid(=contig) source type start end score strand phase attributes 

378 

379 All fields tab separated. 

380 Empty fields contain '.' 

381 

382 For example: 

383 ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein 

384 

385 

386 genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95; 

387 

388 

389 Parameters 

390 ---------- 

391 gpath : str 

392 path to the genome sequence given to prokka. Only used for error message 

393 res-gff_file : str 

394 path to the gff file that must be created in result database 

395 res-lst_file : str 

396 path to the lst file that was created in result database in the previous step 

397 sizes : list 

398 dict of contig names with their size. {"gembase1": "size", "gembase2":"size2" ...] 

399 contigs : list 

400 dict of contig original and gembase names. {"contig1": "gembase1"...} 

401 

402 Returns 

403 ------- 

404 bool : 

405 True if conversion worked well, False otherwise 

406 """ 

407 # open gff generated by prokka to read it 

408 # open file to write new gff file (in gff3 folder) 

409 # open lst file (from LSTINFO folder) to read all annotation information saved 

410 # from prokka results 

411 with open(prokka_gff_file, "r") as prokf, open(res_lst_file, "r") as lstf, \ 

412 open(res_gff_file, "w") as gfff: 

413 # Write headers of gff3 file 

414 gfff.write("##gff-version 3\n") 

415 # Write all sequences with their size. Order by name in gembase format 

416 for old, new in sorted(contigs.items(), key=lambda items:items[1]): 

417 end = sizes[new] 

418 # Write the list of cobisntigs, with their size 

419 gfff.write(f"##sequence-region\t{new}\t{1}\t{end}\n") 

420 

421 # Now, convert each line of prokka gff to gembase formatted gff line 

422 for linegff in prokf: 

423 # Ignore gff lines starting by #. For the new gff, these are already all written at the 

424 # beginning of the file. 

425 if linegff.startswith("#"): 

426 continue 

427 # We do not write sequences 

428 if linegff.startswith('>'): 

429 break 

430 # Try to extract information on new line 

431 try: 

432 # Get all information from prokka gff line. Strip each of them as trailing whitespace 

433 # can be hidden (leading to information from gff considered as different from 

434 # information from lst) 

435 fields_g = linegff.strip().split("\t") 

436 fields_g = [info.strip() for info in fields_g] 

437 # Ignore lines with sequence 

438 # if len(fields_g) != 9: 

439 # continue 

440 (contig_name, source, type_g, start_g, end_g, 

441 score, strand_g, phase, attributes) = fields_g 

442 # Ignore CRISPR 

443 if "CRISPR" in type_g or "repeat_region" in type_g: 

444 continue 

445 # Get information given to this same sequence from the lst file 

446 # (next lst line corresponds to next gff line without #), as, for each format, 

447 # there is 1 line per gene) 

448 linelst = lstf.readline() 

449 fields_l = linelst.split("\t") 

450 fields_l = [info.strip() for info in fields_l] 

451 start_l, end_l, strand_l, type_l, locus_l, l_gene, l_info = fields_l 

452 # Get gff and ffn filenames to give information to user if error message 

453 gff = os.path.basename(prokka_gff_file) 

454 tbl = gff.replace(".gff", ".tbl") 

455 # Path where gff and ffn generated by prodigal are 

456 tmp = gpath + "-prokkaRes" 

457 # Get gene name given by prodigal to current gene 

458 gname = attributes.split("ID=")[1].split(";")[0] 

459 # Get locus_tag given by prokka to current feature (should be the same as ID) 

460 loc_name = attributes.split("locus_tag=")[1].split(";")[0] 

461 if loc_name != gname: 

462 logger.error(f"Problem in {gff}: ID={gname} whereas locus_tag={loc_name}.") 

463 return False 

464 # Compare information from lst and information from prodigal gff (start, 

465 # end and type of feature). They should correspond 

466 for (elemg, eleml, label) in zip([start_g, end_g, type_g], 

467 [start_l, end_l, type_l], 

468 ["start", "end", "type"]): 

469 # If 1 element is different (start or end position, or type), print error 

470 # message and return False: this genome could not be converted to gff 

471 if elemg != eleml: 

472 logger.error(f"Files {tbl} and {gff} (in prokka tmp_files: {tmp}) " 

473 f"do not have the same {label} value for gene {gname} ({elemg} " 

474 f"in gff, {eleml} in tbl)") 

475 return False 

476 

477 # Replace prokka ID and locus_tag with the new gene name (in gembase format), 

478 # found in the corresponding lst line 

479 at_split = attributes.split(";") 

480 newID = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split] 

481 new = [atr if "locus_tag" not in atr else f'locus_tag={locus_l}' for atr in newID] 

482 

483 # Write new line of gff file 

484 cname = contigs[contig_name] 

485 info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g, 

486 phase, ";".join(new)]) 

487 gfff.write(info + "\n") 

488 except: 

489 logger.error(f"Wrong format for {prokka_gff_file}.") 

490 return False 

491 return True 

492 

493 

494def create_gen(ffnseq, lstfile, genseq): 

495 """ 

496 Generate .gen file, from sequences contained in .ffn, but changing the 

497 headers using the information in .lst 

498 

499 Parameters 

500 ---------- 

501 ffnseq : str 

502 .ffn file generated by prokka 

503 lstfile : str 

504 lstfile converted from prokka tbl file 

505 genseq : str 

506 output file, to write in Genes directory 

507 logger : logging.Logger 

508 logger object to put information 

509 

510 Returns 

511 ------- 

512 bool : 

513 True if conversion went well, False otherwise 

514 """ 

515 problem = False 

516 write = True # Write next sequence 

517 with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen: 

518 for line_ffn in ffn: 

519 # Ignore gene that we do not want to write (should be a crispr) 

520 # If line of sequence, write it as is, and go to next line 

521 if not line_ffn.startswith(">"): 

522 # We just read a seq line. If we can write (write is True), do it and go 

523 # to next line 

524 # Otherwise, just go to next line 

525 if write: 

526 gen.write(line_ffn) 

527 continue 

528 # Try to get gene ID. If does not work, ignore this gene (it may be a 

529 # CRISPR, and we ignore them 

530 test_gen_id = line_ffn.split()[0].split("_")[-1] 

531 if not test_gen_id.isdigit(): 

532 # Maybe a CRISPR? Or wrong gene name? -> ignore 

533 logger.log(utils.detail_lvl(), 

534 f"Unknown header format for {line_ffn.strip()}. " 

535 "This gene will be ignored in .gen output file.") 

536 write = False 

537 continue 

538 # If ffn contains a gene header, find its information in lst file 

539 else: 

540 write = True 

541 lstline = lst.readline().strip() 

542 gen_id = int(test_gen_id) 

543 # genID exists, ffn header is for a gene. Check that it corresponds to 

544 # information in lst file. 

545 id_lst = lstline.split("\t")[4].split("_")[-1] 

546 gen_id_lst = int(id_lst) 

547 # in lst, find the same gene ID as in ffn (some gene IDs in lst can be absent 

548 # from ffn, if prokka do not give their sequence). 

549 # As they are ordered by increasing number, go to next lstline until 

550 # corresponding gene ID is found. However, if ffn ID > lst ID: ID does not 

551 # exist in .lst -> problem. 

552 while gen_id > gen_id_lst: 

553 lstline = lst.readline().strip() 

554 if not lstline: 

555 gen_id_lst = "-1" 

556 break 

557 id_lst = lstline.split("\t")[4].split("_")[-1] 

558 gen_id_lst = int(id_lst) 

559 # If it found the same gene ID, write info in gene file 

560 if gen_id == gen_id_lst: 

561 general.write_header(lstline.strip(), gen) 

562 # If gene ID of ffn not found, write error message and stop 

563 else: 

564 logger.error(f"Missing info for gene {line_ffn.strip()} " 

565 f"(from {ffnseq}) in {lstfile}. If it is actually present " 

566 "in the lst file, check that genes are ordered by increasing number in both lst and ffn files.") 

567 return False 

568 return True 

569 

570 

571def create_prt(faaseq, lstfile, prtseq): 

572 """ 

573 Generate .prt file, from sequences in .faa, but changing the headers 

574 using information in .lst 

575 

576 **Note:** works if proteins are in increasing order (of number after "_" in their name) 

577 in faa and tbl (hence lst) files. 

578 

579 If a header is not in the right format, or a protein exists in prt file but not in lstfile, 

580 conversion is stopped, an error message is output, and prt file is removed. 

581 

582 Parameters 

583 ---------- 

584 faaseq : str 

585 faa file output of prokka 

586 lstfile : str 

587 lstinfo converted from prokka tab file 

588 prtseq : str 

589 output file where converted proteins must be saved 

590 

591 Returns 

592 ------- 

593 bool : 

594 True if conversion went well, False otherwise 

595 """ 

596 problem = False 

597 with open(faaseq) as faa, open(lstfile) as lst, open(prtseq, "w") as prt: 

598 for line in faa: 

599 # all header lines must start with PROKKA_<geneID> 

600 if line.startswith(">"): 

601 try: 

602 # get gene ID 

603 gen_id = int(line.split()[0].split("_")[-1]) 

604 except ValueError as err: 

605 logger.error(f"Unknown header format {line.strip()} in {faaseq}. " 

606 f"Gene ID is not a number.") 

607 return False 

608 gen_id_lst = 0 

609 # get line of lst corresponding to the gene ID 

610 lstline = "" 

611 while gen_id > gen_id_lst: 

612 lstline = lst.readline().strip() 

613 id_lst = lstline.split("\t")[4].split("_")[-1] 

614 # don't cast to int if info for a crispr 

615 if id_lst.isdigit(): 

616 gen_id_lst = int(id_lst) 

617 # check that gen_id is the same as the lst line 

618 if gen_id == gen_id_lst: 

619 general.write_header(lstline, prt) 

620 else: 

621 logger.error(f"Missing info for protein {line.strip()} (from {faaseq}) " 

622 f"in {lstfile}. If it is actually present " 

623 "in the lst file, check that proteins are ordered by increasing " 

624 "number in both lst and faa files.") 

625 return False 

626 # not header: inside sequence, copy it to the .prt file 

627 else: 

628 prt.write(line) 

629 return True 

630 

631