Coverage for PanACoTA/utils_pangenome.py: 100%

74 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions used to deal with pangenome file 

38 

39@author gem 

40April 2017 

41""" 

42import logging 

43import os 

44import sys 

45from PanACoTA import utils 

46 

47logger = logging.getLogger("utils.pan") 

48 

49 

50def read_pangenome(pangenome, logger, families=None): 

51 """ 

52 Read pangenome information 

53 

54 Read pangenome according to what is available. First, check if python objects are available, 

55 then if not, search for the binary file, and if not, read the text file. 

56 

57 Parameters 

58 ---------- 

59 pangenome : str 

60 path to pangenome file 

61 families : dict or None 

62 {num: [members]} if families are given. If not (must read them from binary file if exists\ 

63 or pangenome file otherwise), None. 

64 

65 Returns 

66 ------- 

67 (fams_by_strain, families, all_strains) : tuple 

68 with: 

69 

70 - fams_by_strain: {fam_num: {strain: [members]}} 

71 - families: {fam_num: [all members]} 

72 - all_strains: list of all genome names 

73 """ 

74 if families: 

75 fams_by_strain, all_strains = get_fams_info(families, logger) 

76 if not os.path.isfile(pangenome + ".bin"): 

77 logger.details("Saving all information to a binary file for later use") 

78 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin") 

79 elif os.path.isfile(pangenome + ".bin"): 

80 logger.info("Retrieving info from binary file") 

81 fams_by_strain, families, all_strains = utils.load_bin(pangenome + ".bin") 

82 else: 

83 fams_by_strain, families, all_strains = read_pan_file(pangenome, logger) 

84 logger.info("Saving all information to a binary file for later use") 

85 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin") 

86 return fams_by_strain, families, all_strains 

87 

88 

89def get_fams_info(families, logger): 

90 """ 

91 From all families as list of members, get more information: 

92 

93 - all strains found, sorted by species name 

94 - for each family, sort members by strain 

95 

96 Parameters 

97 ---------- 

98 families : dict 

99 {num: [members]} 

100 logger : logging.Logger 

101 logger object to write log information 

102 

103 Returns 

104 ------- 

105 (fams_by_strain, sorted_all_strains) : tuple 

106 with: 

107 

108 - fams_by_strain: {fam_num: {strain: [members], strain: [members]}} 

109 - sorted_all_strains: list of all strains found, sorted by species 

110 """ 

111 logger.info("Retrieving information from pan families") 

112 fams_by_strain = {} 

113 all_strains = set() 

114 for num, fam in families.items(): 

115 fams_by_strain[num] = {} 

116 for gene in fam: 

117 read_gene(gene, num, fams_by_strain, all_strains) 

118 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name) 

119 return fams_by_strain, sort_all_strains 

120 

121 

122def read_pan_file(filein, logger): 

123 """ 

124 Read PanGenome file in 'filein', and put it into Python objects 

125 

126 Parameters 

127 ---------- 

128 filein : str 

129 path to pangenome file 

130 logger : 

131 

132 Returns 

133 ------- 

134 (fams_by_strain, families, sort_all_strains) : tuple 

135 with: 

136 

137 - fams_by_strain: {fam_num: {strain: [members]}} 

138 - families: {fam_num: [all members]} 

139 - sort_all_strains: list of all genome names, sorted by species name 

140 """ 

141 logger.info("Reading and getting information from pangenome file") 

142 fams_by_strain = {} 

143 families = {} 

144 all_strains = set() 

145 nfam = 0 

146 with open(filein, 'r') as coref: 

147 for line in coref: 

148 genes = line.strip().split() 

149 fam_num = genes[0] 

150 fams_by_strain[fam_num] = {} 

151 genes_ok = genes[1:] 

152 for gene in genes_ok: 

153 read_gene(gene, fam_num, fams_by_strain, all_strains) 

154 families[fam_num] = genes_ok 

155 nfam += 1 

156 if not families or all_strains == {''}: 

157 logger.error("Error in pangenome file. No family found.") 

158 sys.exit(1) 

159 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name) 

160 return fams_by_strain, families, sort_all_strains 

161 

162 

163def read_gene(gene, num, fams_by_strain, all_strains): 

164 """ 

165 Read information from a given gene name, and save it to appropriate dicts 

166 

167 Parameters 

168 ---------- 

169 gene : str 

170 gene name (species.date.strain.contig_number 

171 num : str 

172 num of family from which the given gene is 

173 fams_by_strain : dict 

174 {fam_num: {strain: [members]}} 

175 all_strains : set 

176 set of all strains 

177 

178 """ 

179 # if format is ESCO.1512.00001.i001_12313 genome name is ESCO.1512.00001 

180 if "." in gene and len(gene.split(".")) >= 3: 

181 strain = ".".join(gene.split("_")[0].split(".")[:3]) 

182 # otherwise, genename is everything before the last "_" 

183 else: 

184 strain = "_".join(gene.split("_")[:-1]) 

185 if strain in fams_by_strain[num]: 

186 fams_by_strain[num][strain].append(gene) 

187 else: 

188 fams_by_strain[num][strain] = [gene] 

189 if strain not in all_strains: 

190 all_strains.add(strain) 

191 

192 

193def read_lstinfo(lstinfo, logger): 

194 """ 

195 Read lstinfo file and return list of genomes 

196 

197 Parameters 

198 ---------- 

199 lstinfo : str 

200 File containing the list of all genomes to include in the pan-genome,  

201 1 genome per line. Here, only the first column will be used.  

202 

203 Returns 

204 ------- 

205 list 

206 list of genomes 

207 """ 

208 genomes = [] 

209 if not os.path.isfile(lstinfo): 

210 logger.error(f"{lstinfo} file not found.") 

211 sys.exit(1) 

212 with open(lstinfo) as lstf: 

213 for line in lstf: 

214 # skip header 

215 if "_name" in line: 

216 continue 

217 genome = line.split()[0].strip() 

218 genomes.append(genome) 

219 if genomes == []: 

220 logger.error(f"No genome found in {lstinfo} file.") 

221 sys.exit(1) 

222 return genomes