Coverage for PanACoTA/utils_pangenome.py: 100%

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions used to deal with pangenome file

39@author gem

40April 2017

41"""

42import logging

43import os

44import sys

45from PanACoTA import utils

47logger = logging.getLogger("utils.pan")

50def read_pangenome(pangenome, logger, families=None):

51 """

52 Read pangenome information

54 Read pangenome according to what is available. First, check if python objects are available,

55 then if not, search for the binary file, and if not, read the text file.

57 Parameters

58 ----------

59 pangenome : str

60 path to pangenome file

61 families : dict or None

62 {num: [members]} if families are given. If not (must read them from binary file if exists\

63 or pangenome file otherwise), None.

65 Returns

66 -------

67 (fams_by_strain, families, all_strains) : tuple

68 with:

70 - fams_by_strain: {fam_num: {strain: [members]}}

71 - families: {fam_num: [all members]}

72 - all_strains: list of all genome names

73 """

74 if families:

75 fams_by_strain, all_strains = get_fams_info(families, logger)

76 if not os.path.isfile(pangenome + ".bin"):

77 logger.details("Saving all information to a binary file for later use")

78 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin")

79 elif os.path.isfile(pangenome + ".bin"):

80 logger.info("Retrieving info from binary file")

81 fams_by_strain, families, all_strains = utils.load_bin(pangenome + ".bin")

82 else:

83 fams_by_strain, families, all_strains = read_pan_file(pangenome, logger)

84 logger.info("Saving all information to a binary file for later use")

85 utils.save_bin([fams_by_strain, families, all_strains], pangenome + ".bin")

86 return fams_by_strain, families, all_strains

89def get_fams_info(families, logger):

90 """

91 From all families as list of members, get more information:

93 - all strains found, sorted by species name

94 - for each family, sort members by strain

96 Parameters

97 ----------

98 families : dict

99 {num: [members]}

100 logger : logging.Logger

101 logger object to write log information

102

103 Returns

104 -------

105 (fams_by_strain, sorted_all_strains) : tuple

106 with:

107

108 - fams_by_strain: {fam_num: {strain: [members], strain: [members]}}

109 - sorted_all_strains: list of all strains found, sorted by species

110 """

111 logger.info("Retrieving information from pan families")

112 fams_by_strain = {}

113 all_strains = set()

114 for num, fam in families.items():

115 fams_by_strain[num] = {}

116 for gene in fam:

117 read_gene(gene, num, fams_by_strain, all_strains)

118 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name)

119 return fams_by_strain, sort_all_strains

120

121

122def read_pan_file(filein, logger):

123 """

124 Read PanGenome file in 'filein', and put it into Python objects

125

126 Parameters

127 ----------

128 filein : str

129 path to pangenome file

130 logger :

131

132 Returns

133 -------

134 (fams_by_strain, families, sort_all_strains) : tuple

135 with:

136

137 - fams_by_strain: {fam_num: {strain: [members]}}

138 - families: {fam_num: [all members]}

139 - sort_all_strains: list of all genome names, sorted by species name

140 """

141 logger.info("Reading and getting information from pangenome file")

142 fams_by_strain = {}

143 families = {}

144 all_strains = set()

145 nfam = 0

146 with open(filein, 'r') as coref:

147 for line in coref:

148 genes = line.strip().split()

149 fam_num = genes[0]

150 fams_by_strain[fam_num] = {}

151 genes_ok = genes[1:]

152 for gene in genes_ok:

153 read_gene(gene, fam_num, fams_by_strain, all_strains)

154 families[fam_num] = genes_ok

155 nfam += 1

156 if not families or all_strains == {''}:

157 logger.error("Error in pangenome file. No family found.")

158 sys.exit(1)

159 sort_all_strains = sorted(list(all_strains), key=utils.sort_genomes_by_name)

160 return fams_by_strain, families, sort_all_strains

161

162

163def read_gene(gene, num, fams_by_strain, all_strains):

164 """

165 Read information from a given gene name, and save it to appropriate dicts

166

167 Parameters

168 ----------

169 gene : str

170 gene name (species.date.strain.contig_number

171 num : str

172 num of family from which the given gene is

173 fams_by_strain : dict

174 {fam_num: {strain: [members]}}

175 all_strains : set

176 set of all strains

177

178 """

179 # if format is ESCO.1512.00001.i001_12313 genome name is ESCO.1512.00001

180 if "." in gene and len(gene.split(".")) >= 3:

181 strain = ".".join(gene.split("_")[0].split(".")[:3])

182 # otherwise, genename is everything before the last "_"

183 else:

184 strain = "_".join(gene.split("_")[:-1])

185 if strain in fams_by_strain[num]:

186 fams_by_strain[num][strain].append(gene)

187 else:

188 fams_by_strain[num][strain] = [gene]

189 if strain not in all_strains:

190 all_strains.add(strain)

191

192

193def read_lstinfo(lstinfo, logger):

194 """

195 Read lstinfo file and return list of genomes

196

197 Parameters

198 ----------

199 lstinfo : str

200 File containing the list of all genomes to include in the pan-genome,

201 1 genome per line. Here, only the first column will be used.

202

203 Returns

204 -------

205 list

206 list of genomes

207 """

208 genomes = []

209 if not os.path.isfile(lstinfo):

210 logger.error(f"{lstinfo} file not found.")

211 sys.exit(1)

212 with open(lstinfo) as lstf:

213 for line in lstf:

214 # skip header

215 if "_name" in line:

216 continue

217 genome = line.split()[0].strip()

218 genomes.append(genome)

219 if genomes == []:

220 logger.error(f"No genome found in {lstinfo} file.")

221 sys.exit(1)

222 return genomes