Coverage for PanACoTA/pangenome_module/post

1#!/usr/bin/env python3

2# coding: utf-8

4# ###############################################################################

5# This file is part of PanACOTA. #

6# #

7# Authors: Amandine Perrin #

9# See the COPYRIGHT file for details. #

10# #

11# PanACOTA is a software providing tools for large scale bacterial comparative #

12# genomics. From a set of complete and/or draft genomes, you can: #

13# - Do a quality control of your strains, to eliminate poor quality #

14# genomes, which would not give any information for the comparative study #

15# - Uniformly annotate all genomes #

16# - Do a Pan-genome #

17# - Do a Core or Persistent genome #

18# - Align all Core/Persistent families #

19# - Infer a phylogenetic tree from the Core/Persistent families #

20# #

21# PanACOTA is free software: you can redistribute it and/or modify it under the #

22# terms of the Affero GNU General Public License as published by the Free #

23# Software Foundation, either version 3 of the License, or (at your option) #

24# any later version. #

25# #

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #

29# for more details. #

30# #

31# You should have received a copy of the Affero GNU General Public License #

32# along with PanACOTA (COPYING file). #

33# If not, see <https://www.gnu.org/licenses/>. #

34# ###############################################################################

36"""

37Functions to generate the pan_quali and pan_quanti matrices, as well

38as a summary file for the pangenome.

40@author gem

41April 2017

42"""

43import logging

44import numpy as np

46from PanACoTA import utils

47from PanACoTA import utils_pangenome as utilsp

49logger = logging.getLogger("pangenome.post-treat")

def post_treat(families, pangenome):
    """
    Post-treat a pangenome: build its qualitative and quantitative matrices and
    its per-family summary, and write all three next to the pangenome file.

    The outputs are:

    - a pan_quali matrix (lines = families, columns = genomes, 1 if genome present in\
    family, 0 otherwise)
    - a pan_quanti matrix (lines = families, columns = genomes, number of members from given\
    genome in the given family)
    - a summary file with one line per family, giving:

        - nb_members: total number of members
        - sum_quanti: should be the same as nb_members!
        - sum_quali: number of different genomes in family
        - nb_0: number of missing genomes in family
        - nb_mono: number of genomes with exactly 1 member
        - nb_multi: number of genomes with more than 1 member
        - sum_0-mono-multi: should be equal to the total number of genomes in dataset
        - max_multi: maximum number of members from 1 genome

    Parameters
    ----------
    families : dict
        {num_fam: [list of members]}. Can be None, and then they will be retrieved from the\
        pangenome file
    pangenome : str
        file containing pangenome. Also used as the prefix of the 3 output files.
    """
    parsed = utilsp.read_pangenome(pangenome, logger, families)
    by_strain, fam_members, strain_names = parsed
    # open_outputs_to_write returns (qualis, quantis, summaries); the value is
    # deliberately not propagated, only the written files matter here.
    open_outputs_to_write(by_strain, fam_members, strain_names, pangenome)

def open_outputs_to_write(fams_by_strain, families, all_strains, pangenome):
    """
    Derive the 3 output filenames from the pangenome filename, open the summary
    file and write its header, then delegate the generation of both matrices and
    of the summary lines to `generate_and_write_outputs`.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}
    all_strains : list
        list of all genome names
    pangenome : str
        filename containing pangenome. It is suffixed with ``.quali.txt``,
        ``.quanti.txt`` and ``.summary.txt`` to name the 3 output files.

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        with:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each\
        species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,\
        nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}

    """
    quali_path = pangenome + ".quali.txt"
    quanti_path = pangenome + ".quanti.txt"
    summary_path = pangenome + ".summary.txt"
    summary_header = ("num_fam,nb_members,sum_quanti,sum_quali,"
                      "nb_0,nb_mono,nb_multi,sum_0_mono_multi,max_multi\n")
    # Only the summary file is opened here: the two matrices are handed over as
    # filenames and written by the callee.
    with open(summary_path, "w") as psf:
        psf.write(summary_header)
        return generate_and_write_outputs(fams_by_strain, families, all_strains,
                                          quali_path, quanti_path, psf)

122

123

def generate_and_write_outputs(fams_by_strain, families, all_strains, panquali, panquanti, psf):
    """
    From the python objects of pangenome, generate qualitative and quantitative matrix,
    as well as summary file.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}. Must contain every family of `fams_by_strain`.
    all_strains : list
        list of all strains
    panquali : str
        name of the file where the qualitative matrix will be written
    panquanti : str
        name of the file where the quantitative matrix will be written
    psf : _io.TextIOWrapper
        open file where summary will be written

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        with:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each\
        species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,\
        nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}

    """
    logger.info("Generating qualitative and quantitative matrix, and summary file")

    # Families are processed (and written) in increasing numerical order.
    fam_nums = sorted(fams_by_strain, key=int)

    # Matrix has:
    # - 1 row per family (header will be added after)
    # - 1 column for fam nums + 1 column per strain
    # np.empty leaves the memory uninitialized, so the number of rows must be
    # exactly the number of families filled in below: size it from the dict
    # that is iterated, not from `families`.
    shape = (len(fam_nums), len(all_strains) + 1)
    matrix_quali = np.empty(shape, dtype=int)
    matrix_quanti = np.empty(shape, dtype=int)

    # also save matrix as python objects
    qualis = {}
    quantis = {}
    summaries = {}
    for row, fam_num in enumerate(fam_nums):
        strains = fams_by_strain[fam_num]
        quali = [1 if strain in strains else 0 for strain in all_strains]
        quanti = [len(strains[strain]) if strain in strains else 0 for strain in all_strains]
        nb_0 = quanti.count(0)
        nb_mono = quanti.count(1)
        # Everything which is neither absent nor single-copy is multi-copy
        nb_multi = len(quanti) - nb_0 - nb_mono
        # default=0 keeps an empty strain list from raising ValueError
        max_multi = max(quanti, default=0)
        # Add line to quali and quanti matrices (first column = family number)
        matrix_quali[row, :] = [fam_num] + quali
        matrix_quanti[row, :] = [fam_num] + quanti
        # Write summary line
        summ = [len(families[fam_num]), sum(quanti), sum(quali),
                nb_0, nb_mono, nb_multi, nb_0 + nb_mono + nb_multi, max_multi]
        # NOTE(review): no "\n" is appended here, so utils.list_to_str is presumably
        # expected to terminate the line itself - confirm against utils.
        psf.write(f"{fam_num},{utils.list_to_str(summ, sep=',')}")
        # Complete python objects with quali, quanti, summary
        qualis[fam_num] = quali
        quantis[fam_num] = quanti
        summaries[fam_num] = summ

    # Add headers to quali and quanti matrix. Stacking a row of strings on top of
    # the integer matrices turns them into string arrays, hence fmt="%s" below.
    header = np.array(["fam_num", *all_strains])
    matrix_quali = np.vstack((header, matrix_quali))
    matrix_quanti = np.vstack((header, matrix_quanti))
    # Transpose matrix: lines = genomes, columns = families
    np.savetxt(panquali, matrix_quali.transpose(), delimiter=",", fmt="%s")
    np.savetxt(panquanti, matrix_quanti.transpose(), delimiter=",", fmt="%s")
    return qualis, quantis, summaries

Coverage for PanACoTA/pangenome_module/post_treatment.py: 100%

49 statements