Coverage for PanACoTA/pangenome_module/post_treatment.py: 100%

49 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-20 14:37 +0000

1#!/usr/bin/env python3 

2# coding: utf-8 

3 

4# ############################################################################### 

5# This file is part of PanACOTA. # 

6# # 

7# Authors: Amandine Perrin # 

8# Copyright © 2018-2020 Institut Pasteur (Paris). # 

9# See the COPYRIGHT file for details. # 

10# # 

11# PanACOTA is a software providing tools for large scale bacterial comparative # 

12# genomics. From a set of complete and/or draft genomes, you can: # 

13# - Do a quality control of your strains, to eliminate poor quality # 

14# genomes, which would not give any information for the comparative study # 

15# - Uniformly annotate all genomes # 

16# - Do a Pan-genome # 

17# - Do a Core or Persistent genome # 

18# - Align all Core/Persistent families # 

19# - Infer a phylogenetic tree from the Core/Persistent families # 

20# # 

21# PanACOTA is free software: you can redistribute it and/or modify it under the # 

22# terms of the Affero GNU General Public License as published by the Free # 

23# Software Foundation, either version 3 of the License, or (at your option) # 

24# any later version. # 

25# # 

26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY # 

27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # 

28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License # 

29# for more details. # 

30# # 

31# You should have received a copy of the Affero GNU General Public License # 

32# along with PanACOTA (COPYING file). # 

33# If not, see <https://www.gnu.org/licenses/>. # 

34# ############################################################################### 

35 

36""" 

37Functions to generate the matrix pan_quali, pan_quanti, as well 

38as a summary file for the pangenome. 

39 

40@author gem 

41April 2017 

42""" 

43import logging 

44import numpy as np 

45 

46from PanACoTA import utils 

47from PanACoTA import utils_pangenome as utilsp 

48 

49logger = logging.getLogger("pangenome.post-treat") 

50 

51 

def post_treat(families, pangenome):
    """
    Produce the post-treatment files of a pangenome.

    Three files are written next to the pangenome file (same name, extended with
    ``.quali.txt``, ``.quanti.txt`` and ``.summary.txt``):

    - pan_quali matrix: for each (family, genome) pair, 1 if the genome has at least
      one member in the family, 0 otherwise
    - pan_quanti matrix: for each (family, genome) pair, number of members of the
      family belonging to this genome
    - summary file: 1 line per family, with the following fields:

        - nb_members: total number of members
        - sum_quanti: should be the same as nb_members!
        - sum_quali: number of different genomes in family
        - nb_0: number of missing genomes in family
        - nb_mono: number of genomes with exactly 1 member
        - nb_multi: number of genomes with more than 1 member
        - sum_0-mono-multi: should be equal to the total number of genomes in dataset
        - max_multi: maximum number of members from 1 genome

    Both matrices are transposed before being saved: in the files, lines are genomes
    and columns are families.

    Parameters
    ----------
    families : dict
        {num_fam: [list of members]}. Can be None, and then they will be retrieved
        from the pangenome file
    pangenome : str
        file containing pangenome
    """
    parsed = utilsp.read_pangenome(pangenome, logger, families)
    by_strain, all_families, strain_names = parsed
    # open_outputs_to_write returns (qualis, quantis, summaries); it is not needed
    # here, only the files written on disk matter.
    open_outputs_to_write(by_strain, all_families, strain_names, pangenome)

82 

83 

def open_outputs_to_write(fams_by_strain, families, all_strains, pangenome):
    """
    Derive the 3 output filenames from the pangenome filename, start the summary
    file with its header line, and delegate the generation and writing of the
    matrices and summary lines to ``generate_and_write_outputs``.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}
    all_strains : list
        list of all genome names
    pangenome : str
        filename containing pangenome. ``.quali.txt``, ``.quanti.txt`` and
        ``.summary.txt`` are appended to it to name the 3 output files

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        with:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each
          species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,
          nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}

    """
    quali_file = pangenome + ".quali.txt"
    quanti_file = pangenome + ".quanti.txt"
    summary_file = pangenome + ".summary.txt"
    summary_header = ("num_fam,nb_members,sum_quanti,sum_quali,"
                      "nb_0,nb_mono,nb_multi,sum_0_mono_multi,max_multi\n")
    # Only the summary is opened here: the matrices are handed over as filenames.
    with open(summary_file, "w") as psf:
        psf.write(summary_header)
        return generate_and_write_outputs(fams_by_strain, families, all_strains,
                                          quali_file, quanti_file, psf)

122 

123 

def generate_and_write_outputs(fams_by_strain, families, all_strains, panquali, panquanti, psf):
    """
    From the python objects of pangenome, generate the qualitative and quantitative
    matrices, as well as the summary file.

    The matrices are saved transposed (lines = genomes, columns = families), with a
    first line containing "fam_num" followed by the family numbers. One summary line
    is written to ``psf`` per family, in increasing order of family number.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}
    all_strains : list
        list of all strains
    panquali : str
        name of the file where the qualitative matrix will be saved
    panquanti : str
        name of the file where the quantitative matrix will be saved
    psf : _io.TextIOWrapper
        open file where summary will be written

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        with:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each
          species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,
          nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}

    """
    logger.info("Generating qualitative and quantitative matrix, and summary file")

    # Families are handled by increasing family number; this order drives both the
    # matrix rows and the summary lines.
    ordered_fams = sorted(fams_by_strain, key=int)

    # Matrix has:
    # - 1 row per family (header will be added after)
    # - 1 column for fam nums + 1 column per strain
    # np.empty does not initialise its content, so the number of rows is taken from
    # the families actually iterated below: every row is guaranteed to be filled.
    shape = (len(ordered_fams), len(all_strains) + 1)
    matrix_quali = np.empty(shape, dtype=int)
    matrix_quanti = np.empty(shape, dtype=int)

    # also save matrix as python objects
    qualis = {}
    quantis = {}
    summaries = {}
    for row, fam_num in enumerate(ordered_fams):
        strains = fams_by_strain[fam_num]
        # quali reflects the presence of the strain in the family, quanti the number
        # of members listed for it
        quali = [1 if strain in strains else 0 for strain in all_strains]
        quanti = [len(strains[strain]) if strain in strains else 0 for strain in all_strains]
        nb_0 = quanti.count(0)
        nb_mono = quanti.count(1)
        nb_multi = len(quanti) - nb_0 - nb_mono
        max_multi = max(quanti)
        # Add line to quali and quanti matrices
        matrix_quali[row, :] = [fam_num] + quali
        matrix_quanti[row, :] = [fam_num] + quanti
        # Write summary line
        summ = [len(families[fam_num]), sum(quanti), sum(quali),
                nb_0, nb_mono, nb_multi, nb_0 + nb_mono + nb_multi, max_multi]
        # NOTE(review): no "\n" is added here, so utils.list_to_str presumably ends
        # the string with a newline - confirm in utils.
        psf.write(f"{fam_num},{utils.list_to_str(summ, sep=',')}")
        # Complete python objects with quali, quanti, summary
        qualis[fam_num] = quali
        quantis[fam_num] = quanti
        summaries[fam_num] = summ

    # Add headers to quali and quanti matrix. Unpacking accepts any sequence of
    # strain names, not only a list.
    header = np.array(["fam_num", *all_strains])
    # Transpose matrix: lines = genomes, columns = families
    tmatrix_quali = np.vstack((header, matrix_quali)).transpose()
    np.savetxt(panquali, tmatrix_quali, delimiter=",", fmt="%s")
    tmatrix_quanti = np.vstack((header, matrix_quanti)).transpose()
    np.savetxt(panquanti, tmatrix_quanti, delimiter=",", fmt="%s")
    return qualis, quantis, summaries