Coverage for PanACoTA/pangenome_module/post_treatment.py: 100%
49 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 14:37 +0000
1#!/usr/bin/env python3
2# coding: utf-8
4# ###############################################################################
5# This file is part of PanACOTA. #
6# #
7# Authors: Amandine Perrin #
8# Copyright © 2018-2020 Institut Pasteur (Paris). #
9# See the COPYRIGHT file for details. #
10# #
11# PanACOTA is a software providing tools for large scale bacterial comparative #
12# genomics. From a set of complete and/or draft genomes, you can: #
13# - Do a quality control of your strains, to eliminate poor quality #
14# genomes, which would not give any information for the comparative study #
15# - Uniformly annotate all genomes #
16# - Do a Pan-genome #
17# - Do a Core or Persistent genome #
18# - Align all Core/Persistent families #
19# - Infer a phylogenetic tree from the Core/Persistent families #
20# #
21# PanACOTA is free software: you can redistribute it and/or modify it under the #
22# terms of the Affero GNU General Public License as published by the Free #
23# Software Foundation, either version 3 of the License, or (at your option) #
24# any later version. #
25# #
26# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
27# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
28# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
29# for more details. #
30# #
31# You should have received a copy of the Affero GNU General Public License #
32# along with PanACOTA (COPYING file). #
33# If not, see <https://www.gnu.org/licenses/>. #
34# ###############################################################################
36"""
37Functions to generate the matrix pan_quali, pan_quanti, as well
38as a summary file for the pangenome.
40@author gem
41April 2017
42"""
43import logging
44import numpy as np
46from PanACoTA import utils
47from PanACoTA import utils_pangenome as utilsp
49logger = logging.getLogger("pangenome.post-treat")
def post_treat(families, pangenome):
    """
    Build the post-treatment outputs of a pangenome.

    The pangenome is loaded through ``utilsp.read_pangenome`` and handed over to
    ``open_outputs_to_write``, which produces:

    - a pan_quali matrix (lines = families, columns = genomes, 1 if genome present in
      family, 0 otherwise)
    - a pan_quanti matrix (lines = families, columns = genomes, number of members from
      given genome in the given family)
    - a summary file, with 1 line per family, giving:

        - nb_members: total number of members
        - sum_quanti: should be the same as nb_members!
        - sum_quali: number of different genomes in family
        - nb_0: number of missing genomes in family
        - nb_mono: number of genomes with exactly 1 member
        - nb_multi: number of genomes with more than 1 member
        - sum_0-mono-multi: should be equal to the total number of genomes in dataset
        - max_multi: maximum number of members from 1 genome

    Nothing is returned: the (qualis, quantis, summaries) tuple produced by
    ``open_outputs_to_write`` is not propagated to the caller.

    Parameters
    ----------
    families : dict
        {num_fam: [list of members]}. Can be None, and then they will be retrieved from
        the pangenome file
    pangenome : str
        file containing pangenome
    """
    # 'families' is forwarded to the reader as-is (it may be None).
    by_strain, fams, strain_names = utilsp.read_pangenome(pangenome, logger, families)
    # Returned value (qualis, quantis, summaries) is intentionally ignored here.
    open_outputs_to_write(by_strain, fams, strain_names, pangenome)
def open_outputs_to_write(fams_by_strain, families, all_strains, pangenome):
    """
    Derive the 3 output filenames from the pangenome filename, start the summary file
    with its header line, and let ``generate_and_write_outputs`` fill in the matrices
    and the summary.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}
    all_strains : list
        list of all genome names
    pangenome : str
        filename containing pangenome. '.quali.txt', '.quanti.txt' and '.summary.txt'
        are appended to it to name the 3 output files

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        whatever ``generate_and_write_outputs`` returns, i.e.:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each
          species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,
          nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}
    """
    summary_path = pangenome + ".summary.txt"
    quali_path = pangenome + ".quali.txt"
    quanti_path = pangenome + ".quanti.txt"
    # Only the summary file is opened here; the 2 matrix files are given as filenames.
    with open(summary_path, "w") as psf:
        psf.write("num_fam,nb_members,sum_quanti,sum_quali,"
                  "nb_0,nb_mono,nb_multi,sum_0_mono_multi,max_multi\n")
        return generate_and_write_outputs(fams_by_strain, families, all_strains,
                                          quali_path, quanti_path, psf)
def generate_and_write_outputs(fams_by_strain, families, all_strains, panquali, panquanti, psf):
    """
    From the python objects of pangenome, generate the qualitative and quantitative
    matrices, as well as the summary file.

    Parameters
    ----------
    fams_by_strain : dict
        {fam_num: {strain: [members]}}
    families : dict
        {fam_num: [all members]}
    all_strains : list
        list of all strains
    panquali : str
        name of the file where the qualitative matrix will be saved (given to np.savetxt)
    panquanti : str
        name of the file where the quantitative matrix will be saved (given to np.savetxt)
    psf : _io.TextIOWrapper
        open file where summary will be written

    Returns
    -------
    (qualis, quantis, summaries) : tuple

        with:

        - qualis = {fam_num: [0 if no gene for species, 1 if at least 1 gene, for each
          species in all_strains]}
        - quantis = {fam_num: [number of genes for each strain in all_strains]}
        - summaries = {fam_num: [nb_members, sum_quanti, sum_quali,
          nb_0, nb_mono, nb_multi, sum_0-mono-multi, max_multi]}
    """
    logger.info("Generating qualitative and quantitative matrix, and summary file")

    # Matrix has:
    # - 1 row per family (header will be added after)
    # - 1 column for fam nums + 1 column per strain
    # Rows are filled while iterating over fams_by_strain, so the matrices are sized
    # from that same dict: np.empty does not initialize its content, and any row left
    # unfilled would end up as arbitrary values in the output files.
    shape = (len(fams_by_strain), len(all_strains) + 1)
    matrix_quali = np.empty(shape, dtype=int)
    matrix_quanti = np.empty(shape, dtype=int)

    # also save matrix as python objects
    qualis = {}
    quantis = {}
    summaries = {}
    # Families are handled in increasing numeric order of their identifier.
    for row, fam_num in enumerate(sorted(fams_by_strain, key=int)):
        strains = fams_by_strain[fam_num]
        quali = [1 if strain in strains else 0 for strain in all_strains]
        quanti = [len(strains[strain]) if strain in strains else 0 for strain in all_strains]
        nb_0 = quanti.count(0)
        nb_mono = quanti.count(1)
        nb_multi = len(quanti) - nb_0 - nb_mono
        max_multi = max(quanti)
        # Add line to quali and quanti matrices
        matrix_quali[row, :] = [fam_num] + quali
        matrix_quanti[row, :] = [fam_num] + quanti
        # Write summary line
        summ = [len(families[fam_num]), sum(quanti), sum(quali),
                nb_0, nb_mono, nb_multi, nb_0 + nb_mono + nb_multi, max_multi]
        # NOTE(review): no explicit newline here; presumably utils.list_to_str ends
        # the string with one - confirm against utils.
        psf.write(f"{fam_num},{utils.list_to_str(summ, sep=',')}")
        # Complete python objects with quali, quanti, summary
        qualis[fam_num] = quali
        quantis[fam_num] = quanti
        summaries[fam_num] = summ
    # Add headers to quali and quanti matrix
    header = np.array(["fam_num"] + all_strains)
    matrix_quali = np.vstack((header, matrix_quali))
    matrix_quanti = np.vstack((header, matrix_quanti))
    # Transpose matrix: lines = genomes, columns = families
    tmatrix_quali = matrix_quali.transpose()
    np.savetxt(panquali, tmatrix_quali, delimiter=",", fmt="%s")
    tmatrix_quanti = matrix_quanti.transpose()
    np.savetxt(panquanti, tmatrix_quanti, delimiter=",", fmt="%s")
    return qualis, quantis, summaries