benchmarking.py @ 1:867f17ede7f3 (draft, default, tip)

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2
author: galaxyp
date: Thu, 14 Sep 2023 12:49:19 +0000
import argparse
import os

import numpy as np
import pandas as pd

# Define the ArgumentParser
parser = argparse.ArgumentParser(
    description="Benchmark calis-p stable isotope probing (SIP) data"
)

parser.add_argument(
    "--input", type=str, metavar="data", help="Input file/folder", required=True
)

parser.add_argument(
    "--isotope_abundance_matrix",
    type=str,
    metavar="data",
    help="Isotope abundance matrix",
    required=True,
)
parser.add_argument(
    "--isotope",
    type=str,
    metavar="ISOTOPE",
    help="Isotope to benchmark, e.g. 13C",
    required=True,
)
parser.add_argument(
    "--out_summary",
    type=str,
    metavar="output",
    help="Peptide summary output",
    required=False,
)
parser.add_argument(
    "--out_filtered", type=str, metavar="output", help="Filtered output", required=False
)
parser.add_argument(
    "--nominal_values",
    type=str,
    metavar="nominal_values",
    help="Table giving the nominal label value (percent) for each MS run",
    default=None,
    required=False,
)

# Indicate end of argument definitions and parse args
args = parser.parse_args()

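# Example invocation (hypothetical file names, shown for orientation only):
#   python benchmarking.py --input calisp_output/ --isotope 13C \
#       --isotope_abundance_matrix isotope_matrix.txt \
#       --out_filtered filtered.tsv --out_summary summary.tsv \
#       --nominal_values nominal_values.tsv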

def parse_nominal_values(filename):
    # Parse a whitespace-separated table: column 1 = MS run name,
    # column 2 = nominal label value; lines starting with '#' are comments.
    nominal_values = {}
    if not filename:
        return nominal_values
    with open(filename) as fh:
        for line in fh:
            line = line.strip()
            if len(line) == 0 or line[0] == "#":
                continue
            line = line.split()
            # store as a number so it can be used in arithmetic later on
            nominal_values[line[0]] = float(line[1])
    return nominal_values

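# A --nominal_values table might look like this (hypothetical run names; the
# two-column, '#'-commented layout is what parse_nominal_values above expects):
#   # ms_run          percent label
#   run_1pct_rep1     1
#   run_5pct_rep1     5
#   run_control       0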

# Benchmarking section
# The functions for optimising calis-p data


def load_calisp_data(filename, factor):
    # (1) load data
    file_count = 1
    if os.path.isdir(filename):
        file_data = []
        file_count = len(os.listdir(filename))
        for f in os.listdir(filename):
            f = os.path.join(filename, f)
            file_data.append(pd.read_feather(f))
            # also write each feather file out as TSV next to it
            base, _ = os.path.splitext(f)
            file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False)
        data = pd.concat(file_data)
    else:
        data = pd.read_feather(filename)
        base, _ = os.path.splitext(filename)
        data.to_csv(f"{base}.tsv", sep="\t", index=False)

    # number of MS runs that actually contributed isotopic patterns
    file_success_count = len(data["ms_run"].unique())
    # (2) calculate deltas
    # ((1 - factor) / factor) - 1 == 1 / factor - 2
    data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000
    data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000
    print(
        f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)"
    )
    return data


def filter_calisp_data(data, target):
    # Keep only isotopic patterns that pass the quality filters for the
    # chosen target ("na", "fft" or "clumpy").
    if target.lower() == "na":
        subdata = data.loc[
            lambda df: (df["flag_peak_at_minus_one_pos"] == False)  # noqa: E712
            & (df["flag_pattern_is_wobbly"] == False)  # noqa: E712
            & (df["flag_psm_has_low_confidence"] == False)  # noqa: E712
            & (df["flag_psm_is_ambiguous"] == False)  # noqa: E712
            & (df["flag_pattern_is_contaminated"] == False)  # noqa: E712
            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
            :,
        ]
    elif target.lower() == "fft":
        subdata = data.loc[
            lambda df: (df["error_fft"] < 0.001)
            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
            :,
        ]
    elif target.lower() == "clumpy":
        subdata = data.loc[
            lambda df: (df["error_clumpy"] < 0.001)
            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
            :,
        ]
    else:
        raise ValueError(f"Unknown filter target: {target}")

    print(
        f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters."
    )
    return subdata


def estimate_clumpiness(data):
    subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :]
    clumpiness = []
    for c in ["c1", "c2", "c3", "c4", "c5", "c6"]:
        try:
            count, _ = np.histogram(subdata[c], bins=50)
            count = count[1:-1]
            opt = 0.02 * np.where(count == count.max())[0][0] / 0.96
            clumpiness.append(opt)
        except ValueError:
            pass
    # normalise so the clumpiness values sum to 1
    clumpiness = np.array(clumpiness)
    return clumpiness / clumpiness.sum()


# The function for benchmarking
def benchmark_sip_mock_community_data(data, factor, nominal_values):
    background_isotope = 1 - factor
    background_unlabelled = factor

    # For false positive discovery rates we set the threshold at the isotope/unlabelled
    # ratio associated with 1/4 of a generation of labeling. The E. coli values
    # (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, and we take the
    # background (1.07) into account as well.
    thresholds = {
        1: 1.07 + (1.7 - 1.07) / 4,
        5: 1.07 + (4.2 - 1.07) / 4,
        10: 1.07 + (7.1 - 1.07) / 4,
    }
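    # i.e. thresholds of roughly 1.23, 1.85 and 2.58 for 1, 5 and 10% label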

    filenames = data["ms_run"].unique()
    for fname in filenames:
        print(f"Using nominal value {nominal_values.get(fname, 0)} for {fname}")

    bin_names = data["bins"].unique()
    peptide_sequences = data["peptide"].unique()
    benchmarking = pd.DataFrame(
        columns=[
            "file",
            "bins",
            "% label",
            "ratio",
            "peptide",
            "psm_mz",
            "n(patterns)",
            "mean intensity",
            "ratio_NA median",
            "N mean",
            "ratio_NA SEM",
            "ratio_FFT median",
            "ratio_FFT SEM",
            "False Positive",
        ]
    )
    false_positives = 0
    for p in peptide_sequences:
        pep_data = data.loc[lambda df: df["peptide"] == p, :]
        for b in bin_names:
            # bindata = data.loc[lambda df: df["bins"] == b, :]
            for f in filenames:
                # look up the nominal value for the current file f
                nominal_value = nominal_values.get(f, 0)
                unlabeled_fraction = 1 - nominal_value / 100
                U = unlabeled_fraction * background_unlabelled
                I = nominal_value / 100 + unlabeled_fraction * background_isotope
                ratio = I / U * 100
                pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :]
                is_false_positive = 0
                try:
                    if (
                        b != "K12"
                        and pepfiledata["ratio_na"].median() > thresholds[nominal_value]
                    ):
                        is_false_positive = 1
                        false_positives += 1
                except KeyError:
                    pass
                benchmarking.loc[len(benchmarking)] = [
                    f,
                    b,
                    nominal_value,
                    ratio,
                    p,
                    pepfiledata["psm_mz"].median(),
                    len(pepfiledata.index),
                    pepfiledata["pattern_total_intensity"].mean(),
                    pepfiledata["ratio_na"].median(),
                    pepfiledata["psm_neutrons"].mean(),
                    pepfiledata["ratio_na"].sem(),
                    pepfiledata["ratio_fft"].median(),
                    pepfiledata["ratio_fft"].sem(),
                    is_false_positive,
                ]

    benchmarking = benchmarking.sort_values(["bins", "peptide"])
    benchmarking = benchmarking.reset_index(drop=True)
    return benchmarking


# (row, column) position of each isotope in the isotope abundance matrix
rowcol = {
    "13C": (0, 1),
    "14C": (0, 2),
    "15N": (1, 1),
    "17O": (2, 1),
    "18O": (2, 2),
    "2H": (3, 1),
    "3H": (3, 2),
    "33S": (4, 1),
    "34S": (4, 2),
    "36S": (4, 3),
}
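# The matrix file is expected to hold one whitespace-separated row of abundances
# per element, in the order C, N, O, H, S implied by the rowcol mapping above;
# '#' starts a comment. The value picked out below (`factor`) is then the
# abundance of the selected isotope from that matrix.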
with open(args.isotope_abundance_matrix) as iamf:
    matrix = []
    for line in iamf:
        line = line.strip()
        line = line.split("#")[0]
        if line == "":
            continue
        matrix.append([float(x) for x in line.split()])
factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]]
print(f"Using factor {factor}")


# cleaning and filtering data
data = load_calisp_data(args.input, factor)

if args.out_filtered:
    data = filter_calisp_data(data, "na")
    # strip modification annotations (e.g. 'Oxidation', 'Carbamidomethyl' and
    # bracketed tags) from the peptide sequences
    data["peptide_clean"] = data["peptide"]
    data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True)
    data["peptide_clean"] = data["peptide_clean"].replace(
        "'Carbamidomethyl'", "", regex=True
    )
    data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True)

    # express the ratios as percentages
    data["ratio_na"] = data["ratio_na"] * 100
    data["ratio_fft"] = data["ratio_fft"] * 100
    data.to_csv(args.out_filtered, sep="\t", index=False)

# The column "% label" indicates the amount of label applied (percentage of label in
# the glucose). The amount of labeled E. coli cells added corresponded to 1 generation
# of labeling (50% of E. coli cells were labeled in all experiments except controls).

if args.out_summary:
    nominal_values = parse_nominal_values(args.nominal_values)
    benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values)
    benchmarks.to_csv(args.out_summary, sep="\t", index=False)