0
|
1 #!/usr/bin/env python
|
|
2 import optparse
|
|
3 import sys
|
|
4
|
|
5 try:
|
|
6 # Ubuntu deps: gfortan libblas-dev liblapack-dev
|
|
7 # pip deps: numpy scipy
|
|
8 from math import sqrt
|
|
9 from scipy.optimize import root
|
|
10 from numpy import arange, exp, concatenate, sum, log, array, seterr
|
|
11 except ImportError:
|
|
12 # Allow this script to be used for global FDR even
|
|
13 # if these dependencies are not present.
|
|
14 pass
|
|
15
|
|
16
|
|
17 SEPARATORS = {"TAB": "\t",
|
|
18 "SPACE": " ",
|
|
19 "COMMA": ","
|
|
20 }
|
|
21
|
|
22
|
|
23 def __main__():
|
|
24 run_script()
|
|
25
|
|
26
|
|
27 def append_fdr(input_file, output, settings):
|
|
28 sorted_scores, accum_hits, accum_decoys = _accum_decoys(input_file, settings)
|
|
29 fdr_array = compute_fdr(sorted_scores, accum_hits, accum_decoys, settings)
|
|
30 index = 0
|
|
31 for line in __read_lines(input_file):
|
|
32 if not line or line.startswith('#'):
|
|
33 continue
|
|
34 entry = Entry(line, settings, index)
|
|
35 this_fdr = fdr_array[entry.score]
|
|
36 new_line = "%s%s%f" % (line, settings["separator"], this_fdr)
|
|
37 print >> output, new_line
|
|
38 index += 1
|
|
39
|
|
40
|
|
41 def compute_fdr(sorted_scores, accum_hits, accum_decoys, settings):
|
|
42 fdr_type = settings["fdr_type"]
|
|
43 compute_functions = {"global_conservative": _compute_fdr_global_conservative,
|
|
44 "global_permissive": _compute_fdr_global_permissive,
|
|
45 #"pspep": _compute_pspep
|
|
46 }
|
|
47 return compute_functions[fdr_type](sorted_scores, accum_hits, accum_decoys, settings)
|
|
48 #return compute_functions[fdr_type](all_hits_array, decoy_hits_array, settings)
|
|
49
|
|
50
|
|
51 def _compute_pspep(all_hits, decoy_hits, settings):
|
|
52 scaling = _get_scaling(settings)
|
|
53 seterr(all="ignore")
|
|
54 sigma = array([sqrt(x) if x > 0 else 0.2 for x in decoy_hits])
|
|
55 if isinstance(all_hits, list):
|
|
56 all_hits = array(all_hits)
|
|
57 if isinstance(decoy_hits, list):
|
|
58 decoy_hits = array(decoy_hits)
|
|
59 searchSeg = concatenate((exp(arange(-8, 9, 2)), -1 * exp(arange(-8, 9, 2))))
|
|
60 bestResids = sys.float_info.max
|
|
61 bestResidsComb = [0.0, 0.0, 0.0]
|
|
62 for aEst in searchSeg:
|
|
63 for bEst in searchSeg:
|
|
64 for cEst in searchSeg:
|
|
65 try:
|
|
66 sol = _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma)
|
|
67 if sol[3] and sol[3] < bestResids:
|
|
68 bestResids = sol[3]
|
|
69 bestResidsComb = sol[0:3]
|
|
70 except:
|
|
71 pass
|
|
72 (a, b, c) = bestResidsComb[0:3]
|
|
73 fdr_local = scaling * (exp(b * (all_hits - a)) / (exp(b * (all_hits - a)) + 1)) * c
|
|
74 return fdr_local
|
|
75
|
|
76
|
|
77 def _get_scaling(settings):
|
|
78 scaling = float(settings.get("scaling", "2.0"))
|
|
79 return scaling
|
|
80
|
|
81
|
|
82 def _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma, scaling=2):
|
|
83 guess = [aEst, bEst, cEst]
|
|
84
|
|
85 def f(a, b, c):
|
|
86 return c * (log(exp(b * (all_hits - a)) + 1) - log(exp(-b * a) + 1)) / b
|
|
87
|
|
88 def fcn(p):
|
|
89 a = p[0]
|
|
90 b = p[1]
|
|
91 c = p[2]
|
|
92 return (decoy_hits - f(a, b, c)) / sigma
|
|
93
|
|
94 solution = root(fcn, guess, method='lm')
|
|
95 a = solution.x[0]
|
|
96 b = solution.x[1]
|
|
97 c = solution.x[2]
|
|
98 resids = sum((decoy_hits - f(a, b, c)) ** 2) / len(all_hits)
|
|
99 return (a, b, c, resids)
|
|
100
|
|
101
|
|
102 def _compute_fdr_global_conservative(sorted_scores, accum_hits, accum_decoys, settings):
|
|
103 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings)
|
|
104 fdrs = {}
|
|
105 max_fdr = -1
|
|
106 for score in sorted_scores:
|
|
107 raw_fdr = raw_fdrs[score]
|
|
108 if raw_fdr > max_fdr:
|
|
109 max_fdr = raw_fdr
|
|
110 fdrs[score] = max_fdr
|
|
111 return fdrs
|
|
112
|
|
113
|
|
114 def _compute_fdr_global_permissive(sorted_scores, accum_hits, accum_decoys, settings):
|
|
115 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings)
|
|
116 fdrs = {}
|
|
117 index = len(sorted_scores) - 1
|
|
118 min_fdr = 1
|
|
119 while index >= 0:
|
|
120 score = sorted_scores[index]
|
|
121 raw_fdr = raw_fdrs[score]
|
|
122 if raw_fdr < min_fdr:
|
|
123 min_fdr = raw_fdr
|
|
124 fdrs[score] = min_fdr
|
|
125 index -= 1
|
|
126 return fdrs
|
|
127
|
|
128
|
|
129 def build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings):
|
|
130 scaling = _get_scaling(settings)
|
|
131 fdrs = {}
|
|
132 for score in sorted_scores:
|
|
133 fdrs[score] = (scaling * accum_decoys[score]) / accum_hits[score]
|
|
134 return fdrs
|
|
135
|
|
136
|
|
137 def __read_lines(input_file):
|
|
138 with open(input_file, 'r') as input:
|
|
139 for i, line in enumerate(input):
|
|
140 line = line.rstrip('\r\n')
|
|
141 yield line
|
|
142
|
|
143
|
|
144 def __read_entries(input_file, settings):
|
|
145 total_hits = 0
|
|
146 for line in __read_lines(input_file):
|
|
147 if not line or line.startswith('#'):
|
|
148 continue
|
|
149 entry = Entry(line, settings, total_hits)
|
|
150 total_hits = total_hits + 1
|
|
151 yield entry
|
|
152
|
|
153
|
|
154 class Entry(object):
|
|
155
|
|
156 def __init__(self, line, settings, index):
|
|
157 self.settings = settings
|
|
158 line_parts = line.split(settings["separator"])
|
|
159 self.identifier = line_parts[settings["identifiers_index"]]
|
|
160 if settings["score_column"]:
|
|
161 self.score = float(line_parts[settings["score_column"]])
|
|
162 else:
|
|
163 self.score = index
|
|
164
|
|
165 @property
|
|
166 def is_decoy(self):
|
|
167 return self.identifier.startswith(self.settings["decoy_prefix"])
|
|
168
|
|
169
|
|
170 def _accum_decoys(input_file, settings):
|
|
171 hits_at_score = {}
|
|
172 decoys_at_score = {}
|
|
173 for entry in __read_entries(input_file, settings):
|
|
174 score = entry.score
|
|
175 score_total = hits_at_score.get(score, 0) + 1
|
|
176 score_decoys = decoys_at_score.get(score, 0) + (1 if entry.is_decoy else 0)
|
|
177 hits_at_score[score] = score_total
|
|
178 decoys_at_score[score] = score_decoys
|
|
179 sorted_scores = sorted(hits_at_score, reverse=not settings["invert_score"])
|
|
180 accum_hits = {}
|
|
181 accum_decoys = {}
|
|
182 accum_hit_count = 0
|
|
183 accum_decoy_count = 0
|
|
184 for score in sorted_scores:
|
|
185 accum_decoy_count += decoys_at_score[score]
|
|
186 accum_hit_count += hits_at_score[score]
|
|
187 accum_hits[score] = accum_hit_count
|
|
188 accum_decoys[score] = accum_decoy_count
|
|
189 return (sorted_scores, accum_hits, accum_decoys)
|
|
190
|
|
191
|
|
192 def _build_arrays(input_file, settings, sorted_scores, accum_hits, accum_decoys):
|
|
193 all_hits = []
|
|
194 decoy_hits = []
|
|
195 for entry in __read_entries(input_file, settings):
|
|
196 score = entry.score
|
|
197 all_hits.append(accum_hits[score])
|
|
198 decoy_hits.append(accum_decoys[score])
|
|
199
|
|
200 return (all_hits, decoy_hits)
|
|
201
|
|
202
|
|
203 def run_script():
|
|
204 parser = optparse.OptionParser()
|
|
205 parser.add_option("--input")
|
|
206 parser.add_option("--output")
|
|
207 parser.add_option("--decoy_prefix")
|
|
208 parser.add_option("--identifiers_column")
|
|
209 parser.add_option("--separator", default="TAB")
|
|
210 parser.add_option("--fdr_type", default="global_conservative")
|
|
211 parser.add_option("--scaling")
|
|
212 parser.add_option("--score_column", default=None)
|
|
213 # By default higher score is better.
|
|
214 parser.add_option("--invert_score", default=False, action="store_true")
|
|
215
|
|
216 (options, args) = parser.parse_args()
|
|
217 decoy_prefix = options.decoy_prefix
|
|
218 identifiers_column = options.identifiers_column
|
|
219 score_column = options.score_column
|
|
220 separator = SEPARATORS[options.separator]
|
|
221 settings = {"decoy_prefix": decoy_prefix,
|
|
222 "identifiers_index": int(identifiers_column) - 1,
|
|
223 "fdr_type": options.fdr_type,
|
|
224 "separator": separator,
|
|
225 "scaling": options.scaling,
|
|
226 "invert_score": options.invert_score
|
|
227 }
|
|
228 if score_column:
|
|
229 settings["score_column"] = int(score_column) - 1
|
|
230 else:
|
|
231 settings["score_column"] = None
|
|
232 # Assume data is descending, use index as score and invert.
|
|
233 settings["invert_score"] = True
|
|
234 with open(options.output, 'w') as output:
|
|
235 append_fdr(options.input, output, settings)
|
|
236
|
|
237
|
|
238 if __name__ == '__main__':
|
|
239 __main__()
|