Mercurial > repos > galaxyp > appendfdr
comparison append_fdr.py @ 0:ef7cc296f063 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 16:42:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ef7cc296f063 |
---|---|
1 #!/usr/bin/env python | |
2 import optparse | |
3 import sys | |
4 | |
5 try: | |
6 # Ubuntu deps: gfortan libblas-dev liblapack-dev | |
7 # pip deps: numpy scipy | |
8 from math import sqrt | |
9 from scipy.optimize import root | |
10 from numpy import arange, exp, concatenate, sum, log, array, seterr | |
11 except ImportError: | |
12 # Allow this script to be used for global FDR even | |
13 # if these dependencies are not present. | |
14 pass | |
15 | |
16 | |
17 SEPARATORS = {"TAB": "\t", | |
18 "SPACE": " ", | |
19 "COMMA": "," | |
20 } | |
21 | |
22 | |
23 def __main__(): | |
24 run_script() | |
25 | |
26 | |
27 def append_fdr(input_file, output, settings): | |
28 sorted_scores, accum_hits, accum_decoys = _accum_decoys(input_file, settings) | |
29 fdr_array = compute_fdr(sorted_scores, accum_hits, accum_decoys, settings) | |
30 index = 0 | |
31 for line in __read_lines(input_file): | |
32 if not line or line.startswith('#'): | |
33 continue | |
34 entry = Entry(line, settings, index) | |
35 this_fdr = fdr_array[entry.score] | |
36 new_line = "%s%s%f" % (line, settings["separator"], this_fdr) | |
37 print >> output, new_line | |
38 index += 1 | |
39 | |
40 | |
41 def compute_fdr(sorted_scores, accum_hits, accum_decoys, settings): | |
42 fdr_type = settings["fdr_type"] | |
43 compute_functions = {"global_conservative": _compute_fdr_global_conservative, | |
44 "global_permissive": _compute_fdr_global_permissive, | |
45 #"pspep": _compute_pspep | |
46 } | |
47 return compute_functions[fdr_type](sorted_scores, accum_hits, accum_decoys, settings) | |
48 #return compute_functions[fdr_type](all_hits_array, decoy_hits_array, settings) | |
49 | |
50 | |
51 def _compute_pspep(all_hits, decoy_hits, settings): | |
52 scaling = _get_scaling(settings) | |
53 seterr(all="ignore") | |
54 sigma = array([sqrt(x) if x > 0 else 0.2 for x in decoy_hits]) | |
55 if isinstance(all_hits, list): | |
56 all_hits = array(all_hits) | |
57 if isinstance(decoy_hits, list): | |
58 decoy_hits = array(decoy_hits) | |
59 searchSeg = concatenate((exp(arange(-8, 9, 2)), -1 * exp(arange(-8, 9, 2)))) | |
60 bestResids = sys.float_info.max | |
61 bestResidsComb = [0.0, 0.0, 0.0] | |
62 for aEst in searchSeg: | |
63 for bEst in searchSeg: | |
64 for cEst in searchSeg: | |
65 try: | |
66 sol = _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma) | |
67 if sol[3] and sol[3] < bestResids: | |
68 bestResids = sol[3] | |
69 bestResidsComb = sol[0:3] | |
70 except: | |
71 pass | |
72 (a, b, c) = bestResidsComb[0:3] | |
73 fdr_local = scaling * (exp(b * (all_hits - a)) / (exp(b * (all_hits - a)) + 1)) * c | |
74 return fdr_local | |
75 | |
76 | |
77 def _get_scaling(settings): | |
78 scaling = float(settings.get("scaling", "2.0")) | |
79 return scaling | |
80 | |
81 | |
82 def _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma, scaling=2): | |
83 guess = [aEst, bEst, cEst] | |
84 | |
85 def f(a, b, c): | |
86 return c * (log(exp(b * (all_hits - a)) + 1) - log(exp(-b * a) + 1)) / b | |
87 | |
88 def fcn(p): | |
89 a = p[0] | |
90 b = p[1] | |
91 c = p[2] | |
92 return (decoy_hits - f(a, b, c)) / sigma | |
93 | |
94 solution = root(fcn, guess, method='lm') | |
95 a = solution.x[0] | |
96 b = solution.x[1] | |
97 c = solution.x[2] | |
98 resids = sum((decoy_hits - f(a, b, c)) ** 2) / len(all_hits) | |
99 return (a, b, c, resids) | |
100 | |
101 | |
102 def _compute_fdr_global_conservative(sorted_scores, accum_hits, accum_decoys, settings): | |
103 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings) | |
104 fdrs = {} | |
105 max_fdr = -1 | |
106 for score in sorted_scores: | |
107 raw_fdr = raw_fdrs[score] | |
108 if raw_fdr > max_fdr: | |
109 max_fdr = raw_fdr | |
110 fdrs[score] = max_fdr | |
111 return fdrs | |
112 | |
113 | |
114 def _compute_fdr_global_permissive(sorted_scores, accum_hits, accum_decoys, settings): | |
115 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings) | |
116 fdrs = {} | |
117 index = len(sorted_scores) - 1 | |
118 min_fdr = 1 | |
119 while index >= 0: | |
120 score = sorted_scores[index] | |
121 raw_fdr = raw_fdrs[score] | |
122 if raw_fdr < min_fdr: | |
123 min_fdr = raw_fdr | |
124 fdrs[score] = min_fdr | |
125 index -= 1 | |
126 return fdrs | |
127 | |
128 | |
129 def build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings): | |
130 scaling = _get_scaling(settings) | |
131 fdrs = {} | |
132 for score in sorted_scores: | |
133 fdrs[score] = (scaling * accum_decoys[score]) / accum_hits[score] | |
134 return fdrs | |
135 | |
136 | |
137 def __read_lines(input_file): | |
138 with open(input_file, 'r') as input: | |
139 for i, line in enumerate(input): | |
140 line = line.rstrip('\r\n') | |
141 yield line | |
142 | |
143 | |
144 def __read_entries(input_file, settings): | |
145 total_hits = 0 | |
146 for line in __read_lines(input_file): | |
147 if not line or line.startswith('#'): | |
148 continue | |
149 entry = Entry(line, settings, total_hits) | |
150 total_hits = total_hits + 1 | |
151 yield entry | |
152 | |
153 | |
154 class Entry(object): | |
155 | |
156 def __init__(self, line, settings, index): | |
157 self.settings = settings | |
158 line_parts = line.split(settings["separator"]) | |
159 self.identifier = line_parts[settings["identifiers_index"]] | |
160 if settings["score_column"]: | |
161 self.score = float(line_parts[settings["score_column"]]) | |
162 else: | |
163 self.score = index | |
164 | |
165 @property | |
166 def is_decoy(self): | |
167 return self.identifier.startswith(self.settings["decoy_prefix"]) | |
168 | |
169 | |
170 def _accum_decoys(input_file, settings): | |
171 hits_at_score = {} | |
172 decoys_at_score = {} | |
173 for entry in __read_entries(input_file, settings): | |
174 score = entry.score | |
175 score_total = hits_at_score.get(score, 0) + 1 | |
176 score_decoys = decoys_at_score.get(score, 0) + (1 if entry.is_decoy else 0) | |
177 hits_at_score[score] = score_total | |
178 decoys_at_score[score] = score_decoys | |
179 sorted_scores = sorted(hits_at_score, reverse=not settings["invert_score"]) | |
180 accum_hits = {} | |
181 accum_decoys = {} | |
182 accum_hit_count = 0 | |
183 accum_decoy_count = 0 | |
184 for score in sorted_scores: | |
185 accum_decoy_count += decoys_at_score[score] | |
186 accum_hit_count += hits_at_score[score] | |
187 accum_hits[score] = accum_hit_count | |
188 accum_decoys[score] = accum_decoy_count | |
189 return (sorted_scores, accum_hits, accum_decoys) | |
190 | |
191 | |
192 def _build_arrays(input_file, settings, sorted_scores, accum_hits, accum_decoys): | |
193 all_hits = [] | |
194 decoy_hits = [] | |
195 for entry in __read_entries(input_file, settings): | |
196 score = entry.score | |
197 all_hits.append(accum_hits[score]) | |
198 decoy_hits.append(accum_decoys[score]) | |
199 | |
200 return (all_hits, decoy_hits) | |
201 | |
202 | |
203 def run_script(): | |
204 parser = optparse.OptionParser() | |
205 parser.add_option("--input") | |
206 parser.add_option("--output") | |
207 parser.add_option("--decoy_prefix") | |
208 parser.add_option("--identifiers_column") | |
209 parser.add_option("--separator", default="TAB") | |
210 parser.add_option("--fdr_type", default="global_conservative") | |
211 parser.add_option("--scaling") | |
212 parser.add_option("--score_column", default=None) | |
213 # By default higher score is better. | |
214 parser.add_option("--invert_score", default=False, action="store_true") | |
215 | |
216 (options, args) = parser.parse_args() | |
217 decoy_prefix = options.decoy_prefix | |
218 identifiers_column = options.identifiers_column | |
219 score_column = options.score_column | |
220 separator = SEPARATORS[options.separator] | |
221 settings = {"decoy_prefix": decoy_prefix, | |
222 "identifiers_index": int(identifiers_column) - 1, | |
223 "fdr_type": options.fdr_type, | |
224 "separator": separator, | |
225 "scaling": options.scaling, | |
226 "invert_score": options.invert_score | |
227 } | |
228 if score_column: | |
229 settings["score_column"] = int(score_column) - 1 | |
230 else: | |
231 settings["score_column"] = None | |
232 # Assume data is descending, use index as score and invert. | |
233 settings["invert_score"] = True | |
234 with open(options.output, 'w') as output: | |
235 append_fdr(options.input, output, settings) | |
236 | |
237 | |
238 if __name__ == '__main__': | |
239 __main__() |