comparison append_fdr.py @ 0:ef7cc296f063 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 16:42:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ef7cc296f063
1 #!/usr/bin/env python
2 import optparse
3 import sys
4
5 try:
6 # Ubuntu deps: gfortan libblas-dev liblapack-dev
7 # pip deps: numpy scipy
8 from math import sqrt
9 from scipy.optimize import root
10 from numpy import arange, exp, concatenate, sum, log, array, seterr
11 except ImportError:
12 # Allow this script to be used for global FDR even
13 # if these dependencies are not present.
14 pass
15
16
17 SEPARATORS = {"TAB": "\t",
18 "SPACE": " ",
19 "COMMA": ","
20 }
21
22
23 def __main__():
24 run_script()
25
26
27 def append_fdr(input_file, output, settings):
28 sorted_scores, accum_hits, accum_decoys = _accum_decoys(input_file, settings)
29 fdr_array = compute_fdr(sorted_scores, accum_hits, accum_decoys, settings)
30 index = 0
31 for line in __read_lines(input_file):
32 if not line or line.startswith('#'):
33 continue
34 entry = Entry(line, settings, index)
35 this_fdr = fdr_array[entry.score]
36 new_line = "%s%s%f" % (line, settings["separator"], this_fdr)
37 print >> output, new_line
38 index += 1
39
40
41 def compute_fdr(sorted_scores, accum_hits, accum_decoys, settings):
42 fdr_type = settings["fdr_type"]
43 compute_functions = {"global_conservative": _compute_fdr_global_conservative,
44 "global_permissive": _compute_fdr_global_permissive,
45 #"pspep": _compute_pspep
46 }
47 return compute_functions[fdr_type](sorted_scores, accum_hits, accum_decoys, settings)
48 #return compute_functions[fdr_type](all_hits_array, decoy_hits_array, settings)
49
50
51 def _compute_pspep(all_hits, decoy_hits, settings):
52 scaling = _get_scaling(settings)
53 seterr(all="ignore")
54 sigma = array([sqrt(x) if x > 0 else 0.2 for x in decoy_hits])
55 if isinstance(all_hits, list):
56 all_hits = array(all_hits)
57 if isinstance(decoy_hits, list):
58 decoy_hits = array(decoy_hits)
59 searchSeg = concatenate((exp(arange(-8, 9, 2)), -1 * exp(arange(-8, 9, 2))))
60 bestResids = sys.float_info.max
61 bestResidsComb = [0.0, 0.0, 0.0]
62 for aEst in searchSeg:
63 for bEst in searchSeg:
64 for cEst in searchSeg:
65 try:
66 sol = _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma)
67 if sol[3] and sol[3] < bestResids:
68 bestResids = sol[3]
69 bestResidsComb = sol[0:3]
70 except:
71 pass
72 (a, b, c) = bestResidsComb[0:3]
73 fdr_local = scaling * (exp(b * (all_hits - a)) / (exp(b * (all_hits - a)) + 1)) * c
74 return fdr_local
75
76
77 def _get_scaling(settings):
78 scaling = float(settings.get("scaling", "2.0"))
79 return scaling
80
81
82 def _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma, scaling=2):
83 guess = [aEst, bEst, cEst]
84
85 def f(a, b, c):
86 return c * (log(exp(b * (all_hits - a)) + 1) - log(exp(-b * a) + 1)) / b
87
88 def fcn(p):
89 a = p[0]
90 b = p[1]
91 c = p[2]
92 return (decoy_hits - f(a, b, c)) / sigma
93
94 solution = root(fcn, guess, method='lm')
95 a = solution.x[0]
96 b = solution.x[1]
97 c = solution.x[2]
98 resids = sum((decoy_hits - f(a, b, c)) ** 2) / len(all_hits)
99 return (a, b, c, resids)
100
101
102 def _compute_fdr_global_conservative(sorted_scores, accum_hits, accum_decoys, settings):
103 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings)
104 fdrs = {}
105 max_fdr = -1
106 for score in sorted_scores:
107 raw_fdr = raw_fdrs[score]
108 if raw_fdr > max_fdr:
109 max_fdr = raw_fdr
110 fdrs[score] = max_fdr
111 return fdrs
112
113
114 def _compute_fdr_global_permissive(sorted_scores, accum_hits, accum_decoys, settings):
115 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings)
116 fdrs = {}
117 index = len(sorted_scores) - 1
118 min_fdr = 1
119 while index >= 0:
120 score = sorted_scores[index]
121 raw_fdr = raw_fdrs[score]
122 if raw_fdr < min_fdr:
123 min_fdr = raw_fdr
124 fdrs[score] = min_fdr
125 index -= 1
126 return fdrs
127
128
129 def build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings):
130 scaling = _get_scaling(settings)
131 fdrs = {}
132 for score in sorted_scores:
133 fdrs[score] = (scaling * accum_decoys[score]) / accum_hits[score]
134 return fdrs
135
136
137 def __read_lines(input_file):
138 with open(input_file, 'r') as input:
139 for i, line in enumerate(input):
140 line = line.rstrip('\r\n')
141 yield line
142
143
144 def __read_entries(input_file, settings):
145 total_hits = 0
146 for line in __read_lines(input_file):
147 if not line or line.startswith('#'):
148 continue
149 entry = Entry(line, settings, total_hits)
150 total_hits = total_hits + 1
151 yield entry
152
153
154 class Entry(object):
155
156 def __init__(self, line, settings, index):
157 self.settings = settings
158 line_parts = line.split(settings["separator"])
159 self.identifier = line_parts[settings["identifiers_index"]]
160 if settings["score_column"]:
161 self.score = float(line_parts[settings["score_column"]])
162 else:
163 self.score = index
164
165 @property
166 def is_decoy(self):
167 return self.identifier.startswith(self.settings["decoy_prefix"])
168
169
170 def _accum_decoys(input_file, settings):
171 hits_at_score = {}
172 decoys_at_score = {}
173 for entry in __read_entries(input_file, settings):
174 score = entry.score
175 score_total = hits_at_score.get(score, 0) + 1
176 score_decoys = decoys_at_score.get(score, 0) + (1 if entry.is_decoy else 0)
177 hits_at_score[score] = score_total
178 decoys_at_score[score] = score_decoys
179 sorted_scores = sorted(hits_at_score, reverse=not settings["invert_score"])
180 accum_hits = {}
181 accum_decoys = {}
182 accum_hit_count = 0
183 accum_decoy_count = 0
184 for score in sorted_scores:
185 accum_decoy_count += decoys_at_score[score]
186 accum_hit_count += hits_at_score[score]
187 accum_hits[score] = accum_hit_count
188 accum_decoys[score] = accum_decoy_count
189 return (sorted_scores, accum_hits, accum_decoys)
190
191
192 def _build_arrays(input_file, settings, sorted_scores, accum_hits, accum_decoys):
193 all_hits = []
194 decoy_hits = []
195 for entry in __read_entries(input_file, settings):
196 score = entry.score
197 all_hits.append(accum_hits[score])
198 decoy_hits.append(accum_decoys[score])
199
200 return (all_hits, decoy_hits)
201
202
203 def run_script():
204 parser = optparse.OptionParser()
205 parser.add_option("--input")
206 parser.add_option("--output")
207 parser.add_option("--decoy_prefix")
208 parser.add_option("--identifiers_column")
209 parser.add_option("--separator", default="TAB")
210 parser.add_option("--fdr_type", default="global_conservative")
211 parser.add_option("--scaling")
212 parser.add_option("--score_column", default=None)
213 # By default higher score is better.
214 parser.add_option("--invert_score", default=False, action="store_true")
215
216 (options, args) = parser.parse_args()
217 decoy_prefix = options.decoy_prefix
218 identifiers_column = options.identifiers_column
219 score_column = options.score_column
220 separator = SEPARATORS[options.separator]
221 settings = {"decoy_prefix": decoy_prefix,
222 "identifiers_index": int(identifiers_column) - 1,
223 "fdr_type": options.fdr_type,
224 "separator": separator,
225 "scaling": options.scaling,
226 "invert_score": options.invert_score
227 }
228 if score_column:
229 settings["score_column"] = int(score_column) - 1
230 else:
231 settings["score_column"] = None
232 # Assume data is descending, use index as score and invert.
233 settings["invert_score"] = True
234 with open(options.output, 'w') as output:
235 append_fdr(options.input, output, settings)
236
237
238 if __name__ == '__main__':
239 __main__()