Mercurial repository: pieterlukasse / prims_metabolomics
comparison of query_metexp.py @ 21:19d8fd10248e
* Added interface to the METEXP data store, including a tool to fire queries in batch mode
* Improved the quantification output files of MsClust, among other things by sorting the
  mass list based on intensity (last two columns of the quantification files)
* Added Molecular Mass calculation method
author   | pieter.lukasse@wur.nl
date     | Wed, 05 Mar 2014 17:20:11 +0100
parents  |
children | cd4f13119afa
comparison: 20:24fb75fedee0 vs 21:19d8fd10248e
#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of identifications against the METabolomics EXPlorer (MetExp) database.

It will take the input file and, for each record, query the CAS number, molecular formula
or molecular mass in the selected MetExp DB. If one or more compounds are found in the
MetExp DB, extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected MetExp DB.
'''
import csv
import sys
import fileinput
import urllib2
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

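# Example invocation (illustrative only; the column names, file names and values below
# are assumptions for this sketch, not a fixed interface). The script is typically called
# from a Galaxy tool wrapper with the positional arguments read in main():
#
#   python query_metexp.py identifications.tsv CAS FORMULA MM \
#          metexp_dblink.txt GC enriched_output.tsv
#
# where 'CAS', 'FORMULA' and 'MM' are the names of the input columns holding the CAS
# number, molecular formula and molecular mass, 'metexp_dblink.txt' contains the MetExp
# base URL, 'GC' is the separation method and the last argument is the output file.
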
def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns
    @param in_xsv: input filename to be parsed
    @param delim: column delimiter (default: tab)
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):

    header = data.pop(0)
    # Create a dictionary with the column name as key and the list of column values as value:
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output
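
# Illustrative example of the structure returned by _process_file/_process_data
# (column names and values are made up for this example):
#
#   input file:                      returned OrderedDict:
#     name    MM      FORMULA          {'name':    ['alanine', 'glycine'],
#     alanine 89.09   C3H7NO2           'MM':      ['89.09', '75.07'],
#     glycine 75.07   C2H5NO2           'FORMULA': ['C3H7NO2', 'C2H5NO2']}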


def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
    '''
    This method will iterate over the records in input_data and
    enrich them with the related information found (if any) in the
    MetExp Database.
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the CAS id, formula and molecular mass:
        cas_id = input_data_record[casid_col]
        formula = input_data_record[formula_col]
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in MetExp:
        data_found = None
        if cas_id != "undef":
            # 1- search for other experiments where this CAS id has been found:
            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "CAS"
        if data_found is None:
            # 2- search for other experiments where this FORMULA has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "FORMULA"
        if data_found is None:
            # 3- search for other experiments where this molecular mass (MM) has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If still nothing is found, just add empty columns:
            extra_cols = ['', '', '', '', '', '', '', '']
        else:
            # Add the info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged
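
# For illustration, a CAS-based query link built above could look like this
# (host and values are examples only, not real data):
#
#   http://<metexp_host>/find_entries/query?cas_nr=56-40-6&method=GC&_format_result=tsv
#
# The service is expected to return a tab-separated table, which is parsed by
# _fire_query_and_return_dict below.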


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - Experiment details where hits have been found:
      'organism', 'tissue', 'experiment_name', 'user_name', 'column_type'
    - Link that executes the same query
    '''
    # set() is used below to make each list unique:
    organism_set = []
    tissue_set = []
    experiment_name_set = []
    user_name_set = []
    column_type_set = []
    cas_nr_set = []

    if 'organism' in data_found:
        organism_set = set(data_found['organism'])
    if 'tissue' in data_found:
        tissue_set = set(data_found['tissue'])
    if 'experiment_name' in data_found:
        experiment_name_set = set(data_found['experiment_name'])
    if 'user_name' in data_found:
        user_name_set = set(data_found['user_name'])
    if 'column_type' in data_found:
        column_type_set = set(data_found['column_type'])
    if 'CAS' in data_found:
        cas_nr_set = set(data_found['CAS'])

    result = [data_type_found,
              _to_xsv(organism_set),
              _to_xsv(tissue_set),
              _to_xsv(experiment_name_set),
              _to_xsv(user_name_set),
              _to_xsv(column_type_set),
              _to_xsv(cas_nr_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    # join all items of the given set into one string, each item followed by '|':
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result


def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary of named columns
    (see _process_data), or None if no hits were found.
    '''
    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if data_row.strip() != '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: " + str(e.reason) + ". Administrator: please check if the MetExp service [" + url + "] is accessible from your Galaxy server.")
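
# Illustrative example of the tab-separated response parsed above (values are
# invented; the column names match those used in _get_extra_info_and_link_cols,
# and the columns are tab-separated in the actual response):
#
#   organism              tissue   experiment_name   user_name   column_type   CAS
#   Solanum lycopersicum  fruit    exp001            jdoe        polar         56-40-6
#
# A response containing only the header row is treated as "no hits" and None is returned.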

def _str_to_list(data_row, delimiter='\t'):
    # split the given row into a list of column values:
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result


# Alternative implementation of the web-service call using the 'requests' library
# (kept here as a note only; it would add an extra dependency):
#
#   s = requests.Session()
#   s.verify = False
#   #s.auth = (token01, token02)
#   resp = s.get(url, params={'name': 'anonymous'}, stream=True)
#   content = resp.content
#   # transform to dictionary:


def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add the extra columns:
    for column in extra_cols:
        record.append(column)

    return record


def _save_data(data_rows, headers, out_csv):
    '''
    Writes the tab-separated data to file
    @param data_rows: list of records of the merged/enriched dataset
    @param headers: list of column headers, written as the first row
    @param out_csv: output file name
    '''
    # Open the output file for writing:
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers:
    output_single_handle.writerow(headers)

    # Write one line for each row:
    for data_row in data_rows:
        output_single_handle.writerow(data_row)


def _get_metexp_URL(metexp_dblink_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(metexp_dblink_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line
                # (stripped, so no trailing newline ends up in the query URLs built from it):
                return line.strip()
    finally:
        file_input.close()
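
# The metexp_dblink_file is expected to look something like this (the URL below
# only illustrates the format and is not a real endpoint):
#
#   # URL of the MetExp service to query:
#   http://metexp.example.org/metexp
#
# Lines starting with '#' are skipped; the first other line is used as the base URL.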


def main():
    '''
    MetExp Query main function

    The input file can be any tabular file, as long as it contains a column for the CAS id,
    one for the molecular mass and one for the formula of the respective identification.
    These columns are then used to query against the MetExp Database.
    '''
    input_file = sys.argv[1]
    casid_col = sys.argv[2]
    formula_col = sys.argv[3]
    molecular_mass_col = sys.argv[4]
    metexp_dblink_file = sys.argv[5]
    separation_method = sys.argv[6]
    output_result = sys.argv[7]

    # Parse metexp_dblink_file to find the URL of the MetExp service:
    metexp_dblink = _get_metexp_URL(metexp_dblink_file)

    # Parse the tabular input file into a dictionary/array:
    input_data = _process_file(input_file)

    # Query the data against the MetExp DB:
    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types',
                                   'METEXP hits: CAS nrs', 'Link to METEXP hits']

    _save_data(enriched_data, headers, output_result)


if __name__ == '__main__':
    main()