comparison query_metexp.py @ 21:19d8fd10248e

* Added interface to the METEXP data store, including a tool to fire queries in batch mode
* Improved the quantification output files of MsClust, among other things sorting the mass list based on intensity (last two columns of the quantification files)
* Added Molecular Mass calculation method
author pieter.lukasse@wur.nl
date Wed, 05 Mar 2014 17:20:11 +0100
parents
children cd4f13119afa
comparison 20:24fb75fedee0 vs 21:19d8fd10248e
#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of identifications against the METabolomics EXPlorer database.

It will take the input file and for each record it will query the
molecular mass in the selected MetExp DB. If one or more compounds are found in the
MetExp DB then extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected MetExp DB.
'''
import csv
import sys
import fileinput
import urllib2
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):

    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output

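# Illustrative only (not part of the original tool): for a tab-separated input such as
#
#   CAS        FORMULA    MM
#   56-81-5    C3H8O3     92.09
#   50-99-7    C6H12O6    180.16
#
# (columns separated by tabs), _process_file would return an OrderedDict of columns, e.g.
# OrderedDict([('CAS', ['56-81-5', '50-99-7']), ('FORMULA', ['C3H8O3', 'C6H12O6']), ('MM', ['92.09', '180.16'])]),
# i.e. one list of values per column, in the original column order.
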
def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    MetExp Database.
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # Read the CAS id, formula and molecular mass:
        cas_id = input_data_record[casid_col]
        formula = input_data_record[formula_col]
        molecular_mass = input_data_record[molecular_mass_col]

        # Search for related records in MetExp:
        data_found = None
        if cas_id != "undef":
            # 1- search for other experiments where this CAS id has been found:
            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "CAS"
        if data_found is None:
            # 2- search for other experiments where this FORMULA has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "FORMULA"
        if data_found is None:
            # 3- search for other experiments where this molecular mass (MM) has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If still nothing found, just add empty columns:
            extra_cols = ['', '', '', '', '', '', '', '']
        else:
            # Add the info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # Return the merged/enriched records:
    return merged

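# Illustrative only (not part of the original tool): assuming the MetExp service runs at
# the hypothetical base URL http://metexp.example.org and the separation method is "GC",
# the first (CAS-based) lookup for CAS number 56-81-5 fires a request like
#
#   http://metexp.example.org/find_entries/query?cas_nr=56-81-5&method=GC&_format_result=tsv
#
# and expects a tab-separated response whose first row is the column header; a response
# that contains only that header row is treated as "no hits".
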
def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - Experiment details where hits have been found:
      'organism', 'tissue', 'experiment_name', 'user_name', 'column_type', 'CAS'
    - Link that executes the same query
    '''
    # Default to empty lists; set() below reduces each column to its unique values:
    organism_set = []
    tissue_set = []
    experiment_name_set = []
    user_name_set = []
    column_type_set = []
    cas_nr_set = []

    if 'organism' in data_found:
        organism_set = set(data_found['organism'])
    if 'tissue' in data_found:
        tissue_set = set(data_found['tissue'])
    if 'experiment_name' in data_found:
        experiment_name_set = set(data_found['experiment_name'])
    if 'user_name' in data_found:
        user_name_set = set(data_found['user_name'])
    if 'column_type' in data_found:
        column_type_set = set(data_found['column_type'])
    if 'CAS' in data_found:
        cas_nr_set = set(data_found['CAS'])

    result = [data_type_found,
              _to_xsv(organism_set),
              _to_xsv(tissue_set),
              _to_xsv(experiment_name_set),
              _to_xsv(user_name_set),
              _to_xsv(column_type_set),
              _to_xsv(cas_nr_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result

def _to_xsv(data_set):
    # Join the items into one "|"-separated string (note: a trailing "|" is included):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result

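# Illustrative only: _to_xsv(set(['Arabidopsis', 'Tomato'])) yields something like
# 'Arabidopsis|Tomato|' (the item order is not guaranteed, since sets are unordered).
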
def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary of named columns
    (or None if the service reports no hits).
    '''
    try:
        data = urllib2.urlopen(url).read()

        # Transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # Check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # Means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # Return the result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if the MetExp service [" + url + "] is accessible from your Galaxy server.")

def _str_to_list(data_row, delimiter='\t'):
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result

# Alternative approach (not used here): fetch the data with the requests library, e.g.:
# s = requests.Session()
# s.verify = False
# #s.auth = (token01, token02)
# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
# content = resp.content
# # transform to dictionary:

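# A minimal sketch (not part of the original tool) of how the commented requests-based
# alternative above could look if completed; it assumes the requests library is installed
# and reuses the parsing helpers defined in this module. It is not called anywhere.
def _fire_query_with_requests(url):
    '''
    Illustrative sketch only: fetch the TSV response with requests instead of urllib2.
    Certificate verification is disabled, as in the commented alternative above.
    '''
    import requests  # local import so this module keeps working without requests installed

    session = requests.Session()
    session.verify = False
    response = session.get(url)
    data = response.text

    result = []
    data_rows = data.split("\n")
    # Same convention as _fire_query_and_return_dict: a header-only response means no hits.
    if len(data_rows) <= 1 or data_rows[1].strip() == '':
        return None
    for data_row in data_rows:
        if data_row.strip() != '':
            result.append(_str_to_list(data_row, delimiter='\t'))
    return _process_data(result)
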
def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # Add the extra columns:
    for column in extra_cols:
        record.append(column)

    return record

def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: list of merged/enriched records
    @param headers: list of column headers to write as the first row
    @param out_csv: output csv file
    '''
    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_metexp_URL(metexp_dblink_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(metexp_dblink_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # Just return the first line that is not a comment line,
                # stripped of its trailing newline so it can be used in a query URL:
                return line.strip()
    finally:
        file_input.close()

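# Illustrative only (not part of the original tool): the metexp_dblink_file is expected to
# hold the base URL of the MetExp service on its first non-comment line; a hypothetical
# file could look like:
#
#   # URL of the MetExp service used by this Galaxy instance:
#   http://metexp.example.org
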
def main():
    '''
    MetExp Query main function

    The input file can be any tabular file, as long as it contains a column for the CAS number,
    one for the formula and one for the molecular mass of the respective identification.
    These columns are then used to query against the MetExp Database.
    '''
    input_file = sys.argv[1]
    casid_col = sys.argv[2]
    formula_col = sys.argv[3]
    molecular_mass_col = sys.argv[4]
    metexp_dblink_file = sys.argv[5]
    separation_method = sys.argv[6]
    output_result = sys.argv[7]

    # Parse metexp_dblink_file to find the URL to the MetExp service:
    metexp_dblink = _get_metexp_URL(metexp_dblink_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against MetExp DB:
    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types',
                                   'METEXP hits: CAS nrs', 'Link to METEXP hits']

    _save_data(enriched_data, headers, output_result)


if __name__ == '__main__':
    main()
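
# Example invocation (illustrative only; the file names and column names below are made up):
#
#   python query_metexp.py identifications.tsv CAS FORMULA MM metexp_dblink.txt GC enriched_output.tsv
#
# where identifications.tsv is the tabular input, CAS/FORMULA/MM are the names of the
# CAS number, formula and molecular mass columns, metexp_dblink.txt holds the MetExp
# service URL, GC is the separation method and enriched_output.tsv is the output file.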