query_mass_repos.py @ 23:85fd05d0d16c

New tool to query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
author pieter.lukasse@wur.nl
date Thu, 03 Apr 2014 16:44:11 +0200
parents
children 60b53f2aa48a
#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values detected by high-resolution mass spectrometers
against various repositories/services such as the METabolomics EXPlorer database or the
MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

It will take the input file and for each record it will query the
molecular mass in the selected repository/service. If one or more compounds are found,
extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected repository/service.

The service should implement the following interface:

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)

The output should be tab-separated and should contain the following columns (in this order):
db-name molecular-formula dbe formula-weight id description

'''
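
# For illustration only (hypothetical mass value, placeholder base URL): with
# error_margin=5 and margin_unit="ppm" this tool fires a query such as
#   http://service_url/mass?targetMs=180.0634&margin=5&marginUnit=ppm&output=txth&_format_result=tsv
# and expects a tab-separated response whose header line lists the columns named above.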
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a delimiter-separated file, returning a dictionary with named columns
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):
    '''
    Turns a list of rows (first row = header) into a column-oriented dictionary:
    one entry per column name, holding the list of that column's values over all records.
    '''
    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output

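# Illustrative sketch (hypothetical values) of the structure produced by _process_data:
#   _process_data([['mass', 'name'], ['180.0634', 'a'], ['342.1162', 'b']])
# returns the column-oriented dictionary
#   OrderedDict([('mass', ['180.0634', '342.1162']), ('name', ['a', 'b'])])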

def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
    '''
    This method will iterate over the records in input_data and
    will enrich them with the related information found (if any) in the
    chosen repository/service.

    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the molecular mass:
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "":
            molecular_mass = float(molecular_mass)

            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"

            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If nothing was found, just add empty columns
            extra_cols = ['', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - details of hits found:
        db-name molecular-formula dbe formula-weight id description
    - Link that executes the same query
    '''
    # defaults; replaced below by set() objects so that only unique values are kept:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []

    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])

    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result

def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a column-oriented dictionary (or None if no hits were found).
    '''
    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # remove comment lines, if any (keep only the comment line containing the word
        # "molecular-formula", i.e. the header line as returned by e.g. the Kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)

        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")


def _str_to_list(data_row, delimiter='\t'):
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result

# alternative, using the 'requests' library (see the runnable sketch below):
# s = requests.Session()
# s.verify = False
# #s.auth = (token01, token02)
# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
# content = resp.content
# # transform to dictionary:

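# A minimal runnable sketch of the same web-service call based on the 'requests' library.
# Assumptions: 'requests' is not a dependency of this tool and this helper is not wired
# into main(); it is provided for illustration only. It mirrors the comment-line filtering
# done in _fire_query_and_return_dict and reuses _str_to_list/_process_data for parsing.
def _fire_query_and_return_dict_requests(url):
    import requests  # local import, so the tool keeps working when the package is absent
    response = requests.get(url, verify=False)
    response.raise_for_status()
    # keep non-empty rows; drop comment rows except the header row containing "molecular-formula":
    data_rows = [row for row in response.text.split("\n")
                 if row.strip() != '' and (row[0] != '#' or "molecular-formula" in row)]
    if len(data_rows) <= 1:
        # only the header row (or nothing) came back, so there are no hits:
        return None
    return _process_data([_str_to_list(row, delimiter='\t') for row in data_rows])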

def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add extra columns
    for column in extra_cols:
        record.append(column)

    return record

def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: list of merged/enriched records
    @param headers: list of column names, written as the header line
    @param out_csv: output csv file
    '''
    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

    outfile_single_handle.close()

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line, stripped of its
                # trailing newline so that the query URL built from it stays valid:
                return line.strip()
    finally:
        file_input.close()


def main():
    '''
    Query main function

    The input file can be any tabular file, as long as it contains a column for the molecular mass.
    This column is then used to query against the chosen repository/service database.
    '''
    seconds_start = int(round(time.time()))

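    # Expected positional command-line arguments (read below):
    #   1: input tabular file
    #   2: name of the column holding the molecular mass
    #   3: file containing the URL of the repository/service to query
    #   4: error margin (numeric)
    #   5: margin unit (e.g. ppm)
    #   6: output file name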
    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]

    # Parse repository_file to find the URL to the service:
    repository_dblink = _get_repository_URL(repository_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against repository:
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for ', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"


if __name__ == '__main__':
    main()
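
# Example invocation (hypothetical file names and column name, for illustration only):
#   python query_mass_repos.py input.tsv MM repository_url.txt 5 ppm enriched_output.tsv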