# HG changeset patch
# User pieter.lukasse@wur.nl
# Date 1396536251 -7200
# Node ID 85fd05d0d16cd8e82e7564cb370cea76190eac47
# Parent cd4f13119afad476e58af654922c6efc8ec4abb0
New tool to query multiple public repositories for elemental compositions
from accurate mass values detected by high-resolution mass spectrometers
diff -r cd4f13119afa -r 85fd05d0d16c __init__.py
--- a/__init__.py Thu Mar 06 14:29:55 2014 +0100
+++ b/__init__.py Thu Apr 03 16:44:11 2014 +0200
@@ -1,6 +1,6 @@
-'''
-Module containing Galaxy tools for the GC/MS pipeline
-Created on Mar 6, 2012
-
-@author: marcelk
-'''
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created in March 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
diff -r cd4f13119afa -r 85fd05d0d16c datatypes_conf.xml
--- a/datatypes_conf.xml Thu Mar 06 14:29:55 2014 +0100
+++ b/datatypes_conf.xml Thu Apr 03 16:44:11 2014 +0200
@@ -3,9 +3,6 @@
-
-
diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.py Thu Apr 03 16:44:11 2014 +0200
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of accurate mass values detected by high-resolution mass spectrometers
+against various repositories/services such as METabolomics EXPlorer database or the
+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
+
+For each record in the input file, it will query the
+molecular mass against the selected repository/service. If one or more compounds
+are found, extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository/service.
+
+The service should implement the following interface:
+
+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
+
+The output should be tab-separated and should contain the following columns (in this order):
+db-name molecular-formula dbe formula-weight id description
+
+
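+Example (illustrative request, with a made-up response record; real services
+return their own data):
+
+  http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth
+
+  # db-name    molecular-formula    dbe    formula-weight    id        description
+  ExampleDB    C27H44O              6.0    400.3341          ID00001   hypothetical entry
+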
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a delimited file (tab-separated by default),
+    returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+    @param delim: the column delimiter (default: tab)
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+
+def _process_data(data):
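+    '''
+    Converts a list of rows (the first row being the header) into an
+    OrderedDict keyed by column name. Illustrative example:
+      _process_data([['mass', 'id'], ['100.5', 'x1'], ['200.1', 'x2']])
+      -> OrderedDict([('mass', ['100.5', '200.1']), ('id', ['x1', 'x2'])])
+    '''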
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the
+    chosen repository/service.
+
+    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+        # read the molecular mass:
+        molecular_mass = input_data_record[molecular_mass_col]
+
+        # search for related records in repository/service:
+        data_found = None
+        if molecular_mass != "":
+            molecular_mass = float(molecular_mass)
+
+            # 1- search for data around this MM:
+            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
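+            # e.g. (hypothetical values, Kazusa-style service as in the help text):
+            #   http://webs2.kazusa.or.jp/mfsearcher/exmassdb/mass?targetMs=500.0&margin=5.0&marginUnit=ppm&output=txth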
+
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+
+        if data_found is None:
+            # Nothing was found, so just add empty columns
+            extra_cols = ['', '', '', '', '', '']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+    '''
+    This method will go over the data found and will return a
+    list with the following items:
+    - details of hits found:
+      db-name molecular-formula dbe formula-weight id description
+    - Link that executes same query
+    '''
+
+    # default to empty lists; set() below deduplicates the values found:
+    db_name_set = []
+    molecular_formula_set = []
+    id_set = []
+    description_set = []
+
+    if 'db-name' in data_found:
+        db_name_set = set(data_found['db-name'])
+    elif '# db-name' in data_found:
+        db_name_set = set(data_found['# db-name'])
+    if 'molecular-formula' in data_found:
+        molecular_formula_set = set(data_found['molecular-formula'])
+    if 'id' in data_found:
+        id_set = set(data_found['id'])
+    if 'description' in data_found:
+        description_set = set(data_found['description'])
+
+    result = [data_type_found,
+              _to_xsv(db_name_set),
+              _to_xsv(molecular_formula_set),
+              _to_xsv(id_set),
+              _to_xsv(description_set),
+              # To let Excel interpret it as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
+    return result
+
+
+def _to_xsv(data_set):
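+    '''
+    Joins the items of the given collection into a single "|"-terminated
+    string, e.g. set(['C6H12O6', 'C7H14O6']) -> "C6H12O6|C7H14O6|"
+    (item order may vary for sets).
+    '''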
+    result = ""
+    for item in data_set:
+        result = result + str(item) + "|"
+    return result
+
+
+def _fire_query_and_return_dict(url):
+    '''
+    This method will fire the query as a web-service call and
+    return the results as a list of dictionary objects
+    '''
+
+    try:
+        data = urllib2.urlopen(url).read()
+
+        # transform to dictionary:
+        result = []
+        data_rows = data.split("\n")
+
+        # remove comment lines, if any (keep only the comment line containing
+        # "molecular-formula", for compatibility with the Kazusa service):
+        data_rows_to_remove = []
+        for data_row in data_rows:
+            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
+                data_rows_to_remove.append(data_row)
+
+        for data_row in data_rows_to_remove:
+            data_rows.remove(data_row)
+
+        # check if there is any data in the response:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '':
+            # means there is only the header row...so no hits:
+            return None
+
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+
+        # return result processed into a dict:
+        return _process_data(result)
+
+    except urllib2.HTTPError as e:
+        raise Exception("HTTP error for URL %s: %s - %s" % (url, e.code, e.reason))
+    except urllib2.URLError as e:
+        raise Exception("Network error: %s. Administrator: please check if service [%s] "
+                        "is accessible from your Galaxy server." % (e.reason, url))
+
+def _str_to_list(data_row, delimiter='\t'):
+    # splitting the row on the delimiter yields the list of column values:
+    return data_row.split(delimiter)
+
+
+# Possible alternative for the urllib2 call above, using the requests library:
+#
+#   import requests
+#   s = requests.Session()
+#   s.verify = False
+#   #s.auth = (token01, token02)
+#   resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+#   content = resp.content
+#   # transform to dictionary: (same post-processing as above)
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)
+
+    return record
+
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: list of records (rows) of the merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_repository_URL(repository_file):
+    '''
+    Read out and return the URL stored in the given file.
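+    Comment lines (starting with '#') are skipped. A repository file could
+    look like this (illustrative content):
+      # MFSearcher ExactMassDB service
+      http://webs2.kazusa.or.jp/mfsearcher/exmassdb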
+    '''
+    file_input = fileinput.input(repository_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # return the first line that is not a comment line,
+                # stripped of its trailing newline:
+                return line.strip()
+    finally:
+        file_input.close()
+
+
+def main():
+    '''
+    Query main function
+
+    The input file can be any tabular file, as long as it contains a column for the molecular mass.
+    This column is then used to query against the chosen repository/service Database.
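+
+    Example invocation (file names are illustrative):
+      python query_mass_repos.py input.txt MM repository_service.txt 5.0 ppm output.txt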
+    '''
+    seconds_start = int(round(time.time()))
+
+    input_file = sys.argv[1]
+    molecular_mass_col = sys.argv[2]
+    repository_file = sys.argv[3]
+    error_margin = float(sys.argv[4])
+    margin_unit = sys.argv[5]
+    output_result = sys.argv[6]
+
+    # Parse repository_file to find the URL to the service:
+    repository_dblink = _get_repository_URL(repository_file)
+
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+
+    # Query data against repository:
+    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
+    headers = input_data.keys() + ['SEARCH hits for', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas',
+                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']
+
+    _save_data(enriched_data, headers, output_result)
+
+    seconds_end = int(round(time.time()))
+    print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+    main()
diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.xml Thu Apr 03 16:44:11 2014 +0200
@@ -0,0 +1,106 @@
+
+ Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
+
+ query_mass_repos.py
+ $input_file
+ $molecular_mass_col
+ "$repository_file"
+ $error_margin
+ $margin_unit
+ $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
+for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository. If one or more compounds are found in the
+repository, extra information regarding (mass-based) matching elemental composition formulas is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass.
+
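+For example, a minimal input file could look like this (illustrative values;
+"MM" is just an example name for the molecular mass column)::
+
+  id        MM
+  cmpd_1    500.3000
+  cmpd_2    150.0584
+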
+**Services that can be queried**
+
+================= =========================================================================
+Database          Description
+----------------- -------------------------------------------------------------------------
+PRI-MetExp        LC-MS and GC-MS data from experiments from the metabolomics group at
+                  Plant Research International. NB: restricted access to employees with
+                  access key.
+ExactMassDB       A database of possible elemental compositions consisting of at most
+                  C: 100, H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the
+                  Senior and the Lewis valence rules.
+                  (via /mfsearcher/exmassdb/)
+ExactMassDB-HR2   HR2, which is one of the fastest tools for calculation of elemental
+                  compositions, filters some elemental compositions according to
+                  the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
+                  database returns the same result as does HR2 with the same atom kind
+                  and number condition as that used in construction of the ExactMassDB.
+                  (via /mfsearcher/exmassdb-hr2/)
+Pep1000           A database of possible linear polypeptides that are
+                  constructed with 20 kinds of amino acids and having molecular
+                  weights smaller than 1000.
+                  (via /mfsearcher/pep1000/)
+KEGG              Re-calculated compound data from KEGG. Weekly updated.
+                  (via /mfsearcher/kegg/)
+KNApSAcK          Re-calculated compound data from KNApSAcK.
+                  (via /mfsearcher/knapsack/)
+Flavonoid Viewer  Re-calculated compound data from Flavonoid Viewer.
+                  (via /mfsearcher/flavonoidviewer/)
+LipidMAPS         Re-calculated compound data from LIPID MAPS.
+                  (via /mfsearcher/lipidmaps/)
+HMDB              Re-calculated compound data from Human Metabolome Database (HMDB)
+                  Version 3.5.
+                  (via /mfsearcher/hmdb/)
+PubChem           Re-calculated compound data from PubChem. Monthly updated.
+                  (via /mfsearcher/pubchem/)
+================= =========================================================================
+
+Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
+
+
+
diff -r cd4f13119afa -r 85fd05d0d16c test/__init__.py
--- a/test/__init__.py Thu Mar 06 14:29:55 2014 +0100
+++ b/test/__init__.py Thu Apr 03 16:44:11 2014 +0200
@@ -1,1 +1,1 @@
-''' BRS GCMS Galaxy Tools Module '''
+''' unit tests '''
diff -r cd4f13119afa -r 85fd05d0d16c test/test_query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_mass_repos.py Thu Apr 03 16:44:11 2014 +0200
@@ -0,0 +1,62 @@
+'''Integration tests for the query_mass_repos tool'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from MS import query_mass_repos
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+    def test_simple(self):
+        '''
+        Simple initial test
+        '''
+        # Create out folder
+        outdir = "output/query_mass_repos/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        # Build up arguments and run
+
+        # input_file = sys.argv[1]
+        # molecular_mass_col = sys.argv[2]
+        # repository_file = sys.argv[3]
+        # error_margin = float(sys.argv[4])
+        # margin_unit = sys.argv[5]
+        # output_result = sys.argv[6]
+
+        input_file = resource_filename(__name__, "data/service_query_tabular.txt")
+
+        molecular_mass_col = "MM"
+        dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+        sys.argv = ['test',
+                    input_file,
+                    molecular_mass_col,
+                    dblink_file,
+                    '0.001',
+                    'ms',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_mass_repos.main()
+
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename: the file to read
+    '''
+    with open(filename) as handle:
+        return handle.read()