# HG changeset patch # User pieter.lukasse@wur.nl # Date 1396536251 -7200 # Node ID 85fd05d0d16cd8e82e7564cb370cea76190eac47 # Parent cd4f13119afad476e58af654922c6efc8ec4abb0 New tool to Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers diff -r cd4f13119afa -r 85fd05d0d16c __init__.py --- a/__init__.py Thu Mar 06 14:29:55 2014 +0100 +++ b/__init__.py Thu Apr 03 16:44:11 2014 +0200 @@ -1,6 +1,6 @@ -''' -Module containing Galaxy tools for the GC/MS pipeline -Created on Mar 6, 2012 - -@author: marcelk -''' +''' +Module containing Galaxy tools for the LC or GC/MS pipeline +Created on Mar , 2014 + +@author: pieter lukasse +''' \ No newline at end of file diff -r cd4f13119afa -r 85fd05d0d16c datatypes_conf.xml --- a/datatypes_conf.xml Thu Mar 06 14:29:55 2014 +0100 +++ b/datatypes_conf.xml Thu Apr 03 16:44:11 2014 +0200 @@ -3,9 +3,6 @@ - - diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.py Thu Apr 03 16:44:11 2014 +0200 @@ -0,0 +1,289 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of accurate mass values detected by high-resolution mass spectrometers +against various repositories/services such as METabolomics EXPlorer database or the +MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). + +It will take the input file and for each record it will query the +molecular mass in the selected repository/service. If one or more compounds are found +then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository/service. 
+ +The service should implement the following interface: + +http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) + +The output should be tab separated and should contain the following columns (in this order) +db-name molecular-formula dbe formula-weight id description + + +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): + + ''' + This method will iterate over the record in the input_data and + will enrich them with the related information found (if any) in the + chosen repository/service + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass : + molecular_mass = 
input_data_record[molecular_mass_col] + + + # search for related records in repository/service: + data_found = None + if molecular_mass != "": + molecular_mass = float(molecular_mass) + + # 1- search for data around this MM: + query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" + + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - details of hits found : + db-name molecular-formula dbe formula-weight id description + - Link that executes same query + + ''' + + # set() makes a unique list: + db_name_set = [] + molecular_formula_set = [] + id_set = [] + description_set = [] + + + if 'db-name' in data_found: + db_name_set = set(data_found['db-name']) + elif '# db-name' in data_found: + db_name_set = set(data_found['# db-name']) + if 'molecular-formula' in data_found: + molecular_formula_set = set(data_found['molecular-formula']) + if 'id' in data_found: + id_set = set(data_found['id']) + if 'description' in data_found: + description_set = set(data_found['description']) + + result = [data_type_found, + _to_xsv(db_name_set), + _to_xsv(molecular_formula_set), + _to_xsv(id_set), + _to_xsv(description_set), + #To let Excel interpret as link, use e.g. 
=HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): + data_rows_to_remove = [] + for data_row in data_rows: + if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): + data_rows_to_remove.append(data_row) + + for data_row in data_rows_to_remove: + data_rows.remove(data_row) + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? 
+# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. + ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_repository_URL(repository_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(repository_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass. + This column is then used to query against the chosen repository/service Database. 
+ ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + molecular_mass_col = sys.argv[2] + repository_file = sys.argv[3] + error_margin = float(sys.argv[4]) + margin_unit = sys.argv[5] + output_result = sys.argv[6] + + # Parse repository_file to find the URL to the service: + repository_dblink = _get_repository_URL(repository_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against repository : + enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) + headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', + 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main() diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.xml Thu Apr 03 16:44:11 2014 +0200 @@ -0,0 +1,106 @@ + + Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers + + query_mass_repos.py + $input_file + $molecular_mass_col + "$repository_file" + $error_margin + $margin_unit + $output_result + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. 
If one or more compounds are found in the +repository then extra information regarding (mass based) matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consists of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. + (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer. + (via /mfsearcher/flavonoidviewer/) +LipidMAPS Re-calculated compound data from LIPID MAPS. 
+ (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + + diff -r cd4f13119afa -r 85fd05d0d16c test/__init__.py --- a/test/__init__.py Thu Mar 06 14:29:55 2014 +0100 +++ b/test/__init__.py Thu Apr 03 16:44:11 2014 +0200 @@ -1,1 +1,1 @@ -''' BRS GCMS Galaxy Tools Module ''' +''' unit tests ''' diff -r cd4f13119afa -r 85fd05d0d16c test/test_query_mass_repos.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_mass_repos.py Thu Apr 03 16:44:11 2014 +0200 @@ -0,0 +1,62 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from MS import query_mass_repos +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/query_mass_repos/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # repository_file = sys.argv[3] + # mass_tolerance = float(sys.argv[4]) + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/service_query_tabular.txt") + + molecular_mass_col = "MM" + dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + + + + sys.argv = ['test', + input_file, + molecular_mass_col, + dblink_file, + '0.001', + 'ms', + output_result] + + # Execute main function with arguments provided through sys.argv + query_mass_repos.main() 
+ + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()