changeset 21:19d8fd10248e

* Added interface to METEXP data store, including a tool to fire queries in batch mode * Improved quantification output files of MsClust, e.g. sorting the mass list based on intensity (last two columns of the quantification files) * Added Molecular Mass calculation method
author pieter.lukasse@wur.nl
date Wed, 05 Mar 2014 17:20:11 +0100
parents 24fb75fedee0
children cd4f13119afa
files MsClust.jar README.rst combine_output.py export_to_metexp_tabular.py export_to_metexp_tabular.xml msclust.xml query_metexp.py query_metexp.xml rankfilterGCMS_tabular.xml static_resources/elements_and_masses.tab test/test_export_to_metexp_tabular.py test/test_query_metexp.py test/test_query_metexp_LARGE.py
diffstat 13 files changed, 826 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
Binary file MsClust.jar has changed
--- a/README.rst	Tue Feb 11 12:29:50 2014 +0100
+++ b/README.rst	Wed Mar 05 17:20:11 2014 +0100
@@ -19,6 +19,11 @@
 ============== ======================================================================
 Date            Changes
 -------------- ----------------------------------------------------------------------
+March 2014     * Added interface to METEXP data store, including a tool to fire
+                 queries in batch mode
+               * Improved quantification output files of MsClust, e.g. sorting the
+                 mass list based on intensity (last two columns of the quantification
+                 files)
 January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a 
                  single package (this package)
                * integration with METEXP software (data store for metabolomics 
--- a/combine_output.py	Tue Feb 11 12:29:50 2014 +0100
+++ b/combine_output.py	Wed Mar 05 17:20:11 2014 +0100
@@ -155,12 +155,16 @@
     @param data: dictionary containing merged dataset
     @param out_csv: output csv file
     '''
-    header = ['Centrotype',
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
               'cent.Factor',
               'scan nr.',
               'R.T. (umin)',
               'nr. Peaks',
-              'R.T.',
+              'R.T.']
+    # These are the headers/columns we repeat in case of 
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = [
               'Name',
               'FORMULA',
               'Library',
@@ -190,13 +194,21 @@
     output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
 
     # Write headers
-    output_single_handle.writerow(header)
-    output_multi_handle.writerow(header * nhits)
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
     # Combine all hits for each centrotype into one line
     line = []
     for centrotype_idx in xrange(len(data)):
+        i = 0
         for hit in data[centrotype_idx]:
-            line.extend(hit)
+            if i==0:
+                line.extend(hit)
+            else:
+                line.extend(hit[6:])
+            i = i+1
+        # small validation (if this fails, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
         output_multi_handle.writerow(line)
         line = []
 
@@ -205,6 +217,17 @@
         for hit in data[centrotype_idx]:
             output_single_handle.writerow(hit)
 
+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    Iterates over the header names and prefixes each with 'ALT#_', where # is the
+    number of the alternative hit (1..nr_alternative_hits). The result is the list
+    of extra header columns appended to the final csv/tsv.
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits): 
+        for header_name in header_part2:
+            result.append("ALT" + str(i+1) + "_" + header_name) 
+    return result
 
 def main():
     '''
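
The repeated-header layout produced by the new alternative_headers helper is easy to check in isolation; a minimal stand-alone sketch (same logic as the patch, with a shortened header_part2 for brevity):

def alternative_headers(header_part2, nr_alternative_hits):
    # Prefix every repeated column with ALT1_, ALT2_, ... per alternative hit:
    result = []
    for i in range(nr_alternative_hits):
        for header_name in header_part2:
            result.append("ALT" + str(i + 1) + "_" + header_name)
    return result

print(alternative_headers(['Name', 'FORMULA'], 2))
# -> ['ALT1_Name', 'ALT1_FORMULA', 'ALT2_Name', 'ALT2_FORMULA']
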
--- a/export_to_metexp_tabular.py	Tue Feb 11 12:29:50 2014 +0100
+++ b/export_to_metexp_tabular.py	Wed Mar 05 17:20:11 2014 +0100
@@ -5,17 +5,18 @@
 into a tabular file that can be uploaded to the MetExp database.
 
 RankFilter, CasLookup are already combined by combine_output.py so here we will use
-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
-quantification files are to be combined with combine_output.py result as well. 
+this result. Furthermore, one of the MsClust quantification files, which also
+contains the respective spectra details, is combined as well.
 
 Extra calculations performed:
 - The column MW is also added here and is derived from the column FORMULA found 
-  in combine_output.py result. 
+  in the combined RankFilter/CasLookup result. 
   
-So in total here we merge 3 files and calculate one new column. 
+So in total, two files are merged here and one new column is calculated. 
 '''
-
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
 import csv
+import re
 import sys
 from collections import OrderedDict
 
@@ -40,14 +41,15 @@
 ONE_TO_ONE = 'one_to_one'
 N_TO_ONE = 'n_to_one'
 
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
     '''
     Merges data from both input dictionaries based on the link fields. This method will
     build up a new list containing the merged hits as the items. 
     @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
     @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
     '''
-    # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+    # TODO test for correct input files -> same link_field values should be there 
+    # (test at least number of unique link_field values):
     #
     # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
     #    raise Exception('input files should have the same nr of key values  ')
@@ -64,17 +66,23 @@
             # Get the indices for current link_field_set1_value in both data-structures for proper matching
             set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
             set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
-            
-            
+            # Validation:
+            if len(set2index) == 0:
+                # corresponding data could not be found in set2, so raise an error:
+                raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + 
+                                link_field_set1_value + " only found in first dataset. ")
             
             merged_hits = []
             # Combine hits
             for hit in xrange(len(set1index)):
                 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
                 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
-                # corresponding value in the rankfilter or caslookup tables; i.e. 
-                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
-                #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                # corresponding value in the sets; i.e. 
+                # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+                #                    represented by "key". 
+                # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+                # So set1[x][set1index[n]] = set1.attributeX.instanceN
+                #
                 # It just ensures the entry is made available as a plain named array for easy access.
                 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
                 if relation_type == ONE_TO_ONE :
@@ -83,7 +91,7 @@
                     # is N to 1:
                     cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
                 
-                merged_hit = merge_function(rf_record, cl_record)
+                merged_hit = merge_function(rf_record, cl_record, metadata)
                 merged_hits.append(merged_hit)
                 
             merged.append(merged_hits)
@@ -103,29 +111,62 @@
     
     
     
-def _merge_records(rank_caslookup_combi, msclust_quant_record):
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
     '''
     Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
     
     @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
     @param msclust_quant_record: msclust quantification + spectrum record
     '''
-    i = 0
     record = []
     for column in rank_caslookup_combi:
         record.append(rank_caslookup_combi[column])
-        i += 1
     
     for column in msclust_quant_record:
         record.append(msclust_quant_record[column])
-        i += 1
+        
+    for column in metadata:
+        record.append(metadata[column])
+        
+    # add MOLECULAR MASS (MM) 
+    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+    # limit to two decimals:    
+    record.append("{0:.2f}".format(molecular_mass))    
+        
+    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+    record.append('0.0')    
+    
+    # level of identification and Location of reference standard
+    record.append('0')
+    record.append('')    
         
     return record
 
 
-
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM). 
+    E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
+    '''
+    
+    # Each element is represented by a capital letter, optionally followed by a
+    # lower-case letter, optionally followed by digits giving the element count:
+    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
 
-def _save_data(data, headers, nhits, out_csv):
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        element_mass = float(elements_and_masses_map[element_name])  # Python's built-in float has double precision
+        total_mass += element_mass * count
+        
+    return total_mass
+    
+    
+
+def _save_data(data, headers, out_csv):
     '''
     Writes tab-separated data to file
     @param data: dictionary containing merged dataset
@@ -139,12 +180,35 @@
     # Write headers
     output_single_handle.writerow(headers)
 
-    # Write one line for each centrotype
-    for centrotype_idx in xrange(len(data)):
-        for hit in data[centrotype_idx]:
+    # Write one line for each item
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
             output_single_handle.writerow(hit)
 
 
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    Builds a map from the 'Chemical symbol' column to the corresponding
+    'Relative atomic mass' value.
+    '''
+    resultMap = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+        
+    return resultMap
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+    
+
 def main():
     '''
     Combine Output main function
@@ -156,15 +220,27 @@
     rankfilter_and_caslookup_combined_file = sys.argv[1]
     msclust_quantification_and_spectra_file = sys.argv[2]
     output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
 
     # Read RankFilter and CasLookup output files
     rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
     msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
     
+    # Read elements and masses to use for the MW/MM calculation :
+    init_elements_and_masses_map()
+    
     merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 
-                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
-    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
-    _save_data(merged, headers, nhits, output_csv)
+                                msclust_quantification_and_spectra, 'centrotype', 
+                                _compare_records, _merge_records, metadata,
+                                N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
 
 
 if __name__ == '__main__':
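
The Molecular Mass calculation added above is a regex scan over the formula combined with a lookup in the elements table; a minimal stand-alone sketch, using a hard-coded subset of the masses instead of static_resources/elements_and_masses.tab:

import re

# Small subset of the relative atomic masses shipped in elements_and_masses.tab:
ELEMENT_MASSES = {'C': 12.01, 'H': 1.01, 'O': 16.0}

def get_molecular_mass(formula):
    # An element is a capital letter, optionally a lower-case letter, optionally digits for the count:
    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
    total_mass = 0.0
    for element_name, count in element_pattern.findall(formula):
        total_mass += ELEMENT_MASSES[element_name] * (int(count) if count else 1)
    return total_mass

print("{0:.2f}".format(get_molecular_mass("C8H18O3")))
# -> 162.26, the value asserted in test_export_to_metexp_tabular.py below
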
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,57 @@
+<tool id="export_to_metexp_tabular" 
+    name="METEXP - Tabular file" 
+    version="0.1.0">
+  <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result 
+    $organism $tissue $experiment_name $user_name $column_type
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+    	help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+    <param format="tabular" name="msclust_quant_file" type="data" label="MusClust-quantification file output" 
+    	help="Select the output file from MsClust (centrotype, mic or sim) which also contain respective spectrum details"/>
+    	
+    	
+   <param name="organism" type="text" size="80"
+           label="Organism(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB." /> 	
+   <param name="tissue" type="text" size="80"
+           label="Tissue(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB." />
+           
+   <param name="experiment_name" type="text" size="80"
+           label="Experiment name/code"
+           help="Name or code to store the results under. This can help you find the results back in MetExpDB." />  
+           
+   <param name="user_name" type="text" size="80"
+           label="User name"
+           help="User name or code to store the results under. This can help you find the results back in MetExpDB." />  
+                   
+    <param name="column_type" type="text" size="80"
+           label="Column type"
+           help="Column type to report with the results. This can help you find the results back in MetExpDB." />    
+    
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+  </outputs>
+  <help>
+.. class:: infomark  
+  
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output is already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so this
+result is used here. 
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are also added here; they are derived from the FORMULA column of the combined RankFilter/CasLookup result. 
+  
+So in total, two files are merged here and the new columns are calculated. 
+  
+    
+  </help>
+</tool>
--- a/msclust.xml	Tue Feb 11 12:29:50 2014 +0100
+++ b/msclust.xml	Wed Mar 05 17:20:11 2014 +0100
@@ -1,4 +1,4 @@
-<tool name="MsClust" id="msclust2" version="2.0.2">
+<tool name="MsClust" id="msclust2" version="2.0.3">
 	<description>Extracts fragmentation spectra from aligned data</description>
 	<!-- 
 	   For remote debugging start you listener on port 8000 and use the following as command interpreter:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.py	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,273 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and, for each record, query the CAS number, formula
+and molecular mass against the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about 
+related items found in the selected MetExp DB.   
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+    
+def _process_data(data):
+    
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the 
+    MetExp Database.
+    '''
+    merged = []
+    
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+        
+        # read the molecular mass and formula:
+        cas_id = input_data_record[casid_col]
+        formula = input_data_record[formula_col]
+        molecular_mass = input_data_record[molecular_mass_col]
+        
+        # search for related records in MetExp:
+        data_found = None
+        if cas_id != "undef": 
+            # 1- search for other experiments where this CAS id has been found:
+            query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "CAS"
+        if data_found == None:
+            # 2- search for other experiments where this FORMULA has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "FORMULA"
+        if data_found == None:
+            # 3- search for other experiments where this MM has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method 
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+                
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '','','','','','','']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+        
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+    
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+    '''
+    This method will go over the data found and will return a 
+    list with the following items:
+    - Experiment details where hits have been found :
+        'organism', 'tissue','experiment_name','user_name','column_type'
+    - Link that executes same query
+        
+    '''
+    # set() makes a unique list:
+    organism_set = []
+    tissue_set = []
+    experiment_name_set = []
+    user_name_set = []
+    column_type_set = []
+    cas_nr_set = []
+    
+    if 'organism' in data_found:
+        organism_set = set(data_found['organism'])
+    if 'tissue' in data_found:
+        tissue_set = set(data_found['tissue'])
+    if 'experiment_name' in data_found:
+        experiment_name_set = set(data_found['experiment_name'])
+    if 'user_name' in data_found:
+        user_name_set = set(data_found['user_name'])
+    if 'column_type' in data_found:
+        column_type_set = set(data_found['column_type'])
+    if 'CAS' in data_found:
+        cas_nr_set = set(data_found['CAS'])        
+    
+    
+    result = [data_type_found,
+              _to_xsv(organism_set),
+              _to_xsv(tissue_set),
+              _to_xsv(experiment_name_set),
+              _to_xsv(user_name_set),
+              _to_xsv(column_type_set),
+              _to_xsv(cas_nr_set),              
+              #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): 
+              "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+    return result
+
+
+def _to_xsv(data_set):
+    result = ""
+    for item in data_set:
+        result = result + str(item) + "|"    
+    return result
+
+
+def _fire_query_and_return_dict(url):
+    '''
+    This method will fire the query as a web-service call and 
+    return the results as a list of dictionary objects
+    '''
+    
+    try:
+        data = urllib2.urlopen(url).read()
+        
+        # transform to dictionary:
+        result = []
+        data_rows = data.split("\n")
+        
+        # check if there is any data in the response:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '': 
+            # means there is only the header row...so no hits:
+            return None
+        
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+        
+        # return result processed into a dict:
+        return _process_data(result)
+        
+    except urllib2.HTTPError, e:
+        raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):    
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+    
+    
+# alternative: ?    
+#     s = requests.Session()
+#     s.verify = False
+#     #s.auth = (token01, token02)
+#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+#     content = resp.content
+#     # transform to dictionary:
+    
+    
+    
+    
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+    
+    
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)    
+    
+    return record  
+    
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: dictionary containing merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+    '''
+    Read out and return the URL stored in the given file.
+    '''
+    file_input = fileinput.input(metexp_dblink_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # return the first non-comment line, stripped so its newline does not end up in the query URLs:
+                return line.strip()
+    finally:
+        file_input.close()
+    
+
+def main():
+    '''
+    MetExp Query main function
+    
+    The input file can be any tabular file, as long as it contains a column for the molecular mass
+    and one for the formula of the respective identification. These two columns are then
+    used to query against MetExp Database.   
+    '''
+    input_file = sys.argv[1]
+    casid_col = sys.argv[2]
+    formula_col = sys.argv[3]
+    molecular_mass_col = sys.argv[4]
+    metexp_dblink_file = sys.argv[5]
+    separation_method = sys.argv[6]
+    output_result = sys.argv[7]
+
+    # Parse metexp_dblink_file to find the URL to the MetExp service:
+    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+    
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+    
+    # Query data against MetExp DB :
+    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+    headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
+                                   'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+    
+    _save_data(enriched_data, headers, output_result)
+
+
+if __name__ == '__main__':
+    main()
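
The enrichment in _query_and_add_data falls back from CAS number to formula to molecular mass until one of the queries returns hits; stripped of the CSV and URL plumbing, the cascade amounts to the sketch below (query_db is a hypothetical stand-in for _fire_query_and_return_dict):

def find_related_entries(record, query_db):
    # Try the most specific key first, then fall back to less specific ones:
    if record['CAS'] != 'undef':
        hits = query_db(cas_nr=record['CAS'])
        if hits is not None:
            return 'CAS', hits
    hits = query_db(molecule_formula=record['FORMULA'])
    if hits is not None:
        return 'FORMULA', hits
    hits = query_db(molecule_mass=record['MM'])
    if hits is not None:
        return 'MM', hits
    return None, None
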
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.xml	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,67 @@
+<tool id="query_metexp" 
+    name="METEXP - Query Database " 
+    version="0.1.0">
+  <description>Query a set of identifications against the METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    query_metexp.py 
+    $input_file 
+    $casid_col
+    $formula_col 
+    $molecular_mass_col 
+    "$metexp_dblink_file"
+    $separation_method
+    $output_result 
+  </command>
+  <inputs>
+  
+   <param name="input_file" format="tabular" type="data" 
+        label="Input file"
+    	help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
+		
+   <param name="casid_col" type="text" size="50"
+           label="CAS ID column name"
+           value="CAS"
+           help="Name of the column containing the CAS code information (in the given input file)" /> 	
+   <param name="formula_col" type="text" size="50"
+           label="Formula ID column name"
+           value="FORMULA"
+           help="Name of the column containing the formula information (in the given input file)" /> 	
+   <param name="molecular_mass_col" type="text" size="50"
+           label="Molecular mass column name"
+           value="MM"
+           help="Name of the column containing the molecular mass information (in the given input file)" /> 	
+   
+   <param name="metexp_dblink_file" type="select" label="MetExp DB to query" 
+      		 help="Select the MetExp Database/backend which should be queried" 
+      		 dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/>
+      		 
+  <param name="separation_method" type="select" label="Data type to query">
+  	<option value="GC" selected="True">GC</option>
+    <option value="LC">LC</option>
+  </param>    		     	
+    
+  </inputs>
+  <outputs>
+    <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
+  </outputs>
+  <code file="match_library.py" /> <!-- file containing get_directory_files function used above-->
+  <help>
+.. class:: infomark  
+  
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and, for each record, query the CAS number, formula
+and molecular mass against the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about 
+related items found in the selected MetExp DB.  
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.  
+  
+    
+  </help>
+</tool>
--- a/rankfilterGCMS_tabular.xml	Tue Feb 11 12:29:50 2014 +0100
+++ b/rankfilterGCMS_tabular.xml	Wed Mar 05 17:20:11 2014 +0100
@@ -3,7 +3,7 @@
   <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command>
   <inputs>
     <param format="tabular" name="sample" type="data" label="Sample File" 
-	       help="Converted PDF file in tabular format" />
+	       help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
 	<!-- question: is this calibration file not column specific as it includes RT info?? -->
     <!-- this one should be input file for now:<param name="calibration"  type="select" label="Calibration File" 
            help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/static_resources/elements_and_masses.tab	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,104 @@
+Name	Atomic number	Chemical symbol	Relative atomic mass
+Hydrogen	1	H	1.01
+Helium	2	He	4
+Lithium	3	Li	6.94
+Beryllium	4	Be	9.01
+Boron	5	B	10.81
+Carbon	6	C	12.01
+Nitrogen	7	N	14.01
+Oxygen	8	O	16
+Fluorine	9	F	19
+Neon	10	Ne	20.18
+Sodium	11	Na	22.99
+Magnesium	12	Mg	24.31
+Aluminum	13	Al	26.98
+Silicon	14	Si	28.09
+Phosphorus	15	P	30.98
+Sulfur	16	S	32.06
+Chlorine	17	Cl	35.45
+Argon	18	Ar	39.95
+Potassium	19	K	39.1
+Calcium	20	Ca	40.08
+Scandium	21	Sc	44.96
+Titanium	22	Ti	47.9
+Vanadium	23	V	50.94
+Chromium	24	Cr	52
+Manganese	25	Mn	54.94
+Iron	26	Fe	55.85
+Cobalt	27	Co	58.93
+Nickel	28	Ni	58.71
+Copper	29	Cu	63.54
+Zinc	30	Zn	65.37
+Gallium	31	Ga	69.72
+Germanium	32	Ge	72.59
+Arsenic	33	As	74.99
+Selenium	34	Se	78.96
+Bromine	35	Br	79.91
+Krypton	36	Kr	83.8
+Rubidium	37	Rb	85.47
+Strontium	38	Sr	87.62
+Yttrium	39	Y	88.91
+Zirconium	40	Zr	91.22
+Niobium	41	Nb	92.91
+Molybdenum	42	Mo	95.94
+Technetium	43	Tc	96.91
+Ruthenium	44	Ru	101.07
+Rhodium	45	Rh	102.9
+Palladium	46	Pd	106.4
+Silver	47	Ag	107.87
+Cadmium	48	Cd	112.4
+Indium	49	In	114.82
+Tin	50	Sn	118.69
+Antimony	51	Sb	121.75
+Tellurium	52	Te	127.6
+Iodine	53	I	126.9
+Xenon	54	Xe	131.3
+Cesium	55	Cs	132.9
+Barium	56	Ba	137.34
+Lanthanum	57	La	138.91
+Cerium	58	Ce	140.12
+Praseodymium	59	Pr	140.91
+Neodymium	60	Nd	144.24
+Promethium	61	Pm	144.91
+Samarium	62	Sm	150.35
+Europium	63	Eu	151.96
+Gadolinium	64	Gd	157.25
+Terbium	65	Tb	158.92
+Dysprosium	66	Dy	162.5
+Holmium	67	Ho	164.93
+Erbium	68	Er	167.26
+Thulium	69	Tm	168.93
+Ytterbium	70	Yb	173.04
+Lutetium	71	Lu	174.97
+Hafnium	72	Hf	178.49
+Tantalum	73	Ta	180.95
+Wolfram	74	W	183.85
+Rhenium	75	Re	186.2
+Osmium	76	Os	190.2
+Iridium	77	Ir	192.22
+Platinum	78	Pt	195.09
+Gold	79	Au	196.97
+Mercury	80	Hg	200.59
+Thallium	81	Tl	204.37
+Lead	82	Pb	207.19
+Bismuth	83	Bi	208.98
+Polonium	84	Po	208.98
+Astatine	85	At	209.99
+Radon	86	Rn	222.02
+Francium	87	Fr	223.02
+Radium	88	Ra	226
+Actinium	89	Ac	227.03
+Thorium	90	Th	232.04
+Protactinium	91	Pa	231.04
+Uranium	92	U	238.03
+Neptunium	93	Np	237
+Plutonium	94	Pu	242
+Americium	95	Am	243.06
+Curium	96	Cm	247.07
+Berkelium	97	Bk	247.07
+Californium	98	Cf	251.08
+Einsteinium	99	Es	254.09
+Fermium	100	Fm	257.1
+Mendelevium	101	Md	257.1
+Nobelium	102	No	255.09
+Lawrencium	103	Lr	256.1
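
Loading this table into the lookup map used by the MM calculation is a two-column read; a minimal reader, assuming the tab-separated layout above (the patch itself goes through _process_data and keeps the masses as strings until float() is applied):

import csv

def load_element_masses(path="static_resources/elements_and_masses.tab"):
    # Maps 'Chemical symbol' -> 'Relative atomic mass', e.g. 'H' -> 1.01
    masses = {}
    with open(path, 'r') as handle:
        for row in csv.DictReader(handle, delimiter='\t'):
            masses[row['Chemical symbol']] = float(row['Relative atomic mass'])
    return masses
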
--- a/test/test_export_to_metexp_tabular.py	Tue Feb 11 12:29:50 2014 +0100
+++ b/test/test_export_to_metexp_tabular.py	Wed Mar 05 17:20:11 2014 +0100
@@ -10,6 +10,27 @@
 class IntegrationTest(unittest.TestCase):
 
 
+    def test_MM_calculations(self):
+        '''
+        test the implemented method for MM calculations for 
+        given chemical formulas
+        '''
+        export_to_metexp_tabular.init_elements_and_masses_map()
+        
+        formula = "C8H18O3"
+        # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertEqual(162.26, result)
+        
+        formula = "CH2O3Fe2Ni"
+        # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertAlmostEqual(232.44, result, 2)
+        
+        
+        
+        
+
     def test_combine_output_simple(self):
         '''
         comment me
@@ -28,7 +49,13 @@
         sys.argv = ['test',
                     rankfilter_and_caslookup_combined_file,
                     msclust_quantification_and_spectra_file,
-                    output_csv]
+                    output_csv, 
+                    'tomato',
+                    'leafs',
+                    'test experiment',
+                    'pieter',
+                    'DB5 column']
+        
         # Execute main function with arguments provided through sys.argv
         export_to_metexp_tabular.main()
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp.py	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,82 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#     def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for 
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#         
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#         
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#         
+#         
+#         
+        
+
+    def test_simple(self):
+        '''
+        Simple initial test
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+        
+        #         input_file = sys.argv[1];   casid_col = sys.argv[2]
+        #         formula_col = sys.argv[3];  molecular_mass_col = sys.argv[4]
+        #         metexp_dblink_file = sys.argv[5]
+        #         separation_method = sys.argv[6]
+        #         output_result = sys.argv[7]
+        
+        input_file = resource_filename(__name__, "data/metexp_query_tabular.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+    
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col, 
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+        
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+        
+        # TODO - asserts (base them on the DB being filled with test data from the metexp unit test for the upload method)
+        # PA
+
+        
+   
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()
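
The TODO above leaves the asserts of test_simple open; once the test MetExp DB is filled, a minimal check could simply verify the enrichment columns in the header row of the output (hypothetical helper, not part of the changeset, reusing _read_file):

def assert_enrichment_columns(self, output_result):
    # The query tool should have appended its enrichment columns to the header row:
    header = _read_file(output_result).splitlines()[0].split('\t')
    self.assertTrue('METEXP hits: organisms' in header)
    self.assertTrue('Link to METEXP hits' in header)
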
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp_LARGE.py	Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,79 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#     def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for 
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#         
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#         
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#         
+#         
+#         
+        
+
+    def test_large(self):
+        '''
+        Simple test, but on a larger set; the last run took about 28s
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+        
+        #         input_file = sys.argv[1];   casid_col = sys.argv[2]
+        #         formula_col = sys.argv[3];  molecular_mass_col = sys.argv[4]
+        #         metexp_dblink_file = sys.argv[5]
+        #         separation_method = sys.argv[6]
+        #         output_result = sys.argv[7]
+        
+        input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt")
+    
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col, 
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+        
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+
+        
+   
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()