Galaxy |

Changeset 21:19d8fd10248e (2014-03-05)

Previous changeset 20:24fb75fedee0 (2014-02-11) Next changeset 22:cd4f13119afa (2014-03-06)

Commit message:
* Added interface to METEXP data store, including tool to fire queries in batch mode * Improved quantification output files of MsClust, a.o. sorting mass list based on intensity (last two columns of quantification files) * Added Molecular Mass calculation method

modified:
MsClust.jar
README.rst
combine_output.py
export_to_metexp_tabular.py
msclust.xml
rankfilterGCMS_tabular.xml
test/test_export_to_metexp_tabular.py

added:
export_to_metexp_tabular.xml
query_metexp.py
query_metexp.xml
static_resources/elements_and_masses.tab
test/test_query_metexp.py
test/test_query_metexp_LARGE.py

diff -r 24fb75fedee0 -r 19d8fd10248e MsClust.jar

Binary file MsClust.jar has changed

diff -r 24fb75fedee0 -r 19d8fd10248e README.rst
--- a/README.rst Tue Feb 11 12:29:50 2014 +0100
+++ b/README.rst Wed Mar 05 17:20:11 2014 +0100

@@ -19,6 +19,11 @@
============== ======================================================================
Date            Changes
-------------- ----------------------------------------------------------------------
+March 2014     * Added interface to METEXP data store, including tool to fire
+                 queries in batch mode
+               * Improved quantification output files of MsClust, among other
+                 things sorting the mass list based on intensity (last two
+                 columns of quantification files)
January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a
                  single package (this package)
                * integration with METEXP software (data store for metabolomics

diff -r 24fb75fedee0 -r 19d8fd10248e combine_output.py
--- a/combine_output.py Tue Feb 11 12:29:50 2014 +0100
+++ b/combine_output.py Wed Mar 05 17:20:11 2014 +0100

[

@@ -155,12 +155,16 @@
     @param data: dictionary containing merged dataset
     @param out_csv: output csv file
     '''
-    header = ['Centrotype',
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
               'cent.Factor',
               'scan nr.',
               'R.T. (umin)',
               'nr. Peaks',
-              'R.T.',
+              'R.T.']
+    # These are the headers/columns we repeat in case of
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = [
               'Name',
               'FORMULA',
               'Library',
@@ -190,13 +194,21 @@
     output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")

     # Write headers
-    output_single_handle.writerow(header)
-    output_multi_handle.writerow(header * nhits)
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
     # Combine all hits for each centrotype into one line
     line = []
     for centrotype_idx in xrange(len(data)):
+        i = 0
         for hit in data[centrotype_idx]:
-            line.extend(hit)
+            if i==0:
+                line.extend(hit)
+            else:
+                line.extend(hit[6:])
+            i = i+1
+        # small validation (if error, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits that expected for  centrotype_idx ' + centrotype_idx)
         output_multi_handle.writerow(line)
         line = []

@@ -205,6 +217,17 @@
         for hit in data[centrotype_idx]:
             output_single_handle.writerow(hit)

+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    This method will iterate over the header names and add the string 'ALT#_' before each,
+    where # is the number of the alternative, according to number of alternative hits we want to add
+    to final csv/tsv
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits):
+        for header_name in header_part2:
+            result.append("ALT" + str(i+1) + "_" + header_name)
+    return result

def main():
     '''

diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.py
--- a/export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100
+++ b/export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100

[

b'@@ -5,17 +5,18 @@\n into a tabular file that can be uploaded to the MetExp database.\n \n RankFilter, CasLookup are already combined by combine_output.py so here we will use\n-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust\n-quantification files are to be combined with combine_output.py result as well. \n+this result. Furthermore here one of the MsClust\n+quantification files containing the respective spectra details are to be combined as well. \n \n Extra calculations performed:\n - The column MW is also added here and is derived from the column FORMULA found \n- in combine_output.py result. \n+ in RankFilter, CasLookup combined result. \n \n-So in total here we merge 3 files and calculate one new column. \n+So in total here we merge 2 files and calculate one new column. \n \'\'\'\n-\n+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n import csv\n+import re\n import sys\n from collections import OrderedDict\n \n@@ -40,14 +41,15 @@\n ONE_TO_ONE = \'one_to_one\'\n N_TO_ONE = \'n_to_one\'\n \n-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):\n+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):\n \'\'\'\n Merges data from both input dictionaries based on the link fields. This method will\n build up a new list containing the merged hits as the items. 
\n @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)\n @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)\n \'\'\'\n- # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):\n+ # TODO test for correct input files -> same link_field values should be there \n+ # (test at least number of unique link_field values):\n #\n # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):\n # raise Exception(\'input files should have the same nr of key values \')\n@@ -64,17 +66,23 @@\n # Get the indices for current link_field_set1_value in both data-structures for proper matching\n set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]\n set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]\n- \n- \n+ # Validation :\n+ if len(set2index) == 0:\n+ # means that corresponding data could not be found in set2, then throw error\n+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + \n+ link_field_set1_value + " only found in first dataset. ")\n \n merged_hits = []\n # Combine hits\n for hit in xrange(len(set1index)):\n # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n- # corresponding value in the rankfilter or caslookup tables; i.e. \n- # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute\n- # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)\n+ # corresponding value in the sets; i.e. \n+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute\n+ # represented by "key". 
\n+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index)\n+ # So set1[x][set1index[n]] = set1.attributeX.instanceN\n+ #\n # It just ensures '..b' of reference standard\n+ record.append(\'0\')\n+ record.append(\'\') \n \n return record\n \n \n-\n+def get_molecular_mass(formula):\n+ \'\'\'\n+ Calculates the molecular mass (MM). \n+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O\n+ \'\'\'\n+ \n+ # Each element is represented by a capital letter, followed optionally by \n+ # lower case, with one or more digits as for how many elements:\n+ element_pattern = re.compile("([A-Z][a-z]?)(\\d*)")\n \n-def _save_data(data, headers, nhits, out_csv):\n+ total_mass = 0\n+ for (element_name, count) in element_pattern.findall(formula):\n+ if count == "":\n+ count = 1\n+ else:\n+ count = int(count)\n+ element_mass = float(elements_and_masses_map[element_name]) # "found: Python\'s built-in float type has double precision " (? check if really correct ?)\n+ total_mass += element_mass * count\n+ \n+ return total_mass\n+ \n+ \n+\n+def _save_data(data, headers, out_csv):\n \'\'\'\n Writes tab-separated data to file\n @param data: dictionary containing merged dataset\n@@ -139,12 +180,35 @@\n # Write headers\n output_single_handle.writerow(headers)\n \n- # Write one line for each centrotype\n- for centrotype_idx in xrange(len(data)):\n- for hit in data[centrotype_idx]:\n+ # Write \n+ for item_idx in xrange(len(data)):\n+ for hit in data[item_idx]:\n output_single_handle.writerow(hit)\n \n \n+def _get_map_for_elements_and_masses(elements_and_masses):\n+ \'\'\'\n+ This method will read out the column \'Chemical symbol\' and make a map \n+ of this, storing the column \'Relative atomic mass\' as its value\n+ \'\'\'\n+ resultMap = {}\n+ index = 0\n+ for entry in elements_and_masses[\'Chemical symbol\']:\n+ resultMap[entry] = elements_and_masses[\'Relative atomic mass\'][index]\n+ index += 1\n+ \n+ return resultMap\n+\n+\n+def 
init_elements_and_masses_map():\n+ \'\'\'\n+ Initializes the lookup map containing the elements and their respective masses\n+ \'\'\'\n+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))\n+ global elements_and_masses_map\n+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)\n+ \n+\n def main():\n \'\'\'\n Combine Output main function\n@@ -156,15 +220,27 @@\n rankfilter_and_caslookup_combined_file = sys.argv[1]\n msclust_quantification_and_spectra_file = sys.argv[2]\n output_csv = sys.argv[3]\n+ # metadata\n+ metadata = OrderedDict()\n+ metadata[\'organism\'] = sys.argv[4]\n+ metadata[\'tissue\'] = sys.argv[5]\n+ metadata[\'experiment_name\'] = sys.argv[6]\n+ metadata[\'user_name\'] = sys.argv[7]\n+ metadata[\'column_type\'] = sys.argv[8]\n \n # Read RankFilter and CasLookup output files\n rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)\n msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, \',\')\n \n+ # Read elements and masses to use for the MW/MM calculation :\n+ init_elements_and_masses_map()\n+ \n merged, nhits = _merge_data(rankfilter_and_caslookup_combined, \'Centrotype\', \n- msclust_quantification_and_spectra, \'centrotype\', _compare_records, _merge_records, N_TO_ONE)\n- headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()\n- _save_data(merged, headers, nhits, output_csv)\n+ msclust_quantification_and_spectra, \'centrotype\', \n+ _compare_records, _merge_records, metadata,\n+ N_TO_ONE)\n+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + [\'MM\',\'MW\', \'Level of identification\', \'Location of reference standard\']\n+ _save_data(merged, headers, output_csv)\n \n \n if __name__ == \'__main__\':\n'

diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml Wed Mar 05 17:20:11 2014 +0100

@@ -0,0 +1,57 @@
+<tool id="export_to_metexp_tabular"
+    name="METEXP - Tabular file"
+    version="0.1.0">
+  <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result
+    $organism $tissue $experiment_name $user_name $column_type
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+     help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+    <param format="tabular" name="msclust_quant_file" type="data" label="MsClust-quantification file output"
+     help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/>
+
+
+   <param name="organism" type="text" size="80"
+           label="Organism(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB." />
+   <param name="tissue" type="text" size="80"
+           label="Tissue(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB." />
+
+   <param name="experiment_name" type="text" size="80"
+           label="Experiment name/code"
+           help="Name or code to store the results under. This can help you find the results back in MetExpDB." />
+
+   <param name="user_name" type="text" size="80"
+           label="User name"
+           help="User name or code to store the results under. This can help you find the results back in MetExpDB." />
+
+    <param name="column_type" type="text" size="80"
+           label="Column type"
+           help="Column type to report with the results. This can help you find the results back in MetExpDB." />
+
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+  </outputs>
+  <help>
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use
+this result.
+
+**Notes**
+
+Extra calculations performed:
+
+- The columns MM and MW are also added here and are derived from the column FORMULA found in the combined RankFilter and CasLookup result.
+
+So in total we merge 2 files and calculate two new columns.
+
+  </help>
+</tool>

diff -r 24fb75fedee0 -r 19d8fd10248e msclust.xml
--- a/msclust.xml Tue Feb 11 12:29:50 2014 +0100
+++ b/msclust.xml Wed Mar 05 17:20:11 2014 +0100

@@ -1,4 +1,4 @@
-<tool name="MsClust" id="msclust2" version="2.0.2">
+<tool name="MsClust" id="msclust2" version="2.0.3">
<description>Extracts fragmentation spectra from aligned data</description>
<!--
For remote debugging start you listener on port 8000 and use the following as command interpreter:

diff -r 24fb75fedee0 -r 19d8fd10248e query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.py Wed Mar 05 17:20:11 2014 +0100

[

b'@@ -0,0 +1,273 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to query a set of identifications against the METabolomics EXPlorer database.\n+\n+It will take the input file and for each record it will query the \n+molecular mass in the selected MetExp DB. If one or more compounds are found in the\n+MetExp DB then extra information regarding these compounds is added to the output file.\n+\n+The output file is thus the input file enriched with information about \n+related items found in the selected MetExp DB. \n+\'\'\'\n+import csv\n+import sys\n+import fileinput\n+import urllib2\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2014, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_file(in_xsv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_xsv, \'rU\'), delimiter=delim))\n+ return _process_data(data)\n+ \n+def _process_data(data):\n+ \n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):\n+ \'\'\'\n+ This method will iterate over the record in the input_data and\n+ will enrich them with the related information found (if any) in the \n+ MetExp Database.\n+ \'\'\'\n+ merged = []\n+ \n+ for i in xrange(len(input_data[input_data.keys()[0]])):\n+ # Get the record in same dictionary format as input_data, but containing\n+ # a value at each column instead of a list of all values of all records:\n+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in 
input_data.keys()]))\n+ \n+ # read the molecular mass and formula:\n+ cas_id = input_data_record[casid_col]\n+ formula = input_data_record[formula_col]\n+ molecular_mass = input_data_record[molecular_mass_col]\n+ \n+ # search for related records in MetExp:\n+ data_found = None\n+ if cas_id != "undef": \n+ # 1- search for other experiments where this CAS id has been found:\n+ query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "CAS"\n+ if data_found == None:\n+ # 2- search for other experiments where this FORMULA has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "FORMULA"\n+ if data_found == None:\n+ # 3- search for other experiments where this MM has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method \n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "MM"\n+ \n+ if data_found == None:\n+ # If still nothing found, just add empty columns\n+ extra_cols = [\'\', \'\',\'\',\'\',\'\',\'\',\'\',\'\']\n+ else:\n+ # Add info found:\n+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)\n+ \n+ # Take all data and merge it into a "flat"/simple array of values:\n+ field_values_list = _merge_data(input_data_record, extra_cols)\n+ \n+ merged.append(field_values_list)\n+\n+ # return the merged/enriched records:\n+ return merged\n+\n+\n+def _get_e'..b' \n+ # check if there is any data in the response:\n+ if len(data_rows) <= 1 or data_rows[1].strip() == \'\': \n+ # means there is only the header row...so no hits:\n+ return None\n+ \n+ for data_row in data_rows:\n+ if not data_row.strip() == 
\'\':\n+ row_as_list = _str_to_list(data_row, delimiter=\'\\t\')\n+ result.append(row_as_list)\n+ \n+ # return result processed into a dict:\n+ return _process_data(result)\n+ \n+ except urllib2.HTTPError, e:\n+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)\n+ except urllib2.URLError, e:\n+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")\n+\n+def _str_to_list(data_row, delimiter=\'\\t\'): \n+ result = []\n+ for column in data_row.split(delimiter):\n+ result.append(column)\n+ return result\n+ \n+ \n+# alternative: ? \n+# s = requests.Session()\n+# s.verify = False\n+# #s.auth = (token01, token02)\n+# resp = s.get(url, params={\'name\': \'anonymous\'}, stream=True)\n+# content = resp.content\n+# # transform to dictionary:\n+ \n+ \n+ \n+ \n+def _merge_data(input_data_record, extra_cols):\n+ \'\'\'\n+ Adds the extra information to the existing data record and returns\n+ the combined new record.\n+ \'\'\'\n+ record = []\n+ for column in input_data_record:\n+ record.append(input_data_record[column])\n+ \n+ \n+ # add extra columns\n+ for column in extra_cols:\n+ record.append(column) \n+ \n+ return record \n+ \n+\n+def _save_data(data_rows, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data_rows: dictionary containing merged/enriched dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write one line for each row\n+ for data_row in data_rows:\n+ output_single_handle.writerow(data_row)\n+\n+def _get_metexp_URL(metexp_dblink_file):\n+ \'\'\'\n+ Read out and return the URL stored in the given file.\n+ \'\'\'\n+ file_input = fileinput.input(metexp_dblink_file)\n+ 
try:\n+ for line in file_input:\n+ if line[0] != \'#\':\n+ # just return the first line that is not a comment line:\n+ return line\n+ finally:\n+ file_input.close()\n+ \n+\n+def main():\n+ \'\'\'\n+ MetExp Query main function\n+ \n+ The input file can be any tabular file, as long as it contains a column for the molecular mass\n+ and one for the formula of the respective identification. These two columns are then\n+ used to query against MetExp Database. \n+ \'\'\'\n+ input_file = sys.argv[1]\n+ casid_col = sys.argv[2]\n+ formula_col = sys.argv[3]\n+ molecular_mass_col = sys.argv[4]\n+ metexp_dblink_file = sys.argv[5]\n+ separation_method = sys.argv[6]\n+ output_result = sys.argv[7]\n+\n+ # Parse metexp_dblink_file to find the URL to the MetExp service:\n+ metexp_dblink = _get_metexp_URL(metexp_dblink_file)\n+ \n+ # Parse tabular input file into dictionary/array:\n+ input_data = _process_file(input_file)\n+ \n+ # Query data against MetExp DB :\n+ enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)\n+ headers = input_data.keys() + [\'METEXP hits for \',\'METEXP hits: organisms\', \'METEXP hits: tissues\',\n+ \'METEXP hits: experiments\',\'METEXP hits: user names\',\'METEXP hits: column types\', \'METEXP hits: CAS nrs\', \'Link to METEXP hits\']\n+ \n+ _save_data(enriched_data, headers, output_result)\n+\n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 24fb75fedee0 -r 19d8fd10248e query_metexp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.xml Wed Mar 05 17:20:11 2014 +0100

@@ -0,0 +1,67 @@
+<tool id="query_metexp"
+    name="METEXP - Query Database "
+    version="0.1.0">
+  <description>Query a set of identifications against the METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    query_metexp.py
+    $input_file
+    $casid_col
+    $formula_col
+    $molecular_mass_col
+    "$metexp_dblink_file"
+    $separation_method
+    $output_result
+  </command>
+  <inputs>
+
+   <param name="input_file" format="tabular" type="data"
+        label="Input file"
+     help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
+
+   <param name="casid_col" type="text" size="50"
+           label="CAS ID column name"
+           value="CAS"
+           help="Name of the column containing the CAS code information (in the given input file)" />
+   <param name="formula_col" type="text" size="50"
+           label="Formula ID column name"
+           value="FORMULA"
+           help="Name of the column containing the formula information (in the given input file)" />
+   <param name="molecular_mass_col" type="text" size="50"
+           label="Molecular mass column name"
+           value="MM"
+           help="Name of the column containing the molecular mass information (in the given input file)" />
+
+   <param name="metexp_dblink_file" type="select" label="MetExp DB to query"
+       help="Select the MetExp Database/backend which should be queried"
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/>
+
+  <param name="separation_method" type="select" label="Data type to query">
+   <option value="GC" selected="True">GC</option>
+    <option value="LC">LC</option>
+  </param>
+
+  </inputs>
+  <outputs>
+    <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
+  </outputs>
+  <code file="match_library.py" /> 
+  <help>
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the selected MetExp DB,
+first by CAS number (if defined), then by formula and finally by molecular mass, stopping at the first query that returns hits.
+If one or more compounds are found in the MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains columns for the CAS number,
+the formula and the molecular mass of the respective identification.
+
+
+  </help>
+</tool>

diff -r 24fb75fedee0 -r 19d8fd10248e rankfilterGCMS_tabular.xml
--- a/rankfilterGCMS_tabular.xml Tue Feb 11 12:29:50 2014 +0100
+++ b/rankfilterGCMS_tabular.xml Wed Mar 05 17:20:11 2014 +0100

@@ -3,7 +3,7 @@
   <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command>
   <inputs>
     <param format="tabular" name="sample" type="data" label="Sample File"
-        help="Converted PDF file in tabular format" />
+        help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />

     <!-- this one should be input file for now:<param name="calibration"  type="select" label="Calibration File"
            help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"

diff -r 24fb75fedee0 -r 19d8fd10248e static_resources/elements_and_masses.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/static_resources/elements_and_masses.tab Wed Mar 05 17:20:11 2014 +0100

@@ -0,0 +1,104 @@
+Name Atomic number Chemical symbol Relative atomic mass
+Hydrogen 1 H 1.01
+Helium 2 He 4
+Lithium 3 Li 6.94
+Beryllium 4 Be 9.01
+Boron 5 B 10.81
+Carbon 6 C 12.01
+Nitrogen 7 N 14.01
+Oxygen 8 O 16
+Fluorine 9 F 19
+Neon 10 Ne 20.18
+Sodium 11 Na 22.99
+Magnesium 12 Mg 24.31
+Aluminum 13 Al 26.98
+Silicon 14 Si 28.09
+Phosphorus 15 P 30.98
+Sulfur 16 S 32.06
+Chlorine 17 Cl 35.45
+Argon 18 Ar 39.95
+Potassium 19 K 39.1
+Calcium 20 Ca 40.08
+Scandium 21 Sc 44.96
+Titanium 22 Ti 47.9
+Vanadium 23 V 50.94
+Chromium 24 Cr 52
+Manganese 25 Mn 54.94
+Iron 26 Fe 55.85
+Cobalt 27 Co 58.93
+Nickel 28 Ni 58.71
+Copper 29 Cu 63.54
+Zinc 30 Zn 65.37
+Gallium 31 Ga 69.72
+Germanium 32 Ge 72.59
+Arsenic 33 As 74.99
+Selenium 34 Se 78.96
+Bromine 35 Br 79.91
+Krypton 36 Kr 83.8
+Rubidium 37 Rb 85.47
+Strontium 38 Sr 87.62
+Yttrium 39 Y 88.91
+Zirconium 40 Zr 91.22
+Niobium 41 Nb 92.91
+Molybdenum 42 Mo 95.94
+Technetium 43 Tc 96.91
+Ruthenium 44 Ru 101.07
+Rhodium 45 Rh 102.9
+Palladium 46 Pd 106.4
+Silver 47 Ag 107.87
+Cadmium 48 Cd 112.4
+Indium 49 In 114.82
+Tin 50 Sn 118.69
+Antimony 51 Sb 121.75
+Tellurium 52 Te 127.6
+Iodine 53 I 126.9
+Xenon 54 Xe 131.3
+Cesium 55 Cs 132.9
+Barium 56 Ba 137.34
+Lanthanum 57 La 138.91
+Cerium 58 Ce 140.12
+Praseodymium 59 Pr 140.91
+Neodymium 60 Nd 144.24
+Promethium 61 Pm 144.91
+Samarium 62 Sm 150.35
+Europium 63 Eu 151.96
+Gadolinium 64 Gd 157.25
+Terbium 65 Tb 158.92
+Dysprosium 66 Dy 162.5
+Holmium 67 Ho 164.93
+Erbium 68 Er 167.26
+Thulium 69 Tm 168.93
+Ytterbium 70 Yb 173.04
+Lutetium 71 Lu 174.97
+Hafnium 72 Hf 178.49
+Tantalum 73 Ta 180.95
+Wolfram 74 W 183.85
+Rhenium 75 Re 186.2
+Osmium 76 Os 190.2
+Iridium 77 Ir 192.22
+Platinum 78 Pt 195.09
+Gold 79 Au 196.97
+Mercury 80 Hg 200.59
+Thallium 81 Tl 204.37
+Lead 82 Pb 207.19
+Bismuth 83 Bi 208.98
+Polonium 84 Po 208.98
+Astatine 85 At 209.99
+Radon 86 Rn 222.02
+Francium 87 Fr 223.02
+Radium 88 Ra 226
+Actinium 89 Ac 227.03
+Thorium 90 Th 232.04
+Protactinium 91 Pa 231.04
+Uranium 92 U 238.03
+Neptunium 93 Np 237
+Plutonium 94 Pu 242
+Americium 95 Am 243.06
+Curium 96 Cm 247.07
+Berkelium 97 Bk 247.07
+Californium 98 Cf 251.08
+Einsteinium 99 Es 254.09
+Fermium 100 Fm 257.1
+Mendelevium 101 Md 257.1
+Nobelium 102 No 255.09
+Lawrencium 103 Lr 256.1

diff -r 24fb75fedee0 -r 19d8fd10248e test/test_export_to_metexp_tabular.py
--- a/test/test_export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100
+++ b/test/test_export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100

[

@@ -10,6 +10,27 @@
class IntegrationTest(unittest.TestCase):

+    def test_MM_calculations(self):
+        '''
+        test the implemented method for MM calculations for
+        given chemical formulas
+        '''
+        export_to_metexp_tabular.init_elements_and_masses_map()
+
+        formula = "C8H18O3"
+        # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertEqual(162.26, result)
+
+        formula = "CH2O3Fe2Ni"
+        # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertAlmostEqual(232.44, result, 2)
+
+
+
+
+
     def test_combine_output_simple(self):
         '''
         comment me
@@ -28,7 +49,13 @@
         sys.argv = ['test',
                     rankfilter_and_caslookup_combined_file,
                     msclust_quantification_and_spectra_file,
-                    output_csv]
+                    output_csv,
+                    'tomato',
+                    'leafs',
+                    'test experiment',
+                    'pieter',
+                    'DB5 column']
+
         # Execute main function with arguments provided through sys.argv
         export_to_metexp_tabular.main()

diff -r 24fb75fedee0 -r 19d8fd10248e test/test_query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp.py Wed Mar 05 17:20:11 2014 +0100

[

@@ -0,0 +1,82 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#     def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+    def test_simple(self):
+        '''
+        Simple initial test
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        #         input_file = sys.argv[1]
+        #         casid_col, formula_col, molecular_mass_col = sys.argv[2:5]
+        #         metexp_dblink_file = sys.argv[5]
+        #         separation_method = sys.argv[6]
+        #         output_result = sys.argv[7]
+
+        input_file = resource_filename(__name__, "data/metexp_query_tabular.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col,
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+
+        # TODO - asserts (base them on DB being filled with test data from metexp unit test for upload method)
+        # PA
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 24fb75fedee0 -r 19d8fd10248e test/test_query_metexp_LARGE.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp_LARGE.py Wed Mar 05 17:20:11 2014 +0100

[

@@ -0,0 +1,79 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#     def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+    def test_large(self):
+        '''
+        Simple test, but on larger set, last test executed in 28s
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        #         input_file = sys.argv[1]
+        #         casid_col, formula_col, molecular_mass_col = sys.argv[2:5]
+        #         metexp_dblink_file = sys.argv[5]
+        #         separation_method = sys.argv[6]
+        #         output_result = sys.argv[7]
+
+        input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt")
+
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col,
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()