| Previous changeset 20:24fb75fedee0 (2014-02-11) Next changeset 22:cd4f13119afa (2014-03-06) |
|
Commit message:
* Added an interface to the METEXP data store, including a tool to fire queries in batch mode * Improved the quantification output files of MsClust, among other things by sorting the mass list based on intensity (last two columns of the quantification files) * Added a Molecular Mass calculation method |
|
modified:
MsClust.jar README.rst combine_output.py export_to_metexp_tabular.py msclust.xml rankfilterGCMS_tabular.xml test/test_export_to_metexp_tabular.py |
|
added:
export_to_metexp_tabular.xml query_metexp.py query_metexp.xml static_resources/elements_and_masses.tab test/test_query_metexp.py test/test_query_metexp_LARGE.py |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e MsClust.jar |
| b |
| Binary file MsClust.jar has changed |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e README.rst --- a/README.rst Tue Feb 11 12:29:50 2014 +0100 +++ b/README.rst Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -19,6 +19,11 @@ ============== ====================================================================== Date Changes -------------- ---------------------------------------------------------------------- +March 2014 * Added interface to METEXP data store, including tool to fire + queries in batch mode + * Improved quantification output files of MsClust, a.o. sorting + mass list based on intensity (last two columns of quantification + files) January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a single package (this package) * integration with METEXP software (data store for metabolomics |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e combine_output.py --- a/combine_output.py Tue Feb 11 12:29:50 2014 +0100 +++ b/combine_output.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| @@ -155,12 +155,16 @@ @param data: dictionary containing merged dataset @param out_csv: output csv file ''' - header = ['Centrotype', + # Columns we don't repeat: + header_part1 = ['Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)', 'nr. Peaks', - 'R.T.', + 'R.T.'] + # These are the headers/columns we repeat in case of + # combining hits in one line (see alternative_headers method below): + header_part2 = [ 'Name', 'FORMULA', 'Library', @@ -190,13 +194,21 @@ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") # Write headers - output_single_handle.writerow(header) - output_multi_handle.writerow(header * nhits) + output_single_handle.writerow(header_part1 + header_part2) + output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) # Combine all hits for each centrotype into one line line = [] for centrotype_idx in xrange(len(data)): + i = 0 for hit in data[centrotype_idx]: - line.extend(hit) + if i==0: + line.extend(hit) + else: + line.extend(hit[6:]) + i = i+1 + # small validation (if error, it is a programming error): + if i > nhits: + raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) output_multi_handle.writerow(line) line = [] @@ -205,6 +217,17 @@ for hit in data[centrotype_idx]: output_single_handle.writerow(hit) +def alternative_headers(header_part2, nr_alternative_hits): + ''' + This method will iterate over the header names and add the string 'ALT#_' before each, + where # is the number of the alternative, according to number of alternative hits we want to add + to final csv/tsv + ''' + result = [] + for i in xrange(nr_alternative_hits): + for header_name in header_part2: + result.append("ALT" + str(i+1) + "_" + header_name) + return result def main(): ''' |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.py --- a/export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100 +++ b/export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| b'@@ -5,17 +5,18 @@\n into a tabular file that can be uploaded to the MetExp database.\n \n RankFilter, CasLookup are already combined by combine_output.py so here we will use\n-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust\n-quantification files are to be combined with combine_output.py result as well. \n+this result. Furthermore here one of the MsClust\n+quantification files containing the respective spectra details are to be combined as well. \n \n Extra calculations performed:\n - The column MW is also added here and is derived from the column FORMULA found \n- in combine_output.py result. \n+ in RankFilter, CasLookup combined result. \n \n-So in total here we merge 3 files and calculate one new column. \n+So in total here we merge 2 files and calculate one new column. \n \'\'\'\n-\n+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n import csv\n+import re\n import sys\n from collections import OrderedDict\n \n@@ -40,14 +41,15 @@\n ONE_TO_ONE = \'one_to_one\'\n N_TO_ONE = \'n_to_one\'\n \n-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):\n+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):\n \'\'\'\n Merges data from both input dictionaries based on the link fields. This method will\n build up a new list containing the merged hits as the items. 
\n @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)\n @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)\n \'\'\'\n- # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):\n+ # TODO test for correct input files -> same link_field values should be there \n+ # (test at least number of unique link_field values):\n #\n # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):\n # raise Exception(\'input files should have the same nr of key values \')\n@@ -64,17 +66,23 @@\n # Get the indices for current link_field_set1_value in both data-structures for proper matching\n set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]\n set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]\n- \n- \n+ # Validation :\n+ if len(set2index) == 0:\n+ # means that corresponding data could not be found in set2, then throw error\n+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + \n+ link_field_set1_value + " only found in first dataset. ")\n \n merged_hits = []\n # Combine hits\n for hit in xrange(len(set1index)):\n # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n- # corresponding value in the rankfilter or caslookup tables; i.e. \n- # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute\n- # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)\n+ # corresponding value in the sets; i.e. \n+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute\n+ # represented by "key". 
\n+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index)\n+ # So set1[x][set1index[n]] = set1.attributeX.instanceN\n+ #\n # It just ensures '..b' of reference standard\n+ record.append(\'0\')\n+ record.append(\'\') \n \n return record\n \n \n-\n+def get_molecular_mass(formula):\n+ \'\'\'\n+ Calculates the molecular mass (MM). \n+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O\n+ \'\'\'\n+ \n+ # Each element is represented by a capital letter, followed optionally by \n+ # lower case, with one or more digits as for how many elements:\n+ element_pattern = re.compile("([A-Z][a-z]?)(\\d*)")\n \n-def _save_data(data, headers, nhits, out_csv):\n+ total_mass = 0\n+ for (element_name, count) in element_pattern.findall(formula):\n+ if count == "":\n+ count = 1\n+ else:\n+ count = int(count)\n+ element_mass = float(elements_and_masses_map[element_name]) # "found: Python\'s built-in float type has double precision " (? check if really correct ?)\n+ total_mass += element_mass * count\n+ \n+ return total_mass\n+ \n+ \n+\n+def _save_data(data, headers, out_csv):\n \'\'\'\n Writes tab-separated data to file\n @param data: dictionary containing merged dataset\n@@ -139,12 +180,35 @@\n # Write headers\n output_single_handle.writerow(headers)\n \n- # Write one line for each centrotype\n- for centrotype_idx in xrange(len(data)):\n- for hit in data[centrotype_idx]:\n+ # Write \n+ for item_idx in xrange(len(data)):\n+ for hit in data[item_idx]:\n output_single_handle.writerow(hit)\n \n \n+def _get_map_for_elements_and_masses(elements_and_masses):\n+ \'\'\'\n+ This method will read out the column \'Chemical symbol\' and make a map \n+ of this, storing the column \'Relative atomic mass\' as its value\n+ \'\'\'\n+ resultMap = {}\n+ index = 0\n+ for entry in elements_and_masses[\'Chemical symbol\']:\n+ resultMap[entry] = elements_and_masses[\'Relative atomic mass\'][index]\n+ index += 1\n+ \n+ return resultMap\n+\n+\n+def 
init_elements_and_masses_map():\n+ \'\'\'\n+ Initializes the lookup map containing the elements and their respective masses\n+ \'\'\'\n+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))\n+ global elements_and_masses_map\n+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)\n+ \n+\n def main():\n \'\'\'\n Combine Output main function\n@@ -156,15 +220,27 @@\n rankfilter_and_caslookup_combined_file = sys.argv[1]\n msclust_quantification_and_spectra_file = sys.argv[2]\n output_csv = sys.argv[3]\n+ # metadata\n+ metadata = OrderedDict()\n+ metadata[\'organism\'] = sys.argv[4]\n+ metadata[\'tissue\'] = sys.argv[5]\n+ metadata[\'experiment_name\'] = sys.argv[6]\n+ metadata[\'user_name\'] = sys.argv[7]\n+ metadata[\'column_type\'] = sys.argv[8]\n \n # Read RankFilter and CasLookup output files\n rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)\n msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, \',\')\n \n+ # Read elements and masses to use for the MW/MM calculation :\n+ init_elements_and_masses_map()\n+ \n merged, nhits = _merge_data(rankfilter_and_caslookup_combined, \'Centrotype\', \n- msclust_quantification_and_spectra, \'centrotype\', _compare_records, _merge_records, N_TO_ONE)\n- headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()\n- _save_data(merged, headers, nhits, output_csv)\n+ msclust_quantification_and_spectra, \'centrotype\', \n+ _compare_records, _merge_records, metadata,\n+ N_TO_ONE)\n+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + [\'MM\',\'MW\', \'Level of identification\', \'Location of reference standard\']\n+ _save_data(merged, headers, output_csv)\n \n \n if __name__ == \'__main__\':\n' |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.xml Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -0,0 +1,57 @@ +<tool id="export_to_metexp_tabular" + name="METEXP - Tabular file" + version="0.1.0"> + <description>Create tabular file for loading into METabolomics EXPlorer database</description> + <command interpreter="python"> + export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result + $organism $tissue $experiment_name $user_name $column_type + </command> + <inputs> + <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output" + help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/> + <param format="tabular" name="msclust_quant_file" type="data" label="MusClust-quantification file output" + help="Select the output file from MsClust (centrotype, mic or sim) which also contain respective spectrum details"/> + + + <param name="organism" type="text" size="80" + label="Organism(s) info" + help="Metadata information to accompany the results when stored in MetExp DB." /> + <param name="tissue" type="text" size="80" + label="Tissue(s) info" + help="Metadata information to accompany the results when stored in MetExp DB." /> + + <param name="experiment_name" type="text" size="80" + label="Experiment name/code" + help="Name or code to store the results under. This can help you find the results back in MetExpDB." /> + + <param name="user_name" type="text" size="80" + label="User name" + help="User name or code to store the results under. This can help you find the results back in MetExpDB." /> + + <param name="column_type" type="text" size="80" + label="Column type" + help="Column type to report with the results. This can help you find the results back in MetExpDB." /> + + </inputs> + <outputs> + <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" /> + </outputs> + <help> +.. 
class:: infomark + +Tool to combine output from the tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. + +The RankFilter and CasLookup outputs are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here we will use +this result. + +**Notes** + +Extra calculations performed: +- The columns MM and MW are also added here and are derived from the column FORMULA found in the combined RankFilter/CasLookup result. + +So in total here we merge 2 files and calculate two new columns (MM and MW). + + + </help> +</tool> |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e msclust.xml --- a/msclust.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/msclust.xml Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -1,4 +1,4 @@ -<tool name="MsClust" id="msclust2" version="2.0.2"> +<tool name="MsClust" id="msclust2" version="2.0.3"> <description>Extracts fragmentation spectra from aligned data</description> <!-- For remote debugging start you listener on port 8000 and use the following as command interpreter: |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e query_metexp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| b'@@ -0,0 +1,273 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to query a set of identifications against the METabolomics EXPlorer database.\n+\n+It will take the input file and for each record it will query the \n+molecular mass in the selected MetExp DB. If one or more compounds are found in the\n+MetExp DB then extra information regarding these compounds is added to the output file.\n+\n+The output file is thus the input file enriched with information about \n+related items found in the selected MetExp DB. \n+\'\'\'\n+import csv\n+import sys\n+import fileinput\n+import urllib2\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2014, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_file(in_xsv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_xsv, \'rU\'), delimiter=delim))\n+ return _process_data(data)\n+ \n+def _process_data(data):\n+ \n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):\n+ \'\'\'\n+ This method will iterate over the record in the input_data and\n+ will enrich them with the related information found (if any) in the \n+ MetExp Database.\n+ \'\'\'\n+ merged = []\n+ \n+ for i in xrange(len(input_data[input_data.keys()[0]])):\n+ # Get the record in same dictionary format as input_data, but containing\n+ # a value at each column instead of a list of all values of all records:\n+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in 
input_data.keys()]))\n+ \n+ # read the molecular mass and formula:\n+ cas_id = input_data_record[casid_col]\n+ formula = input_data_record[formula_col]\n+ molecular_mass = input_data_record[molecular_mass_col]\n+ \n+ # search for related records in MetExp:\n+ data_found = None\n+ if cas_id != "undef": \n+ # 1- search for other experiments where this CAS id has been found:\n+ query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "CAS"\n+ if data_found == None:\n+ # 2- search for other experiments where this FORMULA has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "FORMULA"\n+ if data_found == None:\n+ # 3- search for other experiments where this MM has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method \n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "MM"\n+ \n+ if data_found == None:\n+ # If still nothing found, just add empty columns\n+ extra_cols = [\'\', \'\',\'\',\'\',\'\',\'\',\'\',\'\']\n+ else:\n+ # Add info found:\n+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)\n+ \n+ # Take all data and merge it into a "flat"/simple array of values:\n+ field_values_list = _merge_data(input_data_record, extra_cols)\n+ \n+ merged.append(field_values_list)\n+\n+ # return the merged/enriched records:\n+ return merged\n+\n+\n+def _get_e'..b' \n+ # check if there is any data in the response:\n+ if len(data_rows) <= 1 or data_rows[1].strip() == \'\': \n+ # means there is only the header row...so no hits:\n+ return None\n+ \n+ for data_row in data_rows:\n+ if not data_row.strip() == 
\'\':\n+ row_as_list = _str_to_list(data_row, delimiter=\'\\t\')\n+ result.append(row_as_list)\n+ \n+ # return result processed into a dict:\n+ return _process_data(result)\n+ \n+ except urllib2.HTTPError, e:\n+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)\n+ except urllib2.URLError, e:\n+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")\n+\n+def _str_to_list(data_row, delimiter=\'\\t\'): \n+ result = []\n+ for column in data_row.split(delimiter):\n+ result.append(column)\n+ return result\n+ \n+ \n+# alternative: ? \n+# s = requests.Session()\n+# s.verify = False\n+# #s.auth = (token01, token02)\n+# resp = s.get(url, params={\'name\': \'anonymous\'}, stream=True)\n+# content = resp.content\n+# # transform to dictionary:\n+ \n+ \n+ \n+ \n+def _merge_data(input_data_record, extra_cols):\n+ \'\'\'\n+ Adds the extra information to the existing data record and returns\n+ the combined new record.\n+ \'\'\'\n+ record = []\n+ for column in input_data_record:\n+ record.append(input_data_record[column])\n+ \n+ \n+ # add extra columns\n+ for column in extra_cols:\n+ record.append(column) \n+ \n+ return record \n+ \n+\n+def _save_data(data_rows, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data_rows: dictionary containing merged/enriched dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write one line for each row\n+ for data_row in data_rows:\n+ output_single_handle.writerow(data_row)\n+\n+def _get_metexp_URL(metexp_dblink_file):\n+ \'\'\'\n+ Read out and return the URL stored in the given file.\n+ \'\'\'\n+ file_input = fileinput.input(metexp_dblink_file)\n+ 
try:\n+ for line in file_input:\n+ if line[0] != \'#\':\n+ # just return the first line that is not a comment line:\n+ return line\n+ finally:\n+ file_input.close()\n+ \n+\n+def main():\n+ \'\'\'\n+ MetExp Query main function\n+ \n+ The input file can be any tabular file, as long as it contains a column for the molecular mass\n+ and one for the formula of the respective identification. These two columns are then\n+ used to query against MetExp Database. \n+ \'\'\'\n+ input_file = sys.argv[1]\n+ casid_col = sys.argv[2]\n+ formula_col = sys.argv[3]\n+ molecular_mass_col = sys.argv[4]\n+ metexp_dblink_file = sys.argv[5]\n+ separation_method = sys.argv[6]\n+ output_result = sys.argv[7]\n+\n+ # Parse metexp_dblink_file to find the URL to the MetExp service:\n+ metexp_dblink = _get_metexp_URL(metexp_dblink_file)\n+ \n+ # Parse tabular input file into dictionary/array:\n+ input_data = _process_file(input_file)\n+ \n+ # Query data against MetExp DB :\n+ enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)\n+ headers = input_data.keys() + [\'METEXP hits for \',\'METEXP hits: organisms\', \'METEXP hits: tissues\',\n+ \'METEXP hits: experiments\',\'METEXP hits: user names\',\'METEXP hits: column types\', \'METEXP hits: CAS nrs\', \'Link to METEXP hits\']\n+ \n+ _save_data(enriched_data, headers, output_result)\n+\n+\n+if __name__ == \'__main__\':\n+ main()\n' |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e query_metexp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.xml Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -0,0 +1,67 @@ +<tool id="query_metexp" + name="METEXP - Query Database " + version="0.1.0"> + <description>Query a set of identifications against the METabolomics EXPlorer database</description> + <command interpreter="python"> + query_metexp.py + $input_file + $casid_col + $formula_col + $molecular_mass_col + "$metexp_dblink_file" + $separation_method + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="casid_col" type="text" size="50" + label="CAS ID column name" + value="CAS" + help="Name of the column containing the CAS code information (in the given input file)" /> + <param name="formula_col" type="text" size="50" + label="Formula ID column name" + value="FORMULA" + help="Name of the column containing the formula information (in the given input file)" /> + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="metexp_dblink_file" type="select" label="MetExp DB to query" + help="Select the MetExp Database/backend which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/> + + <param name="separation_method" type="select" label="Data type to query"> + <option value="GC" selected="True">GC</option> + <option value="LC">LC</option> + </param> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will Query a set of identifications against the METabolomics EXPlorer database. 
+ +It will take the input file and for each record it will query the +selected MetExp DB, first by CAS number (unless it is "undef"), then by formula and finally by +molecular mass, stopping at the first query that returns hits. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains columns for the CAS number, +the formula and the molecular mass of the respective identification. + + + </help> +</tool> |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e rankfilterGCMS_tabular.xml --- a/rankfilterGCMS_tabular.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/rankfilterGCMS_tabular.xml Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -3,7 +3,7 @@ <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command> <inputs> <param format="tabular" name="sample" type="data" label="Sample File" - help="Converted PDF file in tabular format" /> + help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" /> <!-- question: is this calibration file not column specific as it includes RT info?? --> <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File" help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values" |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e static_resources/elements_and_masses.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/static_resources/elements_and_masses.tab Wed Mar 05 17:20:11 2014 +0100 |
| b |
| @@ -0,0 +1,104 @@ +Name Atomic number Chemical symbol Relative atomic mass +Hydrogen 1 H 1.01 +Helium 2 He 4 +Lithium 3 Li 6.94 +Beryllium 4 Be 9.01 +Boron 5 B 10.81 +Carbon 6 C 12.01 +Nitrogen 7 N 14.01 +Oxygen 8 O 16 +Fluorine 9 F 19 +Neon 10 Ne 20.18 +Sodium 11 Na 22.99 +Magnesium 12 Mg 24.31 +Aluminum 13 Al 26.98 +Silicon 14 Si 28.09 +Phosphorus 15 P 30.98 +Sulfur 16 S 32.06 +Chlorine 17 Cl 35.45 +Argon 18 Ar 39.95 +Potassium 19 K 39.1 +Calcium 20 Ca 40.08 +Scandium 21 Sc 44.96 +Titanium 22 Ti 47.9 +Vanadium 23 V 50.94 +Chromium 24 Cr 52 +Manganese 25 Mn 54.94 +Iron 26 Fe 55.85 +Cobalt 27 Co 58.93 +Nickel 28 Ni 58.71 +Copper 29 Cu 63.54 +Zinc 30 Zn 65.37 +Gallium 31 Ga 69.72 +Germanium 32 Ge 72.59 +Arsenic 33 As 74.99 +Selenium 34 Se 78.96 +Bromine 35 Br 79.91 +Krypton 36 Kr 83.8 +Rubidium 37 Rb 85.47 +Strontium 38 Sr 87.62 +Yttrium 39 Y 88.91 +Zirconium 40 Zr 91.22 +Niobium 41 Nb 92.91 +Molybdenum 42 Mo 95.94 +Technetium 43 Tc 96.91 +Ruthenium 44 Ru 101.07 +Rhodium 45 Rh 102.9 +Palladium 46 Pd 106.4 +Silver 47 Ag 107.87 +Cadmium 48 Cd 112.4 +Indium 49 In 114.82 +Tin 50 Sn 118.69 +Antimony 51 Sb 121.75 +Tellurium 52 Te 127.6 +Iodine 53 I 126.9 +Xenon 54 Xe 131.3 +Cesium 55 Cs 132.9 +Barium 56 Ba 137.34 +Lanthanum 57 La 138.91 +Cerium 58 Ce 140.12 +Praseodymium 59 Pr 140.91 +Neodymium 60 Nd 144.24 +Promethium 61 Pm 144.91 +Samarium 62 Sm 150.35 +Europium 63 Eu 151.96 +Gadolinium 64 Gd 157.25 +Terbium 65 Tb 158.92 +Dysprosium 66 Dy 162.5 +Holmium 67 Ho 164.93 +Erbium 68 Er 167.26 +Thulium 69 Tm 168.93 +Ytterbium 70 Yb 173.04 +Lutetium 71 Lu 174.97 +Hafnium 72 Hf 178.49 +Tantalum 73 Ta 180.95 +Wolfram 74 W 183.85 +Rhenium 75 Re 186.2 +Osmium 76 Os 190.2 +Iridium 77 Ir 192.22 +Platinum 78 Pt 195.09 +Gold 79 Au 196.97 +Mercury 80 Hg 200.59 +Thallium 81 Tl 204.37 +Lead 82 Pb 207.19 +Bismuth 83 Bi 208.98 +Polonium 84 Po 208.98 +Astatine 85 At 209.99 +Radon 86 Rn 222.02 +Francium 87 Fr 223.02 +Radium 88 Ra 226 +Actinium 89 Ac 227.03 +Thorium 90 Th 232.04 
+Protactinium 91 Pa 231.04 +Uranium 92 U 238.03 +Neptunium 93 Np 237 +Plutonium 94 Pu 242 +Americium 95 Am 243.06 +Curium 96 Cm 247.07 +Berkelium 97 Bk 247.07 +Californium 98 Cf 251.08 +Einsteinium 99 Es 254.09 +Fermium 100 Fm 257.1 +Mendelevium 101 Md 257.1 +Nobelium 102 No 255.09 +Lawrencium 103 Lr 256.1 |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e test/test_export_to_metexp_tabular.py --- a/test/test_export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100 +++ b/test/test_export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| @@ -10,6 +10,27 @@ class IntegrationTest(unittest.TestCase): + def test_MM_calculations(self): + ''' + test the implemented method for MM calculations for + given chemical formulas + ''' + export_to_metexp_tabular.init_elements_and_masses_map() + + formula = "C8H18O3" + # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertEqual(162.26, result) + + formula = "CH2O3Fe2Ni" + # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertAlmostEqual(232.44, result, 2) + + + + + def test_combine_output_simple(self): ''' comment me @@ -28,7 +49,13 @@ sys.argv = ['test', rankfilter_and_caslookup_combined_file, msclust_quantification_and_spectra_file, - output_csv] + output_csv, + 'tomato', + 'leafs', + 'test experiment', + 'pieter', + 'DB5 column'] + # Execute main function with arguments provided through sys.argv export_to_metexp_tabular.main() |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e test/test_query_metexp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| @@ -0,0 +1,82 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + # TODO - asserts (base them on DB being filled with test data form metexp unit test for upload method) + # PA + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + 
''' + with open(filename) as handle: + return handle.read() |
| b |
| diff -r 24fb75fedee0 -r 19d8fd10248e test/test_query_metexp_LARGE.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp_LARGE.py Wed Mar 05 17:20:11 2014 +0100 |
| [ |
| @@ -0,0 +1,79 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_large(self): + ''' + Simple test, but on larger set, last test executed in 28s + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() |