Previous changeset 22:cd4f13119afa (2014-03-06) | Next changeset 24:385d21a8d0a0 (2014-04-03)

Commit message:
New tool to Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers

modified:
__init__.py datatypes_conf.xml test/__init__.py

added:
query_mass_repos.py query_mass_repos.xml test/test_query_mass_repos.py
diff -r cd4f13119afa -r 85fd05d0d16c __init__.py
--- a/__init__.py	Thu Mar 06 14:29:55 2014 +0100
+++ b/__init__.py	Thu Apr 03 16:44:11 2014 +0200
| @@ -1,6 +1,6 @@ -''' -Module containing Galaxy tools for the GC/MS pipeline -Created on Mar 6, 2012 - -@author: marcelk -''' +''' +Module containing Galaxy tools for the LC or GC/MS pipeline +Created on Mar , 2014 + +@author: pieter lukasse +''' \ No newline at end of file |
diff -r cd4f13119afa -r 85fd05d0d16c datatypes_conf.xml
--- a/datatypes_conf.xml	Thu Mar 06 14:29:55 2014 +0100
+++ b/datatypes_conf.xml	Thu Apr 03 16:44:11 2014 +0200
| @@ -3,9 +3,6 @@ <datatype_files> </datatype_files> <registration display_path="display_applications"> - <!-- type for the pdf --> - <datatype extension="pdf" type="galaxy.datatypes.data:Data" mimetype="application/octet-stream" - display_in_upload="true" subclass="true"/> <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv" display_in_upload="true" subclass="true"> </datatype> </registration> |
diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.py	Thu Apr 03 16:44:11 2014 +0200
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of accurate mass values detected by high-resolution mass spectrometers
+against various repositories/services such as METabolomics EXPlorer database or the
+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository/service. If one or more compounds are found
+then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository/service.
+
+The service should implement the following interface:
+
+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
+
+The output should be tab separated and should contain the following columns (in this order)
+db-name molecular-formula dbe formula-weight id description
+
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+
+def _process_data(data):
+
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the
+    chosen repository/service
+
+    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+        # read the molecular mass :
+        molecular_mass = input_data_record[molecular_mass_col]
+
+        # search for related records in repository/service:
+        data_found = None
+        if molecular_mass != "":
+            molecular_mass = float(molecular_mass)
+
+            # 1- search for data around this MM:
+            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
+
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '', '', '', '', '']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def
[...]
esponse:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '':
+            # means there is only the header row...so no hits:
+            return None
+
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+
+        # return result processed into a dict:
+        return _process_data(result)
+
+    except urllib2.HTTPError, e:
+        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)
+
+    return record
+
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: dictionary containing merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_repository_URL(repository_file):
+    '''
+    Read out and return the URL stored in the given file.
+    '''
+    file_input = fileinput.input(repository_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # just return the first line that is not a comment line:
+                return line
+    finally:
+        file_input.close()
+
+
+def main():
+    '''
+    Query main function
+
+    The input file can be any tabular file, as long as it contains a column for the molecular mass.
+    This column is then used to query against the chosen repository/service Database.
+    '''
+    seconds_start = int(round(time.time()))
+
+    input_file = sys.argv[1]
+    molecular_mass_col = sys.argv[2]
+    repository_file = sys.argv[3]
+    error_margin = float(sys.argv[4])
+    margin_unit = sys.argv[5]
+    output_result = sys.argv[6]
+
+    # Parse repository_file to find the URL to the service:
+    repository_dblink = _get_repository_URL(repository_file)
+
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+
+    # Query data against repository :
+    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
+    headers = input_data.keys() + ['SEARCH hits for ', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
+                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']
+
+    _save_data(enriched_data, headers, output_result)
+
+    seconds_end = int(round(time.time()))
+    print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+if __name__ == '__main__':
+    main()
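For reference, the query that _query_and_add_data builds for each record follows the service interface described in the module docstring above (/mass?targetMs=...&margin=...&marginUnit=...&output=txth, plus the &_format_result=tsv suffix appended in the code). The sketch below shows that round trip in isolation; it is not part of this changeset, and the base URL and example mass value are assumptions (the URL is modelled on the ExactMassDB endpoint listed in the tool help further down).

    # Minimal standalone sketch (Python 2, matching the module above): query a single
    # mass value against a /mass endpoint and print any hits found.
    import urllib2

    base_url = "http://webs2.kazusa.or.jp/mfsearcher/exmassdb"   # assumed service base URL
    target_mass = 180.06339                                      # example molecular mass value
    query_link = (base_url + "/mass?targetMs=" + str(target_mass) +
                  "&margin=5&marginUnit=ppm&output=txth&_format_result=tsv")

    response = urllib2.urlopen(query_link).read()
    rows = [row for row in response.split("\n") if row.strip() != ""]

    # 'txth' output is guaranteed to start with a header line; every following row is a
    # hit with the columns db-name, molecular-formula, dbe, formula-weight, id, description:
    if len(rows) <= 1:
        print "no hits for mass " + str(target_mass)
    else:
        header = rows[0].split("\t")
        for row in rows[1:]:
            print dict(zip(header, row.split("\t")))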
diff -r cd4f13119afa -r 85fd05d0d16c query_mass_repos.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.xml	Thu Apr 03 16:44:11 2014 +0200
| @@ -0,0 +1,106 @@ +<tool id="query_mass_repos" + name="METEXP - Find elemental composition formulas based on mass values " + version="0.1.0"> + <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description> + <command interpreter="python"> + query_mass_repos.py + $input_file + $molecular_mass_col + "$repository_file" + $error_margin + $margin_unit + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="repository_file" type="select" label="Repository/service to query" + help="Select the repository/service which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/> + + <param name="error_margin" type="float" size="10" + label="Error marging" + value="0.01" + help="Mass difference allowed when searching in the repositories for a mass match." /> + + <param name="margin_unit" type="select" label="Margin unit"> + <option value="ms" selected="True">ms</option> + <option value="ppm">ppm</option> + </param> + <!-- TODO + <param name="metexp_access_key" type="text" size="50" + label="(Optional)MetExp access key" + value="" + help="Key needed to get access to MetExp services. Fill in if MetExp service was selected" /> --> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository then extra information regarding (mass based)matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consits of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. + (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). 
The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer . + (via /mfsearcher/flavonoidviewer/ +LipidMAPS Re-calculated compound data from LIPID MAPS. + (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + </help> +</tool> |
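The "Repository/service to query" select box is populated from the files in tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services (via the get_directory_files helper in match_library.py referenced above), and _get_repository_URL in query_mass_repos.py reads the first non-comment line of the selected file as the service base URL. A hypothetical example of such a file follows; the name is modelled on the "MFSearcher ExactMassDB service.txt" file used in the test below, and the contents are an assumption since the shipped files are not part of this changeset:

    # tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services/MFSearcher ExactMassDB service.txt
    # Lines starting with '#' are ignored; the first non-comment line is used as the service base URL.
    http://webs2.kazusa.or.jp/mfsearcher/exmassdb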
diff -r cd4f13119afa -r 85fd05d0d16c test/__init__.py
--- a/test/__init__.py	Thu Mar 06 14:29:55 2014 +0100
+++ b/test/__init__.py	Thu Apr 03 16:44:11 2014 +0200
| @@ -1,1 +1,1 @@ -''' BRS GCMS Galaxy Tools Module ''' +''' unit tests ''' |
diff -r cd4f13119afa -r 85fd05d0d16c test/test_query_mass_repos.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_mass_repos.py	Thu Apr 03 16:44:11 2014 +0200
| @@ -0,0 +1,62 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from MS import query_mass_repos +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/query_mass_repos/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # repository_file = sys.argv[3] + # mass_tolerance = float(sys.argv[4]) + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/service_query_tabular.txt") + + molecular_mass_col = "MM" + dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + + + + sys.argv = ['test', + input_file, + molecular_mass_col, + dblink_file, + '0.001', + 'ms', + output_result] + + # Execute main function with arguments provided through sys.argv + query_mass_repos.main() + + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() |