Previous changeset: 5:41f122255d14 (2015-03-19) | Next changeset: 7:a1e3324dc244 (2015-03-19)
Commit message:
reorganized sources
added:
GCMS/__init__.py
GCMS/combine_output.py
GCMS/combine_output.xml
GCMS/create_model.xml
GCMS/library_lookup.py
GCMS/library_lookup.xml
GCMS/readme.txt
METEXPtools/__init__.py
METEXPtools/export_to_metexp_tabular.py
METEXPtools/export_to_metexp_tabular.xml
METEXPtools/query_metexp.py
METEXPtools/query_metexp.xml
METEXPtools/static_resources/README.txt
METEXPtools/static_resources/elements_and_masses.tab
MS/__init__.py
MS/query_mass_repos.py
MS/query_mass_repos.xml
metaMS/INFO_structure_of_LC.txt
metaMS/README.txt
metaMS/__init__.py
metaMS/metaMS_cmd_annotate.r
metaMS/metaMS_cmd_pick_and_group.r
metaMS/metams_lcms_annotate.xml
metaMS/metams_lcms_pick_and_group.xml
metaMS/next.r
metaMS/tool_dependencies.xml
metaMS/xcms_differential_analysis.r
metaMS/xcms_differential_analysis.xml
metaMS/xcms_get_alignment_eic.r
metaMS/xcms_get_alignment_eic.xml
metaMS/xcms_get_mass_eic.r
metaMS/xcms_get_mass_eic.xml
rankfilter_GCMS/nistpdf_to_tabular.py
rankfilter_GCMS/nistpdf_to_tabular.xml
rankfilter_GCMS/rankfilterGCMS_tabular.xml
rankfilter_GCMS/select_on_rank.py
rankfilter_GCMS/select_on_rank.xml
removed:
__init__.py
combine_output.py
combine_output.xml
create_model.xml
export_to_metexp_tabular.py
export_to_metexp_tabular.xml
library_lookup.py
library_lookup.xml
match_library.py
metaMS_cmd_annotate.r
metaMS_cmd_pick_and_group.r
metams_lcms_annotate.xml
metams_lcms_pick_and_group.xml
next.r
query_mass_repos.py
query_mass_repos.xml
query_metexp.py
query_metexp.xml
rankfilterGCMS_tabular.xml
rankfilter_GCMS/pdftotabular.py
rankfilter_text2tabular.xml
select_on_rank.py
select_on_rank.xml
static_resources/README.txt
static_resources/elements_and_masses.tab
tool_dependencies.xml
xcms_differential_analysis.r
xcms_differential_analysis.xml
xcms_get_alignment_eic.r
xcms_get_alignment_eic.xml
xcms_get_mass_eic.r
xcms_get_mass_eic.xml
diff -r 41f122255d14 -r 4393f982d18f GCMS/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/__init__.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the GC/MS pipeline
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _merge_data(rankfilter, caslookup):
+    '''
+    Merges data from both input dictionaries based on the Centrotype field. This method will
+    build up a new list containing the merged hits as the items.
+    @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+    @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+    if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+        raise Exception('rankfilter and caslookup files should have the same nr of rows/records')
+
+    merged = []
+    processed = {}
+    for compound_id_idx in xrange(len(rankfilter['ID'])):
+        compound_id = rankfilter['ID'][compound_id_idx]
+        if not compound_id in processed:
+            # keep track of processed items to not repeat them
+            processed[compound_id] = compound_id
+            # get centrotype nr
+            centrotype = compound_id.split('-')[0]
+            # Get the indices for current compound ID in both data-structures for proper matching
+            rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+            cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(rindex)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e.
+                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+                # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+                cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+
+                merged_hit = _add_hit(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged, len(rindex)
+
+
+def _add_hit(rankfilter, caslookup):
+    '''
+    Combines single records from both the RankFilter- and CasLookup-tools
+    @param rankfilter: record (dictionary) of one compound in the RankFilter output
+    @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+    '''
+    # The ID in the RankFilter output contains the following 5 fields:
+    rf_id = rankfilter['ID'].split('-')
+    try:
+        name, formula = _remove_formula(rankfilter['Name'])
+        hit = [rf_id[0], # Centrotype
+               rf_id[1], # cent.Factor
+               rf_id[2], #
+[...]
+            'FORMULA': 'N/A',
+            'RI': '0.0',
+            'Regression.Column.Name': 'None',
+            'min': '0.0',
+            'max': '0.0',
+            'nr.duplicates': '0',
+            'Column.phase.type': 'N/A',
+            'Column.name': 'N/A'}
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
+                    'cent.Factor',
+                    'scan nr.',
+                    'R.T. (umin)',
+                    'nr. Peaks',
+                    'R.T.']
+    # These are the headers/columns we repeat in case of
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = [
+                    'Name',
+                    'FORMULA',
+                    'Library',
+                    'CAS',
+                    'Forward',
+                    'Reverse',
+                    'Avg. (Forward, Reverse)',
+                    'RIexp',
+                    'RI',
+                    'RIsvr',
+                    'RIexp - RIsvr',
+                    'RI - RIexp',
+                    'Regression.Column.Name',
+                    'min',
+                    'max',
+                    'nr.duplicates',
+                    'Column.phase.type',
+                    'Column.name',
+                    'Rank',
+                    '%rel.err',
+                    'Synonyms']
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv_single, 'wb')
+    outfile_multi_handle = open(out_csv_multi, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+    output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
+    # Combine all hits for each centrotype into one line
+    line = []
+    for centrotype_idx in xrange(len(data)):
+        i = 0
+        for hit in data[centrotype_idx]:
+            if i == 0:
+                line.extend(hit)
+            else:
+                line.extend(hit[6:])
+            i = i + 1
+        # small validation (if error, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
+        output_multi_handle.writerow(line)
+        line = []
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    This method will iterate over the header names and add the string 'ALT#_' before each,
+    where # is the number of the alternative, according to the number of alternative hits we want to add
+    to the final csv/tsv
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits):
+        for header_name in header_part2:
+            result.append("ALT" + str(i+1) + "_" + header_name)
+    return result
+
+def main():
+    '''
+    Combine Output main function
+    It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
+    NB: the caslookup_result_file will typically have fewer lines than
+    rankfilter_result_file, so the merge has to consider this as well. The final file
+    should have the same nr of lines as rankfilter_result_file.
+    '''
+    rankfilter_result_file = sys.argv[1]
+    caslookup_result_file = sys.argv[2]
+    output_single_csv = sys.argv[3]
+    output_multi_csv = sys.argv[4]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter = _process_data(rankfilter_result_file)
+    caslookup = _process_data(caslookup_result_file)
+    merged, nhits = _merge_data(rankfilter, caslookup)
+    _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+    main()
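For illustration (not part of the changeset): _merge_data() keys the merge on the centrotype, which is the first dash-separated field of a RankFilter ID. A minimal sketch with a made-up ID value:

    # hypothetical RankFilter ID; the 5 fields are Centrotype, cent.Factor,
    # scan nr., R.T. (umin) and nr. Peaks, as documented in _add_hit() above
    rankfilter_id = "1234-0.99-5678-12.5-42"
    centrotype = rankfilter_id.split('-')[0]
    print(centrotype)  # -> "1234", matched against caslookup['Centrotype']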
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.xml	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,36 @@
+<tool id="combine_output" name="RIQC-Combine RankFilter and CasLookup output" version="1.0.2">
+    <description>Perform a combination of output data from the RankFilter and CasLookup tools</description>
+    <command interpreter="python">
+        combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+    </command>
+    <inputs>
+        <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output (Estimated RI)"
+               help="Select the output file from the RankFilter tool"/>
+        <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output ('Known' RI)"
+               help="Select the output file from the CasLookup tool"/>
+        <!-- <param TODO : could add "tolerance for ERI-KRI" (Estimated RI - Known RI) -->
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
+        <data format="tabular" label="${tool.name} (Multi) on ${on_string}" name="out_multi" />
+    </outputs>
+    <help>
+Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either all hits for a compound on a single line (Single) or on separate lines
+(Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server, with the
+original headers kept intact. Processing steps include:
+
+ - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values
+ - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+   and 'nr. Peaks' fields.
+ - The formula is split off the 'Name' field in the RankFilter output
+
+    </help>
+</tool>
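For reference, the <command> template above expands to a plain call like the following (file names hypothetical):

    python combine_output.py rankfilter_output.tsv caslookup_output.tsv combined_single.tsv combined_multi.tsv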
diff -r 41f122255d14 -r 4393f982d18f GCMS/create_model.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/create_model.xml	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,78 @@
+<tool id="create_poly_model" name="RIQC-Create Regression Model" version="1.0.2">
+    <description>Generate coefficients to enable the regression from one GC-column
+        to another GC-column</description>
+    <command interpreter="Rscript">Rscripts/ridb-regression.R
+        $ridb
+        $out_model
+        $out_log
+        $min_residuals
+        $range_mod
+        $pvalue
+        $rsquared
+        $method
+        $plot
+        #if $plot
+            $model_graphics
+        #end if
+    </command>
+    <inputs>
+        <param format="tabular" name="ridb" type="select" label="Retention Index (RI) and GC columns Library file"
+               help="Select the RI library file of which all GC columns and their RI values
+                     will be used to create a model"
+               dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+
+        <param name="method" type="select" label="Select regression method"
+               help="Method to use for calculating the model">
+            <option value="poly" selected="True">Polynomial (3rd degree)</option>
+            <option value="linear">Linear</option>
+        </param>
+        <param name="min_residuals" type="integer" value="10" optional="False"
+               label="Minimum number of residuals" help="The minimum number of residuals
+                     (datapoints) that both columns should have in common when calculating
+                     the model" />
+        <param name="range_mod" type="integer" value="0" optional="False"
+               label="Range modifier" help="Moves the range of the usable RI space by the
+                     given percentage. Set to 0 to use the full range of available data." />
+        <param name="pvalue" type="float" value="0.05" optional="False" min="0" max="1"
+               label="P-value to filter on" help="Set the upper limit for the p-value (calculated
+                     by performing an ANOVA analysis on the created model). All models with higher
+                     p-values are discarded." />
+        <param name="rsquared" type="float" value="0.95" optional="False" min="0" max="1"
+               label="R-squared to filter on" help="Set the lower limit for the R-squared;
+                     all models with lower values are discarded." />
+        <param name="plot" type="boolean" label="Create a separate plot for each model"
+               help="This will create a ZIP file in the history containing PDF plots" />
+    </inputs>
+    <code file="match_library.py" />
+    <outputs>
+        <data format="zip" label="Model Graphics of ${on_string}" name="model_graphics">
+            <filter>(plot)</filter>
+        </data>
+        <data format="tabular" label="Regression logfile of ${on_string}" name="out_log" />
+        <data format="tabular" label="Regression model of ${on_string}" name="out_model" />
+    </outputs>
+    <help>
+Calculates regression models for a permutation of all GC columns contained in the selected
+RI database file. The method used for creating the model is either based on a 3rd degree
+polynomial or a standard linear model.
+
+The *Minimum number of residuals* option will only allow regression if the columns it is based
+on have at least that number of datapoints on the same compound.
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option, which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+    </help>
+</tool>
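The actual fitting is done by Rscripts/ridb-regression.R, which is not part of this changeset. As a rough illustration of what the help text describes (a 3rd-degree polynomial fit between the RI values of two GC columns, filtered on R-squared), a minimal numpy sketch with made-up RI data:

    import numpy as np

    # made-up RI values of the same compounds measured on two GC columns
    ri_column_a = np.array([1100.0, 1250.0, 1400.0, 1575.0, 1700.0])
    ri_column_b = np.array([1120.0, 1275.0, 1430.0, 1600.0, 1735.0])

    coeffs = np.polyfit(ri_column_a, ri_column_b, 3)   # highest-order term first
    fitted = np.polyval(coeffs, ri_column_a)
    ss_res = np.sum((ri_column_b - fitted) ** 2)
    ss_tot = np.sum((ri_column_b - np.mean(ri_column_b)) ** 2)
    r_squared = 1 - ss_res / ss_tot                    # compare against the tool's cutoff
    print("coeffs: %s, R^2: %.4f" % (coeffs, r_squared))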
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+    '''
+    Creates a dictionary holding the contents of the library to be searched
+    @param library_file: library to read
+    @param column_type_name: the columns type name
+    @param statphase: the columns stationary phase
+    '''
+    (data, header) = match_library.read_library(library_file)
+    # Test for presence of required columns
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'cas' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    cas_column = header.index("cas")
+
+    filtered_library = [line for line in data if line[column_type_column] == column_type_name
+                        and line[statphase_column] == statphase]
+    lookup_dict = {}
+    for element in filtered_library:
+        # Here the cas_number is set to the numeric part of the cas_column value, so if the
+        # cas_column value is 'C1433' then cas_number will be '1433'
+        cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+        try:
+            lookup_dict[cas_number].append(element)
+        except KeyError:
+            lookup_dict[cas_number] = [element]
+    return lookup_dict
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+    '''
+    Returns all entries in the lookup_dict that have the same column name, type and polarity
+    as given by the user, uses regression if selected given the model and method to use. The
+    regression is applied on the column with the best R-squared value in the model
+    @param hits: all entries in the lookup_dict for the given CAS number
+    @param pref: preferred GC-column, can be one or more names
+    @param ctype: column type (capillary etc.)
+    @param polar: polarity (polar / non-polar etc.)
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    match = []
+    for column in pref:
+        for hit in hits:
+            if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+                # Create copy of found hit since it will be altered downstream
+                match.extend(hit)
+                return match, False
+
+    # No hit found for current CAS number, return if not performing regression
+    if not model:
+        return False, False
+
+    # Perform regression
+    for column in pref:
+        if column not in model:
+            break
+        # Order regression candidates by R-squared value (last element)
+        order = sorted(model[column].items(), key=lambda col: col[1][-1])
+        # Create list of regression candidate column names
+        regress_columns = list(reversed([column for (column, _) in order]))
+        # Names of available columns
+        available = [hit[6] for hit in hits]
+
+        # TODO: combine Rsquared and number of datapoints to get the best regression match
+        '''
+        # Iterate regression columns (in order) and retrieve their models
+        models = {}
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # models contains all model data including residuals [-2] and rsquared [-1]
+                    models[pref[0]] = model[pref[0]][hit[6]]
+        # Get the combined maximum for residuals and rsquared
+        best_match = models[]
+        # Apply regression
+        if me
+[...]
+                else:
+                    found_hit[-1] = '0'
+                data.append(found_hit)
+                found_hit = ''
+            else:
+                data.append(default_hit(row, casf, compound_id))
+        else:
+            data.append(default_hit(row, casf, compound_id))
+
+        casf = ''
+        compound_id = ''
+        found_hit = []
+        dups = []
+    return data
+
+
+def _save_data(content, outfile):
+    '''
+    Write to output file
+    @param content: content to write
+    @param outfile: file to write to
+    '''
+    # header
+    header = ['CAS',
+              'NAME',
+              'FORMULA',
+              'RI',
+              'Column.type',
+              'Column.phase.type',
+              'Column.name',
+              'phase.coding',
+              'CAS_column.Name',
+              'Centrotype',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates']
+    output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+    output_handle.writerow(header)
+    for entry in content:
+        output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+    '''
+    Creates an easy to search dictionary for getting the regression parameters
+    for each valid combination of GC-columns
+    @param model_file: filename containing the regression models
+    '''
+    regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+    if len(regress.pop(0)) > 9:
+        method = 'poly'
+    else:
+        method = 'linear'
+
+    model = {}
+    # Create new dictionary for each GC-column
+    for line in regress:
+        model[line[0]] = {}
+
+    # Add data
+    for line in regress:
+        if method == 'poly':
+            model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+        else:  # linear
+            model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+    return model, method
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+    model based on data from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    coeff = model[column1][column2]
+    # If the retention index to convert is within range of the data the model is based on, perform regression
+    if coeff[4] < retention_index < coeff[5]:
+        return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
+                (retention_index * coeff[1]) + coeff[0])
+    else:
+        return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given linear model based on data
+    from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    # TODO: No use of limits
+    coeff = model[column1][column2]
+    return coeff[1] * retention_index + coeff[0]
+
+
+def main():
+    '''
+    Library Lookup main function
+    '''
+    library_file = sys.argv[1]
+    nist_tabular_filename = sys.argv[2]
+    ctype = sys.argv[3]
+    polar = sys.argv[4]
+    outfile = sys.argv[5]
+    pref = sys.argv[6:-1]
+    regress = sys.argv[-1]
+
+    if regress != 'False':
+        model, method = _read_model(regress)
+    else:
+        model, method = False, None
+
+    lookup_dict = create_lookup_table(library_file, ctype, polar)
+    data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+    _save_data(data, outfile)
+
+
+if __name__ == "__main__":
+    main()
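A hypothetical worked example (not from the changeset) of the coefficient layout that _apply_poly_regression() expects: coeff[0..3] are the polynomial coefficients (constant term first) and coeff[4..5] the usable RI range, all values below made up:

    # made-up model entry: convert RI values from column 'DB-5' to column 'HP-1'
    model = {'DB-5': {'HP-1': [12.3, 0.98, 1.5e-5, -2.0e-9, 800.0, 2400.0]}}

    coeff = model['DB-5']['HP-1']
    ri = 1500.0
    if coeff[4] < ri < coeff[5]:  # only convert inside the modelled RI range
        converted = coeff[3] * ri ** 3 + coeff[2] * ri ** 2 + coeff[1] * ri + coeff[0]
        print(converted)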
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.xml	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+<tool id="lookup_library" name="RIQC-Lookup RI for CAS numbers in library" version="1.0.2">
+    <description>Lookup or estimate the RI using a "known RI values" CAS numbers library</description>
+    <command interpreter="python">
+        library_lookup.py
+        $library_file
+        $input
+        "$col_type"
+        "$polarity"
+        $output
+        #for $ctype in $pref
+            ${ctype.columntype}
+        #end for
+        $regression.model
+    </command>
+    <inputs>
+        <!-- Regarding the <page> items: this blocks the use of this tool in Galaxy workflows. However,
+             alternatives like wrapping this in conditionals or repeats (to force a refresh_on_change, as this option
+             is not working on its own) failed since the workflow editor does not support refreshes...nor does the
+             workflow runtime support conditionals or repeats to be set at runtime. See also the galaxy-dev mail
+             thread '"when else" in <conditional>? RE: refresh_on_change: is this a valid attribute? Any other ideas/options??' -->
+        <page>
+            <param format="tabular" name="input" type="data" label="NIST identifications as tabular file"
+                   help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+            <param name="library_file" type="select" label="CAS x RI Library file"
+                   help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns"
+                   dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+        </page>
+        <page>
+            <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
+                   display="radio" dynamic_options='get_column_type(library_file)'
+                   help="" />
+        </page>
+        <page>
+            <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
+                   display="radio" dynamic_options='filter_column(library_file,col_type)'
+                   help="" />
+        </page>
+        <page>
+            <conditional name="regression">
+                <param name="regression_select" type="boolean" checked="false" label="Apply regression method"
+                       help="If no data for the selected column is present in the database, selecting this option will try
+                             to convert Retention Indices using data from other GC-columns with a regression method. Please
+                             note that only the first given GC-column above will be used for this; any alternatives will be
+                             ignored" />
+                <when value="true">
+                    <param name="model" format="tabular" type="data" label="Tabular file containing regression model"
+                           help="This file contains the coefficients used to perform the regression from one GC-column
+                                 to another GC-column"/>
+                </when>
+                <when value="false">
+                    <param name="model" type="hidden" value="False" />
+                </when>
+            </conditional>
+            <repeat name="pref" title="Select column name preference">
+                <param name="columntype" type="select" label="Column name" refresh_on_change="true"
+                       dynamic_options='filter_column2(library_file, col_type, polarity)'
+                       help="Select one or more column names for filtering. The order defines the priority." />
+            </repeat>
+        </page>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} on" name="output" />
+    </outputs>
+    <code file="match_library.py" />
+    <help>
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given a RI.
+    </help>
+</tool>
diff -r 41f122255d14 -r 4393f982d18f GCMS/readme.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/readme.txt	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,25 @@
+==Description==
+Galaxy tools for analyzing a PDF result file generated by NIST-MSSearch, by calculating ranks for
+each putative metabolite and matching with a Retention Index Database by CAS number
+
+
+==Dependencies==
+- Galaxy
+- pdftotext (part of the 'poppler-utils' package (Ubuntu and Debian))
+- Python (>= 2.6) with the following package(s):
+  - numpy
+- R
+
+
+==Installation==
+
+- Optionally run the included Python unittests to verify the dependencies:
+  nosetests galaxy-dist/tools/GCMS
+
+
+
+==License==
+
+Licensing info can be found in the files of the /LICENSE_AND_README folder located
+in the root of this (PRIMS-metabolomics) project.
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup are already combined by combine_output.py, so here we will use
+that result. Furthermore, one of the MsClust quantification files containing the
+respective spectra details is to be combined as well.
+
+Extra calculations performed:
+- The column MW is also added here and is derived from the column FORMULA found
+  in the combined RankFilter/CasLookup result.
+
+So in total here we merge 2 files and calculate one new column.
+'''
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import csv
+import re
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
+    '''
+    Merges data from both input dictionaries based on the link fields. This method will
+    build up a new list containing the merged hits as the items.
+    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+    '''
+    # TODO test for correct input files -> same link_field values should be there
+    # (test at least number of unique link_field values):
+    #
+    # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+    #     raise Exception('input files should have the same nr of key values ')
+
+    merged = []
+    processed = {}
+    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if not link_field_set1_value in processed:
+            # keep track of processed items to not repeat them
+            processed[link_field_set1_value] = link_field_set1_value
+
+            # Get the indices for current link_field_set1_value in both data-structures for proper matching
+            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value) == True]
+            # Validation:
+            if len(set2index) == 0:
+                # means that corresponding data could not be found in set2, then throw error
+                raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+                                link_field_set1_value + " only found in first dataset. ")
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(set1index)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the sets; i.e.
+                # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+[...]
+    # ...cation of reference standard
+    record.append('0')
+    record.append('')
+
+    return record
+
+
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM).
+    E.g. MM of H2O = (relative) atomic mass of H x 2 + (relative) atomic mass of O
+    '''
+
+    # Each element is represented by a capital letter, followed optionally by
+    # lower case, with one or more digits for how many atoms of that element:
+    element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
+
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        element_mass = float(elements_and_masses_map[element_name])  # "found: Python's built-in float type has double precision" (? check if really correct ?)
+        total_mass += element_mass * count
+
+    return total_mass
+
+
+
+def _save_data(data, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
+            output_single_handle.writerow(hit)
+
+
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    This method will read out the column 'Chemical symbol' and make a map
+    of this, storing the column 'Relative atomic mass' as its value
+    '''
+    resultMap = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+
+    return resultMap
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
+def main():
+    '''
+    Combine Output main function
+
+    RankFilter and CasLookup are already combined by combine_output.py, so here we will use
+    that result. Furthermore, the MsClust spectra file (.MSP) and one of the MsClust
+    quantification files are to be combined with the combine_output.py result as well.
+    '''
+    rankfilter_and_caslookup_combined_file = sys.argv[1]
+    msclust_quantification_and_spectra_file = sys.argv[2]
+    output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+
+    # Read elements and masses to use for the MW/MM calculation:
+    init_elements_and_masses_map()
+
+    merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
+                         msclust_quantification_and_spectra, 'centrotype',
+                         _compare_records, _merge_records, metadata,
+                         N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM', 'MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
+
+
+if __name__ == '__main__':
+    main()
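A self-contained sketch of the element-parsing approach used by get_molecular_mass() above, with a small hand-rolled mass subset instead of the shipped elements_and_masses.tab (the mass values are taken from that table):

    import re

    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
    masses = {'C': 12.01, 'H': 1.01, 'O': 16.0}  # subset of elements_and_masses.tab

    def molecular_mass(formula):
        total = 0.0
        for element, count in element_pattern.findall(formula):
            total += masses[element] * (int(count) if count else 1)
        return total

    print(molecular_mass("H2O"))      # 2 * 1.01 + 16.0 = 18.02
    print(molecular_mass("C6H12O6"))  # 180.18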
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.xml	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+<tool id="export_to_metexp_tabular"
+      name="METEXP - Tabular file"
+      version="0.2.0">
+    <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+    <command interpreter="python">
+        export_to_metexp_tabular.py
+        $rankfilter_and_caslookup_combi
+        $msclust_quant_file
+        $output_result
+        "$organism"
+        "$tissue"
+        "$experiment_name"
+        "$user_name"
+        "$column_type"
+    </command>
+    <inputs>
+        <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+               help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+        <param format="tabular" name="msclust_quant_file" type="data" label="MsClust quantification file output"
+               help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/>
+
+        <param name="organism" type="text" size="80"
+               label="Organism(s) info"
+               help="Metadata information to accompany the results when stored in MetExp DB.">
+            <validator type="empty_field" message="A value is required."></validator><!-- attribute optional="False" does not seem to work for params, so a validator is added -->
+        </param>
+
+        <param name="tissue" type="text" size="80"
+               label="Tissue(s) info"
+               help="Metadata information to accompany the results when stored in MetExp DB.">
+            <validator type="empty_field" message="A value is required."></validator>
+        </param>
+
+        <param name="experiment_name" type="text" size="80"
+               label="Experiment name/code"
+               help="Name or code to store the results under. This can help you find the results back in MetExpDB.">
+            <validator type="empty_field" message="A value is required."></validator>
+        </param>
+
+        <param name="user_name" type="text" size="80"
+               label="User name"
+               help="User name or code to store the results under. This can help you find the results back in MetExpDB.">
+            <validator type="empty_field" message="A value is required."></validator>
+        </param>
+
+        <param name="column_type" type="text" size="80"
+               label="Column type"
+               help="Column type to report with the results. This can help you find the results back in MetExpDB.">
+            <validator type="empty_field" message="A value is required."></validator>
+        </param>
+
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+    </outputs>
+    <help>
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here we will use
+that result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are added here; they are derived from the column FORMULA found in the combined RankFilter/CasLookup result.
+
+So in total here we merge 2 files and calculate one new column.
+    </help>
+</tool>
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+
+def _process_data(data):
+
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the
+    MetExp Database.
+
+    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+        # read the molecular mass and formula:
+        cas_id = input_data_record[casid_col]
+        formula = input_data_record[formula_col]
+        molecular_mass = input_data_record[molecular_mass_col]
+
+        # search for related records in MetExp:
+        data_found = None
+        if cas_id != "undef":
+            # 1- search for other experiments where this CAS id has been found:
+            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "CAS"
+        if data_found == None:
+            # 2- search for other experiments where this FORMULA has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "FORMULA"
+        if data_found == None:
+            # 3- search for other experiments where this MM has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '', '', '', '', '', '', '']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+[...]
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+
+        # return result processed into a dict:
+        return _process_data(result)
+
+    except urllib2.HTTPError, e:
+        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+
+
+# alternative: ?
+#     s = requests.Session()
+#     s.verify = False
+#     #s.auth = (token01, token02)
+#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+#     content = resp.content
+#     # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+
+
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)
+
+    return record
+
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: dictionary containing merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+    '''
+    Read out and return the URL stored in the given file.
+    '''
+    file_input = fileinput.input(metexp_dblink_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # just return the first line that is not a comment line:
+                return line
+    finally:
+        file_input.close()
+
+
+def main():
+    '''
+    MetExp Query main function
+
+    The input file can be any tabular file, as long as it contains a column for the molecular mass
+    and one for the formula of the respective identification. These two columns are then
+    used to query against the MetExp Database.
+    '''
+    seconds_start = int(round(time.time()))
+
+    input_file = sys.argv[1]
+    casid_col = sys.argv[2]
+    formula_col = sys.argv[3]
+    molecular_mass_col = sys.argv[4]
+    metexp_dblink_file = sys.argv[5]
+    separation_method = sys.argv[6]
+    output_result = sys.argv[7]
+
+    # Parse metexp_dblink_file to find the URL to the MetExp service:
+    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+
+    # Query data against MetExp DB :
+    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
+                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+
+    _save_data(enriched_data, headers, output_result)
+
+    seconds_end = int(round(time.time()))
+    print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+    main()
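For reference, the three-step fallback in _query_and_add_data() builds URLs like the sketch below; the paths and parameter names are taken from the script, while the base URL and example values are hypothetical:

    # hypothetical MetExp base URL and record values (glycerol)
    metexp_dblink = "http://example.org/metexp"
    cas_id, formula, molecular_mass, separation_method = "56-81-5", "C3H8O3", "92.09", "GC"

    queries = [
        metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method,
        metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method,
        metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method,
    ]
    # each URL is tried in order (with "&_format_result=tsv" appended) until one returns data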
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.xml	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,69 @@
+<tool id="query_metexp"
+      name="METEXP - Query Database"
+      version="0.1.0">
+    <description>Query a set of identifications against the METabolomics EXPeriments database</description>
+    <command interpreter="python">
+        query_metexp.py
+        $input_file
+        "$casid_col"
+        "$formula_col"
+        "$molecular_mass_col"
+        "$metexp_dblink_file"
+        $separation_method
+        $output_result
+    </command>
+    <inputs>
+
+        <param name="input_file" format="tabular" type="data"
+               label="Input file"
+               help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
+
+        <param name="separation_method" type="select" label="Data type to query">
+            <option value="GC" selected="True">GC</option>
+            <option value="LC">LC</option>
+        </param>
+
+        <param name="casid_col" type="text" size="50"
+               label="CAS ID column name"
+               value="CAS"
+               help="Name of the column containing the CAS code information (in the given input file)" />
+        <param name="formula_col" type="text" size="50"
+               label="Formula ID column name"
+               value="FORMULA"
+               help="Name of the column containing the formula information (in the given input file)" />
+        <param name="molecular_mass_col" type="text" size="50"
+               label="Molecular mass column name"
+               value="MM"
+               help="Name of the column containing the molecular mass information (in the given input file)" />
+
+        <param name="metexp_dblink_file" type="select" label="MetExp DB to query"
+               help="Select the MetExp Database/backend which should be queried"
+               dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/>
+
+    </inputs>
+    <outputs>
+        <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
+    </outputs>
+    <code file="match_library.py" /> <!-- file containing the get_directory_files function used above -->
+    <help>
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+    </help>
+</tool>
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/README.txt	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,3 @@
+This folder and its respective files should be deployed together with the following tools:
+
+ - ../export_to_metexp_tabular.py
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/elements_and_masses.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/elements_and_masses.tab	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,104 @@
+Name	Atomic number	Chemical symbol	Relative atomic mass
+Hydrogen	1	H	1.01
+Helium	2	He	4
+Lithium	3	Li	6.94
+Beryllium	4	Be	9.01
+Boron	5	B	10.81
+Carbon	6	C	12.01
+Nitrogen	7	N	14.01
+Oxygen	8	O	16
+Fluorine	9	F	19
+Neon	10	Ne	20.18
+Sodium	11	Na	22.99
+Magnesium	12	Mg	24.31
+Aluminum	13	Al	26.98
+Silicon	14	Si	28.09
+Phosphorus	15	P	30.98
+Sulfur	16	S	32.06
+Chlorine	17	Cl	35.45
+Argon	18	Ar	39.95
+Potassium	19	K	39.1
+Calcium	20	Ca	40.08
+Scandium	21	Sc	44.96
+Titanium	22	Ti	47.9
+Vanadium	23	V	50.94
+Chromium	24	Cr	52
+Manganese	25	Mn	54.94
+Iron	26	Fe	55.85
+Cobalt	27	Co	58.93
+Nickel	28	Ni	58.71
+Copper	29	Cu	63.54
+Zinc	30	Zn	65.37
+Gallium	31	Ga	69.72
+Germanium	32	Ge	72.59
+Arsenic	33	As	74.99
+Selenium	34	Se	78.96
+Bromine	35	Br	79.91
+Krypton	36	Kr	83.8
+Rubidium	37	Rb	85.47
+Strontium	38	Sr	87.62
+Yttrium	39	Y	88.91
+Zirconium	40	Zr	91.22
+Niobium	41	Nb	92.91
+Molybdenum	42	Mo	95.94
+Technetium	43	Tc	96.91
+Ruthenium	44	Ru	101.07
+Rhodium	45	Rh	102.9
+Palladium	46	Pd	106.4
+Silver	47	Ag	107.87
+Cadmium	48	Cd	112.4
+Indium	49	In	114.82
+Tin	50	Sn	118.69
+Antimony	51	Sb	121.75
+Tellurium	52	Te	127.6
+Iodine	53	I	126.9
+Xenon	54	Xe	131.3
+Cesium	55	Cs	132.9
+Barium	56	Ba	137.34
+Lanthanum	57	La	138.91
+Cerium	58	Ce	140.12
+Praseodymium	59	Pr	140.91
+Neodymium	60	Nd	144.24
+Promethium	61	Pm	144.91
+Samarium	62	Sm	150.35
+Europium	63	Eu	151.96
+Gadolinium	64	Gd	157.25
+Terbium	65	Tb	158.92
+Dysprosium	66	Dy	162.5
+Holmium	67	Ho	164.93
+Erbium	68	Er	167.26
+Thulium	69	Tm	168.93
+Ytterbium	70	Yb	173.04
+Lutetium	71	Lu	174.97
+Hafnium	72	Hf	178.49
+Tantalum	73	Ta	180.95
+Wolfram	74	W	183.85
+Rhenium	75	Re	186.2
+Osmium	76	Os	190.2
+Iridium	77	Ir	192.22
+Platinum	78	Pt	195.09
+Gold	79	Au	196.97
+Mercury	80	Hg	200.59
+Thallium	81	Tl	204.37
+Lead	82	Pb	207.19
+Bismuth	83	Bi	208.98
+Polonium	84	Po	208.98
+Astatine	85	At	209.99
+Radon	86	Rn	222.02
+Francium	87	Fr	223.02
+Radium	88	Ra	226
+Actinium	89	Ac	227.03
+Thorium	90	Th	232.04
+Protactinium	91	Pa	231.04
+Uranium	92	U	238.03
+Neptunium	93	Np	237
+Plutonium	94	Pu	242
+Americium	95	Am	243.06
+Curium	96	Cm	247.07
+Berkelium	97	Bk	247.07
+Californium	98	Cf	251.08
+Einsteinium	99	Es	254.09
+Fermium	100	Fm	257.1
+Mendelevium	101	Md	257.1
+Nobelium	102	No	255.09
+Lawrencium	103	Lr	256.1
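For illustration (not part of the changeset), a minimal sketch of how this table can be loaded into a symbol-to-mass map, mirroring _get_map_for_elements_and_masses() in export_to_metexp_tabular.py; the filename is assumed to be a local copy of the file above:

    import csv

    with open("elements_and_masses.tab") as handle:
        rows = list(csv.reader(handle, delimiter='\t'))
    header = rows.pop(0)
    sym = header.index("Chemical symbol")
    mass = header.index("Relative atomic mass")
    masses = dict((row[sym], float(row[mass])) for row in rows)
    print(masses["H"])  # 1.01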
diff -r 41f122255d14 -r 4393f982d18f MS/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/__init__.py	Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created on Mar , 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/query_mass_repos.py	Thu Mar 19 12:22:23 2015 +0100
b'@@ -0,0 +1,289 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to query a set of accurate mass values detected by high-resolution mass spectrometers\n+against various repositories/services such as METabolomics EXPlorer database or the \n+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).\n+\n+It will take the input file and for each record it will query the \n+molecular mass in the selected repository/service. If one or more compounds are found \n+then extra information regarding these compounds is added to the output file.\n+\n+The output file is thus the input file enriched with information about \n+related items found in the selected repository/service. \n+\n+The service should implement the following interface: \n+\n+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)\n+\n+The output should be tab separated and should contain the following columns (in this order)\n+db-name molecular-formula dbe formula-weight id description\n+\n+\n+\'\'\'\n+import csv\n+import sys\n+import fileinput\n+import urllib2\n+import time\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2014, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_file(in_xsv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_xsv, \'rU\'), delimiter=delim))\n+ return _process_data(data)\n+ \n+def _process_data(data):\n+ \n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):\n+ \n+ \'\'\'\n+ This method will iterate over the record in the input_data and\n+ will enrich them with the related information found (if any) in the \n+ chosen repository/service\n+ \n+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies\n+ \'\'\'\n+ merged = []\n+ \n+ for i in xrange(len(input_data[input_data.keys()[0]])):\n+ # Get the record in same dictionary format as input_data, but containing\n+ # a value at each column instead of a list of all values of all records:\n+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))\n+ \n+ # read the molecular mass :\n+ molecular_mass = input_data_record[molecular_mass_col]\n+ \n+ \n+ # search for related records in repository/service:\n+ data_found = None\n+ if molecular_mass != "": \n+ molecular_mass = float(molecular_mass)\n+ \n+ # 1- search for data around this MM:\n+ query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"\n+ \n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "MM"\n+ \n+ \n+ if data_found == None:\n+ # If still nothing found, just add empty columns\n+ extra_cols = [\'\', \'\',\'\',\'\',\'\',\'\']\n+ else:\n+ # Add info found:\n+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)\n+ \n+ # Take all data and merge it 
into a "flat"/simple array of values:\n+ field_values_list = _merge_data(input_data_record, extra_cols)\n+ \n+ merged.append(field_values_list)\n+\n+ # return the merged/enriched records:\n+ return merged\n+\n+\n+def'..b'_rows[1].strip() == \'\': \n+ # means there is only the header row...so no hits:\n+ return None\n+ \n+ for data_row in data_rows:\n+ if not data_row.strip() == \'\':\n+ row_as_list = _str_to_list(data_row, delimiter=\'\\t\')\n+ result.append(row_as_list)\n+ \n+ # return result processed into a dict:\n+ return _process_data(result)\n+ \n+ except urllib2.HTTPError, e:\n+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)\n+ except urllib2.URLError, e:\n+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")\n+\n+def _str_to_list(data_row, delimiter=\'\\t\'): \n+ result = []\n+ for column in data_row.split(delimiter):\n+ result.append(column)\n+ return result\n+ \n+ \n+# alternative: ? \n+# s = requests.Session()\n+# s.verify = False\n+# #s.auth = (token01, token02)\n+# resp = s.get(url, params={\'name\': \'anonymous\'}, stream=True)\n+# content = resp.content\n+# # transform to dictionary:\n+ \n+ \n+ \n+ \n+def _merge_data(input_data_record, extra_cols):\n+ \'\'\'\n+ Adds the extra information to the existing data record and returns\n+ the combined new record.\n+ \'\'\'\n+ record = []\n+ for column in input_data_record:\n+ record.append(input_data_record[column])\n+ \n+ \n+ # add extra columns\n+ for column in extra_cols:\n+ record.append(column) \n+ \n+ return record \n+ \n+\n+def _save_data(data_rows, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data_rows: dictionary containing merged/enriched dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write one line for each row\n+ for data_row in data_rows:\n+ output_single_handle.writerow(data_row)\n+\n+def _get_repository_URL(repository_file):\n+ \'\'\'\n+ Read out and return the URL stored in the given file.\n+ \'\'\'\n+ file_input = fileinput.input(repository_file)\n+ try:\n+ for line in file_input:\n+ if line[0] != \'#\':\n+ # just return the first line that is not a comment line:\n+ return line\n+ finally:\n+ file_input.close()\n+ \n+\n+def main():\n+ \'\'\'\n+ Query main function\n+ \n+ The input file can be any tabular file, as long as it contains a column for the molecular mass.\n+ This column is then used to query against the chosen repository/service Database. 
\n+ \'\'\'\n+ seconds_start = int(round(time.time()))\n+ \n+ input_file = sys.argv[1]\n+ molecular_mass_col = sys.argv[2]\n+ repository_file = sys.argv[3]\n+ error_margin = float(sys.argv[4])\n+ margin_unit = sys.argv[5]\n+ output_result = sys.argv[6]\n+\n+ # Parse repository_file to find the URL to the service:\n+ repository_dblink = _get_repository_URL(repository_file)\n+ \n+ # Parse tabular input file into dictionary/array:\n+ input_data = _process_file(input_file)\n+ \n+ # Query data against repository :\n+ enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)\n+ headers = input_data.keys() + [\'SEARCH hits for \',\'SEARCH hits: db-names\', \'SEARCH hits: molecular-formulas \',\n+ \'SEARCH hits: ids\',\'SEARCH hits: descriptions\', \'Link to SEARCH hits\'] #TODO - add min and max formula weigth columns\n+\n+ _save_data(enriched_data, headers, output_result)\n+ \n+ seconds_end = int(round(time.time()))\n+ print "Took " + str(seconds_end - seconds_start) + " seconds"\n+ \n+ \n+\n+if __name__ == \'__main__\':\n+ main()\n' |
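For illustration, a minimal sketch of the kind of request this module fires, assuming an MFSearcher-compatible service and following the interface documented in the module docstring above (the base URL and mass value below are only examples; the tool normally reads the base URL from the selected repository file):

    import urllib2

    service_url = 'http://webs2.kazusa.or.jp/mfsearcher/exmassdb'  # example base URL
    # interface from the docstring: /mass?targetMs=...&margin=...&marginUnit=...&output=txth
    url = service_url + '/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth'
    rows = urllib2.urlopen(url).read().split('\n')
    header = rows[0].split('\t')  # 'txth' guarantees a header line before the data
    for row in rows[1:]:
        if row.strip():
            # expected columns (in order): db-name, molecular-formula, dbe, formula-weight, id, description
            print dict(zip(header, row.split('\t')))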
b |
diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MS/query_mass_repos.xml Thu Mar 19 12:22:23 2015 +0100 |
b |
@@ -0,0 +1,106 @@ +<tool id="query_mass_repos" + name="METEXP - Find elemental composition formulas based on mass values " + version="0.1.0"> + <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description> + <command interpreter="python"> + query_mass_repos.py + $input_file + "$molecular_mass_col" + "$repository_file" + $error_margin + $margin_unit + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="repository_file" type="select" label="Repository/service to query" + help="Select the repository/service which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/> + + <param name="error_margin" type="float" size="10" + label="Error margin" + value="0.01" + help="Mass difference allowed when searching in the repositories for a mass match." /> + + <param name="margin_unit" type="select" label="Margin unit"> + <option value="ms" selected="True">ms</option> + <option value="ppm">ppm</option> + </param> + <!-- TODO + <param name="metexp_access_key" type="text" size="50" + label="(Optional) MetExp access key" + value="" + help="Key needed to get access to MetExp services. Fill in if the MetExp service was selected" /> --> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing the get_directory_files function used above--> + <help> +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository, then extra information regarding (mass-based) matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consisting of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. + (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). 
The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer. + (via /mfsearcher/flavonoidviewer/) +LipidMAPS Re-calculated compound data from LIPID MAPS. + (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from the Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for the table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + </help> +</tool> |
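The files offered in the "Repository/service to query" dropdown are read from tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services. Judging from how _get_repository_URL in query_mass_repos.py parses them (lines starting with '#' are skipped and the first remaining line is returned), such a service file could look like the following hypothetical example (file name and URL are illustrative only):

    # ExactMassDB.txt - service definition for the MetExp mass search tools
    # lines starting with '#' are ignored; the first other line is used as the service base URL
    http://webs2.kazusa.or.jp/mfsearcher/exmassdb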
b |
diff -r 41f122255d14 -r 4393f982d18f combine_output.py --- a/combine_output.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,253 +0,0 @@\n-#!/usr/bin/env python\n-# encoding: utf-8\n-\'\'\'\n-Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)\n-\'\'\'\n-\n-import csv\n-import re\n-import sys\n-import math\n-import pprint\n-\n-__author__ = "Marcel Kempenaar"\n-__contact__ = "brs@nbic.nl"\n-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"\n-__license__ = "MIT"\n-\n-def _process_data(in_csv):\n- \'\'\'\n- Generic method to parse a tab-separated file returning a dictionary with named columns\n- @param in_csv: input filename to be parsed\n- \'\'\'\n- data = list(csv.reader(open(in_csv, \'rU\'), delimiter=\'\\t\'))\n- header = data.pop(0)\n- # Create dictionary with column name as key\n- output = {}\n- for index in xrange(len(header)):\n- output[header[index]] = [row[index] for row in data]\n- return output\n-\n-\n-def _merge_data(rankfilter, caslookup):\n- \'\'\'\n- Merges data from both input dictionaries based on the Centrotype field. This method will\n- build up a new list containing the merged hits as the items. \n- @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)\n- @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)\n- \'\'\'\n- # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:\n- if (len(rankfilter[\'ID\']) != len(caslookup[\'Centrotype\'])):\n- raise Exception(\'rankfilter and caslookup files should have the same nr of rows/records \')\n- \n- merged = []\n- processed = {}\n- for compound_id_idx in xrange(len(rankfilter[\'ID\'])):\n- compound_id = rankfilter[\'ID\'][compound_id_idx]\n- if not compound_id in processed :\n- # keep track of processed items to not repeat them\n- processed[compound_id] = compound_id\n- # get centrotype nr\n- centrotype = compound_id.split(\'-\')[0]\n- # Get the indices for current compound ID in both data-structures for proper matching\n- rindex = [index for index, value in enumerate(rankfilter[\'ID\']) if value == compound_id]\n- cindex = [index for index, value in enumerate(caslookup[\'Centrotype\']) if value == centrotype]\n- \n- merged_hits = []\n- # Combine hits\n- for hit in xrange(len(rindex)):\n- # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n- # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n- # corresponding value in the rankfilter or caslookup tables; i.e. \n- # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute\n- # represented by "key". 
rindex[hit] => points to the row nr=hit (hit is a rownr/index)\n- rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))\n- cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))\n- \n- merged_hit = _add_hit(rf_record, cl_record)\n- merged_hits.append(merged_hit)\n- \n- merged.append(merged_hits)\n-\n- return merged, len(rindex)\n-\n-\n-def _add_hit(rankfilter, caslookup):\n- \'\'\'\n- Combines single records from both the RankFilter- and CasLookup-tools\n- @param rankfilter: record (dictionary) of one compound in the RankFilter output\n- @param caslookup: matching record (dictionary) of one compound in the CasLookup output\n- \'\'\'\n- # The ID in the RankFilter output contains the following 5 fields:\n- rf_id = rankfilter[\'ID\'].split(\'-\')\n- try:\n- name, formula = _remove_formula(rankfilter[\'Name\'])\n- hit = [rf_id[0], # Centrotype\n- rf_id[1], # cent.Factor\n- rf_id[2], # '..b'ULA\': \'N/A\',\n- \'RI\': \'0.0\',\n- \'Regression.Column.Name\': \'None\',\n- \'min\': \'0.0\',\n- \'max\': \'0.0\',\n- \'nr.duplicates\': \'0\',\n- \'Column.phase.type\': \'N/A\',\n- \'Column.name\': \'N/A\'}\n-\n-\n-def _save_data(data, nhits, out_csv_single, out_csv_multi):\n- \'\'\'\n- Writes tab-separated data to file\n- @param data: dictionary containing merged dataset\n- @param out_csv: output csv file\n- \'\'\'\n- # Columns we don\'t repeat:\n- header_part1 = [\'Centrotype\',\n- \'cent.Factor\',\n- \'scan nr.\',\n- \'R.T. (umin)\',\n- \'nr. Peaks\',\n- \'R.T.\']\n- # These are the headers/columns we repeat in case of \n- # combining hits in one line (see alternative_headers method below):\n- header_part2 = [\n- \'Name\',\n- \'FORMULA\',\n- \'Library\',\n- \'CAS\',\n- \'Forward\',\n- \'Reverse\',\n- \'Avg. 
(Forward, Reverse)\',\n- \'RIexp\',\n- \'RI\',\n- \'RIsvr\',\n- \'RIexp - RIsvr\',\n- \'RI - RIexp\',\n- \'Regression.Column.Name\',\n- \'min\',\n- \'max\',\n- \'nr.duplicates\',\n- \'Column.phase.type\',\n- \'Column.name\',\n- \'Rank\',\n- \'%rel.err\',\n- \'Synonyms\']\n-\n- # Open output file for writing\n- outfile_single_handle = open(out_csv_single, \'wb\')\n- outfile_multi_handle = open(out_csv_multi, \'wb\')\n- output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n- output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\\t")\n-\n- # Write headers\n- output_single_handle.writerow(header_part1 + header_part2)\n- output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))\n- # Combine all hits for each centrotype into one line\n- line = []\n- for centrotype_idx in xrange(len(data)):\n- i = 0\n- for hit in data[centrotype_idx]:\n- if i==0:\n- line.extend(hit)\n- else:\n- line.extend(hit[6:])\n- i = i+1\n- # small validation (if error, it is a programming error):\n- if i > nhits:\n- raise Exception(\'Error: more hits that expected for centrotype_idx \' + centrotype_idx)\n- output_multi_handle.writerow(line)\n- line = []\n-\n- # Write one line for each centrotype\n- for centrotype_idx in xrange(len(data)):\n- for hit in data[centrotype_idx]:\n- output_single_handle.writerow(hit)\n-\n-def alternative_headers(header_part2, nr_alternative_hits):\n- \'\'\' \n- This method will iterate over the header names and add the string \'ALT#_\' before each, \n- where # is the number of the alternative, according to number of alternative hits we want to add\n- to final csv/tsv\n- \'\'\'\n- result = []\n- for i in xrange(nr_alternative_hits): \n- for header_name in header_part2:\n- result.append("ALT" + str(i+1) + "_" + header_name) \n- return result\n-\n-def main():\n- \'\'\'\n- Combine Output main function\n- It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" \n- NB: the caslookup_result_file will typically have fewer lines than\n- rankfilter_result_file, so the merge has to consider this as well. The final file\n- should have the same nr of lines as rankfilter_result_file.\n- \'\'\'\n- rankfilter_result_file = sys.argv[1]\n- caslookup_result_file = sys.argv[2]\n- output_single_csv = sys.argv[3]\n- output_multi_csv = sys.argv[4]\n-\n- # Read RankFilter and CasLookup output files\n- rankfilter = _process_data(rankfilter_result_file)\n- caslookup = _process_data(caslookup_result_file)\n- merged, nhits = _merge_data(rankfilter, caslookup)\n- _save_data(merged, nhits, output_single_csv, output_multi_csv)\n-\n-\n-if __name__ == \'__main__\':\n- main()\n' |
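To illustrate the "multi" output format written above: alternative_headers repeats the per-hit columns once per extra hit, prefixing each repeat with ALT<n>_. A small self-contained check (Python 2, function copied from the file above):

    def alternative_headers(header_part2, nr_alternative_hits):
        result = []
        for i in xrange(nr_alternative_hits):
            for header_name in header_part2:
                result.append("ALT" + str(i + 1) + "_" + header_name)
        return result

    print alternative_headers(['Name', 'CAS'], 2)
    # -> ['ALT1_Name', 'ALT1_CAS', 'ALT2_Name', 'ALT2_CAS']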
b |
diff -r 41f122255d14 -r 4393f982d18f combine_output.xml --- a/combine_output.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,36 +0,0 @@ -<tool id="combine_output" name="RIQC-Combine RankFilter and CasLookup output" version="1.0.2"> - <description>Perform a combination of output data from the RankFilter and CasLookup tools</description> - <command interpreter="python"> - combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi - </command> - <inputs> - <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output (Estimated RI)" - help="Select the output file from the RankFilter tool"/> - <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output ('Known' RI)" - help="Select the output file from the CasLookup tool"/> - <!-- <param TODO : could add "tolerance for ERI-KRI"(Estimated RI-Known RI)--> - </inputs> - <outputs> - <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" /> - <data format="tabular" label="${tool.name} (Multi) on ${on_string}" name="out_multi" /> - </outputs> - <help> -Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files. - -The files produced contain either all hits for a compound on a single line (Single) or on separate lines -(Multi). - -.. class:: infomark - -**Notes** - -The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the -original headers kept intact. Processing steps include: - - - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values - - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)' - and 'nr. Peaks' fields. - - The formula is split off the 'Name' field in the RankFilter output - - </help> -</tool> |
b |
diff -r 41f122255d14 -r 4393f982d18f create_model.xml --- a/create_model.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,78 +0,0 @@ -<tool id="create_poly_model" name="RIQC-Create Regression Model" version="1.0.2"> - <description>Generate coefficients to enable the regression from one GC-column - to another GC-column</description> - <command interpreter="Rscript">Rscripts/ridb-regression.R - $ridb - $out_model - $out_log - $min_residuals - $range_mod - $pvalue - $rsquared - $method - $plot - #if $plot - $model_graphics - #end if - </command> - <inputs> - <param format="tabular" name="ridb" type="select" label="Retention Index (RI) and GC columns Library file" - help="Select the RI library file of which all GC columns and their RI values - will be used to create a model" - dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/> - - <param name="method" type="select" label="Select regression method" - help="Method to use for calculating the model" > - <option value="poly" selected="True">Polynomial (3rd degree)</option> - <option value="linear">Linear</option> - </param> - <param name="min_residuals" type="integer" value="10" optional="False" - label="Minimum number of residuals" help="The minimum number of residuals - (datapoints) that both columns should have in common when calculating - the model" /> - <param name="range_mod" type="integer" value="0" optional="False" - label="Range modifier" help="Moves the range of the usable RI space by the - given percentage. Set to 0 to use the full range of available data." /> - <param name="pvalue" type="float" value="0.05" optional="False" min="0" max="1" - label="P-value to filter on" help="Set the upper limit for the p-value (calculated - by performing an ANOVA analysis on the created model). All models with higher - p-values are discarded." /> - <param name="rsquared" type="float" value="0.95" optional="False" min="0" max="1" - label="R-squared to filter on" help="Set the lower limit for the R-squared; - all models with lower values are discarded." /> - <param name="plot" type="boolean" label="Create a separate plot for each model" - help="This will create a ZIP file in the history containing PDF plots" /> - </inputs> - <code file="match_library.py" /> - <outputs> - <data format="zip" label="Model Graphics of ${on_string}" name="model_graphics" > - <filter>(plot)</filter> - </data> - <data format="tabular" label="Regression logfile of ${on_string}" name="out_log" /> - <data format="tabular" label="Regression model of ${on_string}" name="out_model" /> - </outputs> - <help> -Calculates regression models for all permutations of the GC columns contained in the selected -RI database file. The method used for creating the model is either based on a 3rd-degree -polynomial or a standard linear model. - -The *Minimum number of residuals* option will only allow regression if the columns it is based -on have at least that number of datapoints on the same compound. - -Filtering is possible by setting an upper limit for the *p-value* and/or a lower limit for -the *R-squared* value. The produced logfile will state how many models have been discarded due -to this filtering. The output model file also includes the p-value and R-squared value for -each created model. - -Graphical output of the models is available by selecting the plot option, which shows the -data points used for the model as well as the fit itself and the range of data that will -be usable. - -.. class:: infomark - -**Notes** - -The output file produced by this tool is required as input for the CasLookup tool when -selecting to apply regression when finding hits in the RIDB. - </help> -</tool> |
b |
diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.py --- a/export_to_metexp_tabular.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,247 +0,0 @@\n-#!/usr/bin/env python\n-# encoding: utf-8\n-\'\'\'\n-Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust\n-into a tabular file that can be uploaded to the MetExp database.\n-\n-RankFilter, CasLookup are already combined by combine_output.py so here we will use\n-this result. Furthermore here one of the MsClust\n-quantification files containing the respective spectra details are to be combined as well. \n-\n-Extra calculations performed:\n-- The column MW is also added here and is derived from the column FORMULA found \n- in RankFilter, CasLookup combined result. \n- \n-So in total here we merge 2 files and calculate one new column. \n-\'\'\'\n-from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n-import csv\n-import re\n-import sys\n-from collections import OrderedDict\n-\n-__author__ = "Pieter Lukasse"\n-__contact__ = "pieter.lukasse@wur.nl"\n-__copyright__ = "Copyright, 2013, Plant Research International, WUR"\n-__license__ = "Apache v2"\n-\n-def _process_data(in_csv, delim=\'\\t\'):\n- \'\'\'\n- Generic method to parse a tab-separated file returning a dictionary with named columns\n- @param in_csv: input filename to be parsed\n- \'\'\'\n- data = list(csv.reader(open(in_csv, \'rU\'), delimiter=delim))\n- header = data.pop(0)\n- # Create dictionary with column name as key\n- output = OrderedDict()\n- for index in xrange(len(header)):\n- output[header[index]] = [row[index] for row in data]\n- return output\n-\n-ONE_TO_ONE = \'one_to_one\'\n-N_TO_ONE = \'n_to_one\'\n-\n-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):\n- \'\'\'\n- Merges data from both input dictionaries based on the link fields. This method will\n- build up a new list containing the merged hits as the items. \n- @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)\n- @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)\n- \'\'\'\n- # TODO test for correct input files -> same link_field values should be there \n- # (test at least number of unique link_field values):\n- #\n- # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):\n- # raise Exception(\'input files should have the same nr of key values \')\n- \n- \n- merged = []\n- processed = {}\n- for link_field_set1_idx in xrange(len(set1[link_field_set1])):\n- link_field_set1_value = set1[link_field_set1][link_field_set1_idx]\n- if not link_field_set1_value in processed :\n- # keep track of processed items to not repeat them\n- processed[link_field_set1_value] = link_field_set1_value\n-\n- # Get the indices for current link_field_set1_value in both data-structures for proper matching\n- set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]\n- set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]\n- # Validation :\n- if len(set2index) == 0:\n- # means that corresponding data could not be found in set2, then throw error\n- raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + \n- link_field_set1_value + " only found in first dataset. 
")\n- \n- merged_hits = []\n- # Combine hits\n- for hit in xrange(len(set1index)):\n- # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n- # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n- # corresponding value in the sets; i.e. \n- # set1[key] => returns the list/array with size = nrrows, with the values for the attribute\n- '..b'cation of reference standard\n- record.append(\'0\')\n- record.append(\'\') \n- \n- return record\n-\n-\n-def get_molecular_mass(formula):\n- \'\'\'\n- Calculates the molecular mass (MM). \n- E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O\n- \'\'\'\n- \n- # Each element is represented by a capital letter, followed optionally by \n- # lower case, with one or more digits as for how many elements:\n- element_pattern = re.compile("([A-Z][a-z]?)(\\d*)")\n-\n- total_mass = 0\n- for (element_name, count) in element_pattern.findall(formula):\n- if count == "":\n- count = 1\n- else:\n- count = int(count)\n- element_mass = float(elements_and_masses_map[element_name]) # "found: Python\'s built-in float type has double precision " (? check if really correct ?)\n- total_mass += element_mass * count\n- \n- return total_mass\n- \n- \n-\n-def _save_data(data, headers, out_csv):\n- \'\'\'\n- Writes tab-separated data to file\n- @param data: dictionary containing merged dataset\n- @param out_csv: output csv file\n- \'\'\'\n-\n- # Open output file for writing\n- outfile_single_handle = open(out_csv, \'wb\')\n- output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n-\n- # Write headers\n- output_single_handle.writerow(headers)\n-\n- # Write \n- for item_idx in xrange(len(data)):\n- for hit in data[item_idx]:\n- output_single_handle.writerow(hit)\n-\n-\n-def _get_map_for_elements_and_masses(elements_and_masses):\n- \'\'\'\n- This method will read out the column \'Chemical symbol\' and make a map \n- of this, storing the column \'Relative atomic mass\' as its value\n- \'\'\'\n- resultMap = {}\n- index = 0\n- for entry in elements_and_masses[\'Chemical symbol\']:\n- resultMap[entry] = elements_and_masses[\'Relative atomic mass\'][index]\n- index += 1\n- \n- return resultMap\n-\n-\n-def init_elements_and_masses_map():\n- \'\'\'\n- Initializes the lookup map containing the elements and their respective masses\n- \'\'\'\n- elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))\n- global elements_and_masses_map\n- elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)\n- \n-\n-def main():\n- \'\'\'\n- Combine Output main function\n- \n- RankFilter, CasLookup are already combined by combine_output.py so here we will use\n- this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust\n- quantification files are to be combined with combine_output.py result as well. 
\n- \'\'\'\n- rankfilter_and_caslookup_combined_file = sys.argv[1]\n- msclust_quantification_and_spectra_file = sys.argv[2]\n- output_csv = sys.argv[3]\n- # metadata\n- metadata = OrderedDict()\n- metadata[\'organism\'] = sys.argv[4]\n- metadata[\'tissue\'] = sys.argv[5]\n- metadata[\'experiment_name\'] = sys.argv[6]\n- metadata[\'user_name\'] = sys.argv[7]\n- metadata[\'column_type\'] = sys.argv[8]\n-\n- # Read RankFilter and CasLookup output files\n- rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)\n- msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, \',\')\n- \n- # Read elements and masses to use for the MW/MM calculation :\n- init_elements_and_masses_map()\n- \n- merged = _merge_data(rankfilter_and_caslookup_combined, \'Centrotype\', \n- msclust_quantification_and_spectra, \'centrotype\', \n- _compare_records, _merge_records, metadata,\n- N_TO_ONE)\n- headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + [\'MM\',\'MW\', \'Level of identification\', \'Location of reference standard\']\n- _save_data(merged, headers, output_csv)\n-\n-\n-if __name__ == \'__main__\':\n- main()\n' |
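The MW calculation in get_molecular_mass above boils down to summing element masses parsed from the FORMULA string. A minimal standalone sketch of the same idea, with a hard-coded mass map standing in for static_resources/elements_and_masses.tab (the masses shown are standard relative atomic masses; only a subset is included for illustration):

    import re

    masses = {'H': 1.00794, 'C': 12.0107, 'O': 15.9994}  # illustrative subset
    # each element is a capital letter optionally followed by lower case, then an optional count:
    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")

    def molecular_mass(formula):
        total = 0.0
        for element, count in element_pattern.findall(formula):
            total += masses[element] * (int(count) if count else 1)
        return total

    print molecular_mass('H2O')      # ~18.015
    print molecular_mass('C6H12O6')  # ~180.156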
b |
diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.xml --- a/export_to_metexp_tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,75 +0,0 @@ -<tool id="export_to_metexp_tabular" - name="METEXP - Tabular file" - version="0.2.0"> - <description>Create tabular file for loading into METabolomics EXPlorer database</description> - <command interpreter="python"> - export_to_metexp_tabular.py - $rankfilter_and_caslookup_combi - $msclust_quant_file - $output_result - "$organism" - "$tissue" - "$experiment_name" - "$user_name" - "$column_type" - </command> - <inputs> - <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output" - help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/> - <param format="tabular" name="msclust_quant_file" type="data" label="MsClust quantification file output" - help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/> - - - <param name="organism" type="text" size="80" - label="Organism(s) info" - help="Metadata information to accompany the results when stored in MetExp DB." > - <validator type="empty_field" message="A value is required."></validator><!-- attribute optional="False" does not seem to work for params so validator is added --> - </param> - - <param name="tissue" type="text" size="80" - label="Tissue(s) info" - help="Metadata information to accompany the results when stored in MetExp DB." > - <validator type="empty_field" message="A value is required."></validator> - </param> - - <param name="experiment_name" type="text" size="80" - label="Experiment name/code" - help="Name or code to store the results under. This can help you find the results back in MetExpDB." > - <validator type="empty_field" message="A value is required."></validator> - </param> - - <param name="user_name" type="text" size="80" - label="User name" - help="User name or code to store the results under. This can help you find the results back in MetExpDB." > - <validator type="empty_field" message="A value is required."></validator> - </param> - - <param name="column_type" type="text" size="80" - label="Column type" - help="Column type to report with the results. This can help you find the results back in MetExpDB." > - <validator type="empty_field" message="A value is required."></validator> - </param> - - </inputs> - <outputs> - <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" /> - </outputs> - <help> -.. class:: infomark - -Tool to combine output from the tools RankFilter, CasLookup and MsClust -into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. - -RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here we use -that result. - -**Notes** - -Extra calculations performed: -- The columns MM and MW are also added here and are derived from the column FORMULA found in the combined RankFilter/CasLookup result. - -So in total here we merge 2 files and calculate two new columns. - - - </help> -</tool> |
b |
diff -r 41f122255d14 -r 4393f982d18f library_lookup.py --- a/library_lookup.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,327 +0,0 @@\n-\'\'\'\n-Logic for searching a Retention Index database file given output from NIST\n-\'\'\'\n-import match_library\n-import re\n-import sys\n-import csv\n-\n-__author__ = "Marcel Kempenaar"\n-__contact__ = "brs@nbic.nl"\n-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"\n-__license__ = "MIT"\n-\n-def create_lookup_table(library_file, column_type_name, statphase):\n- \'\'\'\n- Creates a dictionary holding the contents of the library to be searched\n- @param library_file: library to read\n- @param column_type_name: the columns type name\n- @param statphase: the columns stationary phase\n- \'\'\'\n- (data, header) = match_library.read_library(library_file)\n- # Test for presence of required columns\n- if (\'columntype\' not in header or\n- \'columnphasetype\' not in header or\n- \'cas\' not in header):\n- raise IOError(\'Missing columns in \', library_file)\n-\n- column_type_column = header.index("columntype")\n- statphase_column = header.index("columnphasetype")\n- cas_column = header.index("cas")\n-\n- filtered_library = [line for line in data if line[column_type_column] == column_type_name\n- and line[statphase_column] == statphase]\n- lookup_dict = {}\n- for element in filtered_library:\n- # Here the cas_number is set to the numeric part of the cas_column value, so if the \n- # cas_column value is \'C1433\' then cas_number will be \'1433\'\n- cas_number = str(re.findall(r\'\\d+\', (element[cas_column]).strip())[0])\n- try:\n- lookup_dict[cas_number].append(element)\n- except KeyError:\n- lookup_dict[cas_number] = [element]\n- return lookup_dict\n-\n-\n-def _preferred(hits, pref, ctype, polar, model, method):\n- \'\'\'\n- Returns all entries in the lookup_dict that have the same column name, type and polarity\n- as given by the user, uses regression if selected given the model and method to use. The\n- regression is applied on the column with the best R-squared value in the model\n- @param hits: all entries in the lookup_dict for the given CAS number\n- @param pref: preferred GC-column, can be one or more names\n- @param ctype: column type (capillary etc.)\n- @param polar: polarity (polar / non-polar etc.)\n- @param model: data loaded from file containing regression models\n- @param method: supported regression method (i.e. 
poly(nomial) or linear)\n- \'\'\'\n- match = []\n- for column in pref:\n- for hit in hits:\n- if hit[4] == ctype and hit[5] == polar and hit[6] == column:\n- # Create copy of found hit since it will be altered downstream\n- match.extend(hit)\n- return match, False\n-\n- # No hit found for current CAS number, return if not performing regression\n- if not model:\n- return False, False\n-\n- # Perform regression\n- for column in pref:\n- if column not in model:\n- break\n- # Order regression candidates by R-squared value (last element)\n- order = sorted(model[column].items(), key=lambda col: col[1][-1])\n- # Create list of regression candidate column names\n- regress_columns = list(reversed([column for (column, _) in order]))\n- # Names of available columns\n- available = [hit[6] for hit in hits]\n- \n- # TODO: combine Rsquared and number of datapoints to get the best regression match\n- \'\'\'\n- # Iterate regression columns (in order) and retrieve their models\n- models = {}\n- for col in regress_columns:\n- if col in available:\n- hit = list(hits[available.index(col)])\n- if hit[4] == ctype:\n- # models contains all model data including residuals [-2] and rsquared [-1]\n- models[pref[0]] = model[pref[0]][hit[6]] \n- # Get the combined maximum for residuals and rsquared\n- best_match = models[]\n- # Apply regression\n- if me'..b'se:\n- found_hit[-1] = \'0\'\n- data.append(found_hit)\n- found_hit = \'\'\n- else:\n- data.append(default_hit(row, casf, compound_id))\n- else:\n- data.append(default_hit(row, casf, compound_id))\n- \n- casf = \'\'\n- compound_id = \'\'\n- found_hit = []\n- dups = []\n- return data\n-\n-\n-def _save_data(content, outfile):\n- \'\'\'\n- Write to output file\n- @param content: content to write\n- @param outfile: file to write to\n- \'\'\'\n- # header\n- header = [\'CAS\',\n- \'NAME\',\n- \'FORMULA\',\n- \'RI\',\n- \'Column.type\',\n- \'Column.phase.type\',\n- \'Column.name\',\n- \'phase.coding\',\n- \'CAS_column.Name\',\n- \'Centrotype\',\n- \'Regression.Column.Name\',\n- \'min\',\n- \'max\',\n- \'nr.duplicates\']\n- output_handle = csv.writer(open(outfile, \'wb\'), delimiter="\\t")\n- output_handle.writerow(header)\n- for entry in content:\n- output_handle.writerow(entry)\n-\n-\n-def _read_model(model_file):\n- \'\'\'\n- Creates an easy to search dictionary for getting the regression parameters\n- for each valid combination of GC-columns\n- @param model_file: filename containing the regression models\n- \'\'\'\n- regress = list(csv.reader(open(model_file, \'rU\'), delimiter=\'\\t\'))\n- if len(regress.pop(0)) > 9:\n- method = \'poly\'\n- else:\n- method = \'linear\'\n-\n- model = {}\n- # Create new dictionary for each GC-column\n- for line in regress:\n- model[line[0]] = {}\n-\n- # Add data\n- for line in regress:\n- if method == \'poly\':\n- model[line[0]][line[1]] = [float(col) for col in line[2:11]]\n- else: # linear\n- model[line[0]][line[1]] = [float(col) for col in line[2:9]]\n-\n- return model, method\n-\n-\n-def _apply_poly_regression(column1, column2, retention_index, model):\n- \'\'\'\n- Calculates a new retention index (RI) value using a given 3rd-degree polynomial\n- model based on data from GC columns 1 and 2\n- @param column1: name of the selected GC-column\n- @param column2: name of the GC-column to use for regression\n- @param retention_index: RI to convert\n- @param model: dictionary containing model information for all GC-columns\n- \'\'\'\n- coeff = model[column1][column2]\n- # If the retention index to convert is within range of the data the 
model is based on, perform regression\n- if coeff[4] < retention_index < coeff[5]:\n- return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + \n- (retention_index * coeff[1]) + coeff[0])\n- else:\n- return False\n-\n-\n-def _apply_linear_regression(column1, column2, retention_index, model):\n- \'\'\'\n- Calculates a new retention index (RI) value using a given linear model based on data\n- from GC columns 1 and 2\n- @param column1: name of the selected GC-column\n- @param column2: name of the GC-column to use for regression\n- @param retention_index: RI to convert\n- @param model: dictionary containing model information for all GC-columns\n- \'\'\'\n- # TODO: No use of limits\n- coeff = model[column1][column2]\n- return coeff[1] * retention_index + coeff[0]\n-\n-\n-def main():\n- \'\'\'\n- Library Lookup main function\n- \'\'\'\n- library_file = sys.argv[1]\n- nist_tabular_filename = sys.argv[2]\n- ctype = sys.argv[3]\n- polar = sys.argv[4]\n- outfile = sys.argv[5]\n- pref = sys.argv[6:-1]\n- regress = sys.argv[-1]\n-\n- if regress != \'False\':\n- model, method = _read_model(regress)\n- else:\n- model, method = False, None\n-\n- lookup_dict = create_lookup_table(library_file, ctype, polar)\n- data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)\n-\n- _save_data(data, outfile)\n-\n-\n-if __name__ == "__main__":\n- main()\n' |
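For clarity, _apply_poly_regression above evaluates a 3rd-degree polynomial, and only when the RI falls inside the range the model was fitted on (coeff[4]..coeff[5]). A sketch with made-up coefficients (coeff[0]..coeff[3] = intercept, linear, quadratic and cubic terms, as in the code above; the values below are illustrative only):

    coeff = [12.1, 0.98, 1.5e-05, -2.0e-09, 600.0, 2400.0]  # illustrative values only

    retention_index = 1500.0
    if coeff[4] < retention_index < coeff[5]:
        print (coeff[3] * retention_index ** 3 + coeff[2] * retention_index ** 2 +
               retention_index * coeff[1] + coeff[0])  # ~1509.1 for these values
    else:
        print False  # outside the fitted range, as in _apply_poly_regression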
b |
diff -r 41f122255d14 -r 4393f982d18f library_lookup.xml --- a/library_lookup.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,75 +0,0 @@ -<tool id="lookup_library" name="RIQC-Lookup RI for CAS numbers in library" version="1.0.2"> - <description>Lookup or estimate the RI using a "known RI values" CAS numbers library</description> - <command interpreter="python"> - library_lookup.py - $library_file - $input - "$col_type" - "$polarity" - $output - #for $ctype in $pref - ${ctype.columntype} - #end for - $regression.model - </command> - <inputs> - <!-- Regarding the <page> items: this blocks the use of this tool in Galaxy workflows. However, - alternatives like wrapping this in conditionals, repeats (to force a refresh_on_change as this option - is not working on its own) failed since the workflow editor does not support refreshes...nor does the - workflow runtime support conditionals or repeats to be set at runtime. See also - galaxy-dev mail thread "when else" in <conditional> ? RE: refresh_on_change : is this a valid attribute? Any other ideas/options??" --> - <page> - <param format="tabular" name="input" type="data" label="NIST identifications as tabular file" - help="Select a tab-delimited NIST metabolite identifications file (converted from PDF)" /> - <param name="library_file" type="select" label="CAS x RI Library file" - help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns " - dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/> - </page> - <page> - <param name="col_type" type="select" label="Select column type" refresh_on_change="true" - display="radio" dynamic_options='get_column_type(library_file)' - help="" /> - </page> - <page> - <param name="polarity" type="select" label="Select polarity" refresh_on_change="true" - display="radio" dynamic_options='filter_column(library_file,col_type)' - help="" /> - </page> - <page> - <conditional name="regression"> - <param name="regression_select" type="boolean" checked="false" label="Apply regression method" - help="If no data for the selected column is present in the database, selecting this option will try - to convert Retention Indices using data from other GC-columns with a regression method. Please - note that only the first given GC-column above will be used for this; any alternatives will be - ignored" /> - <when value="true"> - <param name="model" format="tabular" type="data" label="Tabular file containing regression model" - help="This file contains the coefficients used to perform the regression from one GC-column - to another GC-column"/> - </when> - <when value="false"> - <param name="model" type="hidden" value="False" /> - </when> - </conditional> - <repeat name="pref" title="Select column name preference"> - <param name="columntype" type="select" label="Column name" refresh_on_change="true" - dynamic_options='filter_column2(library_file, col_type, polarity)' - help="Select one or more column names for filtering. The order defines the priority." /> - </repeat> - </page> - </inputs> - <outputs> - <data format="tabular" label="${tool.name} on" name="output" /> -</outputs> -<code file="match_library.py" /> - <help> -Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library. -If a direct match is NOT found for the preferred column name, a regression can be done to find -the theoretical RI value based on known RI values for the CAS number on other column types (see step 4). -If there is no match for the CAS number on any column type, then the record is not given an RI. - - - - </help> - -</tool> |
b |
diff -r 41f122255d14 -r 4393f982d18f match_library.py --- a/match_library.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,133 +0,0 @@ -''' -Contains functions that are called from Galaxy to populate lists/checkboxes with selectable items -''' -import csv -import glob -import os - - -__author__ = "Marcel Kempenaar" -__contact__ = "brs@nbic.nl" -__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" -__license__ = "MIT" - -def get_column_type(library_file): - ''' - Returns a Galaxy formatted list of tuples containing all possibilities for the - GC-column types. Used by the library_lookup.xml tool - @param library_file: given library file from which the list of GC-column types is extracted - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if 'columntype' not in header: - raise IOError('Missing columns in ', library_file) - - # Filter data on column type - column_type = header.index("columntype") - amounts_in_list_dict = count_occurrence([row[column_type] for row in data]) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()] - - return(galaxy_output) - - -def filter_column(library_file, column_type_name): - ''' - Filters the Retention Index database on column type - @param library_file: file containing the database - @param column_type_name: column type to filter on - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if ('columntype' not in header or - 'columnphasetype' not in header): - raise IOError('Missing columns in ', library_file) - - column_type = header.index("columntype") - statphase = header.index("columnphasetype") - - # Filter data on column type name - statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name] - amounts_in_list_dict = count_occurrence(statphase_list) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()] - - return(sorted(galaxy_output)) - - -def filter_column2(library_file, column_type_name, statphase): - ''' - Filters the Retention Index database on column type and stationary phase - @param library_file: file containing the database - @param column_type_name: column type to filter on - @param statphase: stationary phase of the column to filter on - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if ('columntype' not in header or - 'columnphasetype' not in header or - 'columnname' not in header): - raise IOError('Missing columns in ', library_file) - - column_type_column = header.index("columntype") - statphase_column = header.index("columnphasetype") - column_name_column = header.index("columnname") - - # Filter data on given column type name and stationary phase - statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and - line[statphase_column] == statphase] - amounts_in_list_dict = count_occurrence(statphase_list) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()] - - return(sorted(galaxy_output)) - - -def read_library(filename): - ''' - Reads a tab-separated library file and returns its contents and a normalized header - @param filename: file to read - ''' - data = list(csv.reader(open(filename, 'rU'), delimiter='\t')) - header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)] - return(data, header_clean) - - - -def get_directory_files(dir_name): - ''' - Reads the directory and returns the list of files found as (name, full path, selected) - tuples so that it can fill a Galaxy drop-down combo box. - - ''' - files = glob.glob(dir_name + "/*.*") - if len(files) == 0: - # Configuration error: no library files found in <galaxy-home-dir>/" + dir_name : - galaxy_output = [("Configuration error: expected file not found in <galaxy-home-dir>/" + dir_name, "", False)] - else: - galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files] - return(galaxy_output) - -def get_file_name_no_ext(full_name): - ''' - returns just the last part of the name - ''' - simple_name = os.path.basename(full_name) - base, ext = os.path.splitext(simple_name) - return base - - -def count_occurrence(data_list): - ''' - Counts occurrences in a list and returns a dict with item:occurrence - @param data_list: list to count items from - ''' - return dict((key, data_list.count(key)) for key in set(data_list)) |
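Note the (label, value, selected) tuple format all of these helpers return; it is the shape Galaxy's dynamic_options expects. A quick self-contained illustration built from count_occurrence and the list comprehension used above (input values are made up):

    def count_occurrence(data_list):
        return dict((key, data_list.count(key)) for key in set(data_list))

    counts = count_occurrence(['capillary', 'capillary', 'packed'])
    options = [(str(a) + "(" + str(b) + ")", a, False) for a, b in counts.items()]
    print options
    # e.g. [('capillary(2)', 'capillary', False), ('packed(1)', 'packed', False)]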
b |
diff -r 41f122255d14 -r 4393f982d18f metaMS/INFO_structure_of_LC.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/INFO_structure_of_LC.txt Thu Mar 19 12:22:23 2015 +0100 |
[ |
b'@@ -0,0 +1,532 @@\n+str(LC)\r\n+List of 4\r\n+ $ PeakTable :\'data.frame\': 1828 obs. of 12 variables:\r\n+ ..$ ChemSpiderID : chr [1:1828] "" "" "" "" ...\r\n+ ..$ compound : chr [1:1828] "" "" "" "" ...\r\n+ ..$ pcgroup : num [1:1828] 187 226 226 226 226 226 226 306 154 154 ...\r\n+ ..$ adduct : chr [1:1828] "" "" "" "" ...\r\n+ ..$ isotopes : chr [1:1828] "" "" "" "" ...\r\n+ ..$ mz : num [1:1828] 253 233 214 215 298 ...\r\n+ ..$ rt : num [1:1828] 1.27 1.57 1.59 1.59 1.59 ...\r\n+ ..$ CLASS2 : num [1:1828] 2 2 2 2 2 2 2 2 2 2 ...\r\n+ ..$ STDmix_GC_01 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ...\r\n+ ..$ STDmix_GC_02 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ...\r\n+ ..$ STDmix_RP_pos01: num [1:1828] 144.5 32.6 58.5 38.4 152.3 ...\r\n+ ..$ STDmix_RP_pos02: num [1:1828] 343.4 54.5 79.1 64.9 210.8 ...\r\n+ $ xset :Formal class \'xsAnnotate\' [package "CAMERA"] with 15 slots\r\n+ .. ..@ groupInfo : num [1:1828, 1:13] 30.2 30.5 31.4 32.4 33.4 ...\r\n+ .. .. ..- attr(*, "dimnames")=List of 2\r\n+ .. .. .. ..$ : NULL\r\n+ .. .. .. ..$ : chr [1:13] "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2", "STDmix_GC_01", "STDmix_GC_02", "STDmix_RP_pos01", "STDmix_RP_pos02"\r\n+ \r\n+ and groups() function:\r\n+ "mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2"\r\n+ \r\n+ .. ..@ pspectra :List of 340\r\n+ .. .. ..$ : num [1:12] 2 4 5 10 11 13 14 15 16 38 ...\r\n+ .. .. ..$ : num [1:11] 3 12 45 76 99 ...\r\n+ .. .. ..$ : num [1:15] 7 8 9 113 162 ...\r\n+ .. .. ..$ : Named num 1\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:5] 6 150 473 478 802\r\n+ .. .. ..$ : num [1:5] 18 28 40 64 85\r\n+ .. .. ..$ : num [1:5] 89 528 533 596 872\r\n+ .. .. ..$ : num [1:2] 57 197\r\n+ .. .. ..$ : num [1:3] 174 708 841\r\n+ .. .. ..$ : Named num 23\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:4] 21 34 496 1497\r\n+ .. .. ..$ : num [1:3] 29 30 44\r\n+ .. .. ..$ : num [1:5] 283 423 452 486 527\r\n+ .. .. ..$ : num [1:6] 93 126 251 296 349 406\r\n+ .. .. ..$ : num [1:3] 301 479 664\r\n+ .. .. ..$ : num [1:5] 47 105 111 191 237\r\n+ .. .. ..$ : num [1:5] 185 562 683 923 1548\r\n+ .. .. ..$ : num [1:2] 469 809\r\n+ .. .. ..$ : num [1:9] 24 146 548 550 1009 ...\r\n+ .. .. ..$ : num [1:33] 65 72 134 139 196 200 235 299 355 361 ...\r\n+ .. .. ..$ : num [1:2] 135 234\r\n+ .. .. ..$ : num [1:4] 95 326 330 465\r\n+ .. .. ..$ : num [1:2] 384 699\r\n+ .. .. ..$ : Named num 388\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : Named num 249\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:4] 425 485 636 698\r\n+ .. .. ..$ : Named num 482\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:2] 310 429\r\n+ .. .. ..$ : num [1:97] 33 51 67 96 103 106 114 129 136 140 ...\r\n+ .. .. ..$ : Named num 436\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:3] 583 790 1029\r\n+ .. .. ..$ : Named num 593\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:2] 558 745\r\n+ .. .. ..$ : Named num 543\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:44] 43 60 101 121 124 179 211 214 257 263 ...\r\n+ .. .. ..$ : num [1:2] 243 941\r\n+ .. .. ..$ : num [1:4] 657 996 1286 1344\r\n+ .. .. ..$ : num [1:21] 145 195 199 217 230 268 271 279 665 743 ...\r\n+ .. .. ..$ : Named num 836\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : Named num 694\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : Named num 578\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. 
.. ..$ : Named num 612\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : Named num 684\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:18] 31 163 194 395 441 ...\r\n+ .. .. ..$ : num [1:2] 79 282\r\n+ .. .. ..$ : Named num 297\r\n+ .. .. .. ..- attr(*, "names")= chr ""\r\n+ .. .. ..$ : num [1:10] 167 169 281 284 750 ...\r\n+ .. .. ..$ : num [1:30] 165 796 812 821 829 ...\r\n+ .. .. ..$ : num [1:4] 112 566 733 1552\r\n+ '..b't of 1\r\n+ .. .. ..$ enable: num 0\r\n+ .. ..@ .__classVersion__:Formal class \'Versions\' [package "Biobase"] with 1 slots\r\n+ .. .. .. ..@ .Data:List of 1\r\n+ .. .. .. .. ..$ : int [1:3] 1 13 333\r\n+ $ Annotation:List of 5\r\n+ ..$ annotation.table :\'data.frame\': 27 obs. of 13 variables:\r\n+ .. ..$ feature : int [1:27] 190 193 195 206 208 209 212 367 368 371 ...\r\n+ .. ..$ db_position : int [1:27] 21 22 23 21 23 24 22 1 2 4 ...\r\n+ .. ..$ ChemSpiderID: int [1:27] 10463792 10463792 10463792 10463792 10463792 10463792 10463792 8711 8711 8711 ...\r\n+ .. ..$ mz : num [1:27] 137 348 696 137 696 ...\r\n+ .. ..$ rt : num [1:27] 10.8 10.8 10.8 11.2 11.2 ...\r\n+ .. ..$ I : num [1:27] 10660 3754.5 32.4 5127 58.2 ...\r\n+ .. ..$ compound : Factor w/ 4 levels "D-(+)-Catechin",..: 4 4 4 4 4 4 4 1 1 1 ...\r\n+ .. ..$ db_mz : num [1:27] 137 348 696 137 696 ...\r\n+ .. ..$ db_rt : num [1:27] 10.8 10.8 10.8 10.8 10.8 ...\r\n+ .. ..$ db_I : num [1:27] 33.4 4478.7 58.2 33.4 58.2 ...\r\n+ .. ..$ db_ann : Factor w/ 1 level "automatic": 1 1 1 1 1 1 1 1 1 1 ...\r\n+ .. ..$ mz.err : num [1:27] 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 ...\r\n+ .. ..$ clid : int [1:27] 1 1 1 2 2 2 2 2 2 2 ...\r\n+ ..$ compounds : chr [1:4] "adenosine 2\'-monophosphate" "D-(+)-Catechin" "malvidin-3-glucoside" "rutin "\r\n+ ..$ ChemSpiderIDs : int [1:4] 10463792 8711 391785 4444362\r\n+ ..$ multiple.annotations: num(0) \r\n+ ..$ ann.features : int [1:27] 190 193 195 206 208 209 212 367 368 371 ...\r\n+ $ Settings :Formal class \'metaMSsettings\' [package "metaMS"] with 30 slots\r\n+ .. ..@ protocolName : chr "Synapt.QTOF.RP"\r\n+ .. ..@ chrom : chr "LC"\r\n+ .. ..@ PeakPicking :List of 5\r\n+ .. .. ..$ method : chr "matchedFilter"\r\n+ .. .. ..$ step : num 0.05\r\n+ .. .. ..$ fwhm : num 20\r\n+ .. .. ..$ snthresh: num 4\r\n+ .. .. ..$ max : num 50\r\n+ .. ..@ Alignment :List of 9\r\n+ .. .. ..$ min.class.fraction: num 0.3\r\n+ .. .. ..$ min.class.size : num 3\r\n+ .. .. ..$ mzwid : num 0.1\r\n+ .. .. ..$ bws : num [1:2] 30 10\r\n+ .. .. ..$ missingratio : num 0.2\r\n+ .. .. ..$ extraratio : num 0.1\r\n+ .. .. ..$ retcormethod : chr "linear"\r\n+ .. .. ..$ retcorfamily : chr "symmetric"\r\n+ .. .. ..$ fillPeaks : logi TRUE\r\n+ .. ..@ CAMERA :List of 3\r\n+ .. .. ..$ perfwhm : num 0.6\r\n+ .. .. ..$ cor_eic_th: num 0.7\r\n+ .. .. ..$ ppm : num 5\r\n+ .. ..@ match2DB.rtdiff : num 1.5\r\n+ .. ..@ match2DB.minfeat : num 2\r\n+ .. ..@ match2DB.rtval : num 0.1\r\n+ .. ..@ match2DB.mzdiff : num 0.005\r\n+ .. ..@ match2DB.ppm : num 5\r\n+ .. ..@ match2DB.simthresh : num(0) \r\n+ .. ..@ match2DB.timeComparison : chr(0) \r\n+ .. ..@ match2DB.RIdiff : num(0) \r\n+ .. ..@ DBconstruction.minfeat : num(0) \r\n+ .. ..@ DBconstruction.rttol : num(0) \r\n+ .. ..@ DBconstruction.mztol : num(0) \r\n+ .. ..@ DBconstruction.minintens : num(0) \r\n+ .. ..@ DBconstruction.intensityMeasure : chr(0) \r\n+ .. ..@ DBconstruction.DBthreshold : num(0) \r\n+ .. ..@ matchIrrelevants.irrelevantClasses: chr(0) \r\n+ .. ..@ matchIrrelevants.simthresh : num(0) \r\n+ .. 
..@ matchIrrelevants.timeComparison : chr(0) \r\n+ .. ..@ matchIrrelevants.RIdiff : num(0) \r\n+ .. ..@ matchIrrelevants.rtdiff : num(0) \r\n+ .. ..@ betweenSamples.min.class.fraction : num(0) \r\n+ .. ..@ betweenSamples.min.class.size : num(0) \r\n+ .. ..@ betweenSamples.timeComparison : chr(0) \r\n+ .. ..@ betweenSamples.rtdiff : num(0) \r\n+ .. ..@ betweenSamples.RIdiff : num(0) \r\n+ .. ..@ betweenSamples.simthresh : num(0) \r\n' |
b |
diff -r 41f122255d14 -r 4393f982d18f metaMS/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/README.txt Thu Mar 19 12:22:23 2015 +0100 |
[ |
@@ -0,0 +1,70 @@ +Wrappers for: +- the metaMS R package by Ron Wehrens. +- the xcms package. + +Wrappers written by Pieter Lukasse. + + +Installation (when updating: close all vignettes [i.e. PDFs/manuals] first!) + +In R: +source("http://bioconductor.org/biocLite.R") +biocLite("metaMS") +biocLite("multtest") +#biocLite("R2HTML") +# for "multi-threading" (actually starts multiple R processes for parallel processing): +install.packages("snow") +install.packages("Cairo") + +======Run metaMS_cmd_pick_and_group.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_pick_and_group.r test/extdata.zip test/example_settings.txt test/out/peakTable.txt test/out/xsAnnotatePrep.rdata positive test/out/html_peaks.html test/out + + +======Run metaMS_cmd_annotate.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_annotate.r test/LCDBtest.RData test/out/xsAnnotatePrep.rdata test/example_settings.txt test/out/annotationTable.txt "0" test/out/html_annot.html test/out + + +======Run xcms_differential_analysis.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_differential_analysis.r test/out/xsAnnotatePrep.rdata "CLASS1" "CLASS2" 10 test/out2/outtable.tsv test/out2/html/html.html test/out2/html + + +======Run xcms_get_alignment_eic.r or xcms_get_mass_eic.r with:================= + + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_alignment_eic.r test/out/xsAnnotatePrep.rdata 10 300 3 STDmix_GC_01,STDmix_GC_02 test/out3/html/html.html test/out3/html + +OR + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_mass_eic.r test/out/xsAnnotatePrep.rdata 10 3000 -1 -1 "77.98,231.96" 5 STDmix_GC_01,STDmix_GC_02 Yes raw test/out4/html/html.html test/out4/html + + + +!!!!!!!!!!!!!!!!Troubleshooting:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + +NetCDF is required. If the following is found in the installation.log + +In file included from rnetCDF.c:2:0: +rnetCDF.h:1:20: fatal error: netcdf.h: No such file or directory +compilation terminated. + + +then metaMS will not have been installed and running the tool will result in error: +<simpleError in library(metaMS): there is no package called 'metaMS'> + +Possible solution: +> Install the -dev versions of those packages to get the headers that are + required to compile the package. In this case, you need libnetcdf-dev, udunits-bin and libudunits2-dev + (from http://stackoverflow.com/questions/11319698/how-to-install-r-packages-rnetcdf-and-ncdf-on-ubuntu) +So +>>sudo apt-get install libnetcdf-dev udunits-bin libudunits2-dev + +Cairo / no X11 mode is required. + +Possible solution: +> install Cairo (http://www.cairographics.org/) and/or set CAIRO_CFLAGS/LIBS correspondingly. +So +>>sudo apt-get install libcairo2-dev |
b |
diff -r 41f122255d14 -r 4393f982d18f metaMS/metaMS_cmd_annotate.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metaMS_cmd_annotate.r Thu Mar 19 12:22:23 2015 +0100 |
[ |
@@ -0,0 +1,86 @@ +## read args: +args <- commandArgs(TRUE) +## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" +args.constructedDB <- args[1] +## data file in xset format: +args.xsetData <- args[2] +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- args[3] + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outAnnotationTable <- args[4] + +args.mass_error_function <- args[5] +if (args.mass_error_function == "0") + args.mass_error_function <- NULL +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +cat("\nSettings used===============:\n") +cat(readChar(args.settings, 1e5)) + + +tryCatch( + { + library(metaMS) + + ## load the constructed DB : + tempEnv <- new.env() + testDB <- load(args.constructedDB, envir=tempEnv) + xsetData <- readRDS(args.xsetData) + + ## load settings "script" into "customMetaMSsettings" + source(args.settings, local=tempEnv) + message(paste(" loaded : ", args.settings)) + + # Just to highlight: if you want to use more than one + # trigger runLC: + LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE) + + # write out runLC annotation results: + write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE) + + # the used constructed DB (write to log): + cat("\nConstructed DB info===============:\n") + str(tempEnv[[testDB[1]]]$Info) + cat("\nConstructed DB table===============:\n") + if (length(args) == 8) + { + write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) + write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) + } + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + html <- "<html><body><h1>Summary of annotation results:</h1>" + nrTotalFeatures <- nrow(LC$PeakTable) + nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table) + html <- paste(html,"<p>Total nr of features: ", nrTotalFeatures,"</p>", sep="") + html <- paste(html,"<p>Total nr of annotated features: ", nrAnnotatedFeatures,"</p>", sep="") + + html <- paste(html,"</body><html>") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) |
diff -r 41f122255d14 -r 4393f982d18f metaMS/metaMS_cmd_pick_and_group.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metaMS_cmd_pick_and_group.r Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,92 @@
+## read args:
+args <- commandArgs(TRUE)
+## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files)
+args.dataZip <- args[1]
+## unzip output dir, derived from the zip name (e.g. "data.zip" becomes "data_zipdir"):
+args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep=""))
+dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE)
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r"; should contain an assignment to an object named "customMetaMSsettings"
+args.settings <- args[2]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outPeakTable <- args[3]
+args.xsetOut <- args[4]
+
+# polarity as explicit parameter:
+args.runLC_polarity <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+    args.outLogFile <- args[8]
+    # suppress messages:
+    # Send all STDERR to STDOUT using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+    msg <- file(args.outLogFile, open="wt")
+    sink(msg, type="message")
+    sink(msg, type="output")
+}
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+    {
+        library(metaMS)
+
+        ## load the data files from a zip file
+        files <- unzip(args.dataZip, exdir=args.zipExtrDir)
+
+        ## load settings "script" into "customMetaMSsettings"
+        tempEnv <- new.env()
+        source(args.settings, local=tempEnv)
+        message(paste(" loaded : ", args.settings))
+        allSettings <- tempEnv[["customMetaMSsettings"]]
+
+        # trigger runLC:
+        LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE)
+
+        # write out the runLC peak table:
+        write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE)
+
+        # save xset as rdata:
+        xsAnnotatePreparedData <- LC$xset
+        saveRDS(xsAnnotatePreparedData, file=args.xsetOut)
+
+        message("\nGenerating report.........")
+        # report
+        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+        html <- "<html><body><h1>Info on alignment quality </h1>"
+        # TODO add (nr and mass error) and group size
+
+        message("\nPlotting figures... ")
+        figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="")
+        html <- paste(html,"<img src='figure_retcor.png' /><br/>", sep="")
+        png( figureName, type="cairo", width=1100,height=600 )
+        retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden")
+        html <- paste(html,"<a>*NB: retention time correction plot based on the 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run,
+        but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once
+        this option is added to the metaMS API. </a><br/>", sep="")
+        devname = dev.off()
+
+        # (currently unused; kept for the group-size reporting TODO above)
+        gt <- groups(LC$xset@xcmsSet)
+        groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+
+        html <- paste(html,"</body></html>")
+        message("finished generating report")
+        write(html,file=args.htmlReportFile)
+        cat("\nWarnings================:\n")
+        str( warnings() )
+    },
+    error=function(cond) {
+        sink(NULL, type="message") # restore default message sink
+        sink(stderr(), type="output")
+        message("\nERROR: ===========\n")
+        print(cond)
+    }
)
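The RDATA file written by saveRDS() above is what the downstream xcms_* tools in this changeset reload. A sketch of that round trip (the file name is taken from the readme's test commands):

xs <- readRDS("test/out/xsAnnotatePrep.rdata")
# unwrap to the plain xcmsSet when an annotated object was saved (same check as in the xcms_* scripts):
if ("xcmsSet" %in% slotNames(xs))
    xs <- xs@xcmsSet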
diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_annotate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metams_lcms_annotate.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,121 @@
+<tool id="metams_lcms_annotate" name="METAMS-LC/MS Annotate" version="0.0.4">
+	<description> Runs the metaMS process for LC/MS feature annotation</description>
+	<requirements>
+		<requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+	</requirements>
+	<command interpreter="Rscript">
+		metaMS_cmd_annotate.r
+	    $constructed_db
+	    $xsetData
+	    $customMetaMSsettings
+	    $outputFile
+	    #if $mzTol.mzTolType == "fixed"
+	    	0
+	    #else
+	    	"$mzTol.mass_error_function"
+	    #end if
+	    $htmlReportFile
+	    $htmlReportFile.files_path
+	    $outputLog
+	</command>
+<inputs>
+	<param name="constructed_db" type="select" label="Constructed DB" help="Reference annotation database generated from matching measurements of a mixture of chemical standards
+		against a manually validated reference table which contains the key analytical information for each standard."
+		dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/metaMS")'/>

+	<param name="xsetData" type="data" format="rdata" label="xcmsSet data file (xset RDATA)" help="E.g. output data file resulting from a METAMS 'feature picking, aligning and grouping' run"/>

+	<param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP"
+		help="Choose a name to give to the specific settings in the parameters below"/>

+	<param name="rtdiff" type="float" size="10" value="1.5" label="rtdiff" help="(Annotation) Allowed rt difference (in minutes)"/>

+	<conditional name="mzTol">
+		<param name="mzTolType" type="select" size="30" label="(Annotation) m/z tolerance type">
+			<option value="fixed" selected="true">Fixed tolerance</option>
+			<option value="adaptive" >Adaptive tolerance</option>
+		</param>
+		<when value="fixed">
+			<param name="mzdiff" type="float" size="10" value="0.005" label="mzdiff" help="(Annotation) Fixed mass tolerance" />
+		</when>
+		<when value="adaptive">
+			<param name="ppm" type="float" size="10" value="5.0" label="ppm" help="(Annotation) Tolerance in ppm" />
+			<param name="mass_error_function" type="text" area="true" size="3x70" label="(Annotation) Mass error function"/>
+		</when>
+	</conditional>

+	<param name="rtval" type="float" size="10" value="0.1" label="(max)rtval" help="(Validation) Group items are clustered once more with hierarchical clustering ('complete' method)
+		based on their rt distances. Here one can specify the rt threshold for removing the items that have a too diverging rt (the ones with an rt difference
+		larger than rtval)." />
+	<param name="minfeat" type="integer" size="10" value="2" label="minfeat"
+		help="(Validation) Threshold for the minimum number of features a
+		cluster/group should have (after the rtval filtering above). Other clusters/groups are filtered out." />

+</inputs>
+<configfiles>

+<configfile name="customMetaMSsettings">## start comment
+	## metaMS process settings
+	customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+                                    chrom = "LC")
+metaSetting(customMetaMSsettings, "match2DB") <- list(
+            rtdiff = ${rtdiff},
+            rtval = ${rtval},
+            #if $mzTol.mzTolType == "fixed"
+            mzdiff = ${mzTol.mzdiff},
+            #else
+            ppm = ${mzTol.ppm},
+            #end if
+            minfeat = ${minfeat})</configfile>

+</configfiles>

+<outputs>
+	<data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - metaMS annotated file (TSV)"/>
+	<data name="outputLog" format="txt" label="${tool.name} on ${on_string} - metaMS LOG" hidden="True"/>
+	<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - metaMS report (HTML)"/>
+</outputs>
+<tests>
+	<test>
+	</test>
+</tests>
+<code file="../match_library.py" /> <!-- file containing the get_directory_files function used above-->
+<help>

+.. class:: infomark

+Runs the metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB.
+The figure below shows the main parts of this metaMS process.

+.. image:: $PATH_TO_IMAGES/metaMS_annotate.png


+.. class:: infomark

+The implemented annotation strategy can be broken down into the following steps:

+1. *Feature-wise Annotation:* Each feature detected by runLC is matched against the database. If
+the mass error function is provided, the appropriate m/z tolerance is calculated; otherwise a fixed
+tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the
+basis of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e.
+features which are associated with more than one compound - are possible. This outcome does not
+indicate a problem per se, but is an inherent drawback of co-elution.

+2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all
+the experimental features which are assigned to a specific compound. A specific annotation is
+confirmed only if more than minfeat features which differ in retention time by less than rtval are
+present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The
+latter, indeed, accounts for shifts in retention time between the injection of the standards and the
+metabolomics experiment under investigation. This time can be rather long, considering that the
+standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift
+between the ions of the same compound within the same batch of injections and therefore it only
+has to account for the smaller shifts occurring during peak picking and alignment.


+ </help>
+	<citations>
+		<citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
+	</citations>
+</tool>
\ No newline at end of file
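When the adaptive branch is chosen, the m/z tolerance follows the supplied mass error function; the fixed-ppm arithmetic used elsewhere in this repository (xcms_get_mass_eic.r) reduces to the window below (the mass value is illustrative):

mz  <- 614.0029
ppm <- 5
tol <- mz * ppm / 1000000
c(low = mz - tol, high = mz + tol)   # the m/z window searched for a match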
diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_pick_and_group.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metams_lcms_pick_and_group.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,301 @@
+<tool id="metams_lcms_pick_and_group" name="METAMS-LC/MS Pick, Align and Group" version="0.0.4">
+	<description> Runs the metaMS process for LC/MS feature picking, aligning and grouping</description>
+	<requirements>
+		<requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+	</requirements>
+	<command interpreter="Rscript">
+		metaMS_cmd_pick_and_group.r
+	    $data_files
+	    $customMetaMSsettings
+	    $outputFile
+	    $xsetOut
+	    $polarity
+	    $htmlReportFile
+	    $htmlReportFile.files_path
+	    $outputLog
+	</command>
+<inputs>
+	<param name="data_files" type="data" format="prims.fileset.zip" label="Data files (.zip file with CDF, mzML or mzXML files)" help=".zip file containing the CDF, mzML or mzXML files of the new measurements"/>

+	<param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP"
+		help="Choose a name to give to the specific settings in the parameters below"/><!-- TODO - let user choose this -->

+	<param name="polarity" type="select" size="30" label="polarity"
+		help="Which polarity mode was used for measuring the MS sample">
+		<option value="positive" selected="true">positive</option>
+		<option value="negative" >negative</option>
+	</param>


+	<!-- ===========NB : if peak picking, alignment OR CAMERA settings have to be reused for the runGC wrapper in the future, we can use Galaxy macro expansions here
+	     to avoid defining these parameters again in the runGC wrapper ========================= -->
+	<conditional name="peakPicking">
+		<param name="method" type="select" size="30" label="PEAK PICKING method ====================================================="
+		help="matchedFilter=Feature detection in the chromatographic time domain ; centWave=Feature detection for high resolution LC/MS data">
+			<option value="matchedFilter" selected="true">matchedFilter</option>
+			<option value="centWave" >centWave</option>
+		</param>
+		<when value="matchedFilter">
+			<param name="fwhm" type="integer" size="10" value="20" label="fwhm"
+			help="full width at half maximum of the matched filtration gaussian model peak. Only used to calculate the actual sigma" />
+			<param name="sigma_denom" type="float" size="10" value="2.3548" label="sigma_denominator"
+			help="denominator for the standard deviation (width) of the matched filtration model peak (e.g. sigma = fwhm/2.3548)" />
+			<param name="max" type="integer" size="10" value="50" label="max"
+			help="maximum number of peaks per extracted ion chromatogram" />
+			<param name="snthresh" type="integer" size="10" value="4" label="snthresh"
+			help="signal to noise ratio cutoff" />
+			<param name="step" type="float" size="10" value="0.05" label="step"
+			help="step size to use for profile generation"/>
+			<param name="steps" type="integer" size="10" value="2" label="steps"
+			help="number of steps to merge prior to filtration"/>
+			<param name="mzdiff" type="float" size="10" value="0.8" label="mzdiff"
+			help="minimum difference in m/z for peaks with overlapping retention times"/>
+		</when>
+		<when value="centWave">
+			<param name="ppm" type="integer" size="10" value="25" label="ppm"
+			help="maximal tolerated m/z deviation in consecutive scans, in ppm" />
+			<param name="peakwidth" type="text" size="10" value="20,50" label="peakwidth"
+			help="Chromatographic peak width, given as range (min,max) in seconds" />
+			<param name="snthresh" type="integer" size="10" value="10" label="snthresh"
+			help="signal to noise ratio cutoff" />
+			<param name="prefilter" type="text" size="10" value="3,100" label="prefilter=c(k,I)"
+				help="Prefilter step for the first phase. Mass traces are only retained if
+				they contain at least k peaks with intensity >= I" />
+			<param name="mzCenterFun" type="select" size="30" label="mzCenterFun"
+				help="Function to calculate t
[...]
akPicking.peakwidth}),
+								snthresh = ${peakPicking.snthresh},
+								prefilter = c(${peakPicking.prefilter}),
+								mzCenterFun = "${peakPicking.mzCenterFun}",
+								integrate = ${peakPicking.integrate},
+								mzdiff = ${peakPicking.mzdiff},
+								fitgauss = ${peakPicking.fitgauss},
+								noise = ${peakPicking.noise}),
+							#end if
+                            Alignment = list(
+                                min.class.fraction = ${min_class_fraction},
+                                min.class.size = ${min_class_size},
+                                mzwid = ${mzwid},
+                                bws = c(${bws}),
+                                retcormethod = "${retcor.retcormethod}",
+							#if $retcor.retcormethod == "peakgroups"
+                            	smooth = "${retcor.smooth}",
+                            	missingratio = ${retcor.missingratio},
+                            	extraratio = ${retcor.extraratio},
+                            	retcorfamily = "${retcor.retcorfamily}",
+							#else
+								##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):
+								method = "${retcor.retcormethod}",
+								profStep = ${retcor.profStep},
+							#end if
+                            fillPeaks = ${fillPeaks}),
+                        CAMERA = list(
+                            perfwhm = ${perfwhm},
+                            cor_eic_th = ${cor_eic_th},
+                            ppm= ${ppm},
+                            graphMethod= "${groupCorr_graphMethod}",
+                            pval= ${groupCorr_pval},
+                            calcCiS= ${groupCorr_calcCiS},
+                            calcIso= ${groupCorr_calcIso},
+                            calcCaS= ${groupCorr_calcCaS},
+                            maxcharge= ${findIsotopes_maxcharge},
+                            maxiso= ${findIsotopes_maxiso},
+                            minfrac= ${findIsotopes_minfrac},
+                            multiplier= ${findAdducts_multiplier}
+                        ))</configfile>

+</configfiles>

+<outputs>
+	<data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - peaks table (TSV)"/>
+	<data name="outputLog" format="txt" label="${tool.name} on ${on_string} - LOG" hidden="True"/>
+	<data name="xsetOut" format="rdata" label="${tool.name} on ${on_string} - xcmsSet (RDATA)"/>
+	<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - report (HTML)"/>
+</outputs>
+<tests>
+	<test>
+	</test>
+</tests>
+<help>

+.. class:: infomark

+Runs the metaMS process for LC/MS feature picking, aligning and grouping.
+This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.
+CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.
+The figure below shows the main parts of the metaMS process wrapped by this tool.

+.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png


+From the CAMERA documentation:

+.. image:: $PATH_TO_IMAGES/CAMERA_results.png

+**References**

+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following paper:

+Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
+Journal of Chromatography B, 966: 109-116.
+doi: 10.1016/j.jchromb.2014.02.051
+handle: http://hdl.handle.net/10449/24012

+Wrapper by Pieter Lukasse.


+ </help>
+    <citations>
+        <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
+    </citations>
+</tool>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/next.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/next.r Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,9 @@
+# next.r: scratch ideas for a next version; useful for plotting the CAMERA group pseudo-spectra:
+plotPsSpectrum( LC$xset,320,maxlabel=10)
+# 320 is the group id
+plotEICs( LC$xset,35,maxlabel=5)


+# could use this for the items identified by the "annotate" step.
+# could also integrate this into the getEIC tool (add an option to give a group list instead of an m/z list)...but then rt and other
+# options are also not so relevant...maybe a separate tool is clearer
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/tool_dependencies.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tool_dependency>
+<!-- see also http://wiki.galaxyproject.org/ToolShedToolFeatures for syntax help
+ -->
+    <package name="R_bioc_metams" version="3.1.1">
+        <repository changeset_revision="4b30bdaf4dbd" name="prims_metabolomics_r_dependencies" owner="pieterlukasse" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <readme>
+    This dependency:
+     - ensures that the R 3.1.1 installation is triggered (via a dependency);
+     - ensures that Bioconductor 3.0 and the packages metaMS, multtest and snow are installed.
+    </readme>
+</tool_dependency>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_differential_analysis.r Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,85 @@
+## read args:
+args <- commandArgs(TRUE)
+## an xcms xset saved as .RData
+args.xsetData <- args[1]
+
+args.class1 <- args[2]
+args.class2 <- args[3]
+
+args.topcount <- strtoi(args[4])
+
+args.outTable <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+    args.outLogFile <- args[8]
+    # suppress messages:
+    # Send all STDERR to STDOUT using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+    msg <- file(args.outLogFile, open="wt")
+    sink(msg, type="message")
+    sink(msg, type="output")
+}
+
+tryCatch(
+    {
+        library(metaMS)
+        library(xcms)
+        #library("R2HTML")
+
+        # load the xset data :
+        xsetData <- readRDS(args.xsetData)
+        # this `if` is here to support both scenarios (plain xcmsSet or annotated object):
+        if ("xcmsSet" %in% slotNames(xsetData) )
+        {
+            xsetData <- xsetData@xcmsSet
+        }
+
+        # info: levels(xcmsSet@phenoData$class) also gives access to the class names
+        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+        # set cairo as default for png plots (no X11 needed):
+        png = function (...) grDevices::png(...,type='cairo')
+        # run diffreport
+        reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640)
+
+        # write out tsv table:
+        write.table(reporttab, args.outTable, sep="\t", row.names=FALSE)
+
+        message("\nGenerating report.........")
+
+        cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile)
+        #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile)
+        figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="")
+        message(figuresPath)
+        listOfFiles <- list.files(path = figuresPath)
+        for (i in seq_along(listOfFiles))
+        {
+            figureName <- listOfFiles[i]
+            # the EIC and box plots share file names; only the sub-directory differs:
+            cat(paste("<img src='fig_eic/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE)
+            cat(paste("<img src='fig_box/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE)
+        }
+        cat("</body></html>", file= args.htmlReportFile, append=TRUE)
+
+        message("finished generating report")
+        cat("\nWarnings================:\n")
+        str( warnings() )
+    },
+    error=function(cond) {
+        sink(NULL, type="message") # restore default message sink
+        sink(stderr(), type="output")
+        message("\nERROR: ===========\n")
+        print(cond)
+    }
)
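For reference, the core diffreport call above can be reproduced interactively; a sketch (the xcmsSet xs and the class labels "WT"/"KO" are hypothetical):

library(xcms)
# xs: an xcmsSet whose phenoData contains the classes "WT" and "KO"
tab <- diffreport(xs, "WT", "KO", filebase = "fig",
                  eicmax = 10,               # like args.topcount: EICs for the top 10 features
                  metlin = 0.15, h = 480, w = 640)
head(tab[, c("fold", "tstat", "pvalue")])    # sorted by p-value, most significant first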
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_differential_analysis.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,55 @@
+<tool id="xcms_differential_analysis" name="XCMS Differential Analysis" version="0.0.4">
+	<description> Runs the xcms diffreport function for differential analysis</description>
+	<requirements>
+		<requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+	</requirements>
+	<command interpreter="Rscript">
+		xcms_differential_analysis.r
+	    $xsetData
+	    "$class1"
+	    "$class2"
+	    $topcount
+	    $outTable
+	    $htmlReportFile
+	    $htmlReportFile.files_path
+	    $outLogFile
+	</command>
+<inputs>

+	<param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from a METAMS run"/>

+	<param name="class1" type="text" size="30" label="Class1 name" value="" help="Name of the first class for the comparison"/>
+	<param name="class2" type="text" size="30" label="Class2 name" value="" help="Name of the second class for the comparison"/>

+	<param name="topcount" type="integer" size="10" value="10" label="Number of items to return" help="Top X differential items. E.g. if 10, it will return the top 10 differential items." />

+</inputs>
+<outputs>
+	<data name="outTable" format="tabular" label="${tool.name} on ${on_string} - Top differential items (TSV)"/>
+	<data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+	<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - differential report (HTML)"/>
+</outputs>
+<tests>
+	<test>
+	</test>
+</tests>
+<help>

+.. class:: infomark

+Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
+the most significant differences. The figure below shows an example of such an EIC.

+.. image:: $PATH_TO_IMAGES/diffreport.png


+ </help>
+    <citations>
+        <citation type="doi">10.1021/ac051437y</citation> <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
+    </citations>
+</tool>
\ No newline at end of file
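The two class names entered here must match the sample classes stored in the xcmsSet; per the comment in the R script above, they can be listed like this (the input file name is hypothetical):

xs <- readRDS("xset.rdata")
if ("xcmsSet" %in% slotNames(xs)) xs <- xs@xcmsSet
levels(xs@phenoData$class)   # valid values for Class1/Class2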
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_alignment_eic.r Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,153 @@
+# shows all alignment results in a given rt region
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+# limit the max rt window to 600 and minNrSamples to at least 2:
+if (args.rtEnd - args.rtStart > 600)
+    stop("The RT window should be <= 600")
+
+args.minNrSamples <- strtoi(args[4]) # min nr of samples
+if (args.minNrSamples < 2)
+    stop("Set 'Minimum number of samples' to 2 or higher")
+
+
+args.sampleNames <- strsplit(args[5], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+    args.outLogFile <- args[8]
+    # suppress messages:
+    # Send all STDERR to STDOUT using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+    msg <- file(args.outLogFile, open="wt")
+    sink(msg, type="message")
+    sink(msg, type="output")
}
+
+
+
+tryCatch(
+    {
+        library(metaMS)
+
+        # load the xset data :
+        xsetData <- readRDS(args.xsetData)
+        # this `if` is here to support both scenarios (plain xcmsSet or annotated object):
+        if ("xcmsSet" %in% slotNames(xsetData) )
+        {
+            xsetData <- xsetData@xcmsSet
+        }
+
+        # report
+        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+        message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+        write(paste("<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples</h1>"),
+              file=args.htmlReportFile)
+
+        gt <- groups(xsetData)
+        message("\nParsed groups... ")
+        groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # TODO: this should count only the selected samples
+
+        if (length(groupidx1) > 0)
+        {
+            message("\nGetting EIC... ")
+            eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
+            #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
+
+            # from the bioconductor code for getEIC; NB: this assumes the sample indexes used in the data
+            # are ordered after the order of the sample names!!
+            sampNames <- sampnames(xsetData)
+            sampleNamesIdx <- match( args.sampleNames, sampNames)
+            message(paste("Samples: ", sampleNamesIdx))
+
+            #TODO html <- paste(html, "<table><tbody>")
+            message(paste("\nPlotting figures... "))
+
+            # get the mz list (interestingly, this [,"mz"] is relatively slow):
+            peakMzList <- xsetData@peaks[,"mz"]
+            peakSampleList <- xsetData@peaks[,"sample"]
+            # signal to noise list:
+            peakSnList <- xsetData@peaks[,"sn"]
+
+            message(paste("Total nr of peaks: ", length(peakMzList)))
+
+            for (i in 1:length(groupidx1))
+            {
+                groupMembers <- xsetData@groupidx[[groupidx1[i]]]
+
+                groupMzList <- ""
+                groupSampleList <- ""
+                finalGroupSize <- 0
+
+                for (j in 1:length(groupMembers))
+                {
+                    # get only the peaks from the selected samples:
+                    memberSample <- peakSampleList[groupMembers[j]]
+                    memberSn <- peakSnList[groupMembers[j]]
+                    if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
+                    {
+                        message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
+                        memberMz <- peakMzList[groupMembers[j]]
+
+                        if (finalGroupSize == 0)
+                        {
+                            groupMzList <- memberMz
+                            groupSampleList <- sampNames[memberSample]
+                        } else {
+                            groupMzList <- paste(groupMzList,",",memberMz, sep="")
+                            groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="")
+                        }
+
+                        finalGroupSize <- finalGroupSize + 1
+                    }
+                }
+                # here we do the real check on group size; only the groups that have enough
+                # members with signal to noise > 0 will be plotted here:
+                if (finalGroupSize >= args.minNrSamples)
+                {
+                    message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... "))
+
+                    figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+                    write(paste("<img src='", "figure", i,".png' />", sep="") ,
+                          file=args.htmlReportFile, append=TRUE)
+
+                    png( figureName, type="cairo", width=800 )
+                    plot(eiccor, xsetData, groupidx = i)
+                    devname = dev.off()
+
+                    write(paste("<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise not NA:<br/>", groupMzList,"</p>", sep="") ,
+                          file=args.htmlReportFile, append=TRUE)
+                    write(paste("<p>m/z values found in the following samples respectively: <br/>", groupSampleList,"</p>", sep="") ,
+                          file=args.htmlReportFile, append=TRUE)
+                }
+            }
+
+        }
+
+        write("</body></html>",
+              file=args.htmlReportFile, append=TRUE)
+        message("finished generating report")
+        cat("\nWarnings================:\n")
+        str( warnings() )
+    },
+    error=function(cond) {
+        sink(NULL, type="message") # restore default message sink
+        sink(stderr(), type="output")
+        message("\nERROR: ===========\n")
+        print(cond)
+    }
)
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_alignment_eic.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,73 @@
+<tool id="xcms_get_alignment_eic" name="XCMS Get Alignment EICs" version="0.0.4">
+	<description> Extracts alignment EICs from feature alignment data</description>
+	<requirements>
+		<requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+	</requirements>
+	<command interpreter="Rscript">
+		xcms_get_alignment_eic.r
+	    $xsetData
+	    $rtStart
+	    $rtEnd
+	    $minNrSamples
+	    "$sampleNames"
+	    $htmlReportFile
+	    $htmlReportFile.files_path
+	    $outLogFile
+	</command>
<inputs>

+	<param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from a METAMS run"/>

+	<param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of the Retention Time region to plot" />
+	<param name="rtEnd" type="integer" value="" size="10" label="RT end" help="End of the Retention Time region to plot" />

+	<param name="minNrSamples" type="integer" size="10" value="10" label="Minimum number of samples in which an aligned feature should be found" help="Can also be read as: Minimum
+		number of features in an alignment. E.g. if set to 10, it means the alignment should consist of at least 10 peaks from 10 different samples aligned together." />

+	<param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
+		value="sampleName1,sampleName2,etc"
+		help="Comma or line-separated list of sample names. Optional field where you can specify the subset of samples
+		to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time." >
+		<sanitizer>
+			<!-- this translates from a line-separated to a comma-separated list and removes quotes -->
+			<valid/>
+			<mapping initial="none">
+				<add source="&#10;" target=","/>
+				<add source="&#13;" target=""/>
+				<add source="&quot;" target=""/>
+			</mapping>
+		</sanitizer>
+	</param>


+</inputs>
+<outputs>
+	<data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+	<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
+</outputs>
+<tests>
+	<test>
+	</test>
+</tests>
+<help>

+.. class:: infomark

+This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, also showing the difference between
+two classes, with extra alignment information beneath it.

+.. image:: $PATH_TO_IMAGES/diffreport.png

+Alignment id: 1709. m/z list of peaks in alignment:
+614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469


+ </help>
+    <citations>
+        <citation type="doi">10.1021/ac051437y</citation> <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
+    </citations>
+</tool>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_mass_eic.r Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,162 @@
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+args.mzStart <- as.double(args[4])
+args.mzEnd <- as.double(args[5])
+# there are 2 options: specify an mz range or an mz list:
+if (args.mzStart < 0)
+{
+    args.mzList <- as.double(strsplit(args[6], ",")[[1]])
+    args.mzTolPpm <- as.double(args[7])
+    # calculate the mz window ends based on the ppm tolerance:
+    mzListEnd <- c()
+    mzListStart <- c()
+    for (i in 1:length(args.mzList))
+    {
+        mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
+        mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
+        mzListEnd <- c(mzListEnd, mzEnd)
+        mzListStart <- c(mzListStart, mzStart)
+    }
+    str(mzListStart)
+    str(mzListEnd)
+} else {
+    mzListEnd <- c(args.mzEnd)
+    mzListStart <- c(args.mzStart)
+}
+
+args.sampleNames <- strsplit(args[8], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+args.combineSamples <- args[9]
+args.rtPlotMode <- args[10]
+
+## report files
+args.htmlReportFile <- args[11]
+args.htmlReportFile.files_path <- args[12]
+
+
+if (length(args) == 13)
+{
+    args.outLogFile <- args[13]
+    # suppress messages:
+    # Send all STDERR to STDOUT using sink(), see http://mazamascience.com/WorkingWithData/?p=888
+    msg <- file(args.outLogFile, open="wt")
+    sink(msg, type="message")
+    sink(msg, type="output")
+}
+
+# TODO - add option to draw masses in the same plot (if given on the same line or similar) or in separate plots
+# TODO2 - let it run in parallel
+
+tryCatch(
+    {
+        library(metaMS)
+
+        # load the xset data :
+        xsetData <- readRDS(args.xsetData)
+        # this `if` is here to support both scenarios (plain xcmsSet or annotated object):
+        if ("xcmsSet" %in% slotNames(xsetData) )
+        {
+            xsetData <- xsetData@xcmsSet
+        }
+
+        # report
+        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+        message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+        html <- "<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>"
+
+        if (args.combineSamples == "No")
+        {
+            if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
+                stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
+                           " masses while ", length(args.sampleNames), " sample names were given."))
+
+            iterSize <- length(args.sampleNames)
+            # these can be set to 1 or 0 as a trick to iterate OR not over the items:
+            # if the respective list is of length 1, only its first item should be used
+            fixSampleIdx <- 1
+            fixMzListIdx <- 1
+            if (length(args.sampleNames) == 1)
+            {
+                fixSampleIdx <- 0
+                iterSize <- length(mzListStart)
+            }
+            if (length(mzListStart) == 1)
+            {
+                fixMzListIdx <- 0
+            }
+            lineColors <- rainbow(iterSize)
+            for (i in 0:(iterSize-1))
+            {
+                message("\nGetting EIC... ")
+                eiccor <- getEIC(xsetData,
+                                 mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE),
+                                 rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+                                 sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
+
+                message("\nPlotting figures... ")
+                figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+                html <- paste(html,"<img src='", "figure", i,".png' /><br/>", sep="")
+                png( figureName, type="cairo", width=1100,height=250 )
+                #plot(eiccor, col=lineColors[i+1])
+                # black is better in this case:
+                plot(eiccor)
+                legend('topright', # places a legend at the appropriate place
+                       legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend
+                       lty=c(1,1), # gives the legend appropriate symbols (lines)
+                       lwd=c(2.5,2.5))
+                devname = dev.off()
+            }
+
+        } else {
+            for (i in 1:length(mzListStart))
+            {
+                message("\nGetting EIC... ")
+                eiccor <- getEIC(xsetData,
+                                 mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE),
+                                 rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+                                 sampleidx=args.sampleNames, rt = args.rtPlotMode)
+
+                # TODO: set size, set option (plot per sample, plot per mass)
+
+                message("\nPlotting figures... ")
+                figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+                html <- paste(html,"<img src='", "figure", i,".png' />", sep="")
+                png( figureName, type="cairo", width=1100,height=450 )
+                lineColors <- rainbow(length(args.sampleNames))
+                plot(eiccor, col=lineColors)
+                legend('topright', # places a legend at the appropriate place
+                       legend=args.sampleNames, # puts text in the legend
+                       lty=c(1,1), # gives the legend appropriate symbols (lines)
+                       lwd=c(2.5,2.5),
+                       col=lineColors)
+                devname = dev.off()
+            }
+        }
+        if (args.rtPlotMode == "corrected")
+        {
+            html <- paste(html,"<p>*rt values are corrected ones</p>")
+        }
+        html <- paste(html,"</body></html>")
+        message("finished generating report")
+        write(html,file=args.htmlReportFile)
+        cat("\nWarnings================:\n")
+        str( warnings() )
+    },
+    error=function(cond) {
+        sink(NULL, type="message") # restore default message sink
+        sink(stderr(), type="output")
+        message("\nERROR: ===========\n")
+        print(cond)
+    }
)
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_mass_eic.xml Thu Mar 19 12:22:23 2015 +0100 |
@@ -0,0 +1,117 @@
+<tool id="xcms_get_mass_eic" name="XCMS Get EICs" version="0.0.4">
+	<description> Extracts EICs for a given list of masses</description>
+	<requirements>
+		<requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+	</requirements>
+	<command interpreter="Rscript">
+		xcms_get_mass_eic.r
+	    $xsetData
+	    $rtStart
+	    $rtEnd
+	    #if $massParameters.massParametersType == "window"
+	    	$massParameters.mzStart
+	    	$massParameters.mzEnd
+	    	-1
+	    	"."
+	    #else
+	    	-1
+	    	-1
+	    	"$massParameters.mzList"
+	    	$massParameters.mzTolPpm
+	    #end if
+	    "$sampleNames"
+	    $combineSamples
+	    $rtPlotMode
+	    $htmlReportFile
+	    $htmlReportFile.files_path
+	    $outLogFile
+	</command>
<inputs>

+	<param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from a METAMS run"/>

+	<param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of the Retention Time region to plot" />
+	<param name="rtEnd" type="integer" value="" size="10" label="RT end" help="End of the Retention Time region to plot" />

+	<conditional name="massParameters">
+		<param name="massParametersType" type="select" size="50" label="Give masses as" >
+			<option value="list" selected="true">m/z list</option>
+			<option value="window" >m/z window</option>
+		</param>
+		<when value="list">
+			<param name="mzList" type="text" area="true" size="7x70" label="m/z list"
+				help="Comma or line-separated list of m/z values for which to plot an EIC. One EIC will be plotted for each m/z given here.">
+				<sanitizer>
+					<!-- this translates from a line-separated to a comma-separated list and removes quotes -->
+					<valid/>
+					<mapping initial="none">
+						<add source="&#10;" target=","/>
+						<add source="&#13;" target=""/>
+						<add source="&quot;" target=""/>
+					</mapping>
+				</sanitizer>
+			</param>
+			<param name="mzTolPpm" type="integer" size="10" value="5" label="m/z tolerance (ppm)" />
+		</when>
+		<when value="window">
+			<param name="mzStart" type="float" value="" size="10" label="m/z start" help="Start of the m/z window" />
+			<param name="mzEnd" type="float" value="" size="10" label="m/z end" help="End of the m/z window" />
+		</when>
+	</conditional>

+	<param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
+		value="sampleName1,sampleName2,etc"
+		help="Comma or line-separated list of sample names. Here you can specify the subset of samples
+		to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time." >
+		<sanitizer>
+			<!-- this translates from a line-separated to a comma-separated list and removes quotes -->
+			<valid/>
+			<mapping initial="none">
+				<add source="&#10;" target=","/>
+				<add source="&#13;" target=""/>
+				<add source="&quot;" target=""/>
+			</mapping>
+		</sanitizer>
+	</param>

+	<param name="combineSamples" type="select" size="50" label="Combine samples in EIC"
+		help="When combining samples, each plot contains the combined EIC of an m/z in the different samples. When NOT combining, each plot
+		only contains the EIC of the m/z in the respective sample (the sample name from the sample list in the same position as the
+		m/z in the m/z list). Tip: use Yes for visualizing the EIC of grouped masses (MsClust or CAMERA groups); use No for visualizing EICs of the same mass in
+		the different samples.">
+		<option value="No" selected="true">No</option>
+		<option value="Yes" >Yes</option>
+	</param>
+	<param name="rtPlotMode" type="select" size="50" label="RT plot mode"
+		help="Select whether the EIC should be plotted against the raw or the corrected Retention Time">
+		<option value="raw" selected="true">Raw</option>
+		<option value="corrected" >Corrected</option>
+	</param>
+</inputs>
+<outputs>
+	<data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+	<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
+</outputs>
+<tests>
+	<test>
+	</test>
+</tests>
+<help>

+.. class:: infomark

+This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.

+.. image:: $PATH_TO_IMAGES/massEIC.png


+ </help>
+    <citations>
+        <citation type="doi">10.1021/ac051437y</citation> <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
+    </citations>
+</tool>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS_cmd_annotate.r --- a/metaMS_cmd_annotate.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
@@ -1,86 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" -args.constructedDB <- args[1] -## data file in xset format: -args.xsetData <- args[2] -## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" -args.settings <- args[3] - -## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" -args.outAnnotationTable <- args[4] - -args.mass_error_function <- args[5] -if (args.mass_error_function == "0") - args.mass_error_function <- NULL -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -cat("\nSettings used===============:\n") -cat(readChar(args.settings, 1e5)) - - -tryCatch( - { - library(metaMS) - - ## load the constructed DB : - tempEnv <- new.env() - testDB <- load(args.constructedDB, envir=tempEnv) - xsetData <- readRDS(args.xsetData) - - ## load settings "script" into "customMetaMSsettings" - source(args.settings, local=tempEnv) - message(paste(" loaded : ", args.settings)) - - # Just to highlight: if you want to use more than one - # trigger runLC: - LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE) - - # write out runLC annotation results: - write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE) - - # the used constructed DB (write to log): - cat("\nConstructed DB info===============:\n") - str(tempEnv[[testDB[1]]]$Info) - cat("\nConstructed DB table===============:\n") - if (length(args) == 8) - { - write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) - write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) - } - - message("\nGenerating report.........") - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - html <- "<html><body><h1>Summary of annotation results:</h1>" - nrTotalFeatures <- nrow(LC$PeakTable) - nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table) - html <- paste(html,"<p>Total nr of features: ", nrTotalFeatures,"</p>", sep="") - html <- paste(html,"<p>Total nr of annotated features: ", nrAnnotatedFeatures,"</p>", sep="") - - html <- paste(html,"</body><html>") - message("finished generating report") - write(html,file=args.htmlReportFile) - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) |
diff -r 41f122255d14 -r 4393f982d18f metaMS_cmd_pick_and_group.r --- a/metaMS_cmd_pick_and_group.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
@@ -1,92 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" -args.dataZip <- args[1] -args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep="")) -dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE) -## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" -args.settings <- args[2] - -## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" -args.outPeakTable <- args[3] -args.xsetOut <- args[4] - -# polarity as explicit parameter: -args.runLC_polarity <- args[5] - -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] - - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -cat("\nSettings used===============:\n") -cat(readChar(args.settings, 1e5)) - - -tryCatch( - { - library(metaMS) - - ## load the data files from a zip file - files <- unzip(args.dataZip, exdir=args.zipExtrDir) - - ## load settings "script" into "customMetaMSsettings" - tempEnv <- new.env() - source(args.settings, local=tempEnv) - message(paste(" loaded : ", args.settings)) - allSettings <- tempEnv[["customMetaMSsettings"]] - - # trigger runLC: - LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE) - - # write out runLC annotation results: - write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE) - - # save xset as rdata: - xsAnnotatePreparedData <- LC$xset - saveRDS(xsAnnotatePreparedData, file=args.xsetOut) - - message("\nGenerating report.........") - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - html <- "<html><body><h1>Info on alignment quality </h1>" - # TODO add (nr and mass error) and group size - - message("\nPlotting figures... ") - figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="") - html <- paste(html,"<img src='figure_retcor.png' /><br/>", sep="") - png( figureName, type="cairo", width=1100,height=600 ) - retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden") - html <- paste(html,"<a>*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run, - but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once - this option is added in metaMS API. </a><br/>", sep="") - devname = dev.off() - - - gt <- groups(LC$xset@xcmsSet) - groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) - - html <- paste(html,"</body><html>") - message("finished generating report") - write(html,file=args.htmlReportFile) - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) |
diff -r 41f122255d14 -r 4393f982d18f metams_lcms_annotate.xml --- a/metams_lcms_annotate.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
@@ -1,121 +0,0 @@ -<tool id="metams_lcms_annotate" name="METAMS-LC/MS Annotate" version="0.0.4"> - <description> Runs metaMS process for LC/MS feature annotation</description> - <requirements> - <requirement type="package" version="3.1.1">R_bioc_metams</requirement> - </requirements> - <command interpreter="Rscript"> - metaMS_cmd_annotate.r - $constructed_db - $xsetData - $customMetaMSsettings - $outputFile - #if $mzTol.mzTolType == "fixed" - 0 - #else - "$mzTol.mass_error_function" - #end if - $htmlReportFile - $htmlReportFile.files_path - $outputLog - </command> -<inputs> - <param name="constructed_db" type="select" label="Constructed DB" help="Reference annotation database generated from matching measurements of a mixture of chemical standards - against a manually validated reference table which contains the key analytical information for each standard." - dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/metaMS")'/> - - <param name="xsetData" type="data" format="rdata" label="xcmsSet data file (xset RDATA)" help="E.g. output data file resulting from METAMS 'feature picking, aligning and grouping' run"/> - - <param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP" - help="Choose a name to give for the specific settings in the parameters below"/> - - <param name="rtdiff" type="float" size="10" value="1.5" label="rtdiff" help="(Annotation) Allowed rt difference (in minutes)"/> - - <conditional name="mzTol"> - <param name="mzTolType" type="select" size="30" label="(Annotation) m/z tolerance type"> - <option value="fixed" selected="true">Fixed tolerance</option> - <option value="adaptive" >Adaptive tolerance</option> - </param> - <when value="fixed"> - <param name="mzdiff" type="float" size="10" value="0.005" label="mzdiff" help="(Annotation) Fixed mass tolerance" /> - </when> - <when value="adaptive"> - <param name="ppm" type="float" size="10" value="5.0" label="ppm" help="(Annotation) Tolerance in ppm" /> - <param name="mass_error_function" type="text" area="true" size="3x70" label="(Annotation) Mass error function"/> - </when> - </conditional> - - <param name="rtval" type="float" size="10" value="0.1" label="(max)rtval" help="(Validation) Group items are clustered once more with hierarchical clustering ('complete' method) - based on their rt distances. Here one can specify the rt threshold for removing the items that have too diverging rt (the ones with rt difference - larger than rtval). " /> - <param name="minfeat" type="integer" size="10" value="2" label="minfeat" - help="(Validation) Threshold for the minimum number of features a - cluster/group should have (after rtval filtering above). Other clusters/groups are filtered out." 
/> - -</inputs> -<configfiles> - -<configfile name="customMetaMSsettings">## start comment - ## metaMS process settings - customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", - chrom = "LC") -metaSetting(customMetaMSsettings, "match2DB") <- list( - rtdiff = ${rtdiff}, - rtval = ${rtval}, - #if $mzTol.mzTolType == "fixed" - mzdiff = ${mzTol.mzdiff}, - #else - ppm = ${mzTol.ppm}, - #end if - minfeat = ${minfeat})</configfile> - -</configfiles> - -<outputs> - <data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - metaMS annotated file (TSV)"/> - <data name="outputLog" format="txt" label="${tool.name} on ${on_string} - metaMS LOG" hidden="True"/> - <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - metaMS report (HTML)"/> -</outputs> -<tests> - <test> - </test> -</tests> -<code file="match_library.py" /> <!-- file containing get_directory_files function used above--> -<help> - -.. class:: infomark - -Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB. -The figure below shows the main parts of this metaMS process. - -.. image:: $PATH_TO_IMAGES/metaMS_annotate.png - - -.. class:: infomark - -The implemented annotation strategy can be broken down in the following steps: - -1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If -the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed -tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the -bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e. -features which are associated to more than one compound - are possible. This outcome does not -indicate a problem per se, but is an inherent drawback of co-elution. - -2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all -the experimental features which are assigned to a specific compound. A specific annotation is -confirmed only if more than minfeat features which differ in retention time less than rtval are -present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The -latter, indeed, accounts for shifts in retention time between the injection of the standards and the -metabolomics experiment under investigation. This time can be rather long, considering that the -standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift -between the ions of the same compound within the same batch of injections and therefore it has -only to account for the smaller shifts occurring during peak picking and alignment. - - - </help> - <citations> - <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example - see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set - --> - </citations> -</tool> \ No newline at end of file |
diff -r 41f122255d14 -r 4393f982d18f metams_lcms_pick_and_group.xml --- a/metams_lcms_pick_and_group.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
@@ -1,301 +0,0 @@
-<tool id="metams_lcms_pick_and_group" name="METAMS-LC/MS Pick, Align and Group" version="0.0.4">
-    <description>Runs metaMS process for LC/MS feature picking, aligning and grouping</description>
-    <requirements>
-        <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
-    </requirements>
-    <command interpreter="Rscript">
-        metaMS_cmd_pick_and_group.r
-        $data_files
-        $customMetaMSsettings
-        $outputFile
-        $xsetOut
-        $polarity
-        $htmlReportFile
-        $htmlReportFile.files_path
-        $outputLog
-    </command>
-<inputs>
-    <param name="data_files" type="data" format="prims.fileset.zip" label="Data files (.zip file with CDF, mzML or mzXML files)" help=".zip file containing the CDF, mzML or mzXML files of the new measurements"/>
-
-    <param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP"
-        help="Choose a name to give for the specific settings in the parameters below"/><!-- TODO - let user choose this -->
-
-    <param name="polarity" type="select" size="30" label="polarity"
-        help="Which polarity mode was used for measuring of the ms sample">
-        <option value="positive" selected="true">positive</option>
-        <option value="negative">negative</option>
-    </param>
-
-    <!-- =========== NB: if peak picking, alignment OR CAMERA settings have to be reused for the runGC wrapper in the future, we can use Galaxy macro expansions here
-         to avoid defining these parameters again in the runGC wrapper ========================= -->
-    <conditional name="peakPicking">
-        <param name="method" type="select" size="30" label="PEAK PICKING method ====================================================="
-            help="matchedFilter=Feature detection in the chromatographic time domain ; centWave=Feature detection for high resolution LC/MS data">
-            <option value="matchedFilter" selected="true">matchedFilter</option>
-            <option value="centWave">centWave</option>
-        </param>
-        <when value="matchedFilter">
-            <param name="fwhm" type="integer" size="10" value="20" label="fwhm"
-                help="full width at half maximum of the matched filtration gaussian model peak. Only used to calculate the actual sigma" />
-            <param name="sigma_denom" type="float" size="10" value="2.3548" label="sigma_denominator"
-                help="denominator for standard deviation (width) of matched filtration model peak (e.g. sigma = fwhm/2.3548)" />
-            <param name="max" type="integer" size="10" value="50" label="max"
-                help="maximum number of peaks per extracted ion chromatogram" />
-            <param name="snthresh" type="integer" size="10" value="4" label="snthresh"
-                help="signal to noise ratio cutoff" />
-            <param name="step" type="float" size="10" value="0.05" label="step"
-                help="step size to use for profile generation"/>
-            <param name="steps" type="integer" size="10" value="2" label="steps"
-                help="number of steps to merge prior to filtration"/>
-            <param name="mzdiff" type="float" size="10" value="0.8" label="mzdiff"
-                help="minimum difference in m/z for peaks with overlapping retention times"/>
-        </when>
-        <when value="centWave">
-            <param name="ppm" type="integer" size="10" value="25" label="ppm"
-                help="maximal tolerated m/z deviation in consecutive scans, in ppm" />
-            <param name="peakwidth" type="text" size="10" value="20,50" label="peakwidth"
-                help="Chromatographic peak width, given as range (min,max) in seconds" />
-            <param name="snthresh" type="integer" size="10" value="10" label="snthresh"
-                help="signal to noise ratio cutoff" />
-            <param name="prefilter" type="text" size="10" value="3,100" label="prefilter=c(k,I)"
-                help="Prefilter step for the first phase. Mass traces are only retained if
-                they contain at least k peaks with intensity >= I" />
-            <param name="mzCenterFun" type="select" size="30" label="mzCenterFun"
-                help="Function to calculate t[...]

[... middle of this file is truncated in this changeset view: the remaining centWave options, the Alignment and CAMERA input parameters, and the start of the customMetaMSsettings configfile are not shown ...]

-                                [...]${peakPicking.peakwidth}),
-                                snthresh = ${peakPicking.snthresh},
-                                prefilter = c(${peakPicking.prefilter}),
-                                mzCenterFun = "${peakPicking.mzCenterFun}",
-                                integrate = ${peakPicking.integrate},
-                                mzdiff = ${peakPicking.mzdiff},
-                                fitgauss = ${peakPicking.fitgauss},
-                                noise = ${peakPicking.noise}),
-                            #end if
-                            Alignment = list(
-                                min.class.fraction = ${min_class_fraction},
-                                min.class.size = ${min_class_size},
-                                mzwid = ${mzwid},
-                                bws = c(${bws}),
-                                retcormethod = "${retcor.retcormethod}",
-                            #if $retcor.retcormethod == "peakgroups"
-                                smooth = "${retcor.smooth}",
-                                missingratio = ${retcor.missingratio},
-                                extraratio = ${retcor.extraratio},
-                                retcorfamily = "${retcor.retcorfamily}",
-                            #else
-                                ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):
-                                method = "${retcor.retcormethod}",
-                                profStep = ${retcor.profStep},
-                            #end if
-                                fillPeaks = ${fillPeaks}),
-                            CAMERA = list(
-                                perfwhm = ${perfwhm},
-                                cor_eic_th = ${cor_eic_th},
-                                ppm = ${ppm},
-                                graphMethod = "${groupCorr_graphMethod}",
-                                pval = ${groupCorr_pval},
-                                calcCiS = ${groupCorr_calcCiS},
-                                calcIso = ${groupCorr_calcIso},
-                                calcCaS = ${groupCorr_calcCaS},
-                                maxcharge = ${findIsotopes_maxcharge},
-                                maxiso = ${findIsotopes_maxiso},
-                                minfrac = ${findIsotopes_minfrac},
-                                multiplier = ${findAdducts_multiplier}
-                            ))</configfile>
-
-</configfiles>
-
-<outputs>
-    <data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - peaks table (TSV)"/>
-    <data name="outputLog" format="txt" label="${tool.name} on ${on_string} - LOG" hidden="True"/>
-    <data name="xsetOut" format="rdata" label="${tool.name} on ${on_string} - xcmsSet (RDATA)"/>
-    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - report (HTML)"/>
-</outputs>
-<tests>
-    <test>
-    </test>
-</tests>
-<help>
-
-.. class:: infomark
-
-Runs the metaMS process for LC/MS feature picking, aligning and grouping.
-This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.
-CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.
-The figure below shows the main parts of the metaMS process wrapped by this tool.
-
-.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png
-
-
-From CAMERA documentation:
-
-.. image:: $PATH_TO_IMAGES/CAMERA_results.png
-
-**References**
-
-If you use this Galaxy tool in work leading to a scientific publication, please
-cite the following paper:
-
-Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
-metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
-Journal of Chromatography B: biomedical sciences and applications, 996 (1): 109-116.
-doi: 10.1016/j.jchromb.2014.02.051
-handle: http://hdl.handle.net/10449/24012
-
-Wrapper by Pieter Lukasse.
-
-
- </help>
- <citations>
-    <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> <!-- example
-    see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
-    -->
- </citations>
-</tool>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f next.r --- a/next.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-# next script, useful for plotting the CAMERA groups pseudo-spectra:
-plotPsSpectrum(LC$xset, 320, maxlabel=10)
-# 320 is the group id
-plotEICs(LC$xset, 35, maxlabel=5)
-
-
-# could use this for the items identified by the "annotate" step.
-# could also integrate this into the getEIC tool (add an option to give a group list instead of an m/z list)...but then rt and other options
-# are also not so relevant...maybe a separate tool is clearer
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.py --- a/query_mass_repos.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,289 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to query a set of accurate mass values detected by high-resolution mass spectrometers
-against various repositories/services such as the METabolomics EXPlorer database or the
-MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
-
-It will take the input file and for each record it will query the
-molecular mass in the selected repository/service. If one or more compounds are found
-then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected repository/service.
-
-The service should implement the following interface:
-
-http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
-
-The output should be tab separated and should contain the following columns (in this order):
-db-name  molecular-formula  dbe  formula-weight  id  description
-
-'''
-import csv
-import sys
-import fileinput
-import urllib2
-import time
-from collections import OrderedDict
-
-__author__ = "Pieter Lukasse"
-__contact__ = "pieter.lukasse@wur.nl"
-__copyright__ = "Copyright, 2014, Plant Research International, WUR"
-__license__ = "Apache v2"
-
-def _process_file(in_xsv, delim='\t'):
-    '''
-    Generic method to parse a tab-separated file returning a dictionary with named columns
-    @param in_xsv: input filename to be parsed
-    '''
-    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
-    return _process_data(data)
-
-def _process_data(data):
-    header = data.pop(0)
-    # Create dictionary with column name as key
-    output = OrderedDict()
-    for index in xrange(len(header)):
-        output[header[index]] = [row[index] for row in data]
-    return output
-
-def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
-    '''
-    This method will iterate over the records in the input_data and
-    will enrich them with the related information found (if any) in the
-    chosen repository/service
-
-    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
-    '''
-    merged = []
-
-    for i in xrange(len(input_data[input_data.keys()[0]])):
-        # Get the record in the same dictionary format as input_data, but containing
-        # a value at each column instead of a list of all values of all records:
-        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
-
-        # read the molecular mass :
-        molecular_mass = input_data_record[molecular_mass_col]
-
-        # search for related records in repository/service:
-        data_found = None
-        if molecular_mass != "":
-            molecular_mass = float(molecular_mass)
-
-            # 1- search for data around this MM:
-            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
-
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "MM"
-
-        if data_found == None:
-            # If still nothing found, just add empty columns
-            extra_cols = ['', '', '', '', '', '']
-        else:
-            # Add info found:
-            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
-
-        # Take all data and merge it into a "flat"/simple array of values:
-        field_values_list = _merge_data(input_data_record, extra_cols)
-
-        merged.append(field_values_list)
-
-    # return the merged/enriched records:
-    return merged
-
-def [...]

[... middle of this file is truncated in this changeset view ...]

-        if data_rows[1].strip() == '':
-            # means there is only the header row...so no hits:
-            return None
-
-        for data_row in data_rows:
-            if not data_row.strip() == '':
-                row_as_list = _str_to_list(data_row, delimiter='\t')
-                result.append(row_as_list)
-
-        # return result processed into a dict:
-        return _process_data(result)
-
-    except urllib2.HTTPError, e:
-        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
-    except urllib2.URLError, e:
-        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
-
-def _str_to_list(data_row, delimiter='\t'):
-    result = []
-    for column in data_row.split(delimiter):
-        result.append(column)
-    return result
-
-
-# alternative: ?
-# s = requests.Session()
-# s.verify = False
-# #s.auth = (token01, token02)
-# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
-# content = resp.content
-# # transform to dictionary:
-
-
-def _merge_data(input_data_record, extra_cols):
-    '''
-    Adds the extra information to the existing data record and returns
-    the combined new record.
-    '''
-    record = []
-    for column in input_data_record:
-        record.append(input_data_record[column])
-
-    # add extra columns
-    for column in extra_cols:
-        record.append(column)
-
-    return record
-
-
-def _save_data(data_rows, headers, out_csv):
-    '''
-    Writes tab-separated data to file
-    @param data_rows: dictionary containing merged/enriched dataset
-    @param out_csv: output csv file
-    '''
-    # Open output file for writing
-    outfile_single_handle = open(out_csv, 'wb')
-    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
-
-    # Write headers
-    output_single_handle.writerow(headers)
-
-    # Write one line for each row
-    for data_row in data_rows:
-        output_single_handle.writerow(data_row)
-
-
-def _get_repository_URL(repository_file):
-    '''
-    Read out and return the URL stored in the given file.
-    '''
-    file_input = fileinput.input(repository_file)
-    try:
-        for line in file_input:
-            if line[0] != '#':
-                # just return the first line that is not a comment line:
-                return line
-    finally:
-        file_input.close()
-
-
-def main():
-    '''
-    Query main function
-
-    The input file can be any tabular file, as long as it contains a column for the molecular mass.
-    This column is then used to query against the chosen repository/service Database.
-    '''
-    seconds_start = int(round(time.time()))
-
-    input_file = sys.argv[1]
-    molecular_mass_col = sys.argv[2]
-    repository_file = sys.argv[3]
-    error_margin = float(sys.argv[4])
-    margin_unit = sys.argv[5]
-    output_result = sys.argv[6]
-
-    # Parse repository_file to find the URL to the service:
-    repository_dblink = _get_repository_URL(repository_file)
-
-    # Parse tabular input file into dictionary/array:
-    input_data = _process_file(input_file)
-
-    # Query data against repository :
-    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
-    headers = input_data.keys() + ['SEARCH hits for ', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
-                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weight columns
-
-    _save_data(enriched_data, headers, output_result)
-
-    seconds_end = int(round(time.time()))
-    print "Took " + str(seconds_end - seconds_start) + " seconds"
-
-
-if __name__ == '__main__':
-    main()
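The module docstring above fully specifies the service interface (`/mass?targetMs=...&margin=...&marginUnit=...&output=txth`) and a TSV response with a guaranteed header line. A minimal modern sketch of one such call (Python 3 with `urllib.request` instead of the script's `urllib2`; the base URL and mass value in the usage line are placeholders, only the query-string layout comes from the docstring):

    # Minimal sketch of one service call as described in the docstring above.
    import csv
    import urllib.request

    def query_mass(service_url, target_ms, margin=1.0, margin_unit="ppm"):
        url = ("%s/mass?targetMs=%s&margin=%s&marginUnit=%s&output=txth"
               % (service_url, target_ms, margin, margin_unit))
        with urllib.request.urlopen(url) as response:
            lines = response.read().decode("utf-8").splitlines()
        rows = list(csv.reader(lines, delimiter="\t"))
        # expected header order per the docstring:
        # db-name  molecular-formula  dbe  formula-weight  id  description
        header, data = rows[0], rows[1:]
        return [dict(zip(header, row)) for row in data if row]

    # hits = query_mass("http://webs2.kazusa.or.jp/mfsearcher/exmassdb", 500)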
diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.xml --- a/query_mass_repos.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,106 +0,0 @@
-<tool id="query_mass_repos"
-    name="METEXP - Find elemental composition formulas based on mass values "
-    version="0.1.0">
-    <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description>
-    <command interpreter="python">
-        query_mass_repos.py
-        $input_file
-        "$molecular_mass_col"
-        "$repository_file"
-        $error_margin
-        $margin_unit
-        $output_result
-    </command>
-    <inputs>
-
-        <param name="input_file" format="tabular" type="data"
-            label="Input file"
-            help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
-
-        <param name="molecular_mass_col" type="text" size="50"
-            label="Molecular mass column name"
-            value="MM"
-            help="Name of the column containing the molecular mass information (in the given input file)" />
-
-        <param name="repository_file" type="select" label="Repository/service to query"
-            help="Select the repository/service which should be queried"
-            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/>
-
-        <param name="error_margin" type="float" size="10"
-            label="Error margin"
-            value="0.01"
-            help="Mass difference allowed when searching in the repositories for a mass match." />
-
-        <param name="margin_unit" type="select" label="Margin unit">
-            <option value="ms" selected="True">ms</option>
-            <option value="ppm">ppm</option>
-        </param>
-        <!-- TODO
-        <param name="metexp_access_key" type="text" size="50"
-            label="(Optional) MetExp access key"
-            value=""
-            help="Key needed to get access to MetExp services. Fill in if the MetExp service was selected" /> -->
-
-    </inputs>
-    <outputs>
-        <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
-    </outputs>
-    <code file="match_library.py" /> <!-- file containing get_directory_files function used above-->
-    <help>
-.. class:: infomark
-
-This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
-for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
-
-It will take the input file and for each record it will query the
-molecular mass in the selected repository. If one or more compounds are found in the
-repository then extra information regarding (mass-based) matching elemental composition formulas is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected repository.
-
-**Notes**
-
-The input file can be any tabular file, as long as it contains a column for the molecular mass.
-
-**Services that can be queried**
-
-================= =========================================================================
-Database          Description
------------------ -------------------------------------------------------------------------
-PRI-MetExp        LC-MS and GC-MS data from experiments from the metabolomics group at
-                  Plant Research International. NB: restricted access to employees with
-                  access key.
-ExactMassDB       A database of possible elemental compositions, consisting of C: 100,
-                  H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and
-                  the Lewis valence rules.
-                  (via /mfsearcher/exmassdb/)
-ExactMassDB-HR2   HR2, which is one of the fastest tools for calculation of elemental
-                  compositions, filters some elemental compositions according to
-                  the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
-                  database returns the same result as does HR2 with the same atom kind
-                  and number condition as that used in construction of the ExactMassDB.
-                  (via /mfsearcher/exmassdb-hr2/)
-Pep1000           A database of possible linear polypeptides that are
-                  constructed with 20 kinds of amino acids and having molecular
-                  weights smaller than 1000.
-                  (via /mfsearcher/pep1000/)
-KEGG              Re-calculated compound data from KEGG. Weekly updated.
-                  (via /mfsearcher/kegg/)
-KNApSAcK          Re-calculated compound data from KNApSAcK.
-                  (via /mfsearcher/knapsack/)
-Flavonoid Viewer  Re-calculated compound data from Flavonoid Viewer.
-                  (via /mfsearcher/flavonoidviewer/)
-LipidMAPS         Re-calculated compound data from LIPID MAPS.
-                  (via /mfsearcher/lipidmaps/)
-HMDB              Re-calculated compound data from Human Metabolome Database (HMDB)
-                  Version 3.5.
-                  (via /mfsearcher/hmdb/)
-PubChem           Re-calculated compound data from PubChem. Monthly updated.
-                  (via /mfsearcher/pubchem/)
-================= =========================================================================
-
-Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
-
-    </help>
-</tool>
diff -r 41f122255d14 -r 4393f982d18f query_metexp.py --- a/query_metexp.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to query a set of identifications against the METabolomics EXPlorer database.
-
-It will take the input file and for each record it will query the
-molecular mass in the selected MetExp DB. If one or more compounds are found in the
-MetExp DB then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected MetExp DB.
-'''
-import csv
-import sys
-import fileinput
-import urllib2
-import time
-from collections import OrderedDict
-
-__author__ = "Pieter Lukasse"
-__contact__ = "pieter.lukasse@wur.nl"
-__copyright__ = "Copyright, 2014, Plant Research International, WUR"
-__license__ = "Apache v2"
-
-def _process_file(in_xsv, delim='\t'):
-    '''
-    Generic method to parse a tab-separated file returning a dictionary with named columns
-    @param in_xsv: input filename to be parsed
-    '''
-    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
-    return _process_data(data)
-
-def _process_data(data):
-    header = data.pop(0)
-    # Create dictionary with column name as key
-    output = OrderedDict()
-    for index in xrange(len(header)):
-        output[header[index]] = [row[index] for row in data]
-    return output
-
-def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
-    '''
-    This method will iterate over the records in the input_data and
-    will enrich them with the related information found (if any) in the
-    MetExp Database.
-
-    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
-    '''
-    merged = []
-
-    for i in xrange(len(input_data[input_data.keys()[0]])):
-        # Get the record in the same dictionary format as input_data, but containing
-        # a value at each column instead of a list of all values of all records:
-        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
-
-        # read the molecular mass and formula:
-        cas_id = input_data_record[casid_col]
-        formula = input_data_record[formula_col]
-        molecular_mass = input_data_record[molecular_mass_col]
-
-        # search for related records in MetExp:
-        data_found = None
-        if cas_id != "undef":
-            # 1- search for other experiments where this CAS id has been found:
-            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "CAS"
-        if data_found == None:
-            # 2- search for other experiments where this FORMULA has been found:
-            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "FORMULA"
-        if data_found == None:
-            # 3- search for other experiments where this MM has been found:
-            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "MM"
-
-        if data_found == None:
-            # If still nothing found, just add empty columns
-            extra_cols = ['', '', '', '', '', '', '', '']
-        else:
-            # Add info found:
-            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
-
-        # Take all data and merge it into a "flat"/simple array of values:
-        field_values_[...]

[... middle of this file is truncated in this changeset view ...]

-        for data_row in data_rows:
-            if not data_row.strip() == '':
-                row_as_list = _str_to_list(data_row, delimiter='\t')
-                result.append(row_as_list)
-
-        # return result processed into a dict:
-        return _process_data(result)
-
-    except urllib2.HTTPError, e:
-        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
-    except urllib2.URLError, e:
-        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
-
-def _str_to_list(data_row, delimiter='\t'):
-    result = []
-    for column in data_row.split(delimiter):
-        result.append(column)
-    return result
-
-
-# alternative: ?
-# s = requests.Session()
-# s.verify = False
-# #s.auth = (token01, token02)
-# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
-# content = resp.content
-# # transform to dictionary:
-
-
-def _merge_data(input_data_record, extra_cols):
-    '''
-    Adds the extra information to the existing data record and returns
-    the combined new record.
-    '''
-    record = []
-    for column in input_data_record:
-        record.append(input_data_record[column])
-
-    # add extra columns
-    for column in extra_cols:
-        record.append(column)
-
-    return record
-
-
-def _save_data(data_rows, headers, out_csv):
-    '''
-    Writes tab-separated data to file
-    @param data_rows: dictionary containing merged/enriched dataset
-    @param out_csv: output csv file
-    '''
-    # Open output file for writing
-    outfile_single_handle = open(out_csv, 'wb')
-    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
-
-    # Write headers
-    output_single_handle.writerow(headers)
-
-    # Write one line for each row
-    for data_row in data_rows:
-        output_single_handle.writerow(data_row)
-
-
-def _get_metexp_URL(metexp_dblink_file):
-    '''
-    Read out and return the URL stored in the given file.
-    '''
-    file_input = fileinput.input(metexp_dblink_file)
-    try:
-        for line in file_input:
-            if line[0] != '#':
-                # just return the first line that is not a comment line:
-                return line
-    finally:
-        file_input.close()
-
-
-def main():
-    '''
-    MetExp Query main function
-
-    The input file can be any tabular file, as long as it contains a column for the molecular mass
-    and one for the formula of the respective identification. These two columns are then
-    used to query against the MetExp Database.
-    '''
-    seconds_start = int(round(time.time()))
-
-    input_file = sys.argv[1]
-    casid_col = sys.argv[2]
-    formula_col = sys.argv[3]
-    molecular_mass_col = sys.argv[4]
-    metexp_dblink_file = sys.argv[5]
-    separation_method = sys.argv[6]
-    output_result = sys.argv[7]
-
-    # Parse metexp_dblink_file to find the URL to the MetExp service:
-    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
-
-    # Parse tabular input file into dictionary/array:
-    input_data = _process_file(input_file)
-
-    # Query data against MetExp DB :
-    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
-    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
-                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
-
-    _save_data(enriched_data, headers, output_result)
-
-    seconds_end = int(round(time.time()))
-    print "Took " + str(seconds_end - seconds_start) + " seconds"
-
-
-if __name__ == '__main__':
-    main()
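The enrichment above falls through three lookups in a fixed order (CAS number, then molecular formula, then molecular mass), keeping the first hit. A compact way to express that cascade (illustrative Python 3 sketch; `find_related` and the lambdas are hypothetical stand-ins for the script's `_fire_query_and_return_dict` calls):

    # Illustrative cascade: try the most specific key first, fall back to the
    # least specific one; stop at the first lookup that returns data.
    def find_related(record, lookups):
        """lookups: ordered list of (label, function) pairs; each function
        returns a dict of hits or None."""
        for label, lookup in lookups:
            hits = lookup(record)
            if hits is not None:
                return label, hits
        return None, None

    # usage sketch (the lambdas stand in for the CAS/formula/mass service calls):
    label, hits = find_related(
        {"CAS": "50-00-0", "FORMULA": "CH2O", "MM": "30.026"},
        [("CAS",     lambda r: None),          # e.g. no CAS hit
         ("FORMULA", lambda r: {"organism": "A. thaliana"}),
         ("MM",      lambda r: {"organism": "unknown"})])
    print(label, hits)  # -> FORMULA {'organism': 'A. thaliana'}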
diff -r 41f122255d14 -r 4393f982d18f query_metexp.xml --- a/query_metexp.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-<tool id="query_metexp"
-    name="METEXP - Query Database "
-    version="0.1.0">
-    <description>Query a set of identifications against the METabolomics EXPlorer database</description>
-    <command interpreter="python">
-        query_metexp.py
-        $input_file
-        "$casid_col"
-        "$formula_col"
-        "$molecular_mass_col"
-        "$metexp_dblink_file"
-        $separation_method
-        $output_result
-    </command>
-    <inputs>
-
-        <param name="input_file" format="tabular" type="data"
-            label="Input file"
-            help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
-
-        <param name="separation_method" type="select" label="Data type to query">
-            <option value="GC" selected="True">GC</option>
-            <option value="LC">LC</option>
-        </param>
-
-        <param name="casid_col" type="text" size="50"
-            label="CAS ID column name"
-            value="CAS"
-            help="Name of the column containing the CAS code information (in the given input file)" />
-        <param name="formula_col" type="text" size="50"
-            label="Formula ID column name"
-            value="FORMULA"
-            help="Name of the column containing the formula information (in the given input file)" />
-        <param name="molecular_mass_col" type="text" size="50"
-            label="Molecular mass column name"
-            value="MM"
-            help="Name of the column containing the molecular mass information (in the given input file)" />
-
-        <param name="metexp_dblink_file" type="select" label="MetExp DB to query"
-            help="Select the MetExp Database/backend which should be queried"
-            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/>
-
-    </inputs>
-    <outputs>
-        <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
-    </outputs>
-    <code file="match_library.py" /> <!-- file containing get_directory_files function used above-->
-    <help>
-.. class:: infomark
-
-This tool will query a set of identifications against the METabolomics EXPlorer database.
-
-It will take the input file and for each record it will query the
-molecular mass in the selected MetExp DB. If one or more compounds are found in the
-MetExp DB then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected MetExp DB.
-
-**Notes**
-
-The input file can be any tabular file, as long as it contains a column for the molecular mass
-and one for the formula of the respective identification.
-
-
-    </help>
-</tool>
diff -r 41f122255d14 -r 4393f982d18f rankfilterGCMS_tabular.xml --- a/rankfilterGCMS_tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,80 +0,0 @@
-<tool id="rankfilterGCMS_tabular" name="RIQC-RankFilter GC-MS from tabular file" version="1.0.2">
-    <description>Convert Retention Time to Retention Index</description>
-    <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command>
-    <inputs>
-        <param format="tabular" name="sample" type="data" label="Sample File"
-            help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
-        <!-- question: is this calibration file not column specific as it includes RT info?? -->
-        <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File"
-            help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"
-            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/>
-        -->
-        <param name="calibration" format="any" type="data" label="Calibration File"
-            help="Calibration file containing reference masses (e.g. alkanes) with their respective RT and RI values"/>
-
-        <param name="analysis_type" type="select" format="text" label="Analysis Type"
-            help="Select the type of analysis that has been used to generate the sample file">
-            <option value="NIST">NIST</option>
-            <option value="AMDIS">AMDIS</option>
-        </param>
-        <param name="model" type="select" format="text" label="Select a model to be used "
-            help="Both linear and (3rd degree) polynomial models are available ">
-            <option value="linear">Linear</option>
-            <option value="poly">Polynomial</option>
-        </param>
-        <param name="lib_data" type="select" label="Library"
-            help="Reference global lookup library file with CAS numbers and respective (previously calculated) RIsvr values"
-            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_lookup_libraries")'/>
-
-        <param name="window" type="float" label="Window" value="10.56" />
-    </inputs>
-    <outputs>
-        <data format="tabular" label="${tool.name}" name="onefile" />
-    </outputs>
-    <!-- file with implementation of the function get_directory_files() used above -->
-    <code file="match_library.py" />
-    <configfiles>
-        <configfile name="input_file">
-            sample = ${sample}
-            calibration = ${calibration}
-            lib_data = ${lib_data}
-            window = ${window}
-            analysis_type = ${analysis_type}
-            tabular = True
-            onefile = ${onefile}
-            model = ${model}
-        </configfile>
-    </configfiles>
-    <help>
-Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the
-given calibration file.
-
-It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File),
-its respective RIsvr value (based on its CAS number) in the given global lookup library
-(this step is also called the "RankFilter analysis" - see reference below; the Sample File may be either from NIST or AMDIS).
-This generates a prediction of the RI for
-a compound according to the "RankFilter procedure" (RIsvr).
-
-Output is a tab separated file in which four columns are added:
-
- - **Rank**     Calculated rank
- - **RIexp**    Experimental Retention Index (RI)
- - **RIsvr**    Calculated RI based on support vector regression (SVR)
- - **%rel.err** Relative RI error (%rel.error = 100 * (RIsvr - RIexp) / RIexp)
-
-.. class:: infomark
-
-**Notes**
-
- - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
 - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
-   be used to convert from XXXX to YYYY.
-
------
-
-**References**
-
- - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS
-   metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794
- </help>
-</tool>
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/nistpdf_to_tabular.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,45 @@ +""" +Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +import sys +import pdfread +from subprocess import call + +# NB: THIS TOOL SHOULD BE DEPRECATED ONCE NIST-Wrapper tool is stable and released. NIST-Wrapper tool already produces the necessary tabular file. + +def convert_pdftotext(filename, output_file): + ''' + Converts PDF file to text + @param filename: PDF file to parse + @param output_file: output text file for the hits + ''' + + # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The + # default is to 'undo' physical layout (columns, hyphenation, etc.) and output + # the text in reading order. + try: + call(["pdftotext", "-layout", filename, output_file]) + except: + raise Exception("Error while trying to convert PDF to text") + + + + +if __name__ == '__main__': + pdf_as_text = sys.argv[1]+".txt" + convert_pdftotext(sys.argv[1], pdf_as_text) + pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) |
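The script above shells out to `pdftotext -layout` and raises a generic exception on any failure, discarding the underlying cause. Below is a slightly more defensive variant of that call (a sketch, not the shipped code; it assumes only that the `pdftotext` binary from poppler/xpdf is on the PATH):

    # Sketch: same pdftotext call as in nistpdf_to_tabular.py, but preserving
    # the underlying error and checking the exit status (Python 3).
    import subprocess

    def convert_pdftotext(pdf_path, txt_path):
        # -layout keeps the original physical layout instead of reflowing the
        # text, which the downstream NIST table parser relies on.
        try:
            subprocess.check_call(["pdftotext", "-layout", pdf_path, txt_path])
        except OSError as e:
            raise RuntimeError("pdftotext not found or not executable: %s" % e)
        except subprocess.CalledProcessError as e:
            raise RuntimeError("pdftotext failed with exit code %d" % e.returncode)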
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/nistpdf_to_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,14 @@ +<tool id="NDIStext2tabular" name="NIST_UTIL- NIST text to tabular format" version="1.0.2"> + <description>Convert NIST text to tabular format</description> + <command interpreter="python">nistpdf_to_tabular.py $input $output $output_err</command> + <inputs> + <param format="pdf" name="input" type="data" label="NIST identifications report (PDF)"/> + </inputs> + <outputs> + <data format="tabular" label="${tool.name} output on ${on_string}" name="output" /> + <data format="tabular" label="${tool.name} error log" name="output_err" /> + </outputs> + <help> + This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools. + </help> +</tool> |
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/pdftotabular.py --- a/rankfilter_GCMS/pdftotabular.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@ -""" -Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this software except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -""" - -import sys -import pdfread -from subprocess import call - - -def convert_pdftotext(filename, output_file): - ''' - Converts PDF file to text - @param filename: PDF file to parse - @param output_file: output text file for the hits - ''' - - # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The - # default is to 'undo' physical layout (columns, hyphenation, etc.) and output - # the text in reading order. - try: - call(["pdftotext", "-layout", filename, output_file]) - except: - raise Exception("Error while trying to convert PDF to text") - - - - -if __name__ == '__main__': - pdf_as_text = sys.argv[1]+".txt" - convert_pdftotext(sys.argv[1], pdf_as_text) - pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) |
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/rankfilterGCMS_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/rankfilterGCMS_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,80 @@
+<tool id="rankfilterGCMS_tabular" name="RIQC-RankFilter GC-MS from tabular file" version="1.0.2">
+    <description>Convert Retention Time to Retention Index</description>
+    <command interpreter="python">rankfilter.py $input_file</command>
+    <inputs>
+        <param format="tabular" name="sample" type="data" label="Sample File"
+            help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+        <!-- question: is this calibration file not column specific as it includes RT info?? -->
+        <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File"
+            help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"
+            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/>
+        -->
+        <param name="calibration" format="any" type="data" label="Calibration File"
+            help="Calibration file containing reference masses (e.g. alkanes) with their respective RT and RI values"/>
+
+        <param name="analysis_type" type="select" format="text" label="Analysis Type"
+            help="Select the type of analysis that has been used to generate the sample file">
+            <option value="NIST">NIST</option>
+            <option value="AMDIS">AMDIS</option>
+        </param>
+        <param name="model" type="select" format="text" label="Select a model to be used "
+            help="Both linear and (3rd degree) polynomial models are available ">
+            <option value="linear">Linear</option>
+            <option value="poly">Polynomial</option>
+        </param>
+        <param name="lib_data" type="select" label="Library"
+            help="Reference global lookup library file with CAS numbers and respective (previously calculated) RIsvr values"
+            dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_lookup_libraries")'/>
+
+        <param name="window" type="float" label="Window" value="10.56" />
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name}" name="onefile" />
+    </outputs>
+    <!-- file with implementation of the function get_directory_files() used above -->
+    <code file="match_library.py" />
+    <configfiles>
+        <configfile name="input_file">
+            sample = ${sample}
+            calibration = ${calibration}
+            lib_data = ${lib_data}
+            window = ${window}
+            analysis_type = ${analysis_type}
+            tabular = True
+            onefile = ${onefile}
+            model = ${model}
+        </configfile>
+    </configfiles>
+    <help>
+Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the
+given calibration file.
+
+It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File),
+its respective RIsvr value (based on its CAS number) in the given global lookup library
+(this step is also called the "RankFilter analysis" - see reference below; the Sample File may be either from NIST or AMDIS).
+This generates a prediction of the RI for
+a compound according to the "RankFilter procedure" (RIsvr).
+
+Output is a tab separated file in which four columns are added:
+
+ - **Rank**     Calculated rank
+ - **RIexp**    Experimental Retention Index (RI)
+ - **RIsvr**    Calculated RI based on support vector regression (SVR)
+ - **%rel.err** Relative RI error (%rel.error = 100 * (RIsvr - RIexp) / RIexp)
+
+.. class:: infomark
+
+**Notes**
+
+ - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
 - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
+   be used to convert from XXXX to YYYY.
+
+-----
+
+**References**
+
+ - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS
+   metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794
+ </help>
+</tool>
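The help text above describes fitting an RI(RT) calibration function (linear or 3rd-degree polynomial) and the relative-error column %rel.err = 100 * (RIsvr - RIexp) / RIexp. Both computations can be sketched in a few lines (illustrative only; this is not the rankfilter.py implementation, it assumes numpy, and the alkane calibration points are invented):

    # Sketch of the RIexp estimation and %rel.err computation described above.
    import numpy as np

    rt = np.array([2.1, 4.0, 6.2, 8.5])        # calibration 'R.T.' column
    ri = np.array([800, 900, 1000, 1100])      # calibration 'RI' column

    # 'linear' model -> degree 1, 'poly' model -> degree 3:
    coeffs = np.polyfit(rt, ri, deg=1)
    ri_of_rt = np.poly1d(coeffs)

    riexp = ri_of_rt(5.0)                      # RIexp for a sample peak at RT 5.0
    risvr = 950.0                              # RIsvr looked up by CAS number
    rel_err = 100 * (risvr - riexp) / riexp    # the %rel.err output column
    print(round(riexp, 1), round(rel_err, 2))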
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/select_on_rank.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,21 @@ +import csv +import sys + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +in_file = sys.argv[1] +out_file = sys.argv[2] +to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] + +data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) +header = data.pop(0) +header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] +rank = header_clean.index("rank") + +writer = csv.writer(open(out_file, 'wb'), delimiter='\t') +writer.writerow(header) +for select in to_select_list: + writer.writerows([i for i in data if i[rank] == select]) |
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/select_on_rank.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,15 @@
+<tool id="filter_on_rank" name="RIQC-Filter on rank" version="1.0.2">
+    <description>Filter on the Rank field in the RankFilter output file</description>
+    <command interpreter="python">select_on_rank.py $input $output "$rank"</command>
+    <inputs>
+        <param format="tabular" name="input" type="data" label="Source file (RankFilter output)"/>
+        <param format="tabular" help="Ranks to filter on (separate multiple values with a comma)" value="1,2"
+            name="rank" type="text" label="Select Ranks to keep"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} on ${on_string} selected ${rank}" name="output" />
+    </outputs>
+    <help>
+This tool removes all entries with non-selected rank values from the input file (the supported input file is a RankFilter output file).
+    </help>
+</tool>
diff -r 41f122255d14 -r 4393f982d18f rankfilter_text2tabular.xml --- a/rankfilter_text2tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@ -<tool id="NDIStext2tabular" name="NIST_UTIL- NIST text to tabular format" version="1.0.2"> - <description>Convert NIST text to tabular format</description> - <command interpreter="python">rankfilter_GCMS/pdftotabular.py $input $output $output_err</command> - <inputs> - <param format="pdf" name="input" type="data" label="NIST identifications report (PDF)"/> - </inputs> - <outputs> - <data format="tabular" label="${tool.name} output on ${on_string}" name="output" /> - <data format="tabular" label="${tool.name} error log" name="output_err" /> - </outputs> - <help> - This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools. - </help> -</tool> |
diff -r 41f122255d14 -r 4393f982d18f select_on_rank.py --- a/select_on_rank.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@ -import csv -import sys - -__author__ = "Marcel Kempenaar" -__contact__ = "brs@nbic.nl" -__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" -__license__ = "MIT" - -in_file = sys.argv[1] -out_file = sys.argv[2] -to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] - -data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) -header = data.pop(0) -header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] -rank = header_clean.index("rank") - -writer = csv.writer(open(out_file, 'wb'), delimiter='\t') -writer.writerow(header) -for select in to_select_list: - writer.writerows([i for i in data if i[rank] == select]) |
diff -r 41f122255d14 -r 4393f982d18f select_on_rank.xml --- a/select_on_rank.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-<tool id="filter_on_rank" name="RIQC-Filter on rank" version="1.0.2">
-    <description>Filter on the Rank field in the RankFilter output file</description>
-    <command interpreter="python">select_on_rank.py $input $output "$rank"</command>
-    <inputs>
-        <param format="tabular" name="input" type="data" label="Source file (RankFilter output)"/>
-        <param format="tabular" help="Ranks to filter on (separate multiple values with a comma)" value="1,2"
-            name="rank" type="text" label="Select Ranks to keep"/>
-    </inputs>
-    <outputs>
-        <data format="tabular" label="${tool.name} on ${on_string} selected ${rank}" name="output" />
-    </outputs>
-    <help>
-This tool removes all entries with non-selected rank values from the input file (the supported input file is a RankFilter output file).
-    </help>
-</tool>
diff -r 41f122255d14 -r 4393f982d18f static_resources/README.txt --- a/static_resources/README.txt Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@ -This folder and respective files should be deployed together with the following tools: - - - ../export_to_metexp_tabular.py \ No newline at end of file |
diff -r 41f122255d14 -r 4393f982d18f static_resources/elements_and_masses.tab --- a/static_resources/elements_and_masses.tab Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,104 +0,0 @@
-Name          Atomic number  Chemical symbol  Relative atomic mass
-Hydrogen      1    H   1.01
-Helium        2    He  4
-Lithium       3    Li  6.94
-Beryllium     4    Be  9.01
-Boron         5    B   10.81
-Carbon        6    C   12.01
-Nitrogen      7    N   14.01
-Oxygen        8    O   16
-Fluorine      9    F   19
-Neon          10   Ne  20.18
-Sodium        11   Na  22.99
-Magnesium     12   Mg  24.31
-Aluminum      13   Al  26.98
-Silicon       14   Si  28.09
-Phosphorus    15   P   30.98
-Sulfur        16   S   32.06
-Chlorine      17   Cl  35.45
-Argon         18   Ar  39.95
-Potassium     19   K   39.1
-Calcium       20   Ca  40.08
-Scandium      21   Sc  44.96
-Titanium      22   Ti  47.9
-Vanadium      23   V   50.94
-Chromium      24   Cr  52
-Manganese     25   Mn  54.94
-Iron          26   Fe  55.85
-Cobalt        27   Co  58.93
-Nickel        28   Ni  58.71
-Copper        29   Cu  63.54
-Zinc          30   Zn  65.37
-Gallium       31   Ga  69.72
-Germanium     32   Ge  72.59
-Arsenic       33   As  74.99
-Selenium      34   Se  78.96
-Bromine       35   Br  79.91
-Krypton       36   Kr  83.8
-Rubidium      37   Rb  85.47
-Strontium     38   Sr  87.62
-Yttrium       39   Y   88.91
-Zirconium     40   Zr  91.22
-Niobium       41   Nb  92.91
-Molybdenum    42   Mo  95.94
-Technetium    43   Tc  96.91
-Ruthenium     44   Ru  101.07
-Rhodium       45   Rh  102.9
-Palladium     46   Pd  106.4
-Silver        47   Ag  107.87
-Cadmium       48   Cd  112.4
-Indium        49   In  114.82
-Tin           50   Sn  118.69
-Antimony      51   Sb  121.75
-Tellurium     52   Te  127.6
-Iodine        53   I   126.9
-Xenon         54   Xe  131.3
-Cesium        55   Cs  132.9
-Barium        56   Ba  137.34
-Lanthanum     57   La  138.91
-Cerium        58   Ce  140.12
-Praseodymium  59   Pr  140.91
-Neodymium     60   Nd  144.24
-Promethium    61   Pm  144.91
-Samarium      62   Sm  150.35
-Europium      63   Eu  151.96
-Gadolinium    64   Gd  157.25
-Terbium       65   Tb  158.92
-Dysprosium    66   Dy  162.5
-Holmium       67   Ho  164.93
-Erbium        68   Er  167.26
-Thulium       69   Tm  168.93
-Ytterbium     70   Yb  173.04
-Lutetium      71   Lu  174.97
-Hafnium       72   Hf  178.49
-Tantalum      73   Ta  180.95
-Wolfram       74   W   183.85
-Rhenium       75   Re  186.2
-Osmium        76   Os  190.2
-Iridium       77   Ir  192.22
-Platinum      78   Pt  195.09
-Gold          79   Au  196.97
-Mercury       80   Hg  200.59
-Thallium      81   Tl  204.37
-Lead          82   Pb  207.19
-Bismuth       83   Bi  208.98
-Polonium      84   Po  208.98
-Astatine      85   At  209.99
-Radon         86   Rn  222.02
-Francium      87   Fr  223.02
-Radium        88   Ra  226
-Actinium      89   Ac  227.03
-Thorium       90   Th  232.04
-Protactinium  91   Pa  231.04
-Uranium       92   U   238.03
-Neptunium     93   Np  237
-Plutonium     94   Pu  242
-Americium     95   Am  243.06
-Curium        96   Cm  247.07
-Berkelium     97   Bk  247.07
-Californium   98   Cf  251.08
-Einsteinium   99   Es  254.09
-Fermium       100  Fm  257.1
-Mendelevium   101  Md  257.1
-Nobelium      102  No  255.09
-Lawrencium    103  Lr  256.1
diff -r 41f122255d14 -r 4393f982d18f tool_dependencies.xml --- a/tool_dependencies.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-<!-- see also http://wiki.galaxyproject.org/ToolShedToolFeatures for syntax help
-     -->
-    <package name="R_bioc_metams" version="3.1.1">
-        <repository changeset_revision="4b30bdaf4dbd" name="prims_metabolomics_r_dependencies" owner="pieterlukasse" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
-    </package>
-    <readme>
-        This dependency:
-        Ensures the R 3.1.1 installation is triggered (via dependency).
-        Ensures Bioconductor 3.0 and the packages metaMS, multtest and snow are installed.
-    </readme>
-</tool_dependency>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.r --- a/xcms_differential_analysis.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -#cat("args <- \"\"\n") -## a xcms xset saved as .RData -args.xsetData <- args[1] -#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) - -args.class1 <- args[2] -args.class2 <- args[3] -#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) -#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) - -args.topcount <- strtoi(args[4]) -#cat(paste("args.topcount <- ", args[4], "\n", sep="")) - -args.outTable <- args[5] - -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] -#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep="")) -#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep="")) - - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -tryCatch( - { - library(metaMS) - library(xcms) - #library("R2HTML") - - # load the xset data : - xsetData <- readRDS(args.xsetData) - # if here to support both scenarios: - if ("xcmsSet" %in% slotNames(xsetData) ) - { - xsetData <- xsetData@xcmsSet - } - - - # info: levels(xcmsSet@phenoData$class) also gives access to the class names - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - # set cairo as default for png plots: - png = function (...) grDevices::png(...,type='cairo') - # run diffreport - reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640) - - # write out tsv table: - write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) - - message("\nGenerating report.........") - - cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile) - #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) - figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="") - message(figuresPath) - listOfFiles <- list.files(path = figuresPath) - for (i in 1:length(listOfFiles)) - { - figureName <- listOfFiles[i] - # maybe we still need to copy the figures to the args.htmlReportFile.files_path - cat(paste("<img src='fig_eic/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE) - cat(paste("<img src='fig_box/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE) - } - - message("finished generating report") - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) \ No newline at end of file |
diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.xml --- a/xcms_differential_analysis.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-<tool id="xcms_differential_analysis" name="XCMS Differential Analysis" version="0.0.4">
-    <description>Runs the xcms diffreport function for differential analysis</description>
-    <requirements>
-        <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
-    </requirements>
-    <command interpreter="Rscript">
-        xcms_differential_analysis.r
-        $xsetData
-        "$class1"
-        "$class2"
-        $topcount
-        $outTable
-        $htmlReportFile
-        $htmlReportFile.files_path
-        $outLogFile
-    </command>
-<inputs>
-
-    <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from METAMS run"/>
-
-    <param name="class1" type="text" size="30" label="Class1 name" value="" help="Name of first class for the comparison"/>
-    <param name="class2" type="text" size="30" label="Class2 name" value="" help="Name of second class for the comparison"/>
-
-    <param name="topcount" type="integer" size="10" value="10" label="Number of items to return" help="Top X differential items. E.g. if 10, it will return the top 10 differential items." />
-
-</inputs>
-<outputs>
-    <data name="outTable" format="tabular" label="${tool.name} on ${on_string} - Top differential items (TSV)"/>
-    <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
-    <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - differential report (HTML)"/>
-</outputs>
-<tests>
-    <test>
-    </test>
-</tests>
-<help>
-
-.. class:: infomark
-
-Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
-the most significant differences. The figure below shows an example of such an EIC.
-
-.. image:: $PATH_TO_IMAGES/diffreport.png
-
-
- </help>
- <citations>
-    <citation type="doi">10.1021/ac051437y</citation> <!-- example
-    see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
-    -->
- </citations>
-</tool>
\ No newline at end of file
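diffreport ranks feature groups by how strongly they differ between the two classes; its output includes fold-change and t-statistic columns. Purely as an illustration of that ranking idea (Python with numpy/scipy; this is not the xcms implementation, and the intensity matrix below is invented):

    # Illustrative "top differential features" ranking, loosely mirroring what
    # diffreport reports (fold change + t-test); assumes numpy and scipy.
    import numpy as np
    from scipy import stats

    intensities = np.array([[120.0, 130.0, 125.0, 300.0, 310.0, 295.0],   # feature 1
                            [ 80.0,  85.0,  82.0,  81.0,  79.0,  83.0]])  # feature 2
    class1 = intensities[:, :3]   # e.g. 'control' samples
    class2 = intensities[:, 3:]   # e.g. 'treated' samples

    tstat, pval = stats.ttest_ind(class1, class2, axis=1)
    fold = class2.mean(axis=1) / class1.mean(axis=1)

    # rank features by p-value, most significant first:
    for idx in np.argsort(pval):
        print("feature %d: fold=%.2f p=%.4f" % (idx + 1, fold[idx], pval[idx]))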
diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.r --- a/xcms_get_alignment_eic.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,153 +0,0 @@
-# Shows all alignment results in an RT region.
-## read args:
-args <- commandArgs(TRUE)
-# xset data:
-args.xsetData <- args[1]
-
-args.rtStart <- strtoi(args[2])
-args.rtEnd <- strtoi(args[3])
-
-# limit the RT window to max 600 and minNrSamples to at least 2:
-if (args.rtEnd - args.rtStart > 600)
-    stop("The RT window should be <= 600")
-
-args.minNrSamples <- strtoi(args[4])  # min nr of samples
-if (args.minNrSamples < 2)
-    stop("Set 'Minimum number of samples' to 2 or higher")
-
-args.sampleNames <- strsplit(args[5], ",")[[1]]
-# trim leading and trailing spaces:
-args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
-
-## report files
-args.htmlReportFile <- args[6]
-args.htmlReportFile.files_path <- args[7]
-
-if (length(args) == 8)
-{
-    args.outLogFile <- args[8]
-    # suppress messages: send all STDERR to STDOUT using sink(),
-    # see http://mazamascience.com/WorkingWithData/?p=888
-    msg <- file(args.outLogFile, open="wt")
-    sink(msg, type="message")
-    sink(msg, type="output")
-}
-
-tryCatch(
-    {
-        library(metaMS)
-
-        # load the xset data:
-        xsetData <- readRDS(args.xsetData)
-        # this 'if' supports both scenarios (metaMS wrapper object or plain xcmsSet):
-        if ("xcmsSet" %in% slotNames(xsetData))
-        {
-            xsetData <- xsetData@xcmsSet
-        }
-
-        # report
-        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
-        message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
-
-        write(paste("<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ", args.minNrSamples, " or more samples</h1>"),
-              file=args.htmlReportFile)
-
-        gt <- groups(xsetData)
-        message("\nParsed groups... ")
-        # NB: this should ideally select only on the selected samples:
-        groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples)
-
-        if (length(groupidx1) > 0)
-        {
-            message("\nGetting EIC... ")
-            eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
-            #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
-
-            #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE)
-            # or (from the Bioconductor code for getEIC); NB: this assumes the sample
-            # indexes used in the data follow the order of the sample names:
-            sampNames <- sampnames(xsetData)
-            sampleNamesIdx <- match(args.sampleNames, sampNames)
-            message(paste("Samples: ", sampleNamesIdx))
-
-            #TODO html <- paste(html, "<table><tbody>")
-            message(paste("\nPlotting figures... "))
-
-            # get the mz list (interestingly, this [,"mz"] is relatively slow):
-            peakMzList <- xsetData@peaks[,"mz"]
-            peakSampleList <- xsetData@peaks[,"sample"]
-            # signal-to-noise list:
-            peakSnList <- xsetData@peaks[,"sn"]
-
-            message(paste("Total nr of peaks: ", length(peakMzList)))
-
-            for (i in 1:length(groupidx1))
-            {
-                groupMembers <- xsetData@groupidx[[groupidx1[i]]]
-
-                groupMzList <- ""
-                groupSampleList <- ""
-                finalGroupSize <- 0
-
-                for (j in 1:length(groupMembers))
-                {
-                    # take only the peaks from the selected samples:
-                    memberSample <- peakSampleList[groupMembers[j]]
-                    memberSn <- peakSnList[groupMembers[j]]
-                    if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
-                    {
-                        message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
-                        memberMz <- peakMzList[groupMembers[j]]
-
-                        if (finalGroupSize == 0)
-                        {
-                            groupMzList <- memberMz
-                            groupSampleList <- sampNames[memberSample]
-                        } else {
-                            groupMzList <- paste(groupMzList, ",", memberMz, sep="")
-                            groupSampleList <- paste(groupSampleList, ",", sampNames[memberSample], sep="")
-                        }
-
-                        finalGroupSize <- finalGroupSize + 1
-                    }
-                }
-                # here we do the real check on group size: only groups with enough
-                # members whose signal-to-noise is not NA are plotted:
-                if (finalGroupSize >= args.minNrSamples)
-                {
-                    message(paste("Plotting figure ", i, " of max ", length(groupidx1), " figures... "))
-
-                    figureName <- paste(args.htmlReportFile.files_path, "/figure", i, ".png", sep="")
-                    write(paste("<img src='", "figure", i, ".png' />", sep=""),
-                          file=args.htmlReportFile, append=TRUE)
-
-                    png(figureName, type="cairo", width=800)
-                    plot(eiccor, xsetData, groupidx = i)
-                    devname = dev.off()
-
-                    write(paste("<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise != NA:<br/>", groupMzList, "</p>", sep=""),
-                          file=args.htmlReportFile, append=TRUE)
-                    write(paste("<p>m/z values found in the following samples respectively: <br/>", groupSampleList, "</p>", sep=""),
-                          file=args.htmlReportFile, append=TRUE)
-                }
-            }
-        }
-
-        write("</body></html>",
-              file=args.htmlReportFile, append=TRUE)
-        message("finished generating report")
-        # unlink(args.htmlReportFile)
-        cat("\nWarnings================:\n")
-        str(warnings())
-    },
-    error=function(cond) {
-        sink(NULL, type="message")  # restore default
-        sink(stderr(), type="output")
-        message("\nERROR: ===========\n")
-        print(cond)
-    }
-)
diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.xml
--- a/xcms_get_alignment_eic.xml	Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-<tool id="xcms_get_alignment_eic" name="XCMS Get Alignment EICs" version="0.0.4">
-    <description>Extracts alignment EICs from feature alignment data</description>
-    <requirements>
-        <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
-    </requirements>
-    <command interpreter="Rscript">
-        xcms_get_alignment_eic.r
-        $xsetData
-        $rtStart
-        $rtEnd
-        $minNrSamples
-        "$sampleNames"
-        $htmlReportFile
-        $htmlReportFile.files_path
-        $outLogFile
-    </command>
-    <inputs>
-        <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from a METAMS run"/>
-        <param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of the retention time region to plot"/>
-        <param name="rtEnd" type="integer" value="" size="10" label="RT end" help="End of the retention time region to plot"/>
-        <param name="minNrSamples" type="integer" size="10" value="10" label="Minimum number of samples in which an aligned feature should be found" help="Can also be read as: minimum number of features in the alignment. E.g. if set to 10, the alignment should consist of at least 10 peaks from 10 different samples aligned together."/>
-        <param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
-               value="sampleName1,sampleName2,etc"
-               help="Comma- or line-separated list of sample names. Optional field where you can specify the subset of samples to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time.">
-            <sanitizer>
-                <!-- this translates a line-separated list to a comma-separated one and removes quotes -->
-                <valid/>
-                <mapping initial="none">
-                    <add source="&#10;" target=","/>
-                    <add source="&#13;" target=""/>
-                    <add source="&quot;" target=""/>
-                </mapping>
-            </sanitizer>
-        </param>
-    </inputs>
-    <outputs>
-        <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
-        <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
-    </outputs>
-    <tests>
-        <test>
-        </test>
-    </tests>
-    <help>
-
-.. class:: infomark
-
-This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method.
-It produces an HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, also showing the difference between
-two classes, with extra alignment information beneath it.
-
-.. image:: $PATH_TO_IMAGES/diffreport.png
-
-Alignment id: 1709. m/z list of peaks in alignment:
-614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469
-
-    </help>
-    <citations>
-        <citation type="doi">10.1021/ac051437y</citation> <!-- example
-        see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
-        -->
-    </citations>
-</tool>
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.r
--- a/xcms_get_mass_eic.r	Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-## read args:
-args <- commandArgs(TRUE)
-# xset data:
-args.xsetData <- args[1]
-
-args.rtStart <- strtoi(args[2])
-args.rtEnd <- strtoi(args[3])
-
-args.mzStart <- as.double(args[4])
-args.mzEnd <- as.double(args[5])
-# there are 2 options: specify an m/z range or an m/z list:
-if (args.mzStart < 0)
-{
-    args.mzList <- as.double(strsplit(args[6], ",")[[1]])
-    cat(typeof(as.double(strsplit(args[6], ",")[[1]])))
-    args.mzTolPpm <- as.double(args[7])
-    # calculate the m/z window boundaries based on the ppm tolerance:
-    mzListEnd <- c()
-    mzListStart <- c()
-    for (i in 1:length(args.mzList))
-    {
-        mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
-        mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
-        mzListEnd <- c(mzListEnd, mzEnd)
-        mzListStart <- c(mzListStart, mzStart)
-    }
-    str(mzListStart)
-    str(mzListEnd)
-} else {
-    mzListEnd <- c(args.mzEnd)
-    mzListStart <- c(args.mzStart)
-}
-
-args.sampleNames <- strsplit(args[8], ",")[[1]]
-# trim leading and trailing spaces:
-args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
-
-args.combineSamples <- args[9]
-args.rtPlotMode <- args[10]
-
-## report files
-args.htmlReportFile <- args[11]
-args.htmlReportFile.files_path <- args[12]
-
-if (length(args) == 13)
-{
-    args.outLogFile <- args[13]
-    # suppress messages: send all STDERR to STDOUT using sink(),
-    # see http://mazamascience.com/WorkingWithData/?p=888
-    msg <- file(args.outLogFile, open="wt")
-    sink(msg, type="message")
-    sink(msg, type="output")
-}
-
-# TODO  - add an option to plot masses in the same plot (e.g. if given on the same line) or in separate plots
-# TODO2 - let it run in parallel
-
-tryCatch(
-    {
-        library(metaMS)
-
-        # load the xset data:
-        xsetData <- readRDS(args.xsetData)
-        # this 'if' supports both scenarios (metaMS wrapper object or plain xcmsSet):
-        if ("xcmsSet" %in% slotNames(xsetData))
-        {
-            xsetData <- xsetData@xcmsSet
-        }
-
-        # report
-        dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
-        message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
-
-        html <- "<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>"
-
-        if (args.combineSamples == "No")
-        {
-            if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
-                stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
-                           " masses while ", length(args.sampleNames), " sample names were given."))
-
-            iterSize <- length(args.sampleNames)
-            # these are set to 1 or 0 as a trick to iterate (or not) over the items;
-            # if the respective list has length 1, only its first item is used:
-            fixSampleIdx <- 1
-            fixMzListIdx <- 1
-            if (length(args.sampleNames) == 1)
-            {
-                fixSampleIdx <- 0
-                iterSize <- length(mzListStart)
-            }
-            if (length(mzListStart) == 1)
-            {
-                fixMzListIdx <- 0
-            }
-            lineColors <- rainbow(iterSize)
-            for (i in 0:(iterSize-1))
-            {
-                message("\nGetting EIC... ")
-                eiccor <- getEIC(xsetData,
-                                 mzrange=matrix(c(mzListStart[i*fixMzListIdx+1], mzListEnd[i*fixMzListIdx+1]), nrow=1, ncol=2, byrow=TRUE),
-                                 rtrange=matrix(c(args.rtStart, args.rtEnd), nrow=1, ncol=2, byrow=TRUE),
-                                 sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
-
-                message("\nPlotting figures... ")
-                figureName <- paste(args.htmlReportFile.files_path, "/figure", i, ".png", sep="")
-                html <- paste(html, "<img src='", "figure", i, ".png' /><br/>", sep="")
-                png(figureName, type="cairo", width=1100, height=250)
-                #plot(eiccor, col=lineColors[i+1])
-                # black is better in this case:
-                plot(eiccor)
-                legend('topright',                                    # places a legend at the appropriate place
-                       legend=c(args.sampleNames[i*fixSampleIdx+1]),  # puts text in the legend
-                       lty=c(1,1),                                    # gives the legend appropriate symbols (lines)
-                       lwd=c(2.5,2.5))
-
-                devname = dev.off()
-            }
-        } else {
-            for (i in 1:length(mzListStart))
-            {
-                message("\nGetting EIC... ")
-                eiccor <- getEIC(xsetData,
-                                 mzrange=matrix(c(mzListStart[i], mzListEnd[i]), nrow=1, ncol=2, byrow=TRUE),
-                                 rtrange=matrix(c(args.rtStart, args.rtEnd), nrow=1, ncol=2, byrow=TRUE),
-                                 sampleidx=args.sampleNames, rt=args.rtPlotMode)
-
-                # set size, set option (plot per sample, plot per mass)
-                message("\nPlotting figures... ")
-                figureName <- paste(args.htmlReportFile.files_path, "/figure", i, ".png", sep="")
-                html <- paste(html, "<img src='", "figure", i, ".png' />", sep="")
-                png(figureName, type="cairo", width=1100, height=450)
-                lineColors <- rainbow(length(args.sampleNames))
-                plot(eiccor, col=lineColors)
-                legend('topright',               # places a legend at the appropriate place
-                       legend=args.sampleNames,  # puts text in the legend
-                       lty=c(1,1),               # gives the legend appropriate symbols (lines)
-                       lwd=c(2.5,2.5),
-                       col=lineColors)
-                devname = dev.off()
-            }
-        }
-        if (args.rtPlotMode == "corrected")
-        {
-            html <- paste(html, "<p>*rt values are corrected ones</p>")
-        }
-        html <- paste(html, "</body></html>")
-        message("finished generating report")
-        write(html, file=args.htmlReportFile)
-        # unlink(args.htmlReportFile)
-        cat("\nWarnings================:\n")
-        str(warnings())
-    },
-    error=function(cond) {
-        sink(NULL, type="message")  # restore default
-        sink(stderr(), type="output")
-        message("\nERROR: ===========\n")
-        print(cond)
-    }
-)
diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.xml
--- a/xcms_get_mass_eic.xml	Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-<tool id="xcms_get_mass_eic" name="XCMS Get EICs" version="0.0.4">
-    <description>Extracts EICs for a given list of masses</description>
-    <requirements>
-        <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
-    </requirements>
-    <command interpreter="Rscript">
-        xcms_get_mass_eic.r
-        $xsetData
-        $rtStart
-        $rtEnd
-        #if $massParameters.massParametersType == "window"
-            $massParameters.mzStart
-            $massParameters.mzEnd
-            -1
-            "."
-        #else
-            -1
-            -1
-            "$massParameters.mzList"
-            $massParameters.mzTolPpm
-        #end if
-        "$sampleNames"
-        $combineSamples
-        $rtPlotMode
-        $htmlReportFile
-        $htmlReportFile.files_path
-        $outLogFile
-    </command>
-    <inputs>
-        <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from a METAMS run"/>
-        <param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of the retention time region to plot"/>
-        <param name="rtEnd" type="integer" value="" size="10" label="RT end" help="End of the retention time region to plot"/>
-        <conditional name="massParameters">
-            <param name="massParametersType" type="select" size="50" label="Give masses as">
-                <option value="list" selected="true">m/z list</option>
-                <option value="window">m/z window</option>
-            </param>
-            <when value="list">
-                <param name="mzList" type="text" area="true" size="7x70" label="m/z list"
-                       help="Comma- or line-separated list of m/z values for which to plot an EIC. One EIC will be plotted for each m/z given here.">
-                    <sanitizer>
-                        <!-- this translates a line-separated list to a comma-separated one and removes quotes -->
-                        <valid/>
-                        <mapping initial="none">
-                            <add source="&#10;" target=","/>
-                            <add source="&#13;" target=""/>
-                            <add source="&quot;" target=""/>
-                        </mapping>
-                    </sanitizer>
-                </param>
-                <param name="mzTolPpm" type="integer" size="10" value="5" label="m/z tolerance (ppm)"/>
-            </when>
-            <when value="window">
-                <param name="mzStart" type="float" value="" size="10" label="m/z start" help="Start of the m/z window"/>
-                <param name="mzEnd" type="float" value="" size="10" label="m/z end" help="End of the m/z window"/>
-            </when>
-        </conditional>
-        <param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
-               value="sampleName1,sampleName2,etc"
-               help="Comma- or line-separated list of sample names. Here you can specify the subset of samples to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time.">
-            <sanitizer>
-                <!-- this translates a line-separated list to a comma-separated one and removes quotes -->
-                <valid/>
-                <mapping initial="none">
-                    <add source="&#10;" target=","/>
-                    <add source="&#13;" target=""/>
-                    <add source="&quot;" target=""/>
-                </mapping>
-            </sanitizer>
-        </param>
-        <param name="combineSamples" type="select" size="50" label="Combine samples in EIC"
-               help="When combining, each plot contains the combined EIC of one m/z across the different samples. When NOT combining, each plot
-               contains only the EIC of the m/z in the corresponding sample (the sample name at the same position in the sample list as the
-               m/z in the m/z list). Tip: use Yes for visualizing the EIC of grouped masses (MsClust or CAMERA groups); use No for visualizing EICs of the same mass in
-               the different samples.">
-            <option value="No" selected="true">No</option>
-            <option value="Yes">Yes</option>
-        </param>
-        <param name="rtPlotMode" type="select" size="50" label="RT plot mode"
-               help="Select whether the EIC should be plotted against raw or corrected retention time">
-            <option value="raw" selected="true">Raw</option>
-            <option value="corrected">Corrected</option>
-        </param>
-    </inputs>
-    <outputs>
-        <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
-        <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
-    </outputs>
-    <tests>
-        <test>
-        </test>
-    </tests>
-    <help>
-
-.. class:: infomark
-
-This tool plots EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
-It produces an HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.
-
-.. image:: $PATH_TO_IMAGES/massEIC.png
-
-    </help>
-    <citations>
-        <citation type="doi">10.1021/ac051437y</citation> <!-- example
-        see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
-        -->
-    </citations>
-</tool>
\ No newline at end of file
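
As a numeric illustration of the m/z tolerance parameter above: the R script derives the window as mz ± mz * ppm / 1e6, so a 5 ppm tolerance around the m/z 614.0029 seen in the earlier example amounts to roughly ±0.0031:

    mz  <- 614.0029
    ppm <- 5
    c(low = mz - mz * ppm / 1e6, high = mz + mz * ppm / 1e6)
    #      low     high
    # 613.9998 614.0060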