# HG changeset patch
# User pieter.lukasse@wur.nl
# Date 1394036411 -3600
# Node ID 19d8fd10248e3db74fd8851e3ed0c7bfd669bc8a
# Parent 24fb75fedee053920481c44038a70b1d10f4cbb1
* Added interface to METEXP data store, including tool to fire queries in batch mode
* Improved quantification output files of MsClust, a.o. sorting
mass list based on intensity (last two columns of quantification
files)
* Added Molecular Mass calculation method
diff -r 24fb75fedee0 -r 19d8fd10248e MsClust.jar
Binary file MsClust.jar has changed
diff -r 24fb75fedee0 -r 19d8fd10248e README.rst
--- a/README.rst Tue Feb 11 12:29:50 2014 +0100
+++ b/README.rst Wed Mar 05 17:20:11 2014 +0100
@@ -19,6 +19,11 @@
============== ======================================================================
Date Changes
-------------- ----------------------------------------------------------------------
+March 2014 * Added interface to METEXP data store, including tool to fire
+ queries in batch mode
+ * Improved quantification output files of MsClust, a.o. sorting
+ mass list based on intensity (last two columns of quantification
+ files)
January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a
single package (this package)
* integration with METEXP software (data store for metabolomics
diff -r 24fb75fedee0 -r 19d8fd10248e combine_output.py
--- a/combine_output.py Tue Feb 11 12:29:50 2014 +0100
+++ b/combine_output.py Wed Mar 05 17:20:11 2014 +0100
@@ -155,12 +155,16 @@
@param data: dictionary containing merged dataset
@param out_csv: output csv file
'''
- header = ['Centrotype',
+ # Columns we don't repeat:
+ header_part1 = ['Centrotype',
'cent.Factor',
'scan nr.',
'R.T. (umin)',
'nr. Peaks',
- 'R.T.',
+ 'R.T.']
+ # These are the headers/columns we repeat in case of
+ # combining hits in one line (see alternative_headers method below):
+ header_part2 = [
'Name',
'FORMULA',
'Library',
@@ -190,13 +194,21 @@
output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
# Write headers
- output_single_handle.writerow(header)
- output_multi_handle.writerow(header * nhits)
+ output_single_handle.writerow(header_part1 + header_part2)
+ output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
# Combine all hits for each centrotype into one line
line = []
for centrotype_idx in xrange(len(data)):
+ i = 0
for hit in data[centrotype_idx]:
- line.extend(hit)
+ if i==0:
+ line.extend(hit)
+ else:
+ line.extend(hit[6:])
+ i = i+1
+ # small validation (if error, it is a programming error):
+ if i > nhits:
+ raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx)
output_multi_handle.writerow(line)
line = []
@@ -205,6 +217,17 @@
for hit in data[centrotype_idx]:
output_single_handle.writerow(hit)
+def alternative_headers(header_part2, nr_alternative_hits):
+ '''
+ This method will iterate over the header names and add the string 'ALT#_' before each,
+ where # is the number of the alternative, according to number of alternative hits we want to add
+ to final csv/tsv
+ '''
+ result = []
+ for i in xrange(nr_alternative_hits):
+ for header_name in header_part2:
+ result.append("ALT" + str(i+1) + "_" + header_name)
+ return result
def main():
'''
diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.py
--- a/export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100
+++ b/export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100
@@ -5,17 +5,18 @@
into a tabular file that can be uploaded to the MetExp database.
RankFilter, CasLookup are already combined by combine_output.py so here we will use
-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
-quantification files are to be combined with combine_output.py result as well.
+this result. Furthermore here one of the MsClust
+quantification files containing the respective spectra details are to be combined as well.
Extra calculations performed:
- The column MW is also added here and is derived from the column FORMULA found
- in combine_output.py result.
+ in RankFilter, CasLookup combined result.
-So in total here we merge 3 files and calculate one new column.
+So in total here we merge 2 files and calculate one new column.
'''
-
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
import csv
+import re
import sys
from collections import OrderedDict
@@ -40,14 +41,15 @@
ONE_TO_ONE = 'one_to_one'
N_TO_ONE = 'n_to_one'
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
'''
Merges data from both input dictionaries based on the link fields. This method will
build up a new list containing the merged hits as the items.
@param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
@param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
'''
- # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+ # TODO test for correct input files -> same link_field values should be there
+ # (test at least number of unique link_field values):
#
# if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
# raise Exception('input files should have the same nr of key values ')
@@ -64,17 +66,23 @@
# Get the indices for current link_field_set1_value in both data-structures for proper matching
set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
-
-
+ # Validation :
+ if len(set2index) == 0:
+ # means that corresponding data could not be found in set2, then throw error
+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+ link_field_set1_value + " only found in first dataset. ")
merged_hits = []
# Combine hits
for hit in xrange(len(set1index)):
# Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
# is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
- # corresponding value in the rankfilter or caslookup tables; i.e.
- # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
- # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+ # corresponding value in the sets; i.e.
+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+ # represented by "key".
+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+ # So set1[x][set1index[n]] = set1.attributeX.instanceN
+ #
# It just ensures the entry is made available as a plain named array for easy access.
rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
if relation_type == ONE_TO_ONE :
@@ -83,7 +91,7 @@
# is N to 1:
cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
- merged_hit = merge_function(rf_record, cl_record)
+ merged_hit = merge_function(rf_record, cl_record, metadata)
merged_hits.append(merged_hit)
merged.append(merged_hits)
@@ -103,29 +111,62 @@
-def _merge_records(rank_caslookup_combi, msclust_quant_record):
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
'''
Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
@param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
@param msclust_quant_record: msclust quantification + spectrum record
'''
- i = 0
record = []
for column in rank_caslookup_combi:
record.append(rank_caslookup_combi[column])
- i += 1
for column in msclust_quant_record:
record.append(msclust_quant_record[column])
- i += 1
+
+ for column in metadata:
+ record.append(metadata[column])
+
+ # add MOLECULAR MASS (MM)
+ molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+ # limit to two decimals:
+ record.append("{0:.2f}".format(molecular_mass))
+
+ # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+ record.append('0.0')
+
+ # level of identification and Location of reference standard
+ record.append('0')
+ record.append('')
return record
-
+def get_molecular_mass(formula):
+ '''
+ Calculates the molecular mass (MM).
+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
+ '''
+
+ # Each element is represented by a capital letter, followed optionally by
+ # lower case, with one or more digits as for how many elements:
+ element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
-def _save_data(data, headers, nhits, out_csv):
+ total_mass = 0
+ for (element_name, count) in element_pattern.findall(formula):
+ if count == "":
+ count = 1
+ else:
+ count = int(count)
+ element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?)
+ total_mass += element_mass * count
+
+ return total_mass
+
+
+
+def _save_data(data, headers, out_csv):
'''
Writes tab-separated data to file
@param data: dictionary containing merged dataset
@@ -139,12 +180,35 @@
# Write headers
output_single_handle.writerow(headers)
- # Write one line for each centrotype
- for centrotype_idx in xrange(len(data)):
- for hit in data[centrotype_idx]:
+ # Write
+ for item_idx in xrange(len(data)):
+ for hit in data[item_idx]:
output_single_handle.writerow(hit)
+def _get_map_for_elements_and_masses(elements_and_masses):
+ '''
+ This method will read out the column 'Chemical symbol' and make a map
+ of this, storing the column 'Relative atomic mass' as its value
+ '''
+ resultMap = {}
+ index = 0
+ for entry in elements_and_masses['Chemical symbol']:
+ resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+ index += 1
+
+ return resultMap
+
+
+def init_elements_and_masses_map():
+ '''
+ Initializes the lookup map containing the elements and their respective masses
+ '''
+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+ global elements_and_masses_map
+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
def main():
'''
Combine Output main function
@@ -156,15 +220,27 @@
rankfilter_and_caslookup_combined_file = sys.argv[1]
msclust_quantification_and_spectra_file = sys.argv[2]
output_csv = sys.argv[3]
+ # metadata
+ metadata = OrderedDict()
+ metadata['organism'] = sys.argv[4]
+ metadata['tissue'] = sys.argv[5]
+ metadata['experiment_name'] = sys.argv[6]
+ metadata['user_name'] = sys.argv[7]
+ metadata['column_type'] = sys.argv[8]
# Read RankFilter and CasLookup output files
rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+ # Read elements and masses to use for the MW/MM calculation :
+ init_elements_and_masses_map()
+
merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
- msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
- headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
- _save_data(merged, headers, nhits, output_csv)
+ msclust_quantification_and_spectra, 'centrotype',
+ _compare_records, _merge_records, metadata,
+ N_TO_ONE)
+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
+ _save_data(merged, headers, output_csv)
if __name__ == '__main__':
diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml Wed Mar 05 17:20:11 2014 +0100
@@ -0,0 +1,57 @@
+
+ Create tabular file for loading into METabolomics EXPlorer database
+
+ export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result
+ $organism $tissue $experiment_name $user_name $column_type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use
+this result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result.
+
+So in total here we merge 2 files and calculate one new column.
+
+
+
+
diff -r 24fb75fedee0 -r 19d8fd10248e msclust.xml
--- a/msclust.xml Tue Feb 11 12:29:50 2014 +0100
+++ b/msclust.xml Wed Mar 05 17:20:11 2014 +0100
@@ -1,4 +1,4 @@
-
+
Extracts fragmentation spectra from aligned data
+
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+
+
+
+
diff -r 24fb75fedee0 -r 19d8fd10248e rankfilterGCMS_tabular.xml
--- a/rankfilterGCMS_tabular.xml Tue Feb 11 12:29:50 2014 +0100
+++ b/rankfilterGCMS_tabular.xml Wed Mar 05 17:20:11 2014 +0100
@@ -3,7 +3,7 @@
rankfilter_GCMS/rankfilter.py $input_file
+ help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />