# HG changeset patch # User pieter.lukasse@wur.nl # Date 1394036411 -3600 # Node ID 19d8fd10248e3db74fd8851e3ed0c7bfd669bc8a # Parent 24fb75fedee053920481c44038a70b1d10f4cbb1 * Added interface to METEXP data store, including tool to fire queries in batch mode * Improved quantification output files of MsClust, a.o. sorting mass list based on intensity (last two columns of quantification files) * Added Molecular Mass calculation method diff -r 24fb75fedee0 -r 19d8fd10248e MsClust.jar Binary file MsClust.jar has changed diff -r 24fb75fedee0 -r 19d8fd10248e README.rst --- a/README.rst Tue Feb 11 12:29:50 2014 +0100 +++ b/README.rst Wed Mar 05 17:20:11 2014 +0100 @@ -19,6 +19,11 @@ ============== ====================================================================== Date Changes -------------- ---------------------------------------------------------------------- +March 2014 * Added interface to METEXP data store, including tool to fire + queries in batch mode + * Improved quantification output files of MsClust, a.o. sorting + mass list based on intensity (last two columns of quantification + files) January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a single package (this package) * integration with METEXP software (data store for metabolomics diff -r 24fb75fedee0 -r 19d8fd10248e combine_output.py --- a/combine_output.py Tue Feb 11 12:29:50 2014 +0100 +++ b/combine_output.py Wed Mar 05 17:20:11 2014 +0100 @@ -155,12 +155,16 @@ @param data: dictionary containing merged dataset @param out_csv: output csv file ''' - header = ['Centrotype', + # Columns we don't repeat: + header_part1 = ['Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)', 'nr. Peaks', - 'R.T.', + 'R.T.'] + # These are the headers/columns we repeat in case of + # combining hits in one line (see alternative_headers method below): + header_part2 = [ 'Name', 'FORMULA', 'Library', @@ -190,13 +194,21 @@ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") # Write headers - output_single_handle.writerow(header) - output_multi_handle.writerow(header * nhits) + output_single_handle.writerow(header_part1 + header_part2) + output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) # Combine all hits for each centrotype into one line line = [] for centrotype_idx in xrange(len(data)): + i = 0 for hit in data[centrotype_idx]: - line.extend(hit) + if i==0: + line.extend(hit) + else: + line.extend(hit[6:]) + i = i+1 + # small validation (if error, it is a programming error): + if i > nhits: + raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) output_multi_handle.writerow(line) line = [] @@ -205,6 +217,17 @@ for hit in data[centrotype_idx]: output_single_handle.writerow(hit) +def alternative_headers(header_part2, nr_alternative_hits): + ''' + This method will iterate over the header names and add the string 'ALT#_' before each, + where # is the number of the alternative, according to number of alternative hits we want to add + to final csv/tsv + ''' + result = [] + for i in xrange(nr_alternative_hits): + for header_name in header_part2: + result.append("ALT" + str(i+1) + "_" + header_name) + return result def main(): ''' diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.py --- a/export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100 +++ b/export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100 @@ -5,17 +5,18 @@ into a tabular file that can be uploaded to the MetExp database. RankFilter, CasLookup are already combined by combine_output.py so here we will use -this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust -quantification files are to be combined with combine_output.py result as well. +this result. Furthermore here one of the MsClust +quantification files containing the respective spectra details are to be combined as well. Extra calculations performed: - The column MW is also added here and is derived from the column FORMULA found - in combine_output.py result. + in RankFilter, CasLookup combined result. -So in total here we merge 3 files and calculate one new column. +So in total here we merge 2 files and calculate one new column. ''' - +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 import csv +import re import sys from collections import OrderedDict @@ -40,14 +41,15 @@ ONE_TO_ONE = 'one_to_one' N_TO_ONE = 'n_to_one' -def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE): +def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): ''' Merges data from both input dictionaries based on the link fields. This method will build up a new list containing the merged hits as the items. @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) ''' - # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values): + # TODO test for correct input files -> same link_field values should be there + # (test at least number of unique link_field values): # # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): # raise Exception('input files should have the same nr of key values ') @@ -64,17 +66,23 @@ # Get the indices for current link_field_set1_value in both data-structures for proper matching set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] - - + # Validation : + if len(set2index) == 0: + # means that corresponding data could not be found in set2, then throw error + raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + + link_field_set1_value + " only found in first dataset. ") merged_hits = [] # Combine hits for hit in xrange(len(set1index)): # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its - # corresponding value in the rankfilter or caslookup tables; i.e. - # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute - # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) + # corresponding value in the sets; i.e. + # set1[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". + # set1index[hit] => points to the row nr=hit (hit is a rownr/index) + # So set1[x][set1index[n]] = set1.attributeX.instanceN + # # It just ensures the entry is made available as a plain named array for easy access. rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) if relation_type == ONE_TO_ONE : @@ -83,7 +91,7 @@ # is N to 1: cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) - merged_hit = merge_function(rf_record, cl_record) + merged_hit = merge_function(rf_record, cl_record, metadata) merged_hits.append(merged_hit) merged.append(merged_hits) @@ -103,29 +111,62 @@ -def _merge_records(rank_caslookup_combi, msclust_quant_record): +def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): ''' Combines single records from both the RankFilter+CasLookup combi file and from MsClust file @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) @param msclust_quant_record: msclust quantification + spectrum record ''' - i = 0 record = [] for column in rank_caslookup_combi: record.append(rank_caslookup_combi[column]) - i += 1 for column in msclust_quant_record: record.append(msclust_quant_record[column]) - i += 1 + + for column in metadata: + record.append(metadata[column]) + + # add MOLECULAR MASS (MM) + molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) + # limit to two decimals: + record.append("{0:.2f}".format(molecular_mass)) + + # add MOLECULAR WEIGHT (MW) - TODO - calculate this + record.append('0.0') + + # level of identification and Location of reference standard + record.append('0') + record.append('') return record - +def get_molecular_mass(formula): + ''' + Calculates the molecular mass (MM). + E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O + ''' + + # Each element is represented by a capital letter, followed optionally by + # lower case, with one or more digits as for how many elements: + element_pattern = re.compile("([A-Z][a-z]?)(\d*)") -def _save_data(data, headers, nhits, out_csv): + total_mass = 0 + for (element_name, count) in element_pattern.findall(formula): + if count == "": + count = 1 + else: + count = int(count) + element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) + total_mass += element_mass * count + + return total_mass + + + +def _save_data(data, headers, out_csv): ''' Writes tab-separated data to file @param data: dictionary containing merged dataset @@ -139,12 +180,35 @@ # Write headers output_single_handle.writerow(headers) - # Write one line for each centrotype - for centrotype_idx in xrange(len(data)): - for hit in data[centrotype_idx]: + # Write + for item_idx in xrange(len(data)): + for hit in data[item_idx]: output_single_handle.writerow(hit) +def _get_map_for_elements_and_masses(elements_and_masses): + ''' + This method will read out the column 'Chemical symbol' and make a map + of this, storing the column 'Relative atomic mass' as its value + ''' + resultMap = {} + index = 0 + for entry in elements_and_masses['Chemical symbol']: + resultMap[entry] = elements_and_masses['Relative atomic mass'][index] + index += 1 + + return resultMap + + +def init_elements_and_masses_map(): + ''' + Initializes the lookup map containing the elements and their respective masses + ''' + elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) + global elements_and_masses_map + elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) + + def main(): ''' Combine Output main function @@ -156,15 +220,27 @@ rankfilter_and_caslookup_combined_file = sys.argv[1] msclust_quantification_and_spectra_file = sys.argv[2] output_csv = sys.argv[3] + # metadata + metadata = OrderedDict() + metadata['organism'] = sys.argv[4] + metadata['tissue'] = sys.argv[5] + metadata['experiment_name'] = sys.argv[6] + metadata['user_name'] = sys.argv[7] + metadata['column_type'] = sys.argv[8] # Read RankFilter and CasLookup output files rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') + # Read elements and masses to use for the MW/MM calculation : + init_elements_and_masses_map() + merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', - msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE) - headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() - _save_data(merged, headers, nhits, output_csv) + msclust_quantification_and_spectra, 'centrotype', + _compare_records, _merge_records, metadata, + N_TO_ONE) + headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] + _save_data(merged, headers, output_csv) if __name__ == '__main__': diff -r 24fb75fedee0 -r 19d8fd10248e export_to_metexp_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.xml Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,57 @@ + + Create tabular file for loading into METabolomics EXPlorer database + + export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result + $organism $tissue $experiment_name $user_name $column_type + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +Tool to combine output from the tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. + +RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use +this result. + +**Notes** + +Extra calculations performed: +- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result. + +So in total here we merge 2 files and calculate one new column. + + + + diff -r 24fb75fedee0 -r 19d8fd10248e msclust.xml --- a/msclust.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/msclust.xml Wed Mar 05 17:20:11 2014 +0100 @@ -1,4 +1,4 @@ - + Extracts fragmentation spectra from aligned data + +.. class:: infomark + +This tool will Query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass +and one for the formula of the respective identification. + + + + diff -r 24fb75fedee0 -r 19d8fd10248e rankfilterGCMS_tabular.xml --- a/rankfilterGCMS_tabular.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/rankfilterGCMS_tabular.xml Wed Mar 05 17:20:11 2014 +0100 @@ -3,7 +3,7 @@ rankfilter_GCMS/rankfilter.py $input_file + help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />