diff export_to_metexp_tabular.py @ 21:19d8fd10248e

* Added interface to METEXP data store, including tool to fire queries in batch mode * Improved quantification output files of MsClust, a.o. sorting mass list based on intensity (last two columns of quantification files) * Added Molecular Mass calculation method
author pieter.lukasse@wur.nl
date Wed, 05 Mar 2014 17:20:11 +0100
parents 9d5f4f5f764b
children
line wrap: on
line diff
--- a/export_to_metexp_tabular.py	Tue Feb 11 12:29:50 2014 +0100
+++ b/export_to_metexp_tabular.py	Wed Mar 05 17:20:11 2014 +0100
@@ -5,17 +5,18 @@
 into a tabular file that can be uploaded to the MetExp database.
 
 RankFilter, CasLookup are already combined by combine_output.py so here we will use
-this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
-quantification files are to be combined with combine_output.py result as well. 
+this result. Furthermore here one of the MsClust
+quantification files containing the respective spectra details are to be combined as well. 
 
 Extra calculations performed:
 - The column MW is also added here and is derived from the column FORMULA found 
-  in combine_output.py result. 
+  in RankFilter, CasLookup combined result. 
   
-So in total here we merge 3 files and calculate one new column. 
+So in total here we merge 2 files and calculate one new column. 
 '''
-
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
 import csv
+import re
 import sys
 from collections import OrderedDict
 
@@ -40,14 +41,15 @@
 ONE_TO_ONE = 'one_to_one'
 N_TO_ONE = 'n_to_one'
 
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
     '''
     Merges data from both input dictionaries based on the link fields. This method will
     build up a new list containing the merged hits as the items. 
     @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
     @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
     '''
-    # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+    # TODO test for correct input files -> same link_field values should be there 
+    # (test at least number of unique link_field values):
     #
     # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
     #    raise Exception('input files should have the same nr of key values  ')
@@ -64,17 +66,23 @@
             # Get the indices for current link_field_set1_value in both data-structures for proper matching
             set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
             set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
-            
-            
+            # Validation :
+            if len(set2index) == 0:
+                # means that corresponding data could not be found in set2, then throw error
+                raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + 
+                                link_field_set1_value + " only found in first dataset. ")
             
             merged_hits = []
             # Combine hits
             for hit in xrange(len(set1index)):
                 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
                 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
-                # corresponding value in the rankfilter or caslookup tables; i.e. 
-                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
-                #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                # corresponding value in the sets; i.e. 
+                # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+                #                    represented by "key". 
+                # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+                # So set1[x][set1index[n]] = set1.attributeX.instanceN
+                #
                 # It just ensures the entry is made available as a plain named array for easy access.
                 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
                 if relation_type == ONE_TO_ONE :
@@ -83,7 +91,7 @@
                     # is N to 1:
                     cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
                 
-                merged_hit = merge_function(rf_record, cl_record)
+                merged_hit = merge_function(rf_record, cl_record, metadata)
                 merged_hits.append(merged_hit)
                 
             merged.append(merged_hits)
@@ -103,29 +111,62 @@
     
     
     
-def _merge_records(rank_caslookup_combi, msclust_quant_record):
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
     '''
     Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
     
     @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
     @param msclust_quant_record: msclust quantification + spectrum record
     '''
-    i = 0
     record = []
     for column in rank_caslookup_combi:
         record.append(rank_caslookup_combi[column])
-        i += 1
     
     for column in msclust_quant_record:
         record.append(msclust_quant_record[column])
-        i += 1
+        
+    for column in metadata:
+        record.append(metadata[column])
+        
+    # add MOLECULAR MASS (MM) 
+    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+    # limit to two decimals:    
+    record.append("{0:.2f}".format(molecular_mass))    
+        
+    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+    record.append('0.0')    
+    
+    # level of identification and Location of reference standard
+    record.append('0')
+    record.append('')    
         
     return record
 
 
-
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM). 
+    E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
+    '''
+    
+    # Each element is represented by a capital letter, followed optionally by 
+    # lower case, with one or more digits as for how many elements:
+    element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
 
-def _save_data(data, headers, nhits, out_csv):
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        element_mass = float(elements_and_masses_map[element_name])  # "found: Python's built-in float type has double precision " (? check if really correct ?)
+        total_mass += element_mass * count
+        
+    return total_mass
+    
+    
+
+def _save_data(data, headers, out_csv):
     '''
     Writes tab-separated data to file
     @param data: dictionary containing merged dataset
@@ -139,12 +180,35 @@
     # Write headers
     output_single_handle.writerow(headers)
 
-    # Write one line for each centrotype
-    for centrotype_idx in xrange(len(data)):
-        for hit in data[centrotype_idx]:
+    # Write 
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
             output_single_handle.writerow(hit)
 
 
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    This method will read out the column 'Chemical symbol' and make a map 
+    of this, storing the column 'Relative atomic mass' as its value
+    '''
+    resultMap = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+        
+    return resultMap
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+    
+
 def main():
     '''
     Combine Output main function
@@ -156,15 +220,27 @@
     rankfilter_and_caslookup_combined_file = sys.argv[1]
     msclust_quantification_and_spectra_file = sys.argv[2]
     output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
 
     # Read RankFilter and CasLookup output files
     rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
     msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
     
+    # Read elements and masses to use for the MW/MM calculation :
+    init_elements_and_masses_map()
+    
     merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 
-                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
-    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
-    _save_data(merged, headers, nhits, output_csv)
+                                msclust_quantification_and_spectra, 'centrotype', 
+                                _compare_records, _merge_records, metadata,
+                                N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
 
 
 if __name__ == '__main__':