diff export_to_metexp_tabular.py @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children 19d8fd10248e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup output are already combined by combine_output.py, so here we
+use that result. The MsClust spectra file (.MSP) and one of the MsClust quantification
+files are then combined with the combine_output.py result as well.
+
+Extra calculations performed:
+- The column MW is added here, derived from the column FORMULA found in the
+  combine_output.py result (see the sketch below).
+
+So in total we merge 3 files and calculate one new column.
+'''
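+
+# A minimal sketch of the MW-from-FORMULA calculation described above (hypothetical
+# helper, assuming simple formulas such as 'C6H12O6', without brackets or isotopes,
+# and standard atomic weights):
+#
+#     import re
+#
+#     ATOMIC_WEIGHTS = {'C': 12.011, 'H': 1.008, 'N': 14.007,
+#                       'O': 15.999, 'P': 30.974, 'S': 32.06}
+#
+#     def _get_molecular_weight(formula):
+#         mw = 0.0
+#         for element, count in re.findall(r'([A-Z][a-z]?)([0-9]*)', formula):
+#             if element:
+#                 mw += ATOMIC_WEIGHTS[element] * (int(count) if count else 1)
+#         return mw
+#
+#     _get_molecular_weight('C6H12O6')  # -> approx. 180.16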
+
+import csv
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+    '''
+    Generic method to parse a delimited file, returning a dictionary with one list
+    per column, keyed on the column name.
+    @param in_csv: input filename to be parsed
+    @param delim: field delimiter (tab by default)
+    '''
+    with open(in_csv, 'rU') as input_file:
+        data = list(csv.reader(input_file, delimiter=delim))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
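+
+# A minimal usage sketch (hypothetical file contents, not part of this tool):
+# given a tab-separated file 'example.txt' containing
+#
+#     Centrotype  FORMULA
+#     1           C6H12O6
+#     1           C5H10O5
+#
+# _process_data('example.txt') returns
+#
+#     OrderedDict([('Centrotype', ['1', '1']), ('FORMULA', ['C6H12O6', 'C5H10O5'])])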
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+    '''
+    Merges data from both input dictionaries based on the link fields. This method
+    builds up a new list containing the merged hits as its items.
+    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+    @param link_field_set1: name of the link field/column in set1
+    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+    @param link_field_set2: name of the link field/column in set2
+    @param compare_function: function used to match link field values of both sets
+    @param merge_function: function used to combine one record from each set
+    @param relation_type: ONE_TO_ONE or N_TO_ONE relation between set1 and set2 records
+    '''
+    # TODO test for correct input files -> same link_field values should be there
+    # (test at least the number of unique link_field values):
+    #
+    # if len(set1[link_field_set1]) != len(set2[link_field_set2]):
+    #     raise Exception('input files should have the same nr of key values')
+
+    merged = []
+    processed = {}
+    nhits = 0
+    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if link_field_set1_value not in processed:
+            # keep track of processed items to not repeat them
+            processed[link_field_set1_value] = link_field_set1_value
+
+            # Get the indices for the current link_field_set1_value in both
+            # data structures for proper matching
+            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)]
+            # number of hits for this key (assumed equal for all keys)
+            nhits = len(set1index)
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(set1index)):
+                # Create records of hits to be merged: the "keys" are the attribute
+                # names, so the lines below create a new "dict" item with the same
+                # "keys"/attributes, each filled with its corresponding value from
+                # set1 or set2; i.e. set1[key] returns the list (size = nr of rows)
+                # with the values for the attribute "key", and set1index[hit] points
+                # to the row of the current hit. This makes each entry available as
+                # a plain named record for easy access.
+                rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+                if relation_type == ONE_TO_ONE:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+                else:
+                    # is N to 1: all set1 hits link to the single matching set2 record
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+
+                merged_hit = merge_function(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged, nhits
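+
+# A minimal usage sketch (hypothetical data): merging two column dictionaries
+# on their key columns with an N-to-1 relation:
+#
+#     set1 = OrderedDict([('Centrotype', ['1', '1']), ('Name', ['a', 'b'])])
+#     set2 = OrderedDict([('centrotype', ['1']), ('Area', ['100'])])
+#     merged, nhits = _merge_data(set1, 'Centrotype', set2, 'centrotype',
+#                                 _compare_records, _merge_records, N_TO_ONE)
+#     # merged == [[['1', 'a', '1', '100'], ['1', 'b', '1', '100']]], nhits == 2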
+
+
+def _compare_records(key1, key2):
+    '''
+    In this case the compare method is really simple, as both keys are expected
+    to contain the same value when the records match.
+    '''
+    return key1 == key2
+
+
+def _merge_records(rank_caslookup_combi, msclust_quant_record):
+    '''
+    Combines single records from the RankFilter+CasLookup combined file and from
+    the MsClust file.
+
+    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+    @param msclust_quant_record: msclust quantification + spectrum record
+    '''
+    record = []
+    for column in rank_caslookup_combi:
+        record.append(rank_caslookup_combi[column])
+
+    for column in msclust_quant_record:
+        record.append(msclust_quant_record[column])
+
+    return record
+
+
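+# A sketch of what _merge_records produces for two hypothetical records:
+#
+#     rf = OrderedDict([('Centrotype', '1'), ('Name', 'a')])
+#     mq = OrderedDict([('centrotype', '1'), ('Area', '100')])
+#     _merge_records(rf, mq)  # -> ['1', 'a', '1', '100']
+
+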
+def _save_data(data, headers, nhits, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data: list of merged hits, one list of records per centrotype
+    @param headers: column names to write as the header line
+    @param nhits: number of hits per centrotype (currently unused here)
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+
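+# With the merged structure from the _merge_data sketch above, the output file
+# would contain the header line followed by one row per hit (hypothetical data,
+# tab-separated in the actual file):
+#
+#     Centrotype  Name  centrotype  Area
+#     1           a     1           100
+#     1           b     1           100
+
+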
+def main():
+    '''
+    Combine Output main function.
+
+    RankFilter and CasLookup output are already combined by combine_output.py, so
+    here we use that result. The MsClust spectra file (.MSP) and one of the MsClust
+    quantification files are then combined with the combine_output.py result as well.
+    '''
+    rankfilter_and_caslookup_combined_file = sys.argv[1]
+    msclust_quantification_and_spectra_file = sys.argv[2]
+    output_csv = sys.argv[3]
+
+    # Read the combined RankFilter+CasLookup file (tab-separated) and the
+    # MsClust quantification + spectra file (comma-separated)
+    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+    
+    merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 
+                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
+    _save_data(merged, headers, nhits, output_csv)
+
+
+if __name__ == '__main__':
+    main()
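+
+
+# Example invocation from the command line (file names are illustrative; the
+# first input is tab-separated, the second comma-separated):
+#
+#     python export_to_metexp_tabular.py \
+#         rankfilter_and_caslookup_combined.txt \
+#         msclust_quantification_and_spectra.csv \
+#         output.tabular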