comparison export_to_metexp_tabular.py @ 21:19d8fd10248e

* Added interface to METEXP data store, including tool to fire queries in batch mode * Improved quantification output files of MsClust, a.o. sorting mass list based on intensity (last two columns of quantification files) * Added Molecular Mass calculation method
author pieter.lukasse@wur.nl
date Wed, 05 Mar 2014 17:20:11 +0100
parents 9d5f4f5f764b
children
comparison
equal deleted inserted replaced
20:24fb75fedee0 21:19d8fd10248e
3 ''' 3 '''
4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust 4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
5 into a tabular file that can be uploaded to the MetExp database. 5 into a tabular file that can be uploaded to the MetExp database.
6 6
7 RankFilter, CasLookup are already combined by combine_output.py so here we will use 7 RankFilter, CasLookup are already combined by combine_output.py so here we will use
8 this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust 8 this result. Furthermore here one of the MsClust
9 quantification files are to be combined with combine_output.py result as well. 9 quantification files containing the respective spectra details are to be combined as well.
10 10
11 Extra calculations performed: 11 Extra calculations performed:
12 - The column MW is also added here and is derived from the column FORMULA found 12 - The column MW is also added here and is derived from the column FORMULA found
13 in combine_output.py result. 13 in RankFilter, CasLookup combined result.
14 14
15 So in total here we merge 3 files and calculate one new column. 15 So in total here we merge 2 files and calculate one new column.
16 ''' 16 '''
17 17 from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
18 import csv 18 import csv
19 import re
19 import sys 20 import sys
20 from collections import OrderedDict 21 from collections import OrderedDict
21 22
22 __author__ = "Pieter Lukasse" 23 __author__ = "Pieter Lukasse"
23 __contact__ = "pieter.lukasse@wur.nl" 24 __contact__ = "pieter.lukasse@wur.nl"
38 return output 39 return output
39 40
40 ONE_TO_ONE = 'one_to_one' 41 ONE_TO_ONE = 'one_to_one'
41 N_TO_ONE = 'n_to_one' 42 N_TO_ONE = 'n_to_one'
42 43
43 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE): 44 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
44 ''' 45 '''
45 Merges data from both input dictionaries based on the link fields. This method will 46 Merges data from both input dictionaries based on the link fields. This method will
46 build up a new list containing the merged hits as the items. 47 build up a new list containing the merged hits as the items.
47 @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) 48 @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
48 @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) 49 @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
49 ''' 50 '''
50 # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values): 51 # TODO test for correct input files -> same link_field values should be there
52 # (test at least number of unique link_field values):
51 # 53 #
52 # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): 54 # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
53 # raise Exception('input files should have the same nr of key values ') 55 # raise Exception('input files should have the same nr of key values ')
54 56
55 57
62 processed[link_field_set1_value] = link_field_set1_value 64 processed[link_field_set1_value] = link_field_set1_value
63 65
64 # Get the indices for current link_field_set1_value in both data-structures for proper matching 66 # Get the indices for current link_field_set1_value in both data-structures for proper matching
65 set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] 67 set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
66 set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] 68 set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
67 69 # Validation :
68 70 if len(set2index) == 0:
71 # means that corresponding data could not be found in set2, then throw error
72 raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
73 link_field_set1_value + " only found in first dataset. ")
69 74
70 merged_hits = [] 75 merged_hits = []
71 # Combine hits 76 # Combine hits
72 for hit in xrange(len(set1index)): 77 for hit in xrange(len(set1index)):
73 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 78 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
74 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its 79 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
75 # corresponding value in the rankfilter or caslookup tables; i.e. 80 # corresponding value in the sets; i.e.
76 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute 81 # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
77 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) 82 # represented by "key".
83 # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
84 # So set1[x][set1index[n]] = set1.attributeX.instanceN
85 #
78 # It just ensures the entry is made available as a plain named array for easy access. 86 # It just ensures the entry is made available as a plain named array for easy access.
79 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) 87 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
80 if relation_type == ONE_TO_ONE : 88 if relation_type == ONE_TO_ONE :
81 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) 89 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
82 else: 90 else:
83 # is N to 1: 91 # is N to 1:
84 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) 92 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
85 93
86 merged_hit = merge_function(rf_record, cl_record) 94 merged_hit = merge_function(rf_record, cl_record, metadata)
87 merged_hits.append(merged_hit) 95 merged_hits.append(merged_hit)
88 96
89 merged.append(merged_hits) 97 merged.append(merged_hits)
90 98
91 return merged, len(set1index) 99 return merged, len(set1index)
101 else: 109 else:
102 return False 110 return False
103 111
104 112
105 113
106 def _merge_records(rank_caslookup_combi, msclust_quant_record): 114 def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
107 ''' 115 '''
108 Combines single records from both the RankFilter+CasLookup combi file and from MsClust file 116 Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
109 117
110 @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) 118 @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
111 @param msclust_quant_record: msclust quantification + spectrum record 119 @param msclust_quant_record: msclust quantification + spectrum record
112 ''' 120 '''
113 i = 0
114 record = [] 121 record = []
115 for column in rank_caslookup_combi: 122 for column in rank_caslookup_combi:
116 record.append(rank_caslookup_combi[column]) 123 record.append(rank_caslookup_combi[column])
117 i += 1
118 124
119 for column in msclust_quant_record: 125 for column in msclust_quant_record:
120 record.append(msclust_quant_record[column]) 126 record.append(msclust_quant_record[column])
121 i += 1 127
128 for column in metadata:
129 record.append(metadata[column])
130
131 # add MOLECULAR MASS (MM)
132 molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
133 # limit to two decimals:
134 record.append("{0:.2f}".format(molecular_mass))
135
136 # add MOLECULAR WEIGHT (MW) - TODO - calculate this
137 record.append('0.0')
138
139 # level of identification and Location of reference standard
140 record.append('0')
141 record.append('')
122 142
123 return record 143 return record
124 144
125 145
126 146 def get_molecular_mass(formula):
127 147 '''
128 def _save_data(data, headers, nhits, out_csv): 148 Calculates the molecular mass (MM).
149 E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
150 '''
151
152 # Each element is represented by a capital letter, followed optionally by
153 # lower case, with one or more digits as for how many elements:
154 element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
155
156 total_mass = 0
157 for (element_name, count) in element_pattern.findall(formula):
158 if count == "":
159 count = 1
160 else:
161 count = int(count)
162 element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?)
163 total_mass += element_mass * count
164
165 return total_mass
166
167
168
169 def _save_data(data, headers, out_csv):
129 ''' 170 '''
130 Writes tab-separated data to file 171 Writes tab-separated data to file
131 @param data: dictionary containing merged dataset 172 @param data: dictionary containing merged dataset
132 @param out_csv: output csv file 173 @param out_csv: output csv file
133 ''' 174 '''
137 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") 178 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
138 179
139 # Write headers 180 # Write headers
140 output_single_handle.writerow(headers) 181 output_single_handle.writerow(headers)
141 182
142 # Write one line for each centrotype 183 # Write
143 for centrotype_idx in xrange(len(data)): 184 for item_idx in xrange(len(data)):
144 for hit in data[centrotype_idx]: 185 for hit in data[item_idx]:
145 output_single_handle.writerow(hit) 186 output_single_handle.writerow(hit)
146 187
188
189 def _get_map_for_elements_and_masses(elements_and_masses):
190 '''
191 This method will read out the column 'Chemical symbol' and make a map
192 of this, storing the column 'Relative atomic mass' as its value
193 '''
194 resultMap = {}
195 index = 0
196 for entry in elements_and_masses['Chemical symbol']:
197 resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
198 index += 1
199
200 return resultMap
201
202
203 def init_elements_and_masses_map():
204 '''
205 Initializes the lookup map containing the elements and their respective masses
206 '''
207 elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
208 global elements_and_masses_map
209 elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
210
147 211
148 def main(): 212 def main():
149 ''' 213 '''
150 Combine Output main function 214 Combine Output main function
151 215
154 quantification files are to be combined with combine_output.py result as well. 218 quantification files are to be combined with combine_output.py result as well.
155 ''' 219 '''
156 rankfilter_and_caslookup_combined_file = sys.argv[1] 220 rankfilter_and_caslookup_combined_file = sys.argv[1]
157 msclust_quantification_and_spectra_file = sys.argv[2] 221 msclust_quantification_and_spectra_file = sys.argv[2]
158 output_csv = sys.argv[3] 222 output_csv = sys.argv[3]
223 # metadata
224 metadata = OrderedDict()
225 metadata['organism'] = sys.argv[4]
226 metadata['tissue'] = sys.argv[5]
227 metadata['experiment_name'] = sys.argv[6]
228 metadata['user_name'] = sys.argv[7]
229 metadata['column_type'] = sys.argv[8]
159 230
160 # Read RankFilter and CasLookup output files 231 # Read RankFilter and CasLookup output files
161 rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) 232 rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
162 msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') 233 msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
163 234
235 # Read elements and masses to use for the MW/MM calculation :
236 init_elements_and_masses_map()
237
164 merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 238 merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
165 msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE) 239 msclust_quantification_and_spectra, 'centrotype',
166 headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() 240 _compare_records, _merge_records, metadata,
167 _save_data(merged, headers, nhits, output_csv) 241 N_TO_ONE)
242 headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
243 _save_data(merged, headers, output_csv)
168 244
169 245
170 if __name__ == '__main__': 246 if __name__ == '__main__':
171 main() 247 main()