comparison combine_output.py @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children 19d8fd10248e
comparison
equal deleted inserted replaced
-1:000000000000 0:9d5f4f5f764b
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
5 '''
6
7 import csv
8 import re
9 import sys
10 import math
11 import pprint
12
13 __author__ = "Marcel Kempenaar"
14 __contact__ = "brs@nbic.nl"
15 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
16 __license__ = "MIT"
17
18 def _process_data(in_csv):
19 '''
20 Generic method to parse a tab-separated file returning a dictionary with named columns
21 @param in_csv: input filename to be parsed
22 '''
23 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
24 header = data.pop(0)
25 # Create dictionary with column name as key
26 output = {}
27 for index in xrange(len(header)):
28 output[header[index]] = [row[index] for row in data]
29 return output
30
31
32 def _merge_data(rankfilter, caslookup):
33 '''
34 Merges data from both input dictionaries based on the Centrotype field. This method will
35 build up a new list containing the merged hits as the items.
36 @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
37 @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
38 '''
39 # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
40 if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
41 raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
42
43 merged = []
44 processed = {}
45 for compound_id_idx in xrange(len(rankfilter['ID'])):
46 compound_id = rankfilter['ID'][compound_id_idx]
47 if not compound_id in processed :
48 # keep track of processed items to not repeat them
49 processed[compound_id] = compound_id
50 # get centrotype nr
51 centrotype = compound_id.split('-')[0]
52 # Get the indices for current compound ID in both data-structures for proper matching
53 rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
54 cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
55
56 merged_hits = []
57 # Combine hits
58 for hit in xrange(len(rindex)):
59 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
60 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
61 # corresponding value in the rankfilter or caslookup tables; i.e.
62 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
63 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
64 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
65 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
66
67 merged_hit = _add_hit(rf_record, cl_record)
68 merged_hits.append(merged_hit)
69
70 merged.append(merged_hits)
71
72 return merged, len(rindex)
73
74
75 def _add_hit(rankfilter, caslookup):
76 '''
77 Combines single records from both the RankFilter- and CasLookup-tools
78 @param rankfilter: record (dictionary) of one compound in the RankFilter output
79 @param caslookup: matching record (dictionary) of one compound in the CasLookup output
80 '''
81 # The ID in the RankFilter output contains the following 5 fields:
82 rf_id = rankfilter['ID'].split('-')
83 try:
84 name, formula = _remove_formula(rankfilter['Name'])
85 hit = [rf_id[0], # Centrotype
86 rf_id[1], # cent.Factor
87 rf_id[2], # scan nr
88 rf_id[3], # R.T. (umin)
89 rf_id[4], # nr. Peaks
90 # Appending other fields
91 rankfilter['R.T.'],
92 name,
93 caslookup['FORMULA'] if not formula else formula,
94 rankfilter['Library'].strip(),
95 rankfilter['CAS'].strip(),
96 rankfilter['Forward'],
97 rankfilter['Reverse'],
98 ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
99 rankfilter['RIexp'],
100 caslookup['RI'],
101 rankfilter['RIsvr'],
102 # Calculate absolute differences
103 math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
104 math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
105 caslookup['Regression.Column.Name'],
106 caslookup['min'],
107 caslookup['max'],
108 caslookup['nr.duplicates'],
109 caslookup['Column.phase.type'],
110 caslookup['Column.name'],
111 rankfilter['Rank'],
112 rankfilter['%rel.err'],
113 rankfilter['Synonyms']]
114 except KeyError as error:
115 print "Problem reading in data from input file(s):\n",
116 print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
117 print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
118 raise error
119
120 return hit
121
122
123 def _remove_formula(name):
124 '''
125 The RankFilter Name field often contains the Formula as well, this function removes it from the Name
126 @param name: complete name of the compound from the RankFilter output
127 '''
128 name = name.split()
129 poss_formula = name[-1]
130 match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
131 if match:
132 return ' '.join(name[:-1]), poss_formula
133 else:
134 return ' '.join(name), False
135
136
137 def _get_default_caslookup():
138 '''
139 The Cas Lookup tool might not have found all compounds in the library searched,
140 this default dict will be used to combine with the Rank Filter output
141 '''
142 return {'FORMULA': 'N/A',
143 'RI': '0.0',
144 'Regression.Column.Name': 'None',
145 'min': '0.0',
146 'max': '0.0',
147 'nr.duplicates': '0',
148 'Column.phase.type': 'N/A',
149 'Column.name': 'N/A'}
150
151
152 def _save_data(data, nhits, out_csv_single, out_csv_multi):
153 '''
154 Writes tab-separated data to file
155 @param data: dictionary containing merged dataset
156 @param out_csv: output csv file
157 '''
158 header = ['Centrotype',
159 'cent.Factor',
160 'scan nr.',
161 'R.T. (umin)',
162 'nr. Peaks',
163 'R.T.',
164 'Name',
165 'FORMULA',
166 'Library',
167 'CAS',
168 'Forward',
169 'Reverse',
170 'Avg. (Forward, Reverse)',
171 'RIexp',
172 'RI',
173 'RIsvr',
174 'RIexp - RIsvr',
175 'RI - RIexp',
176 'Regression.Column.Name',
177 'min',
178 'max',
179 'nr.duplicates',
180 'Column.phase.type',
181 'Column.name',
182 'Rank',
183 '%rel.err',
184 'Synonyms']
185
186 # Open output file for writing
187 outfile_single_handle = open(out_csv_single, 'wb')
188 outfile_multi_handle = open(out_csv_multi, 'wb')
189 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
190 output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
191
192 # Write headers
193 output_single_handle.writerow(header)
194 output_multi_handle.writerow(header * nhits)
195 # Combine all hits for each centrotype into one line
196 line = []
197 for centrotype_idx in xrange(len(data)):
198 for hit in data[centrotype_idx]:
199 line.extend(hit)
200 output_multi_handle.writerow(line)
201 line = []
202
203 # Write one line for each centrotype
204 for centrotype_idx in xrange(len(data)):
205 for hit in data[centrotype_idx]:
206 output_single_handle.writerow(hit)
207
208
209 def main():
210 '''
211 Combine Output main function
212 It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
213 NB: the caslookup_result_file will typically have fewer lines than
214 rankfilter_result_file, so the merge has to consider this as well. The final file
215 should have the same nr of lines as rankfilter_result_file.
216 '''
217 rankfilter_result_file = sys.argv[1]
218 caslookup_result_file = sys.argv[2]
219 output_single_csv = sys.argv[3]
220 output_multi_csv = sys.argv[4]
221
222 # Read RankFilter and CasLookup output files
223 rankfilter = _process_data(rankfilter_result_file)
224 caslookup = _process_data(caslookup_result_file)
225 merged, nhits = _merge_data(rankfilter, caslookup)
226 _save_data(merged, nhits, output_single_csv, output_multi_csv)
227
228
229 if __name__ == '__main__':
230 main()