Mercurial > repos > marpiech > norwich_tools
comparison tools/rdock/bin/sdreport @ 0:bc03dbb6eb37 draft
planemo upload commit 781926e52355f7805db8d9a4ccafeff397b19aa4-dirty
author | marpiech |
---|---|
date | Mon, 29 Aug 2016 03:38:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bc03dbb6eb37 |
---|---|
1 #!/usr/bin/perl | |
2 # Produces text summaries of SD records * | |
3 # * | |
4 # Usage: sdreport [-l] [-t] [-s<DataField>] [sdFiles] * | |
5 # * | |
6 # -l output data fields for each record as processed * | |
7 # -t tabulate Rbt.Score.* fields for each record as processed * | |
8 # -s summarise data fields for all records * | |
9 # -s<DataField> summarise data fields for each unique value * | |
10 # of <DataField> * | |
11 # * | |
12 # Note: If -l or -t are combined with -s, the listing/table is output * | |
13 # within each ligand summary * | |
14 # If sdFiles not given, reads from standard input * | |
15 # Output is to standard output * | |
16 # Default is equivalent to sdfilter -l * | |
17 #******************************************************************************* | |
18 use lib "$ENV{'RBT_ROOT'}/lib"; | |
19 | |
20 use SDRecord; | |
21 | |
22 # Default field names and headings for rDock v4.00 scores | |
23 my @defaultFields = ('SCORE','SCORE.INTER','SCORE.INTRA','SCORE.RESTR','SCORE.INTER.VDW'); | |
24 my @defaultHeadings = ('TOTAL','INTER','INTRA','RESTR','VDW'); | |
25 | |
26 # Default field names and headings for normalised scores (score / #ligand heavy atoms) | |
27 my @defaultNormFields = ('SCORE.norm','SCORE.INTER.norm','SCORE.INTRA.norm','SCORE.RESTR.norm','SCORE.heavy'); | |
28 my @defaultNormHeadings = ('TOTALn','INTERn','INTRAn','RESTRn','#heavy'); | |
29 | |
30 # Default field names and headings for rDock v3.00 scores | |
31 my @defaultOldFields = ('Rbt.Score.Corrected','Rbt.Score.Inter','Rbt.Score.Intra','Rbt.Score.IntraMin','Rbt.Score.Restraint'); | |
32 my @defaultOldHeadings = ('TOTAL','INTER','INTRA','INTRAMIN','RESTR'); | |
33 | |
34 my $listFormat = 0; | |
35 my $summaryFormat = 0; | |
36 my $tableFormat = 0; | |
37 my $supplierFormat = 0; | |
38 my $csvFormat = 0; | |
39 my $summaryKey = "_TITLE1"; | |
40 my $oldFields = 0;#If true, use old default field names for component scores | |
41 my $headings = 1;#DM 21 Nov 2000, If false, don't output headings | |
42 my @outputFields; | |
43 my @outputHeadings; | |
44 | |
45 #Print help if no command line arguments | |
46 printHelpAndExit() if (scalar(@ARGV) == 0); | |
47 | |
48 #Parse command line arguments | |
49 my $nArgs = scalar(@ARGV); | |
50 | |
51 while (scalar(@ARGV)) { | |
52 my $arg = shift @ARGV; | |
53 printHelpAndExit() if ($arg eq '-h'); | |
54 if (index($arg,'-l')==0) { | |
55 $listFormat = 1; | |
56 } | |
57 elsif (index($arg,'-o')==0) { | |
58 $oldFields = 1; | |
59 } | |
60 # 7 Feb 2005 (DM) Option to report normalised aggregate scores | |
61 elsif (index($arg,'-norm')==0) { | |
62 $oldFields = 2; | |
63 } | |
64 elsif (index($arg,'-sup')==0) { | |
65 $supplierFormat = 1; | |
66 } | |
67 elsif (index($arg,'-s')==0) { | |
68 $summaryFormat = 1; | |
69 } | |
70 elsif (index($arg,'-id')==0) { | |
71 $summaryKey = substr($arg,3); | |
72 } | |
73 elsif (index($arg,'-nh')==0) { | |
74 $headings = 0; | |
75 } | |
76 elsif (index($arg,'-t')==0) { | |
77 $tableFormat = 1; | |
78 push @outputFields, split(',',substr($arg,2)); | |
79 push @outputHeadings, @outputFields; | |
80 } | |
81 elsif (index($arg,'-c')==0) { | |
82 $csvFormat = 1; | |
83 push @outputFields, split(',',substr($arg,2)); | |
84 push @outputHeadings, @outputFields; | |
85 } | |
86 else { | |
87 push @files,$arg;#must be a filename | |
88 } | |
89 } | |
90 push @ARGV,@files;#put the filenames back in the arg list | |
91 | |
92 #use -l if neither table format is specified | |
93 $listFormat = (!$tableFormat && !$csvFormat && !$supplierFormat); | |
94 | |
95 #If no output fields defined for -t or -c use the defaults (old or new) | |
96 if (scalar(@outputFields)==0) { | |
97 if ($oldFields == 1) { | |
98 @outputFields = @defaultOldFields; | |
99 @outputHeadings = @defaultOldHeadings; | |
100 } | |
101 elsif ($oldFields == 2) { | |
102 @outputFields = @defaultNormFields; | |
103 @outputHeadings = @defaultNormHeadings; | |
104 } | |
105 else { | |
106 @outputFields = @defaultFields; | |
107 @outputHeadings = @defaultHeadings; | |
108 } | |
109 } | |
110 | |
111 my $sdRec = new SDRecord; | |
112 my %summary;#hash of SDRecord lists, indexed by user-defined summary key | |
113 my %indexByName; | |
114 my %indexByNum; | |
115 my $idx = 0; | |
116 my $nRec = 0; | |
117 | |
118 #Column headings for tab and csv format | |
119 #DM 21 Nov 2000 - if $headings is false, then don't output the column headings | |
120 if ($tableFormat && !$summaryFormat && $headings) { | |
121 tabHeadings($summaryKey,@outputHeadings); | |
122 } | |
123 if ($csvFormat && !$summaryFormat && $headings) { | |
124 csvHeadings($summaryKey,@outputHeadings); | |
125 } | |
126 | |
127 #read records | |
128 while ($sdRec->readRec('LINES'=>1,'DATA'=>1)) { | |
129 $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field | |
130 if ($listFormat && !$summaryFormat) { | |
131 print "\n\nRECORD #$nRec\n"; | |
132 $sdRec->writeData(); | |
133 } | |
134 if ($tableFormat && !$summaryFormat) { | |
135 @recList = ($sdRec); | |
136 tabScores(\@recList,$summaryKey,@outputFields); | |
137 } | |
138 elsif ($csvFormat && !$summaryFormat) { | |
139 @recList = ($sdRec); | |
140 csvScores(\@recList,$summaryKey,@outputFields); | |
141 } | |
142 elsif ($supplierFormat && !$summaryFormat) { | |
143 @recList = ($sdRec); | |
144 tabulateSuppliers(\@recList,$summaryKey); | |
145 } | |
146 #add record to summary, indexed by user field value | |
147 #keep a separate index of unique values of user field values, | |
148 #indexed by number in the order the values first appear | |
149 #In this way we can maintain the sorted order of ligands | |
150 #when we come to print out the summary | |
151 if ($summaryFormat) { | |
152 my $summaryValue = $sdRec->{'DATA'}->{$summaryKey}; | |
153 #New data field value encountered | |
154 if (!defined $indexByName{$summaryValue}) { | |
155 $idx++;#incr the number of unique values | |
156 #keep cross-referenced indexes (field value <-> number) | |
157 $indexByName{$summaryValue} = $idx; | |
158 $indexByNum{$idx} = $summaryValue; | |
159 } | |
160 push @{$summary{$summaryValue}},$sdRec->copy('DATA'=>1); | |
161 } | |
162 } | |
163 | |
164 #Print summary if required | |
165 if ($summaryFormat) { | |
166 print "\n===============================================================\n"; | |
167 print "SUMMARY BY $summaryKey\n"; | |
168 foreach $idx (sort {$a<=>$b} keys %indexByNum) {#numberic sort of index numbers | |
169 my $key = $indexByNum{$idx};#look up corresponding data field value | |
170 print "\n===============================================================\n"; | |
171 print "$summaryKey = $key (#$idx)\n\n"; | |
172 writeSummary($summary{$key}); | |
173 if ($listFormat) { | |
174 print "\nIndividual records:\n"; | |
175 foreach $rec (@{$summary{$key}}) { | |
176 print "\n"; | |
177 $rec->writeData(); | |
178 } | |
179 } | |
180 if ($tableFormat) { | |
181 print "\nScores:\n"; | |
182 tabHeadings($summaryKey,@outputHeadings); | |
183 tabScores($summary{$key},$summaryKey,@outputFields); | |
184 } | |
185 if ($csvFormat) { | |
186 print "\nScores:\n"; | |
187 csvHeadings($summaryKey,@outputHeadings); | |
188 csvScores($summary{$key},$summaryKey,@outputFields); | |
189 } | |
190 } | |
191 } | |
192 | |
193 ############################################################## | |
194 # writes a summary to STDOUT for a list of SDRecords | |
195 # Input is a reference to an array of SDRecords | |
196 sub writeSummary { | |
197 my $recListRef = shift; | |
198 | |
199 #Extract the list of data values for each fieldname into a hash array | |
200 #(key=fieldname, value=array ref) | |
201 my %fields; | |
202 foreach $rec (@{$recListRef}) { | |
203 my ($key,$value); | |
204 while ( ($key,$value) = each %{$rec->{'DATA'}}) { | |
205 push @{$fields{$key}},$value; | |
206 } | |
207 } | |
208 | |
209 #Look for constant fields and store separately | |
210 my %constFields; | |
211 foreach $key (keys %fields) { | |
212 my @values = sort @{$fields{$key}}; | |
213 my $nVal = scalar(@values); | |
214 if ($values[0] eq $values[$nVal -1]) { | |
215 $constFields{$key} = $values[0];#store the field name and the constant value | |
216 delete $fields{$key};#remove from (non-const) array | |
217 } | |
218 } | |
219 | |
220 #Print constant fields | |
221 print "\nConstant fields:\n\n"; | |
222 foreach $key (sort keys %constFields) { | |
223 printf "%-40s%s\n",$key,$constFields{$key}; | |
224 } | |
225 #Print min and max value for non-const fields | |
226 print "\nVariable fields:\n\n"; | |
227 foreach $key (sort keys %fields) { | |
228 my @values = @{$fields{$key}}; | |
229 #Look at first value to decide whether to do text or numeric sort | |
230 if (isNaN($values[0])) { | |
231 @values = sort @values;#text sort | |
232 } | |
233 else { | |
234 @values = sort {$a <=> $b} @values;#numeric sort | |
235 } | |
236 my $nVal = scalar(@values); | |
237 printf "%-40s",$key; | |
238 print "Min = $values[0]\tMax = $values[$nVal-1]\t(N = $nVal)\n"; | |
239 } | |
240 } | |
241 | |
242 ############################################################## | |
243 # function isNaN equivalent to the C++, java, javascript isNaN | |
244 # From P Vaglio, ~intranet/lib/rbt_func.pl | |
245 sub isNaN { | |
246 local($_) = @_; | |
247 s/\s+//g; # strip white space | |
248 # match +or- beginning of line 0 or 1 time | |
249 # then any numeric 0 or more | |
250 # then a decimal point | |
251 # then any numeric 0 or more after decimal point | |
252 # then possibly an e or E then + or - then any numreci at least once | |
253 if (/^([+-]?)(\d*)(\.(\d*))?([Ee]([+-]?\d+))?$/ && (defined $2 || defined $4)) { | |
254 return 0; | |
255 } else { | |
256 return 1; | |
257 } | |
258 } | |
259 | |
260 ############################################################## | |
261 # output corresponding headings for use with tabScores | |
262 sub tabHeadings { | |
263 my $summaryKey = shift; | |
264 my @fieldNames = @_; | |
265 printf("%-10s%-30s","REC",$summaryKey); | |
266 foreach $field (@fieldNames) { | |
267 printf("%10s",$field); | |
268 } | |
269 print "\n"; | |
270 } | |
271 | |
272 ############################################################## | |
273 # tab-delimited output of named data field values | |
274 sub tabScores { | |
275 my $recListRef = shift; | |
276 my $summaryKey = shift; | |
277 my @fieldNames = @_; | |
278 foreach $rec (@{$recListRef}) { | |
279 printf("%03d\t%-30.30s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); | |
280 foreach $field (@fieldNames) { | |
281 my $val = $rec->{'DATA'}->{$field}; | |
282 if (isNaN($val)) { | |
283 printf("%-10.12s",$val); | |
284 } | |
285 else { | |
286 printf("%10.3f",$val); | |
287 } | |
288 } | |
289 print "\n"; | |
290 } | |
291 } | |
292 | |
293 ############################################################## | |
294 # output corresponding headings for use with csvScores | |
295 sub csvHeadings { | |
296 my $summaryKey = shift; | |
297 my @fieldNames = @_; | |
298 printf("%s,%s","REC",$summaryKey); | |
299 foreach $field (@fieldNames) { | |
300 printf(",%s",$field); | |
301 } | |
302 print "\n"; | |
303 } | |
304 | |
305 ############################################################## | |
306 # comma-delimited output of named data field values | |
307 sub csvScores { | |
308 my $recListRef = shift; | |
309 my $summaryKey = shift; | |
310 my @fieldNames = @_; | |
311 foreach $rec (@{$recListRef}) { | |
312 printf("%d,%s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); | |
313 foreach $field (@fieldNames) { | |
314 my $val = $rec->{'DATA'}->{$field}; | |
315 if (isNaN($val)) { | |
316 printf(",%s",$val); | |
317 } | |
318 else { | |
319 printf(",%.3f",$val); | |
320 } | |
321 } | |
322 print "\n"; | |
323 } | |
324 } | |
325 | |
326 | |
327 ############################################################## | |
328 # standardised output of Catalyst supplier field | |
329 # Input is a reference to an array of SDRecords | |
330 # and a ligand identifier field to output in column 1 (def=Name) | |
331 sub tabulateSuppliers { | |
332 my $recListRef = shift; | |
333 my $summaryKey = shift || 'Name'; | |
334 foreach $rec (@{$recListRef}) { | |
335 my $suppBase = $rec->{'DATAREF'}->{'Supplier'}+1; | |
336 my $linesRef = $rec->{'LINES'}; | |
337 my $name = $rec->{'DATA'}->{$summaryKey}; | |
338 | |
339 #Output some useful compound info | |
340 my $name = $rec->{'DATA'}->{$summaryKey}; | |
341 my $molFormula = $rec->{'DATA'}->{'MolFormula'}; | |
342 my $molWt = $rec->{'DATA'}->{'MolWt'}; | |
343 my $casNum = $rec->{'DATA'}->{'CAS_num'}; | |
344 my $mdlNum = $rec->{'DATA'}->{'MDLNUMBER'}; | |
345 print "\n\n====================================================================================================\n"; | |
346 printf("%-10.10s%s\n","Name:",$name); | |
347 printf("%-10.10s%s\n","Formula:",$molFormula); | |
348 printf("%-10.10s%s\n","Mol.wt:",$molWt); | |
349 printf("%-10.10s%s\n","CAS #:",$casNum); | |
350 printf("%-10.10s%s\n","MDL #:",$mdlNum); | |
351 | |
352 #Get all the supplier record lines into a list | |
353 #Record is terminated by blank line | |
354 my @lines; | |
355 my $nLines = 0; | |
356 for (; $$linesRef[$suppBase+$nLines] ne ""; $nLines++) { | |
357 push @lines,$$linesRef[$suppBase+$nLines]; | |
358 } | |
359 | |
360 #Column headings | |
361 printf("\n%-20.20s%-40.40s%-40.40s\n", | |
362 "Supplier", | |
363 "Comment", | |
364 "Price" | |
365 ); | |
366 print "----------------------------------------------------------------------------------------------------\n"; | |
367 | |
368 #Loop over each supplier | |
369 my $iLine = 0; | |
370 for (; $iLine < $nLines; $iLine++) { | |
371 #collect supplier info lines | |
372 my @supplierInfo = (); | |
373 for (; $lines[$iLine] ne "." && $iLine < $nLines; $iLine++) { | |
374 push @supplierInfo,$lines[$iLine]; | |
375 } | |
376 #Check for incomplete record | |
377 if ($iLine == $nLines) { | |
378 print "** INCOMPLETE RECORD **\n"; | |
379 last; | |
380 } | |
381 my $nSupplierInfo = scalar(@supplierInfo); | |
382 my $supplier = $supplierInfo[0]; | |
383 #loop over each grade | |
384 for ($iLine++; ($lines[$iLine] ne "........................") && ($iLine < $nLines); $iLine++) { | |
385 #collect grade info lines | |
386 my @gradeInfo = (); | |
387 for (; index($lines[$iLine],"_") ne 0 && $iLine < $nLines; $iLine++) { | |
388 push @gradeInfo,$lines[$iLine]; | |
389 } | |
390 #Check for incomplete record | |
391 if ($iLine == $nLines) { | |
392 print "** INCOMPLETE RECORD **\n"; | |
393 last; | |
394 } | |
395 my $grade = $gradeInfo[0]; | |
396 #loop over each price info line | |
397 for (; index($lines[$iLine],".") ne 0 && $iLine < $nLines; $iLine++) { | |
398 my @priceInfo = split(" ",$lines[$iLine]); | |
399 my $price = join(" ",@priceInfo); | |
400 printf("%-20.20s%-40.39s%-40.40s\n", | |
401 $supplier, | |
402 $grade, | |
403 $price); | |
404 } | |
405 #Check for incomplete record | |
406 if ($iLine == $nLines) { | |
407 print "** INCOMPLETE RECORD **\n"; | |
408 last; | |
409 } | |
410 last if $lines[$iLine] eq "........................"; | |
411 } | |
412 } | |
413 } | |
414 } | |
415 | |
416 | |
417 ####################################################################### | |
418 sub printHelpAndExit { | |
419 print "\nProduces text summaries of SD records\n"; | |
420 print "\nUsage:\tsdreport [-l] [-t[<FieldName,FieldName...>]] [-c<FieldName,FieldName...>] [-id<IDField>] [-nh] [-o] [-s] [-sup] [sdFiles]\n"; | |
421 print "\n\t-l (list format) output all data fields for each record as processed\n"; | |
422 print "\t-t (tab format) tabulate selected fields for each record as processed\n"; | |
423 print "\t-c (csv format) comma delimited output of selected fields for each record as processed\n"; | |
424 print "\t-s (summary format) output summary statistics for each unique value of ligand ID\n"; | |
425 print "\t-sup (supplier format) tabulate supplier details (from Catalyst)\n"; | |
426 print "\t-id<IDField> data field to use as ligand ID\n"; | |
427 print "\t-nh don't output column headings in -t and -c formats\n"; | |
428 print "\t-o use old (v3.00) score field names as default columns in -t and -c formats, else use v4.00 field names\n"; | |
429 print "\t-norm use normalised score field names as default columns in -t and -c formats (normalised = score / #ligand heavy atoms)\n"; | |
430 print "\nNote:\tIf -l, -t or -c are combined with -s, the listing/table is output within each ligand summary\n"; | |
431 print "\t-sup should not be combined with other options\n"; | |
432 print "\tDefault field names for -t and -c are rDock score field names\n"; | |
433 print "\tDefault ID field name is Name\n"; | |
434 print "\n\tIf sdFiles not given, reads from standard input\n"; | |
435 print "\tOutput is to standard output\n\n"; | |
436 exit; | |
437 } |