Mercurial > repos > marpiech > norwich_tools
view tools/rdock/bin/sdreport @ 0:bc03dbb6eb37 draft
planemo upload commit 781926e52355f7805db8d9a4ccafeff397b19aa4-dirty
author | marpiech |
---|---|
date | Mon, 29 Aug 2016 03:38:13 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl # Produces text summaries of SD records * # * # Usage: sdreport [-l] [-t] [-s<DataField>] [sdFiles] * # * # -l output data fields for each record as processed * # -t tabulate Rbt.Score.* fields for each record as processed * # -s summarise data fields for all records * # -s<DataField> summarise data fields for each unique value * # of <DataField> * # * # Note: If -l or -t are combined with -s, the listing/table is output * # within each ligand summary * # If sdFiles not given, reads from standard input * # Output is to standard output * # Default is equivalent to sdfilter -l * #******************************************************************************* use lib "$ENV{'RBT_ROOT'}/lib"; use SDRecord; # Default field names and headings for rDock v4.00 scores my @defaultFields = ('SCORE','SCORE.INTER','SCORE.INTRA','SCORE.RESTR','SCORE.INTER.VDW'); my @defaultHeadings = ('TOTAL','INTER','INTRA','RESTR','VDW'); # Default field names and headings for normalised scores (score / #ligand heavy atoms) my @defaultNormFields = ('SCORE.norm','SCORE.INTER.norm','SCORE.INTRA.norm','SCORE.RESTR.norm','SCORE.heavy'); my @defaultNormHeadings = ('TOTALn','INTERn','INTRAn','RESTRn','#heavy'); # Default field names and headings for rDock v3.00 scores my @defaultOldFields = ('Rbt.Score.Corrected','Rbt.Score.Inter','Rbt.Score.Intra','Rbt.Score.IntraMin','Rbt.Score.Restraint'); my @defaultOldHeadings = ('TOTAL','INTER','INTRA','INTRAMIN','RESTR'); my $listFormat = 0; my $summaryFormat = 0; my $tableFormat = 0; my $supplierFormat = 0; my $csvFormat = 0; my $summaryKey = "_TITLE1"; my $oldFields = 0;#If true, use old default field names for component scores my $headings = 1;#DM 21 Nov 2000, If false, don't output headings my @outputFields; my @outputHeadings; #Print help if no command line arguments printHelpAndExit() if (scalar(@ARGV) == 0); #Parse command line arguments my $nArgs = scalar(@ARGV); while (scalar(@ARGV)) { my $arg = shift @ARGV; printHelpAndExit() if ($arg eq '-h'); if (index($arg,'-l')==0) { $listFormat = 1; } elsif (index($arg,'-o')==0) { $oldFields = 1; } # 7 Feb 2005 (DM) Option to report normalised aggregate scores elsif (index($arg,'-norm')==0) { $oldFields = 2; } elsif (index($arg,'-sup')==0) { $supplierFormat = 1; } elsif (index($arg,'-s')==0) { $summaryFormat = 1; } elsif (index($arg,'-id')==0) { $summaryKey = substr($arg,3); } elsif (index($arg,'-nh')==0) { $headings = 0; } elsif (index($arg,'-t')==0) { $tableFormat = 1; push @outputFields, split(',',substr($arg,2)); push @outputHeadings, @outputFields; } elsif (index($arg,'-c')==0) { $csvFormat = 1; push @outputFields, split(',',substr($arg,2)); push @outputHeadings, @outputFields; } else { push @files,$arg;#must be a filename } } push @ARGV,@files;#put the filenames back in the arg list #use -l if neither table format is specified $listFormat = (!$tableFormat && !$csvFormat && !$supplierFormat); #If no output fields defined for -t or -c use the defaults (old or new) if (scalar(@outputFields)==0) { if ($oldFields == 1) { @outputFields = @defaultOldFields; @outputHeadings = @defaultOldHeadings; } elsif ($oldFields == 2) { @outputFields = @defaultNormFields; @outputHeadings = @defaultNormHeadings; } else { @outputFields = @defaultFields; @outputHeadings = @defaultHeadings; } } my $sdRec = new SDRecord; my %summary;#hash of SDRecord lists, indexed by user-defined summary key my %indexByName; my %indexByNum; my $idx = 0; my $nRec = 0; #Column headings for tab and csv format #DM 21 Nov 2000 - if $headings is false, then don't output the column headings if ($tableFormat && !$summaryFormat && $headings) { tabHeadings($summaryKey,@outputHeadings); } if ($csvFormat && !$summaryFormat && $headings) { csvHeadings($summaryKey,@outputHeadings); } #read records while ($sdRec->readRec('LINES'=>1,'DATA'=>1)) { $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field if ($listFormat && !$summaryFormat) { print "\n\nRECORD #$nRec\n"; $sdRec->writeData(); } if ($tableFormat && !$summaryFormat) { @recList = ($sdRec); tabScores(\@recList,$summaryKey,@outputFields); } elsif ($csvFormat && !$summaryFormat) { @recList = ($sdRec); csvScores(\@recList,$summaryKey,@outputFields); } elsif ($supplierFormat && !$summaryFormat) { @recList = ($sdRec); tabulateSuppliers(\@recList,$summaryKey); } #add record to summary, indexed by user field value #keep a separate index of unique values of user field values, #indexed by number in the order the values first appear #In this way we can maintain the sorted order of ligands #when we come to print out the summary if ($summaryFormat) { my $summaryValue = $sdRec->{'DATA'}->{$summaryKey}; #New data field value encountered if (!defined $indexByName{$summaryValue}) { $idx++;#incr the number of unique values #keep cross-referenced indexes (field value <-> number) $indexByName{$summaryValue} = $idx; $indexByNum{$idx} = $summaryValue; } push @{$summary{$summaryValue}},$sdRec->copy('DATA'=>1); } } #Print summary if required if ($summaryFormat) { print "\n===============================================================\n"; print "SUMMARY BY $summaryKey\n"; foreach $idx (sort {$a<=>$b} keys %indexByNum) {#numberic sort of index numbers my $key = $indexByNum{$idx};#look up corresponding data field value print "\n===============================================================\n"; print "$summaryKey = $key (#$idx)\n\n"; writeSummary($summary{$key}); if ($listFormat) { print "\nIndividual records:\n"; foreach $rec (@{$summary{$key}}) { print "\n"; $rec->writeData(); } } if ($tableFormat) { print "\nScores:\n"; tabHeadings($summaryKey,@outputHeadings); tabScores($summary{$key},$summaryKey,@outputFields); } if ($csvFormat) { print "\nScores:\n"; csvHeadings($summaryKey,@outputHeadings); csvScores($summary{$key},$summaryKey,@outputFields); } } } ############################################################## # writes a summary to STDOUT for a list of SDRecords # Input is a reference to an array of SDRecords sub writeSummary { my $recListRef = shift; #Extract the list of data values for each fieldname into a hash array #(key=fieldname, value=array ref) my %fields; foreach $rec (@{$recListRef}) { my ($key,$value); while ( ($key,$value) = each %{$rec->{'DATA'}}) { push @{$fields{$key}},$value; } } #Look for constant fields and store separately my %constFields; foreach $key (keys %fields) { my @values = sort @{$fields{$key}}; my $nVal = scalar(@values); if ($values[0] eq $values[$nVal -1]) { $constFields{$key} = $values[0];#store the field name and the constant value delete $fields{$key};#remove from (non-const) array } } #Print constant fields print "\nConstant fields:\n\n"; foreach $key (sort keys %constFields) { printf "%-40s%s\n",$key,$constFields{$key}; } #Print min and max value for non-const fields print "\nVariable fields:\n\n"; foreach $key (sort keys %fields) { my @values = @{$fields{$key}}; #Look at first value to decide whether to do text or numeric sort if (isNaN($values[0])) { @values = sort @values;#text sort } else { @values = sort {$a <=> $b} @values;#numeric sort } my $nVal = scalar(@values); printf "%-40s",$key; print "Min = $values[0]\tMax = $values[$nVal-1]\t(N = $nVal)\n"; } } ############################################################## # function isNaN equivalent to the C++, java, javascript isNaN # From P Vaglio, ~intranet/lib/rbt_func.pl sub isNaN { local($_) = @_; s/\s+//g; # strip white space # match +or- beginning of line 0 or 1 time # then any numeric 0 or more # then a decimal point # then any numeric 0 or more after decimal point # then possibly an e or E then + or - then any numreci at least once if (/^([+-]?)(\d*)(\.(\d*))?([Ee]([+-]?\d+))?$/ && (defined $2 || defined $4)) { return 0; } else { return 1; } } ############################################################## # output corresponding headings for use with tabScores sub tabHeadings { my $summaryKey = shift; my @fieldNames = @_; printf("%-10s%-30s","REC",$summaryKey); foreach $field (@fieldNames) { printf("%10s",$field); } print "\n"; } ############################################################## # tab-delimited output of named data field values sub tabScores { my $recListRef = shift; my $summaryKey = shift; my @fieldNames = @_; foreach $rec (@{$recListRef}) { printf("%03d\t%-30.30s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); foreach $field (@fieldNames) { my $val = $rec->{'DATA'}->{$field}; if (isNaN($val)) { printf("%-10.12s",$val); } else { printf("%10.3f",$val); } } print "\n"; } } ############################################################## # output corresponding headings for use with csvScores sub csvHeadings { my $summaryKey = shift; my @fieldNames = @_; printf("%s,%s","REC",$summaryKey); foreach $field (@fieldNames) { printf(",%s",$field); } print "\n"; } ############################################################## # comma-delimited output of named data field values sub csvScores { my $recListRef = shift; my $summaryKey = shift; my @fieldNames = @_; foreach $rec (@{$recListRef}) { printf("%d,%s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); foreach $field (@fieldNames) { my $val = $rec->{'DATA'}->{$field}; if (isNaN($val)) { printf(",%s",$val); } else { printf(",%.3f",$val); } } print "\n"; } } ############################################################## # standardised output of Catalyst supplier field # Input is a reference to an array of SDRecords # and a ligand identifier field to output in column 1 (def=Name) sub tabulateSuppliers { my $recListRef = shift; my $summaryKey = shift || 'Name'; foreach $rec (@{$recListRef}) { my $suppBase = $rec->{'DATAREF'}->{'Supplier'}+1; my $linesRef = $rec->{'LINES'}; my $name = $rec->{'DATA'}->{$summaryKey}; #Output some useful compound info my $name = $rec->{'DATA'}->{$summaryKey}; my $molFormula = $rec->{'DATA'}->{'MolFormula'}; my $molWt = $rec->{'DATA'}->{'MolWt'}; my $casNum = $rec->{'DATA'}->{'CAS_num'}; my $mdlNum = $rec->{'DATA'}->{'MDLNUMBER'}; print "\n\n====================================================================================================\n"; printf("%-10.10s%s\n","Name:",$name); printf("%-10.10s%s\n","Formula:",$molFormula); printf("%-10.10s%s\n","Mol.wt:",$molWt); printf("%-10.10s%s\n","CAS #:",$casNum); printf("%-10.10s%s\n","MDL #:",$mdlNum); #Get all the supplier record lines into a list #Record is terminated by blank line my @lines; my $nLines = 0; for (; $$linesRef[$suppBase+$nLines] ne ""; $nLines++) { push @lines,$$linesRef[$suppBase+$nLines]; } #Column headings printf("\n%-20.20s%-40.40s%-40.40s\n", "Supplier", "Comment", "Price" ); print "----------------------------------------------------------------------------------------------------\n"; #Loop over each supplier my $iLine = 0; for (; $iLine < $nLines; $iLine++) { #collect supplier info lines my @supplierInfo = (); for (; $lines[$iLine] ne "." && $iLine < $nLines; $iLine++) { push @supplierInfo,$lines[$iLine]; } #Check for incomplete record if ($iLine == $nLines) { print "** INCOMPLETE RECORD **\n"; last; } my $nSupplierInfo = scalar(@supplierInfo); my $supplier = $supplierInfo[0]; #loop over each grade for ($iLine++; ($lines[$iLine] ne "........................") && ($iLine < $nLines); $iLine++) { #collect grade info lines my @gradeInfo = (); for (; index($lines[$iLine],"_") ne 0 && $iLine < $nLines; $iLine++) { push @gradeInfo,$lines[$iLine]; } #Check for incomplete record if ($iLine == $nLines) { print "** INCOMPLETE RECORD **\n"; last; } my $grade = $gradeInfo[0]; #loop over each price info line for (; index($lines[$iLine],".") ne 0 && $iLine < $nLines; $iLine++) { my @priceInfo = split(" ",$lines[$iLine]); my $price = join(" ",@priceInfo); printf("%-20.20s%-40.39s%-40.40s\n", $supplier, $grade, $price); } #Check for incomplete record if ($iLine == $nLines) { print "** INCOMPLETE RECORD **\n"; last; } last if $lines[$iLine] eq "........................"; } } } } ####################################################################### sub printHelpAndExit { print "\nProduces text summaries of SD records\n"; print "\nUsage:\tsdreport [-l] [-t[<FieldName,FieldName...>]] [-c<FieldName,FieldName...>] [-id<IDField>] [-nh] [-o] [-s] [-sup] [sdFiles]\n"; print "\n\t-l (list format) output all data fields for each record as processed\n"; print "\t-t (tab format) tabulate selected fields for each record as processed\n"; print "\t-c (csv format) comma delimited output of selected fields for each record as processed\n"; print "\t-s (summary format) output summary statistics for each unique value of ligand ID\n"; print "\t-sup (supplier format) tabulate supplier details (from Catalyst)\n"; print "\t-id<IDField> data field to use as ligand ID\n"; print "\t-nh don't output column headings in -t and -c formats\n"; print "\t-o use old (v3.00) score field names as default columns in -t and -c formats, else use v4.00 field names\n"; print "\t-norm use normalised score field names as default columns in -t and -c formats (normalised = score / #ligand heavy atoms)\n"; print "\nNote:\tIf -l, -t or -c are combined with -s, the listing/table is output within each ligand summary\n"; print "\t-sup should not be combined with other options\n"; print "\tDefault field names for -t and -c are rDock score field names\n"; print "\tDefault ID field name is Name\n"; print "\n\tIf sdFiles not given, reads from standard input\n"; print "\tOutput is to standard output\n\n"; exit; }