diff galaxy-tools/tools/rdock/bin/sdreport @ 2:0faa03a92843 draft default tip

Uploaded
author dzesikah
date Fri, 26 Aug 2016 10:19:49 -0400
parents ad4bc82457e5
children
line wrap: on
line diff
--- a/galaxy-tools/tools/rdock/bin/sdreport	Fri Aug 26 09:55:15 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,437 +0,0 @@
-#!/usr/bin/perl
-# Produces text summaries of SD records                                        *
-#                                                                              *
-# Usage:  sdreport [-l] [-t] [-s<DataField>] [sdFiles]                         *
-#                                                                              *
-#         -l output data fields for each record as processed                   *
-#         -t tabulate Rbt.Score.* fields for each record as processed          *
-#         -s summarise data fields for all records                             *
-#         -s<DataField> summarise data fields for each unique value            *
-#                       of <DataField>                                         *
-#                                                                              *
-# Note:   If -l or -t are combined with -s, the listing/table is output        *
-#         within each ligand summary                                           *
-#         If sdFiles not given, reads from standard input                      *
-#         Output is to standard output                                         *
-#         Default is equivalent to sdfilter -l                                 *
-#*******************************************************************************
-use lib "$ENV{'RBT_ROOT'}/lib";
-
-use SDRecord;
-
-# Default field names and headings for rDock v4.00 scores
-my @defaultFields = ('SCORE','SCORE.INTER','SCORE.INTRA','SCORE.RESTR','SCORE.INTER.VDW');
-my @defaultHeadings = ('TOTAL','INTER','INTRA','RESTR','VDW');
-
-# Default field names and headings for normalised scores (score / #ligand heavy atoms)
-my @defaultNormFields = ('SCORE.norm','SCORE.INTER.norm','SCORE.INTRA.norm','SCORE.RESTR.norm','SCORE.heavy');
-my @defaultNormHeadings = ('TOTALn','INTERn','INTRAn','RESTRn','#heavy');
-
-# Default field names and headings for rDock v3.00 scores
-my @defaultOldFields = ('Rbt.Score.Corrected','Rbt.Score.Inter','Rbt.Score.Intra','Rbt.Score.IntraMin','Rbt.Score.Restraint');
-my @defaultOldHeadings = ('TOTAL','INTER','INTRA','INTRAMIN','RESTR');
-
-my $listFormat = 0;
-my $summaryFormat = 0;
-my $tableFormat = 0;
-my $supplierFormat = 0;
-my $csvFormat = 0;
-my $summaryKey = "_TITLE1";
-my $oldFields = 0;#If true, use old default field names for component scores
-my $headings = 1;#DM 21 Nov 2000, If false, don't output headings
-my @outputFields;
-my @outputHeadings;
-
-#Print help if no command line arguments
-printHelpAndExit() if (scalar(@ARGV) == 0);
-
-#Parse command line arguments
-my $nArgs = scalar(@ARGV);
-
-while (scalar(@ARGV)) {
-  my $arg = shift @ARGV;
-  printHelpAndExit() if ($arg eq '-h');
-  if (index($arg,'-l')==0) {
-    $listFormat = 1;
-  }
-  elsif (index($arg,'-o')==0) {
-    $oldFields = 1;
-  }
-  # 7 Feb 2005 (DM) Option to report normalised aggregate scores
-  elsif (index($arg,'-norm')==0) {
-    $oldFields = 2;
-  }
-  elsif (index($arg,'-sup')==0) {
-    $supplierFormat = 1;
-  }
-  elsif (index($arg,'-s')==0) {
-    $summaryFormat = 1;
-  }
-  elsif (index($arg,'-id')==0) {
-    $summaryKey = substr($arg,3);
-  }
-  elsif (index($arg,'-nh')==0) {
-    $headings = 0;
-  }
-  elsif (index($arg,'-t')==0) {
-    $tableFormat = 1;
-    push @outputFields, split(',',substr($arg,2));
-    push @outputHeadings, @outputFields;
-  }
-  elsif (index($arg,'-c')==0) {
-    $csvFormat = 1;
-    push @outputFields, split(',',substr($arg,2));
-    push @outputHeadings, @outputFields;
-  }
-  else {
-    push @files,$arg;#must be a filename
-  }
-}
-push @ARGV,@files;#put the filenames back in the arg list
-
-#use -l if neither table format is specified
-$listFormat = (!$tableFormat && !$csvFormat && !$supplierFormat);
-
-#If no output fields defined for -t or -c use the defaults (old or new)
-if (scalar(@outputFields)==0) {
-	if ($oldFields == 1) {
-		@outputFields = @defaultOldFields;
-		@outputHeadings = @defaultOldHeadings;
-	}
-	elsif ($oldFields == 2) {
-                @outputFields = @defaultNormFields;
-                @outputHeadings = @defaultNormHeadings;
-	}
-	else {
-		@outputFields = @defaultFields;
-		@outputHeadings = @defaultHeadings;
-	}
-}
-
-my $sdRec = new SDRecord;
-my %summary;#hash of SDRecord lists, indexed by user-defined summary key
-my %indexByName;
-my %indexByNum;
-my $idx = 0;
-my $nRec = 0;
-
-#Column headings for tab and csv format
-#DM 21 Nov 2000 - if $headings is false, then don't output the column headings
-if ($tableFormat && !$summaryFormat && $headings) {
-  tabHeadings($summaryKey,@outputHeadings);
-}
-if ($csvFormat && !$summaryFormat && $headings) {
-  csvHeadings($summaryKey,@outputHeadings);
-}
-
-#read records
-while ($sdRec->readRec('LINES'=>1,'DATA'=>1)) {
-  $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field
-  if ($listFormat && !$summaryFormat) {
-    print "\n\nRECORD #$nRec\n";
-    $sdRec->writeData();
-  }
-  if ($tableFormat && !$summaryFormat) {
-    @recList = ($sdRec);
-    tabScores(\@recList,$summaryKey,@outputFields);
-  }
-  elsif ($csvFormat && !$summaryFormat) {
-    @recList = ($sdRec);
-    csvScores(\@recList,$summaryKey,@outputFields);
-  }
-  elsif ($supplierFormat && !$summaryFormat) {
-    @recList = ($sdRec);
-    tabulateSuppliers(\@recList,$summaryKey);
-  }
-  #add record to summary, indexed by user field value
-  #keep a separate index of unique values of user field values,
-  #indexed by number in the order the values first appear
-  #In this way we can maintain the sorted order of ligands
-  #when we come to print out the summary
-  if ($summaryFormat) {
-    my $summaryValue = $sdRec->{'DATA'}->{$summaryKey};
-    #New data field value encountered
-    if (!defined $indexByName{$summaryValue}) {
-      $idx++;#incr the number of unique values
-      #keep cross-referenced indexes (field value <-> number)
-      $indexByName{$summaryValue} = $idx;
-      $indexByNum{$idx} = $summaryValue;
-    }
-    push @{$summary{$summaryValue}},$sdRec->copy('DATA'=>1);
-  }
-}
-
-#Print summary if required
-if ($summaryFormat) {
-  print "\n===============================================================\n";
-  print "SUMMARY BY $summaryKey\n";
-  foreach $idx (sort {$a<=>$b} keys %indexByNum) {#numberic sort of index numbers
-    my $key = $indexByNum{$idx};#look up corresponding data field value
-    print "\n===============================================================\n";
-    print "$summaryKey = $key (#$idx)\n\n";
-    writeSummary($summary{$key});
-    if ($listFormat) {
-      print "\nIndividual records:\n";
-      foreach $rec (@{$summary{$key}}) {
-        print "\n";
-	      $rec->writeData();
-      }
-    }
-    if ($tableFormat) {
-      print "\nScores:\n";
-      tabHeadings($summaryKey,@outputHeadings);
-      tabScores($summary{$key},$summaryKey,@outputFields);
-    }
-    if ($csvFormat) {
-      print "\nScores:\n";
-      csvHeadings($summaryKey,@outputHeadings);
-      csvScores($summary{$key},$summaryKey,@outputFields);
-    }
-  }
-}
-
-##############################################################
-# writes a summary to STDOUT for a list of SDRecords
-# Input is a reference to an array of SDRecords
-sub writeSummary {
-  my $recListRef = shift;
-
-  #Extract the list of data values for each fieldname into a hash array
-  #(key=fieldname, value=array ref)
-  my %fields;
-  foreach $rec (@{$recListRef}) {
-    my ($key,$value);
-    while ( ($key,$value) = each %{$rec->{'DATA'}}) {
-      push @{$fields{$key}},$value;
-    }
-  }
-
-  #Look for constant fields and store separately
-  my %constFields;
-  foreach $key (keys %fields) {
-    my @values = sort  @{$fields{$key}};
-    my $nVal = scalar(@values);
-    if ($values[0] eq $values[$nVal -1]) {
-      $constFields{$key} = $values[0];#store the field name and the constant value
-      delete $fields{$key};#remove from (non-const) array
-    }
-  }
-
-  #Print constant fields
-  print "\nConstant fields:\n\n";
-  foreach $key (sort keys %constFields) {
-    printf "%-40s%s\n",$key,$constFields{$key};
-  }
-  #Print min and max value for non-const fields
-  print "\nVariable fields:\n\n";
-  foreach $key (sort keys %fields) {
-    my @values = @{$fields{$key}};
-    #Look at first value to decide whether to do text or numeric sort
-    if (isNaN($values[0])) {
-      @values = sort @values;#text sort
-    }
-    else {
-      @values = sort {$a <=> $b} @values;#numeric sort
-    }
-    my $nVal = scalar(@values);
-    printf "%-40s",$key;
-    print "Min = $values[0]\tMax = $values[$nVal-1]\t(N = $nVal)\n";
-  }
-}
-
-##############################################################
-# function isNaN equivalent to the C++, java, javascript isNaN
-# From P Vaglio, ~intranet/lib/rbt_func.pl
-sub isNaN {
-  local($_) = @_;
-  s/\s+//g;         # strip white space
-  # match +or- beginning of line 0 or 1 time
-  # then any numeric 0 or more
-  # then a decimal point 
-  # then any numeric 0 or more after decimal point 
-  # then possibly an e or E then + or - then any numreci at least once
-  if (/^([+-]?)(\d*)(\.(\d*))?([Ee]([+-]?\d+))?$/ && (defined $2 || defined $4)) {
-    return 0;
-  } else {
-    return 1;
-  }
-}
-
-##############################################################
-# output corresponding headings for use with tabScores
-sub tabHeadings {
-  my $summaryKey = shift;
-  my @fieldNames = @_;
-  printf("%-10s%-30s","REC",$summaryKey);
-  foreach $field (@fieldNames) {
-    printf("%10s",$field);
-  }
-  print "\n";
-}
-
-##############################################################
-# tab-delimited output of named data field values
-sub tabScores {
-  my $recListRef = shift;
-  my $summaryKey = shift;
-  my @fieldNames = @_;
-  foreach $rec (@{$recListRef}) {
-    printf("%03d\t%-30.30s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey});
-    foreach $field (@fieldNames) {
-      my $val = $rec->{'DATA'}->{$field};
-      if (isNaN($val)) {
-        printf("%-10.12s",$val);
-      }
-      else {
-        printf("%10.3f",$val);
-      }
-    }
-    print "\n";
-  }
-}
-
-##############################################################
-# output corresponding headings for use with csvScores
-sub csvHeadings {
-  my $summaryKey = shift;
-  my @fieldNames = @_;
-  printf("%s,%s","REC",$summaryKey);
-  foreach $field (@fieldNames) {
-    printf(",%s",$field);
-  }
-  print "\n";
-}
-
-##############################################################
-# comma-delimited output of named data field values
-sub csvScores {
-  my $recListRef = shift;
-  my $summaryKey = shift;
-  my @fieldNames = @_;
-  foreach $rec (@{$recListRef}) {
-    printf("%d,%s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey});
-    foreach $field (@fieldNames) {
-      my $val = $rec->{'DATA'}->{$field};
-      if (isNaN($val)) {
-        printf(",%s",$val);
-      }
-      else {
-        printf(",%.3f",$val);
-      }
-    }
-    print "\n";
-  }
-}
-
-
-##############################################################
-# standardised output of Catalyst supplier field
-# Input is a reference to an array of SDRecords
-# and a ligand identifier field to output in column 1 (def=Name)
-sub tabulateSuppliers {
-  my $recListRef = shift;
-  my $summaryKey = shift || 'Name';
-  foreach $rec (@{$recListRef}) {
-    my $suppBase = $rec->{'DATAREF'}->{'Supplier'}+1;
-    my $linesRef = $rec->{'LINES'};
-    my $name = $rec->{'DATA'}->{$summaryKey};
-
-    #Output some useful compound info
-    my $name = $rec->{'DATA'}->{$summaryKey};
-    my $molFormula = $rec->{'DATA'}->{'MolFormula'};
-    my $molWt = $rec->{'DATA'}->{'MolWt'};
-    my $casNum = $rec->{'DATA'}->{'CAS_num'};
-    my $mdlNum = $rec->{'DATA'}->{'MDLNUMBER'};
-    print "\n\n====================================================================================================\n";
-    printf("%-10.10s%s\n","Name:",$name);
-    printf("%-10.10s%s\n","Formula:",$molFormula);
-    printf("%-10.10s%s\n","Mol.wt:",$molWt);
-    printf("%-10.10s%s\n","CAS #:",$casNum);
-    printf("%-10.10s%s\n","MDL #:",$mdlNum);
-
-    #Get all the supplier record lines into a list
-    #Record is terminated by blank line
-    my @lines;
-    my $nLines = 0;
-    for (; $$linesRef[$suppBase+$nLines] ne ""; $nLines++) {
-      push @lines,$$linesRef[$suppBase+$nLines];
-    }
-
-    #Column headings
-    printf("\n%-20.20s%-40.40s%-40.40s\n",
-	   "Supplier",
-	   "Comment",
-	   "Price"
-	  );
-    print "----------------------------------------------------------------------------------------------------\n";
-
-    #Loop over each supplier
-    my $iLine = 0;
-    for (; $iLine < $nLines; $iLine++) {
-      #collect supplier info lines
-      my @supplierInfo = ();
-      for (; $lines[$iLine] ne "." && $iLine < $nLines; $iLine++) {
-	push @supplierInfo,$lines[$iLine];
-      }
-      #Check for incomplete record
-      if ($iLine == $nLines) {
-	print "** INCOMPLETE RECORD **\n";
-	last;
-      }
-      my $nSupplierInfo = scalar(@supplierInfo);
-      my $supplier = $supplierInfo[0];
-      #loop over each grade
-      for ($iLine++; ($lines[$iLine] ne "........................") && ($iLine < $nLines); $iLine++) {
-	#collect grade info lines
-	my @gradeInfo = ();
-	for (; index($lines[$iLine],"_") ne 0 && $iLine < $nLines; $iLine++) {
-	  push @gradeInfo,$lines[$iLine];
-	}
-	#Check for incomplete record
-	if ($iLine == $nLines) {
-	  print "** INCOMPLETE RECORD **\n";
-	  last;
-	}
-	my $grade = $gradeInfo[0];
-	#loop over each price info line
-	for (; index($lines[$iLine],".") ne 0 && $iLine < $nLines; $iLine++) {
-	  my @priceInfo = split(" ",$lines[$iLine]);
-	  my $price = join(" ",@priceInfo);
-	  printf("%-20.20s%-40.39s%-40.40s\n",
-		 $supplier,
-		 $grade,
-		 $price);		 
-	}
-	#Check for incomplete record
-	if ($iLine == $nLines) {
-	  print "** INCOMPLETE RECORD **\n";
-	  last;
-	}
-	last if $lines[$iLine] eq "........................";
-      }
-    }
-  }
-}
-
-
-#######################################################################
-sub printHelpAndExit {
-  print "\nProduces text summaries of SD records\n";
-  print "\nUsage:\tsdreport [-l] [-t[<FieldName,FieldName...>]] [-c<FieldName,FieldName...>] [-id<IDField>] [-nh] [-o] [-s] [-sup] [sdFiles]\n";
-  print "\n\t-l (list format) output all data fields for each record as processed\n";
-  print "\t-t (tab format) tabulate selected fields for each record as processed\n";
-  print "\t-c (csv format) comma delimited output of selected fields for each record as processed\n";
-  print "\t-s (summary format) output summary statistics for each unique value of ligand ID\n";
-  print "\t-sup (supplier format) tabulate supplier details (from Catalyst)\n";
-  print "\t-id<IDField> data field to use as ligand ID\n";
-  print "\t-nh don't output column headings in -t and -c formats\n";
-  print "\t-o use old (v3.00) score field names as default columns in -t and -c formats, else use v4.00 field names\n";
-  print "\t-norm use normalised score field names as default columns in -t and -c formats (normalised = score / #ligand heavy atoms)\n";
-  print "\nNote:\tIf -l, -t or -c are combined with -s, the listing/table is output within each ligand summary\n";
-  print "\t-sup should not be combined with other options\n";
-  print "\tDefault field names for -t and -c are rDock score field names\n";
-  print "\tDefault ID field name is Name\n";
-  print "\n\tIf sdFiles not given, reads from standard input\n";
-  print "\tOutput is to standard output\n\n";
-  exit;
-}