Mercurial > repos > marpiech > norwich_tools_docking2
changeset 1:30e2440b2173 draft
planemo upload
author | marpiech |
---|---|
date | Mon, 29 Aug 2016 08:38:19 -0400 |
parents | 4cc079c67fab |
children | 7e52b8fb2df4 |
files | tools/rdock/bin/ht_protocol_finder.pl tools/rdock/bin/libRbt.so tools/rdock/bin/libRbt.so.rDock_2013.1_src tools/rdock/bin/make_grid.csh tools/rdock/bin/rbcalcgrid tools/rdock/bin/rbcavity tools/rdock/bin/rbdock tools/rdock/bin/rblist tools/rdock/bin/rbmoegrid tools/rdock/bin/run_rbdock.pl tools/rdock/bin/run_rbscreen.pl tools/rdock/bin/sdfield tools/rdock/bin/sdfilter tools/rdock/bin/sdmodify tools/rdock/bin/sdreport tools/rdock/bin/sdrmsd tools/rdock/bin/sdsort tools/rdock/bin/sdsplit tools/rdock/bin/sdtether tools/rdock/bin/to_unix |
diffstat | 20 files changed, 2260 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/ht_protocol_finder.pl Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,232 @@ +#!/usr/bin/perl -w + +# Script that simulates the result of a high throughput protocol. + +if (@ARGV < 4) +{ + print "Script that simulates the result of a high throughput protocol.\n"; + print "1st) exhaustive docking of a small representative part of the\n"; + print " whole library.\n"; + print "2nd) Store the result of sdreport -t over that exhaustive dock.\n"; + print " in file <sdreport_file> that will be the input of this\n"; + print " script.\n"; + print "3rd) ht_protocol_finder.pl <sdreport_file> <output_file> <thr1max>"; + print " <thr1min> <ns1> <ns2>\n"; + print " <ns1> and <ns2> are the number of steps in stage 1 and in\n"; + print " stage 2. If not present, the default values are 5 and 15\n"; + print " <thrmax> and <thrmin> setup the range of thresholds that will\n"; + print " be simulated in stage 1. The threshold of stage 2 depends\n"; + print " on the value of the threshold of stage 1.\n"; + print " An input of -22 -24 will try protocols:\n"; + print " 5 -22 15 -27\n"; + print " 5 -22 15 -28\n"; + print " 5 -22 15 -29\n"; + print " 5 -23 15 -28\n"; + print " 5 -23 15 -29\n"; + print " 5 -23 15 -30\n"; + print " 5 -24 15 -29\n"; + print " 5 -24 15 -30\n"; + print " 5 -24 15 -31\n"; + print " Output of the program is a 7 column values. First column\n"; + print " represents the time. This is a percentage of the time it\n"; + print " would take to do the docking in exhaustive mode, i.e. \n"; + print " docking each ligand 100 times. Anything\n"; + print " above 12 is too long.\n"; + print " Second column is the first percentage. Percentage of\n"; + print " ligands that pass the first stage.\n"; + print " Third column is the second percentage. Percentage of\n"; + print " ligands that pass the second stage.\n"; + print " The four last columns represent the protocol.\n"; + print " All the protocols tried are written at the end.\n"; + print " The ones for which time is less than 12%, perc1 is\n"; + print " less than 30% and perc2 is less than 5% but bigger than 1%\n"; + print " will have a series of *** after, to indicate they are good choices\n"; + print " WARNING! This is a simulation based in a small set.\n"; + print " The numbers are an indication, not factual values.\n"; + exit (0); +} +open (IFILE, $ARGV[0]) || die "cannot open $ARGV[0] for reading: $!"; +open (OFILE, ">$ARGV[1]") || die "cannot open $ARGV[1] for writing: $!"; +<IFILE>; # read first line +$line = <IFILE>; +$line =~ /\S+\s+(\S+)\s+\S+\s+(\S+)/; +$prevname = $1; +$inter = $2; +push @tmp, $inter; +$i = 0; +while ($line = <IFILE>) +{ + $line =~ /\S+\s+(\S+)\s+\S+\s+(\S+)/; + $name = $1; + $inter = $2; + if ($name ne $prevname) + { + $prevname = $name; + push @inters, [ @tmp ]; + $runsperligand[$i++] = @tmp; + $#tmp = -1; + } + push @tmp, $inter; +} +push @inters, [ @tmp ]; +$runsperligand[$i++] = @tmp; +$totLigands = @inters; +$ta = $ARGV[2]; +$tb = $ARGV[3]; +if (@ARGV > 4) +{ + $n1 = $ARGV[4]; +} +else +{ + $n1 = 5; +} +if (@ARGV > 5) +{ + $n2 = $ARGV[5]; +} +else +{ + $n2 = 15; +} +printf OFILE "Command line args:\n\t"; +foreach $arg (@ARGV) +{ + printf OFILE "%s\t", $arg; +} +printf OFILE "\n\n TIME PERC1 PERC2 N1 THR1 N2 THR2\n"; +for ($t1 = $ta ; $t1 >= $tb ; $t1--) +{ + $tc = $t1 - 5; + for ($t2 = $tc ; $t2 >= ($tc - 2) ; $t2--) + { + ($cref, $dref) = + getTables($t1, $t2); + @ligBelowThr1 = @$cref; + @ligBelowThr2 = @$dref; + ($time,$p1,$p2) = simulation($n1,$t1,$n2,$t2); + if (($time < 12.0) && ($p1 < 30.0) && ($p2 < 5.0) && ($p2 > 1.0)) + { + printf OFILE "%6.3f, %6.3f, %6.3f, %4i, %4i, %4i, %4i ***\n", + $time, $p1, $p2, $n1,$t1,$n2,$t2; + } + else + { + printf OFILE "%6.3f, %6.3f, %6.3f, %4i, %4i, %4i, %4i\n", + $time, $p1, $p2, $n1,$t1,$n2,$t2; + } + } +} + + +sub simulation +{ + my(@params); + my(@lt0, @lt1); + my($thr1, $thr2,$ns1,$ns2,$tottime,$totnumHits,$total,$totnruns); + @params = @_; + $ns1 = $params[0]; + $thr1 = $params[1]; + $ns2 = $params[2]; + $thr2 = $params[3]; + $ntrials = 100; + $tottime = 0; + $totnumHitss1 = 0; + $totnumHitss2 = 0; + $total = $totLigands; + $totnruns = 0; + for ($i = 0 ; $i < $ntrials ; $i++) + { + $numHitss1 = 0; + $numHitss2 = 0; + $totnruns = 0; + for ($j = 0 ; $j < $totLigands ; $j++) + { + ($passStage1, $nruns1) = stage($ns1, $ligBelowThr1[$j]); + if ($passStage1) + { + $numHitss1++; + ($passStage2, $nruns2) = stage($ns2, $ligBelowThr2[$j]); + if ($passStage2) + { + $numHitss2++; + } + $totnruns += $nruns2; + } + $totnruns += $nruns1; + } + $time = $totnruns / $total; + $tottime += $time; + $totnumHitss1 += $numHitss1; + $totnumHitss2 += $numHitss2; + } + $tottime /= $ntrials; + $totnumHitss1 /= $ntrials; + $totnumHitss2 /= $ntrials; + $p1 = $totnumHitss1 * 100.0 / $totLigands; + $p2 = $totnumHitss2 * 100.0 / $totLigands; + return ($tottime, $p1, $p2); +} + +sub getTables +{ + my(@params); + my($totLigands, @lt1, @lt2); + my($row,$inter, $belowThr1, $belowThr2); + my($thr0, $thr1); + @params = @_; + $thr1 = $params[0]; + $thr2 = $params[1]; + $totLigands = 0; + $belowThr1 = 0; + $belowThr2 = 0; + $totLigands = 0; + for $i (0 .. $#inters) + { + $row = $inters[$i]; + for $j (0 .. $#{$row}) + { + $inter = $row->[$j]; + if ($inter <= $thr1) + { + $belowThr1++; + } + if ($inter <= $thr2) + { + $belowThr2++; + } + } + $lt1[$totLigands] = + $belowThr1 / $runsperligand[$i]; + $lt2[$totLigands] = + $belowThr2 / $runsperligand[$i]; + $totLigands++; + $belowThr1 = 0; + $belowThr2 = 0; + } + return (\@lt1, \@lt2); +} + + + +sub stage +{ + my($nruns, $p, $i, $r); + $nruns = $_[0]; + $p = $_[1]; + if ($nruns == 0) + { + return (1, 0); + } + for ($i = 0 ; $i < $nruns ; $i++) + { + $r = rand; + if ($r < $p) + { + return (1, $i); + } + } + return (0, $nruns); +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/make_grid.csh Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,11 @@ +#!/bin/csh -f +# Creates vdW grids for all receptor prm files listed on command line +# +set gridstep = 0.3 +set border = 1.0 + +foreach grid ('vdw1' 'vdw5') + foreach file ($argv) + $RBT_ROOT/bin/rbcalcgrid -r${file} -pcalcgrid_${grid}.prm -o_${grid}.grd -g${gridstep} -b${border} + end +end
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/run_rbdock.pl Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,116 @@ +#!/usr/bin/perl +use lib "$ENV{'RBT_ROOT'}/lib"; +require "run_rbfuncs.pl"; + +my $rbtroot = $ENV{'RBT_ROOT'}; +my $rbthome = $ENV{'RBT_HOME'}; +my $recepDir = "$rbtroot/data/receptors"; +my $scriptDir = "$rbtroot/data/scripts"; +my $libraryDir = "$rbtroot/data/libraries"; +my $binDir = "$rbtroot/bin"; + +#override default directories +#Parse command line arguments +my $nArgs = scalar(@ARGV); + +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + if (index($arg,'-r')==0) { + $recepDir = substr($arg,2); + } + elsif (index($arg,'-s')==0) { + $scriptDir = substr($arg,2); + } + elsif (index($arg,'-l')==0) { + $libraryDir = substr($arg,2); + } +} + + +# Get the receptor, library and script tables +my %receptor_table = get_prm_table($recepDir); +my %script_table = get_prm_table($scriptDir); +my %library_table = get_prm_table($libraryDir); + +#GET ALL THE INPUTS +my $defFiles = "*.{sd,sdf,mol,mdl}"; +print join("\n",glob $defFiles); +my $libFiles = get_input("\n\nEnter expression for SD files to dock",$defFiles); +my $receptor = get_selection(\%receptor_table,"receptor"); +my $script = get_selection(\%script_table,"docking script"); +my $flags = get_input("Enter optional rbdock flags","-ap -an"); +my $runs = get_input("Enter number of runs/ligand",20); + +my $runRoot = get_input("Enter run name","rbdock"); +my $queueSys = get_input("Enter queuing system (NQS,CODINE,CONDOR,NONE)","CONDOR"); +my $queueName; +$queueName = get_input("Enter NQS queue","QRBD") if ($queueSys eq "NQS"); + +$pwd = `pwd`; +chomp $pwd; + +#WRITE THE NQS SCRIPTS + +#GET THE LIST OF SD FILES TO DOCK +my @libList = glob "${libFiles}"; + +my $subFile = "submit_$runRoot.csh"; +open SUBHANDLE,">$subFile"; +print SUBHANDLE "#!/bin/csh -ef\n"; + +my $iFile=0; +foreach $libFile (@libList) { + $iFile++; + #Write the NQS script file + $runName = "${runRoot}_$iFile"; + $nqsFile = "run_${runName}.csh"; + my $logFile = "${runName}.log"; + + print "Writing $nqsFile...\n"; + + open NQSHANDLE,">$nqsFile"; + print NQSHANDLE "#!/bin/csh -ef\n"; + print NQSHANDLE "setenv RBT_ROOT $rbtroot\n"; + print NQSHANDLE "setenv RBT_HOME $rbthome\n"; + print NQSHANDLE "source \$RBT_ROOT/bin/setlibrbt.com\n"; + print NQSHANDLE "cd $pwd\n"; + print NQSHANDLE "if (-e $logFile) rm -f $logFile\n"; + print NQSHANDLE "\$RBT_ROOT/bin/rbdock -i$libFile -o${runName} -r$receptor -p$script -n$runs $flags > $logFile\n"; + close NQSHANDLE; + + chmod 0755,$nqsFile; + print SUBHANDLE "qsub -q $queueName $nqsFile\n" if ($queueSys eq "NQS"); + print SUBHANDLE "qsub -e $pwd/$runRoot/tmp/ -o $pwd/$runRoot/tmp/ $nqsFile\n" if ($queueSys eq "CODINE"); + if ($queueSys eq "CONDOR") { + my $condorFile = "run_${runName}.cmd"; + open CMDHANDLE,">$condorFile"; + print CMDHANDLE "#CONDOR FILE WRITTEN BY run_rbscreen.pl\n"; + print CMDHANDLE "universe = vanilla\n"; + print CMDHANDLE "executable = ${nqsFile}\n"; + print CMDHANDLE "Requirements = CATALYST_NODE =!= True\n"; + print CMDHANDLE "output = ${nqsFile}.out\n"; + print CMDHANDLE "error = ${nqsFile}.err\n"; + print CMDHANDLE "log = ${nqsFile}.log\n"; + print CMDHANDLE "queue\n"; + close NQSHANDLE; + print SUBHANDLE "condor_submit $condorFile\n"; + } +} +close SUBHANDLE; +chmod 0755,$subFile; + + + + + + + + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/run_rbscreen.pl Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,416 @@ +#!/usr/bin/perl +use File::stat; +use lib "$ENV{'RBT_ROOT'}/lib"; +use strict; +require "run_rbfuncs.pl"; + +my $rbtroot = $ENV{'RBT_ROOT'}; +my $rbthome = $ENV{'RBT_HOME'}; +my $rbtligdb = $ENV{'RBT_LIGDB'}; +my $recepDir = "$rbtroot/data/receptors"; +my $scriptDir = "$rbtroot/data/scripts"; +my $filterDir = "$rbtroot/data/filters"; +my $libraryDir = "$rbtligdb/dock"; +my $binDir = "$rbtroot/bin"; + +#override default directories +#Parse command line arguments +my $nArgs = scalar(@ARGV); + +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + if (index($arg,'-r')==0) { + $recepDir = substr($arg,2); + } + elsif (index($arg,'-s')==0) { + $scriptDir = substr($arg,2); + } + elsif (index($arg,'-l')==0) { + $libraryDir = substr($arg,2); + } +} + + +# Get the receptor, library and script tables +#my %receptor_table = get_prm_table($recepDir); +my %receptor_table = get_prm_table($rbthome); +my %script_table = get_prm_table($scriptDir); +my %library_table = get_dock_table($libraryDir); + +#GET ALL THE INPUTS +#my $library = get_selection(\%library_table,"library"); +my $isOK; +my @libraries; +my $receptor; +my $script; + +do { + @libraries = get_multiple_selection(\%library_table,"library"); + $isOK = get_input("\nIs this OK (Y/N)","Y"); +} until ($isOK eq "Y"); + +do { + $receptor = get_selection(\%receptor_table,"receptor"); + $isOK = get_input("\nIs this OK (Y/N)","Y"); +} until ($isOK eq "Y"); + +# (SJ) check whether all the required files are in the dir +# DM 7 Feb 2002 - get PSF/CRD filenames from PRM file +print "\nLooking for all required target files in $rbthome...\n\n"; +my $prm_file = "$rbthome/".$receptor; +my $prm_root = substr($prm_file, 0, rindex($prm_file,".")); +my $as_file = "${prm_root}.as"; +my $vdw1_file = "${prm_root}_vdw1.grd"; +my $vdw5_file = "${prm_root}_vdw5.grd"; + +#Set the default file names equal to their parameter names so that +#error messages make sense if the parameters are missing completely from +#the .prm file. +my $psf_file = "RECEPTOR_TOPOL_FILE"; +my $crd_file = "RECEPTOR_COORD_FILE"; +my $mol2_file; + +# DM 12 May 2003 +# Optional target files for experimental restraints +# +# These default restraint suffixes and filenames should be synchronised +# with the default parameter values in the relevant C++ class source files ($RBT_ROOT/fw/src/.cxx) +# They are used if the user does not explicitly define the parameter in the +# receptor .prm file +# +# Note: this code searches for specific SECTION names in the receptor .prm file +# thus forcing a naming convention for the restraint terms that is not mandatory in the C++ code itself +# +# Pharmacophore restraint file +# SECTION PHARMA +# CONSTRAINTS_FILE = explicit filename +# OPTIONAL_FILE = explicit filename +my $ph4_filename = ".const";#synchronise with RbtPharmaSF::_CONSTRAINTS_FILE +my $ph4_optfilename = ""; +my $has_ph4 = 0; +# Tether restraint file +# SECTION TETHER +# REFERENCE_FILE = suffix to the root receptor.prm filename +my $tether_suffix = "_reference.sd";#synchronise with RbtTetherSF::_REFERENCE_FILE +my $has_tether = 0; +# Nmr restraint file +# SECTION NMR +# FILENAME = explicit filename (not suffix) +my $nmr_filename = "default.noe";#synchronise with RbtTetherSF::_FILENAME +my $has_nmr = 0; + +open PRMHANDLE,$prm_file || die "Cannot read $prm_file"; +while (<PRMHANDLE>) { + my @fields = split(' ',$_); + my $nfields = scalar(@fields); + $psf_file = "$rbthome/".$fields[1] if (($fields[0] eq 'RECEPTOR_TOPOL_FILE') && ($nfields > 1)); + $crd_file = "$rbthome/".$fields[1] if (($fields[0] eq 'RECEPTOR_COORD_FILE') && ($nfields > 1)); + $mol2_file = "$rbthome/".$fields[1] if (($fields[0] eq 'RECEPTOR_FILE') && ($nfields > 1)); + chomp $psf_file; + chomp $crd_file; + $has_ph4 = 1 if (($nfields > 1) && ($fields[0] eq 'SECTION') && ($fields[1] eq 'PHARMA') ); + $ph4_filename = "$rbthome/".$fields[1] if (($nfields > 1) && ($fields[0] eq 'CONSTRAINTS_FILE') && $has_ph4); + $ph4_optfilename = "$rbthome/".$fields[1] if (($nfields > 1) && ($fields[0] eq 'OPTIONAL_FILE') && $has_ph4); + $has_tether = 1 if (($nfields > 1) && ($fields[0] eq 'SECTION') && ($fields[1] eq 'TETHER') ); + $tether_suffix = $fields[1] if (($nfields > 1) && ($fields[0] eq 'REFERENCE_FILE') && $has_tether); + $has_nmr = 1 if (($nfields > 1) && ($fields[0] eq 'SECTION') && ($fields[1] eq 'NMR') ); + $nmr_filename = "$rbthome/".$fields[1] if (($nfields > 1) && ($fields[0] eq 'FILENAME') && $has_nmr); +} +close PRMHANDLE; + +my @src_files = ($prm_file,$as_file, $vdw1_file, $vdw5_file); +# check for either MOL2 or PSF/CRD +if ($mol2_file ne "") { + push (@src_files,"$mol2_file"); +} +else { + push (@src_files,"$psf_file"); + push (@src_files,"$crd_file"); +} +push (@src_files,"${ph4_filename}") if ($has_ph4); +push (@src_files,"${ph4_optfilename}") if (($has_ph4) && ($ph4_optfilename ne "")); +push (@src_files,"${prm_root}${tether_suffix}") if ($has_tether); +push (@src_files,"${nmr_filename}") if ($has_nmr); + +my $is_file_missing = 0; +foreach my $srcfile ( @src_files) { # search through all the target files + if(-e $srcfile) { + print "Found: $srcfile"; + my $sb = stat($srcfile); + #if ($sb->size < 1024) {print " WARNING size is suspiciously small ",$sb->size}; + print "\n"; + } + else { + print "ERROR: $srcfile is missing\n"; + $is_file_missing = 1; + } +} + +if ($is_file_missing) { + print "\nCannot continue - please generate all required target files and place in $rbthome\n"; + exit; +} + +do { + $script = get_selection(\%script_table,"docking script"); + $isOK = get_input("\nIs this OK (Y/N)","Y"); +} until ($isOK eq "Y"); + +my $flags = get_input("Enter optional rbdock flags","-ap -an"); +my $mode; +do +{ + $mode = get_input("High Throughput (HT) or Exhaustive Docking (ED)", "HT"); +}until (($mode eq "HT") || ($mode eq "ED")); + +my $HT; +my $filter; +my $FILTERFILE; +my $nRedock; +my $nStages; +my @targets; +my @runs; +my $redockTarget; +my $redockTargetDef = -20.0;#Default score threshold for redocking phase + +if ($mode eq "ED") +{ + $HT = 0; + my $run = get_input("Enter number of runs/ligand",50); + $flags = $flags . " -n" . $run; + $run--; + $filter = "1\nif - SCORE.NRUNS $run 0.0 -1.0,\n0\n"; +} +else +{ + $HT = 1; + $nStages = get_input("Number of stages (excluding redocking, enter 0 for redocking only)",2); + + #GET AND STORE THE TARGET SCORES AND NUMBER OF RUNS FOR EACH STAGE + + if ($nStages > 0) { + for (my $n = 0; $n < $nStages; $n++) { + print "\nDOCKING STAGE $n\n"; + my $target = get_input("Enter target score",-20.0); + my $run = get_input("Enter number of runs/ligand",20); + if ($run > 0) + { + push @targets,$target; + push @runs,$run; + } + } + $nStages = @targets; + print "\n"; + #Default score threshold for redocking is equal to target score for last docking stage + $redockTargetDef = $targets[$nStages-1]; + } + + $nRedock = get_input("Enter number of redocking runs/ligand",0); + if ($nRedock > 0) { + $redockTarget = get_input("Enter target score for redocking",$redockTargetDef); + } + else + { + $redockTarget = $redockTargetDef; + } + my $nPhases = $nStages; + if ($nRedock > 0) + { + $nPhases++; + } + $filter = "$nPhases\n"; + my $nr; + for (my $n = 0; $n < $nStages ; $n++) + { + $nr = $runs[$n] - 1; + $filter .= "if - $targets[$n] SCORE.INTER 1.0 if - SCORE.NRUNS $nr 0.0 -1.0,\n"; + } + if ($nRedock > 0) + { + $nr = $nRedock - 1; + $filter .= "if - SCORE.NRUNS $nr 0.0 -1.0,\n"; + } + my %filter_table = get_filter_table($filterDir, $redockTarget); + my @wfilters; + do { + print "\nSelect filters. Only the conformations that pass all the\n", + "filters chosen AND have SCORE.INTER lower than ", $redockTarget, + "\nAND SCORE.RESTR.CAVITY < 1.0 will be written down\n"; + @wfilters = get_multiple_selection(\%filter_table,"filters"); + $isOK = get_input("\nIs this OK (Y/N)","Y"); + } until ($isOK eq "Y"); + + for (my $i = 0 ; $i < @wfilters ; $i++) # remove the no filters + { + if ($wfilters[$i] eq "1no_other_filters") + { + splice(@wfilters, $i, 1); + } + } + + my $nwfilters = @wfilters + 2; + $filter .= "$nwfilters\n"; + $filter .= "- SCORE.INTER $redockTarget,\n"; + $filter .= "- SCORE.RESTR.CAVITY 1.0,\n"; + foreach my $wfilter (@wfilters) { + open(FF, "$filterDir/$wfilter") || open(FF, "./$wfilter") || + die "cannot open $wfilter for reading: $!"; + my $line; + while ($line = <FF>){ $filter .= $line;} + $filter .= ",\n"; + } +} + + + + my $runRoot = get_input("Enter run name","rbdock"); + + my $pwd = `pwd`; + chomp $pwd; + + #Make a subdirectory for each docking stage + #plus a tmp directory + mkdir($runRoot,0755); + mkdir("$runRoot/tmp",0755); + mkdir("$runRoot/tmp/condor_logs",0755); + mkdir("$runRoot/log",0755); + mkdir("$runRoot/sd",0755); + mkdir("$runRoot/errors",0755); + chdir "$runRoot/tmp"; + + my $filterFile = "${runRoot}_filter.txt"; + print "Writing $filterFile...\n"; +open FILTERHANDLE,">$filterFile"; +print FILTERHANDLE $filter; +#WRITE THE CUSTOM DOCKING SCRIPT FOR USE BY CONDOR +#This will be the Condor "Executable" +#gzipped SD file to dock is passed as $1 argument +#Run name is passed as $2 argument +my $nqsFile = "run_${runRoot}.csh"; +print "Writing $nqsFile...\n"; +open NQSHANDLE,">$nqsFile"; +print NQSHANDLE "#!/bin/tcsh -f\n"; +print NQSHANDLE "#CONDOR EXECUTABLE SCRIPT WRITTEN BY run_rbscreen.pl\n"; + +# DM 14 Mar 2003 +#Create a tmp runtime directory on the local disk for rDock SD and log files +#These will be compressed before copying back to the submission directory +#on the file server +#There are now two tmp directories: +# /tmp/ribodock/<version> - stores the grid files etc that are common between runs +# /tmp/${PROG}.XXXXXX - stores the I/O SD and log files unique to each run + +print NQSHANDLE "set PROG = \${0:t}\n"; +print NQSHANDLE "set TMPDIR = `mktemp -d /tmp/\${PROG}.XXXXXX`\n"; + +# synchronizing target files +my $versionidx = index $rbtroot,"ribodock"; +my $targetdir = "/tmp/".substr($rbtroot,$versionidx); +print NQSHANDLE "mkdir -pv -m0775 /tmp/ribodock\n"; +print NQSHANDLE "mkdir -pv -m0775 $targetdir\n"; +print NQSHANDLE "setenv RBT_HOME $targetdir\n"; +print NQSHANDLE "rsync -avz ".join(" ",@src_files)." $targetdir\n"; + +# Copy the compressed sd file to the tmp directory and uncompress +# DM 28 Feb 2002 - use the /tmp directory on the local disk instead +# DM 14 Mar 2003 - use TMPDIR = /tmp/${PROG}.XXXXXX instead +print NQSHANDLE "set gzfile = \$1\n"; +print NQSHANDLE "set runName = \$2\n"; +print NQSHANDLE "set copied_gzfile = \${TMPDIR}/\${gzfile:t}\n"; +print NQSHANDLE "set sdfile = \${copied_gzfile:r}\n"; +print NQSHANDLE "echo Ligand_DB : \$gzfile\n"; +print NQSHANDLE "echo Local copy: \$copied_gzfile\n"; +print NQSHANDLE "echo Unzipped : \$sdfile\n"; +print NQSHANDLE "cp -f \$gzfile \$copied_gzfile\n"; +print NQSHANDLE "gunzip -f \$copied_gzfile\n"; + +my $target = $redockTarget; +print NQSHANDLE "#\n"; +print NQSHANDLE "set outRoot = \${TMPDIR}/\${runName}\n"; +print NQSHANDLE "set logFile = \${outRoot}.log\n"; +print NQSHANDLE "if (-e \$logFile) rm -f \$logFile\n"; +print NQSHANDLE "\$RBT_ROOT/bin/rbdock -i\$sdfile -o\${outRoot} -r$receptor -p$script -t$filterFile $flags > \$logFile\n"; + +# Compress the SD and log files and copy back to submission directory +# The ..._errors.sd files are generated by the pharmacophore restraints term for ligands that +# do not fulfill the feature requirements +print NQSHANDLE "if (-e \${outRoot}.sd) then\n"; +print NQSHANDLE " gzip -9vf \${outRoot}.sd\n"; +print NQSHANDLE " cp \${outRoot}.sd.gz ../sd\n"; +print NQSHANDLE "endif\n"; +print NQSHANDLE "if (-e \${logFile}) then\n"; +print NQSHANDLE " gzip -9vf \${logFile}\n"; +print NQSHANDLE " cp \${logFile}.gz ../log\n"; +print NQSHANDLE "endif\n"; +print NQSHANDLE "if (-e \${outRoot}_errors.sd) then\n"; +print NQSHANDLE " gzip -9vf \${outRoot}_errors.sd\n"; +print NQSHANDLE " cp \${outRoot}_errors.sd.gz ../errors\n"; +print NQSHANDLE "endif\n"; + +print NQSHANDLE "rm -rf \${TMPDIR}\n"; +close NQSHANDLE; +chmod 0755,$nqsFile; + +#PREPARE THE CONDOR JOB FILE +my $condorFile = "run_${runRoot}.cmd"; +print "Writing $condorFile...\n"; +open CMDHANDLE,">$condorFile"; +print CMDHANDLE "#CONDOR COMMAND FILE WRITTEN BY run_rbscreen.pl\n"; +print CMDHANDLE "universe = vanilla\n"; +print CMDHANDLE "initialdir = $pwd/$runRoot/tmp\n"; +print CMDHANDLE "environment = RBT_ROOT=$rbtroot;RBT_LIGDB=$rbtligdb;RBT_HOME=$rbthome;LD_LIBRARY_PATH=/usr/local/lib:$rbtroot/lib;\n"; +print CMDHANDLE "executable = ${nqsFile}\n"; +#Stop Condor filling your mailbox +print CMDHANDLE "Notification = Error\n"; +print CMDHANDLE "log = run_${runRoot}.log\n"; +# Added by DM, Enspiral Discovery, 28 Feb 2005 +# Prevent rDock jobs running on dedicated Catalyst nodes +print CMDHANDLE "Requirements = CATALYST_NODE =!= True\n"; + +#GET THE LIST OF COMPRESSED SD FILES TO DOCK +my $iFile=0; +foreach my $library (@libraries) { + my @libFiles = glob "$library_table{$library}"; + foreach my $libFile (@libFiles) { + $iFile++; + my $runName = "${runRoot}_$iFile"; + print CMDHANDLE "output = condor_logs/run_${runName}.out\n"; + print CMDHANDLE "error = condor_logs/run_${runName}.err\n"; + print CMDHANDLE "arguments = $libFile $runName\n"; + print CMDHANDLE "queue\n"; + } +} +close CMDHANDLE; + +#WRITE THE CUSTOM CLEANUP SCRIPT +#This will be run by the user from the command line +#after all the jobs have finished +my $cleanFile = "clean_${runRoot}.csh"; +print "Writing $cleanFile...\n"; +open CLEANHANDLE,">$cleanFile"; +print CLEANHANDLE "#!/bin/tcsh -f\n"; +print CLEANHANDLE "#CLEANUP SCRIPT WRITTEN BY run_rbscreen.pl\n"; +print CLEANHANDLE "set sdout = ../${runRoot}_raw_hits.sd\n"; +print CLEANHANDLE "if (-e \${sdout}) then\n"; +print CLEANHANDLE " echo ARE YOU SURE YOU WANT TO DO THIS\n"; +print CLEANHANDLE " echo \${sdout} already exists\n"; +print CLEANHANDLE " echo To rerun the script remove \${sdout} manually first\n"; +print CLEANHANDLE " exit\n"; +print CLEANHANDLE "endif\n"; +print CLEANHANDLE "echo All compressed SD files in ../sd/ will be concatenated into a single compressed file \${sdout}.gz\n"; +print CLEANHANDLE "touch \${sdout}\n"; +print CLEANHANDLE "foreach file (../sd/*.sd.gz)\n"; +print CLEANHANDLE " echo Sorting and concatenating \${file}...\n"; +print CLEANHANDLE " zcat \${file} | \${RBT_ROOT}/bin/sdsort -n -fSCORE -s >> \${sdout}\n"; +print CLEANHANDLE "end\n"; +print CLEANHANDLE "echo Each block of consecutive records per ligand have been presorted by SCORE "; +print CLEANHANDLE "ready for filtering\n"; +print CLEANHANDLE "echo Compressing \${sdout}...\n"; +print CLEANHANDLE "gzip -9vf \${sdout}\n"; +print CLEANHANDLE "echo DONE\n"; +print CLEANHANDLE "echo You may now safely remove the ../sd/ directory if you wish to save disk space\n"; +close CLEANHANDLE; +chmod 0755,$cleanFile; + +print "To submit, cd $runRoot/tmp; condor_submit $condorFile\n"; +print "When all jobs have completed, run $runRoot/tmp/$cleanFile to concatenate the output files\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdfield Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/perl +# add extra fields to SD records +use lib "$ENV{'RBT_ROOT'}/lib"; +use SDRecord; + +printHelpAndExit() if (scalar(@ARGV) == 0); +#Parse command line arguments +my @files; +my $arg = shift @ARGV; +printHelpAndExit() if ($arg eq '-h'); +#read records +my $sdRec = new SDRecord; +my $nRec=0; +# if not -h, then must be the FieldName +my $FieldName = $arg; +my $FieldVal = shift @ARGV; # and this must be the filed value +# rest are the filenames +while ($sdRec->readRec('DATA'=>1,'LINES'=>1)) { + $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field + $sdRec->addData('_FILE' => $ARGV);#add filename as temp data field + $sdRec->addDataAndLines($FieldName => $FieldVal); + $sdRec->writeRec();#write record +} + +sub printHelpAndExit { + print "\nAdding fields to SD files\n"; + print "Usage:\n"; + print "\tsdfield <Fieldname> <value> <infile[s]>\n"; + print "\t-h\tthis help\n"; + print "\t\n"; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdfilter Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,221 @@ +#!/usr/bin/perl +# Filters SD records by given data field * +# * +# Usage: sdfilter -f'$<DataField> <Operator> <Value>' [sdFiles] * +# or sdfilter -f<filename> [sdFiles] * +# * +# where <filename> contains a list of <DataField> <Operator> <Value> * +# filters, one per line. * +# Multiple filters are OR'd together (for AND, pipe to a 2nd sdfilter) * +# * +# If sdFiles not given, reads from standard input * +# Output is to standard output * +#******************************************************************************* +use lib "$ENV{'RBT_ROOT'}/lib"; + +use SDRecord; + +#keep track of the number of occurrences of each value of this key +my $summaryKey = "Name"; + +#list of filter strings +#all filters in the list are OR'd together +my @SDFILTERS; + +# DM 5 Dec 2000 - new option to filter on diverse_molecules.list +# as output by C2 +# Essentially just a list of record numbers +my $cerius2filter = 0; +my $cerius2file = "diverse_molecules.list"; +my @cerius2recs; + +#Print help if no command line arguments +printHelpAndExit() if (scalar(@ARGV) == 0); + +#Parse command line arguments +my @files; +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + printHelpAndExit() if ($arg eq '-h'); + if (index($arg,'-f')==0) { + $arg = substr($arg,2); + #try opening the argument as a file + if (open FILTERFILE,$arg) { + #get the list of filter strings from the file + my @strList = <FILTERFILE>; + close FILTERFILE; + chomp @strList; + #build the filter strings into a proper Perl expression for eval + #and add to the list + push @SDFILTERS,buildFilterList(@strList); + } + else { + #build the filter string into a proper Perl expression for eval + #and add to the list + push @SDFILTERS, buildFilterList($arg); + } + } + elsif (index($arg,'-sup')==0) { + my $supplier = substr($arg,4); + my $supplierFilter = "scalar(grep /$supplier/, \@supplierList) > 0"; + #print "Supplier filter = $supplierFilter\n"; + push @SDFILTERS, $supplierFilter; + } + elsif (index($arg,'-s')==0) { + $summaryKey = substr($arg,2); + } + #DM 05 Dec 2000 - c2 option + elsif (index($arg,'-c2')==0) { + $cerius2filter = 1; + $arg = substr($arg,3); + $cerius2file = $arg if ($arg); #argument overrides default file name + if (open FILTERFILE,$cerius2file) { + #Skip first two lines + my $tmp = <FILTERFILE> || die "Invalid Cerius2 file $cerius2file"; + $tmp = <FILTERFILE> || die "Invalid Cerius2 file $cerius2file"; + while (<FILTERFILE>) { + chomp; + push @cerius2recs,split(" ",$_); + } + close FILTERFILE; + } + die "No records in $cerius2file\n" if (scalar(@cerius2recs)==0); + #Sort into reverse order so we can pop the record #'s off later + @cerius2recs = reverse (sort {$a<=>$b} @cerius2recs); + #foreach $rec (@cerius2recs) { + # print "$rec\n"; + #} + } + else { + push @files,$arg;#must be a filename + } +} +push @ARGV,@files;#put the filenames back in the arg list + +#read records +my $sdRec = new SDRecord; +my %count;#record the number of occurrences of each value of $summaryKey +my $nRec=0; +#In C2 mode, just need to check record numbers so we can read the SD file +#without parsing the data fields +if ($cerius2filter) { + my $c2nextRec = pop @cerius2recs; + while ($sdRec->readRec('LINES'=>1)) { + if ((++$nRec)==$c2nextRec) { + $sdRec->writeRec(); + exit if (scalar(@cerius2recs)==0); + $c2nextRec=pop @cerius2recs; + } + } +} +#Normal mode, need to parse the data fields +else { + while ($sdRec->readRec('DATA'=>1,'LINES'=>1)) { + $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field + $sdRec->addData('_COUNT' => ++$count{$sdRec->{'DATA'}->{$summaryKey}});#add count as temp data field + my @supplierList = buildSupplierList($sdRec); + foreach $filter (@SDFILTERS) { + if (eval $filter) { + $sdRec->writeRec();#write record if one of the filters returns true + last; + } + } + } +} + +####################################################################### +# Compose a complete SD filter string for each string in the parameter list +# Complete strings are of the form: $sdRec->{'DATA'}->{'Name'} eq RBT162" +# Note: the SDRecord object name is hardcoded as $sdRec +sub buildFilterList { + my @strList = @_;#input list of unexpanded filter strings + my @filterList;#output list of expanded filter strings + + #loop over each input string + foreach $str (@strList) { + my $filter; + my $i1 = 0; + my $lenstr = length($str); + while ($i1 < $lenstr) { + #print "i1=$i1\n"; + $i2 = index($str,"\$",$i1); + #print "i2=$i2\n"; + if ($i2 == -1) { + $filter .= substr($str,$i1); + last; + } + $filter .= substr($str,$i1,$i2-$i1); + #DM 04 Sept 2000 - allow for spaces in field names + #by surrounding with {} brackets + #e.g. ${CDD id} + #Check for { first, if not found assume standard format (e.g. $Name) + my $i3 = index($str,"{",$i2+1); + if ($i3 == $i2+1) { + my $i4 = index($str,"}",$i3+1); + my $var; + if ($i4 == -1) { + $var = substr($str,$i3+1); + $filter .= "\$sdRec->{'DATA'}->{'$var'}"; + last; + } + else { + $var = substr($str,$i3+1,$i4-$i3-1); + $filter .= "\$sdRec->{'DATA'}->{'$var'}"; + } + $i1 = $i4+1; + } + #Standard format + else { + $i3 = index($str," ",$i2+1); + #print "i3=$i3\n"; + my $var; + if ($i3 == -1) { + $var = substr($str,$i2+1); + $filter .= "\$sdRec->{'DATA'}->{'$var'}"; + last; + } + else { + $var = substr($str,$i2+1,$i3-$i2-1); + $filter .= "\$sdRec->{'DATA'}->{'$var'}"; + } + $i1 = $i3; + } + } + #print "$filter\n"; + push @filterList,$filter; + } + return @filterList; +} + +####################################################################### +# Extract the list of suppliers from the Cat_coden multi-line data field +sub buildSupplierList { + my $rec = shift; + my @suppliers; + my $suppBase = $rec->{'DATAREF'}->{'Cat_coden'}+1; + my $linesRef = $rec->{'LINES'}; + if (defined $suppBase) { + while ($$linesRef[$suppBase] ne "") { + push @suppliers,$$linesRef[$suppBase++]; + } + } + #print "Supplier list:\n"; + #print join("\n",@suppliers); + return @suppliers; +} + +####################################################################### +sub printHelpAndExit { + print "\nFilters SD records by data fields\n"; + print "\nUsage:\tsdfilter -f'\$<DataField> <Operator> <Value>' [-s<DataField>] [sdFiles]\n"; + print "or\tsdfilter -f<filename> [-s<DataField>] [sdFiles]\n\n"; + print "Note:\tMultiple filters are allowed and are OR'd together.\n"; + print "\tFilters can be provided in a file, one per line.\n"; + print "\n\tStandard Perl operators should be used. e.g.\n"; + print "\teq ne lt gt le ge for strings\n\t== != < > <= >= for numeric\n"; + print "\n\t_REC (record #) is provided as a pseudo-data field\n"; + print "\tif -s option is used, _COUNT (#occurrences of DataField) is provided as a pseudo-data field\n"; + print "\n\tIf SD file list not given, reads from standard input\n"; + print "\tOutput is to standard output\n\n"; + exit; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdmodify Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,59 @@ +#!/usr/bin/perl +# Script to set the first title line equal to a given data field +# +# Usage: sdmodify -f<DataField> [sdFiles] +# +# If sdFiles not given, reads from standard input +# Output is to standard output + +use lib "$ENV{'RBT_ROOT'}/lib"; + +use SDRecord; + +my $SDMODKEY = "ID"; + +#Print help if no command line arguments +printHelpAndExit() if (scalar(@ARGV) == 0); + +#Parse command line arguments +my @files; +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + printHelpAndExit() if ($arg eq '-h'); + if (index($arg,'-f')==0) { + $SDMODKEY = substr($arg,2);#modification key + } + else { + push @files,$arg;#must be a filename + } +} +push @ARGV,@files;#put the filenames back in the arg list + +#read records +my $sdRec = new SDRecord; +my $nRec=0; +while ($sdRec->readRec('DATA'=>1,'LINES'=>1)) { + $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field + #set the first line equal to the given data field + my $nLines = scalar (@{$sdRec->{'LINES'}}); + if ($nLines > 0) { + ${$sdRec->{'LINES'}}[0] = $sdRec->{'DATA'}->{$SDMODKEY}; + } + $sdRec->writeRec(); +} + +#!/usr/bin/perl +# Script to set the first title line equal to a given data field +# +# Usage: sdmodify -f<DataField> [sdFiles] +# +# If sdFiles not given, reads from standard input +# Output is to standard output + +sub printHelpAndExit { + print "\nScript to set the first title line equal to a given data field.\n"; + print "\nUsage:\tsdmodify -f<DataField> [sdFiles]\n"; + print "\n\tIf sdFiles not given, reads from standard input.\n"; + print "\n\tOutput is to standard output.\n\n"; + exit; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdreport Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,437 @@ +#!/usr/bin/perl +# Produces text summaries of SD records * +# * +# Usage: sdreport [-l] [-t] [-s<DataField>] [sdFiles] * +# * +# -l output data fields for each record as processed * +# -t tabulate Rbt.Score.* fields for each record as processed * +# -s summarise data fields for all records * +# -s<DataField> summarise data fields for each unique value * +# of <DataField> * +# * +# Note: If -l or -t are combined with -s, the listing/table is output * +# within each ligand summary * +# If sdFiles not given, reads from standard input * +# Output is to standard output * +# Default is equivalent to sdfilter -l * +#******************************************************************************* +use lib "$ENV{'RBT_ROOT'}/lib"; + +use SDRecord; + +# Default field names and headings for rDock v4.00 scores +my @defaultFields = ('SCORE','SCORE.INTER','SCORE.INTRA','SCORE.RESTR','SCORE.INTER.VDW'); +my @defaultHeadings = ('TOTAL','INTER','INTRA','RESTR','VDW'); + +# Default field names and headings for normalised scores (score / #ligand heavy atoms) +my @defaultNormFields = ('SCORE.norm','SCORE.INTER.norm','SCORE.INTRA.norm','SCORE.RESTR.norm','SCORE.heavy'); +my @defaultNormHeadings = ('TOTALn','INTERn','INTRAn','RESTRn','#heavy'); + +# Default field names and headings for rDock v3.00 scores +my @defaultOldFields = ('Rbt.Score.Corrected','Rbt.Score.Inter','Rbt.Score.Intra','Rbt.Score.IntraMin','Rbt.Score.Restraint'); +my @defaultOldHeadings = ('TOTAL','INTER','INTRA','INTRAMIN','RESTR'); + +my $listFormat = 0; +my $summaryFormat = 0; +my $tableFormat = 0; +my $supplierFormat = 0; +my $csvFormat = 0; +my $summaryKey = "_TITLE1"; +my $oldFields = 0;#If true, use old default field names for component scores +my $headings = 1;#DM 21 Nov 2000, If false, don't output headings +my @outputFields; +my @outputHeadings; + +#Print help if no command line arguments +printHelpAndExit() if (scalar(@ARGV) == 0); + +#Parse command line arguments +my $nArgs = scalar(@ARGV); + +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + printHelpAndExit() if ($arg eq '-h'); + if (index($arg,'-l')==0) { + $listFormat = 1; + } + elsif (index($arg,'-o')==0) { + $oldFields = 1; + } + # 7 Feb 2005 (DM) Option to report normalised aggregate scores + elsif (index($arg,'-norm')==0) { + $oldFields = 2; + } + elsif (index($arg,'-sup')==0) { + $supplierFormat = 1; + } + elsif (index($arg,'-s')==0) { + $summaryFormat = 1; + } + elsif (index($arg,'-id')==0) { + $summaryKey = substr($arg,3); + } + elsif (index($arg,'-nh')==0) { + $headings = 0; + } + elsif (index($arg,'-t')==0) { + $tableFormat = 1; + push @outputFields, split(',',substr($arg,2)); + push @outputHeadings, @outputFields; + } + elsif (index($arg,'-c')==0) { + $csvFormat = 1; + push @outputFields, split(',',substr($arg,2)); + push @outputHeadings, @outputFields; + } + else { + push @files,$arg;#must be a filename + } +} +push @ARGV,@files;#put the filenames back in the arg list + +#use -l if neither table format is specified +$listFormat = (!$tableFormat && !$csvFormat && !$supplierFormat); + +#If no output fields defined for -t or -c use the defaults (old or new) +if (scalar(@outputFields)==0) { + if ($oldFields == 1) { + @outputFields = @defaultOldFields; + @outputHeadings = @defaultOldHeadings; + } + elsif ($oldFields == 2) { + @outputFields = @defaultNormFields; + @outputHeadings = @defaultNormHeadings; + } + else { + @outputFields = @defaultFields; + @outputHeadings = @defaultHeadings; + } +} + +my $sdRec = new SDRecord; +my %summary;#hash of SDRecord lists, indexed by user-defined summary key +my %indexByName; +my %indexByNum; +my $idx = 0; +my $nRec = 0; + +#Column headings for tab and csv format +#DM 21 Nov 2000 - if $headings is false, then don't output the column headings +if ($tableFormat && !$summaryFormat && $headings) { + tabHeadings($summaryKey,@outputHeadings); +} +if ($csvFormat && !$summaryFormat && $headings) { + csvHeadings($summaryKey,@outputHeadings); +} + +#read records +while ($sdRec->readRec('LINES'=>1,'DATA'=>1)) { + $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field + if ($listFormat && !$summaryFormat) { + print "\n\nRECORD #$nRec\n"; + $sdRec->writeData(); + } + if ($tableFormat && !$summaryFormat) { + @recList = ($sdRec); + tabScores(\@recList,$summaryKey,@outputFields); + } + elsif ($csvFormat && !$summaryFormat) { + @recList = ($sdRec); + csvScores(\@recList,$summaryKey,@outputFields); + } + elsif ($supplierFormat && !$summaryFormat) { + @recList = ($sdRec); + tabulateSuppliers(\@recList,$summaryKey); + } + #add record to summary, indexed by user field value + #keep a separate index of unique values of user field values, + #indexed by number in the order the values first appear + #In this way we can maintain the sorted order of ligands + #when we come to print out the summary + if ($summaryFormat) { + my $summaryValue = $sdRec->{'DATA'}->{$summaryKey}; + #New data field value encountered + if (!defined $indexByName{$summaryValue}) { + $idx++;#incr the number of unique values + #keep cross-referenced indexes (field value <-> number) + $indexByName{$summaryValue} = $idx; + $indexByNum{$idx} = $summaryValue; + } + push @{$summary{$summaryValue}},$sdRec->copy('DATA'=>1); + } +} + +#Print summary if required +if ($summaryFormat) { + print "\n===============================================================\n"; + print "SUMMARY BY $summaryKey\n"; + foreach $idx (sort {$a<=>$b} keys %indexByNum) {#numberic sort of index numbers + my $key = $indexByNum{$idx};#look up corresponding data field value + print "\n===============================================================\n"; + print "$summaryKey = $key (#$idx)\n\n"; + writeSummary($summary{$key}); + if ($listFormat) { + print "\nIndividual records:\n"; + foreach $rec (@{$summary{$key}}) { + print "\n"; + $rec->writeData(); + } + } + if ($tableFormat) { + print "\nScores:\n"; + tabHeadings($summaryKey,@outputHeadings); + tabScores($summary{$key},$summaryKey,@outputFields); + } + if ($csvFormat) { + print "\nScores:\n"; + csvHeadings($summaryKey,@outputHeadings); + csvScores($summary{$key},$summaryKey,@outputFields); + } + } +} + +############################################################## +# writes a summary to STDOUT for a list of SDRecords +# Input is a reference to an array of SDRecords +sub writeSummary { + my $recListRef = shift; + + #Extract the list of data values for each fieldname into a hash array + #(key=fieldname, value=array ref) + my %fields; + foreach $rec (@{$recListRef}) { + my ($key,$value); + while ( ($key,$value) = each %{$rec->{'DATA'}}) { + push @{$fields{$key}},$value; + } + } + + #Look for constant fields and store separately + my %constFields; + foreach $key (keys %fields) { + my @values = sort @{$fields{$key}}; + my $nVal = scalar(@values); + if ($values[0] eq $values[$nVal -1]) { + $constFields{$key} = $values[0];#store the field name and the constant value + delete $fields{$key};#remove from (non-const) array + } + } + + #Print constant fields + print "\nConstant fields:\n\n"; + foreach $key (sort keys %constFields) { + printf "%-40s%s\n",$key,$constFields{$key}; + } + #Print min and max value for non-const fields + print "\nVariable fields:\n\n"; + foreach $key (sort keys %fields) { + my @values = @{$fields{$key}}; + #Look at first value to decide whether to do text or numeric sort + if (isNaN($values[0])) { + @values = sort @values;#text sort + } + else { + @values = sort {$a <=> $b} @values;#numeric sort + } + my $nVal = scalar(@values); + printf "%-40s",$key; + print "Min = $values[0]\tMax = $values[$nVal-1]\t(N = $nVal)\n"; + } +} + +############################################################## +# function isNaN equivalent to the C++, java, javascript isNaN +# From P Vaglio, ~intranet/lib/rbt_func.pl +sub isNaN { + local($_) = @_; + s/\s+//g; # strip white space + # match +or- beginning of line 0 or 1 time + # then any numeric 0 or more + # then a decimal point + # then any numeric 0 or more after decimal point + # then possibly an e or E then + or - then any numreci at least once + if (/^([+-]?)(\d*)(\.(\d*))?([Ee]([+-]?\d+))?$/ && (defined $2 || defined $4)) { + return 0; + } else { + return 1; + } +} + +############################################################## +# output corresponding headings for use with tabScores +sub tabHeadings { + my $summaryKey = shift; + my @fieldNames = @_; + printf("%-10s%-30s","REC",$summaryKey); + foreach $field (@fieldNames) { + printf("%10s",$field); + } + print "\n"; +} + +############################################################## +# tab-delimited output of named data field values +sub tabScores { + my $recListRef = shift; + my $summaryKey = shift; + my @fieldNames = @_; + foreach $rec (@{$recListRef}) { + printf("%03d\t%-30.30s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); + foreach $field (@fieldNames) { + my $val = $rec->{'DATA'}->{$field}; + if (isNaN($val)) { + printf("%-10.12s",$val); + } + else { + printf("%10.3f",$val); + } + } + print "\n"; + } +} + +############################################################## +# output corresponding headings for use with csvScores +sub csvHeadings { + my $summaryKey = shift; + my @fieldNames = @_; + printf("%s,%s","REC",$summaryKey); + foreach $field (@fieldNames) { + printf(",%s",$field); + } + print "\n"; +} + +############################################################## +# comma-delimited output of named data field values +sub csvScores { + my $recListRef = shift; + my $summaryKey = shift; + my @fieldNames = @_; + foreach $rec (@{$recListRef}) { + printf("%d,%s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); + foreach $field (@fieldNames) { + my $val = $rec->{'DATA'}->{$field}; + if (isNaN($val)) { + printf(",%s",$val); + } + else { + printf(",%.3f",$val); + } + } + print "\n"; + } +} + + +############################################################## +# standardised output of Catalyst supplier field +# Input is a reference to an array of SDRecords +# and a ligand identifier field to output in column 1 (def=Name) +sub tabulateSuppliers { + my $recListRef = shift; + my $summaryKey = shift || 'Name'; + foreach $rec (@{$recListRef}) { + my $suppBase = $rec->{'DATAREF'}->{'Supplier'}+1; + my $linesRef = $rec->{'LINES'}; + my $name = $rec->{'DATA'}->{$summaryKey}; + + #Output some useful compound info + my $name = $rec->{'DATA'}->{$summaryKey}; + my $molFormula = $rec->{'DATA'}->{'MolFormula'}; + my $molWt = $rec->{'DATA'}->{'MolWt'}; + my $casNum = $rec->{'DATA'}->{'CAS_num'}; + my $mdlNum = $rec->{'DATA'}->{'MDLNUMBER'}; + print "\n\n====================================================================================================\n"; + printf("%-10.10s%s\n","Name:",$name); + printf("%-10.10s%s\n","Formula:",$molFormula); + printf("%-10.10s%s\n","Mol.wt:",$molWt); + printf("%-10.10s%s\n","CAS #:",$casNum); + printf("%-10.10s%s\n","MDL #:",$mdlNum); + + #Get all the supplier record lines into a list + #Record is terminated by blank line + my @lines; + my $nLines = 0; + for (; $$linesRef[$suppBase+$nLines] ne ""; $nLines++) { + push @lines,$$linesRef[$suppBase+$nLines]; + } + + #Column headings + printf("\n%-20.20s%-40.40s%-40.40s\n", + "Supplier", + "Comment", + "Price" + ); + print "----------------------------------------------------------------------------------------------------\n"; + + #Loop over each supplier + my $iLine = 0; + for (; $iLine < $nLines; $iLine++) { + #collect supplier info lines + my @supplierInfo = (); + for (; $lines[$iLine] ne "." && $iLine < $nLines; $iLine++) { + push @supplierInfo,$lines[$iLine]; + } + #Check for incomplete record + if ($iLine == $nLines) { + print "** INCOMPLETE RECORD **\n"; + last; + } + my $nSupplierInfo = scalar(@supplierInfo); + my $supplier = $supplierInfo[0]; + #loop over each grade + for ($iLine++; ($lines[$iLine] ne "........................") && ($iLine < $nLines); $iLine++) { + #collect grade info lines + my @gradeInfo = (); + for (; index($lines[$iLine],"_") ne 0 && $iLine < $nLines; $iLine++) { + push @gradeInfo,$lines[$iLine]; + } + #Check for incomplete record + if ($iLine == $nLines) { + print "** INCOMPLETE RECORD **\n"; + last; + } + my $grade = $gradeInfo[0]; + #loop over each price info line + for (; index($lines[$iLine],".") ne 0 && $iLine < $nLines; $iLine++) { + my @priceInfo = split(" ",$lines[$iLine]); + my $price = join(" ",@priceInfo); + printf("%-20.20s%-40.39s%-40.40s\n", + $supplier, + $grade, + $price); + } + #Check for incomplete record + if ($iLine == $nLines) { + print "** INCOMPLETE RECORD **\n"; + last; + } + last if $lines[$iLine] eq "........................"; + } + } + } +} + + +####################################################################### +sub printHelpAndExit { + print "\nProduces text summaries of SD records\n"; + print "\nUsage:\tsdreport [-l] [-t[<FieldName,FieldName...>]] [-c<FieldName,FieldName...>] [-id<IDField>] [-nh] [-o] [-s] [-sup] [sdFiles]\n"; + print "\n\t-l (list format) output all data fields for each record as processed\n"; + print "\t-t (tab format) tabulate selected fields for each record as processed\n"; + print "\t-c (csv format) comma delimited output of selected fields for each record as processed\n"; + print "\t-s (summary format) output summary statistics for each unique value of ligand ID\n"; + print "\t-sup (supplier format) tabulate supplier details (from Catalyst)\n"; + print "\t-id<IDField> data field to use as ligand ID\n"; + print "\t-nh don't output column headings in -t and -c formats\n"; + print "\t-o use old (v3.00) score field names as default columns in -t and -c formats, else use v4.00 field names\n"; + print "\t-norm use normalised score field names as default columns in -t and -c formats (normalised = score / #ligand heavy atoms)\n"; + print "\nNote:\tIf -l, -t or -c are combined with -s, the listing/table is output within each ligand summary\n"; + print "\t-sup should not be combined with other options\n"; + print "\tDefault field names for -t and -c are rDock score field names\n"; + print "\tDefault ID field name is Name\n"; + print "\n\tIf sdFiles not given, reads from standard input\n"; + print "\tOutput is to standard output\n\n"; + exit; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdrmsd Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,272 @@ +#! /usr/bin/env python +# +# Calculate SMART RMSD with or without molecular superposition (FIT or NOFIT) +# Script distributed under GNU LGPL 3.0 along rDock software. +# +# This algorithm takes into account molecular automorphism. That is, it identifies +# molecules which are the same but might have atom orders changed and still be able to +# match the pairs and correctly calculate the RMSD. +# +# Author: Daniel Alvarez-Garcia +# Date: 08-11-2013 + +import math +import pybel +import numpy as npy +import optparse + +def superpose3D(ref, target, weights=None,refmask=None,targetmask=None,returnRotMat=False): + """superpose3D performs 3d superposition using a weighted Kabsch algorithm : http://dx.doi.org/10.1107%2FS0567739476001873 & doi: 10.1529/biophysj.105.066654 + definition : superpose3D(ref, target, weights,refmask,targetmask) + @parameter 1 : ref - xyz coordinates of the reference structure (the ligand for instance) + @type 1 : float64 numpy array (nx3) + --- + @parameter 2 : target - theoretical target positions to which we should move (does not need to be physically relevant. + @type 2 : float 64 numpy array (nx3) + --- + @parameter 3: weights - numpy array of atom weights (usuallly between 0 and 1) + @type 3 : float 64 numpy array (n) + @parameter 4: mask - a numpy boolean mask for designating atoms to include + Note ref and target positions must have the same dimensions -> n*3 numpy arrays where n is the number of points (or atoms) + Returns a set of new coordinates, aligned to the target state as well as the rmsd + """ + if weights == None : + weights=1.0 + if refmask == None : + refmask=npy.ones(len(ref),"bool") + if targetmask == None : + targetmask=npy.ones(len(target),"bool") + #first get the centroid of both states + ref_centroid = npy.mean(ref[refmask]*weights,axis=0) + #print ref_centroid + refCenteredCoords=ref-ref_centroid + #print refCenteredCoords + target_centroid=npy.mean(target[targetmask]*weights,axis=0) + targetCenteredCoords=target-target_centroid + #print targetCenteredCoords + #the following steps come from : http://www.pymolwiki.org/index.php/OptAlign#The_Code and http://en.wikipedia.org/wiki/Kabsch_algorithm + # Initial residual, see Kabsch. + E0 = npy.sum( npy.sum(refCenteredCoords[refmask] * refCenteredCoords[refmask]*weights,axis=0),axis=0) + npy.sum( npy.sum(targetCenteredCoords[targetmask] * targetCenteredCoords[targetmask]*weights,axis=0),axis=0) + reftmp=npy.copy(refCenteredCoords[refmask]) + targettmp=npy.copy(targetCenteredCoords[targetmask]) + #print refCenteredCoords[refmask] + #single value decomposition of the dotProduct of both position vectors + try: + dotProd = npy.dot( npy.transpose(reftmp), targettmp* weights) + V, S, Wt = npy.linalg.svd(dotProd ) + except Exception: + try: + dotProd = npy.dot( npy.transpose(reftmp), targettmp) + V, S, Wt = npy.linalg.svd(dotProd ) + except Exception: + print >> sys.stderr,"Couldn't perform the Single Value Decomposition, skipping alignment" + return ref, 0 + # we already have our solution, in the results from SVD. + # we just need to check for reflections and then produce + # the rotation. V and Wt are orthonormal, so their det's + # are +/-1. + reflect = float(str(float(npy.linalg.det(V) * npy.linalg.det(Wt)))) + if reflect == -1.0: + S[-1] = -S[-1] + V[:,-1] = -V[:,-1] + rmsd = E0 - (2.0 * sum(S)) + rmsd = npy.sqrt(abs(rmsd / len(ref[refmask]))) #get the rmsd + #U is simply V*Wt + U = npy.dot(V, Wt) #get the rotation matrix + # rotate and translate the molecule + new_coords = npy.dot((refCenteredCoords), U)+ target_centroid #translate & rotate + #new_coords=(refCenteredCoords + target_centroid) + #print U + if returnRotMat : + return new_coords,rmsd, U + return new_coords,rmsd + + +def squared_distance(coordsA, coordsB): + """Find the squared distance between two 3-tuples""" + sqrdist = sum( (a-b)**2 for a, b in zip(coordsA, coordsB) ) + return sqrdist + +def rmsd(allcoordsA, allcoordsB): + """Find the RMSD between two lists of 3-tuples""" + deviation = sum(squared_distance(atomA, atomB) for + (atomA, atomB) in zip(allcoordsA, allcoordsB)) + return math.sqrt(deviation / float(len(allcoordsA))) + +def mapToCrystal(xtal, pose): + """Some docking programs might alter the order of the atoms in the output (like Autodock Vina does...) + this will mess up the rmsd calculation with OpenBabel""" + query = pybel.ob.CompileMoleculeQuery(xtal.OBMol) + mapper=pybel.ob.OBIsomorphismMapper.GetInstance(query) + mappingpose = pybel.ob.vvpairUIntUInt() + exit=mapper.MapUnique(pose.OBMol,mappingpose) + return mappingpose[0] + +def parseArguments(): + optparse.OptionParser.format_epilog = lambda self, formatter: self.epilog + epilog = """Args: + reference.sdf SDF file with the reference molecule. + input.sdf SDF file with the molecules to be compared to reference.\n""" + parser = optparse.OptionParser("usage: %prog [options] reference.sdf input.sdf", epilog=epilog) + parser.add_option("-f", "--fit",dest="fit", action="store_true", default=False, + help="Superpose molecules before RMSD calculation") + parser.add_option("--threshold","-t",dest="threshold", action="store", nargs=1, + help="Discard poses with RMSD < THRESHOLD with respect previous poses which where not rejected based on same principle. A Population SDField will be added to output SD with the population number.", type=float) + parser.add_option("-o","--out", dest="outfilename", metavar="FILE", default=False, + help="If declared, write an output SDF file with the input molecules with a new sdfield <RMSD>. If molecule was fitted, the fitted molecule coordinates will be saved.") + (options, args) = parser.parse_args() + + #Check we have two arguments + if len(args) < 2: + parser.error("Incorrect number of arguments. Use -h or --help options to print help.") + + return options, args + +def updateCoords(obmol, newcoords): + "Update OBMol coordinates. newcoords is a numpy array" + for i,atom in enumerate(obmol): + atom.OBAtom.SetVector(*newcoords[i]) + +def getAutomorphRMSD(target, molec, fit=False): + """ + Use Automorphism to reorder target coordinates to match ref coordinates atom order + for correct RMSD comparison. Only the lowest RMSD will be returned. + + Returns: + If fit=False: bestRMSD (float) RMSD of the best matching mapping. + If fit=True: (bestRMSD, molecCoordinates) (float, npy.array) RMSD of best match and its molecule fitted coordinates. + """ + mappings = pybel.ob.vvpairUIntUInt() + bitvec = pybel.ob.OBBitVec() + lookup = [] + for i, atom in enumerate(target): + lookup.append(i) + success = pybel.ob.FindAutomorphisms(target.OBMol, mappings) + targetcoords = [atom.coords for atom in target] + mappose = npy.array(mapToCrystal(target, molec)) + mappose = mappose[npy.argsort(mappose[:,0])][:,1] + posecoords = npy.array([atom.coords for atom in molec])[mappose] + resultrmsd = 999999999999 + for mapping in mappings: + automorph_coords = [None] * len(targetcoords) + for x, y in mapping: + automorph_coords[lookup.index(x)] = targetcoords[lookup.index(y)] + mapping_rmsd = rmsd(posecoords, automorph_coords) + if mapping_rmsd < resultrmsd: + resultrmsd = mapping_rmsd + fitted_result = False + if fit: + fitted_pose, fitted_rmsd = superpose3D(npy.array(automorph_coords), npy.array(posecoords)) + if fitted_rmsd < resultrmsd: + resultrmsd = fitted_rmsd + fitted_result = fitted_pose + + if fit: + return (resultrmsd, fitted_pose) + else: + return resultrmsd + +def saveMolecWithRMSD(outsdf, molec, rmsd, population=False): + newData = pybel.ob.OBPairData() + newData.SetAttribute("RMSD") + newData.SetValue('%.3f'%rmsd) + + if population: + popData = pybel.ob.OBPairData() + popData.SetAttribute("Population") + popData.SetValue('%i'%population) + molec.OBMol.CloneData(popData) + + molec.OBMol.CloneData(newData) # Add new data + outsdf.write(molec) + +if __name__ == "__main__": + import sys, os + + (opts, args) = parseArguments() + + xtal = args[0] + poses = args[1] + + if not os.path.exists(xtal) or not os.path.exists(poses): + sys.exit("Input files not found. Please check the path given is correct.") + + fit = opts.fit + outfname = opts.outfilename + threshold = opts.threshold + + # Read crystal pose + crystal = next(pybel.readfile("sdf", xtal)) + crystal.removeh() + crystalnumatoms = len(crystal.atoms) + + #If outfname is defined, prepare an output SDF sink to write molecules + if outfname: + outsdf = pybel.Outputfile('sdf', outfname, overwrite=True) + + # Find the RMSD between the crystal pose and each docked pose + dockedposes = pybel.readfile("sdf", poses) + if fit: print "POSE\tRMSD_FIT" + else: print "POSE\tRMSD_NOFIT" + skipped = [] + moleclist = {} # Save all poses with their dockid + population = {} # Poses to be written + outlist = {} + for docki, dockedpose in enumerate(dockedposes): + dockedpose.removeh() + natoms = len(dockedpose.atoms) + if natoms != crystalnumatoms: + skipped.append(docki+1) + continue + if fit: + resultrmsd, fitted_result = getAutomorphRMSD(crystal, dockedpose, fit=True) + updateCoords(dockedpose, fitted_result) + else: + resultrmsd = getAutomorphRMSD(crystal, dockedpose, fit=False) + + if threshold: + # Calculate RMSD between all previous poses + # Discard if rmsd < FILTER threshold + if moleclist: + match = None + bestmatchrmsd = 999999 + for did,prevmol in moleclist.iteritems(): + tmprmsd = getAutomorphRMSD(prevmol, dockedpose) + if tmprmsd < threshold: + if tmprmsd < bestmatchrmsd: + bestmatchrmsd = tmprmsd + match = did + + if match != None: + # Do not write this one + # sum one up to the matching previous molecule id + print >> sys.stderr, "Pose %i matches pose %i with %.3f RMSD"%(docki+1, match+1, bestmatchrmsd) + population[match] += 1 + else: + # There's no match. Print info for this one and write to outsdf if needed + # Save this one! + if outfname: outlist[docki] = (dockedpose, resultrmsd) + print "%d\t%.2f"%((docki+1),resultrmsd) + moleclist[docki] = dockedpose + population[docki] = 1 + else: + # First molecule in list. Append for sure + moleclist[docki] = dockedpose + population[docki] = 1 + if outfname: outlist[docki] = (dockedpose, resultrmsd) + else: + # Just write best rmsd found and the molecule to outsdf if demanded + if outfname: saveMolecWithRMSD(outsdf, dockedpose, resultrmsd) + print "%d\t%.2f"%((docki+1),resultrmsd) + + if outlist: + # Threshold applied and outlist need to be written + for docki in sorted(outlist.iterkeys()): + molrmsd = outlist[docki] + # Get number of matchs in thresholding operation + pop = population.get(docki) + if not pop: pop = 1 + # Save molecule + saveMolecWithRMSD(outsdf, molrmsd[0], molrmsd[1], pop) + + if skipped: print >> sys.stderr, "SKIPPED input molecules due to number of atom missmatch: %s"%skipped
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdsort Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,98 @@ +#!/usr/bin/perl +# Sorts SD records by given data field +use lib "$ENV{'RBT_ROOT'}/lib"; + +use SDRecord; + +my $SDSORTKEY; # sort key +my $SDSORTASCEND = 1;# 1 = ascending, 0 = descending +my $SDSORTTEXT = 1;# 1 = text sort, 0 = numeric sort + +my $FASTFORMAT = 0; +my $FASTKEY = "_TITLE1"; + +#Print help if no command line arguments +printHelpAndExit() if (scalar(@ARGV) == 0); + +#Parse command line arguments +my @files; +while (scalar(@ARGV)) { + $arg = shift @ARGV; + printHelpAndExit() if ($arg eq '-h'); + if ($arg eq '-r') { + $SDSORTASCEND = 0;#descending sort requested + } + elsif ($arg eq '-n') { + $SDSORTTEXT = 0;;#numeric sort requested + } + elsif (index($arg,'-s')==0) { + $FASTFORMAT = 1; + } + elsif (index($arg,'-id')==0) { + $FASTKEY = substr($arg,3); + } + elsif (index($arg,'-f')==0) { + $SDSORTKEY = substr($arg,2);#sort key + } + else { + push @files,$arg;#must be a filename + } +} +push @ARGV,@files;#put the filenames back in the arg list + +#read records +my $sdRec = new SDRecord; +my @records; +my $nRec=0; + +my $lastid=""; +while ($sdRec->readRec('DATA'=>1,'LINES'=>1)) { + $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field + if ($FASTFORMAT) { + my $id = $sdRec->{'DATA'}->{$FASTKEY}; + if (($lastid ne "") && ($lastid ne $id)) { + foreach $rec (sort sortSD @records) { + $rec->writeRec(); + } + @records = ();#clear the list + } + $lastid = $id; + } + push(@records,$sdRec->copy('DATA'=>1,'LINES'=>1)); +} + +#write sorted records +foreach $rec (sort sortSD @records) { + $rec->writeRec(); +} + +####################################################### +# sort function to sort SD records by given field +# handles text/numeric and ascending/descending sort +sub sortSD { + if ($SDSORTTEXT) { + return $a->{'DATA'}->{$SDSORTKEY} cmp $b->{'DATA'}->{$SDSORTKEY} if ($SDSORTASCEND); + return $b->{'DATA'}->{$SDSORTKEY} cmp $a->{'DATA'}->{$SDSORTKEY}; + } + else { + return $a->{'DATA'}->{$SDSORTKEY} <=> $b->{'DATA'}->{$SDSORTKEY} if ($SDSORTASCEND); + return $b->{'DATA'}->{$SDSORTKEY} <=> $a->{'DATA'}->{$SDSORTKEY}; + } +} + +####################################################################### +sub printHelpAndExit { + print "\nSorts SD records by given data field\n"; + print "\nUsage:\tsdsort [-n] [-r] [-f<DataField>] [sdFiles]\n\n"; + print "\t-n\t\tnumeric sort (default is text sort)\n"; + print "\t-r\t\tdescending sort (default is ascending sort)\n"; + print "\t-f<DataField>\tspecifies sort field\n"; + print "\t-s\t\tfast mode. Sorts the records for each named compound independently (must be consecutive)\n"; + print "\t-id<NameField>\tspecifies compound name field (default = 1st title line)\n\n"; + print "Note:\t_REC (record #) is provided as a pseudo-data field\n"; + print "\n\tIf SD file list not given, reads from standard input\n"; + print "\tOutput is to standard output\n"; + print "\tFast mode can be safely used for partial sorting of huge SD files of raw docking hits\n"; + print "\twithout running into memory problems.\n\n"; + exit; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdsplit Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,72 @@ +#!/usr/bin/perl +# Splits SD records into multiple files of equal no. of records +use lib "$ENV{'RBT_ROOT'}/lib"; + +use FileHandle; +use SDRecord; + +# Record size to split into +my $recSize = 1000; + +#output root +my $outRoot = "tmp"; + +#Print help if no command line arguments +printHelpAndExit() if (scalar(@ARGV) == 0); + +#Parse command line arguments +my @files; +while (scalar(@ARGV)) { + my $arg = shift @ARGV; + printHelpAndExit() if ($arg eq '-h'); + if (index($arg,'-o')==0) { + $outRoot = substr($arg,2); + } + elsif (index($arg,'-')==0) { + $recSize = substr($arg,1); + } + else { + push @files,$arg;#must be a filename + } +} +push @ARGV,@files;#put the filenames back in the arg list + +my $sdRec = new SDRecord; +my $nRec=0; +my $nFile=0; +my $sdfh; + +#read records +while ($sdRec->readRec('LINES'=>1)) { + #check if we need to start a new output file + if ($nRec % $recSize == 0) { + $nFile++; + my $outFile = $outRoot . $nFile . ".sd"; + if (defined $sdfh) { + undef $sdfh; + } + $sdfh = new FileHandle ">$outFile"; + if (!defined $sdfh) { + die "Can't open $outFile"; + } + else { + print STDOUT "Opening $outFile\n"; + } + $sdfh->autoflush(1); + select($sdfh); + } + $nRec++; + $sdRec->writeRec(); +} +undef $sdfh; +select(STDOUT);#reselect STDOUT as default + +####################################################################### +sub printHelpAndExit { + print "\Splits SD records into multiple files of equal size\n"; + print "\nUsage:\tsdsplit [-<RecSize>] [-o<OutputRoot>] [sdFiles]\n\n"; + print "\t-<RecSize>\trecord size to split into (default = 1000 records)\n"; + print "\t-o<OutputRoot>\tRoot name for output files (default = tmp)\n"; + print "\n\tIf SD file list not given, reads from standard input\n"; + exit; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/sdtether Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,263 @@ +#! /usr/bin/env python +# +# Substitute for rbtether of rDock. Will align input molecules to a reference fragment defined by a smarts string, +# it will add a TETHERED ATOM property field to the output SDF that is correctly understood by rDock +# rDock will restrain the matching atom positions to the reference molecule coordinates. +# +# Initially implemented with a conformational search algorithm to better match target coordinates. +# But had problems with OBabel FF generating non-sense conformers. So in this version the conformer search is commented out. +# Now if the input molecule do not have a good conformation, might not align well with the target. This effect will be +# dimished or even vanish if the SMARTS string is defined for a rigid region (like a ring). +# I'm still trying to incorporate somehow this conformational search. +# +# Script distributed under GNU LGPL 3.0 along rDock software. +# +# Author: Daniel Alvarez-Garcia +# Date: 08-11-2013 + +import math +import pybel +import numpy as npy + +def superpose3D(ref, target, weights=None,refmask=None,targetmask=None,returnRotMat=False): + """superpose3D performs 3d superposition using a weighted Kabsch algorithm : http://dx.doi.org/10.1107%2FS0567739476001873 & doi: 10.1529/biophysj.105.066654 + definition : superpose3D(ref, target, weights,refmask,targetmask) + @parameter 1 : ref - xyz coordinates of the reference structure (the ligand for instance) + @type 1 : float64 numpy array (nx3) + --- + @parameter 2 : target - theoretical target positions to which we should move (does not need to be physically relevant. + @type 2 : float 64 numpy array (nx3) + --- + @parameter 3: weights - numpy array of atom weights (usuallly between 0 and 1) + @type 3 : float 64 numpy array (n) + @parameter 4: mask - a numpy boolean mask for designating atoms to include + Note ref and target positions must have the same dimensions -> n*3 numpy arrays where n is the number of points (or atoms) + Returns a set of new coordinates, aligned to the target state as well as the rmsd + """ + if weights == None : + weights=1.0 + if refmask == None : + refmask=npy.ones(len(ref),"bool") + if targetmask == None : + targetmask=npy.ones(len(target),"bool") + #first get the centroid of both states + ref_centroid = npy.mean(ref[refmask]*weights,axis=0) + #print ref_centroid + refCenteredCoords=ref-ref_centroid + #print refCenteredCoords + target_centroid=npy.mean(target[targetmask]*weights,axis=0) + targetCenteredCoords=target-target_centroid + #print targetCenteredCoords + #the following steps come from : http://www.pymolwiki.org/index.php/OptAlign#The_Code and http://en.wikipedia.org/wiki/Kabsch_algorithm + # Initial residual, see Kabsch. + E0 = npy.sum( npy.sum(refCenteredCoords[refmask] * refCenteredCoords[refmask]*weights,axis=0),axis=0) + npy.sum( npy.sum(targetCenteredCoords[targetmask] * targetCenteredCoords[targetmask]*weights,axis=0),axis=0) + reftmp=npy.copy(refCenteredCoords[refmask]) + targettmp=npy.copy(targetCenteredCoords[targetmask]) + #print refCenteredCoords[refmask] + #single value decomposition of the dotProduct of both position vectors + try: + dotProd = npy.dot( npy.transpose(reftmp), targettmp* weights) + V, S, Wt = npy.linalg.svd(dotProd ) + except Exception: + try: + dotProd = npy.dot( npy.transpose(reftmp), targettmp) + V, S, Wt = npy.linalg.svd(dotProd ) + except Exception: + print >> sys.stderr,"Couldn't perform the Single Value Decomposition, skipping alignment" + return ref, 0 + # we already have our solution, in the results from SVD. + # we just need to check for reflections and then produce + # the rotation. V and Wt are orthonormal, so their det's + # are +/-1. + reflect = float(str(float(npy.linalg.det(V) * npy.linalg.det(Wt)))) + if reflect == -1.0: + S[-1] = -S[-1] + V[:,-1] = -V[:,-1] + rmsd = E0 - (2.0 * sum(S)) + rmsd = npy.sqrt(abs(rmsd / len(ref[refmask]))) #get the rmsd + #U is simply V*Wt + U = npy.dot(V, Wt) #get the rotation matrix + # rotate and translate the molecule + new_coords = npy.dot((refCenteredCoords), U)+ target_centroid #translate & rotate + #new_coords=(refCenteredCoords + target_centroid) + #print U + if returnRotMat : + return U, ref_centroid, target_centroid, rmsd + return new_coords,rmsd + + +def squared_distance(coordsA, coordsB): + """Find the squared distance between two 3-tuples""" + sqrdist = sum( (a-b)**2 for a, b in zip(coordsA, coordsB) ) + return sqrdist + +def rmsd(allcoordsA, allcoordsB): + """Find the RMSD between two lists of 3-tuples""" + deviation = sum(squared_distance(atomA, atomB) for + (atomA, atomB) in zip(allcoordsA, allcoordsB)) + return math.sqrt(deviation / float(len(allcoordsA))) + +def mapToCrystal(xtal, pose): + """Some docking programs might alter the order of the atoms in the output (like Autodock Vina does...) + this will mess up the rmsd calculation with OpenBabel""" + query = pybel.ob.CompileMoleculeQuery(xtal.OBMol) + mapper=pybel.ob.OBIsomorphismMapper.GetInstance(query) + mappingpose = pybel.ob.vvpairUIntUInt() + exit=mapper.MapUnique(pose.OBMol,mappingpose) + return mappingpose[0] + +def takeCoords(obmol): + """Take coordinates of an OBMol as a npy array""" + return npy.array([atom.coords for atom in obmol]) + +def updateCoords(obmol, newcoords): + "Update OBMol coordinates. newcoords is a numpy array" + for i,atom in enumerate(obmol): + atom.OBAtom.SetVector(*newcoords[i]) + +def prepareAtomString(idlist): + s = "" + n = len(idlist) + for i, id in enumerate(idlist): + s += "%i"%id + if (i+1) == n: s+="\n" + elif (i+1)%35 == 0: s+=",\n" + else: s+="," + return s + + +if __name__ == "__main__": + import sys + + if len(sys.argv) != 5: + sys.exit("USAGE: %s reference.sdf input.sdf output.sdf 'SMARTS'"%sys.argv[0]) + + refsdf = sys.argv[1] + molsdf = sys.argv[2] + outsdf = sys.argv[3] + smarts = pybel.Smarts(sys.argv[4]) + + # Read reference pose and get atom list matching smarts query + # if more than 1 match, take the first one + ref = next(pybel.readfile("sdf", refsdf)) + refMatchIds = smarts.findall(ref) + numRefMatchs = len(refMatchIds) + + if not numRefMatchs: + sys.exit("No match found in the reference structure and the SMARTS string given. Please check it.") + + if numRefMatchs > 1: + print "More than one match in the reference molecule for the SMARTS string given. Will tether each input molecule all possible ways." + + refIndxPerMatch = [npy.array(rmi) - 1 for rmi in refMatchIds] + + # Take coordinates for the reference matched atoms + refCoords = takeCoords(ref) + refMatchCoords = [npy.take(refCoords, refIndx, axis=0) for refIndx in refIndxPerMatch] + + # Do the same for molecule in molsdf + out=pybel.Outputfile('sdf', outsdf, overwrite=True) + molSupp = pybel.readfile("sdf", molsdf) + ff = pybel.ob.OBForceField_FindForceField('MMFF94') + for i,mol in enumerate(molSupp): + print "## Molecule %i"%(i+1), + mol.OBMol.DeleteNonPolarHydrogens() + molMatchAllIds = smarts.findall(mol) + numMatchs = len(molMatchAllIds) + + if numMatchs == 0: + print "No_Match", + continue + elif numMatchs ==1: + print "Match", + elif numMatchs > 1: + print "Multiple_Match SMART Matches for this molecule (%d)"%numMatchs, + + # If more than one match, write an output of the same molecule for each match + # Start a default bestcoord and rmsd for later looping for each pose + bestCoordPerMatch = [[0 for i in range(numMatchs)] for i in range(numRefMatchs)] + bestRMSPerMatch = [[999 for i in range(numMatchs)] for i in range(numRefMatchs)] + + # Will do a randomrotorsearch to find conformer with the lower rmsd when superposing + # At least 20 when possible + #ff.Setup(mol.OBMol) + #numats = mol.OBMol.NumAtoms() + #numrot = mol.OBMol.NumRotors() + #print "Atoms: %i, Rotors: %i"%(numats, numrot) + #geomopt = 300 + #genconf = 100 + # increase iterations if bigger molecule or bigger number of rotatable bonds + # for allowing better sampling + #if numats > 40 and numrot > 5: + # geomopt = 300 + # genconf = 150 + #if numats > 55 and numrot > 7: + # genconf = 100 + # geomopt = 500 + #print "\tDoing conformational search with WeightedRotorSearch (%i, %i)..."%(genconf, geomopt), + #ff.SteepestDescent(500, 1.0e-4) + #ff.WeightedRotorSearch(genconf,geomopt) + #ff.ConjugateGradients(500, 1.0e-6) + #ff.GetConformers(mol.OBMol) + #numconf = mol.OBMol.NumConformers() + numconf = 1 + #print "%i conformers generated"%numconf + if numconf > 1: + # Doing conf search + #for i in range(numconf): + # mol.OBMol.SetConformer(i) + # confCoords = takeCoords(mol) + # print 'coord:',confCoords[0,:] + # + # for imatch, molMatchIds in enumerate(molMatchAllIds): + # molMatchIndx = npy.array(molMatchIds) - 1 + # confMatchCoords = npy.take(confCoords, molMatchIndx, axis=0) + # + # # Align: Get rotation matrix between the two sets of coords + # # Apply rotation to the whole target molecule + # rotMat, targetCentroid, refCentroid, rmsd = superpose3D(confMatchCoords, refMatchCoords, returnRotMat=True) + # if rmsd < bestRMSPerMatch[imatch]: + # newcoords = npy.dot((confCoords - targetCentroid), rotMat) + refCentroid + # bestRMSPerMatch[imatch] = rmsd + # bestCoordPerMatch[imatch] = newcoords + # #if bestrms < 0.01: break + pass + else: + molCoords = takeCoords(mol) + for imatch, molMatchIds in enumerate(molMatchAllIds): + # loop in each matching way for the input molecule + molMatchIndx = npy.array(molMatchIds) - 1 + molMatchCoords = npy.take(molCoords, molMatchIndx, axis=0) + + # Loop over the reference matches + # Align: Get rotation matrix between the two sets of coords + # Apply rotation to the whole target molecule + for ir, refMatchCoord in enumerate(refMatchCoords): + rotMat, targetCentroid, refCentroid, rmsd = superpose3D(molMatchCoords, refMatchCoord, returnRotMat=True) + if rmsd < bestRMSPerMatch[ir][imatch]: + newcoords = npy.dot((molCoords - targetCentroid), rotMat) + refCentroid + bestRMSPerMatch[ir][imatch] = rmsd + bestCoordPerMatch[ir][imatch] = newcoords + + # Finally update molecule coordinates with the best matching coordinates found + # change molecule coordinates, set TETHERED ATOMS property and save + for imatch in range(numMatchs): + for irefmatch in range(numRefMatchs): + bestCoord = bestCoordPerMatch[irefmatch][imatch] + bestRMS = bestRMSPerMatch[irefmatch][imatch] + print "\tBest RMSD reached (match %d, refmatch %d): %s"%(imatch, irefmatch, bestRMS) + molMatchID = molMatchAllIds[imatch] + updateCoords(mol, bestCoord) + newData = pybel.ob.OBPairData() + newData.SetAttribute("TETHERED ATOMS") + newData.SetValue(prepareAtomString(molMatchID)) + + mol.OBMol.DeleteData("TETHERED ATOMS") # Remove Previous DATA + mol.OBMol.CloneData(newData) # Add new data + out.write(mol) + + out.close() + + print "DONE" + sys.stdout.close() + sys.stderr.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rdock/bin/to_unix Mon Aug 29 08:38:19 2016 -0400 @@ -0,0 +1,32 @@ +#! /bin/sh -e +#Tag 1538 +# +# convert a DOS ASCII file to a UNIX ASCII file by removing trailing ^M at the +# end of each line and ^Z at the end of the file + +#DM 23 Oct 2000 - tmp file in current directory, not /tmp +#TMPFILE=/tmp/to_unix$$ +TMPFILE=to_unix$$ + +if [ $# -gt 2 ] +then + echo "usage: to_unix [<dos_file> [<unix_file>]]" + exit 1 +fi + +# First strip out all carriage-return and ctrl-Z's +if [ $# -gt 0 ] +then + tr -d '\015\032' < "$1" > $TMPFILE +else + tr -d '\015\032' > $TMPFILE +fi + + +if [ $# -eq 2 ] +then + mv -f $TMPFILE "$2" +else + cat $TMPFILE + rm $TMPFILE +fi