diff tools/rdock/bin/sdsort @ 0:bc03dbb6eb37 draft

planemo upload commit 781926e52355f7805db8d9a4ccafeff397b19aa4-dirty
author marpiech
date Mon, 29 Aug 2016 03:38:13 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rdock/bin/sdsort	Mon Aug 29 03:38:13 2016 -0400
@@ -0,0 +1,98 @@
+#!/usr/bin/perl
+# Sorts SD records by given data field
+use lib "$ENV{'RBT_ROOT'}/lib";
+
+use SDRecord;
+
+my $SDSORTKEY; # sort key
+my $SDSORTASCEND = 1;# 1 = ascending, 0 = descending
+my $SDSORTTEXT = 1;# 1 = text sort, 0 = numeric sort
+
+my $FASTFORMAT = 0;
+my $FASTKEY = "_TITLE1";
+
+#Print help if no command line arguments
+printHelpAndExit() if (scalar(@ARGV) == 0);
+
+#Parse command line arguments
+my @files;
+while (scalar(@ARGV)) {
+  $arg = shift @ARGV;
+  printHelpAndExit() if ($arg eq '-h');
+  if ($arg eq '-r') {
+    $SDSORTASCEND = 0;#descending sort requested
+  }
+  elsif ($arg eq '-n') {
+    $SDSORTTEXT = 0;;#numeric sort requested
+  }
+  elsif (index($arg,'-s')==0) {
+    $FASTFORMAT = 1;
+  }
+  elsif (index($arg,'-id')==0) {
+    $FASTKEY = substr($arg,3);
+  }
+  elsif (index($arg,'-f')==0) {
+    $SDSORTKEY = substr($arg,2);#sort key
+  }
+  else {
+    push @files,$arg;#must be a filename
+  }
+}
+push @ARGV,@files;#put the filenames back in the arg list
+
+#read records
+my $sdRec = new SDRecord;
+my @records;
+my $nRec=0;
+
+my $lastid="";
+while ($sdRec->readRec('DATA'=>1,'LINES'=>1)) {
+  $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field
+  if ($FASTFORMAT) {
+    my $id = $sdRec->{'DATA'}->{$FASTKEY};
+    if (($lastid ne "") && ($lastid ne $id)) {
+      foreach $rec (sort sortSD @records) {
+	$rec->writeRec();
+      }
+      @records = ();#clear the list
+    }
+    $lastid = $id;
+  }
+  push(@records,$sdRec->copy('DATA'=>1,'LINES'=>1));
+}
+
+#write sorted records
+foreach $rec (sort sortSD @records) {
+  $rec->writeRec();
+}
+
+#######################################################
+# sort function to sort SD records by given field
+# handles text/numeric and ascending/descending sort
+sub sortSD {
+  if ($SDSORTTEXT) {
+    return $a->{'DATA'}->{$SDSORTKEY} cmp $b->{'DATA'}->{$SDSORTKEY} if ($SDSORTASCEND);
+    return $b->{'DATA'}->{$SDSORTKEY} cmp $a->{'DATA'}->{$SDSORTKEY};
+  }
+  else {
+    return $a->{'DATA'}->{$SDSORTKEY} <=> $b->{'DATA'}->{$SDSORTKEY} if ($SDSORTASCEND);
+    return $b->{'DATA'}->{$SDSORTKEY} <=> $a->{'DATA'}->{$SDSORTKEY};
+  }
+}
+
+#######################################################################
+sub printHelpAndExit {
+  print "\nSorts SD records by given data field\n";
+  print "\nUsage:\tsdsort [-n] [-r] [-f<DataField>] [sdFiles]\n\n";
+  print "\t-n\t\tnumeric sort (default is text sort)\n";
+  print "\t-r\t\tdescending sort (default is ascending sort)\n";
+  print "\t-f<DataField>\tspecifies sort field\n";
+  print "\t-s\t\tfast mode. Sorts the records for each named compound independently (must be consecutive)\n";
+  print "\t-id<NameField>\tspecifies compound name field (default = 1st title line)\n\n";
+  print "Note:\t_REC (record #) is provided as a pseudo-data field\n";
+  print "\n\tIf SD file list not given, reads from standard input\n";
+  print "\tOutput is to standard output\n";
+  print "\tFast mode can be safely used for partial sorting of huge SD files of raw docking hits\n";
+  print "\twithout running into memory problems.\n\n"; 
+  exit;
+}