Mercurial > repos > hathkul > rapidcluster

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapidcluster	Mon Dec 26 11:05:18 2016 -0500
@@ -0,0 +1,282 @@
+#!/usr/bin/env perl
+
+## Distributed under GNU General Public License v3
+
+
+use Getopt::Long;   ## Core Perl module for command line options/arguments
+
+my $start = time;           ## Starts program execution timer
+my $infile;                 ## Variable for input file
+my $outfile;                ## Variable for output file
+my $defined_edit_distance;  ## Variable (integer) for user-defined edit distance
+my $help;                   ## true/false variable for help screen
+my $quiet;                  ## true/false to suppress standard output
+my $filter;				## Variable to define a read filter for entries
+my $max_clusters; ## Maximum number of clusters
+my $version;                ## true/false variable for version screen
+
+                                           ## Take command line arguments for...
+GetOptions (    "infile=s" => \$infile,    ## ... input file
+                "outfile=s" => \$outfile,  ## ... output file
+                "distance=i" => \$defined_edit_distance,  ## ... edit distance
+                "filter=f" => \$filter,    ## ... read filter
+                "cluster_max=i" => \$max_clusters, ## max number of clusters
+                "help" => \$help,          ## ... help screen
+                "version" => \$version,    ## ... version screen
+                "quiet" => \$quiet);       ## ... supress standard output
+
+if ($help){         ## Prints help screen if $help returns as true
+print <<"HELP";
+
+--------------------------------------------------------------------------------
+                              FASTAptamer-RapidCluster
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_rapidcluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-c #] [-q] [-v]
+
+    [-h]            = Help screen.
+    [-i INFILE]     = Input file from FASTAptamer-Count. REQUIRED.
+    [-o OUTFILE]    = Output file, FASTA format. REQUIRED.
+    [-d]            = Edit distance for clustering sequences. REQUIRED.
+    [-f]            = Read filter. Only sequences with total reads greater than
+                      the value supplied will be clustered.
+    [-c]            = Maximum number of clusters to find.
+    [-q]            = Quiet mode.  Suppresses standard output of file I/O, numb-
+                      er of clusters, cluster size and execution time.
+    [-v]            = Display version.
+
+FASTAptamer-RapidCluster is a slightly modified and much faster version of original
+FASTAptamer-Cluster script. It uses the Levenshtein algorithm to cluster together
+sequences based on a user-defined edit distance.  The most abundant and unclustered
+sequence is used as the "seed sequence" for which edit distance is calculated from.
+Output is FASTA with the following information on the identifier line for each
+sequence entry:
+
+    >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed
+    SEQUENCE
+
+To prevent clustering of sequences not highly sampled (and improve execution ti-
+me), invoke the read filter and enter a number.  Only sequences with total reads
+greater than the number entered will be clustered.
+
+Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file.
+
+PLEASE NOTE: This is a computationally intense program that can take multiple h-
+ours to finish depending on the size and complexity of your population. Utilize
+the read filter [-f] to improve execution time.
+
+HELP
+exit;
+}
+
+if (defined $version){                     ## Print version screen if -v is true
+    print <<"VERSION";
+
+FASTAptamer v1.0.2
+
+VERSION
+exit;
+}
+
+##########################################
+## Open input file or exit with warning  #
+##########################################
+
+open (INPUT, '<', $infile) or die
+"\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n";
+
+##########################################
+## Open output file or exit with warning #
+##########################################
+
+open (OUTPUT, '>', $outfile) or die
+"\nCould not open output file or no input file was specified.\See help documentation [-h], README, or User's Guide for program usage.\n";
+
+###############################################
+## Exit with warning if no distance specified #
+###############################################
+
+unless ($defined_edit_distance) {
+die "\nNo edit distance specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }
+
+my @sequence_and_info;   ## Array to contain each FASTA entry individually
+my $entries;             ## Variable to keep track of the number of entries
+
+$/= ">";   ## Change default input record separator to read FASTA formatted file
+
+while (<INPUT>){                             ## Reads through entire input file
+    if ($_ =~ /(\d+-\d+-\d+.*\d*\n\w+)/){    ## Regex for FASTAptamer-Count format
+        my @read_count_test = split /-/, $1; ## Splits entry by dashes, loads array
+        if ($read_count_test[1] > $filter){     ## Tests for read count filter
+            push @sequence_and_info, $1;         ## Add each matched entry to array
+            $entries++;                          ## Increase entry count by 1
+        }
+    }
+}
+
+close INPUT;  ## Closes input file
+
+unless ($quiet){  ## Unless -q option is invoked, print this warning and summary
+	print "\n**************************************************";
+    print "\nClustering is a computationally intense process.\n";
+    print "Please be patient as clustering can take several\n";
+    print "hours for a single file.  For faster clustering\n";
+    print "use the read filter [-f] to avoid clustering of \n";
+    print "sequences not highly sampled.\n";
+    print "It is also recommended to set max number of \n";
+    print "clusters to find, a couple of hundered is usually enough \n";
+    print "**************************************************\n";
+    print "Total number of sequences to cluster: $entries.\n";
+    print "Total number of clusters to find: $max_clusters.\n";
+    print "Cluster\tUnique Sequences\tReads\tRPM\tLevenshtein\tTime\n";
+
+
+}
+
+my $current_cluster = 1;  ## This variable defines what cluster we're working on
+
+iterate(@sequence_and_info);  ## Sends array with sequence entries to subroutine
+
+###############################################
+##   SUBROUTINE THAT BEGINS TO PROCESS FILE   #
+## One iteration generates a single cluster   #
+## and then repeats w/  unclustered sequences #
+## for next cluster                           #
+###############################################
+my $levecounter = 0;
+
+sub iterate {
+    my ( $top_entry, @entries ) = @_; ## Takes "seed sequence" and remaining entries
+    my @keepers; ## Array to store unclustered entries
+    my @current_seq_info = split /\n/, $top_entry;
+    ## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence
+    my $cluster_rank = 1; ## Defines that seed sequence is ranked #1
+
+    print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\n$current_seq_info[1]\n";
+    ## Prints current seed sequence in FASTAptamer-Cluster format
+
+	my @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line
+	my $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size
+	my $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM
+
+    for (@entries) { ## for each entry left in array send to calculate distance
+
+    	my @comparison_seq_info = split /\n/, $_; ## Split entry into metrics and sequence
+    	my @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line
+    	my $comparison_seq_reads = $comparison_seq_metrics[1];  ## Take reads to tally cluster size
+    	my $comparison_seq_rpm = $comparison_seq_metrics[2];  ## Take RPM to tally cluster size
+        my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] );
+        ## sends comparison sequence to compare against current seed sequence, returns distances
+#        print "calculation number $levecounter. Distance between $current_seq_info[1] and $comparison_seq_info[1] is $distance \n";
+        $levecounter++;
+        if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined
+            push @keepers, $_;  ## Add to array for comparison in next iteration
+        }
+        elsif ( $distance <= $defined_edit_distance ){ ## If distance is less than or equal to defined
+        	$cluster_rank++;  ## Increment the cluster rank
+        	$current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally
+        	$current_cluster_rpm += $comparison_seq_rpm;  ## Add RPM to cluster RPM tally
+        	print OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\n";
+        	print OUTPUT "$comparison_seq_info[1]\n"; ## Print entry in output file
+        }
+    }
+
+    unless ($quiet) { print "$current_cluster\t$cluster_rank\t$current_cluster_reads\t$current_cluster_rpm\t$defined_edit_distance\t"; }
+    ## Display cluster number, number of unique sequences, reads and RPM
+    $current_cluster++; ## Increment cluster number prior to next cluster
+
+    # Quit clustering if the maximum clusters have been found
+    return if $current_cluster > $max_clusters;
+
+    if (@keepers) { ## Subroutine within subroutine for sequences that are unclustered
+        iterate(@keepers);
+    }
+}
+
+my $duration = time - $start; ## Calculates how much time has elapsed since start
+
+unless ($quiet) { ## Displays summary report unless -q is invoked
+#    print "\nInput file: \"$infile\".\n";
+#    print "Output file: \"$outfile\".\n";
+    print "$duration\n";
+}
+
+####################################################################################
+## The subroutines below checks if the Levenshtein edit distance for the           #
+## current "seed" sequence and the comparison sequence is above the set parameter  #
+####################################################################################
+
+
+sub levenshtein {
+	my ($s1, $s2) = @_;
+	my ($len1, $len2) = (length $s1, length $s2); #$len1 and 2 now hold length of input strings
+	return $len2 if ($len1 == 0);
+	return $len1 if ($len2 == 0);
+	my $minlen = ($len1, $len2)[$len1 > $len2];
+	my %mat;
+
+	for (my $i = 0; $i <= $len1; ++$i){
+		for (my $j = 0; $j <= $len2; ++$j){
+			$mat{0}{$j} = $j;
+		}
+		$mat{$i}{0} = $i;
+	}
+
+	my @ar1 = split(//, $s1);
+	my @ar2 = split(//, $s2);
+$mat{1}{1} = ($ar1[0] eq $ar2[0]) ? 0 : 1;
+
+ for (my $l = 2; $l <= $minlen; $l++){
+
+ 	my $minvaluev; # declare $minvaluev inside l loop
+ 	my $minvalueh; # declare $minvalueh inside l loop
+	for (my $i = 1; $i < $l; ++$i){
+my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1;
+$mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost);
+for (my $k = 1; $k < $l; ++$k){
+}
+ $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner
+ }
+		for (my $j = 1; $j <= $l; ++$j){
+			my $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1;
+			$mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost);
+			$minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner
+		}
+ $minvalue = ($minvaluev, $minvalueh)[$minvaluev > $minvalueh]; #collects minimum value for corner
+ if ($minvalue > $defined_edit_distance){return $defined_edit_distance+1;}
+	}
+
+if ($len1 != $len2){
+if ($len1 == $minlen){
+	for (my $l = $minlen+1; $l <= $len2; $l++){
+      my $minvaluev; # declare $minvaluev inside l loop
+      for (my $i = 1; $i <= $minlen; ++$i){
+      my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1;
+      $mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost);
+      $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner
+ }
+ if ($minvaluev > $defined_edit_distance){return $defined_edit_distance+1;}
+}
+} else {
+	 	for (my $l = $minlen+1; $l <= $len1; $l++){
+	 		my $minvalueh; # declare $minvalueh inside l loop
+	 		for (my $j = 1; $j <= $minlen; ++$j){
+			my $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1;
+			$mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost);
+			$minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner
+	 	}
+ if ($minvalueh > $defined_edit_distance){return $defined_edit_distance+1;}
+	 }
+	}
+}
+				    return $mat{$len1}{$len2};
+}
+
+
+sub min {
+    my ($min, @vars) = @_;
+    for (@vars) {
+        $min = $_ if $_ < $min;
+    }
+    return $min;
+}
\ No newline at end of file