# HG changeset patch # User hathkul # Date 1482768318 18000 # Node ID 0d1cf9024f0e3ebcfd22f972e8226917b0d927bb # Parent 12f2dd9ac1fdf9127a83211935072b26a339b0c8 Uploaded diff -r 12f2dd9ac1fd -r 0d1cf9024f0e rapidcluster --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapidcluster Mon Dec 26 11:05:18 2016 -0500 @@ -0,0 +1,282 @@ +#!/usr/bin/env perl + +## Distributed under GNU General Public License v3 + + +use Getopt::Long; ## Core Perl module for command line options/arguments + +my $start = time; ## Starts program execution timer +my $infile; ## Variable for input file +my $outfile; ## Variable for output file +my $defined_edit_distance; ## Variable (integer) for user-defined edit distance +my $help; ## true/false variable for help screen +my $quiet; ## true/false to suppress standard output +my $filter; ## Variable to define a read filter for entries +my $max_clusters; ## Maximum number of clusters +my $version; ## true/false variable for version screen + + ## Take command line arguments for... +GetOptions ( "infile=s" => \$infile, ## ... input file + "outfile=s" => \$outfile, ## ... output file + "distance=i" => \$defined_edit_distance, ## ... edit distance + "filter=f" => \$filter, ## ... read filter + "cluster_max=i" => \$max_clusters, ## max number of clusters + "help" => \$help, ## ... help screen + "version" => \$version, ## ... version screen + "quiet" => \$quiet); ## ... supress standard output + +if ($help){ ## Prints help screen if $help returns as true +print <<"HELP"; + +-------------------------------------------------------------------------------- + FASTAptamer-RapidCluster +-------------------------------------------------------------------------------- + +Usage: fastaptamer_rapidcluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-c #] [-q] [-v] + + [-h] = Help screen. + [-i INFILE] = Input file from FASTAptamer-Count. REQUIRED. + [-o OUTFILE] = Output file, FASTA format. REQUIRED. + [-d] = Edit distance for clustering sequences. REQUIRED. + [-f] = Read filter. Only sequences with total reads greater than + the value supplied will be clustered. + [-c] = Maximum number of clusters to find. + [-q] = Quiet mode. Suppresses standard output of file I/O, numb- + er of clusters, cluster size and execution time. + [-v] = Display version. + +FASTAptamer-RapidCluster is a slightly modified and much faster version of original +FASTAptamer-Cluster script. It uses the Levenshtein algorithm to cluster together +sequences based on a user-defined edit distance. The most abundant and unclustered +sequence is used as the "seed sequence" for which edit distance is calculated from. +Output is FASTA with the following information on the identifier line for each +sequence entry: + + >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed + SEQUENCE + +To prevent clustering of sequences not highly sampled (and improve execution ti- +me), invoke the read filter and enter a number. Only sequences with total reads +greater than the number entered will be clustered. + +Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file. + +PLEASE NOTE: This is a computationally intense program that can take multiple h- +ours to finish depending on the size and complexity of your population. Utilize +the read filter [-f] to improve execution time. + +HELP +exit; +} + +if (defined $version){ ## Print version screen if -v is true + print <<"VERSION"; + +FASTAptamer v1.0.2 + +VERSION +exit; +} + +########################################## +## Open input file or exit with warning # +########################################## + +open (INPUT, '<', $infile) or die +"\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; + +########################################## +## Open output file or exit with warning # +########################################## + +open (OUTPUT, '>', $outfile) or die +"\nCould not open output file or no input file was specified.\See help documentation [-h], README, or User's Guide for program usage.\n"; + +############################################### +## Exit with warning if no distance specified # +############################################### + +unless ($defined_edit_distance) { +die "\nNo edit distance specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; } + +my @sequence_and_info; ## Array to contain each FASTA entry individually +my $entries; ## Variable to keep track of the number of entries + +$/= ">"; ## Change default input record separator to read FASTA formatted file + +while (){ ## Reads through entire input file + if ($_ =~ /(\d+-\d+-\d+.*\d*\n\w+)/){ ## Regex for FASTAptamer-Count format + my @read_count_test = split /-/, $1; ## Splits entry by dashes, loads array + if ($read_count_test[1] > $filter){ ## Tests for read count filter + push @sequence_and_info, $1; ## Add each matched entry to array + $entries++; ## Increase entry count by 1 + } + } +} + +close INPUT; ## Closes input file + +unless ($quiet){ ## Unless -q option is invoked, print this warning and summary + print "\n**************************************************"; + print "\nClustering is a computationally intense process.\n"; + print "Please be patient as clustering can take several\n"; + print "hours for a single file. For faster clustering\n"; + print "use the read filter [-f] to avoid clustering of \n"; + print "sequences not highly sampled.\n"; + print "It is also recommended to set max number of \n"; + print "clusters to find, a couple of hundered is usually enough \n"; + print "**************************************************\n"; + print "Total number of sequences to cluster: $entries.\n"; + print "Total number of clusters to find: $max_clusters.\n"; + print "Cluster\tUnique Sequences\tReads\tRPM\tLevenshtein\tTime\n"; + + +} + +my $current_cluster = 1; ## This variable defines what cluster we're working on + +iterate(@sequence_and_info); ## Sends array with sequence entries to subroutine + +############################################### +## SUBROUTINE THAT BEGINS TO PROCESS FILE # +## One iteration generates a single cluster # +## and then repeats w/ unclustered sequences # +## for next cluster # +############################################### +my $levecounter = 0; + +sub iterate { + my ( $top_entry, @entries ) = @_; ## Takes "seed sequence" and remaining entries + my @keepers; ## Array to store unclustered entries + my @current_seq_info = split /\n/, $top_entry; + ## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence + my $cluster_rank = 1; ## Defines that seed sequence is ranked #1 + + print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\n$current_seq_info[1]\n"; + ## Prints current seed sequence in FASTAptamer-Cluster format + + my @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line + my $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size + my $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM + + for (@entries) { ## for each entry left in array send to calculate distance + + my @comparison_seq_info = split /\n/, $_; ## Split entry into metrics and sequence + my @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line + my $comparison_seq_reads = $comparison_seq_metrics[1]; ## Take reads to tally cluster size + my $comparison_seq_rpm = $comparison_seq_metrics[2]; ## Take RPM to tally cluster size + my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] ); + ## sends comparison sequence to compare against current seed sequence, returns distances +# print "calculation number $levecounter. Distance between $current_seq_info[1] and $comparison_seq_info[1] is $distance \n"; + $levecounter++; + if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined + push @keepers, $_; ## Add to array for comparison in next iteration + } + elsif ( $distance <= $defined_edit_distance ){ ## If distance is less than or equal to defined + $cluster_rank++; ## Increment the cluster rank + $current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally + $current_cluster_rpm += $comparison_seq_rpm; ## Add RPM to cluster RPM tally + print OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\n"; + print OUTPUT "$comparison_seq_info[1]\n"; ## Print entry in output file + } + } + + unless ($quiet) { print "$current_cluster\t$cluster_rank\t$current_cluster_reads\t$current_cluster_rpm\t$defined_edit_distance\t"; } + ## Display cluster number, number of unique sequences, reads and RPM + $current_cluster++; ## Increment cluster number prior to next cluster + + # Quit clustering if the maximum clusters have been found + return if $current_cluster > $max_clusters; + + if (@keepers) { ## Subroutine within subroutine for sequences that are unclustered + iterate(@keepers); + } +} + +my $duration = time - $start; ## Calculates how much time has elapsed since start + +unless ($quiet) { ## Displays summary report unless -q is invoked +# print "\nInput file: \"$infile\".\n"; +# print "Output file: \"$outfile\".\n"; + print "$duration\n"; +} + +#################################################################################### +## The subroutines below checks if the Levenshtein edit distance for the # +## current "seed" sequence and the comparison sequence is above the set parameter # +#################################################################################### + + +sub levenshtein { + my ($s1, $s2) = @_; + my ($len1, $len2) = (length $s1, length $s2); #$len1 and 2 now hold length of input strings + return $len2 if ($len1 == 0); + return $len1 if ($len2 == 0); + my $minlen = ($len1, $len2)[$len1 > $len2]; + my %mat; + + for (my $i = 0; $i <= $len1; ++$i){ + for (my $j = 0; $j <= $len2; ++$j){ + $mat{0}{$j} = $j; + } + $mat{$i}{0} = $i; + } + + my @ar1 = split(//, $s1); + my @ar2 = split(//, $s2); +$mat{1}{1} = ($ar1[0] eq $ar2[0]) ? 0 : 1; + + for (my $l = 2; $l <= $minlen; $l++){ + + my $minvaluev; # declare $minvaluev inside l loop + my $minvalueh; # declare $minvalueh inside l loop + for (my $i = 1; $i < $l; ++$i){ +my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1; +$mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost); +for (my $k = 1; $k < $l; ++$k){ +} + $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner + } + for (my $j = 1; $j <= $l; ++$j){ + my $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1; + $mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost); + $minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner + } + $minvalue = ($minvaluev, $minvalueh)[$minvaluev > $minvalueh]; #collects minimum value for corner + if ($minvalue > $defined_edit_distance){return $defined_edit_distance+1;} + } + +if ($len1 != $len2){ +if ($len1 == $minlen){ + for (my $l = $minlen+1; $l <= $len2; $l++){ + my $minvaluev; # declare $minvaluev inside l loop + for (my $i = 1; $i <= $minlen; ++$i){ + my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1; + $mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost); + $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner + } + if ($minvaluev > $defined_edit_distance){return $defined_edit_distance+1;} +} +} else { + for (my $l = $minlen+1; $l <= $len1; $l++){ + my $minvalueh; # declare $minvalueh inside l loop + for (my $j = 1; $j <= $minlen; ++$j){ + my $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1; + $mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost); + $minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner + } + if ($minvalueh > $defined_edit_distance){return $defined_edit_distance+1;} + } + } +} + return $mat{$len1}{$len2}; +} + + +sub min { + my ($min, @vars) = @_; + for (@vars) { + $min = $_ if $_ < $min; + } + return $min; +} \ No newline at end of file