Galaxy |

Changeset 1:0d1cf9024f0e (2016-12-26)

Previous changeset 0:12f2dd9ac1fd (2016-12-26) Next changeset 2:6540565ae995 (2016-12-26)

Commit message:
Uploaded

added:
rapidcluster

diff -r 12f2dd9ac1fd -r 0d1cf9024f0e rapidcluster
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rapidcluster Mon Dec 26 11:05:18 2016 -0500

[

b'@@ -0,0 +1,282 @@\n+#!/usr/bin/env perl\n+\n+## Distributed under GNU General Public License v3\n+\n+\n+use Getopt::Long; ## Core Perl module for command line options/arguments\n+\n+my $start = time; ## Starts program execution timer\n+my $infile; ## Variable for input file\n+my $outfile; ## Variable for output file\n+my $defined_edit_distance; ## Variable (integer) for user-defined edit distance\n+my $help; ## true/false variable for help screen\n+my $quiet; ## true/false to suppress standard output\n+my $filter;\t\t\t\t## Variable to define a read filter for entries\n+my $max_clusters; ## Maximum number of clusters\n+my $version; ## true/false variable for version screen\n+\n+ ## Take command line arguments for...\n+GetOptions ( "infile=s" => \\$infile, ## ... input file\n+ "outfile=s" => \\$outfile, ## ... output file\n+ "distance=i" => \\$defined_edit_distance, ## ... edit distance\n+ "filter=f" => \\$filter, ## ... read filter\n+ "cluster_max=i" => \\$max_clusters, ## max number of clusters\n+ "help" => \\$help, ## ... help screen\n+ "version" => \\$version, ## ... version screen\n+ "quiet" => \\$quiet); ## ... supress standard output\n+\n+if ($help){ ## Prints help screen if $help returns as true\n+print <<"HELP";\n+\t\n+--------------------------------------------------------------------------------\n+ FASTAptamer-RapidCluster\n+--------------------------------------------------------------------------------\n+\n+Usage: fastaptamer_rapidcluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-c #] [-q] [-v] \n+\n+ [-h] = Help screen.\n+ [-i INFILE] = Input file from FASTAptamer-Count. REQUIRED.\n+ [-o OUTFILE] = Output file, FASTA format. REQUIRED.\n+ [-d] = Edit distance for clustering sequences. REQUIRED.\n+ [-f] = Read filter. Only sequences with total reads greater than\n+ the value supplied will be clustered.\n+ [-c] = Maximum number of clusters to find.\n+ [-q] = Quiet mode. Suppresses standard output of file I/O, numb-\n+ er of clusters, cluster size and execution time.\n+ [-v] = Display version.\n+\n+FASTAptamer-RapidCluster is a slightly modified and much faster version of original \n+FASTAptamer-Cluster script. It uses the Levenshtein algorithm to cluster together \n+sequences based on a user-defined edit distance. The most abundant and unclustered \n+sequence is used as the "seed sequence" for which edit distance is calculated from. \n+Output is FASTA with the following information on the identifier line for each \n+sequence entry:\n+\n+ >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed\n+ SEQUENCE\n+\n+To prevent clustering of sequences not highly sampled (and improve execution ti-\n+me), invoke the read filter and enter a number. Only sequences with total reads\n+greater than the number entered will be clustered. \n+\n+Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file. \n+\n+PLEASE NOTE: This is a computationally intense program that can take multiple h-\n+ours to finish depending on the size and complexity of your population. Utilize\n+the read filter [-f] to improve execution time. \n+\n+HELP\n+exit;\n+}\n+\n+if (defined $version){ ## Print version screen if -v is true\n+ print <<"VERSION";\n+\t\n+FASTAptamer v1.0.2\n+\t\n+VERSION\n+exit;\n+}\n+\n+##########################################\t\n+## Open input file or exit with warning #\n+##########################################\n+\n+open (INPUT, \'<\', $infile) or die \n+"\\nCould not open input file or no input file was specified.\\nSee help documentation [-h], README, or User\'s Guide for program usage.\\n";\n+\n+######################################'..b't$defined_edit_distance\\t"; }\n+ ## Display cluster number, number of unique sequences, reads and RPM\n+ $current_cluster++; ## Increment cluster number prior to next cluster\n+\n+ # Quit clustering if the maximum clusters have been found\n+ return if $current_cluster > $max_clusters;\n+\n+ if (@keepers) { ## Subroutine within subroutine for sequences that are unclustered\n+ iterate(@keepers);\n+ }\n+}\n+\n+my $duration = time - $start; ## Calculates how much time has elapsed since start\n+\n+unless ($quiet) { ## Displays summary report unless -q is invoked\n+# print "\\nInput file: \\"$infile\\".\\n";\n+# print "Output file: \\"$outfile\\".\\n";\n+ print "$duration\\n"; \n+}\n+\n+####################################################################################\n+## The subroutines below checks if the Levenshtein edit distance for the #\n+## current "seed" sequence and the comparison sequence is above the set parameter #\n+####################################################################################\n+\n+\n+sub levenshtein {\n+\tmy ($s1, $s2) = @_; \n+\tmy ($len1, $len2) = (length $s1, length $s2); #$len1 and 2 now hold length of input strings\t\n+\treturn $len2 if ($len1 == 0); \n+\treturn $len1 if ($len2 == 0); \n+\tmy $minlen = ($len1, $len2)[$len1 > $len2];\n+\tmy %mat; \n+\t\n+\tfor (my $i = 0; $i <= $len1; ++$i){ \n+\t\tfor (my $j = 0; $j <= $len2; ++$j){ \n+\t\t\t$mat{0}{$j} = $j; \n+\t\t} \n+\t\t$mat{$i}{0} = $i; \n+\t}\n+\t\n+\tmy @ar1 = split(//, $s1); \n+\tmy @ar2 = split(//, $s2); \n+$mat{1}{1} = ($ar1[0] eq $ar2[0]) ? 0 : 1; \n+\n+ for (my $l = 2; $l <= $minlen; $l++){\n+ \t\n+ \tmy $minvaluev; # declare $minvaluev inside l loop\n+ \tmy $minvalueh; # declare $minvalueh inside l loop\n+\tfor (my $i = 1; $i < $l; ++$i){\n+my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1;\n+$mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost); \n+for (my $k = 1; $k < $l; ++$k){\n+}\n+ $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner\n+ }\n+\t\tfor (my $j = 1; $j <= $l; ++$j){\n+\t\t\tmy $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1; \n+\t\t\t$mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost); \n+\t\t\t$minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner\n+\t\t}\n+ $minvalue = ($minvaluev, $minvalueh)[$minvaluev > $minvalueh]; #collects minimum value for corner\n+ if ($minvalue > $defined_edit_distance){return $defined_edit_distance+1;}\t\t\n+\t}\n+\t\n+if ($len1 != $len2){\t\n+if ($len1 == $minlen){\n+\tfor (my $l = $minlen+1; $l <= $len2; $l++){\n+ my $minvaluev; # declare $minvaluev inside l loop\n+ for (my $i = 1; $i <= $minlen; ++$i){\n+ my $cost = ($ar1[$i-1] eq $ar2[$l-1]) ? 0 : 1;\n+ $mat{$i}{$l} = min($mat{$i-1}{$l} + 1, $mat{$i}{$l-1} + 1, $mat{$i-1}{$l-1} + $cost); \n+ $minvaluev = ($mat{$i-1}{$l}, $mat{$i}{$l})[$mat{$i-1}{$l} > $mat{$i}{$l}]; #collects minimum value from vertical part of corner\n+ }\n+ if ($minvaluev > $defined_edit_distance){return $defined_edit_distance+1;}\n+}\n+} else {\n+\t \tfor (my $l = $minlen+1; $l <= $len1; $l++){\n+\t \t\tmy $minvalueh; # declare $minvalueh inside l loop \n+\t \t\tfor (my $j = 1; $j <= $minlen; ++$j){\n+\t\t\tmy $cost = ($ar1[$l-1] eq $ar2[$j-1]) ? 0 : 1; \n+\t\t\t$mat{$l}{$j} = min($mat{$l-1}{$j} + 1, $mat{$l}{$j-1} + 1, $mat{$l-1}{$j-1} + $cost); \n+\t\t\t$minvalueh = ($mat{$l}{$j-1}, $mat{$l}{$j})[$mat{$l}{$j-1} > $mat{$l}{$j}]; #collects minimum value from horisontal part of corner\n+\t \t}\n+ if ($minvalueh > $defined_edit_distance){return $defined_edit_distance+1;}\n+\t }\n+\t}\n+}\t \t\t\n+\t\t\t\t return $mat{$len1}{$len2}; \n+}\n+\n+\n+sub min {\n+ my ($min, @vars) = @_;\n+ for (@vars) {\n+ $min = $_ if $_ < $min;\n+ }\n+ return $min;\n+}\n\\ No newline at end of file\n'