Previous changeset 0:307254415eb1 (2015-02-10) |
Commit message:
Uploaded |
added:
fastaptamer_cluster |
b |
diff -r 307254415eb1 -r 8a3bb4734410 fastaptamer_cluster --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaptamer_cluster Tue Feb 10 14:31:10 2015 -0500 |
[ |
b'@@ -0,0 +1,244 @@\n+#!/usr/bin/env perl\n+\n+## Last Modified January 19th, 2015 22:03 CST\n+\n+## Citation:\n+## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. \n+## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of \n+## Combinatorial Selections." Molecular Therapy \xe2\x80\x94 Nucleic Acids. 2015.\n+## DOI: 10.1038/mtna.2015.4\n+\n+## Distributed under GNU General Public License v3\n+\n+\n+use Getopt::Long; ## Core Perl module for command line options/arguments\n+\n+my $start = time; ## Starts program execution timer\n+my $infile; ## Variable for input file\n+my $outfile; ## Variable for output file\n+my $defined_edit_distance; ## Variable (integer) for user-defined edit distance\n+my $help; ## true/false variable for help screen\n+my $quiet; ## true/false to suppress standard output\n+my $filter = 0;\t\t\t\t## Variable to define a read filter for entries\n+my $version; ## true/false variable for version screen\n+\n+ ## Take command line arguments for...\n+GetOptions ( "infile=s" => \\$infile, ## ... input file\n+ "outfile=s" => \\$outfile, ## ... output file\n+ "distance=i" => \\$defined_edit_distance, ## ... edit distance\n+ "filter=f" => \\$filter, ## ... read filter\n+ "help" => \\$help, ## ... help screen\n+ "version" => \\$version, ## ... version screen\n+ "quiet" => \\$quiet); ## ... supress standard output\n+\n+if ($help){ ## Prints help screen if $help returns as true\n+print <<"HELP";\n+\t\n+--------------------------------------------------------------------------------\n+ FASTAptamer-Cluster\n+--------------------------------------------------------------------------------\n+\n+Usage: fastaptamer_cluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-q] [-v]\n+\n+ [-h] = Help screen.\n+ [-i INFILE] = Input file from FASTAptamer-Count. REQUIRED.\n+ [-o OUTFILE] = Output file, FASTA format. REQUIRED.\n+ [-d] = Edit distance for clustering sequences. REQUIRED.\n+ [-f] = Read filter. Only sequences with total reads greater than\n+ the value supplied will be clustered.\n+ [-q] = Quiet mode. Suppresses standard output of file I/O, numb-\n+ er of clusters, cluster size and execution time.\n+ [-v] = Display version.\n+\n+FASTAptamer-Cluster uses the Levenshtein algorithm to cluster together sequences\n+based on a user-defined edit distance. The most abundant and unclustered seque-\n+nce is used as the "seed sequence" for which edit distance is calculated from. \n+Output is FASTA with the following information on the identifier line for each \n+sequence entry:\n+\n+ >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed\n+ SEQUENCE\n+\n+To prevent clustering of sequences not highly sampled (and improve execution ti-\n+me), invoke the read filter and enter a number. Only sequences with total reads\n+greater than the number entered will be clustered. \n+\n+Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file. \n+\n+PLEASE NOTE: This is a computationally intense program that can take multiple h-\n+ours to finish depending on the size and complexity of your population. Utilize\n+the read filter [-f] to improve execution time. \n+\n+HELP\n+exit;\n+}\n+\n+if (defined $version){ ## Print version screen if -v is true\n+ print <<"VERSION";\n+\t\n+FASTAptamer v1.0.2\n+\t\n+VERSION\n+exit;\n+}\n+\n+##########################################\t\n+## Open input file or exit with warning #\n+##########################################\n+\n+open (INPUT, \'<\', $infile) or die \n+"\\nCould not open input file or no input file was specified.\\nSee help documentation [-h], README, or User\'s Guide for program usage.\\n";\n+\n+#########################'..b'terate {\n+ my ( $top_entry, @entries ) = @_; ## Takes "seed sequence" and remaining entries \n+ my @keepers; ## Array to store unclustered entries\n+ my @current_seq_info = split /\\n/, $top_entry; \n+ ## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence\n+ my $cluster_rank = 1; ## Defines that seed sequence is ranked #1\n+\n+ print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\\n$current_seq_info[1]\\n";\n+ ## Prints current seed sequence in FASTAptamer-Cluster format\n+\n+\tmy @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line\n+\tmy $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size\n+\tmy $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM\n+\n+ for (@entries) { ## for each entry left in array send to calculate distance \n+ \tmy @comparison_seq_info = split /\\n/, $_; ## Split entry into metrics and sequence\n+ \tmy @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line\n+ \tmy $comparison_seq_reads = $comparison_seq_metrics[1]; ## Take reads to tally cluster size\n+ \tmy $comparison_seq_rpm = $comparison_seq_metrics[2]; ## Take RPM to tally cluster size\n+ my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] );\n+ ## sends comparison sequence to compare against current seed sequence, returns distances\n+ if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined\n+ push @keepers, $_; ## Add to array for comparison in next iteration\n+ }\n+ elsif ( $distance <= $defined_edit_distance ){ ## If distance is less than or equal to defined\n+ \t$cluster_rank++; ## Increment the cluster rank\n+ \t$current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally\n+ \t$current_cluster_rpm += $comparison_seq_rpm; ## Add RPM to cluster RPM tally\n+ \tprint OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\\n";\n+ \tprint OUTPUT "$comparison_seq_info[1]\\n"; ## Print entry in output file\n+ }\n+ }\n+\n+ unless ($quiet) { print "$current_cluster\\t$cluster_rank\\t$current_cluster_reads\\t$current_cluster_rpm\\n"; }\n+ ## Display cluster number, number of unique sequences, reads and RPM\n+ $current_cluster++; ## Increment cluster number prior to next cluster\n+\n+ if (@keepers) { ## Subroutine within subroutine for sequences that are unclustered\n+ iterate(@keepers);\n+ }\n+}\n+\n+my $duration = time - $start; ## Calculates how much time has elapsed since start\n+\n+unless ($quiet) { ## Displays summary report unless -q is invoked\n+ print "\\nInput file: \\"$infile\\".\\n";\n+ print "Output file: \\"$outfile\\".\\n";\n+ print "Execution time: $duration s.\\n"; \n+}\n+\n+#############################################################################\n+## The subroutines below calculates the Levenshtein edit distance for the #\n+## current "seed" sequence and the comparison sequence that are sent to it #\n+#############################################################################\n+\n+\n+sub levenshtein {\n+\tmy ($s1, $s2) = @_;\n+\tmy ($len1, $len2) = (length $s1, length $s2);\n+\t\n+\treturn $len2 if ($len1 == 0);\n+\treturn $len1 if ($len2 == 0);\n+\t\n+\tmy %mat;\n+\t\n+\tfor (my $i = 0; $i <= $len1; ++$i){\n+\t\tfor (my $j = 0; $j <= $len2; ++$j){\n+\t\t\t$mat{$i}{$j} = 0;\n+\t\t\t$mat{0}{$j} = $j;\n+\t\t}\n+\t\t$mat{$i}{0} = $i;\n+\t}\n+\t\n+\tmy @ar1 = split(//, $s1);\n+\tmy @ar2 = split(//, $s2);\n+\n+\tfor (my $i = 1; $i <= $len1; ++$i){\n+\t\tfor (my $j = 1; $j <= $len2; ++$j){\n+\t\t\tmy $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;\n+\t\t\t$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,\n+\t\t\t$mat{$i}{$j-1} + 1,\n+\t\t\t$mat{$i-1}{$j-1} + $cost]);\n+\t\t}\n+\t}\n+ return $mat{$len1}{$len2};\n+}\n+\n+sub min\n+{\n+ my @list = @{$_[0]};\n+ my $min = $list[0];\n+\n+ foreach my $i (@list)\n+ {\n+ $min = $i if ($i < $min);\n+ }\n+\n+ return $min;\n+}\n' |