Previous changeset 0:8603505f43c5 (2015-02-10) |
Commit message:
Uploaded |
added:
fastaptamer_search |
b |
diff -r 8603505f43c5 -r ac61067a0852 fastaptamer_search --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaptamer_search Tue Feb 10 14:51:07 2015 -0500 |
[ |
b'@@ -0,0 +1,240 @@\n+#!/usr/bin/env perl\n+\n+## Last Modified January 19th, 2015 22:54 CST\n+\n+## Citation:\n+## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. \n+## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of \n+## Combinatorial Selections." Molecular Therapy \xe2\x80\x94 Nucleic Acids. 2015.\n+## DOI: 10.1038/mtna.2015.4\n+\n+## Distributed under GNU General Public License v3\n+\n+use Getopt::Long; # Core Perl module for command line arguments/options\n+\n+###############################################################################\n+\n+ ## Variables for command line arguments\n+my @inputlist; # Array of input files\n+my $output; # Output filename (optional)\n+my @patternlist; # Array of pattern(s) to search for\n+my $highlight; # If defined, highlight matches using parens\n+my $help; # If defined, show help dialogue\n+my $quiet; # If defined, suppress report in STDOUT\n+my $version; # display version\n+\n+## Process command line arguments\n+GetOptions ( \n+ "input=s" => \\@inputlist, # input file(s)\n+ "output=s" => \\$output, # output file (optional)\n+ "pattern=s" => \\@patternlist, # pattern(s) to search for\n+ "highlight" => \\$highlight, # highlight matched portion with parens\n+ "quiet" => \\$quiet, # suppressing summary report\n+ "version" => \\$version, # display version\n+ "help" => \\$help); # displaying help information\n+\n+if(defined $help) { ## Prints help screen if -help is invoked\n+ print <<"HELP";\n+\n+--------------------------------------------------------------------------------\n+ FASTAptamer-Search\n+--------------------------------------------------------------------------------\n+\n+Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN] \n+\n+ [-help] \t= Help screen.\n+ [-i FILENAME] \t= Input file; can be used multiple times. REQUIRED.\n+ [-p PATTERN]\t\t= Sequence pattern to search for; can be used multiple \n+ times. REQUIRED.\n+ [-o FILENAME] \t= Output file for search results. If none given, output \n+ goes to STDOUT. \n+ [-highlight] = Highlight matched portion of sequence in parentheses.\n+ [-q] = Suppress summary report.\n+ [-v] = Display version.\n+\n+FASTAptamer-Search allows users to search for specific patterns within one or m-\n+ore sequence files.\n+\n+To search through more than one input file, simply use the [-i] flag multiple t-\n+imes. All input files must use FASTA format.\n+\n+Similarly, to search for multiple patterns simultaneously, use the [-p] flag as \n+many times as needed. When searching for multiple patterns, note that partial m-\n+atches are not returned. For example, entering the following command:\n+\n+ fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT\n+\n+would search FILE1 and FILE2 for sequences containing both ATTGCC and TGGCAT.\n+\n+Patterns and input sequence data are case insensitive and T/U are interchangeab-\n+le. In addition to single bases, patterns can include any of the degenerate base\n+symbols from IUPAC nucleic acid notation:\n+\n+ A/T/G/C/U single bases\n+\n+ R puRines (A/G)\n+ Y pYrimidines (C/T)\n+ W Weak (A/T)\n+ S Strong (G/C)\n+ M aMino (A/C)\n+ K Keto (G/T)\n+\n+ B not A\n+ D not C\n+ H not G\n+ V not T\n+\n+ N aNy base (not a gap)\n+\n+For greater visibility, pattern matches can be highlighted by parentheses in the\n+output by calling the [-highlight] flag.\n+\n+A summary report is generated after each file\'s search results and after search \n+completion. To suppress these reports, enable quiet mode using the [-quiet] flag\n+\n+HELP\n+exit;\n+\n+}\n+\n+if (defined $version){ ## Print version screen if -v is true\n+ print <<"VERSION";\n+\t\n+FASTAptamer v1.0.2\n+\t\n+VERSION\n+exit;\n+}\n+\n+#########################################################'..b'-------------------------------------------------\\n";\n+ print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\\n";\n+ print "--------------------------------------------------------------------------------\\n\\n";\n+ }\n+ open(my $fh_in, \'<\', $input) or die "Could not open input file $input";\n+\n+ while($line1 = <$fh_in>) {\n+ $line2 = <$fh_in>;\n+ my $not_first_regex = 0;\n+ my $hit_confirmed = 0; \n+ foreach $subarray (@superarray) {\n+ $regex = $subarray->[1]; ## Get the regex from subarray\n+ ## $1\n+ if($line2 =~ m{($regex)}gi) { ## Search for all matches, case insensitive\n+ $match_portion = $1; ## Portion of sequence that matched was captured in magic variable $1\n+ if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\\($match_portion\\)}g; }\n+ else { $match_line =~ s{$match_portion}{\\($match_portion\\)}g; }\n+ $not_first_regex = 1;\n+ $hit_confirmed++; \n+ }\n+ }\n+ if(defined $output and $hit_confirmed == $patterncount) {\n+ $filehits++; ## Increment FILE-SPECIFIC hit counter \n+ $totalhits++; ## Increment OVERALL hit counter\n+ if (defined $highlight) { print $fh_out "$line1$match_line"; }\n+ else { print $fh_out "$line1$line2"; }\n+ }\n+ elsif(not defined $output and $hit_confirmed == $patterncount) {\n+ $filehits++; ## Increment FILE-SPECIFIC hit counter \n+ $totalhits++; ## Increment OVERALL hit counter\n+ if (defined $highlight) { print "$line1$match_line"; }\n+ else { print "$line1$line2"; }\n+ }\n+ }\n+ \n+ if(defined $output and not defined $quiet) { ## Print file-specific stats, unless quiet mode\n+ if ($filehits > 1) { print $fh_out "\\nMatched $filehits sequences in file $input.\\n"; } \n+ elsif ($filehits == 1) { print $fh_out "\\nMatched 1 sequence in file $input.\\n"; } \n+ elsif ($filehits == 0) { print $fh_out "\\nDid not match any sequences in file $input.\\n" }\n+ $filehits = 0; ## Reset file-specific hit counter after stats are printed\n+ }\n+ elsif(not defined $output and not defined $quiet) {\n+ if ($filehits > 1) { print "\\nMatched $filehits sequences in file $input.\\n"; } \n+ elsif ($filehits == 1) { print "\\nMatched 1 sequence in file $input.\\n"; } \n+ elsif ($filehits == 0) { print "\\nDid not match any sequences in file $input.\\n" }\n+ $filehits = 0; ## Reset file-specific hit counter after stats are printed\n+ }\n+}\n+\n+\n+## Print a summary after script completion, unless quiet mode\n+unless(defined $quiet) {\n+\tmy $duration = time - $start_time;\n+\t\n+\tprint "\\n--------------------------------------------------------------------------------\\n";\n+\tprint " SEARCH RESULT SUMMARY\\n";\n+\tprint "--------------------------------------------------------------------------------\\n";\n+\t\n+\tif($patterncount > 1) { print "Searched for $patterncount patterns:\\n"; }\n+\telsif($patterncount == 1) { print "Searched for 1 pattern:\\n"; }\n+\tforeach $subarray (@superarray) { print "$subarray->[0]\\n"; }\n+\t\n+\tprint "\\nacross the following $inputcount input files:\\n";\n+\tprint join("\\n", @inputlist);\n+\t\n+\tif($totalhits > 1) { print "\\n$totalhits sequences were matched.\\n"; } \n+ elsif($totalhits == 1) { print "\\n1 sequence was matched.\\n"; }\n+\telsif($totalhits == 0) { print "\\nDid not find any matches.\\n"; }\n+\t\n+\tif($duration == 1) { print "\\nYour search took 1 second.\\n"; }\n+\telse { print "\\nYour search took $duration seconds.\\n"; }\n+\t\n+\tprint "--------------------------------------------------------------------------------\\n";\n+\tprint "--------------------------------------------------------------------------------\\n";\n+}\n' |