# HG changeset patch # User fastaptamer # Date 1423597867 18000 # Node ID ac61067a0852e9f115ef08172142191e21b2627d # Parent 8603505f43c598a8e38f2a6b080c8383f322a7cc Uploaded diff -r 8603505f43c5 -r ac61067a0852 fastaptamer_search --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaptamer_search Tue Feb 10 14:51:07 2015 -0500 @@ -0,0 +1,240 @@ +#!/usr/bin/env perl + +## Last Modified January 19th, 2015 22:54 CST + +## Citation: +## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. +## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of +## Combinatorial Selections." Molecular Therapy — Nucleic Acids. 2015. +## DOI: 10.1038/mtna.2015.4 + +## Distributed under GNU General Public License v3 + +use Getopt::Long; # Core Perl module for command line arguments/options + +############################################################################### + + ## Variables for command line arguments +my @inputlist; # Array of input files +my $output; # Output filename (optional) +my @patternlist; # Array of pattern(s) to search for +my $highlight; # If defined, highlight matches using parens +my $help; # If defined, show help dialogue +my $quiet; # If defined, suppress report in STDOUT +my $version; # display version + +## Process command line arguments +GetOptions ( + "input=s" => \@inputlist, # input file(s) + "output=s" => \$output, # output file (optional) + "pattern=s" => \@patternlist, # pattern(s) to search for + "highlight" => \$highlight, # highlight matched portion with parens + "quiet" => \$quiet, # suppressing summary report + "version" => \$version, # display version + "help" => \$help); # displaying help information + +if(defined $help) { ## Prints help screen if -help is invoked + print <<"HELP"; + +-------------------------------------------------------------------------------- + FASTAptamer-Search +-------------------------------------------------------------------------------- + +Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN] + + [-help] = Help screen. + [-i FILENAME] = Input file; can be used multiple times. REQUIRED. + [-p PATTERN] = Sequence pattern to search for; can be used multiple + times. REQUIRED. + [-o FILENAME] = Output file for search results. If none given, output + goes to STDOUT. + [-highlight] = Highlight matched portion of sequence in parentheses. + [-q] = Suppress summary report. + [-v] = Display version. + +FASTAptamer-Search allows users to search for specific patterns within one or m- +ore sequence files. + +To search through more than one input file, simply use the [-i] flag multiple t- +imes. All input files must use FASTA format. + +Similarly, to search for multiple patterns simultaneously, use the [-p] flag as +many times as needed. When searching for multiple patterns, note that partial m- +atches are not returned. For example, entering the following command: + + fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT + +would search FILE1 and FILE2 for sequences containing both ATTGCC and TGGCAT. + +Patterns and input sequence data are case insensitive and T/U are interchangeab- +le. In addition to single bases, patterns can include any of the degenerate base +symbols from IUPAC nucleic acid notation: + + A/T/G/C/U single bases + + R puRines (A/G) + Y pYrimidines (C/T) + W Weak (A/T) + S Strong (G/C) + M aMino (A/C) + K Keto (G/T) + + B not A + D not C + H not G + V not T + + N aNy base (not a gap) + +For greater visibility, pattern matches can be highlighted by parentheses in the +output by calling the [-highlight] flag. + +A summary report is generated after each file's search results and after search +completion. To suppress these reports, enable quiet mode using the [-quiet] flag + +HELP +exit; + +} + +if (defined $version){ ## Print version screen if -v is true + print <<"VERSION"; + +FASTAptamer v1.0.2 + +VERSION +exit; +} + +######################################################################################## + +unless(@inputlist) { die "\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; } + +unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; } + +my $start_time = time; ## Start timer + +my $inputcount = scalar(@inputlist); +my $patterncount = scalar(@patternlist); + +## Convert user-entered patterns into regexes +my @superarray; ## Array of 2-element arrays via references: ([pattern, regex], [...], etc.) +my $regex; +foreach my $pattern (@patternlist) { + $regex = $pattern; + $regex =~ s{[TU]}{\[TU\]}gi; ## Make T and U interchangeable + $regex =~ s{W}{\[ATU\]}gi; ## W = ATU Regex modifiers after s{}{}: + $regex =~ s{S}{\[CG\]}gi; ## S = CG g tells the script to find ALL matches rather than stop after the first + $regex =~ s{M}{\[AC\]}gi; ## M = AC i makes searching case insensitive + $regex =~ s{K}{\[GTU\]}gi; ## K = GTU + $regex =~ s{R}{\[AG\]}gi; ## R = AG + $regex =~ s{Y}{\[CTU\]}gi; ## Y = CTU + $regex =~ s{B}{\[CGTU\]}gi; ## B = CGTU + $regex =~ s{D}{\[AGTU\]}gi; ## D = AGTU + $regex =~ s{H}{\[ACTU\]}gi; ## H = ACTU + $regex =~ s{V}{\[ACG\]}gi; ## V = ACG + $regex =~ s{N}{\[ACGTU\]}gi; ## N = ACGTU + ## Put the pattern and regex into an array and push the whole thing onto @superarray + push(@superarray, [$pattern, $regex]); ## Push $subarray onto the end of @superarray + } + +my $fh_out; +if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file or no output file was specified.\nSee help documentation [-help], README, or User's Guide for program usage.\n"; } + +my $input; +my $current_input = 0; +my $subarray; +my ($line1, $line2); +my $match_hits = 0; +my $match_line = ''; +my $match_portion = ''; +my $filehits = 0; +my $totalhits = 0; + +## Loop logic: read one input file at a time and search all patterns through each line + +## FASTA input + +foreach $input (@inputlist) { + $current_input++; + if(defined $output) { + print $fh_out "\n\n--------------------------------------------------------------------------------\n"; + print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; + print $fh_out "--------------------------------------------------------------------------------\n\n"; + } + else { + print "\n\n--------------------------------------------------------------------------------\n"; + print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; + print "--------------------------------------------------------------------------------\n\n"; + } + open(my $fh_in, '<', $input) or die "Could not open input file $input"; + + while($line1 = <$fh_in>) { + $line2 = <$fh_in>; + my $not_first_regex = 0; + my $hit_confirmed = 0; + foreach $subarray (@superarray) { + $regex = $subarray->[1]; ## Get the regex from subarray + ## $1 + if($line2 =~ m{($regex)}gi) { ## Search for all matches, case insensitive + $match_portion = $1; ## Portion of sequence that matched was captured in magic variable $1 + if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; } + else { $match_line =~ s{$match_portion}{\($match_portion\)}g; } + $not_first_regex = 1; + $hit_confirmed++; + } + } + if(defined $output and $hit_confirmed == $patterncount) { + $filehits++; ## Increment FILE-SPECIFIC hit counter + $totalhits++; ## Increment OVERALL hit counter + if (defined $highlight) { print $fh_out "$line1$match_line"; } + else { print $fh_out "$line1$line2"; } + } + elsif(not defined $output and $hit_confirmed == $patterncount) { + $filehits++; ## Increment FILE-SPECIFIC hit counter + $totalhits++; ## Increment OVERALL hit counter + if (defined $highlight) { print "$line1$match_line"; } + else { print "$line1$line2"; } + } + } + + if(defined $output and not defined $quiet) { ## Print file-specific stats, unless quiet mode + if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; } + elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; } + elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" } + $filehits = 0; ## Reset file-specific hit counter after stats are printed + } + elsif(not defined $output and not defined $quiet) { + if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; } + elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; } + elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" } + $filehits = 0; ## Reset file-specific hit counter after stats are printed + } +} + + +## Print a summary after script completion, unless quiet mode +unless(defined $quiet) { + my $duration = time - $start_time; + + print "\n--------------------------------------------------------------------------------\n"; + print " SEARCH RESULT SUMMARY\n"; + print "--------------------------------------------------------------------------------\n"; + + if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; } + elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; } + foreach $subarray (@superarray) { print "$subarray->[0]\n"; } + + print "\nacross the following $inputcount input files:\n"; + print join("\n", @inputlist); + + if($totalhits > 1) { print "\n$totalhits sequences were matched.\n"; } + elsif($totalhits == 1) { print "\n1 sequence was matched.\n"; } + elsif($totalhits == 0) { print "\nDid not find any matches.\n"; } + + if($duration == 1) { print "\nYour search took 1 second.\n"; } + else { print "\nYour search took $duration seconds.\n"; } + + print "--------------------------------------------------------------------------------\n"; + print "--------------------------------------------------------------------------------\n"; +}