Mercurial > repos > fastaptamer > fastaptamer_search
view fastaptamer_search @ 1:ac61067a0852 draft default tip
Uploaded
author | fastaptamer |
---|---|
date | Tue, 10 Feb 2015 14:51:07 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl ## Last Modified January 19th, 2015 22:54 CST ## Citation: ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. ## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of ## Combinatorial Selections." Molecular Therapy — Nucleic Acids. 2015. ## DOI: 10.1038/mtna.2015.4 ## Distributed under GNU General Public License v3 use Getopt::Long; # Core Perl module for command line arguments/options ############################################################################### ## Variables for command line arguments my @inputlist; # Array of input files my $output; # Output filename (optional) my @patternlist; # Array of pattern(s) to search for my $highlight; # If defined, highlight matches using parens my $help; # If defined, show help dialogue my $quiet; # If defined, suppress report in STDOUT my $version; # display version ## Process command line arguments GetOptions ( "input=s" => \@inputlist, # input file(s) "output=s" => \$output, # output file (optional) "pattern=s" => \@patternlist, # pattern(s) to search for "highlight" => \$highlight, # highlight matched portion with parens "quiet" => \$quiet, # suppressing summary report "version" => \$version, # display version "help" => \$help); # displaying help information if(defined $help) { ## Prints help screen if -help is invoked print <<"HELP"; -------------------------------------------------------------------------------- FASTAptamer-Search -------------------------------------------------------------------------------- Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN] [-help] = Help screen. [-i FILENAME] = Input file; can be used multiple times. REQUIRED. [-p PATTERN] = Sequence pattern to search for; can be used multiple times. REQUIRED. [-o FILENAME] = Output file for search results. If none given, output goes to STDOUT. [-highlight] = Highlight matched portion of sequence in parentheses. [-q] = Suppress summary report. [-v] = Display version. FASTAptamer-Search allows users to search for specific patterns within one or m- ore sequence files. To search through more than one input file, simply use the [-i] flag multiple t- imes. All input files must use FASTA format. Similarly, to search for multiple patterns simultaneously, use the [-p] flag as many times as needed. When searching for multiple patterns, note that partial m- atches are not returned. For example, entering the following command: fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT would search FILE1 and FILE2 for sequences containing both ATTGCC and TGGCAT. Patterns and input sequence data are case insensitive and T/U are interchangeab- le. In addition to single bases, patterns can include any of the degenerate base symbols from IUPAC nucleic acid notation: A/T/G/C/U single bases R puRines (A/G) Y pYrimidines (C/T) W Weak (A/T) S Strong (G/C) M aMino (A/C) K Keto (G/T) B not A D not C H not G V not T N aNy base (not a gap) For greater visibility, pattern matches can be highlighted by parentheses in the output by calling the [-highlight] flag. A summary report is generated after each file's search results and after search completion. To suppress these reports, enable quiet mode using the [-quiet] flag HELP exit; } if (defined $version){ ## Print version screen if -v is true print <<"VERSION"; FASTAptamer v1.0.2 VERSION exit; } ######################################################################################## unless(@inputlist) { die "\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; } unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; } my $start_time = time; ## Start timer my $inputcount = scalar(@inputlist); my $patterncount = scalar(@patternlist); ## Convert user-entered patterns into regexes my @superarray; ## Array of 2-element arrays via references: ([pattern, regex], [...], etc.) my $regex; foreach my $pattern (@patternlist) { $regex = $pattern; $regex =~ s{[TU]}{\[TU\]}gi; ## Make T and U interchangeable $regex =~ s{W}{\[ATU\]}gi; ## W = ATU Regex modifiers after s{}{}: $regex =~ s{S}{\[CG\]}gi; ## S = CG g tells the script to find ALL matches rather than stop after the first $regex =~ s{M}{\[AC\]}gi; ## M = AC i makes searching case insensitive $regex =~ s{K}{\[GTU\]}gi; ## K = GTU $regex =~ s{R}{\[AG\]}gi; ## R = AG $regex =~ s{Y}{\[CTU\]}gi; ## Y = CTU $regex =~ s{B}{\[CGTU\]}gi; ## B = CGTU $regex =~ s{D}{\[AGTU\]}gi; ## D = AGTU $regex =~ s{H}{\[ACTU\]}gi; ## H = ACTU $regex =~ s{V}{\[ACG\]}gi; ## V = ACG $regex =~ s{N}{\[ACGTU\]}gi; ## N = ACGTU ## Put the pattern and regex into an array and push the whole thing onto @superarray push(@superarray, [$pattern, $regex]); ## Push $subarray onto the end of @superarray } my $fh_out; if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file or no output file was specified.\nSee help documentation [-help], README, or User's Guide for program usage.\n"; } my $input; my $current_input = 0; my $subarray; my ($line1, $line2); my $match_hits = 0; my $match_line = ''; my $match_portion = ''; my $filehits = 0; my $totalhits = 0; ## Loop logic: read one input file at a time and search all patterns through each line ## FASTA input foreach $input (@inputlist) { $current_input++; if(defined $output) { print $fh_out "\n\n--------------------------------------------------------------------------------\n"; print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; print $fh_out "--------------------------------------------------------------------------------\n\n"; } else { print "\n\n--------------------------------------------------------------------------------\n"; print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; print "--------------------------------------------------------------------------------\n\n"; } open(my $fh_in, '<', $input) or die "Could not open input file $input"; while($line1 = <$fh_in>) { $line2 = <$fh_in>; my $not_first_regex = 0; my $hit_confirmed = 0; foreach $subarray (@superarray) { $regex = $subarray->[1]; ## Get the regex from subarray ## $1 if($line2 =~ m{($regex)}gi) { ## Search for all matches, case insensitive $match_portion = $1; ## Portion of sequence that matched was captured in magic variable $1 if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; } else { $match_line =~ s{$match_portion}{\($match_portion\)}g; } $not_first_regex = 1; $hit_confirmed++; } } if(defined $output and $hit_confirmed == $patterncount) { $filehits++; ## Increment FILE-SPECIFIC hit counter $totalhits++; ## Increment OVERALL hit counter if (defined $highlight) { print $fh_out "$line1$match_line"; } else { print $fh_out "$line1$line2"; } } elsif(not defined $output and $hit_confirmed == $patterncount) { $filehits++; ## Increment FILE-SPECIFIC hit counter $totalhits++; ## Increment OVERALL hit counter if (defined $highlight) { print "$line1$match_line"; } else { print "$line1$line2"; } } } if(defined $output and not defined $quiet) { ## Print file-specific stats, unless quiet mode if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; } elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; } elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" } $filehits = 0; ## Reset file-specific hit counter after stats are printed } elsif(not defined $output and not defined $quiet) { if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; } elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; } elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" } $filehits = 0; ## Reset file-specific hit counter after stats are printed } } ## Print a summary after script completion, unless quiet mode unless(defined $quiet) { my $duration = time - $start_time; print "\n--------------------------------------------------------------------------------\n"; print " SEARCH RESULT SUMMARY\n"; print "--------------------------------------------------------------------------------\n"; if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; } elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; } foreach $subarray (@superarray) { print "$subarray->[0]\n"; } print "\nacross the following $inputcount input files:\n"; print join("\n", @inputlist); if($totalhits > 1) { print "\n$totalhits sequences were matched.\n"; } elsif($totalhits == 1) { print "\n1 sequence was matched.\n"; } elsif($totalhits == 0) { print "\nDid not find any matches.\n"; } if($duration == 1) { print "\nYour search took 1 second.\n"; } else { print "\nYour search took $duration seconds.\n"; } print "--------------------------------------------------------------------------------\n"; print "--------------------------------------------------------------------------------\n"; }