Mercurial > repos > fastaptamer > fastaptamer_search

#!/usr/bin/env perl

## Last Modified January 19th, 2015 22:54 CST

## Citation:
## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
## Combinatorial Selections." Molecular Therapy — Nucleic Acids. 2015.
## DOI: 10.1038/mtna.2015.4

## Distributed under GNU General Public License v3

use Getopt::Long;  # Core Perl module for command line arguments/options

###############################################################################

                ## Variables for command line arguments
my @inputlist;    # Array of input files
my $output;       # Output filename (optional)
my @patternlist;  # Array of pattern(s) to search for
my $highlight;    # If defined, highlight matches using parens
my $help;         # If defined, show help dialogue
my $quiet;        # If defined, suppress report in STDOUT
my $version;      # display version

## Process command line arguments
GetOptions (
  "input=s"    => \@inputlist,    # input file(s)
  "output=s"   => \$output,       # output file (optional)
  "pattern=s"  => \@patternlist,  # pattern(s) to search for
  "highlight"  => \$highlight,    # highlight matched portion with parens
  "quiet"      => \$quiet,        # suppressing summary report
  "version"    => \$version,      # display version
  "help"       => \$help);        # displaying help information

if(defined $help) {  ## Prints help screen if -help is invoked
        print <<"HELP";

--------------------------------------------------------------------------------
                              FASTAptamer-Search
--------------------------------------------------------------------------------

Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN]

    [-help]            	= Help screen.
    [-i FILENAME]     	= Input file; can be used multiple times. REQUIRED.
    [-p PATTERN]		= Sequence pattern to search for; can be used multiple
                          times. REQUIRED.
    [-o FILENAME]   	= Output file for search results. If none given, output
                          goes to STDOUT.
    [-highlight]        = Highlight matched portion of sequence in parentheses.
    [-q]                = Suppress summary report.
    [-v]                = Display version.

FASTAptamer-Search allows users to search for specific patterns within one or m-
ore sequence files.

To search through more than one input file, simply use the [-i] flag multiple t-
imes. All input files must use FASTA format.

Similarly, to search for multiple patterns simultaneously, use the [-p] flag as
many times as needed. When searching for multiple patterns, note that partial m-
atches are not returned. For example, entering the following command:

    fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT

would search FILE1 and FILE2 for sequences containing both ATTGCC and TGGCAT.

Patterns and input sequence data are case insensitive and T/U are interchangeab-
le. In addition to single bases, patterns can include any of the degenerate base
symbols from IUPAC nucleic acid notation:

    A/T/G/C/U    single bases

    R    puRines (A/G)
    Y    pYrimidines (C/T)
    W    Weak (A/T)
    S    Strong (G/C)
    M    aMino (A/C)
    K    Keto (G/T)

    B    not A
    D    not C
    H    not G
    V    not T

    N    aNy base (not a gap)

For greater visibility, pattern matches can be highlighted by parentheses in the
output by calling the [-highlight] flag.

A summary report is generated after each file's search results and after search
completion. To suppress these reports, enable quiet mode using the [-quiet] flag

HELP
exit;

}

if (defined $version){                     ## Print version screen if -v is true
    print <<"VERSION";

FASTAptamer v1.0.2

VERSION
exit;
}

########################################################################################

unless(@inputlist) { die "\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }

unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }

my $start_time = time;		## Start timer

my $inputcount = scalar(@inputlist);
my $patterncount = scalar(@patternlist);

## Convert user-entered patterns into regexes
my @superarray;		## Array of 2-element arrays via references: ([pattern, regex], [...], etc.)
my $regex;
foreach my $pattern (@patternlist) {
	$regex = $pattern;
        $regex =~ s{[TU]}{\[TU\]}gi;	## Make T and U interchangeable
	$regex =~ s{W}{\[ATU\]}gi;     	## W = ATU	Regex modifiers after s{}{}:
        $regex =~ s{S}{\[CG\]}gi;      	## S = CG	g tells the script to find ALL matches rather than stop after the first
        $regex =~ s{M}{\[AC\]}gi;      	## M = AC	i makes searching case insensitive
        $regex =~ s{K}{\[GTU\]}gi;     	## K = GTU
        $regex =~ s{R}{\[AG\]}gi;      	## R = AG
        $regex =~ s{Y}{\[CTU\]}gi;     	## Y = CTU
        $regex =~ s{B}{\[CGTU\]}gi;    	## B = CGTU
        $regex =~ s{D}{\[AGTU\]}gi;    	## D = AGTU
        $regex =~ s{H}{\[ACTU\]}gi;    	## H = ACTU
        $regex =~ s{V}{\[ACG\]}gi;     	## V = ACG
        $regex =~ s{N}{\[ACGTU\]}gi;   	## N = ACGTU
	## Put the pattern and regex into an array and push the whole thing onto @superarray
	push(@superarray, [$pattern, $regex]);		## Push $subarray onto the end of @superarray
	}

my $fh_out;
if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file or no output file was specified.\nSee help documentation [-help], README, or User's Guide for program usage.\n"; }

my $input;
my $current_input = 0;
my $subarray;
my ($line1, $line2);
my $match_hits = 0;
my $match_line = '';
my $match_portion = '';
my $filehits = 0;
my $totalhits = 0;

## Loop logic: read one input file at a time and search all patterns through each line

## FASTA input

foreach $input (@inputlist) {
    $current_input++;
    if(defined $output) {
        print $fh_out "\n\n--------------------------------------------------------------------------------\n";
        print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
        print $fh_out "--------------------------------------------------------------------------------\n\n";
    }
    else {
        print "\n\n--------------------------------------------------------------------------------\n";
        print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
        print "--------------------------------------------------------------------------------\n\n";
    }
    open(my $fh_in, '<', $input) or die "Could not open input file $input";

    while($line1 = <$fh_in>) {
        $line2 = <$fh_in>;
        my $not_first_regex = 0;
        my $hit_confirmed = 0;
        foreach $subarray (@superarray) {
            $regex = $subarray->[1];        ## Get the regex from subarray
            ##            $1
            if($line2 =~ m{($regex)}gi) {           ## Search for all matches, case insensitive
                $match_portion = $1;            ## Portion of sequence that matched was captured in magic variable $1
                if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; }
                else { $match_line =~ s{$match_portion}{\($match_portion\)}g; }
                $not_first_regex = 1;
                $hit_confirmed++;
            }
        }
        if(defined $output and $hit_confirmed == $patterncount) {
            $filehits++;    ## Increment FILE-SPECIFIC hit counter
            $totalhits++;   ## Increment OVERALL hit counter
            if (defined $highlight) { print $fh_out "$line1$match_line"; }
            else { print $fh_out "$line1$line2"; }
        }
        elsif(not defined $output and $hit_confirmed == $patterncount) {
            $filehits++;    ## Increment FILE-SPECIFIC hit counter
            $totalhits++;   ## Increment OVERALL hit counter
            if (defined $highlight) { print "$line1$match_line"; }
            else { print "$line1$line2"; }
        }
    }

    if(defined $output and not defined $quiet) {            ## Print file-specific stats, unless quiet mode
        if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; }
        elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; }
        elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" }
        $filehits = 0;     ## Reset file-specific hit counter after stats are printed
    }
    elsif(not defined $output and not defined $quiet) {
        if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; }
        elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; }
        elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" }
        $filehits = 0;     ## Reset file-specific hit counter after stats are printed
    }
}


## Print a summary after script completion, unless quiet mode
unless(defined $quiet) {
	my $duration = time - $start_time;

	print "\n--------------------------------------------------------------------------------\n";
	print " SEARCH RESULT SUMMARY\n";
	print "--------------------------------------------------------------------------------\n";

	if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; }
	elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; }
	foreach $subarray (@superarray) { print "$subarray->[0]\n"; }

	print "\nacross the following $inputcount input files:\n";
	print join("\n", @inputlist);

	if($totalhits > 1) { print "\n$totalhits sequences were matched.\n"; }
        elsif($totalhits == 1) { print "\n1 sequence was matched.\n"; }
	elsif($totalhits == 0) { print "\nDid not find any matches.\n"; }

	if($duration == 1) { print "\nYour search took 1 second.\n"; }
	else { print "\nYour search took $duration seconds.\n"; }

	print "--------------------------------------------------------------------------------\n";
	print "--------------------------------------------------------------------------------\n";
}
author	fastaptamer
date	Tue, 10 Feb 2015 14:51:07 -0500
parents
children