1
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 ## Last Modified January 19th, 2015 22:54 CST
|
|
4
|
|
5 ## Citation:
|
|
6 ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
|
|
7 ## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
|
|
8 ## Combinatorial Selections." Molecular Therapy — Nucleic Acids. 2015.
|
|
9 ## DOI: 10.1038/mtna.2015.4
|
|
10
|
|
11 ## Distributed under GNU General Public License v3
|
|
12
|
|
13 use Getopt::Long; # Core Perl module for command line arguments/options
|
|
14
|
|
15 ###############################################################################
|
|
16
|
|
17 ## Variables for command line arguments
|
|
18 my @inputlist; # Array of input files
|
|
19 my $output; # Output filename (optional)
|
|
20 my @patternlist; # Array of pattern(s) to search for
|
|
21 my $highlight; # If defined, highlight matches using parens
|
|
22 my $help; # If defined, show help dialogue
|
|
23 my $quiet; # If defined, suppress report in STDOUT
|
|
24 my $version; # display version
|
|
25
|
|
26 ## Process command line arguments
|
|
27 GetOptions (
|
|
28 "input=s" => \@inputlist, # input file(s)
|
|
29 "output=s" => \$output, # output file (optional)
|
|
30 "pattern=s" => \@patternlist, # pattern(s) to search for
|
|
31 "highlight" => \$highlight, # highlight matched portion with parens
|
|
32 "quiet" => \$quiet, # suppressing summary report
|
|
33 "version" => \$version, # display version
|
|
34 "help" => \$help); # displaying help information
|
|
35
|
|
36 if(defined $help) { ## Prints help screen if -help is invoked
|
|
37 print <<"HELP";
|
|
38
|
|
39 --------------------------------------------------------------------------------
|
|
40 FASTAptamer-Search
|
|
41 --------------------------------------------------------------------------------
|
|
42
|
|
43 Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN]
|
|
44
|
|
45 [-help] = Help screen.
|
|
46 [-i FILENAME] = Input file; can be used multiple times. REQUIRED.
|
|
47 [-p PATTERN] = Sequence pattern to search for; can be used multiple
|
|
48 times. REQUIRED.
|
|
49 [-o FILENAME] = Output file for search results. If none given, output
|
|
50 goes to STDOUT.
|
|
51 [-highlight] = Highlight matched portion of sequence in parentheses.
|
|
52 [-q] = Suppress summary report.
|
|
53 [-v] = Display version.
|
|
54
|
|
55 FASTAptamer-Search allows users to search for specific patterns within one or m-
|
|
56 ore sequence files.
|
|
57
|
|
58 To search through more than one input file, simply use the [-i] flag multiple t-
|
|
59 imes. All input files must use FASTA format.
|
|
60
|
|
61 Similarly, to search for multiple patterns simultaneously, use the [-p] flag as
|
|
62 many times as needed. When searching for multiple patterns, note that partial m-
|
|
63 atches are not returned. For example, entering the following command:
|
|
64
|
|
65 fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT
|
|
66
|
|
67 would search FILE1 and FILE2 for sequences containing both ATTGCC and TGGCAT.
|
|
68
|
|
69 Patterns and input sequence data are case insensitive and T/U are interchangeab-
|
|
70 le. In addition to single bases, patterns can include any of the degenerate base
|
|
71 symbols from IUPAC nucleic acid notation:
|
|
72
|
|
73 A/T/G/C/U single bases
|
|
74
|
|
75 R puRines (A/G)
|
|
76 Y pYrimidines (C/T)
|
|
77 W Weak (A/T)
|
|
78 S Strong (G/C)
|
|
79 M aMino (A/C)
|
|
80 K Keto (G/T)
|
|
81
|
|
82 B not A
|
|
83 D not C
|
|
84 H not G
|
|
85 V not T
|
|
86
|
|
87 N aNy base (not a gap)
|
|
88
|
|
89 For greater visibility, pattern matches can be highlighted by parentheses in the
|
|
90 output by calling the [-highlight] flag.
|
|
91
|
|
92 A summary report is generated after each file's search results and after search
|
|
93 completion. To suppress these reports, enable quiet mode using the [-quiet] flag
|
|
94
|
|
95 HELP
|
|
96 exit;
|
|
97
|
|
98 }
|
|
99
|
|
100 if (defined $version){ ## Print version screen if -v is true
|
|
101 print <<"VERSION";
|
|
102
|
|
103 FASTAptamer v1.0.2
|
|
104
|
|
105 VERSION
|
|
106 exit;
|
|
107 }
|
|
108
|
|
109 ########################################################################################
|
|
110
|
|
111 unless(@inputlist) { die "\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }
|
|
112
|
|
113 unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }
|
|
114
|
|
115 my $start_time = time; ## Start timer
|
|
116
|
|
117 my $inputcount = scalar(@inputlist);
|
|
118 my $patterncount = scalar(@patternlist);
|
|
119
|
|
120 ## Convert user-entered patterns into regexes
|
|
121 my @superarray; ## Array of 2-element arrays via references: ([pattern, regex], [...], etc.)
|
|
122 my $regex;
|
|
123 foreach my $pattern (@patternlist) {
|
|
124 $regex = $pattern;
|
|
125 $regex =~ s{[TU]}{\[TU\]}gi; ## Make T and U interchangeable
|
|
126 $regex =~ s{W}{\[ATU\]}gi; ## W = ATU Regex modifiers after s{}{}:
|
|
127 $regex =~ s{S}{\[CG\]}gi; ## S = CG g tells the script to find ALL matches rather than stop after the first
|
|
128 $regex =~ s{M}{\[AC\]}gi; ## M = AC i makes searching case insensitive
|
|
129 $regex =~ s{K}{\[GTU\]}gi; ## K = GTU
|
|
130 $regex =~ s{R}{\[AG\]}gi; ## R = AG
|
|
131 $regex =~ s{Y}{\[CTU\]}gi; ## Y = CTU
|
|
132 $regex =~ s{B}{\[CGTU\]}gi; ## B = CGTU
|
|
133 $regex =~ s{D}{\[AGTU\]}gi; ## D = AGTU
|
|
134 $regex =~ s{H}{\[ACTU\]}gi; ## H = ACTU
|
|
135 $regex =~ s{V}{\[ACG\]}gi; ## V = ACG
|
|
136 $regex =~ s{N}{\[ACGTU\]}gi; ## N = ACGTU
|
|
137 ## Put the pattern and regex into an array and push the whole thing onto @superarray
|
|
138 push(@superarray, [$pattern, $regex]); ## Push $subarray onto the end of @superarray
|
|
139 }
|
|
140
|
|
141 my $fh_out;
|
|
142 if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file or no output file was specified.\nSee help documentation [-help], README, or User's Guide for program usage.\n"; }
|
|
143
|
|
144 my $input;
|
|
145 my $current_input = 0;
|
|
146 my $subarray;
|
|
147 my ($line1, $line2);
|
|
148 my $match_hits = 0;
|
|
149 my $match_line = '';
|
|
150 my $match_portion = '';
|
|
151 my $filehits = 0;
|
|
152 my $totalhits = 0;
|
|
153
|
|
154 ## Loop logic: read one input file at a time and search all patterns through each line
|
|
155
|
|
156 ## FASTA input
|
|
157
|
|
158 foreach $input (@inputlist) {
|
|
159 $current_input++;
|
|
160 if(defined $output) {
|
|
161 print $fh_out "\n\n--------------------------------------------------------------------------------\n";
|
|
162 print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
|
|
163 print $fh_out "--------------------------------------------------------------------------------\n\n";
|
|
164 }
|
|
165 else {
|
|
166 print "\n\n--------------------------------------------------------------------------------\n";
|
|
167 print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
|
|
168 print "--------------------------------------------------------------------------------\n\n";
|
|
169 }
|
|
170 open(my $fh_in, '<', $input) or die "Could not open input file $input";
|
|
171
|
|
172 while($line1 = <$fh_in>) {
|
|
173 $line2 = <$fh_in>;
|
|
174 my $not_first_regex = 0;
|
|
175 my $hit_confirmed = 0;
|
|
176 foreach $subarray (@superarray) {
|
|
177 $regex = $subarray->[1]; ## Get the regex from subarray
|
|
178 ## $1
|
|
179 if($line2 =~ m{($regex)}gi) { ## Search for all matches, case insensitive
|
|
180 $match_portion = $1; ## Portion of sequence that matched was captured in magic variable $1
|
|
181 if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; }
|
|
182 else { $match_line =~ s{$match_portion}{\($match_portion\)}g; }
|
|
183 $not_first_regex = 1;
|
|
184 $hit_confirmed++;
|
|
185 }
|
|
186 }
|
|
187 if(defined $output and $hit_confirmed == $patterncount) {
|
|
188 $filehits++; ## Increment FILE-SPECIFIC hit counter
|
|
189 $totalhits++; ## Increment OVERALL hit counter
|
|
190 if (defined $highlight) { print $fh_out "$line1$match_line"; }
|
|
191 else { print $fh_out "$line1$line2"; }
|
|
192 }
|
|
193 elsif(not defined $output and $hit_confirmed == $patterncount) {
|
|
194 $filehits++; ## Increment FILE-SPECIFIC hit counter
|
|
195 $totalhits++; ## Increment OVERALL hit counter
|
|
196 if (defined $highlight) { print "$line1$match_line"; }
|
|
197 else { print "$line1$line2"; }
|
|
198 }
|
|
199 }
|
|
200
|
|
201 if(defined $output and not defined $quiet) { ## Print file-specific stats, unless quiet mode
|
|
202 if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; }
|
|
203 elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; }
|
|
204 elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" }
|
|
205 $filehits = 0; ## Reset file-specific hit counter after stats are printed
|
|
206 }
|
|
207 elsif(not defined $output and not defined $quiet) {
|
|
208 if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; }
|
|
209 elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; }
|
|
210 elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" }
|
|
211 $filehits = 0; ## Reset file-specific hit counter after stats are printed
|
|
212 }
|
|
213 }
|
|
214
|
|
215
|
|
216 ## Print a summary after script completion, unless quiet mode
|
|
217 unless(defined $quiet) {
|
|
218 my $duration = time - $start_time;
|
|
219
|
|
220 print "\n--------------------------------------------------------------------------------\n";
|
|
221 print " SEARCH RESULT SUMMARY\n";
|
|
222 print "--------------------------------------------------------------------------------\n";
|
|
223
|
|
224 if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; }
|
|
225 elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; }
|
|
226 foreach $subarray (@superarray) { print "$subarray->[0]\n"; }
|
|
227
|
|
228 print "\nacross the following $inputcount input files:\n";
|
|
229 print join("\n", @inputlist);
|
|
230
|
|
231 if($totalhits > 1) { print "\n$totalhits sequences were matched.\n"; }
|
|
232 elsif($totalhits == 1) { print "\n1 sequence was matched.\n"; }
|
|
233 elsif($totalhits == 0) { print "\nDid not find any matches.\n"; }
|
|
234
|
|
235 if($duration == 1) { print "\nYour search took 1 second.\n"; }
|
|
236 else { print "\nYour search took $duration seconds.\n"; }
|
|
237
|
|
238 print "--------------------------------------------------------------------------------\n";
|
|
239 print "--------------------------------------------------------------------------------\n";
|
|
240 }
|