1
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 ## Last Modified January 19th, 2015 22:22 CST
|
|
4
|
|
5 ## Citation:
|
|
6 ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
|
|
7 ## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
|
|
8 ## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015.
|
|
9 ## DOI: 10.1038/mtna.2015.4
|
|
10
|
|
11 ## Distributed under GNU General Public License v3
|
|
12
|
|
13 use Getopt::Long; ## Core Perl module for command line arguments/options
|
|
14
|
|
15 my $start = time; ## Starts program execution timer
|
|
16 my $input_fh; ## Input file handle
|
|
17 my $output_fh; ## Output file handle
|
|
18 my $quiet; ## true/false variable for summary report supression
|
|
19 my $help; ## true/false variable for help screen
|
|
20 my $version; ## true/false variable for version screen
|
|
21
|
|
22 ## Take command line arguments for...
|
|
23 GetOptions ( "input=s" => \$input_fh, ## ...input file
|
|
24 "output=s" => \$output_fh, ## ...output file
|
|
25 "quiet" => \$quiet, ## ...supressing std. out
|
|
26 "version" => \$version, ## ...showing version screen
|
|
27 "help" => \$help); ## ...showing help screen
|
|
28
|
|
29 if (defined $help){ ## Print help screen if -h is true
|
|
30 print <<"HELP";
|
|
31
|
|
32 --------------------------------------------------------------------------------
|
|
33 FASTAptamer-Count
|
|
34 --------------------------------------------------------------------------------
|
|
35
|
|
36 Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE]
|
|
37
|
|
38 [-h] = Help screen.
|
|
39 [-q] = Suppress STDOUT of run report.
|
|
40 [-v] = Display version.
|
|
41 [-i INFILE] = FASTQ input file. REQUIRED.
|
|
42 [-o OUTFILE] = FASTA output file. REQUIRED.
|
|
43
|
|
44 FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit. For a given
|
|
45 .FASTQ input file, FASTAptamer-Count will determine the number of times each se-
|
|
46 quence was read, rank and sort sequences by decreasing total reads, and normali-
|
|
47 ze sequence frequency to reads per million. Output is generated as a non-redund-
|
|
48 ant FASTA file in the following format for each sequence:
|
|
49
|
|
50 >RANK-READS-RPM
|
|
51 SEQUENCE
|
|
52
|
|
53 Summary report (total reads, unique reads, and execution time) is displayed as
|
|
54 STDOUT at program completion unless [-q] is invoked.
|
|
55
|
|
56 HELP
|
|
57 exit;
|
|
58 }
|
|
59
|
|
60 if (defined $version){ ## Print version screen if -v is true
|
|
61 print <<"VERSION";
|
|
62
|
|
63 FASTAptamer v1.0.2
|
|
64
|
|
65 VERSION
|
|
66 exit;
|
|
67 }
|
|
68
|
|
69 ##########################################
|
|
70 ## Open input file or exit with warning #
|
|
71 ##########################################
|
|
72
|
|
73 open (INPUT, '<', $input_fh) or die
|
|
74 "\nCould not open input file or no input file was specified.\n
|
|
75 See help documentation [-h], README, or User's Guide for program usage.\n";
|
|
76
|
|
77 ##########################################
|
|
78 ## Open output file or exit with warning #
|
|
79 ##########################################
|
|
80
|
|
81 open (OUTPUT, '>', $output_fh) or die
|
|
82 "\nCould not open output file or no output file was specified.\n
|
|
83 See help documentation [-h], README, or User's Guide for program usage.\n";
|
|
84
|
|
85 ###############################################
|
|
86 ## Create a hash for sequence reads where key #
|
|
87 ## is the sequence and value is read count #
|
|
88 ###############################################
|
|
89
|
|
90 my %sequence_reads;
|
|
91
|
|
92 my $entries = 0; ## Counts the number of entries processed
|
|
93
|
|
94 $/ = "@"; ## Change default input record separator to read FASTQ formatted file
|
|
95
|
|
96 ################################################################################
|
|
97 ## Read input file and for each entry, add sequence or increment value in hash #
|
|
98 ################################################################################
|
|
99
|
|
100 while (<INPUT>){
|
|
101 if ($_ =~ /.+\n(\S+)\n.+\n.+/){
|
|
102 my $sequence = $1;
|
|
103 $sequence_reads{$sequence} += 1;
|
|
104 $entries++;
|
|
105 }
|
|
106 }
|
|
107
|
|
108 close INPUT;
|
|
109
|
|
110 ############################################
|
|
111 ## Sort hash by decreasing read count by #
|
|
112 ## converting to arrays of values and keys #
|
|
113 ############################################
|
|
114
|
|
115 my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
|
|
116 my @vals = @sequence_reads{@keys};
|
|
117
|
|
118 ##################################################################################
|
|
119 ## Using both arrays, print to output the FASTA formatted file containing #
|
|
120 ## the sequence rank, reads, reads per million, and sequence. The extra #
|
|
121 ## scalars here are used to ensure that sequences with equal read counts #
|
|
122 ## receive the same rank value, and that the next untied value is properly #
|
|
123 ## assigned a rank that takes into account the number of tied values before it, #
|
|
124 ## also known as standard competition ranking. #
|
|
125 ##################################################################################
|
|
126
|
|
127 my $last_reads_count = $vals[0];
|
|
128 my $rank = 1;
|
|
129
|
|
130 for my $i (0..$#keys){
|
|
131 my $current_reads_count = $vals[$i];
|
|
132 if ($current_reads_count < $last_reads_count){
|
|
133 $rank = $i + 1;
|
|
134 print OUTPUT ">$rank-$vals[$i]-";
|
|
135 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
|
|
136 print OUTPUT "\n";
|
|
137 print OUTPUT "$keys[$i]\n";
|
|
138
|
|
139 }
|
|
140 elsif ($current_reads_count == $last_reads_count){
|
|
141 print OUTPUT ">$rank-$vals[$i]-";
|
|
142 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
|
|
143 print OUTPUT "\n";
|
|
144 print OUTPUT "$keys[$i]\n";
|
|
145 }
|
|
146 $last_reads_count = $vals[$i];
|
|
147
|
|
148 }
|
|
149
|
|
150 close OUTPUT;
|
|
151
|
|
152 ##################################################################################
|
|
153 ## Unless the -q option is invoked, the following statistics are printed as #
|
|
154 ## standard output. #
|
|
155 ##################################################################################
|
|
156
|
|
157 unless ($quiet){
|
|
158 print "\n$entries total sequences. " . ($#keys + 1) . " unique sequences.\n";
|
|
159 print "Input file: \"$input_fh\".\n";
|
|
160 print "Output file: \"$output_fh\".\n";
|
|
161
|
|
162 my $duration = time - $start;
|
|
163 print "Execution time: $duration s.\n";
|
|
164 }
|
|
165
|
|
166 exit;
|