Mercurial > repos > fastaptamer > fastaptamer_count
comparison fastaptamer_count @ 1:b9b2da3fa7d7 draft default tip
Uploaded
author | fastaptamer |
---|---|
date | Tue, 10 Feb 2015 14:23:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:2bed8ca187a1 | 1:b9b2da3fa7d7 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 ## Last Modified January 19th, 2015 22:22 CST | |
4 | |
5 ## Citation: | |
6 ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. | |
7 ## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of | |
8 ## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015. | |
9 ## DOI: 10.1038/mtna.2015.4 | |
10 | |
11 ## Distributed under GNU General Public License v3 | |
12 | |
13 use Getopt::Long; ## Core Perl module for command line arguments/options | |
14 | |
15 my $start = time; ## Starts program execution timer | |
16 my $input_fh; ## Input file handle | |
17 my $output_fh; ## Output file handle | |
18 my $quiet; ## true/false variable for summary report supression | |
19 my $help; ## true/false variable for help screen | |
20 my $version; ## true/false variable for version screen | |
21 | |
22 ## Take command line arguments for... | |
23 GetOptions ( "input=s" => \$input_fh, ## ...input file | |
24 "output=s" => \$output_fh, ## ...output file | |
25 "quiet" => \$quiet, ## ...supressing std. out | |
26 "version" => \$version, ## ...showing version screen | |
27 "help" => \$help); ## ...showing help screen | |
28 | |
29 if (defined $help){ ## Print help screen if -h is true | |
30 print <<"HELP"; | |
31 | |
32 -------------------------------------------------------------------------------- | |
33 FASTAptamer-Count | |
34 -------------------------------------------------------------------------------- | |
35 | |
36 Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE] | |
37 | |
38 [-h] = Help screen. | |
39 [-q] = Suppress STDOUT of run report. | |
40 [-v] = Display version. | |
41 [-i INFILE] = FASTQ input file. REQUIRED. | |
42 [-o OUTFILE] = FASTA output file. REQUIRED. | |
43 | |
44 FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit. For a given | |
45 .FASTQ input file, FASTAptamer-Count will determine the number of times each se- | |
46 quence was read, rank and sort sequences by decreasing total reads, and normali- | |
47 ze sequence frequency to reads per million. Output is generated as a non-redund- | |
48 ant FASTA file in the following format for each sequence: | |
49 | |
50 >RANK-READS-RPM | |
51 SEQUENCE | |
52 | |
53 Summary report (total reads, unique reads, and execution time) is displayed as | |
54 STDOUT at program completion unless [-q] is invoked. | |
55 | |
56 HELP | |
57 exit; | |
58 } | |
59 | |
60 if (defined $version){ ## Print version screen if -v is true | |
61 print <<"VERSION"; | |
62 | |
63 FASTAptamer v1.0.2 | |
64 | |
65 VERSION | |
66 exit; | |
67 } | |
68 | |
69 ########################################## | |
70 ## Open input file or exit with warning # | |
71 ########################################## | |
72 | |
73 open (INPUT, '<', $input_fh) or die | |
74 "\nCould not open input file or no input file was specified.\n | |
75 See help documentation [-h], README, or User's Guide for program usage.\n"; | |
76 | |
77 ########################################## | |
78 ## Open output file or exit with warning # | |
79 ########################################## | |
80 | |
81 open (OUTPUT, '>', $output_fh) or die | |
82 "\nCould not open output file or no output file was specified.\n | |
83 See help documentation [-h], README, or User's Guide for program usage.\n"; | |
84 | |
85 ############################################### | |
86 ## Create a hash for sequence reads where key # | |
87 ## is the sequence and value is read count # | |
88 ############################################### | |
89 | |
90 my %sequence_reads; | |
91 | |
92 my $entries = 0; ## Counts the number of entries processed | |
93 | |
94 $/ = "@"; ## Change default input record separator to read FASTQ formatted file | |
95 | |
96 ################################################################################ | |
97 ## Read input file and for each entry, add sequence or increment value in hash # | |
98 ################################################################################ | |
99 | |
100 while (<INPUT>){ | |
101 if ($_ =~ /.+\n(\S+)\n.+\n.+/){ | |
102 my $sequence = $1; | |
103 $sequence_reads{$sequence} += 1; | |
104 $entries++; | |
105 } | |
106 } | |
107 | |
108 close INPUT; | |
109 | |
110 ############################################ | |
111 ## Sort hash by decreasing read count by # | |
112 ## converting to arrays of values and keys # | |
113 ############################################ | |
114 | |
115 my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads); | |
116 my @vals = @sequence_reads{@keys}; | |
117 | |
118 ################################################################################## | |
119 ## Using both arrays, print to output the FASTA formatted file containing # | |
120 ## the sequence rank, reads, reads per million, and sequence. The extra # | |
121 ## scalars here are used to ensure that sequences with equal read counts # | |
122 ## receive the same rank value, and that the next untied value is properly # | |
123 ## assigned a rank that takes into account the number of tied values before it, # | |
124 ## also known as standard competition ranking. # | |
125 ################################################################################## | |
126 | |
127 my $last_reads_count = $vals[0]; | |
128 my $rank = 1; | |
129 | |
130 for my $i (0..$#keys){ | |
131 my $current_reads_count = $vals[$i]; | |
132 if ($current_reads_count < $last_reads_count){ | |
133 $rank = $i + 1; | |
134 print OUTPUT ">$rank-$vals[$i]-"; | |
135 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000); | |
136 print OUTPUT "\n"; | |
137 print OUTPUT "$keys[$i]\n"; | |
138 | |
139 } | |
140 elsif ($current_reads_count == $last_reads_count){ | |
141 print OUTPUT ">$rank-$vals[$i]-"; | |
142 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000); | |
143 print OUTPUT "\n"; | |
144 print OUTPUT "$keys[$i]\n"; | |
145 } | |
146 $last_reads_count = $vals[$i]; | |
147 | |
148 } | |
149 | |
150 close OUTPUT; | |
151 | |
152 ################################################################################## | |
153 ## Unless the -q option is invoked, the following statistics are printed as # | |
154 ## standard output. # | |
155 ################################################################################## | |
156 | |
157 unless ($quiet){ | |
158 print "\n$entries total sequences. " . ($#keys + 1) . " unique sequences.\n"; | |
159 print "Input file: \"$input_fh\".\n"; | |
160 print "Output file: \"$output_fh\".\n"; | |
161 | |
162 my $duration = time - $start; | |
163 print "Execution time: $duration s.\n"; | |
164 } | |
165 | |
166 exit; |