Next changeset 1:748195d5c4de (2015-02-10) |
Commit message:
Uploaded |
added:
fastaptamer_enrich |
b |
diff -r 000000000000 -r 4ad5a728b517 fastaptamer_enrich --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaptamer_enrich Tue Feb 10 14:48:36 2015 -0500 |
[ |
b'@@ -0,0 +1,441 @@\n+#!/usr/bin/env perl\n+\n+## Last Modified January 19th, 2015 22:24 CST\n+\n+## Citation:\n+## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. \n+## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of \n+## Combinatorial Selections." Molecular Therapy \xe2\x80\x94 Nucleic Acids. 2015.\n+## DOI: 10.1038/mtna.2015.4\n+\n+## Distributed under GNU General Public License v3\n+\n+use Getopt::Long; ## Core Perl module for command line arguments/options\n+\n+###################################################\n+## File Input/Output and command line variables #\n+###################################################\n+\n+my $fileX_fh; ## Variable for input file X\n+my $fileY_fh; ## Variable for input file Y\t\n+my $fileZ_fh; ## Variable for input file Z\n+my $output_fh; ## Variable for output file\n+my $help; ## true/false variable for help screen\n+my $all; ## true/false variable to print only matched sequences\n+my $quiet; ## true/false variable to supress standard output\n+my $filter; ## Variable for reads per million threshold filter\n+my $version; ## true/false variable for version screen\n+\n+ ## Takes command line input for...\n+GetOptions ( "x=s" => \\$fileX_fh, ## ... input file x\n+ "y=s" => \\$fileY_fh, ## ... input file y\n+ "z=s" => \\$fileZ_fh, ## ... input file z\n+ "output=s" => \\$output_fh, ## ... output file\n+ "help" => \\$help, ## ... help screen\n+ "filter=f" => \\$filter, ## ... RPM threshold filter\n+ "version" => \\$version, ## ... display version\n+ "quiet" => \\$quiet); ## ... supress standard output\n+\n+if (defined $help){ ## Prints help screen\n+ print <<"HELP";\n+\t\n+--------------------------------------------------------------------------------\n+ FASTAptamer-Enrich\n+--------------------------------------------------------------------------------\n+\n+Usage: fastaptamer_enrich [-h] [-x INFILE] [-y INFILE] [-z INFILE] [-o OUTFILE] \n+ [-f #] [-q] [-v]\n+\n+ [-h] = Help screen.\n+ [-x INFILE] = First input file from FASTAptamer-Count or \n+ FASTAptamer-Cluster. REQUIRED.\n+ [-y INFILE] = Second input file from FASTAptamer-Count or \n+ FASTAptamer-Cluster. REQUIRED. \n+ *** For two populations only, use -x and -y. ***\n+ [-z INFILE] = Optional third input file from FASTAptamer-Count or \n+ \t\t\t\t FASTAptamer-Cluster.\n+ [-o OUTFILE] = Plain text output file with tab separated values. REQUIRED\n+ [-f] = Optional reads per million threshold filter. \n+ [-q] = Quiet mode. Suppresses standard output of file I/O, \n+ number of matched sequences and execution time.\n+ [-v] = Display version.\n+\n+FASTAptamer-Enrich rapidly calculates fold-enrichment values for each sequence\n+across two or three input files. Output is provided as a tab-delimited plain t-\n+ext file and is formatted to include sequence composition, length, rank, reads, \n+reads per million (RPM), and enrichment values for each sequence. If any files \n+from FASTAptamer-Cluster are provided, output will include cluster information \n+for that population. A threshold filter can be applied to exclude sequences with\n+total reads per million (across all input populations) less than the number ent-\n+ered after the [-f] option. Default behavior is to include all sequences. Enri-\n+chment is calculated by dividing reads per million of y/x (and z/y and z/x, if a \n+third input file is specified).\n+\n+Input for FASTAptamer-Enrich MUST come from FASTAptamer-Count or FASTAptamer-\n+Cluster output files.\n+\n+HELP\n+exit;\n+}\n+\n+if (defined $version){ ## Print version screen if -v is true\n+ print <<"VERSION";\n'..b't";\n+ \t}\n+ \telse {\n+ \t print OUTPUT "\\t";\n+ \t}\n+\t\t\n+ \tif (defined $y_rpm){\n+ \t print OUTPUT $z_rpm / $y_rpm . "\\t";\n+ \t}\n+ \telse {\n+ \t print OUTPUT "\\t";\n+ \t}\n+ \tif (defined $x_rpm){\n+ \t print OUTPUT $z_rpm / $x_rpm . "\\n";\n+ \t}\n+ \telse {\n+ \t print OUTPUT "\\n";\n+ \t}\t\t\n+ \t}\n+\t}\n+}\n+################################################################################\n+for my $sequence_in_y (keys %hash_y){ \n+## Iterate through seqs in y hash (should only contain seqs NOT found in z hash)\n+\t\n+ my $y_match = $hash_y{$sequence_in_y}; \n+## Using sequence key, find corresponding value and store in scoped variable\n+ delete $hash_y{$sequence_in_y}; \n+## Remove the sequence key:value pair from hash\n+ my @y_match_split = split /-/, $y_match; \n+## Split value into rank and reads, store in scoped array\n+ my $seq_length = length $sequence_in_y;\n+ my $y_rpm = $y_match_split[2];\n+ my $x_rpm;\n+ my $total_rpm;\n+ my $x_match;\n+ my @x_match_split;\n+\t\t\n+ if ($hash_x{$sequence_in_y}){ \n+## If sequence key exists for the x hash\n+ $x_match = $hash_x{$sequence_in_y}; \n+## ...find corresponding value and store in scoped variable\n+ delete $hash_x{$sequence_in_y}; \n+## ...remove key:value pair from hash\n+ @x_match_split = split /-/, $x_match; \n+## ...split value into rank and reads, store in scoped array\n+ $x_rpm = $x_match_split[2];\n+ }\n+\t\n+\t$total_rpm = $x_rpm + $y_rpm;\n+ \n+ if ($total_rpm >= $filter){\n+ print OUTPUT "$sequence_in_y\\t$seq_length\\t";\n+ \t\n+ if (defined $x_match){\n+ print OUTPUT "$x_match_split[0]\\t$x_match_split[1]\\t$x_match_split[2]\\t"; \n+ \t\tif (defined $clusters_in_x){\n+ \t\t\tprint OUTPUT "$x_match_split[3]\\t$x_match_split[4]\\t$x_match_split[5]\\t";\n+ \t\t}\n+ }\n+ else {\n+ \tprint OUTPUT "\\t\\t\\t";\n+ \tif (defined $clusters_in_x){\n+ \t\t\tprint OUTPUT "\\t\\t\\t";\n+ \t}\n+ }\n+ \n+ print OUTPUT "$y_match_split[0]\\t$y_match_split[1]\\t$y_match_split[2]\\t";\n+ if (defined $clusters_in_y){\n+ \t\tprint OUTPUT "$y_match_split[3]\\t$y_match_split[4]\\t$y_match_split[5]\\t";\n+ }\n+ \n+ if (defined $x_rpm){\n+ \tprint OUTPUT "\\t\\t\\t" if defined $fileZ_fh;\n+ \tprint OUTPUT "\\t\\t\\t" if defined $clusters_in_z;\n+ \tprint OUTPUT $y_rpm / $x_rpm . "\\n";\n+ \t}\n+ \telse {\n+ \tprint OUTPUT "\\n";\n+ \t}\n+ \n+ }\n+}\n+################################################################################\n+for my $unmatched_x_sequence (keys %hash_x){ \n+## Iterate through all sequences present only in the initial population\n+\t\n+ my $x_match = $hash_x{$unmatched_x_sequence}; \n+## Use sequence key to find corresponding value, assign to \n+ delete $hash_x{$unmatched_x_sequence}; \n+## Remove sequence key:value pair from hash\n+ my @x_match_split = split /-/, $x_match; \n+## Split value into rank and reads, store in scoped array\n+ my $seq_length = length $unmatched_x_sequence;\n+ my $x_rpm = $x_match_split[2];\t\t\t\t\t\t\t\n+ if ($x_rpm >= $filter){\n+ \tprint OUTPUT "$unmatched_x_sequence\\t$seq_length\\t$x_match_split[0]\\t$x_match_split[1]\\t$x_match_split[2]";\n+ \tif (defined $clusters_in_x){\n+ \tprint OUTPUT "\\t$x_match_split[3]\\t$x_match_split[4]\\t$x_match_split[5]\\n";\n+ }\n+ else {\n+ \tprint OUTPUT "\\n";\n+ }\n+\t}\n+}\n+################################################################################\n+\n+close OUTPUT;\n+\n+my $duration = time - $start;\n+\n+unless (defined $quiet){\n+ print "Input file (x): \\"$fileX_fh\\".\\nInput file (y): \\"$fileY_fh\\".\\n";\n+ print "Input file (z): \\"$fileZ_fh\\".\\n" if defined $fileZ_fh;\n+ print "Output file: \\"$output_fh\\".\\n";\n+ print "Execution time: $duration s.\\n";\n+}\n+\n+exit;\n' |