Mercurial > repos > fastaptamer > fastaptamer_enrich

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_enrich	Tue Feb 10 14:48:36 2015 -0500
@@ -0,0 +1,441 @@
+#!/usr/bin/env perl
+
+## Last Modified January 19th, 2015 22:24 CST
+
+## Citation:
+## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
+## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
+## Combinatorial Selections." Molecular Therapy — Nucleic Acids. 2015.
+## DOI: 10.1038/mtna.2015.4
+
+## Distributed under GNU General Public License v3
+
+use Getopt::Long;    ## Core Perl module for command line arguments/options
+
+###################################################
+## File Input/Output and command line variables   #
+###################################################
+
+my $fileX_fh;    ## Variable for input file X
+my $fileY_fh;    ## Variable for input file Y
+my $fileZ_fh;    ## Variable for input file Z
+my $output_fh;   ## Variable for output file
+my $help;        ## true/false variable for help screen
+my $all;         ## true/false variable to print only matched sequences
+my $quiet;       ## true/false variable to supress standard output
+my $filter;      ## Variable for reads per million threshold filter
+my $version;     ## true/false variable for version screen
+
+                                            ## Takes command line input for...
+GetOptions (    "x=s" => \$fileX_fh,        ## ... input file x
+                "y=s" => \$fileY_fh,        ## ... input file y
+                "z=s" => \$fileZ_fh,        ## ... input file z
+                "output=s" => \$output_fh,  ## ... output file
+                "help" => \$help,           ## ... help screen
+                "filter=f" => \$filter,     ## ... RPM threshold filter
+                "version" => \$version,     ## ... display version
+                "quiet" => \$quiet);        ## ... supress standard output
+
+if (defined $help){    ## Prints help screen
+    print <<"HELP";
+
+--------------------------------------------------------------------------------
+                               FASTAptamer-Enrich
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_enrich [-h] [-x INFILE] [-y INFILE] [-z INFILE] [-o OUTFILE]
+                          [-f #] [-q] [-v]
+
+    [-h]            = Help screen.
+    [-x INFILE]     = First input file from FASTAptamer-Count or
+                      FASTAptamer-Cluster. REQUIRED.
+    [-y INFILE]     = Second input file from FASTAptamer-Count or
+                      FASTAptamer-Cluster. REQUIRED.
+                      *** For two populations only, use -x and -y. ***
+    [-z INFILE]     = Optional third input file from FASTAptamer-Count or
+    				  FASTAptamer-Cluster.
+    [-o OUTFILE]    = Plain text output file with tab separated values. REQUIRED
+    [-f]            = Optional reads per million threshold filter.
+    [-q]            = Quiet mode.  Suppresses standard output of file I/O,
+                      number of matched sequences and execution time.
+    [-v]            = Display version.
+
+FASTAptamer-Enrich rapidly calculates fold-enrichment values for each sequence
+across two or three input files.  Output is provided as a tab-delimited plain t-
+ext file and is formatted to include sequence composition, length, rank, reads,
+reads per million (RPM), and enrichment values for each sequence. If any files
+from FASTAptamer-Cluster are provided, output will include cluster information
+for that population. A threshold filter can be applied to exclude sequences with
+total reads per million (across all input populations) less than the number ent-
+ered after the [-f] option.  Default behavior is to include all sequences. Enri-
+chment is calculated by dividing reads per million of y/x (and z/y and z/x, if a
+third input file is specified).
+
+Input for FASTAptamer-Enrich MUST come from FASTAptamer-Count or FASTAptamer-
+Cluster output files.
+
+HELP
+exit;
+}
+
+if (defined $version){                     ## Print version screen if -v is true
+    print <<"VERSION";
+
+FASTAptamer v1.0.2
+
+VERSION
+exit;
+}
+
+######################################
+## Open input files and output files #
+######################################
+
+open (FILE_X, '<', $fileX_fh) or die
+"\nCould not open input file x, or no input file was specified.\n
+See help documentation [-h], README, or User's Guide for program usage.\n";
+
+open (FILE_Y, '<', $fileY_fh) or die
+"\nCould not open input file y, or no input file was specified.\n
+See help documentation [-h], README, or User's Guide for program usage.\n";
+
+if (defined $fileZ_fh){
+    open (FILE_Z, '<', $fileZ_fh) or die "\nCould not open input file z, or no input file was specified.\n
+    See help documentation [-h], README, or User's Guide for program usage.\n";
+}
+
+open (OUTPUT, '>', $output_fh) or die
+"\nCould not open output file or no output file was specified.\n
+See help documentation [-h], README, or User's Guide for program usage.\n";
+
+######################
+## Other variables   #
+######################
+
+$/ = ">";       ## Sets default record separator to > instead of \n
+
+my $start = time;      ## Record start of run-time
+
+my %hash_x;            ## variable for initial input file hash
+my %hash_y;            ## variable for middle input file hash
+my %hash_z;            ## variable for final input file hash
+
+my $entries_x;  ## number of unique sequences in initial input file
+my $entries_y;  ## number of unique sequences in middle input file
+my $entries_z;  ## number of unique sequences in final input file
+
+my $clusters_in_x;  ## True/false variable for clustered file input
+my $clusters_in_y;  ## True/false variable for clustered file input
+my $clusters_in_z;  ## True/false variable for clustered file input
+
+####################################################################
+## Three loops to take input file data and create x, y, and Z hash #
+####################################################################
+
+if (defined $fileZ_fh){
+    while (<FILE_Z>){
+         if ($_ =~ /(\d+-\d+-\d+\.?\d*-\d+-\d+-\d+)\n(\S+)/){
+            my $rank_and_read = $1;
+            my $sequence = $2;
+            $hash_z{$sequence} = $rank_and_read;
+            $entries_z++;
+            $clusters_in_z = 1;  ## Clustered file
+        }
+
+        elsif ($_ =~ /(\d+-\d+-\d+\.?\d*)\n(\S+)/){
+            my $rank_and_read = $1;
+            my $sequence = $2;
+            $hash_z{$sequence} = $rank_and_read;
+            $entries_z++;
+        }
+
+    }
+}
+
+close FILE_Z if defined $fileZ_fh;
+
+while (<FILE_Y>){
+    if ($_ =~ /(\d+-\d+-\d+\.?\d*-\d+-\d+-\d+)\n(\S+)/){
+        my $rank_and_read = $1;
+        my $sequence = $2;
+        $hash_y{$sequence} = $rank_and_read;
+        $entries_y++;
+        $clusters_in_y = 1;  ## Clustered file
+    }
+
+    elsif ($_ =~ /(\d+-\d+-\d+\.?\d*)\n(\S+)/){
+        my $rank_and_read = $1;
+        my $sequence = $2;
+        $hash_y{$sequence} = $rank_and_read;
+        $entries_y++;
+    }
+
+}
+
+close FILE_Y;
+
+while (<FILE_X>){
+    if ($_ =~ /(\d+-\d+-\d+\.?\d*-\d+-\d+-\d+)\n(\S+)/){
+        my $rank_and_read = $1;
+        my $sequence = $2;
+        $hash_x{$sequence} = $rank_and_read;
+        $entries_x++;
+        $clusters_in_x = 1;  ## Clustered file
+    }
+
+    elsif ($_ =~ /(\d+-\d+-\d+\.?\d*)\n(\S+)/){
+        my $rank_and_read = $1;
+        my $sequence = $2;
+        $hash_x{$sequence} = $rank_and_read;
+        $entries_x++;
+    }
+
+}
+
+close FILE_X;
+
+unless (defined $quiet){
+    print "\n$entries_x sequences in \"$fileX_fh\".\n";
+    print "$entries_y sequences in \"$fileY_fh\".\n";
+    print "$entries_z sequences in \"$fileZ_fh\".\n\n" if defined $fileZ_fh;
+}
+
+################################################################################
+## The section below creates the headers for the output file by testing which  #
+## files were clustered and printing accordingly.                              #
+################################################################################
+
+
+if (defined $fileZ_fh){
+    print OUTPUT "Sequence\tLength\tRank (x)\tReads (x)\tRPM (x)\t";
+    if (defined $clusters_in_x){
+        print OUTPUT "Cluster (x)\tRank in Cluster (x)\tEdit Distance (x)\t";
+    }
+    print OUTPUT "Rank (y)\tReads (y)\tRPM (y)\t";
+    if (defined $clusters_in_y){
+        print OUTPUT "Cluster (y)\tRank in Cluster (y)\tEdit Distance (y)\t";
+    }
+    print OUTPUT "Rank (z)\tReads (z)\tRPM (z)\t";
+    if (defined $clusters_in_z){
+        print OUTPUT "Cluster (z)\tRank in Cluster (z)\tEdit Distance (z)\t";
+    }
+    print OUTPUT "Enrichment (y/x)\tEnrichment (z/y)\tEnrichment (z/x)\n";
+}
+else {
+    print OUTPUT "Sequence\tLength\tRank (x)\tReads (x)\tRPM (x)\t";
+    if (defined $clusters_in_x){
+        print OUTPUT "Cluster (x)\tRank in Cluster (x)\tEdit Distance (x)\t";
+    }
+    print OUTPUT "Rank (y)\tReads (y)\tRPM (y)\t";
+    if (defined $clusters_in_y){
+        print OUTPUT "Cluster (y)\tRank in Cluster (y)\tEdit Distance (y)\t";
+    }
+    print OUTPUT "Enrichment (y/x)\n";
+}
+
+################################################################################
+## The next few blocks simply find sequence matches across hashes by testing   #
+## to see if the "key" or sequence exists in the prior hash.  If it does exist #
+## then the key and value pair for the hash are deleted.  It starts with the   #
+## Z hash to test for keys in Y then X, and then moves to the Y hash to test   #
+## for keys left over in X.                                                    #
+################################################################################
+
+if (defined $fileZ_fh){
+    for my $sequence_in_z (keys %hash_z){
+## Iterate through all sequences in hash z
+        my $z_match = $hash_z{$sequence_in_z};
+## For sequence key, find corresponding values & store in a new scoped variable
+        delete $hash_z{$sequence_in_z};
+## Remove key:value pair from hash
+        my @z_match_split = split /-/, $z_match;
+## Split value into rank, reads and RPM... then place into array
+        my $seq_length = length $sequence_in_z;
+        my $z_rpm = $z_match_split[2];
+        my $y_rpm;
+        my $x_rpm;
+        my $total_rpm;
+        my $x_match;
+        my @x_match_split;
+        my $y_match;
+        my @y_match_split;
+
+###############
+        if ($hash_x{$sequence_in_z}){
+## If sequence key returns a value in the x hash
+            $x_match = $hash_x{$sequence_in_z};
+## ...find corresponding value and store in a new scoped variable
+            delete $hash_x{$sequence_in_z};
+## ...remove key:value pair from hash
+            @x_match_split = split /-/, $x_match;
+## ...split value into rank and reads, place into scoped array
+            $x_rpm = $x_match_split[2];
+        }
+###############
+        if ($hash_y{$sequence_in_z}){
+## If sequence key returns a value in the y hash
+            $y_match = $hash_y{$sequence_in_z};
+## ...find corresponding value and store in a new scoped variable
+            delete $hash_y{$sequence_in_z};
+## ...remove key:value pair from hash
+            @y_match_split = split /-/, $y_match;
+## ...split value into rank and reads, place into array
+            $y_rpm = $y_match_split[2];
+        }
+###############
+        $total_rpm = $x_rpm + $y_rpm + $z_rpm;
+
+        if ($total_rpm >= $filter){  ## If filter is defined, print only if total RPM is greater
+        	print OUTPUT "$sequence_in_z\t$seq_length\t";
+
+            if (defined $x_match){
+            	print OUTPUT "$x_match_split[0]\t$x_match_split[1]\t$x_match_split[2]\t";
+        			if (defined $clusters_in_x){
+        				print OUTPUT "$x_match_split[3]\t$x_match_split[4]\t$x_match_split[5]\t";
+        			}
+        	}
+        	else {
+        		print OUTPUT "\t\t\t";
+        		if (defined $clusters_in_x){
+        				print OUTPUT "\t\t\t";
+        		}
+        	}
+
+			 if (defined $y_match){
+            	print OUTPUT "$y_match_split[0]\t$y_match_split[1]\t$y_match_split[2]\t";
+        			if (defined $clusters_in_y){
+        				print OUTPUT "$y_match_split[3]\t$y_match_split[4]\t$y_match_split[5]\t";
+        			}
+        	}
+        	else {
+        		print OUTPUT "\t\t\t";
+        		if (defined $clusters_in_y){
+        				print OUTPUT "\t\t\t";
+        		}
+        	}
+
+        	print OUTPUT "$z_match_split[0]\t$z_match_split[1]\t$z_match_split[2]\t";
+        	if (defined $clusters_in_z){
+        		print OUTPUT "$z_match_split[3]\t$z_match_split[4]\t$z_match_split[5]\t";
+        	}
+
+        	if (defined $y_rpm && defined $x_rpm){
+        	    print OUTPUT $y_rpm / $x_rpm . "\t";
+        	}
+        	else {
+        	    print OUTPUT "\t";
+        	}
+
+        	if (defined $y_rpm){
+        	    print OUTPUT $z_rpm / $y_rpm . "\t";
+        	}
+        	else {
+        	    print OUTPUT "\t";
+        	}
+        	if (defined $x_rpm){
+        	    print OUTPUT $z_rpm / $x_rpm . "\n";
+        	}
+        	else {
+        	    print OUTPUT "\n";
+        	}
+    	}
+	}
+}
+################################################################################
+for my $sequence_in_y (keys %hash_y){
+## Iterate through seqs in y hash (should only contain seqs NOT found in z hash)
+
+    my $y_match = $hash_y{$sequence_in_y};
+## Using sequence key, find corresponding value and store in scoped variable
+    delete $hash_y{$sequence_in_y};
+## Remove the sequence key:value pair from hash
+    my @y_match_split = split /-/, $y_match;
+## Split value into rank and reads, store in scoped array
+    my $seq_length = length $sequence_in_y;
+    my $y_rpm = $y_match_split[2];
+    my $x_rpm;
+    my $total_rpm;
+    my $x_match;
+    my @x_match_split;
+
+    if ($hash_x{$sequence_in_y}){
+## If sequence key exists for the x hash
+        $x_match = $hash_x{$sequence_in_y};
+## ...find corresponding value and store in scoped variable
+        delete $hash_x{$sequence_in_y};
+## ...remove key:value pair from hash
+        @x_match_split = split /-/, $x_match;
+## ...split value into rank and reads, store in scoped array
+        $x_rpm = $x_match_split[2];
+    }
+
+	$total_rpm = $x_rpm + $y_rpm;
+
+    if ($total_rpm >= $filter){
+        print OUTPUT "$sequence_in_y\t$seq_length\t";
+
+        if (defined $x_match){
+            print OUTPUT "$x_match_split[0]\t$x_match_split[1]\t$x_match_split[2]\t";
+        		if (defined $clusters_in_x){
+        			print OUTPUT "$x_match_split[3]\t$x_match_split[4]\t$x_match_split[5]\t";
+        		}
+        }
+        else {
+        	print OUTPUT "\t\t\t";
+        	if (defined $clusters_in_x){
+        			print OUTPUT "\t\t\t";
+        	}
+        }
+
+        print OUTPUT "$y_match_split[0]\t$y_match_split[1]\t$y_match_split[2]\t";
+        if (defined $clusters_in_y){
+        		print OUTPUT "$y_match_split[3]\t$y_match_split[4]\t$y_match_split[5]\t";
+        }
+
+        if (defined $x_rpm){
+        	print OUTPUT "\t\t\t" if defined $fileZ_fh;
+        	print OUTPUT "\t\t\t" if defined $clusters_in_z;
+        	print OUTPUT $y_rpm / $x_rpm . "\n";
+    	}
+    	else {
+        	print OUTPUT "\n";
+    	}
+
+    }
+}
+################################################################################
+for my $unmatched_x_sequence (keys %hash_x){
+## Iterate through all sequences present only in the initial population
+
+    my $x_match = $hash_x{$unmatched_x_sequence};
+## Use sequence key to find corresponding value, assign to
+    delete $hash_x{$unmatched_x_sequence};
+## Remove sequence key:value pair from hash
+    my @x_match_split = split /-/, $x_match;
+## Split value into rank and reads, store in scoped array
+    my $seq_length = length $unmatched_x_sequence;
+    my $x_rpm = $x_match_split[2];
+    if ($x_rpm >= $filter){
+    	print OUTPUT "$unmatched_x_sequence\t$seq_length\t$x_match_split[0]\t$x_match_split[1]\t$x_match_split[2]";
+    	if (defined $clusters_in_x){
+        	print OUTPUT "\t$x_match_split[3]\t$x_match_split[4]\t$x_match_split[5]\n";
+        }
+        else {
+        	print OUTPUT "\n";
+        }
+	}
+}
+################################################################################
+
+close OUTPUT;
+
+my $duration = time - $start;
+
+unless (defined $quiet){
+    print "Input file (x): \"$fileX_fh\".\nInput file (y): \"$fileY_fh\".\n";
+    print "Input file (z): \"$fileZ_fh\".\n" if defined $fileZ_fh;
+    print "Output file: \"$output_fh\".\n";
+    print "Execution time: $duration s.\n";
+}
+
+exit;