view singleFitness.pl @ 0:d3227d69c228 draft

Uploaded
author antmarge
date Sat, 18 Mar 2017 16:23:05 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl -w

#Margaret Antonio updated 16.10.03

use strict;
use Getopt::Long;
use warnings;
use Bio::SeqIO;

#AVAILABLE OPTIONS. WILL PRINT UPON ERROR
sub print_usage() {
	print "\n####################################################################\n";
    print "singleVal: creates a wig file with position and insertion count OR fitness\n";
    print "\nDESCRIPTION: ";
    print "Integrates multiple files of transposon insertion data and outputs\n";
    print "aggregate fitness within a sliding window (specified by size and step).\n";
    
    print "\nUSAGE:\n";
    print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n";
    
    print "\nREQUIRED:\n";
    print " -d\tDirectory containing all input files (files from\n";
    print "   \taggregate script)\n";
    print "   \tOR\n";
    print "   \tIn the command line (without a flag), input the name(s) of\n";
    print "   \tfiles containing gene fitness values (output of calcFit). \n\n";
    print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n";
    print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n";
    print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n";
    print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n";
    
    print "OPTIONAL:\n";
    print " -h\tPrints usage and exits program\n";
    print " -o\tOutput file for comparison data. Default: singleVal.wig\n";
    print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n";
    print "   \tDefault: fit for fitness\n";
    print " -n\tName of the reference genome, to be included in the wig header\n";
    print "   \tDefault: genome\n";

    print " \n~~~~Always check that file paths are correctly specified~~~~\n";
    print "\n##################################################################\n";
    
}


#ASSIGN INPUTS TO VARIABLES
our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling);
GetOptions(
'h'=>\$help,
'o:s' =>\$out,
'c:i' =>\$cutoff,
'b:i' =>\$blank_pc,
'l:i' =>\$weight_ceiling,
);

sub get_time() {
    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
    return "$hour:$min:$sec";
}

if ($help){
    print_usage();
    exit;
}
if (!$indir and (scalar @ARGV==0)){
	print "\nERROR: Please correctly specify input files or directory\n";
    print_usage();
	print "\n";
	exit;
}

#ADDED BY MLA allows input to be directory---good for inputting L1-L6
my @files=@ARGV;


#SET DEFAULTS
if (!$cutoff){$cutoff=10}
if (!$blank_pc){$blank_pc=0}
if (!$weight_ceiling){$weight_ceiling=50}
if (!$out){$out="singleVal.csv"}

# Returns mean, variance, sd, se
sub average {
    my $scoreref = shift @_;
    my @scores = @{$scoreref};
    
    my $sum=0;
    my $num=0;
    
    # Get the average.
    foreach my $w (@scores) {
        $sum += $w;
        $num++;
    }
    my $average= $sum/$num;
    my $xminusxbars = 0;
    
    # Get the variance.
    foreach my $w (@scores) {
        $xminusxbars += ($w-$average)**2;
    }
    my $variance = (1/($num-1)) * $xminusxbars;
    my $sd = sqrt($variance);
    my $se = $sd / sqrt($num);
    
    return ($average, $variance, $sd, $se);
    
}

# Takes two parameters, both hashrefs to lists.
# 1) hashref to list of scores
# 2) hashref to list of weights, equal in length to the scores.
sub weighted_average {
    
    my $scoreref = shift @_;
    my $weightref = shift @_;
    my @scores = @{$scoreref};
    my @weights = @{$weightref};
    
    my $sum=0;
    my ($weighted_average, $weighted_variance)=(0,0);
    my $v2;
    
    # Go through once to get total, calculate V2.
    for (my $i=0; $i<@weights; $i++) {
        $sum += $weights[$i];
        $v2 += $weights[$i]**2;
    }
    if ($sum == 0) { return 0; } # No scores given?
    
    my $scor = join (' ', @scores);
    my $wght = join (' ', @weights);
    
    # Now calculated weighted average.
    my ($top, $bottom) = (0,0);
    for (my $i=0; $i<@weights; $i++) {
        $top += $weights[$i] * $scores[$i];
        $bottom += $weights[$i];
    }
    $weighted_average = $top/$bottom;
    #print "WA: $weighted_average\n";
    
    ($top, $bottom) = (0,0);
    # Now calculate weighted sample variance.
    for (my $i=0; $i<@weights; $i++) {
        $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
        $bottom += $weights[$i];
    }
    $weighted_variance = $top/$bottom;
    
    my $weighted_stdev = sqrt($weighted_variance);
    my $weighted_stder = $weighted_stdev / sqrt(@scores);  # / length scores.
    
    return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
}

my %pos_summary;


foreach my $filename (@files) {
    print "\n",$filename,"\n";
    open IN, $filename;
    my %hash;
    while (my $line = <IN>) {
        chomp $line;
        my @lines = split(/,/,$line);
        my $pos = $lines[0];
        my $w = $lines[12];
        if ($w and $w eq 'nW') {next;}
        if (!$w) { $w = 0 } # For blanks
        my $c1 = $lines[2];
        my $c2 = $lines[3];
        my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)?
        if ($avg < $cutoff) { next; } # Skip cutoff genes.
        if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight.
        
        my @empty;
        if (!$pos_summary{$pos}) {
            $pos_summary{$pos}{w} = [@empty];
            $pos_summary{$pos}{s} = [@empty];
        }
        $pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w];  # List of Fitness scores.
        $pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores.
    }
    close IN;
    
}

open SUMMARY, ">",$out;


    print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n";
    # Now print out summary stats.
    foreach my $key (sort {$a<=>$b} keys %pos_summary) {
        if (!$key) {next}
        my $sum=0;
        my $num=0;
        my $avgsum = 0;
        
        # Get the average.
        foreach my $w (@{$pos_summary{$key}{w}}) {
            $sum += $w;
            $num++;
        }
        my $average= $sum/$num;
        my $xminusxbars = 0;
        
        # Get the standard deviation.
        foreach my $w (@{$pos_summary{$key}{w}}) {
            $xminusxbars += ($w-$average)**2;
        }
        my ($sd, $se) = ('','');
        if ($num > 1) {
            $sd = sqrt($xminusxbars/($num-1));
            $se = $sd / sqrt($num);
        }
        
        print SUMMARY "$key,$average,$num,$sd,$se\n";

    }


close SUMMARY;