Mercurial > repos > rachel-929292 > extract_fragment_counts_for_chromosomes

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script_2_GC_correct_bdp-qad.pl	Tue Oct 18 09:18:24 2016 -0400
@@ -0,0 +1,145 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use Data::Dumper;
+
+### Open input and create output files ###
+open(TEST_LOG, ">$ARGV[1]_GC_bias") or die "Can't open test log!!\n";
+open(INPUT, "$ARGV[0]") or die "Can't open input!!\n";
+open(OUTPUT, ">$ARGV[1]") or die "Can't create output!!\n";
+open (TABLE, "$ARGV[2]") or die "Cannot open reference table";
+
+
+### Check input and output files have been provided ###
+if (!$ARGV[0] or !$ARGV[1]) {
+    die "Incorrect number of parameters provided\n Usage: perl <script> <input> <output>\n";
+}
+
+### Intialise counts and params
+my $total_read_count = 0;
+my $total_mapped = 0;
+my @bin_counts = ();		# Or maybe use a hash? "start_chr" as hash key?
+my %bin_hash = ();
+my %alt_hash = ();		# Alternative hash. Holds GC content values
+my $ROI_limit = 20000;
+
+my %GC_corr_vals = ();		# average counts per GC content
+my %read_chr_counts = ();		# GC content step
+my @chromes = qw(chr21 chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr22 chrX chrY chrM);
+
+### Extra params for Fan correction
+my $total_good_reads = 0;
+my $total_good_bins = 0;
+
+### Populate the hash
+#open(TABLE, "/devt/qad/data/support/gc_content_correction/ref_table_GC_hg19_20k_v2") or die "Can't open table\n";
+
+while (my $strip = <TABLE>) {
+    chomp $strip;
+
+    my @words = split(" ", $strip);
+    my $key = join("_", $words[0], $words[1]);
+    $bin_hash{$words[1]}{$words[0]} = 0;
+
+    ### Alternative hash containing GC content info
+    $alt_hash{$words[1]}{$words[0]} = [0, $words[2]];
+}
+close(TABLE);
+
+### Loop through each line of input SAM file
+while (my $line = <INPUT>) {
+    chomp $line;
+
+    ### Split the line into comparable components
+    my @line_words = split(" ", $line);
+    my @line_chars = split("", $line);
+    #my @qname_words = split("_", $line_words[0]);
+
+    ### Skip lines of the header section
+    if ($line_chars[0] eq "@") {
+        next;
+    }
+
+    $total_read_count++;
+
+    ### Testing
+    #print "$total_read_count\n";
+
+    ### Extract alignment values
+    my $aln_chr = $line_words[2];
+    my $aln_pos = $line_words[3];
+
+
+    if ($line !~ m/\sXS:i:\d+/ and $line =~ m/\sAS:i:\d+/) {
+        $total_mapped++;
+        my $bin_id = int($aln_pos / $ROI_limit) + 1;
+        $bin_hash{$aln_chr}{$bin_id}++;
+
+        $alt_hash{$aln_chr}{$bin_id}[0]++;
+    }
+}
+
+### Filter the bins. Remove bins with no counts
+while (my ($key, $value) = each %alt_hash) {
+    while (my ($inner_key, $inner_value) = each %$value) {
+        if ($alt_hash{$key}{$inner_key}[0] == 0 ) {
+            delete $alt_hash{$key}{$inner_key};
+        }
+        ### Calculate the average count per GC content
+        else {
+            my $GC_short_val = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]);
+            $GC_corr_vals{$GC_short_val}[0] += $alt_hash{$key}{$inner_key}[0];	# Total No of reads
+            $GC_corr_vals{$GC_short_val}[1]++;			# Total of GC regions with this %
+
+            ### Fan correction
+            $total_good_bins++;
+            $total_good_reads += $alt_hash{$key}{$inner_key}[0];
+        }
+    }
+}
+
+### Fan correction
+my $global_mean = $total_good_reads / $total_good_bins;
+
+### Apply the correction and tally up the counts within each chr
+while (my ($key, $value) = each %alt_hash) {
+    while (my ($inner_key, $inner_value) = each %$value) {
+        my $GC = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]);
+        my $mean = $GC_corr_vals{$GC}[0] / $GC_corr_vals{$GC}[1];
+        #my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($alt_hash{$key}{$inner_key}[0] / $mean);
+
+        ### Fan alternative correction
+        my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($global_mean / $mean);
+
+        $read_chr_counts{$key} += $corr_count;
+    }
+}
+
+### Print the metrics into output file
+printf OUTPUT "Total reads: $total_read_count\n";
+printf OUTPUT "Total mapped reads: $total_mapped\n";
+printf OUTPUT "Total good reads(testing): $total_good_reads\n";
+#while ((my $key, my $value) = each %read_chr_counts) {
+#    printf OUTPUT "$key\t$value\n";
+#}
+foreach my $chrom (@chromes) {
+    if (exists $read_chr_counts{$chrom}) {
+        printf OUTPUT "$chrom\t$read_chr_counts{$chrom}\n";
+    }
+    else {
+        printf OUTPUT "$chrom\t0\n";
+    }
+}
+
+### Printing extra data. Reads per GC content step
+foreach my $val (sort {$a<=>$b} keys %GC_corr_vals) {
+    my $mean = $GC_corr_vals{$val}[0] / $GC_corr_vals{$val}[1];
+    printf TEST_LOG "$val\t$mean\n";
+}
+
+### close the filehandles and exit ###
+close(TEST_LOG);
+close(INPUT);
+close(OUTPUT);
+#die "... end of the quality measure script\n";