Mercurial > repos > rachel-929292 > extract_fragment_counts_for_chromosomes
changeset 0:b255b5ea9708 draft
Uploaded
author | rachel-929292 |
---|---|
date | Tue, 18 Oct 2016 09:18:24 -0400 |
parents | |
children | e1803849ab52 |
files | script_2_GC_correct_bdp-qad.pl |
diffstat | 1 files changed, 145 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script_2_GC_correct_bdp-qad.pl Tue Oct 18 09:18:24 2016 -0400 @@ -0,0 +1,145 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Data::Dumper; + +### Open input and create output files ### +open(TEST_LOG, ">$ARGV[1]_GC_bias") or die "Can't open test log!!\n"; +open(INPUT, "$ARGV[0]") or die "Can't open input!!\n"; +open(OUTPUT, ">$ARGV[1]") or die "Can't create output!!\n"; +open (TABLE, "$ARGV[2]") or die "Cannot open reference table"; + + +### Check input and output files have been provided ### +if (!$ARGV[0] or !$ARGV[1]) { + die "Incorrect number of parameters provided\n Usage: perl <script> <input> <output>\n"; +} + +### Intialise counts and params +my $total_read_count = 0; +my $total_mapped = 0; +my @bin_counts = (); # Or maybe use a hash? "start_chr" as hash key? +my %bin_hash = (); +my %alt_hash = (); # Alternative hash. Holds GC content values +my $ROI_limit = 20000; + +my %GC_corr_vals = (); # average counts per GC content +my %read_chr_counts = (); # GC content step +my @chromes = qw(chr21 chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr22 chrX chrY chrM); + +### Extra params for Fan correction +my $total_good_reads = 0; +my $total_good_bins = 0; + +### Populate the hash +#open(TABLE, "/devt/qad/data/support/gc_content_correction/ref_table_GC_hg19_20k_v2") or die "Can't open table\n"; + +while (my $strip = <TABLE>) { + chomp $strip; + + my @words = split(" ", $strip); + my $key = join("_", $words[0], $words[1]); + $bin_hash{$words[1]}{$words[0]} = 0; + + ### Alternative hash containing GC content info + $alt_hash{$words[1]}{$words[0]} = [0, $words[2]]; +} +close(TABLE); + +### Loop through each line of input SAM file +while (my $line = <INPUT>) { + chomp $line; + + ### Split the line into comparable components + my @line_words = split(" ", $line); + my @line_chars = split("", $line); + #my @qname_words = split("_", $line_words[0]); + + ### Skip lines of the header section + if ($line_chars[0] eq "@") { + next; + } + + $total_read_count++; + + ### Testing + #print "$total_read_count\n"; + + ### Extract alignment values + my $aln_chr = $line_words[2]; + my $aln_pos = $line_words[3]; + + + if ($line !~ m/\sXS:i:\d+/ and $line =~ m/\sAS:i:\d+/) { + $total_mapped++; + my $bin_id = int($aln_pos / $ROI_limit) + 1; + $bin_hash{$aln_chr}{$bin_id}++; + + $alt_hash{$aln_chr}{$bin_id}[0]++; + } +} + +### Filter the bins. Remove bins with no counts +while (my ($key, $value) = each %alt_hash) { + while (my ($inner_key, $inner_value) = each %$value) { + if ($alt_hash{$key}{$inner_key}[0] == 0 ) { + delete $alt_hash{$key}{$inner_key}; + } + ### Calculate the average count per GC content + else { + my $GC_short_val = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]); + $GC_corr_vals{$GC_short_val}[0] += $alt_hash{$key}{$inner_key}[0]; # Total No of reads + $GC_corr_vals{$GC_short_val}[1]++; # Total of GC regions with this % + + ### Fan correction + $total_good_bins++; + $total_good_reads += $alt_hash{$key}{$inner_key}[0]; + } + } +} + +### Fan correction +my $global_mean = $total_good_reads / $total_good_bins; + +### Apply the correction and tally up the counts within each chr +while (my ($key, $value) = each %alt_hash) { + while (my ($inner_key, $inner_value) = each %$value) { + my $GC = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]); + my $mean = $GC_corr_vals{$GC}[0] / $GC_corr_vals{$GC}[1]; + #my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($alt_hash{$key}{$inner_key}[0] / $mean); + + ### Fan alternative correction + my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($global_mean / $mean); + + $read_chr_counts{$key} += $corr_count; + } +} + +### Print the metrics into output file +printf OUTPUT "Total reads: $total_read_count\n"; +printf OUTPUT "Total mapped reads: $total_mapped\n"; +printf OUTPUT "Total good reads(testing): $total_good_reads\n"; +#while ((my $key, my $value) = each %read_chr_counts) { +# printf OUTPUT "$key\t$value\n"; +#} +foreach my $chrom (@chromes) { + if (exists $read_chr_counts{$chrom}) { + printf OUTPUT "$chrom\t$read_chr_counts{$chrom}\n"; + } + else { + printf OUTPUT "$chrom\t0\n"; + } +} + +### Printing extra data. Reads per GC content step +foreach my $val (sort {$a<=>$b} keys %GC_corr_vals) { + my $mean = $GC_corr_vals{$val}[0] / $GC_corr_vals{$val}[1]; + printf TEST_LOG "$val\t$mean\n"; +} + +### close the filehandles and exit ### +close(TEST_LOG); +close(INPUT); +close(OUTPUT); +#die "... end of the quality measure script\n";