Mercurial > repos > rachel-929292 > extract_fragment_counts_for_chromosomes
changeset 2:910e6aeab6af draft default tip
Deleted selected files
author | rachel-929292 |
---|---|
date | Wed, 19 Oct 2016 08:47:14 -0400 |
parents | e1803849ab52 |
children | |
files | script_2_GC_correct_bdp-qad.pl script_2_GC_correct_bdp-qad.xml |
diffstat | 2 files changed, 0 insertions(+), 171 deletions(-) [+] |
line wrap: on
line diff
--- a/script_2_GC_correct_bdp-qad.pl Tue Oct 18 09:18:48 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Data::Dumper; - -### Open input and create output files ### -open(TEST_LOG, ">$ARGV[1]_GC_bias") or die "Can't open test log!!\n"; -open(INPUT, "$ARGV[0]") or die "Can't open input!!\n"; -open(OUTPUT, ">$ARGV[1]") or die "Can't create output!!\n"; -open (TABLE, "$ARGV[2]") or die "Cannot open reference table"; - - -### Check input and output files have been provided ### -if (!$ARGV[0] or !$ARGV[1]) { - die "Incorrect number of parameters provided\n Usage: perl <script> <input> <output>\n"; -} - -### Intialise counts and params -my $total_read_count = 0; -my $total_mapped = 0; -my @bin_counts = (); # Or maybe use a hash? "start_chr" as hash key? -my %bin_hash = (); -my %alt_hash = (); # Alternative hash. Holds GC content values -my $ROI_limit = 20000; - -my %GC_corr_vals = (); # average counts per GC content -my %read_chr_counts = (); # GC content step -my @chromes = qw(chr21 chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr22 chrX chrY chrM); - -### Extra params for Fan correction -my $total_good_reads = 0; -my $total_good_bins = 0; - -### Populate the hash -#open(TABLE, "/devt/qad/data/support/gc_content_correction/ref_table_GC_hg19_20k_v2") or die "Can't open table\n"; - -while (my $strip = <TABLE>) { - chomp $strip; - - my @words = split(" ", $strip); - my $key = join("_", $words[0], $words[1]); - $bin_hash{$words[1]}{$words[0]} = 0; - - ### Alternative hash containing GC content info - $alt_hash{$words[1]}{$words[0]} = [0, $words[2]]; -} -close(TABLE); - -### Loop through each line of input SAM file -while (my $line = <INPUT>) { - chomp $line; - - ### Split the line into comparable components - my @line_words = split(" ", $line); - my @line_chars = split("", $line); - #my @qname_words = split("_", $line_words[0]); - - ### Skip lines of the header section - if ($line_chars[0] eq "@") { - next; - } - - $total_read_count++; - - ### Testing - #print "$total_read_count\n"; - - ### Extract alignment values - my $aln_chr = $line_words[2]; - my $aln_pos = $line_words[3]; - - - if ($line !~ m/\sXS:i:\d+/ and $line =~ m/\sAS:i:\d+/) { - $total_mapped++; - my $bin_id = int($aln_pos / $ROI_limit) + 1; - $bin_hash{$aln_chr}{$bin_id}++; - - $alt_hash{$aln_chr}{$bin_id}[0]++; - } -} - -### Filter the bins. Remove bins with no counts -while (my ($key, $value) = each %alt_hash) { - while (my ($inner_key, $inner_value) = each %$value) { - if ($alt_hash{$key}{$inner_key}[0] == 0 ) { - delete $alt_hash{$key}{$inner_key}; - } - ### Calculate the average count per GC content - else { - my $GC_short_val = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]); - $GC_corr_vals{$GC_short_val}[0] += $alt_hash{$key}{$inner_key}[0]; # Total No of reads - $GC_corr_vals{$GC_short_val}[1]++; # Total of GC regions with this % - - ### Fan correction - $total_good_bins++; - $total_good_reads += $alt_hash{$key}{$inner_key}[0]; - } - } -} - -### Fan correction -my $global_mean = $total_good_reads / $total_good_bins; - -### Apply the correction and tally up the counts within each chr -while (my ($key, $value) = each %alt_hash) { - while (my ($inner_key, $inner_value) = each %$value) { - my $GC = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]); - my $mean = $GC_corr_vals{$GC}[0] / $GC_corr_vals{$GC}[1]; - #my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($alt_hash{$key}{$inner_key}[0] / $mean); - - ### Fan alternative correction - my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($global_mean / $mean); - - $read_chr_counts{$key} += $corr_count; - } -} - -### Print the metrics into output file -printf OUTPUT "Total reads: $total_read_count\n"; -printf OUTPUT "Total mapped reads: $total_mapped\n"; -printf OUTPUT "Total good reads(testing): $total_good_reads\n"; -#while ((my $key, my $value) = each %read_chr_counts) { -# printf OUTPUT "$key\t$value\n"; -#} -foreach my $chrom (@chromes) { - if (exists $read_chr_counts{$chrom}) { - printf OUTPUT "$chrom\t$read_chr_counts{$chrom}\n"; - } - else { - printf OUTPUT "$chrom\t0\n"; - } -} - -### Printing extra data. Reads per GC content step -foreach my $val (sort {$a<=>$b} keys %GC_corr_vals) { - my $mean = $GC_corr_vals{$val}[0] / $GC_corr_vals{$val}[1]; - printf TEST_LOG "$val\t$mean\n"; -} - -### close the filehandles and exit ### -close(TEST_LOG); -close(INPUT); -close(OUTPUT); -#die "... end of the quality measure script\n";
--- a/script_2_GC_correct_bdp-qad.xml Tue Oct 18 09:18:48 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ - -<tool id="script_2_GC_correct_bdp-qad" name="GC Corrected Chromosome Counter" version="0.1.0"> - <description>for counting chromosomes</description> - <command interpreter="perl">script_2_GC_correct_bdp-qad.pl $input $output $input2 </command> - <inputs> - <param format="sam" name="input" type="data" label="Source file"/> - <param format="txt" name="input2" type="data" label="Source file"/> - </inputs> - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <tests> - <test> - <param name= "input" value="1.sam"/> - <output name= "output" file="output"/> - </test> - </tests> - - - <help> - This tool aligns reads against Chromosomes - </help> - -</tool> -