Mercurial > repos > rachel-929292 > extract_fragment_counts_for_chromosomes

--- a/script_2_GC_correct_bdp-qad.pl	Tue Oct 18 09:18:48 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,145 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-use Data::Dumper;
-
-### Open input and create output files ###
-open(TEST_LOG, ">$ARGV[1]_GC_bias") or die "Can't open test log!!\n";
-open(INPUT, "$ARGV[0]") or die "Can't open input!!\n";
-open(OUTPUT, ">$ARGV[1]") or die "Can't create output!!\n";
-open (TABLE, "$ARGV[2]") or die "Cannot open reference table";
-
-
-### Check input and output files have been provided ###
-if (!$ARGV[0] or !$ARGV[1]) {
-    die "Incorrect number of parameters provided\n Usage: perl <script> <input> <output>\n";
-}
-
-### Intialise counts and params
-my $total_read_count = 0;
-my $total_mapped = 0;
-my @bin_counts = ();		# Or maybe use a hash? "start_chr" as hash key?
-my %bin_hash = ();
-my %alt_hash = ();		# Alternative hash. Holds GC content values
-my $ROI_limit = 20000;
-
-my %GC_corr_vals = ();		# average counts per GC content
-my %read_chr_counts = ();		# GC content step
-my @chromes = qw(chr21 chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr22 chrX chrY chrM);
-
-### Extra params for Fan correction
-my $total_good_reads = 0;
-my $total_good_bins = 0;
-
-### Populate the hash
-#open(TABLE, "/devt/qad/data/support/gc_content_correction/ref_table_GC_hg19_20k_v2") or die "Can't open table\n";
-
-while (my $strip = <TABLE>) {
-    chomp $strip;
-
-    my @words = split(" ", $strip);
-    my $key = join("_", $words[0], $words[1]);
-    $bin_hash{$words[1]}{$words[0]} = 0;
-
-    ### Alternative hash containing GC content info
-    $alt_hash{$words[1]}{$words[0]} = [0, $words[2]];
-}
-close(TABLE);
-
-### Loop through each line of input SAM file
-while (my $line = <INPUT>) {
-    chomp $line;
-
-    ### Split the line into comparable components
-    my @line_words = split(" ", $line);
-    my @line_chars = split("", $line);
-    #my @qname_words = split("_", $line_words[0]);
-
-    ### Skip lines of the header section
-    if ($line_chars[0] eq "@") {
-        next;
-    }
-
-    $total_read_count++;
-
-    ### Testing
-    #print "$total_read_count\n";
-
-    ### Extract alignment values
-    my $aln_chr = $line_words[2];
-    my $aln_pos = $line_words[3];
-
-
-    if ($line !~ m/\sXS:i:\d+/ and $line =~ m/\sAS:i:\d+/) {
-        $total_mapped++;
-        my $bin_id = int($aln_pos / $ROI_limit) + 1;
-        $bin_hash{$aln_chr}{$bin_id}++;
-
-        $alt_hash{$aln_chr}{$bin_id}[0]++;
-    }
-}
-
-### Filter the bins. Remove bins with no counts
-while (my ($key, $value) = each %alt_hash) {
-    while (my ($inner_key, $inner_value) = each %$value) {
-        if ($alt_hash{$key}{$inner_key}[0] == 0 ) {
-            delete $alt_hash{$key}{$inner_key};
-        }
-        ### Calculate the average count per GC content
-        else {
-            my $GC_short_val = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]);
-            $GC_corr_vals{$GC_short_val}[0] += $alt_hash{$key}{$inner_key}[0];	# Total No of reads
-            $GC_corr_vals{$GC_short_val}[1]++;			# Total of GC regions with this %
-
-            ### Fan correction
-            $total_good_bins++;
-            $total_good_reads += $alt_hash{$key}{$inner_key}[0];
-        }
-    }
-}
-
-### Fan correction
-my $global_mean = $total_good_reads / $total_good_bins;
-
-### Apply the correction and tally up the counts within each chr
-while (my ($key, $value) = each %alt_hash) {
-    while (my ($inner_key, $inner_value) = each %$value) {
-        my $GC = sprintf("%.1f", $alt_hash{$key}{$inner_key}[1]);
-        my $mean = $GC_corr_vals{$GC}[0] / $GC_corr_vals{$GC}[1];
-        #my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($alt_hash{$key}{$inner_key}[0] / $mean);
-
-        ### Fan alternative correction
-        my $corr_count = $alt_hash{$key}{$inner_key}[0] * ($global_mean / $mean);
-
-        $read_chr_counts{$key} += $corr_count;
-    }
-}
-
-### Print the metrics into output file
-printf OUTPUT "Total reads: $total_read_count\n";
-printf OUTPUT "Total mapped reads: $total_mapped\n";
-printf OUTPUT "Total good reads(testing): $total_good_reads\n";
-#while ((my $key, my $value) = each %read_chr_counts) {
-#    printf OUTPUT "$key\t$value\n";
-#}
-foreach my $chrom (@chromes) {
-    if (exists $read_chr_counts{$chrom}) {
-        printf OUTPUT "$chrom\t$read_chr_counts{$chrom}\n";
-    }
-    else {
-        printf OUTPUT "$chrom\t0\n";
-    }
-}
-
-### Printing extra data. Reads per GC content step
-foreach my $val (sort {$a<=>$b} keys %GC_corr_vals) {
-    my $mean = $GC_corr_vals{$val}[0] / $GC_corr_vals{$val}[1];
-    printf TEST_LOG "$val\t$mean\n";
-}
-
-### close the filehandles and exit ###
-close(TEST_LOG);
-close(INPUT);
-close(OUTPUT);
-#die "... end of the quality measure script\n";
--- a/script_2_GC_correct_bdp-qad.xml	Tue Oct 18 09:18:48 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-
-<tool id="script_2_GC_correct_bdp-qad" name="GC Corrected Chromosome Counter" version="0.1.0">
-	<description>for counting chromosomes</description>
-	<command interpreter="perl">script_2_GC_correct_bdp-qad.pl $input $output $input2 </command>
-	<inputs>
-	<param format="sam" name="input" type="data" label="Source file"/>
-        <param format="txt" name="input2" type="data" label="Source file"/>
-	 </inputs>
-	<outputs>
-	<data format="tabular" name="output" />
-	 </outputs>
-
-	<tests>
-	<test>
-	<param name= "input" value="1.sam"/>
-	<output name= "output" file="output"/>
-	</test>
-	</tests>
-
-
-	<help>
-	This tool aligns reads against Chromosomes
-	</help>
-
-</tool>
-