combine_hgvs_annotations: combine_hgvs

comparison combine_hgvs_tables @ 0:baf1543e8ae1 default tip

initial commit

author	Yusuf Ali <ali@yusuf.email>
date	Wed, 25 Mar 2015 13:27:49 -0600
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:baf1543e8ae1
+#!/usr/bin/env perl
+use strict;
+use warnings;
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+print "Version 1.0\n";
+exit;
+}
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+$quiet = 1;
+shift @ARGV;
+}
+@ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n",
+"Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n";
+my %zygosity;
+my %quality;
+my %var_count;
+my %tot_count;
+my %methods;
+my %data;
+my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools
+my $collapse = $ARGV[0] =~ /^true$/i;
+open(OUT, ">$ARGV[1]")
+or die "Cannot open $ARGV[1] for writing: $!\n";
+my @headers;
+my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column);
+for(my $i=2; $i<$#ARGV; $i+=2){
+my $method_name = $ARGV[$i];
+my $infile = $ARGV[$i+1];
+print STDERR "Processing $infile...\n" unless $quiet;
+open(IN, $infile)
+or die "Cannot open $infile for reading: $!\n";
+my $header = <IN>; # header
+chomp $header;
+if($i == 2){
+@headers = split /\t/, $header;
+for(my $j = 0; $j <= $#headers; $j++){
+if($headers[$j] eq "Chr"){
+$chr_column = $j;
+}
+elsif($headers[$j] eq "Zygosity"){
+$zygosity_column = $j;
+}
+elsif($headers[$j] eq "P-value"){
+$pvalue_column = $j;
+}
+elsif($headers[$j] eq "Variant Reads"){
+$alt_cnt_column = $j;
+}
+elsif($headers[$j] eq "Total Reads"){
+$tot_cnt_column = $j;
+}
+elsif($headers[$j] eq "Selected transcript"){
+$transcript_column = $j;
+}
+elsif($headers[$j] eq "Transcript HGVS"){
+$cdna_hgvs_column = $j;
+}
+elsif($headers[$j] eq "DNA From"){
+$pos_column = $j;
+}
+elsif($headers[$j] eq "DNA To"){
+$to_column = $j;
+}
+elsif($headers[$j] eq "Sources"){
+$srcs_column = $j;
+}
+}
+if(defined $srcs_column){
+# print header as-is
+print OUT "$header\n";
+}
+else{
+# print header from the first file, with extra column for methods at the end
+push @headers, "Sources";
+$srcs_column = $#headers;
+print OUT join("\t", @headers), "\n";
+}
+}
+else{
+# Make sure the columns are the same in the various files
+my @latest_headers = split /\t/, $header;
+for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){
+if($latest_headers[$j] ne $headers[$j]){
+die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n";
+}
+}
+}
+while(<IN>){
+chomp;
+my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields
+if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){
+$chr_prefix_exists = 1;
+}
+$F[$chr_column] =~ s/^chr//; # for consistency
+my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files
+# record disagreement on zygosity if any
+if(not defined $zygosity{$key}){
+$zygosity{$key} = [];
+$quality{$key} = [];
+$var_count{$key} = [];
+$tot_count{$key} = [];
+$data{$key} = \@F;
+#print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers;
+}
+push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]);
+push @{$quality{$key}}, $F[$pvalue_column];
+push @{$var_count{$key}}, $F[$alt_cnt_column];
+push @{$tot_count{$key}}, $F[$tot_cnt_column];
+push @{$methods{$key}}, $method_name;
+}
+close(IN);
+}
+if($chr_prefix_exists){
+for my $key (keys %data){
+# assuming there is a mix of chr1 and 1, always add the chr for consistency
+$data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/;
+}
+}
+# Sort by chr, then pos, then transcript_id
+for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or
+$data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or
+$data{$a}->[$to_column] <=> $data{$b}->[$to_column] or
+$data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){
+# Before printing a line, merge the zygosity, variant quality, read count data if requested
+my %seen;
+if($collapse){
+my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}};
+if(@zygosities == 0){
+# do nothing, existing NA in 7th column will be a fine answer as they are all like tha
+}
+elsif(@zygosities == 1){
+# agreement by all methods, just print the one not NA
+$data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA";
+}
+else{
+$data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur
+}
+}
+else{
+$data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}});
+}
+if($collapse){
+undef %seen;
+my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}};
+if(@qualities == 0){
+# do nothing, existing NA in 8th column will be a fine answer as they are all like that
+}
+elsif(@qualities == 1){
+# agreement by all methods, just print the one not NA
+$data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA";
+}
+else{
+# calculate the median for the collapsed value
+my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}});
+my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value)
+$data{$key}->[$pvalue_column] = $median_quality;
+}
+}
+else{
+$data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}});
+}
+if($collapse){
+undef %seen;
+my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}};
+undef %seen;
+my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}};
+if(@var_counts == 0 and @tot_counts == 0){
+# do nothing, existing NAs okay
+}
+elsif(@var_counts == 1 and @tot_counts == 1){
+# agreement by all methods, just print the one in %data unless it's NA
+$data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA";
+$data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA";
+}
+else{
+# calculate the median for the collapsed value
+my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}});
+my $pivot = $#sorted_var_counts/2;
+my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean
+int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2);
+$data{$key}->[$alt_cnt_column] = $median_var_count;
+my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}});
+$pivot = $#sorted_tot_counts/2;
+my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean
+int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2);
+$data{$key}->[$tot_cnt_column] = $median_tot_count;
+}
+}
+else{
+# list the raw numbers
+$data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}});
+$data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}});
+}
+# to facilitate multiple rounds of combining, slash the extra column from the last round if it exists
+$data{$key}->[$srcs_column] = join("; ", @{$methods{$key}});
+for(my $i = 0; $i <= $#{$data{$key}}; $i++){
+$data{$key}->[$i] = "" if not defined $data{$key}->[$i];
+}
+print OUT join("\t", @{$data{$key}}), "\n";
+}
+close(OUT);

Mercurial > repos > yusuf > combine_hgvs_annotations

comparison combine_hgvs_tables @ 0:baf1543e8ae1 default tip