Mercurial > repos > yusuf > combine_hgvs_annotations
comparison combine_hgvs_tables @ 0:baf1543e8ae1 default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:27:49 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:baf1543e8ae1 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
| 7 print "Version 1.0\n"; | |
| 8 exit; | |
| 9 } | |
| 10 | |
| 11 my $quiet = 0; | |
| 12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | |
| 13 $quiet = 1; | |
| 14 shift @ARGV; | |
| 15 } | |
| 16 | |
| 17 @ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n", | |
| 18 "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n"; | |
| 19 | |
| 20 my %zygosity; | |
| 21 my %quality; | |
| 22 my %var_count; | |
| 23 my %tot_count; | |
| 24 my %methods; | |
| 25 my %data; | |
| 26 my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools | |
| 27 | |
| 28 my $collapse = $ARGV[0] =~ /^true$/i; | |
| 29 | |
| 30 open(OUT, ">$ARGV[1]") | |
| 31 or die "Cannot open $ARGV[1] for writing: $!\n"; | |
| 32 | |
| 33 my @headers; | |
| 34 my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column); | |
| 35 for(my $i=2; $i<$#ARGV; $i+=2){ | |
| 36 my $method_name = $ARGV[$i]; | |
| 37 my $infile = $ARGV[$i+1]; | |
| 38 print STDERR "Processing $infile...\n" unless $quiet; | |
| 39 open(IN, $infile) | |
| 40 or die "Cannot open $infile for reading: $!\n"; | |
| 41 my $header = <IN>; # header | |
| 42 chomp $header; | |
| 43 if($i == 2){ | |
| 44 @headers = split /\t/, $header; | |
| 45 for(my $j = 0; $j <= $#headers; $j++){ | |
| 46 if($headers[$j] eq "Chr"){ | |
| 47 $chr_column = $j; | |
| 48 } | |
| 49 elsif($headers[$j] eq "Zygosity"){ | |
| 50 $zygosity_column = $j; | |
| 51 } | |
| 52 elsif($headers[$j] eq "P-value"){ | |
| 53 $pvalue_column = $j; | |
| 54 } | |
| 55 elsif($headers[$j] eq "Variant Reads"){ | |
| 56 $alt_cnt_column = $j; | |
| 57 } | |
| 58 elsif($headers[$j] eq "Total Reads"){ | |
| 59 $tot_cnt_column = $j; | |
| 60 } | |
| 61 elsif($headers[$j] eq "Selected transcript"){ | |
| 62 $transcript_column = $j; | |
| 63 } | |
| 64 elsif($headers[$j] eq "Transcript HGVS"){ | |
| 65 $cdna_hgvs_column = $j; | |
| 66 } | |
| 67 elsif($headers[$j] eq "DNA From"){ | |
| 68 $pos_column = $j; | |
| 69 } | |
| 70 elsif($headers[$j] eq "DNA To"){ | |
| 71 $to_column = $j; | |
| 72 } | |
| 73 elsif($headers[$j] eq "Sources"){ | |
| 74 $srcs_column = $j; | |
| 75 } | |
| 76 } | |
| 77 if(defined $srcs_column){ | |
| 78 # print header as-is | |
| 79 print OUT "$header\n"; | |
| 80 } | |
| 81 else{ | |
| 82 # print header from the first file, with extra column for methods at the end | |
| 83 push @headers, "Sources"; | |
| 84 $srcs_column = $#headers; | |
| 85 print OUT join("\t", @headers), "\n"; | |
| 86 } | |
| 87 } | |
| 88 else{ | |
| 89 # Make sure the columns are the same in the various files | |
| 90 my @latest_headers = split /\t/, $header; | |
| 91 for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){ | |
| 92 if($latest_headers[$j] ne $headers[$j]){ | |
| 93 die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n"; | |
| 94 } | |
| 95 } | |
| 96 } | |
| 97 while(<IN>){ | |
| 98 chomp; | |
| 99 my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields | |
| 100 if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){ | |
| 101 $chr_prefix_exists = 1; | |
| 102 } | |
| 103 $F[$chr_column] =~ s/^chr//; # for consistency | |
| 104 my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files | |
| 105 # record disagreement on zygosity if any | |
| 106 if(not defined $zygosity{$key}){ | |
| 107 $zygosity{$key} = []; | |
| 108 $quality{$key} = []; | |
| 109 $var_count{$key} = []; | |
| 110 $tot_count{$key} = []; | |
| 111 $data{$key} = \@F; | |
| 112 #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers; | |
| 113 } | |
| 114 push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]); | |
| 115 push @{$quality{$key}}, $F[$pvalue_column]; | |
| 116 push @{$var_count{$key}}, $F[$alt_cnt_column]; | |
| 117 push @{$tot_count{$key}}, $F[$tot_cnt_column]; | |
| 118 push @{$methods{$key}}, $method_name; | |
| 119 } | |
| 120 close(IN); | |
| 121 } | |
| 122 | |
| 123 if($chr_prefix_exists){ | |
| 124 for my $key (keys %data){ | |
| 125 # assuming there is a mix of chr1 and 1, always add the chr for consistency | |
| 126 $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/; | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 # Sort by chr, then pos, then transcript_id | |
| 131 for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or | |
| 132 $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or | |
| 133 $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or | |
| 134 $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){ | |
| 135 # Before printing a line, merge the zygosity, variant quality, read count data if requested | |
| 136 my %seen; | |
| 137 if($collapse){ | |
| 138 my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}}; | |
| 139 if(@zygosities == 0){ | |
| 140 # do nothing, existing NA in 7th column will be a fine answer as they are all like tha | |
| 141 } | |
| 142 elsif(@zygosities == 1){ | |
| 143 # agreement by all methods, just print the one not NA | |
| 144 $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA"; | |
| 145 } | |
| 146 else{ | |
| 147 $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur | |
| 148 } | |
| 149 } | |
| 150 else{ | |
| 151 $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}}); | |
| 152 } | |
| 153 | |
| 154 if($collapse){ | |
| 155 undef %seen; | |
| 156 my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}}; | |
| 157 if(@qualities == 0){ | |
| 158 # do nothing, existing NA in 8th column will be a fine answer as they are all like that | |
| 159 } | |
| 160 elsif(@qualities == 1){ | |
| 161 # agreement by all methods, just print the one not NA | |
| 162 $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA"; | |
| 163 } | |
| 164 else{ | |
| 165 # calculate the median for the collapsed value | |
| 166 my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}}); | |
| 167 my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value) | |
| 168 $data{$key}->[$pvalue_column] = $median_quality; | |
| 169 } | |
| 170 } | |
| 171 else{ | |
| 172 $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}}); | |
| 173 } | |
| 174 | |
| 175 if($collapse){ | |
| 176 undef %seen; | |
| 177 my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}}; | |
| 178 undef %seen; | |
| 179 my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}}; | |
| 180 if(@var_counts == 0 and @tot_counts == 0){ | |
| 181 # do nothing, existing NAs okay | |
| 182 } | |
| 183 elsif(@var_counts == 1 and @tot_counts == 1){ | |
| 184 # agreement by all methods, just print the one in %data unless it's NA | |
| 185 $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA"; | |
| 186 $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA"; | |
| 187 } | |
| 188 else{ | |
| 189 # calculate the median for the collapsed value | |
| 190 my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}}); | |
| 191 my $pivot = $#sorted_var_counts/2; | |
| 192 my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean | |
| 193 int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2); | |
| 194 $data{$key}->[$alt_cnt_column] = $median_var_count; | |
| 195 my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}}); | |
| 196 $pivot = $#sorted_tot_counts/2; | |
| 197 my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean | |
| 198 int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2); | |
| 199 $data{$key}->[$tot_cnt_column] = $median_tot_count; | |
| 200 } | |
| 201 } | |
| 202 else{ | |
| 203 # list the raw numbers | |
| 204 $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}}); | |
| 205 $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}}); | |
| 206 } | |
| 207 | |
| 208 # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists | |
| 209 $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}}); | |
| 210 for(my $i = 0; $i <= $#{$data{$key}}; $i++){ | |
| 211 $data{$key}->[$i] = "" if not defined $data{$key}->[$i]; | |
| 212 } | |
| 213 print OUT join("\t", @{$data{$key}}), "\n"; | |
| 214 } | |
| 215 close(OUT); |
