Mercurial > repos > yusuf > poor_gene_coverage
comparison annotate_low_coverage @ 0:7cdd13ff182a default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 15:49:28 -0600 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7cdd13ff182a |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use File::Basename; | |
| 6 #@ARGV >= 4 and @ARGV <= 6 or die "Usage: $0 <capture kit.bed> <poor regions input.bed> <annotated output.hgvs.txt> [target gene list.txt] [gene name pattern for detail reporting]\n"; | |
| 7 | |
# Directory containing this script; the helper tools and the bundled
# tool-data/ default configuration are located relative to it.
my $dirname = dirname(__FILE__);
# First command-line argument: directory that holds (or will receive) the
# report_poor_coverage configuration file.
my $tool_dir = shift @ARGV;

# Read the configuration file, seeding it from the bundled default the first
# time the tool runs.  The existence test must use the same file name as the
# copy target, otherwise the default is re-copied over the file on every run.
my $config_path = "$tool_dir/report_poor_coverage";
if(not -e $config_path){
    # List-form system() bypasses the shell, so directory names containing
    # spaces or shell metacharacters are passed through verbatim.
    system("cp", "$dirname/tool-data/report_poor_coverage", $config_path) == 0
        or die "Cannot copy default configuration to $config_path (wait status $?)\n";
}
my %configuration_file;
open(my $config_fh, '<', $config_path)
    or die "Cannot open configuration file $config_path for reading: $!\n";
while(my $config_line = <$config_fh>){
    # Each line is "<key> <value>" separated by whitespace; extra fields are
    # ignored.  split ' ' also discards leading whitespace.
    my ($key, $value) = split(' ', $config_line);
    next if not defined $key; # blank line
    $configuration_file{$key} = $value;
}
close($config_fh);
my $ref_fasta = $configuration_file{"ref_fasta"};
my $ref_flat = $configuration_file{"ref_flat"};
my $ref_gencode = $configuration_file{"ref_gencode"};
my $clinVar = $configuration_file{"clinVar"};
my $dgv2 = $configuration_file{"dgv2"};
| 27 | |
# Remaining positional arguments, consumed in order:
#   capture kit name (resolved to <capturekits_directory><name>.bed),
#   low-coverage regions BED, output report path,
#   optional target gene list, optional transcript/gene ID pattern.
my $capture_kit_name = shift @ARGV;
my $capture_kit_bed = $configuration_file{"capturekits_directory"} . $capture_kit_name . ".bed";
my $low_input_bed = shift @ARGV;
my $low_output_hgvs = shift @ARGV;
# shift on an empty @ARGV yields undef, which marks the optional arguments as absent.
my $target_list = shift @ARGV;
my $gene_name_regex = shift @ARGV;
# Scratch file name made unique per process by the PID.
my $tmp_hgvs = $$ . ".tmp.hgvs.txt";
| 34 | |
# Restrict the analysis to a target gene list when one is given; the literal
# path "/dev/null" is treated the same as no list.  In both cases TARGETS ends
# up as a read pipe over the gene-named capture kit regions.
my $tmp_bed;
if(defined $target_list and $target_list ne "/dev/null"){
    $tmp_bed = "$$.targeted.tmp.bed";
    # Name the low-coverage regions and keep only those on the target list.
    # 3 indicates 4th column of the bed file, which should have the gene name
    my $filter_low_cmd = "$dirname/add_names_to_bed $low_input_bed $ref_flat - /dev/null | $dirname/filter_by_list false - $target_list $tmp_bed 1 3";
    # A failed filter would leave $tmp_bed missing or empty and make the
    # report claim there is no low coverage, so stop here instead.
    system($filter_low_cmd) == 0
        or die "Cannot filter $low_input_bed by $target_list (wait status $?), aborting\n";
    $low_input_bed = $tmp_bed;
    # The command string contains a pipeline, so it has to go through the shell.
    open(TARGETS, "-|", "$dirname/add_names_to_bed $capture_kit_bed $ref_flat - /dev/null | $dirname/filter_by_list false - $target_list - 1 3")
        or die "Cannot run filter_by_list, aborting: $!\n";
}
else{
    open(TARGETS, "-|", "$dirname/add_names_to_bed $capture_kit_bed $ref_flat - /dev/null")
        or die "Cannot run add_names_to_bed on $capture_kit_bed: $!\n";
}
| 48 | |
# Run vcf2hgvs_table over the low-coverage BED and pipe its output through
# filter_by_index_gamma (ClinVar index prefix "$clinVar."), writing $tmp_hgvs.
# The pipeline needs the shell; its exit status is checked so that a failed
# annotation does not silently yield an empty report.
my $annotate_cmd = "$dirname/vcf2hgvs_table -q -p 0.05 -c $low_input_bed -g $dgv2 -o - -r $ref_fasta -e $ref_gencode -i /dev/null -b $ref_flat | $dirname/filter_by_index_gamma ${clinVar}. ClinVar - $tmp_hgvs \"\"";
system($annotate_cmd) == 0
    or die "Cannot annotate $low_input_bed (wait status $?), aborting\n";
# The hgvs_table_annotate step below is disabled, so this file is never
# written; the name is kept because the cleanup code still unlinks it.
my $tmp_output = "$$.tmp.annotated.hgvs.txt";
#system "/export/achri_galaxy/galaxy-dist/tools/achri/hgvs_table_annotate /export/achri_galaxy/dbs/sift/hg19 /export/achri_galaxy/dbs/polyphen2/hg19.txt.gz /export/achri_galaxy/dbs/gerp/hg19 /export/achri_galaxy/dbs/TissueDistributionDBs/human.v2009-07-30.tab /export/achri_galaxy/dbs/pathways/KEGG.human.2012-09-25.txt /export/achri_galaxy/dbs/interpro_supermatch_hg19.bed $tmp_hgvs $tmp_output /export/achri_galaxy/dbs/hg19.fa";
open(OUT, ">", $low_output_hgvs)
    or die "Cannot open $low_output_hgvs for writing: $!\n";
# List-form pipe open: no shell involved, arguments are passed verbatim.
open(COLLAPSE, "-|", "$dirname/hgvs_collapse_transcripts", $tmp_hgvs, "-", "1")
    or die "cannot run hgvs_collapse_transcripts: $!\n";
my $header = <COLLAPSE>;
defined $header
    or die "hgvs_collapse_transcripts produced no output for $tmp_hgvs, cannot read the header line\n";
chomp $header;
my @headers = split /\t/, $header;
# Column indices, filled in by load_header_columns() through the references below.
my ($chr_column, $from_column, $to_column, $ftype_column, $clinvar_column, $loc_column, $maf_column, $gene_column, $gene_desc_column, $cdna_hgvs_column, $transcript_column, $zygosity_column, $phase_column);
my %req_columns = (
    "Chr" => \$chr_column,
    "DNA From" => \$from_column,
    "DNA To" => \$to_column,
    "Feature type" => \$ftype_column,
#   "Variant context" => \$loc_column,
    "Pop. freq." => \$maf_column,
    "Gene Name" => \$gene_column,
#   "Gene Function" => \$gene_desc_column,
    "ClinVar Text Matches" => \$clinvar_column,
    "Transcript HGVS" => \$cdna_hgvs_column,
    "Selected transcript" => \$transcript_column,
    "Zygosity" => \$zygosity_column,
    "Phase" => \$phase_column);
# Dies if any required header is absent.
load_header_columns(\%req_columns, \@headers);
# Inverse map: stringified scalar reference => header name, so the report
# header can later be looked up as $col2name{\$chr_column} etc.
my %col2name = reverse %req_columns;
| 77 | |
# Sum the captured bases per gene from the TARGETS stream (tab-separated:
# chr, start, end, gene names separated by ';').
my %target_sizes;
my $targets_header = <TARGETS>; # header line, not a target record
while(my $target_line = <TARGETS>){
    chomp $target_line;
    my @F = split /\t/, $target_line;
    next unless $#F >= 3; # nameless records
    # Gene names are upper-cased because the annotated records are looked up
    # by upper-cased name; without this, symbols containing lower-case
    # letters could never match a target.
    for my $gene_name (split /;\s*/, uc($F[3])){
        # assumes non-overlapping targets in BED file
        # NOTE(review): end-start+1 treats the coordinates as inclusive;
        # confirm against what add_names_to_bed emits.
        $target_sizes{$gene_name} += $F[2]-$F[1]+1;
    }
}
# close() on a pipe reports a non-zero exit status of the command.
close(TARGETS)
    or warn "Capture kit target pipeline did not finish cleanly (wait status $?)\n";
# A target naming several genes contributes to each gene, and hence more
# than once to the grand total.
my $target_total = 0;
for my $gene_size (values %target_sizes){
    $target_total += $gene_size;
}
| 95 | |
# Per gene, the set of positions (hash keys) lacking coverage.  Records whose
# zygosity matches /homo/ go into the "homo" tallies (reported as <4x), all
# others into the "het" tallies (reported as <20x).  The *_coding_* hashes
# hold the subset whose feature type matches /protein_coding/.
my %missing_homo;
my %missing_coding_homo;
my %missing_het;
my %missing_coding_het;
# One array ref per coding low-coverage record and gene, for the detail table.
my @detail_lines;
while(my $record = <COLLAPSE>){
    chomp $record;
    my @F = split /\t/, $record, -1; # -1 keeps trailing empty columns

    for my $gene_name (split /;\s*/, uc($F[$gene_column])){
        next if not exists $target_sizes{$gene_name}; # not a gene of interest
        next if defined $gene_name_regex and $F[$transcript_column] !~ /$gene_name_regex/o;
        if(not exists $missing_homo{$gene_name}){
            $missing_homo{$gene_name} = {};
            $missing_coding_homo{$gene_name} = {};
            $missing_het{$gene_name} = {};
            $missing_coding_het{$gene_name} = {};
        }

        my $is_homo = $F[$zygosity_column] =~ /homo/;
        my $is_coding = $F[$ftype_column] =~ /protein_coding/;
        # Pick the pair of position sets matching this record's zygosity.
        my ($missing_positions, $missing_coding_positions) = $is_homo
            ? ($missing_homo{$gene_name}, $missing_coding_homo{$gene_name})
            : ($missing_het{$gene_name}, $missing_coding_het{$gene_name});
        for my $pos ($F[$from_column]..$F[$to_column]){
            $missing_positions->{$pos}++;
            $missing_coding_positions->{$pos}++ if $is_coding;
        }

        # Only coding records are listed in the detail table.
        next unless $is_coding;
        $F[$clinvar_column] = "" if not defined $F[$clinvar_column];
        # Decode the escaped '>' (e.g. in HGVS "A&gt;G") in the ClinVar
        # text; replacing '>' with itself, as before, did nothing.
        $F[$clinvar_column] =~ s/&gt;/>/g;
        push @detail_lines, [$F[$chr_column], $F[$from_column], $F[$to_column], $F[$maf_column], $gene_name, $F[$cdna_hgvs_column], $F[$transcript_column], ($is_homo ? "<4x" : "<20x"), $F[$gene_column], $F[$clinvar_column]];
    }
}
# close() on a pipe reports a non-zero exit status of the command.
close(COLLAPSE)
    or warn "hgvs_collapse_transcripts did not finish cleanly (wait status $?)\n";
# Grand totals of missing positions across all target genes.
my $missing_homo = 0;
my $missing_coding_homo = 0;
my $missing_het = 0;
my $missing_coding_het = 0;
# Collapse each gene's position set into a plain count (0 for genes with no
# low-coverage record) and add it to the matching grand total.  Afterwards
# every target gene has an entry in all four per-gene hashes.
for my $gene_name (keys %target_sizes){
    for my $tally ([\%missing_homo,        \$missing_homo],
                   [\%missing_coding_homo, \$missing_coding_homo],
                   [\%missing_het,         \$missing_het],
                   [\%missing_coding_het,  \$missing_coding_het]){
        my ($per_gene, $grand_total_ref) = @{$tally};
        my $positions = $per_gene->{$gene_name};
        my $count = 0;
        if(defined $positions){
            $count = scalar(keys %{$positions});
        }
        $per_gene->{$gene_name} = $count;
        ${$grand_total_ref} += $count;
    }
}
# Write the report to OUT: optional filter note, overall summary row, per-gene
# breakdown, then the coding-region detail table; finally remove scratch files.

# Percentage of $part in $whole; 0 when $whole is 0, so that an empty target
# set produces a report instead of an "Illegal division by zero" crash.
my $percent_of = sub {
    my ($part, $whole) = @_;
    return $whole ? $part/$whole*100 : 0;
};

if(defined $gene_name_regex){
    print OUT "Note: only considering coding sequence target bases in gene models with IDs matching \"$gene_name_regex\"\n";
}
# Ten columns, matching the ten fields of every data row below (the final
# "%" label was previously missing).
print OUT "\ttotal size\tcoding <4x coverage\t%\tcoding<20x coverage\t%\ttotal <4x coverage\t%\ttotal <20x coverage\t%\n";
printf OUT "Total target gene bases designed for capture\t%d\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n",
    $target_total,
    $missing_coding_homo, $percent_of->($missing_coding_homo, $target_total),
    $missing_coding_het,  $percent_of->($missing_coding_het, $target_total),
    $missing_homo,        $percent_of->($missing_homo, $target_total),
    $missing_het,         $percent_of->($missing_het, $target_total);
print OUT "Per gene missing coding sequence coverage breakdown:\n";
for my $gene_name (sort keys %missing_homo){
    my $gene_size = $target_sizes{$gene_name};
    printf OUT "%s\t%d\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n",
        $gene_name, $gene_size,
        $missing_coding_homo{$gene_name}, $percent_of->($missing_coding_homo{$gene_name}, $gene_size),
        $missing_coding_het{$gene_name},  $percent_of->($missing_coding_het{$gene_name}, $gene_size),
        $missing_homo{$gene_name},        $percent_of->($missing_homo{$gene_name}, $gene_size),
        $missing_het{$gene_name},         $percent_of->($missing_het{$gene_name}, $gene_size);
}
# Print revised header that's only a few of the fields
# Detail rows: by gene name (element 4), then by start position (element 1).
@detail_lines = sort {$a->[4] cmp $b->[4] or $a->[1] <=> $b->[1]} @detail_lines;
print OUT "\n\nCoding region low coverage details (alphabetical by gene name):\n";
# %col2name is keyed by stringified scalar reference, hence the \$... lookups.
print OUT join("\t", $col2name{\$chr_column}, $col2name{\$from_column}, $col2name{\$to_column}, $col2name{\$maf_column}, $col2name{\$gene_column}, $col2name{\$cdna_hgvs_column},
    $col2name{\$transcript_column}, "Low coverage type", "Genes in this region (gene overlaps possible)", $col2name{\$clinvar_column}), "\n";
print OUT join("\n", map {join("\t", @{$_})} @detail_lines), "\n";
# Buffered write errors (e.g. a full disk) only surface at close.
close(OUT)
    or die "Cannot finish writing $low_output_hgvs: $!\n";

# Remove scratch files; unlink silently ignores names that do not exist.
unlink($tmp_hgvs);
unlink($tmp_output);
unlink($tmp_bed) if defined $tmp_bed;
| 174 | |
# Resolve required column names to their positions in a header row.
#
# Arguments:
#   $reqs_hash_ref     - hash ref mapping a required header name to a scalar
#                        reference that receives the column's 0-based index
#   $headers_array_ref - array ref of the header names actually present
#
# If a name occurs more than once in the header row, the last occurrence
# wins.  Undefined header cells are skipped.  Dies listing (sorted) every
# required name that is not present.  Returns nothing.
sub load_header_columns{
    my ($reqs_hash_ref, $headers_array_ref) = @_;
    my %unfulfilled = map { $_ => 1 } keys %{$reqs_hash_ref};
    for my $i (0 .. $#{$headers_array_ref}){
        my $header_name = $headers_array_ref->[$i];
        # One hash lookup per header instead of scanning every required name.
        next if not defined $header_name;
        next if not exists $reqs_hash_ref->{$header_name};
        ${$reqs_hash_ref->{$header_name}} = $i;
        delete $unfulfilled{$header_name};
    }
    if(keys %unfulfilled){
        die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
    }
    return;
}
| 194 |
