Mercurial > repos > thondeboer > neat_genreads

diff utilities/deprecated/FindNucleotideContextOnReference.healthy.pl @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author: thondeboer
date: Tue, 15 May 2018 02:39:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utilities/deprecated/FindNucleotideContextOnReference.healthy.pl	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,508 @@
+#!/usr/bin/perl
+
+use strict;
+use Math::Round;
+
+
+if ($#ARGV < 1) {
+   print "parameter mismatch\nTo run type this command:\nperl $0 fastahack reference input_pos_file output_file human_gff_file\n\n";
+
+   print " first argument = full path to fastahack\n"; 
+   print " second argument = full path to reference genome\n"; 
+   print " third argument = input file with arbitrary number of columns, but 1st col=chromosome name and 2nd col=position\n"; 
+   print " fourth argument = output file with three columns: chromosome name, position of the center nucleotide, and the thre-nucleotide context for that position\n";
+   print " fifth argument = full path to human gff file\n\n\n"; 
+   exit 1;
+}
+
+
+my $Fastahack=$ARGV[0];
+my $Reference=$ARGV[1];
+open(InputPositions,             '<', $ARGV[2]) || die("Could not open file!");
+open(OutputTrinucleotideContext, '>', $ARGV[3]) || die("Could not open file!");
+open(HumanGFF,                   '<', $ARGV[4]) || die("Could not open file!");
+
+
+
+################ read in one coordinate at a time and execute fastahack on it
+
+# reading the header
+my $head = <InputPositions>;
+$head =~ s/\n|\r//;
+print OutputTrinucleotideContext "$head\tContext\n";
+my $gffHead = <HumanGFF>;
+chomp $gffHead;
+
+# creating trinucleotide context data hash, insertion and deletion counts
+my %trinucleotide_context_data;
+my %context_tally_across_mutated_to;
+my %gff_hash;
+my $gffMatch;
+my %location;
+# my %genotype_hash;
+my %insertion_hash;
+my %deletion_hash;   
+my $insertion_total;
+my $deletion_total;
+my $zygotes_total;
+my %annotation_hash;
+my $annotation_total;
+my %exonic_consequence_hash;
+my $intronic;
+my $exonic;
+my $intergenic;
+
+# reading the positional information
+my $line_count = 1;
+while (<InputPositions>) {
+   $_ =~ s/\n|\r//;
+   #print "$_\n";
+   my @line = split('\t', $_);
+
+   # getting the chromosome and coordinate fields from input file
+   # fastahack will need to the chromosome and coordinate to read the information from the reference
+   my $chromosome = $line[0];
+   my $coordinate = $line[1];
+
+   # get coordinates of first and last character in the context
+   my $start_region = $coordinate - 1;
+   my $end_region = $coordinate + 1;
+
+   # if the coordinate is the very first letter on the chromosome, then do not read before that position
+   # the context becomes 2 letter code, as opposed to a trinucleotide
+   if ( $start_region == 0 ) {
+      $start_region = 1;
+      $end_region = 2;
+   }
+
+   #print "$Fastahack -r $chromosome:$start_region..$end_region $Reference\n";
+   my $context = `$Fastahack -r $chromosome:$start_region..$end_region $Reference`;
+   
+   # capitalize context letters
+   $context = uc($context);
+
+   #### IF USING CONTROLLED DATA, split germline column into germline allele and mutated_to allele
+   # my @germline = split ('/', $line[6]);
+
+   # if germline allele does not equal reference allele, print "start_region germline allele end_region"
+   # specifically, replace the middle letter of the context with the germline allele
+   #print "$germline[0], $germline[1]\n";
+   # if ($germline[0] ne $germline[1]) {
+      # print "germline/reference mismatch, line number $line_count\n";
+      # if ($coordinate != 1) {
+         # substr($context,1,1)= $germline[1];   
+      # }
+      # else {
+        #  substr($context,0,1)= $germline[1];
+      # }
+   # }
+
+   print OutputTrinucleotideContext "$_\t$context";
+   
+
+   ###############################
+   # new section: forming the data structure
+   ###############################
+
+   # to create N_N contexts for data structure, context_code is defined as the trinucleotide context with a blank middle allele
+   my $context_code=$context;
+   $context_code =~ s/\n|\r//;
+   substr($context_code,1,1) = "_";
+   
+   # create variables for mutated_from and  mutated_to nucleotides
+   my $mutated_from = $line[3];
+   my $mutated_to = $line[4];
+
+   # creating genotype variable from column 10 of VCF
+   my $genotype = $line[9];
+
+   # incrementing each genotype
+   # $genotype_hash{$genotype} = $genotype_hash{$genotype} + 1;
+
+   # splitting heterozygosity by comma, defining heterozygosity total
+   my @zygotes = split (',', $mutated_to);
+   my $zygotes_length = scalar(@zygotes);
+
+   # identify heterozygosity, choose one at random to use. Count heterozygosity instances
+   if ($zygotes_length > 1) {
+      my $zygotesRand = $zygotes_length*rand();
+      my $zygotesRound = round($zygotesRand) - 1;
+      $zygotes_total = $zygotes_total + 1;
+
+      $mutated_to = $zygotes[$zygotesRound];
+      # print "@zygotes\t$mutated_to\n";
+   }  
+   # print "@zygotes\t$mutated_to\n";
+   
+   # my $round_rand_test = round(rand());
+   # print $round_rand_test;
+
+   # define length of insertions and deletions
+   my $insertion_length;
+   my $deletion_length;
+   if ($mutated_from eq "-") {
+      $insertion_length = length( $mutated_to );
+   }
+   else {
+      $insertion_length = length( $mutated_to ) - 1;
+   }
+   if ($mutated_to eq "-") {
+      $deletion_length = length( $mutated_from );
+   }
+   else {
+      $deletion_length = length( $mutated_from ) - 1;
+   }
+
+   # context_codes are totalled
+   $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to} = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to} + 1; 
+   $context_tally_across_mutated_to{$context_code}{$mutated_from} = $context_tally_across_mutated_to{$context_code}{$mutated_from} + 1; 
+
+   # insertion and deletion lengths are totalled
+   if ($insertion_length > $deletion_length) {
+      $insertion_hash{$insertion_length} = $insertion_hash{$insertion_length} + 1;
+   }
+   if ($deletion_length > $insertion_length) {
+      $deletion_hash{$deletion_length} = $deletion_hash{$deletion_length} + 1;
+   }
+
+   # total insertions and deletions
+   if ($insertion_length != $deletion_length) {
+      if ($insertion_length > $deletion_length) {
+         $insertion_total = $insertion_total + 1;
+      }
+      elsif ($deletion_length > $insertion_length) {
+         $deletion_total = $deletion_total + 1;
+      } 
+   }
+
+   # Find variant annotation and exonic consequence in ANNOVAR outfile
+   my $annotation = $line[7];
+   if ( $annotation =~ /Func.refGene=(.{1,30});Gene\.refGene/ ) {
+      # print "$1\n";
+      $annotation_hash{$1}++;
+      $annotation_total++;
+   }   
+   if ( $annotation =~ /ExonicFunc.refGene=(.{1,30});AAChange\.refGene/ ) {
+      # print "$1\n";
+      $exonic_consequence_hash{$1}++;
+   } 
+   if ( $annotation =~ /Func.refGene=.{0,15}intronic\;/ ) {
+      $intronic++;
+   }
+   if ( $annotation !~ /Func.refGene=ncRNA_exonic/ ) {
+      if ( $annotation =~ /Func.refGene=.{0,15}exonic\;/ ) {
+         $exonic++;
+      }
+   }
+   if ( $annotation =~ /Func.refGene=.{0,15}intergenic\;/ ) {
+      $intergenic++;
+   }
+   elsif ( $annotation =~ /Func.refGene=.{0,15}ncRNA_splicing\;/ ) {
+      $intergenic++;
+   }
+   elsif ( $annotation =~ /Func.refGene=.{0,15}upstream\;/ ) {
+      $intergenic++;
+   }
+   elsif ( $annotation =~ /Func.refGene=.{0,15}downstream\;/ ) {
+      $intergenic++;
+   }
+   
+   $location{$coordinate}++;
+   # Reading input gff file, incrementing gff variant region hash
+   # while (<HumanGFF>) {
+      # $_ =~ s/\n|\r//;
+      # my @line = split('\t', $_);
+      # my $region_name = "$line[3]-$line[4]";
+      # if ($coordinate >= $line[3] && $coordinate <= $line[4]) {
+         # $gff_hash{$region_name}++;
+         # $gffMatch++;
+         # print "$coordinate $region_name\n";
+      # }
+   # }
+   #print "$region_name, $gff_hash{$region_name}\n";
+   
+
+   # to keep track of progress
+   # 1000000 for LARGE dbsnp vcfs, 10000 for smaller vcf/tsv tumor mutation files
+   unless ($line_count%10000) {
+      print "processed $line_count lines\n";
+   }
+   $line_count++; 
+} 
+# end working through the input file
+
+# print total number of mutations
+my $mutation_total = $line_count;
+print "Number of Mutations -- $mutation_total\n";
+
+
+################### Reading the input gff and creating custom BED file ####################
+
+my $gffBED = "vars.bed";
+open(my $bed_handle, '>', $gffBED) || die("Could not open file!");
+
+# Print BED file Header
+print $bed_handle "START\tEND\tVariant_Frequency\n";
+
+# Reading input gff file, incrementing gff variant region hash
+while (<HumanGFF>) {
+   $_ =~ s/\n|\r//;
+   my @line = split('\t', $_);
+   my $region_name = "$line[3]-$line[4]";
+   my $region_length = $line[4] - $line[3];
+   my $region_freq = 0;
+   foreach my $coordinate (sort(keys %location)) {
+      if ($coordinate >= $line[3] && $coordinate <= $line[4]) {
+         $gff_hash{$region_name}++;
+         $gffMatch++;
+         # print "$coordinate $region_name\n";
+      }
+   }
+   if ($gff_hash{$region_name} == 0) {
+      print $bed_handle "$line[3]\t$line[4]\t$region_freq\n";
+   }
+   if ($gff_hash{$region_name} > 0) {
+      $region_freq = $gff_hash{$region_name} / $region_length;
+      print $bed_handle "$line[3]\t$line[4]\t$region_freq\n";
+      print "Region $region_name variant frequency -- $region_freq\n";
+      print "Total variants in region $region_name -- $gff_hash{$region_name}\n";
+   }
+}
+   #print "$region_name, $gff_hash{$region_name}\n";
+
+print "GFF Match -- $gffMatch\n";
+
+
+######################### open files for writing ##########################
+
+
+# my $genotype_name = "zygosity.prob";
+# open(my $genotype_handle, '>', $genotype_name) || die("Could not open file!");
+   
+my $insertion_file_name = "SSM_insLength.prob";
+open(my $insertion_prob_handle, '>', $insertion_file_name) || die("Could not open file!");
+
+my $deletion_file_name = "SSM_delLength.prob";
+open(my $deletion_prob_handle, '>', $deletion_file_name) || die("Could not open file!");
+
+my $overall_file_name = "SSM_overall.prob";
+open(my $overall_prob_handle, '>', $overall_file_name) || die("Could not open file!");
+
+my $heterozygosity_file_name = "heterozygosity.prob";
+open(my $heterozygosity_prob_handle, '>', $heterozygosity_file_name) || die("Could not open file!");
+
+my $annotation_file_name = "annofreq.prob";
+open(my $annotation_handle, '>', $annotation_file_name) || die("Could not open file!");
+
+my $exonic_con_file_name = "exonic_consequences.prob";
+open(my $exonic_con_handle, '>', $exonic_con_file_name) || die("Could not open file!");
+
+my $intronic_file_name = "intronic_vars.prob";
+open(my $intronic_handle, '>', $intronic_file_name) || die("Could not open file!");
+
+my $exonic_file_name = "exonic_vars.prob";
+open(my $exonic_handle, '>', $exonic_file_name) || die ("Could not open file!");
+
+my $intergenic_file_name = "intergenic_vars.prob";
+open(my $intergenic_handle, '>', $intergenic_file_name) || die ("Could not open file!");
+
+
+######################### Calculate frequency models ####################### 
+
+
+# calculate zygosity ratio frequency, print to file
+# foreach my $genotype (sort(keys %genotype_hash)) {
+   # my $zygosity_frequency;
+   # $zygosity_frequency = $genotype_hash{$genotype}/$mutation_total;
+   # print $genotype_handle "$genotype\t$zygosity_frequency\n";
+   # print "Genotype, $genotype -- $genotype_hash{$genotype}\n";
+# }
+
+# print annotation and exonic consequence frequencies
+foreach $1 (sort(keys %annotation_hash)) {
+   my $annotation_frequency;
+   $annotation_frequency = $annotation_hash{$1}/$mutation_total;
+   print "$1 -- $annotation_hash{$1}, $annotation_frequency\n";
+   print $annotation_handle "$1\t$annotation_frequency\n";
+}
+foreach $1 (sort(keys %exonic_consequence_hash)) {
+   my $exonic_con_freq;
+   if ( $1 ne "." ) {
+      $exonic_con_freq = $exonic_consequence_hash{$1}/$mutation_total;
+      print "Exonic Consequence: $1 -- $exonic_consequence_hash{$1}, $exonic_con_freq\n";
+      print $exonic_con_handle "$1\t$exonic_con_freq\n";
+   }
+}
+
+# Calculating exonic, intronic, and intergenic frequencies, printing to files
+my $intronic_freq;
+my $exonic_freq;
+my $intergenic_freq;
+$intronic_freq = $intronic/$mutation_total;
+$exonic_freq = $exonic/$mutation_total;
+$intergenic_freq = $intergenic/$mutation_total;
+print $intronic_handle "$intronic_freq\n";
+print $exonic_handle "$exonic_freq\n";
+print $intergenic_handle "$intergenic_freq\n";
+
+print "Intronic -- $intronic\nExonic -- $exonic\nIntergenic -- $intergenic\n";
+#print "Total Annotations -- $annotation_total\n";
+
+# print overall likelihood file headers
+print $overall_prob_handle "mutation_type\tprobability\n";
+
+# print insertions and deletion probabilities out of all mutations
+my $insertion_prob_all = $insertion_total / $mutation_total;
+my $deletion_prob_all = $deletion_total / $mutation_total;
+print $overall_prob_handle "insertion\t$insertion_prob_all\ndeletion\t$deletion_prob_all\n";
+# print $overall_prob_handle "Deletion Probability -- $deletion_prob_all\n";
+
+# print InDel totals
+print "Insertions $insertion_total\n";
+print "Deletions $deletion_total\n";
+
+# print insertion and deletion headers
+print $insertion_prob_handle "insertion_length\tprobability\n";
+print $deletion_prob_handle "deletion_length\tprobability\n";
+
+# calculate InDel length totals and probability out of total number of insertions/deletions. Print probabilities to file.
+foreach my $insertion_length (sort(keys %insertion_hash)) {
+   my $insertion_probability;
+   $insertion_probability = $insertion_hash{$insertion_length}/$insertion_total;
+   print $insertion_prob_handle "$insertion_length\t$insertion_probability\n";
+   # print "Insertion, $insertion_length, total , $insertion_hash{$insertion_length}\n";
+}
+foreach my $deletion_length (sort(keys %deletion_hash)) {
+   my $deletion_probability;
+   $deletion_probability = $deletion_hash{$deletion_length}/$deletion_total;
+   print $deletion_prob_handle "$deletion_length\t$deletion_probability\n";
+   # print "Deletion, $deletion_length, total, $deletion_hash{$deletion_length}\n";
+}
+ 
+# print heterozygosity frequency to file
+my $zygote_frequency = $zygotes_total / $mutation_total;
+print $heterozygosity_prob_handle "$zygote_frequency\n";
+
+print "heterozygous alleles -- $zygotes_total\n";
+
+
+# define nucleotide array
+my @nucleotides = ("A", "C", "G", "T");
+   
+foreach my $nt1 (@nucleotides) {
+   foreach my $nt3 (@nucleotides) {
+
+      # define the output file name and open it for writing
+      my $trinucleotide_SNP_probability_file_name = "Context".$nt1."-".$nt3.".trinuc";
+      open(my $trinuc_prob_handle, '>', $trinucleotide_SNP_probability_file_name) || die("Could not open file!");
+
+
+      # print trinucleotide contexts and corresponding totals for every mutated_to nucleotide
+      my $context_code=$nt1."_".$nt3;
+
+         #foreach my $mutated_from_nucl_key (keys %{ $trinucleotide_context_data{$context_code} }) {
+         foreach my $mutated_from (@nucleotides) {
+            # define the "mutated_to" keys in trinuc context hash
+            # my $mutated_to_nucl_key;
+
+            # the sum is only across mutated_to, and will be redefined for each mutated_from
+            my $context_sum_across_mutated_to = 0;
+            my $context_sum_across_indel = 0;
+
+            # print "\nRaw counts for mutated_from $mutated_from \n";
+
+
+            # foreach $mutated_to_nucl_key (keys %{ $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key} }) {
+            foreach my $mutated_to (@nucleotides) {
+               my $mutated_from_length = length( $mutated_from );
+               my $mutated_to_length = length( $mutated_to );
+               if ( $mutated_from_length == 1 ) {
+                  if ( $mutated_from ne "-" ) {
+                     if ( $mutated_to_length == 1 ) {
+                        if ( $mutated_to ne "-" ) {
+                           # print "$context_code, $mutated_from_nucl_key, $mutated_to_nucl_key -- $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key}{$mutated_to_nucl_key}\n";
+                           $context_sum_across_mutated_to = $context_sum_across_mutated_to + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to};
+                        }# end if statement
+                        else {
+                           $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to};
+                        }# end else statement
+                     }# end if statement
+                     else {
+                        $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to};
+                     }# end else statement
+                  }# end if statement
+                  else {
+                     $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to};
+                  }# end else statement
+               }# end if statement
+               else {
+                  $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to};
+               }# end else statement
+               # print "$context_code, $mutated_from, $mutated_to-- $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}\n";
+            }# end of loop over mutated_to
+
+            # print "\nProbabilities for mutated_from $mutated_from:\n";
+
+
+            foreach my $mutated_to (@nucleotides) {
+            #foreach $mutated_to_nucl_key (keys %{ $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key} }) {
+               my $mutated_from_length = length( $mutated_from);
+               my $mutated_to_length = length( $mutated_to);
+               if ( $mutated_from_length == 1 ) {
+                  if ( $mutated_from ne "-" ) {
+                     if ( $mutated_to_length == 1 ) {
+                        if ( $mutated_to ne "-" ) {
+                           my $SNP_probability;
+                           if ( $context_sum_across_mutated_to == 0 ) {
+                              $SNP_probability = 0;
+                           }
+                           else {
+                              $SNP_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_mutated_to;
+                           }
+                           if ( $mutated_to eq "T" ) {
+                              print $trinuc_prob_handle "$SNP_probability";
+                           }
+                           else {
+                              # print "$context_code, $mutated_from, $mutated_to, context_sum_across_mutated_to=$context_sum_across_mutated_to -- $SNP_probability\n";
+                              print $trinuc_prob_handle "$SNP_probability\t";
+                           }
+                        }# end of if statement
+                        else {
+                           my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel;
+                           # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n";
+                        }# end else statement
+                     }# end of if statement
+                     else {
+                        # my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel;
+                        # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n";
+                     }# end else statement
+                  }# end of if statement
+                  else {
+                     # my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel;
+                     # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n";
+                  }# end else statement
+               }# end of if statement
+               else {
+                  my $indel_probability;
+                  if ( $context_sum_across_indel = 0 ) {
+                     $indel_probability = 0;
+                  }
+                  else {
+                     # $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel;
+                     # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n";
+                  }
+               }# end else statement
+            }# end of loop over mutated_to
+            print $trinuc_prob_handle "\n";
+
+         }# end of loop over mutated_from
+
+     # print "\n\n";
+     
+
+  }# end loop over nt3
+}# end loop over nt1
+
+
+
+
author	thondeboer
date	Tue, 15 May 2018 02:39:53 -0400
parents
children