Mercurial > repos > thondeboer > neat_genreads
comparison utilities/deprecated/FindNucleotideContextOnReference.pl @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 | |
| 5 | |
| 6 if ($#ARGV < 1) { | |
| 7 print "parameter mismatch\nTo run type this command:\nperl $0 fastahack reference input_pos_file output_file\n\n"; | |
| 8 | |
| 9 print " first argument = full path to fastahack\n"; | |
| 10 print " second argument = full path to reference genome\n"; | |
| 11 print " third argument = input file with arbitrary number of columns, but 1st col=chromosome name and 2nd col=position\n"; | |
| 12 print " fourth argument = output file with three columns: chromosome name, position of the center nucleotide, and the thre-nucleotide context for that position\n\n\n"; | |
| 13 exit 1; | |
| 14 } | |
| 15 | |
| 16 | |
| 17 my $Fastahack=$ARGV[0]; | |
| 18 my $Reference=$ARGV[1]; | |
| 19 open(InputPositions, '<', $ARGV[2]) || die("Could not open file!"); | |
| 20 open(OutputTrinucleotideContext, '>', $ARGV[3]) || die("Could not open file!"); | |
| 21 | |
| 22 | |
| 23 | |
| 24 | |
| 25 ################ read in one coordinate at a time and execute fastahack on it | |
| 26 | |
| 27 # reading the header | |
| 28 my $head = <InputPositions>; | |
| 29 $head =~ s/\n|\r//; | |
| 30 print OutputTrinucleotideContext "$head\tContext\n"; | |
| 31 | |
| 32 # creating trinucleotide context data hash, insertion and deletion counts | |
| 33 my %trinucleotide_context_data; | |
| 34 my %context_tally_across_mutated_to; | |
| 35 my %insertion_hash; | |
| 36 my %deletion_hash; | |
| 37 my $insertion_total; | |
| 38 my $deletion_total; | |
| 39 | |
| 40 | |
| 41 # reading the positional information | |
| 42 my $line_count = 1; | |
| 43 while (<InputPositions>) { | |
| 44 $_ =~ s/\n|\r//; | |
| 45 #print "$_\n"; | |
| 46 my @line = split('\t', $_); | |
| 47 | |
| 48 # getting the chromosome and coordinate fields from input file | |
| 49 # fastahack will need to the chromosome and coordinate to read the information from the reference | |
| 50 my $chromosome = $line[0]; | |
| 51 my $coordinate = $line[1]; | |
| 52 | |
| 53 # get coordinates of first and last character in the context | |
| 54 my $start_region = $coordinate - 1; | |
| 55 my $end_region = $coordinate + 1; | |
| 56 | |
| 57 # if the coordinate is the very first letter on the chromosome, then do not read before that position | |
| 58 # the context becomes 2 letter code, as opposed to a trinucleotide | |
| 59 if ( $start_region == 0 ) { | |
| 60 $start_region = 1; | |
| 61 $end_region = 2; | |
| 62 } | |
| 63 | |
| 64 #print "$Fastahack -r $chromosome:$start_region..$end_region $Reference\n"; | |
| 65 my $context = `$Fastahack -r $chromosome:$start_region..$end_region $Reference`; | |
| 66 | |
| 67 # capitalize context letters | |
| 68 $context = uc($context); | |
| 69 | |
| 70 # split germline column into germline allele and mutated_to allele | |
| 71 # my @germline = split ('/', $line[6]); | |
| 72 | |
| 73 # if germline allele does not equal reference allele, print "start_region germline allele end_region" | |
| 74 # specifically, replace the middle letter of the context with the germline allele | |
| 75 #print "$germline[0], $germline[1]\n"; | |
| 76 # if ($germline[0] ne $germline[1]) { | |
| 77 # print "germline/reference mismatch, line number $line_count\n"; | |
| 78 # if ($coordinate != 1) { | |
| 79 # substr($context,1,1)= $germline[1]; | |
| 80 # } | |
| 81 # else { | |
| 82 # substr($context,0,1)= $germline[1]; | |
| 83 # } | |
| 84 # } | |
| 85 | |
| 86 print OutputTrinucleotideContext "$_\t$context"; | |
| 87 | |
| 88 | |
| 89 | |
| 90 ############################### | |
| 91 # new section: forming the data structure | |
| 92 ############################### | |
| 93 | |
| 94 # to create N_N contexts for data structure, context_code is defined as the trinucleotide context with a blank middle allele | |
| 95 my $context_code=$context; | |
| 96 $context_code =~ s/\n|\r//; | |
| 97 substr($context_code,1,1) = "_"; | |
| 98 | |
| 99 # create variables for mutated_from and mutated_to nucleotides | |
| 100 my $mutated_from = $line[9]; | |
| 101 my $mutated_to = $line[10]; | |
| 102 | |
| 103 # define length of insertions and deletions | |
| 104 # if ($mutated_from eq "-") { | |
| 105 my $insertion_length = length( $mutated_to ); | |
| 106 # } | |
| 107 | |
| 108 # if ($mutated_to eq "-") { | |
| 109 my $deletion_length = length( $mutated_from ); | |
| 110 # } | |
| 111 | |
| 112 # context_codes are totalled | |
| 113 $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to} = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to} + 1; | |
| 114 $context_tally_across_mutated_to{$context_code}{$mutated_from} = $context_tally_across_mutated_to{$context_code}{$mutated_from} + 1; | |
| 115 | |
| 116 # insertion and deletion lengths are totalled | |
| 117 if ($mutated_from eq "-") { | |
| 118 $insertion_hash{$insertion_length} = $insertion_hash{$insertion_length} + 1; | |
| 119 } | |
| 120 if ($mutated_to eq "-") { | |
| 121 $deletion_hash{$deletion_length} = $deletion_hash{$deletion_length} + 1; | |
| 122 } | |
| 123 | |
| 124 # total insertions and deletions | |
| 125 if ($mutated_from eq "-") { | |
| 126 $insertion_total = $insertion_total + 1; | |
| 127 } | |
| 128 if ($mutated_to eq "-") { | |
| 129 $deletion_total = $deletion_total + 1; | |
| 130 } | |
| 131 | |
| 132 | |
| 133 | |
| 134 # to keep track of progress | |
| 135 unless ($line_count%10000) { | |
| 136 print "processed $line_count lines\n"; | |
| 137 } | |
| 138 $line_count++; | |
| 139 } | |
| 140 # end working through the input file | |
| 141 | |
| 142 # print total number of mutations | |
| 143 my $mutation_total = $line_count - 1; | |
| 144 print "Number of Mutations -- $mutation_total\n"; | |
| 145 | |
| 146 # define the output file name for insertions, deletions, and overall likelihoods. Open files for writing | |
| 147 my $insertion_file_name = "Leukemia_OPEN_insLength.prob"; | |
| 148 open(my $insertion_prob_handle, '>', $insertion_file_name) || die("Could not open file!"); | |
| 149 | |
| 150 my $deletion_file_name = "Leukemia_OPEN_delLength.prob"; | |
| 151 open(my $deletion_prob_handle, '>', $deletion_file_name) || die("Could not open file!"); | |
| 152 | |
| 153 my $overall_file_name = "Leukemia_OPEN_overall.prob"; | |
| 154 open(my $overall_prob_handle, '>', $overall_file_name) || die("Could not open file!"); | |
| 155 | |
| 156 # print overall likelihood file headers | |
| 157 print $overall_prob_handle "mutation_type\tprobability\n"; | |
| 158 | |
| 159 # print insertions and deletion probabilities out of all mutations | |
| 160 my $insertion_prob_all = $insertion_total / $mutation_total; | |
| 161 my $deletion_prob_all = $deletion_total / $mutation_total; | |
| 162 print $overall_prob_handle "insertion\t$insertion_prob_all\ndeletion\t$deletion_prob_all\n"; | |
| 163 # print $overall_prob_handle "Deletion Probability -- $deletion_prob_all\n"; | |
| 164 | |
| 165 # print InDel totals | |
| 166 print "Insertions $insertion_total\n"; | |
| 167 print "Deletions $deletion_total\n"; | |
| 168 | |
| 169 # print insertion and deletion headers | |
| 170 print $insertion_prob_handle "insertion_length\tprobability\n"; | |
| 171 print $deletion_prob_handle "deletion_length\tprobability\n"; | |
| 172 | |
| 173 # calculate InDel length totals and probability out of total number of insertions/deletions. Print probabilities to file. | |
| 174 foreach my $insertion_length (sort(keys %insertion_hash)) { | |
| 175 my $insertion_probability; | |
| 176 $insertion_probability = $insertion_hash{$insertion_length}/$insertion_total; | |
| 177 print $insertion_prob_handle "$insertion_length\t$insertion_probability\n"; | |
| 178 print "Insertion, $insertion_length, total , $insertion_hash{$insertion_length}\n"; | |
| 179 } | |
| 180 foreach my $deletion_length (sort(keys %deletion_hash)) { | |
| 181 my $deletion_probability; | |
| 182 $deletion_probability = $deletion_hash{$deletion_length}/$deletion_total; | |
| 183 print $deletion_prob_handle "$deletion_length\t$deletion_probability\n"; | |
| 184 print "Deletion, $deletion_length, total, $deletion_hash{$deletion_length}\n"; | |
| 185 } | |
| 186 | |
| 187 | |
| 188 # define nucleotide array | |
| 189 my @nucleotides = ("A", "C", "G", "T"); | |
| 190 | |
| 191 foreach my $nt1 (@nucleotides) { | |
| 192 foreach my $nt3 (@nucleotides) { | |
| 193 | |
| 194 # define the output file name and open it for writing | |
| 195 my $trinucleotide_SNP_probability_file_name = "Leukemia_OPEN_".$nt1."-".$nt3.".trinuc"; | |
| 196 open(my $trinuc_prob_handle, '>', $trinucleotide_SNP_probability_file_name) || die("Could not open file!"); | |
| 197 | |
| 198 | |
| 199 # print trinucleotide contexts and corresponding totals for every mutated_to nucleotide | |
| 200 my $context_code=$nt1."_".$nt3; | |
| 201 | |
| 202 #foreach my $mutated_from_nucl_key (keys %{ $trinucleotide_context_data{$context_code} }) { | |
| 203 foreach my $mutated_from (@nucleotides) { | |
| 204 # define the "mutated_to" keys in trinuc context hash | |
| 205 # my $mutated_to_nucl_key; | |
| 206 | |
| 207 # the sum is only across mutated_to, and will be redefined for each mutated_from | |
| 208 my $context_sum_across_mutated_to = 0; | |
| 209 my $context_sum_across_indel = 0; | |
| 210 | |
| 211 # print "\nRaw counts for mutated_from $mutated_from \n"; | |
| 212 | |
| 213 | |
| 214 # foreach $mutated_to_nucl_key (keys %{ $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key} }) { | |
| 215 foreach my $mutated_to (@nucleotides) { | |
| 216 my $mutated_from_length = length( $mutated_from ); | |
| 217 my $mutated_to_length = length( $mutated_to ); | |
| 218 if ( $mutated_from_length == 1 ) { | |
| 219 if ( $mutated_from ne "-" ) { | |
| 220 if ( $mutated_to_length == 1 ) { | |
| 221 if ( $mutated_to ne "-" ) { | |
| 222 # print "$context_code, $mutated_from_nucl_key, $mutated_to_nucl_key -- $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key}{$mutated_to_nucl_key}\n"; | |
| 223 $context_sum_across_mutated_to = $context_sum_across_mutated_to + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}; | |
| 224 }# end if statement | |
| 225 else { | |
| 226 $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}; | |
| 227 }# end else statement | |
| 228 }# end if statement | |
| 229 else { | |
| 230 $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}; | |
| 231 }# end else statement | |
| 232 }# end if statement | |
| 233 else { | |
| 234 $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}; | |
| 235 }# end else statement | |
| 236 }# end if statement | |
| 237 else { | |
| 238 $context_sum_across_indel = $context_sum_across_indel + $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}; | |
| 239 }# end else statement | |
| 240 # print "$context_code, $mutated_from, $mutated_to-- $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}\n"; | |
| 241 }# end of loop over mutated_to | |
| 242 | |
| 243 # print "\nProbabilities for mutated_from $mutated_from:\n"; | |
| 244 | |
| 245 | |
| 246 foreach my $mutated_to (@nucleotides) { | |
| 247 #foreach $mutated_to_nucl_key (keys %{ $trinucleotide_context_data{$context_code}{$mutated_from_nucl_key} }) { | |
| 248 my $mutated_from_length = length( $mutated_from); | |
| 249 my $mutated_to_length = length( $mutated_to); | |
| 250 if ( $mutated_from_length == 1 ) { | |
| 251 if ( $mutated_from ne "-" ) { | |
| 252 if ( $mutated_to_length == 1 ) { | |
| 253 if ( $mutated_to ne "-" ) { | |
| 254 my $SNP_probability; | |
| 255 if ( $context_sum_across_mutated_to == 0 ) { | |
| 256 $SNP_probability = 0; | |
| 257 } | |
| 258 else { | |
| 259 $SNP_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_mutated_to; | |
| 260 } | |
| 261 if ( $mutated_to eq "T" ) { | |
| 262 print $trinuc_prob_handle "$SNP_probability"; | |
| 263 } | |
| 264 else { | |
| 265 # print "$context_code, $mutated_from, $mutated_to, context_sum_across_mutated_to=$context_sum_across_mutated_to -- $SNP_probability\n"; | |
| 266 print $trinuc_prob_handle "$SNP_probability\t"; | |
| 267 } | |
| 268 }# end of if statement | |
| 269 else { | |
| 270 my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel; | |
| 271 # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n"; | |
| 272 }# end else statement | |
| 273 }# end of if statement | |
| 274 else { | |
| 275 # my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel; | |
| 276 # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n"; | |
| 277 }# end else statement | |
| 278 }# end of if statement | |
| 279 else { | |
| 280 # my $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel; | |
| 281 # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n"; | |
| 282 }# end else statement | |
| 283 }# end of if statement | |
| 284 else { | |
| 285 my $indel_probability; | |
| 286 if ( $context_sum_across_indel = 0 ) { | |
| 287 $indel_probability = 0; | |
| 288 } | |
| 289 else { | |
| 290 # $indel_probability = $trinucleotide_context_data{$context_code}{$mutated_from}{$mutated_to}/$context_sum_across_indel; | |
| 291 # print $indel_prob_handle "$context_code, $mutated_from, $mutated_to, context_sum_across_indel=$context_sum_across_indel -- $indel_probability\n"; | |
| 292 } | |
| 293 }# end else statement | |
| 294 }# end of loop over mutated_to | |
| 295 print $trinuc_prob_handle "\n"; | |
| 296 | |
| 297 }# end of loop over mutated_from | |
| 298 | |
| 299 # print "\n\n"; | |
| 300 | |
| 301 | |
| 302 }# end loop over nt3 | |
| 303 }# end loop over nt1 | |
| 304 | |
| 305 | |
| 306 | |
| 307 |
