comparison call_consensus_from_sam_3.pl @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 #!/usr/bin/perl
2 # This calls a consensus based on the sam file and does not use qual scores etc it simply counts up each event and calls a simple most observations consensus
3 # the sam file must be sorted in order to cope with indels.
4 # usgae is ./call_consensus_from_sam_3.pl aligned.sam genome.fasta 10
5 # the number 10 is the minimum mapq you want to consider
6
7
8
9 open (INFILEFASTA, "$ARGV[1]"); # opens files
10
11 open (INFILESAM, "$ARGV[0]");
12 $full_len_cut = 1000;
13 $min_mapq = $ARGV[2];
14 $full_len_count = 0;
15 open (OUT, ">$ARGV[0].Observed_differences_list.txt");
16 open (OUTINDELS, ">$ARGV[0].Indels_applied_list.txt");
17 open (OUTMINOR, ">$ARGV[0].minor_variants.txt");
18 print OUTMINOR "Genome\tPosition\tA\tC\tG\tT\tDepth\n";
19 open (OUTMINORB, ">$ARGV[0].dominant_minor_variant.txt");
20 print OUTMINORB "Genome\tPosition\tA\tC\tG\tT\tDominat nucleotide\tFrequency\tTotal Depth\n";
21 print OUT "Genome\tPosition\tReference nucleotide\tGATC Depth count\tG count\tA count\tT count\tC count\tConsensus (. means no change from reference)\tDeletions\tTotal insertions recorded\tThis insertion most popular\tThis many recorded most popular insertions\n";
22 open (OUTNEWG, ">$ARGV[0].corrected_genome_snps_only.txt");
23 open (OUTNEWGINDEL, ">$ARGV[0].corrected_genome_snps_and_indels.txt");
24 open (OUTINDELKEY, ">$ARGV[0].key_indels.txt");
25 open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt");
26 open (OUTFULLSAM, ">$ARGV[0].full_len_sam.sam");
27 $start_time = time;
28
29 %for_translation = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
30 %rev_translation = (GGC=>"A", ACT=>"S", TCA=>"*", ACA=>"C", TCG=>"R", GAT=>"I", GTT=>"N", GCT=>"S", GTA=>"Y", TGT=>"T", CGA=>"S", CGG=>"P", CAG=>"L", TGC=>"A", CAC=>"V", CTT=>"K", AAC=>"V", GTG=>"H", TCT=>"R", GGT=>"T", TGG=>"P", CCA=>"W", GAG=>"L", GCG=>"R", CAA=>"L", TTA=>"*", CTG=>"Q", CGT=>"T", CAT=>"M", TTT=>"K", TAC=>"V", CTA=>"*", AAG=>"L", TCC=>"G", GAC=>"V", GCA=>"C", TGA=>"S", AAT=>"I", ATA=>"Y", ATT=>"N", AGT=>"T", TTG=>"Q", GTC=>"D", ACC=>"G", GGA=>"S", AAA=>"F", CCT=>"R", ACG=>"R", CCG=>"R", ATG=>"H", TAT=>"I", GGG=>"P", CCC=>"G", TAA=>"L", CTC=>"E", TAG=>"L", ATC=>"D", AGA=>"S", GAA=>"F", CGC=>"A", GCC=>"G", AGC=>"A", TTC=>"E", AGG=>"P");
31 %base_pair = (G=>"A", A=>"T", T=>"A", C=>"G");
32
33 print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n";
34
35
36 %transcripts = ();
37 %orf_hash =();
38 %peptides_pep_score = ();
39 %unique_fastas = ();
40 %peptides_and_utn = ();
41 %utn_and_peptides = ();
42 %fasta_head_and_peptides = ();
43 %indel_tab = ();
44
45 $fasta_out ="";
46 $fasta_header = "Not_started_yet";
47 while ($fasta_line = <INFILEFASTA>)
48 {
49 chomp $fasta_line;
50 print "\n$fasta_line\n";
51 if (substr($fasta_line,0,1) eq ">") {
52
53 if ($fasta_header ne "Not_started_yet") {
54 #print "\n\nFasta header $fasta_header\n\n";
55 @temp = split (/\s+/, $fasta_header);
56 $fasta_head_no_sp = $temp[0];
57 #print "\n\nAfter split $fasta_head_no_sp\n\n";
58 substr($fasta_head_no_sp,0,1) = "";
59 $transcripts {$fasta_head_no_sp} = $fasta_out;
60
61 #print "\n\n$fasta_head_no_sp\n\n";
62 }
63
64
65 $fasta_header = $fasta_line;
66
67
68 $fasta_out ="";
69 }
70 if (substr($fasta_line,0,1) ne ">") {$fasta_out = $fasta_out.$fasta_line;}
71 }
72 ($fasta_head_no_sp, $restofit) = split (/ /, $fasta_header);
73 substr($fasta_head_no_sp,0,1,"");
74 $transcripts {$fasta_head_no_sp} = $fasta_out;
75
76 #print "\n$fasta_header\n\n$fasta_head_no_sp\n";
77
78 $faster_header = "";
79 $chrom_proc = 0;
80 $chromosome = "notstartedyet";
81 $old_chromosome = "";
82 #exit;
83
84
85 $sam_count = 0;
86
87 $sam_header= "";
88 $transcript_count = 0;
89
90 open (INFILESAM, "$ARGV[0]");
91
92 while ($sam_line = <INFILESAM>)
93 {
94 if (substr($sam_line,0,1) eq "@") {next;}
95 #if (($sam_count % 10000) == 0) {print "$sam_count entries processed\n";}
96 #print "$sam_count\n";
97 $sam_count ++;
98 chomp $sam_line;
99 @sam_cells = split (/\t/, $sam_line);
100 if ($sam_cells[2] eq "*") {next;}
101 if (($sam_cells[2] ne $chromosome) and ($chromosome ne "notstartedyet")) {
102 &process_data;
103 $chromosome = $sam_cells[2];
104 $genome = $transcripts{$chromosome};
105 $len_gen = length $genome;
106 delete $transcripts{$chromosome};
107 $chrom_proc ++;
108 print "hi $chromosome is being processed this is chromosome number $chrom_proc\n";
109 %g = ();
110 %a = ();
111 %t = ();
112 %c = ();
113 %ins = ();
114 %del = ();
115 %depth = ();
116
117 }
118 # Data processed
119 if ($chromosome eq "notstartedyet") {
120
121 $chromosome = $sam_cells[2];
122 $genome = $transcripts{$chromosome};
123 $len_gen = length $genome;
124 delete $transcripts{$chromosome};
125 $chrom_proc ++;
126 print "hi $chromosome is being processed this is chromosome number $chrom_proc\n";
127 %g = ();
128 %a = ();
129 %t = ();
130 %c = ();
131 %ins = ();
132 %del = ();
133 %depth = ();
134
135
136 }
137
138
139
140 $utn = $sam_cells[0];
141 $mapq = $sam_cells[4];
142 $sequence = uc $sam_cells[9];
143 $seq_len = length $sequence;
144 $sequence =~ tr/Uu/TT/;
145 #print "$sam_cells[5]\n";
146 if ($min_mapq > $mapq) {next;}
147 #print "OK";
148 $flag = $sam_cells [1];
149 $genome_position = $sam_cells[3] - 1;
150 $fl_pos = $genome_position;
151 if ($genome_position < $full_len_cut) {$full_len_flag = "S";
152 #if (length $sequence > 28000) {print "possible full len\n";}
153
154 } else {$full_len_flag = "I";}
155 $sam_position = 0;
156 @cigar = split(/(M|I|D|N|S|H|P|X|=)/, $sam_cells[5]);
157 $array_cigar = 1;
158 $temp_len = 0;
159 while (length($cigar[$array_cigar]) >=1){
160 $cigar_value = $cigar[$array_cigar - 1];
161 if (($cigar[$array_cigar] eq "M") or ($cigar[$array_cigar] eq "I") or ($cigar[$array_cigar] eq "S")){$temp_len = $temp_len + $cigar_value;}
162 $array_cigar = $array_cigar + 2;
163 }
164 $tran_len = length $sequence;
165 if ($tran_len ne $temp_len) {print "cigar fail\n";next;}
166 $array_cigar = 1;
167 while (length($cigar[$array_cigar]) >=1){
168 #print "cigar entry is $cigar[$array_cigar]\n";
169 $cigar_value = $cigar[$array_cigar - 1];
170 if ($cigar[$array_cigar] =~ /[H]/){
171 #nothing to do
172 }
173 if (($cigar[$array_cigar] =~ /[S]/) and (($array_cigar + 1) < scalar (@cigar)))
174 {
175 $soft_clip = $cigar_value;
176 $sam_position = $sam_position + $cigar_value;
177 $seq_len = $seq_len - $cigar_value;
178 }
179
180
181 if ($cigar[$array_cigar] =~ /[M]/){
182 $temp_count = 1;
183 #print "M";
184 while ($temp_count <= $cigar_value)
185 {
186 $depth{$genome_position} ++;
187 $alt = substr($sequence, $sam_position, 1);
188 #print "alternative $alt\n";
189 if ($alt eq "G") {$g{$genome_position} ++;}
190 if ($alt eq "A") {$a{$genome_position} ++;}
191 if ($alt eq "T") {$t{$genome_position} ++;}
192 if ($alt eq "C") {$c{$genome_position} ++;}
193 $temp_count ++;
194 $genome_position ++;
195 $sam_position ++;
196 $fl_pos ++;
197 }
198
199 }
200
201 #if (($cigar[$array_cigar] =~ /[D]/) and ($cigar_value >4)) {$cigar[$array_cigar] = "N";}
202
203 if ($cigar[$array_cigar] =~ /[D]/){
204 $temp_cv = $cigar_value - 1;
205 $tmp_name = "$chromosome\tDeletion\t$genome_position\t$cigar_value\t ";
206 if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;}
207 while ($temp_cv >= 0){
208 if (exists $del{$genome_position + $temp_cv}) {$del{$genome_position + $temp_cv} ++;} else {$del{$genome_position + $temp_cv} = 1;}
209 $temp_cv --;
210 }
211 $genome_position = $genome_position + $cigar_value;
212 $fl_pos = $fl_pos + $cigar_value;
213 }
214
215 if ($cigar[$array_cigar] =~ /[I]/){
216
217 $insertion = substr ($sequence, $sam_position, $cigar_value);
218 $tmp_name = "$chromosome\tInsertion\t$genome_position\t$cigar_value\t$insertion";
219 if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;}
220 if (exists $ins{$genome_position}) {$ins{$genome_position} = $ins{$genome_position}."\t$insertion";} else {$ins{$genome_position} = $insertion;}
221
222 $sam_position = $sam_position + $cigar_value;
223
224 }
225
226 if ($cigar[$array_cigar] =~ /[N]/){
227
228 $genome_position = $genome_position + $cigar_value;
229
230 }
231 $array_cigar = $array_cigar +2;
232
233 }
234 #if ($fl_pos > ($len_gen - 30)) {print "Stops\n";}
235 #if (($fl_pos > ($len_gen - $full_len_cut)) and ($full_len_flag eq "S")) {print OUTFULLSAM "$sam_line\n"; print "Full length found\n"; $full_len_count ++;}
236 if ($seq_len > ($len_gen - $full_len_cut)) {print OUTFULLSAM "$sam_line\n"; print "Full length found $seq_len X $len_gen\n"; $full_len_count ++;}
237
238 }
239
240
241
242
243 #all done just need to process the last chromosome...
244 &process_data;
245
246 foreach $keys (keys %transcripts){
247 $genome = $transcripts{$keys};
248 print OUTNEWGINDEL ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n";
249 print OUTNEWG ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n";
250 }
251
252
253
254 $time_elapsed = time - $start_time;
255
256 print "Processed $sam_count entries for $chrom_proc chromosomes in $time_elapsed seconds\nFull length entries is $full_len_count\n";
257
258 exit;
259
260
261 sub process_data { #now to process all the data on this chromosome before moving on to the next one
262 $genome_position = 0;
263 $len_genome = length($genome);
264 $old_genome = $genome;
265 open (TEMP, ">temp.txt");
266
267
268 while ($genome_position <= $len_genome){
269 #print "at $genome_position\n";
270 %ins_hash = ();
271 %del_hash = ();
272 $ref_nucleotide = substr ($genome, $genome_position, 1);
273 $sam_con = $ref_nucleotide;
274 $temp_pos = $genome_position + 1;
275
276 if ($depth{$genome_position} > 0) {
277 $A = $a{$genome_position}/$depth{$genome_position};
278 $C = $c{$genome_position}/$depth{$genome_position};
279 $G = $g{$genome_position}/$depth{$genome_position};
280 $T = $t{$genome_position}/$depth{$genome_position};
281 $test = -1;
282 $consensus = "N";
283 if ($A > $test){$test = $A; $consensus = "A";}
284 if ($C > $test){$test = $C; $consensus = "C";}
285 if ($G > $test){$test = $G; $consensus = "G";}
286 if ($T > $test){$test = $T; $consensus = "T";}
287 if ($consensus eq "A") {$A = $A * -1; $test = $A; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";}
288 if ($consensus eq "C") {$C = $C * -1; $test = $C; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";}
289 if ($consensus eq "G") {$G = $G * -1; $test = $G; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";}
290 if ($consensus eq "T") {$T = $T * -1; $test = $T; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";}
291 if ($consensus eq "N") {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n";}
292 $major = $test * -1;
293 if ($A > $test){$test = $A; $secconsensus = "A";}
294 if ($C > $test){$test = $C; $secconsensus = "C";}
295 if ($G > $test){$test = $G; $secconsensus = "G";}
296 if ($T > $test){$test = $T; $secconsensus = "T";}
297
298 if ($secconsensus eq "A") {print OUTMINORB "$chromosome\t$temp_pos\t$A\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";}
299 if ($secconsensus eq "C") {print OUTMINORB "$chromosome\t$temp_pos\t0\t$C\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";}
300 if ($secconsensus eq "G") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t$G\t0\t$consensus\t$major\t$depth{$genome_position}\n";}
301 if ($secconsensus eq "T") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t$T\t$consensus\t$major\t$depth{$genome_position}\n";}
302 if ($secconsensus eq "N") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";}
303
304 } else {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n"; print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\n";}
305
306 if ($depth{$genome_position} < 3) {
307 if ($len_genome <100000){
308 $sam_con = ".";
309 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n";
310 }
311 $genome_position ++;
312 next;
313 }
314
315 $temp_most = 0;
316 $temp_most_ins = "";
317 $ins_count = 0;
318 if (exists $ins{$genome_position}){
319 $temp_ins = $ins{$genome_position};
320 @insertions = split (/\t/, $temp_ins);
321 $ins_count = scalar @insertions;
322 foreach $key (@insertions)
323 {
324 $ins_hash{$key} ++;
325 # print "found $key\n";
326 }
327 $temp_most = 0;
328 $temp_most_ins = "";
329 foreach $key (keys %ins_hash){
330 # print "$key $ins_hash{$key}\n";
331 if ($ins_hash{$key} > $temp_most) {$temp_most = $ins_hash{$key}; $temp_most_ins = $key;}
332 }
333
334
335
336 #if ($ins_count > $depth{$genome_position}) {print OUT "POSSIBLE $ins_count INSERTION AT $genome_position\t $ins{$genome_position}\n";}
337 if ($temp_most > ($depth{$genome_position}) ) {
338 print OUT "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t $ins{$genome_position}\n";
339
340 print TEMP "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t$temp_most_ins\t$genome_position\t$ins{$genome_position}\n";
341 }
342 if ($temp_most > (($depth{$genome_position})/10)) {
343 $indel_proportion = $temp_most/$depth{$genome_position};
344 #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n";
345 print OUTINDELS "$chromosome\tInsertion\t$genome_position\t$ins_count\t$temp_most_ins\t$depth{$genome_position}\t$temp_most_ins\t$indel_proportion\n";
346 }
347
348
349 }
350
351 if (exists $del{$genome_position}){
352
353 if ($del{$genome_position} > $depth{$genome_position}) {
354 print OUT "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n";
355 #print OUTINDELS "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n";
356 print TEMP "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos most frequent is $temp_most_dels deletion with $del{$genome_position}\t1\t$genome_position\t$del{$genome_position}\n";
357 }
358 #if ($del_count > 10) {print OUT "POSSIBLE $del_count deletion AT $genome_position\t $del{$genome_position}\n";}
359
360 if ($del{$genome_position} > (($depth{$genome_position})/10)) {
361 $indel_proportion = $del{$genome_position}/$depth{$genome_position};
362 #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n";
363 print OUTINDELS "$chromosome\tDeletion\t$genome_position\t$del{$genome_position}\t$temp_most_dels\t$depth{$genome_position}\t$temp_most_dels\t$indel_proportion\n";
364 }
365
366
367 }
368
369
370
371
372
373
374 $g_flag = 0;
375 $a_flag = 0;
376 $t_flag = 0;
377 $c_flag = 0;
378 $amb = "N";
379 $top_nuc = 0;
380 $top_nucleotide = "";
381 $sec_nuc = 0;
382 $second_nucleotide = "";
383 $ref_nuc = 0;
384 if ($ref_nucleotide eq "G") {$ref_nuc = $g{$genome_position};}
385 if ($ref_nucleotide eq "A") {$ref_nuc = $a{$genome_position};}
386 if ($ref_nucleotide eq "T") {$ref_nuc = $t{$genome_position};}
387 if ($ref_nucleotide eq "C") {$ref_nuc = $c{$genome_position};}
388 if ($g{$genome_position} >= $top_nuc){
389 $sam_con = "G";
390 #$g_flag = 1;
391 $second_nucleotide = $top_nucleotide;
392 $sec_nuc = $top_nuc;
393 $top_nuc = $g{$genome_position};
394 $top_nucleotide = "G";
395 #if ($ref_nucleotide eq "G") {$amb ="G";}
396 }
397 if ($a{$genome_position} >= $top_nuc){
398 $sam_con = "A";
399 #$a_flag = 1;
400 $second_nucleotide = $top_nucleotide;
401 $sec_nuc = $top_nuc;
402 $top_nuc = $a{$genome_position};
403 $top_nucleotide = "A";
404 #if ($ref_nucleotide eq "A") {$amb ="A";}
405 }
406 if ($t{$genome_position} >= $top_nuc){
407 $sam_con = "T";
408 #$t_flag = 1;
409 $second_nucleotide = $top_nucleotide;
410 $sec_nuc = $top_nuc;
411 $top_nuc = $t{$genome_position};
412 $top_nucleotide = "T";
413 # if ($ref_nucleotide eq "T") {$amb ="T";}
414 }
415 if ($c{$genome_position} >= $top_nuc){
416 $sam_con = "C";
417 #$c_flag = 1;
418 $second_nucleotide = $top_nucleotide;
419 $sec_nuc = $top_nuc;
420 $top_nuc = $c{$genome_position};
421 $top_nucleotide = "C";
422 # if ($ref_nucleotide eq "C") {$amb ="C";}
423 }
424
425 #print "This is G's recoded at this location $g{$genome_position}\n";
426 if (($g{$genome_position} >= $a{$genome_position}) and ($g{$genome_position} >= $t{$genome_position}) and ($g{$genome_position} >= $c{$genome_position}) and ($g{$genome_position} >= 1)) {
427 $sam_con = "G"; $g_flag = 1;
428 if ($ref_nucleotide eq "G") {$amb ="G";}
429 }
430 if (($a{$genome_position} >= $g{$genome_position}) and ($a{$genome_position} >= $t{$genome_position}) and ($a{$genome_position} >= $c{$genome_position}) and ($a{$genome_position} >= 1)) {
431 $sam_con = "A"; $a_flag = 1;
432 if ($ref_nucleotide eq "A") {$amb ="A";}
433 }
434 if (($t{$genome_position} >= $g{$genome_position}) and ($t{$genome_position} >= $a{$genome_position}) and ($t{$genome_position} >= $c{$genome_position}) and ($t{$genome_position} >= 1)) {
435 $sam_con = "T"; $t_flag = 1;
436 if ($ref_nucleotide eq "T") {$amb ="T";}
437 }
438 if (($c{$genome_position} >= $g{$genome_position}) and ($c{$genome_position} >= $a{$genome_position}) and ($c{$genome_position} >= $t{$genome_position}) and ($c{$genome_position} >= 1)) {
439 $sam_con = "C"; $c_flag = 1;
440 if ($ref_nucleotide eq "C") {$amb ="C";}
441 }
442
443 if (($g_flag + $a_flag + $t_flag + $c_flag) > 1) {
444 print OUT "ambiguity chromosome $chromosome at $temp_pos $g_flag G $a_flag A $t_flag T $c_flag C counts are G $g{$genome_position} A $a{$genome_position} T $t{$genome_position} C $c{$genome_position} \n";
445 if ($amb ne "N") {$sam_con = $amb;}
446 }
447
448 if ($sam_con ne $ref_nucleotide) {
449 #print "change\n";
450 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n";
451 } else {
452 if (($len_genome <100000) or ($temp_most > $depth{$genome_position}) or ($del{$genome_position} > $depth{$genome_position})){
453 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t.\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n";
454 }
455 }
456
457
458 substr ($genome, $genome_position, 1, $sam_con);
459
460 $genome_position ++;
461 #print "at second $genome_position\n";
462
463 }
464
465
466 close TEMP;
467 $new_genome_w_indels = $genome;
468 open (TEMP, "temp.txt");
469 @indels = reverse<TEMP>;
470 foreach $line (@indels)
471 {
472 chomp $line;
473 @indels_cells = split(/\t/, $line);
474 $change = $indels_cells[1];
475 $type = $indels_cells[0];
476 $genome_position = $indels_cells[2];
477 if (index ($type, " INSERTION ") > 0) {
478 substr ($new_genome_w_indels, $genome_position, 0, $change);
479 }
480 if (index ($type, " deletion ") > 0) {
481 substr ($new_genome_w_indels, $genome_position, 1, "");
482 }
483
484 }
485
486
487
488 print OUTNEWGINDEL ">New_".$chromosome."_genome_with_indels_is\n$new_genome_w_indels\n";
489 print OUTNEWG ">New_".$chromosome."_genome_is\n$genome\n";
490 open (CTSO, ">$ARGV[0].CTSO.txt");
491 $warning = "";
492 #open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt");
493 # open (OUTINDELKEY, ">$ARGV[0].key_indels.txt");
494 # $tmp_name = "$chromosome\tDeletion\t$genome_position\t$temp_cv\t$insertion"; temp_cv is the size of the indel
495 print OUTINDELKEY "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n";
496 print OUTINDELKEYSIG "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n";
497 foreach $key (keys %indel_tab){
498 $value = $indel_tab{$key};
499 @array = split(/\t/, $key);
500 $genome_position = $array[2]; $indel_len = $array[3];
501 if ($depth{$genome_position} > 0) {$a = ($value / $depth{$genome_position}); $tmp_proportion = sprintf ("%.2f",$a);} else {$tmp_proportion = "zero depth here";}
502
503 print OUTINDELKEY "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";
504 if ($tmp_proportion > 0.1) {
505
506 if ($indel_len > 6) {print "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n";
507 print CTSO "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n";
508 print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";
509 if (($value > 9) or ($depth{$genome_position} > 9)) {$warning = $warning."$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";}
510 }
511 #print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";
512 }
513
514 }
515 if ($warning ne "") {print OUTINDELKEYSIG "\n\nEspecially take a moment to look at these in the above list...CtSo!\n\n$warning\n";}
516 }
517