Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/interpolate_sam.pl @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 | 
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:903fc43d6227 | 
|---|---|
| 1 #!/usr/bin/perl | |
| 2 use strict; | |
| 3 | |
| 4 ###Builds interpolated pileup from SAM file | |
| 5 ##@description counts bases between paired ends and piles up single end reads. | |
| 6 ##@output, uses a #header for the RNAME and then the number of reads per base | |
| 7 ##@author sm8@sanger.ac.uk, Stephen B. Montgomery | |
| 8 | |
| 9 ##@caveats | |
| 10 ##Requires RNAME to have format as per example | |
| 11 ## chromosome:NCBI36:18:1:76117153:1 | |
| 12 ## supercontig::NT_113883:1:137703:1 | |
| 13 ## clone::AC138827.3:1:149397:1 | |
| 14 ##Expects simple CIGAR characters, M, I and D | |
| 15 ##Expects SAM file to be sorted. | |
| 16 ##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77) | |
| 17 | |
| 18 ##Verify and read in SAM file | |
| 19 my $sam_file = $ARGV[0]; | |
| 20 if(!defined($sam_file)) { die("No sam file defined on arg 1"); } | |
| 21 unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); } | |
| 22 open(SAM, $sam_file) || die("Cannot open sam file"); | |
| 23 | |
| 24 ##Globals | |
| 25 my $current_location = ""; ##Current RNAME being processed | |
| 26 my $current_size = 0; ##Size of sequence region being processed | |
| 27 my $current_position = 1; ##Current base being processed | |
| 28 my $open = 0; ##Number of open reads (PE reads that have not been closed) | |
| 29 my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the | |
| 30 ##contained value from those open and deletes the indexed position from the hash | |
| 31 | |
| 32 while (my $line = <SAM>) { | |
| 33 my @tokens = split /\t/, $line; | |
| 34 | |
| 35 if ($current_location ne $tokens[2]) { ##Start a new sequence region | |
| 36 for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region | |
| 37 if (defined($close{$i})) { | |
| 38 $open = $open - $close{$i}; | |
| 39 delete $close{$i}; | |
| 40 } | |
| 41 print $open . "\n"; | |
| 42 } | |
| 43 if ($current_location ne "") { | |
| 44 print "\n"; | |
| 45 } | |
| 46 | |
| 47 ##Initiate a new sequence region | |
| 48 my @location_tokens = split /:/, $tokens[2]; | |
| 49 $current_position = 1; | |
| 50 $current_location = $tokens[2]; | |
| 51 $current_size = $location_tokens[4]; | |
| 52 $open = 0; | |
| 53 %close = (); | |
| 54 print "#" . $tokens[2] . "\n"; | |
| 55 | |
| 56 ##Print pileup to just before the first read (will be 0) | |
| 57 for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) { | |
| 58 print $open . "\n"; | |
| 59 } | |
| 60 $current_position = $tokens[3]; | |
| 61 | |
| 62 } else { ##Sequence region already open | |
| 63 if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position | |
| 64 ##cycle through to catch up to the current position | |
| 65 for (my $i = $current_position; $i < $tokens[3]; $i++) { | |
| 66 if (defined($close{$i})) { | |
| 67 $open = $open - $close{$i}; | |
| 68 delete $close{$i}; | |
| 69 } | |
| 70 print $open . "\n"; | |
| 71 } | |
| 72 $current_position = $tokens[3]; | |
| 73 } | |
| 74 } | |
| 75 $open++; ##Increment the number of open reads | |
| 76 | |
| 77 if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition | |
| 78 $open--; | |
| 79 my $parsed_cig = &parseCigar($tokens[5]); | |
| 80 my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; | |
| 81 if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } | |
| 82 $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; | |
| 83 } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition | |
| 84 my $parsed_cig = &parseCigar($tokens[5]); | |
| 85 my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; | |
| 86 if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } | |
| 87 $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; | |
| 88 } else { | |
| 89 #do nothing | |
| 90 } | |
| 91 } | |
| 92 for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region | |
| 93 if (defined($close{$i})) { | |
| 94 $open = $open - $close{$i}; | |
| 95 delete $close{$i}; | |
| 96 } | |
| 97 print $open . "\n"; | |
| 98 } | |
| 99 print "\n"; | |
| 100 close(SAM); | |
| 101 exit(0); | |
| 102 | |
| 103 ##reads and tokenizes simple cigarline | |
| 104 sub parseCigar() { | |
| 105 my $cigar_line = shift; | |
| 106 $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g; | |
| 107 my @cigar_tokens = split /\t/, $cigar_line; | |
| 108 my %parsed = ('M' => 0, | |
| 109 'I' => 0, | |
| 110 'D' => 0); | |
| 111 my @events = (); | |
| 112 for(my $i = 0; $i < scalar(@cigar_tokens); $i++) { | |
| 113 if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) { | |
| 114 if (!defined($parsed{$2})) { $parsed{$2} = 0; } | |
| 115 my $nt = $2; | |
| 116 if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; } | |
| 117 $parsed{$nt} += $1; | |
| 118 my %event_el = ("t" => $nt, | |
| 119 "n" => $1); | |
| 120 push @events, \%event_el; | |
| 121 } | |
| 122 } | |
| 123 $parsed{'events'} = \@events; | |
| 124 return \%parsed; | |
| 125 } | 
