Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/novo2sam.pl @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 # Contact: lh3 | |
4 # Version: 0.1.3 | |
5 | |
6 #Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com) | |
7 | |
8 #use strict; | |
9 #use warnings; | |
10 use Data::Dumper; | |
11 use Getopt::Std; | |
12 | |
13 &novo2sam; | |
14 exit; | |
15 | |
16 sub mating { | |
17 my ($s1, $s2) = @_; | |
18 my $isize = 0; | |
19 if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize | |
20 my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; | |
21 my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; | |
22 $isize = $x2 - $x1; | |
23 } | |
24 # update mate coordinate | |
25 if ($s2->[2] ne '*') { | |
26 @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); | |
27 $s1->[1] |= 0x20 if ($s2->[1] & 0x10); | |
28 } else { | |
29 $s1->[1] |= 0x8; | |
30 } | |
31 if ($s1->[2] ne '*') { | |
32 @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); | |
33 $s2->[1] |= 0x20 if ($s1->[1] & 0x10); | |
34 } else { | |
35 $s2->[1] |= 0x8; | |
36 } | |
37 } | |
38 | |
39 sub novo2sam { | |
40 my %opts = (); | |
41 getopts("p", \%opts); | |
42 die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0); | |
43 my $is_paired = defined($opts{p}); | |
44 # core loop | |
45 my @s1 = (); | |
46 my @s2 = (); | |
47 my ($s_last, $s_curr) = (\@s1, \@s2); | |
48 while (<>) { | |
49 next if (/^#/); | |
50 next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); | |
51 &novo2sam_aux($_, $s_curr, $is_paired); | |
52 if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { | |
53 &mating($s_last, $s_curr); | |
54 print join("\t", @$s_last), "\n"; | |
55 print join("\t", @$s_curr), "\n"; | |
56 @$s_last = (); @$s_curr = (); | |
57 } else { | |
58 print join("\t", @$s_last), "\n" if (@$s_last != 0); | |
59 my $s = $s_last; $s_last = $s_curr; $s_curr = $s; | |
60 } | |
61 } | |
62 print join("\t", @$s_last), "\n" if (@$s_last != 0); | |
63 } | |
64 | |
65 sub novo2sam_aux { | |
66 my ($line, $s, $is_paired) = @_; | |
67 | |
68 chomp($line); | |
69 my @t = split(/\s+/, $line); | |
70 my @variations = @t[13 .. $#t]; | |
71 @$s = (); | |
72 return if ($t[4] ne 'U'); | |
73 my $len = length($t[2]); | |
74 # read name | |
75 $s->[0] = substr($t[0], 1); | |
76 $s->[0] =~ s/\/[12]$//g; | |
77 # initial flag (will be updated later) | |
78 $s->[1] = 0; | |
79 $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7); | |
80 $s->[1] |= 2 if ($t[10] eq '.'); | |
81 # read & quality | |
82 if ($t[9] eq 'R') { | |
83 $s->[9] = reverse($t[2]); | |
84 $s->[10] = reverse($t[3]); | |
85 $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; | |
86 } else { | |
87 $s->[9] = $t[2]; $s->[10] = $t[3]; | |
88 } | |
89 # cigar | |
90 my $cigarstring =""; | |
91 if (scalar @variations ==0 ) { | |
92 $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment | |
93 } else { | |
94 #convert to correct CIGAR | |
95 my $tmpstr = join" ",@variations ; | |
96 if ( $tmpstr=~ /\+|\-/ ) { | |
97 $cigarstring = cigar_method($line,\@variations,$len); | |
98 $s->[5]=$cigarstring; | |
99 } else { | |
100 $s->[5]=$len. "M"; | |
101 } | |
102 } | |
103 | |
104 # coor | |
105 $s->[2] = substr($t[7], 1); $s->[3] = $t[8]; | |
106 $s->[1] |= 0x10 if ($t[9] eq 'R'); | |
107 # mapQ | |
108 $s->[4] = $t[5] > $t[6]? $t[5] : $t[6]; | |
109 # mate coordinate | |
110 $s->[6] = '*'; $s->[7] = $s->[8] = 0; | |
111 # aux | |
112 push(@$s, "NM:i:".(@t-13)); | |
113 my $md = ''; | |
114 $md = mdtag($md,$line,\@variations,$len); | |
115 push(@$s, "MD:Z:$md"); | |
116 | |
117 } | |
118 | |
119 sub mdtag { | |
120 my $oldmd = shift; | |
121 my $line = shift; | |
122 my $ref =shift; | |
123 my $rdlen = shift; | |
124 my @variations = @$ref; | |
125 my $string=""; | |
126 my $mdtag=""; | |
127 my $t=1; | |
128 my $q=1; | |
129 my $deleteflag=0; | |
130 my $len =0; | |
131 foreach $string (@variations) { | |
132 my ($indeltype,$insert) = indeltype($string); | |
133 if ($indeltype eq "+") { | |
134 $len = length ($insert); | |
135 $q+=$len; | |
136 next; | |
137 } | |
138 my $pos = $1 if $string =~ /^(\d+)/; | |
139 $len = $pos - $t; | |
140 if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) { | |
141 $mdtag.=$len; | |
142 } | |
143 $t+=$len; | |
144 $q+=$len; | |
145 if ($indeltype eq ">") { | |
146 $mdtag.=$insert; | |
147 $deleteflag=0; | |
148 $t+=1; | |
149 $q+=1; | |
150 } | |
151 if ($indeltype eq "-") { | |
152 my $deletedbase = $2 if $string =~ /(\d+)\-([A-Za-z]+)/; | |
153 if ($deleteflag == 0 ) { | |
154 $mdtag.="^"; | |
155 } | |
156 $mdtag.=$deletedbase; | |
157 $deleteflag=1; | |
158 $t+=1; | |
159 } | |
160 } | |
161 $len = $rdlen - $q + 1; | |
162 if ($len > 0) { | |
163 $mdtag.="$len"; | |
164 } | |
165 # print "In:$line\n"; | |
166 # print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n"; | |
167 | |
168 return $mdtag; | |
169 } | |
170 | |
171 sub indeltype { | |
172 my $string = shift; | |
173 my $insert=""; | |
174 my $indeltype; | |
175 if ($string =~ /([A-Za-z]+)\>/) { | |
176 $indeltype=">"; | |
177 $insert=$1; | |
178 } elsif ($string =~ /\-/) { | |
179 $indeltype="-"; | |
180 } elsif ($string =~ /\+([A-Za-z]+)/) { | |
181 $indeltype="+"; | |
182 $insert=$1; | |
183 } | |
184 return ($indeltype,$insert); | |
185 | |
186 } | |
187 | |
188 | |
189 sub cigar_method { | |
190 my $line = shift; | |
191 my $ref =shift; | |
192 my $rdlen = shift; | |
193 my @variations = @$ref; | |
194 my $string=""; | |
195 my $type=""; | |
196 my $t =1; | |
197 my $q=1; | |
198 my $indeltype=""; | |
199 my $cigar= ""; | |
200 my $insert = ""; | |
201 my $len=0; | |
202 my @cig=(); | |
203 foreach $string (@variations) { | |
204 next if $string =~ />/; | |
205 my $pos = $1 if $string =~ /^(\d+)/; | |
206 | |
207 if ($string =~ /\+([A-Za-z]+)/) { | |
208 $indeltype="+"; | |
209 $insert = $1; | |
210 }elsif ($string =~ /\-([A-Za-z]+)/) { | |
211 $indeltype="-"; | |
212 $insert = $1; | |
213 } | |
214 #print "$pos $indeltype $insert $t $q\n"; | |
215 $len = $pos - $t; | |
216 if ( $len > 0) { | |
217 $cigar.=$len."M"; | |
218 push(@cig,$len."M"); | |
219 } | |
220 $t+=$len; | |
221 $q+=$len; | |
222 | |
223 if ($indeltype eq "-") { | |
224 $cigar.="D"; | |
225 push(@cig,"D"); | |
226 $t++; | |
227 } | |
228 if ($indeltype eq "+") { | |
229 $len = length ($insert); | |
230 if ($len == 1) { | |
231 $cigar.="I"; | |
232 push(@cig,"I"); | |
233 } | |
234 if ($len > 1) { | |
235 $cigar.=$len."I"; | |
236 push(@cig,$len."I") | |
237 } | |
238 $q+=$len; | |
239 } | |
240 $insert=""; | |
241 } | |
242 $len= $rdlen - $q + 1; | |
243 if ($len > 0) { | |
244 $cigar.=$len."M"; | |
245 push(@cig,$len."M"); | |
246 } | |
247 | |
248 $cigar = newcigar($cigar,'D'); | |
249 $cigar = newcigar($cigar,'I'); | |
250 | |
251 #print "$line\n"; | |
252 #print "c CIGAR:\t$cigar\n\n"; | |
253 return $cigar; | |
254 | |
255 } | |
256 | |
257 | |
258 | |
259 sub newcigar { | |
260 my $cigar = shift; | |
261 my $char = shift; | |
262 my $new = ""; | |
263 my $copy = $cigar; | |
264 #print "$cigar\n"; | |
265 $copy =~ s/^($char+)/$1;/g; | |
266 #print "$copy\n"; | |
267 $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; | |
268 #print "$copy\n"; | |
269 my @parts = split(/;/,$copy); | |
270 my $el=""; | |
271 foreach $el (@parts) { | |
272 #print "$el\n"; | |
273 if ($el =~ /^$char+$/) { | |
274 $new.=length($el).$char; | |
275 }else { | |
276 $new.=$el; | |
277 } | |
278 | |
279 } | |
280 return $new; | |
281 } |