0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 # Contact: lh3
|
|
4 # Version: 0.1.3
|
|
5
|
|
6 #Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com)
|
|
7
|
|
8 #use strict;
|
|
9 #use warnings;
|
|
10 use Data::Dumper;
|
|
11 use Getopt::Std;
|
|
12
|
|
13 &novo2sam;
|
|
14 exit;
|
|
15
|
|
16 sub mating {
|
|
17 my ($s1, $s2) = @_;
|
|
18 my $isize = 0;
|
|
19 if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
|
|
20 my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
|
|
21 my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
|
|
22 $isize = $x2 - $x1;
|
|
23 }
|
|
24 # update mate coordinate
|
|
25 if ($s2->[2] ne '*') {
|
|
26 @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
|
|
27 $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
|
|
28 } else {
|
|
29 $s1->[1] |= 0x8;
|
|
30 }
|
|
31 if ($s1->[2] ne '*') {
|
|
32 @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
|
|
33 $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
|
|
34 } else {
|
|
35 $s2->[1] |= 0x8;
|
|
36 }
|
|
37 }
|
|
38
|
|
39 sub novo2sam {
|
|
40 my %opts = ();
|
|
41 getopts("p", \%opts);
|
|
42 die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0);
|
|
43 my $is_paired = defined($opts{p});
|
|
44 # core loop
|
|
45 my @s1 = ();
|
|
46 my @s2 = ();
|
|
47 my ($s_last, $s_curr) = (\@s1, \@s2);
|
|
48 while (<>) {
|
|
49 next if (/^#/);
|
|
50 next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/);
|
|
51 &novo2sam_aux($_, $s_curr, $is_paired);
|
|
52 if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
|
|
53 &mating($s_last, $s_curr);
|
|
54 print join("\t", @$s_last), "\n";
|
|
55 print join("\t", @$s_curr), "\n";
|
|
56 @$s_last = (); @$s_curr = ();
|
|
57 } else {
|
|
58 print join("\t", @$s_last), "\n" if (@$s_last != 0);
|
|
59 my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
|
|
60 }
|
|
61 }
|
|
62 print join("\t", @$s_last), "\n" if (@$s_last != 0);
|
|
63 }
|
|
64
|
|
65 sub novo2sam_aux {
|
|
66 my ($line, $s, $is_paired) = @_;
|
|
67
|
|
68 chomp($line);
|
|
69 my @t = split(/\s+/, $line);
|
|
70 my @variations = @t[13 .. $#t];
|
|
71 @$s = ();
|
|
72 return if ($t[4] ne 'U');
|
|
73 my $len = length($t[2]);
|
|
74 # read name
|
|
75 $s->[0] = substr($t[0], 1);
|
|
76 $s->[0] =~ s/\/[12]$//g;
|
|
77 # initial flag (will be updated later)
|
|
78 $s->[1] = 0;
|
|
79 $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7);
|
|
80 $s->[1] |= 2 if ($t[10] eq '.');
|
|
81 # read & quality
|
|
82 if ($t[9] eq 'R') {
|
|
83 $s->[9] = reverse($t[2]);
|
|
84 $s->[10] = reverse($t[3]);
|
|
85 $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
|
|
86 } else {
|
|
87 $s->[9] = $t[2]; $s->[10] = $t[3];
|
|
88 }
|
|
89 # cigar
|
|
90 my $cigarstring ="";
|
|
91 if (scalar @variations ==0 ) {
|
|
92 $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment
|
|
93 } else {
|
|
94 #convert to correct CIGAR
|
|
95 my $tmpstr = join" ",@variations ;
|
|
96 if ( $tmpstr=~ /\+|\-/ ) {
|
|
97 $cigarstring = cigar_method($line,\@variations,$len);
|
|
98 $s->[5]=$cigarstring;
|
|
99 } else {
|
|
100 $s->[5]=$len. "M";
|
|
101 }
|
|
102 }
|
|
103
|
|
104 # coor
|
|
105 $s->[2] = substr($t[7], 1); $s->[3] = $t[8];
|
|
106 $s->[1] |= 0x10 if ($t[9] eq 'R');
|
|
107 # mapQ
|
|
108 $s->[4] = $t[5] > $t[6]? $t[5] : $t[6];
|
|
109 # mate coordinate
|
|
110 $s->[6] = '*'; $s->[7] = $s->[8] = 0;
|
|
111 # aux
|
|
112 push(@$s, "NM:i:".(@t-13));
|
|
113 my $md = '';
|
|
114 $md = mdtag($md,$line,\@variations,$len);
|
|
115 push(@$s, "MD:Z:$md");
|
|
116
|
|
117 }
|
|
118
|
|
119 sub mdtag {
|
|
120 my $oldmd = shift;
|
|
121 my $line = shift;
|
|
122 my $ref =shift;
|
|
123 my $rdlen = shift;
|
|
124 my @variations = @$ref;
|
|
125 my $string="";
|
|
126 my $mdtag="";
|
|
127 my $t=1;
|
|
128 my $q=1;
|
|
129 my $deleteflag=0;
|
|
130 my $len =0;
|
|
131 foreach $string (@variations) {
|
|
132 my ($indeltype,$insert) = indeltype($string);
|
|
133 if ($indeltype eq "+") {
|
|
134 $len = length ($insert);
|
|
135 $q+=$len;
|
|
136 next;
|
|
137 }
|
|
138 my $pos = $1 if $string =~ /^(\d+)/;
|
|
139 $len = $pos - $t;
|
|
140 if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) {
|
|
141 $mdtag.=$len;
|
|
142 }
|
|
143 $t+=$len;
|
|
144 $q+=$len;
|
|
145 if ($indeltype eq ">") {
|
|
146 $mdtag.=$insert;
|
|
147 $deleteflag=0;
|
|
148 $t+=1;
|
|
149 $q+=1;
|
|
150 }
|
|
151 if ($indeltype eq "-") {
|
|
152 my $deletedbase = $2 if $string =~ /(\d+)\-([A-Za-z]+)/;
|
|
153 if ($deleteflag == 0 ) {
|
|
154 $mdtag.="^";
|
|
155 }
|
|
156 $mdtag.=$deletedbase;
|
|
157 $deleteflag=1;
|
|
158 $t+=1;
|
|
159 }
|
|
160 }
|
|
161 $len = $rdlen - $q + 1;
|
|
162 if ($len > 0) {
|
|
163 $mdtag.="$len";
|
|
164 }
|
|
165 # print "In:$line\n";
|
|
166 # print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n";
|
|
167
|
|
168 return $mdtag;
|
|
169 }
|
|
170
|
|
171 sub indeltype {
|
|
172 my $string = shift;
|
|
173 my $insert="";
|
|
174 my $indeltype;
|
|
175 if ($string =~ /([A-Za-z]+)\>/) {
|
|
176 $indeltype=">";
|
|
177 $insert=$1;
|
|
178 } elsif ($string =~ /\-/) {
|
|
179 $indeltype="-";
|
|
180 } elsif ($string =~ /\+([A-Za-z]+)/) {
|
|
181 $indeltype="+";
|
|
182 $insert=$1;
|
|
183 }
|
|
184 return ($indeltype,$insert);
|
|
185
|
|
186 }
|
|
187
|
|
188
|
|
189 sub cigar_method {
|
|
190 my $line = shift;
|
|
191 my $ref =shift;
|
|
192 my $rdlen = shift;
|
|
193 my @variations = @$ref;
|
|
194 my $string="";
|
|
195 my $type="";
|
|
196 my $t =1;
|
|
197 my $q=1;
|
|
198 my $indeltype="";
|
|
199 my $cigar= "";
|
|
200 my $insert = "";
|
|
201 my $len=0;
|
|
202 my @cig=();
|
|
203 foreach $string (@variations) {
|
|
204 next if $string =~ />/;
|
|
205 my $pos = $1 if $string =~ /^(\d+)/;
|
|
206
|
|
207 if ($string =~ /\+([A-Za-z]+)/) {
|
|
208 $indeltype="+";
|
|
209 $insert = $1;
|
|
210 }elsif ($string =~ /\-([A-Za-z]+)/) {
|
|
211 $indeltype="-";
|
|
212 $insert = $1;
|
|
213 }
|
|
214 #print "$pos $indeltype $insert $t $q\n";
|
|
215 $len = $pos - $t;
|
|
216 if ( $len > 0) {
|
|
217 $cigar.=$len."M";
|
|
218 push(@cig,$len."M");
|
|
219 }
|
|
220 $t+=$len;
|
|
221 $q+=$len;
|
|
222
|
|
223 if ($indeltype eq "-") {
|
|
224 $cigar.="D";
|
|
225 push(@cig,"D");
|
|
226 $t++;
|
|
227 }
|
|
228 if ($indeltype eq "+") {
|
|
229 $len = length ($insert);
|
|
230 if ($len == 1) {
|
|
231 $cigar.="I";
|
|
232 push(@cig,"I");
|
|
233 }
|
|
234 if ($len > 1) {
|
|
235 $cigar.=$len."I";
|
|
236 push(@cig,$len."I")
|
|
237 }
|
|
238 $q+=$len;
|
|
239 }
|
|
240 $insert="";
|
|
241 }
|
|
242 $len= $rdlen - $q + 1;
|
|
243 if ($len > 0) {
|
|
244 $cigar.=$len."M";
|
|
245 push(@cig,$len."M");
|
|
246 }
|
|
247
|
|
248 $cigar = newcigar($cigar,'D');
|
|
249 $cigar = newcigar($cigar,'I');
|
|
250
|
|
251 #print "$line\n";
|
|
252 #print "c CIGAR:\t$cigar\n\n";
|
|
253 return $cigar;
|
|
254
|
|
255 }
|
|
256
|
|
257
|
|
258
|
|
259 sub newcigar {
|
|
260 my $cigar = shift;
|
|
261 my $char = shift;
|
|
262 my $new = "";
|
|
263 my $copy = $cigar;
|
|
264 #print "$cigar\n";
|
|
265 $copy =~ s/^($char+)/$1;/g;
|
|
266 #print "$copy\n";
|
|
267 $copy =~ s/([^0-9$char])($char+)/$1;$2;/g;
|
|
268 #print "$copy\n";
|
|
269 my @parts = split(/;/,$copy);
|
|
270 my $el="";
|
|
271 foreach $el (@parts) {
|
|
272 #print "$el\n";
|
|
273 if ($el =~ /^$char+$/) {
|
|
274 $new.=length($el).$char;
|
|
275 }else {
|
|
276 $new.=$el;
|
|
277 }
|
|
278
|
|
279 }
|
|
280 return $new;
|
|
281 }
|