Mercurial > repos > portiahollyoak > temp
comparison scripts/pickClippedFastq.pl @ 0:28d1a6f8143f draft
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author | portiahollyoak |
---|---|
date | Mon, 25 Apr 2016 13:08:56 -0400 |
parents | |
children | 9672fe07a232 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:28d1a6f8143f |
---|---|
1 #!/share/bin/perl | |
2 use List::Util qw(sum); | |
3 use Bio::Seq; | |
4 | |
5 die "perl $0 <input_prefix> <TE sequence database>\n" if @ARGV<1; | |
6 | |
7 my %transposon_seq=(); | |
8 my %transposon_revcom_seq=(); | |
9 my $curr_seq=""; | |
10 my $curr_transposon=""; | |
11 open (input, "<$ARGV[1]") or die "Can't open $ARGV[1] since $!\n"; | |
12 while (my $line=<input>) { | |
13 chomp($line); | |
14 if ($line =~ /^\>/) { | |
15 if ($curr_transposon ne "") { | |
16 $transposon_seq{$curr_transposon}=uc($curr_seq); | |
17 my $seq=Bio::Seq->new(-seq=>$curr_seq, -alphabet => 'dna'); | |
18 $curr_seq=$seq->revcom->seq; | |
19 $transposon_revcom_seq{$curr_transposon}=uc($curr_seq); | |
20 } | |
21 my @a=split(/\s+/, $line); | |
22 $a[0] =~ s/\>//; | |
23 $curr_transposon=$a[0]; | |
24 $curr_seq=""; | |
25 } | |
26 else {$curr_seq=$curr_seq.$line;} | |
27 } | |
28 close input; | |
29 | |
30 open m1,">>$ARGV[0].clipped.reads.aln"; | |
31 | |
32 open (input, "<$ARGV[0].insertion.bp.bed") or die "Can't open $ARGV[0].insertion.bp.bed since $!\n"; | |
33 while (my $line=<input>) { | |
34 chomp($line); | |
35 my @a=split(/\t/, $line); | |
36 | |
37 my $lower=$a[1]-15; | |
38 my $upper=$a[2]+15; | |
39 if (($lower > 0)&&($upper > 0)) | |
40 { | |
41 system("samtools view -hXf 0x2 $ARGV[0].sorted.bam $a[0]\:$lower\-$upper > temp.sam"); | |
42 | |
43 open in,"temp.sam"; | |
44 my %pe1; | |
45 my %pe2; | |
46 while(<in>) | |
47 { | |
48 chomp; | |
49 my @f=split/\t/,$_,12; | |
50 ## read number 1 or 2 | |
51 my ($rnum)=$f[1]=~/(\d)$/; | |
52 | |
53 ## XT:A:* | |
54 my ($xt)=$f[11]=~/XT:A:(.)/; | |
55 | |
56 if ($f[5]=~/S/) { | |
57 | |
58 ## Coordinate | |
59 my $coor=-10; | |
60 my $strand=""; | |
61 my $final=""; | |
62 my $clipseq=""; | |
63 my @z=split(/M/, $f[5]); | |
64 | |
65 if (($f[5]=~/S$/)&&($f[1]=~/r/)) | |
66 { | |
67 my (@cigar_m)=$f[5]=~/(\d+)M/g; | |
68 my (@cigar_d)=$f[5]=~/(\d+)D/g; | |
69 my (@cigar_s)=$f[5]=~/(\d+)S/g; | |
70 my (@cigar_i)=$f[5]=~/(\d+)I/g; | |
71 my $aln_ln=sum(@cigar_m,@cigar_d); | |
72 $coor=$f[3]+$aln_ln-1; | |
73 $strand="-"; | |
74 | |
75 my (@clipped)=$z[1]=~/(\d+)S/g; | |
76 my $cliplen=sum(@clipped); | |
77 if ($cliplen >= 15) { | |
78 $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen); | |
79 } | |
80 } | |
81 | |
82 elsif (($f[1]=~/R/)&&($z[0]=~/S/)) | |
83 { | |
84 $coor=$f[3]; $strand="+"; | |
85 | |
86 my (@clipped)=$z[0]=~/(\d+)S/g; | |
87 my $cliplen=sum(@clipped); | |
88 if ($cliplen >= 15) { | |
89 $clipseq=substr($f[9], 0, $cliplen); | |
90 } | |
91 } | |
92 | |
93 if ($clipseq ne "") { | |
94 my $flag=0; | |
95 while ((my $key, my $value) = each (%transposon_seq)) { | |
96 my $seq=$value; | |
97 if ($a[4] eq "antisense") { | |
98 $seq=$transposon_revcom_seq{$key}; | |
99 } | |
100 if (($seq =~ /$clipseq/)&&($a[3] eq $key)&&($coor >= $lower)&&($coor <= $upper)) { | |
101 # print "$clipseq\n"; | |
102 $final=$coor."\($strand\)"; | |
103 if (defined $pe1{$final}) { | |
104 if (length($clipseq) > length($pe1{$final})) { | |
105 $pe1{$final}=$clipseq; | |
106 } | |
107 } | |
108 else {$pe1{$final}=$clipseq; $pe2{$final}=0;} | |
109 $flag=1; | |
110 last; | |
111 } | |
112 } | |
113 } | |
114 | |
115 } | |
116 }#while; | |
117 close in; | |
118 | |
119 open in,"temp.sam"; | |
120 while(<in>) | |
121 { | |
122 chomp; | |
123 my @f=split/\t/,$_,12; | |
124 my ($rnum)=$f[1]=~/(\d)$/; | |
125 my ($xt)=$f[11]=~/XT:A:(.)/; | |
126 | |
127 if ($f[5]=~/S/) { | |
128 | |
129 my $coor=-10; | |
130 my $strand=""; | |
131 my $final=""; | |
132 my $clipseq=""; | |
133 my @z=split(/M/, $f[5]); | |
134 | |
135 if (($f[5]=~/S$/)&&($f[1]=~/r/)) | |
136 { | |
137 my (@cigar_m)=$f[5]=~/(\d+)M/g; | |
138 my (@cigar_d)=$f[5]=~/(\d+)D/g; | |
139 my (@cigar_s)=$f[5]=~/(\d+)S/g; | |
140 my (@cigar_i)=$f[5]=~/(\d+)I/g; | |
141 my $aln_ln=sum(@cigar_m,@cigar_d); | |
142 $coor=$f[3]+$aln_ln-1; | |
143 $strand="-"; | |
144 | |
145 my (@clipped)=$z[1]=~/(\d+)S/g; | |
146 my $cliplen=sum(@clipped); | |
147 if ($cliplen >= 6) { | |
148 $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen); | |
149 } | |
150 } | |
151 | |
152 elsif (($f[1]=~/R/)&&($z[0]=~/S/)) | |
153 { | |
154 $coor=$f[3]; $strand="+"; | |
155 | |
156 my (@clipped)=$z[0]=~/(\d+)S/g; | |
157 my $cliplen=sum(@clipped); | |
158 if ($cliplen >= 6) { | |
159 $clipseq=substr($f[9], 0, $cliplen); | |
160 } | |
161 } | |
162 | |
163 if ($clipseq ne "") { | |
164 foreach my $coor (keys %pe1) { | |
165 if (($coor =~ /\+/) && (substr($pe1{$coor}, length($pe1{$coor})-length($clipseq), length($clipseq)) eq $clipseq)) { | |
166 $pe2{$coor}++; | |
167 } | |
168 elsif (($coor =~ /\-/) && (substr($pe1{$coor}, 0, length($clipseq)) eq $clipseq)) { | |
169 $pe2{$coor}++; | |
170 } | |
171 } | |
172 } | |
173 } | |
174 } | |
175 close in; | |
176 | |
177 my $clip_site=""; | |
178 | |
179 foreach my $coor (keys %pe2) | |
180 { | |
181 $clip_site=$clip_site."$coor\:$pe2{$coor}\;"; | |
182 } | |
183 chop($clip_site); | |
184 print m1 "$a[0]\t$lower\t$upper\t$a[3]\t$clip_site\n"; | |
185 system("rm temp.sam"); | |
186 } | |
187 else | |
188 { | |
189 print m1 "$line\n"; | |
190 } | |
191 } | |
192 close input; | |
193 close m1; |