comparison scripts/pickClippedFastq.pl @ 0:28d1a6f8143f draft

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Mon, 25 Apr 2016 13:08:56 -0400
parents
children 9672fe07a232
comparison
equal deleted inserted replaced
-1:000000000000 0:28d1a6f8143f
1 #!/share/bin/perl
2 use List::Util qw(sum);
3 use Bio::Seq;
4
5 die "perl $0 <input_prefix> <TE sequence database>\n" if @ARGV<1;
6
7 my %transposon_seq=();
8 my %transposon_revcom_seq=();
9 my $curr_seq="";
10 my $curr_transposon="";
11 open (input, "<$ARGV[1]") or die "Can't open $ARGV[1] since $!\n";
12 while (my $line=<input>) {
13 chomp($line);
14 if ($line =~ /^\>/) {
15 if ($curr_transposon ne "") {
16 $transposon_seq{$curr_transposon}=uc($curr_seq);
17 my $seq=Bio::Seq->new(-seq=>$curr_seq, -alphabet => 'dna');
18 $curr_seq=$seq->revcom->seq;
19 $transposon_revcom_seq{$curr_transposon}=uc($curr_seq);
20 }
21 my @a=split(/\s+/, $line);
22 $a[0] =~ s/\>//;
23 $curr_transposon=$a[0];
24 $curr_seq="";
25 }
26 else {$curr_seq=$curr_seq.$line;}
27 }
28 close input;
29
30 open m1,">>$ARGV[0].clipped.reads.aln";
31
32 open (input, "<$ARGV[0].insertion.bp.bed") or die "Can't open $ARGV[0].insertion.bp.bed since $!\n";
33 while (my $line=<input>) {
34 chomp($line);
35 my @a=split(/\t/, $line);
36
37 my $lower=$a[1]-15;
38 my $upper=$a[2]+15;
39 if (($lower > 0)&&($upper > 0))
40 {
41 system("samtools view -hXf 0x2 $ARGV[0].sorted.bam $a[0]\:$lower\-$upper > temp.sam");
42
43 open in,"temp.sam";
44 my %pe1;
45 my %pe2;
46 while(<in>)
47 {
48 chomp;
49 my @f=split/\t/,$_,12;
50 ## read number 1 or 2
51 my ($rnum)=$f[1]=~/(\d)$/;
52
53 ## XT:A:*
54 my ($xt)=$f[11]=~/XT:A:(.)/;
55
56 if ($f[5]=~/S/) {
57
58 ## Coordinate
59 my $coor=-10;
60 my $strand="";
61 my $final="";
62 my $clipseq="";
63 my @z=split(/M/, $f[5]);
64
65 if (($f[5]=~/S$/)&&($f[1]=~/r/))
66 {
67 my (@cigar_m)=$f[5]=~/(\d+)M/g;
68 my (@cigar_d)=$f[5]=~/(\d+)D/g;
69 my (@cigar_s)=$f[5]=~/(\d+)S/g;
70 my (@cigar_i)=$f[5]=~/(\d+)I/g;
71 my $aln_ln=sum(@cigar_m,@cigar_d);
72 $coor=$f[3]+$aln_ln-1;
73 $strand="-";
74
75 my (@clipped)=$z[1]=~/(\d+)S/g;
76 my $cliplen=sum(@clipped);
77 if ($cliplen >= 15) {
78 $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen);
79 }
80 }
81
82 elsif (($f[1]=~/R/)&&($z[0]=~/S/))
83 {
84 $coor=$f[3]; $strand="+";
85
86 my (@clipped)=$z[0]=~/(\d+)S/g;
87 my $cliplen=sum(@clipped);
88 if ($cliplen >= 15) {
89 $clipseq=substr($f[9], 0, $cliplen);
90 }
91 }
92
93 if ($clipseq ne "") {
94 my $flag=0;
95 while ((my $key, my $value) = each (%transposon_seq)) {
96 my $seq=$value;
97 if ($a[4] eq "antisense") {
98 $seq=$transposon_revcom_seq{$key};
99 }
100 if (($seq =~ /$clipseq/)&&($a[3] eq $key)&&($coor >= $lower)&&($coor <= $upper)) {
101 # print "$clipseq\n";
102 $final=$coor."\($strand\)";
103 if (defined $pe1{$final}) {
104 if (length($clipseq) > length($pe1{$final})) {
105 $pe1{$final}=$clipseq;
106 }
107 }
108 else {$pe1{$final}=$clipseq; $pe2{$final}=0;}
109 $flag=1;
110 last;
111 }
112 }
113 }
114
115 }
116 }#while;
117 close in;
118
119 open in,"temp.sam";
120 while(<in>)
121 {
122 chomp;
123 my @f=split/\t/,$_,12;
124 my ($rnum)=$f[1]=~/(\d)$/;
125 my ($xt)=$f[11]=~/XT:A:(.)/;
126
127 if ($f[5]=~/S/) {
128
129 my $coor=-10;
130 my $strand="";
131 my $final="";
132 my $clipseq="";
133 my @z=split(/M/, $f[5]);
134
135 if (($f[5]=~/S$/)&&($f[1]=~/r/))
136 {
137 my (@cigar_m)=$f[5]=~/(\d+)M/g;
138 my (@cigar_d)=$f[5]=~/(\d+)D/g;
139 my (@cigar_s)=$f[5]=~/(\d+)S/g;
140 my (@cigar_i)=$f[5]=~/(\d+)I/g;
141 my $aln_ln=sum(@cigar_m,@cigar_d);
142 $coor=$f[3]+$aln_ln-1;
143 $strand="-";
144
145 my (@clipped)=$z[1]=~/(\d+)S/g;
146 my $cliplen=sum(@clipped);
147 if ($cliplen >= 6) {
148 $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen);
149 }
150 }
151
152 elsif (($f[1]=~/R/)&&($z[0]=~/S/))
153 {
154 $coor=$f[3]; $strand="+";
155
156 my (@clipped)=$z[0]=~/(\d+)S/g;
157 my $cliplen=sum(@clipped);
158 if ($cliplen >= 6) {
159 $clipseq=substr($f[9], 0, $cliplen);
160 }
161 }
162
163 if ($clipseq ne "") {
164 foreach my $coor (keys %pe1) {
165 if (($coor =~ /\+/) && (substr($pe1{$coor}, length($pe1{$coor})-length($clipseq), length($clipseq)) eq $clipseq)) {
166 $pe2{$coor}++;
167 }
168 elsif (($coor =~ /\-/) && (substr($pe1{$coor}, 0, length($clipseq)) eq $clipseq)) {
169 $pe2{$coor}++;
170 }
171 }
172 }
173 }
174 }
175 close in;
176
177 my $clip_site="";
178
179 foreach my $coor (keys %pe2)
180 {
181 $clip_site=$clip_site."$coor\:$pe2{$coor}\;";
182 }
183 chop($clip_site);
184 print m1 "$a[0]\t$lower\t$upper\t$a[3]\t$clip_site\n";
185 system("rm temp.sam");
186 }
187 else
188 {
189 print m1 "$line\n";
190 }
191 }
192 close input;
193 close m1;