Mercurial > repos > portiahollyoak > temp
diff scripts/pickClippedFastq.pl @ 0:28d1a6f8143f draft
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author | portiahollyoak |
---|---|
date | Mon, 25 Apr 2016 13:08:56 -0400 |
parents | |
children | 9672fe07a232 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/pickClippedFastq.pl Mon Apr 25 13:08:56 2016 -0400 @@ -0,0 +1,193 @@ +#!/share/bin/perl +use List::Util qw(sum); +use Bio::Seq; + +die "perl $0 <input_prefix> <TE sequence database>\n" if @ARGV<1; + +my %transposon_seq=(); +my %transposon_revcom_seq=(); +my $curr_seq=""; +my $curr_transposon=""; +open (input, "<$ARGV[1]") or die "Can't open $ARGV[1] since $!\n"; +while (my $line=<input>) { + chomp($line); + if ($line =~ /^\>/) { + if ($curr_transposon ne "") { + $transposon_seq{$curr_transposon}=uc($curr_seq); + my $seq=Bio::Seq->new(-seq=>$curr_seq, -alphabet => 'dna'); + $curr_seq=$seq->revcom->seq; + $transposon_revcom_seq{$curr_transposon}=uc($curr_seq); + } + my @a=split(/\s+/, $line); + $a[0] =~ s/\>//; + $curr_transposon=$a[0]; + $curr_seq=""; + } + else {$curr_seq=$curr_seq.$line;} +} +close input; + +open m1,">>$ARGV[0].clipped.reads.aln"; + +open (input, "<$ARGV[0].insertion.bp.bed") or die "Can't open $ARGV[0].insertion.bp.bed since $!\n"; +while (my $line=<input>) { + chomp($line); + my @a=split(/\t/, $line); + + my $lower=$a[1]-15; + my $upper=$a[2]+15; + if (($lower > 0)&&($upper > 0)) + { + system("samtools view -hXf 0x2 $ARGV[0].sorted.bam $a[0]\:$lower\-$upper > temp.sam"); + + open in,"temp.sam"; + my %pe1; + my %pe2; + while(<in>) + { + chomp; + my @f=split/\t/,$_,12; + ## read number 1 or 2 + my ($rnum)=$f[1]=~/(\d)$/; + + ## XT:A:* + my ($xt)=$f[11]=~/XT:A:(.)/; + + if ($f[5]=~/S/) { + + ## Coordinate + my $coor=-10; + my $strand=""; + my $final=""; + my $clipseq=""; + my @z=split(/M/, $f[5]); + + if (($f[5]=~/S$/)&&($f[1]=~/r/)) + { + my (@cigar_m)=$f[5]=~/(\d+)M/g; + my (@cigar_d)=$f[5]=~/(\d+)D/g; + my (@cigar_s)=$f[5]=~/(\d+)S/g; + my (@cigar_i)=$f[5]=~/(\d+)I/g; + my $aln_ln=sum(@cigar_m,@cigar_d); + $coor=$f[3]+$aln_ln-1; + $strand="-"; + + my (@clipped)=$z[1]=~/(\d+)S/g; + my $cliplen=sum(@clipped); + if ($cliplen >= 15) { + $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen); + } + } + + elsif (($f[1]=~/R/)&&($z[0]=~/S/)) + { + $coor=$f[3]; $strand="+"; + + my (@clipped)=$z[0]=~/(\d+)S/g; + my $cliplen=sum(@clipped); + if ($cliplen >= 15) { + $clipseq=substr($f[9], 0, $cliplen); + } + } + + if ($clipseq ne "") { + my $flag=0; + while ((my $key, my $value) = each (%transposon_seq)) { + my $seq=$value; + if ($a[4] eq "antisense") { + $seq=$transposon_revcom_seq{$key}; + } + if (($seq =~ /$clipseq/)&&($a[3] eq $key)&&($coor >= $lower)&&($coor <= $upper)) { +# print "$clipseq\n"; + $final=$coor."\($strand\)"; + if (defined $pe1{$final}) { + if (length($clipseq) > length($pe1{$final})) { + $pe1{$final}=$clipseq; + } + } + else {$pe1{$final}=$clipseq; $pe2{$final}=0;} + $flag=1; + last; + } + } + } + + } + }#while; + close in; + + open in,"temp.sam"; + while(<in>) + { + chomp; + my @f=split/\t/,$_,12; + my ($rnum)=$f[1]=~/(\d)$/; + my ($xt)=$f[11]=~/XT:A:(.)/; + + if ($f[5]=~/S/) { + + my $coor=-10; + my $strand=""; + my $final=""; + my $clipseq=""; + my @z=split(/M/, $f[5]); + + if (($f[5]=~/S$/)&&($f[1]=~/r/)) + { + my (@cigar_m)=$f[5]=~/(\d+)M/g; + my (@cigar_d)=$f[5]=~/(\d+)D/g; + my (@cigar_s)=$f[5]=~/(\d+)S/g; + my (@cigar_i)=$f[5]=~/(\d+)I/g; + my $aln_ln=sum(@cigar_m,@cigar_d); + $coor=$f[3]+$aln_ln-1; + $strand="-"; + + my (@clipped)=$z[1]=~/(\d+)S/g; + my $cliplen=sum(@clipped); + if ($cliplen >= 6) { + $clipseq=substr($f[9], length($f[9])-$cliplen, $cliplen); + } + } + + elsif (($f[1]=~/R/)&&($z[0]=~/S/)) + { + $coor=$f[3]; $strand="+"; + + my (@clipped)=$z[0]=~/(\d+)S/g; + my $cliplen=sum(@clipped); + if ($cliplen >= 6) { + $clipseq=substr($f[9], 0, $cliplen); + } + } + + if ($clipseq ne "") { + foreach my $coor (keys %pe1) { + if (($coor =~ /\+/) && (substr($pe1{$coor}, length($pe1{$coor})-length($clipseq), length($clipseq)) eq $clipseq)) { + $pe2{$coor}++; + } + elsif (($coor =~ /\-/) && (substr($pe1{$coor}, 0, length($clipseq)) eq $clipseq)) { + $pe2{$coor}++; + } + } + } + } + } + close in; + + my $clip_site=""; + + foreach my $coor (keys %pe2) + { + $clip_site=$clip_site."$coor\:$pe2{$coor}\;"; + } + chop($clip_site); + print m1 "$a[0]\t$lower\t$upper\t$a[3]\t$clip_site\n"; + system("rm temp.sam"); + } + else + { + print m1 "$line\n"; + } +} +close input; +close m1;