# HG changeset patch # User big-tiandm # Date 1406790504 14400 # Node ID 1a6d88b42514be7ec3039c318f2035f6af99efb1 # Parent 0979f7a58686cd4a3bc97366710d9b309fcd26f5 Uploaded diff -r 0979f7a58686 -r 1a6d88b42514 quantify.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantify.pl Thu Jul 31 03:08:24 2014 -0400 @@ -0,0 +1,495 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use File::Path; +use strict; +use File::Basename; +#use Getopt::Std; +use Getopt::Long; +use RNA; + +my %opts; +GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h"); +if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $read=$opts{'r'}; +my $pre=$opts{'p'}; +my $mature=$opts{'m'}; + +my $dir=$opts{'o'}; +unless ($dir=~/\/$/) {$dir .="/";} +if (not -d $dir) { + mkdir $dir; +} + +my $threads=defined $opts{'t'} ? $opts{'t'} : 1; +my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; + +my $upstream = 2; +my $downstream = 5; + +$upstream = $opts{'e'} if(defined $opts{'e'}); +$downstream = $opts{'f'} if(defined $opts{'f'}); + +my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; + +my $time=Time(); +if (defined $opts{'time'}) { $time=$opts{'time'};} + +my $tmpdir="${dir}/miRNA_Express_${time}"; +if(not -d $tmpdir){ + mkdir($tmpdir); +} +chdir $tmpdir; + +`cp $pre ./`; +my $pre_file_name=basename($pre); + +&mapping(); # matures align to precursors && reads align to precursors; + +my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; +&maturePosOnPre(); # acknowledge mature positions on precursor + +my %pre_read; +&readPosOnPre(); # acknowledge reads positions on precursors + +if(!(defined $opts{'tag'})){ + foreach my $key (keys %pre_read) { + $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; + my @ss=split/_/,$1; + for (my $i=1;$i<=@ss;$i++) { + $marks .="Smp$i;"; + } + last; + } +} + +my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." +&attachPre(); + +my $preno=scalar (keys %pre); +print "Total Precursor Number is $preno !!!!\n"; + +my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; +&structure(); + + +##### analysis and print out && moRs +my $aln=$dir."known_microRNA_express.aln"; +my $list=$dir."known_microRNA_express.txt"; +my $moRs=$dir."known_microRNA_express.moRs"; + +system("ln $mature $dir/known_microRNA_mature.fa "); +system("ln $pre $dir/known_microRNA_precursor.fa "); + +open ALN,">$aln"; +open LIST,">$list"; +open MORS,">$moRs"; + +$"="\t"; ##### @array print in \t + +my @marks=split/\;/,$marks; +#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; +print LIST "#matueID\tpreID\tpos1\tpos2"; +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_matureExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_starExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_totalExp"; +} +print LIST "\n"; +print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; +print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; +my %moRs; + +foreach my $key (keys %pre) { + print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; + next if(! (exists $pre_read{$key})); + my @array=@{$pre_read{$key}}; + @array=sort{$a->[3]<=> $b->[3]} @array; + + my $length=length($pre{$key}); + + my $maxline=-1;my $max=0; ### storage the maxinum express read line + my $totalReadsNo=0; + my @not_over=(); ### new read format better for moRs analysis + +####print out Aln file start + for (my $i=0;$i<@array;$i++) { + my $maps=$array[$i][3]+1; + my $mape=$array[$i][3]+length($array[$i][4]); + my $str=""; + $str .= "." x ($maps-1); + $str .=$array[$i][4]; + $str .="." x ($length-$mape); + $str .=" "; + + $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; + my @sample=split /\_/,$1; + my $total=$2; + print ALN $str,"@sample","\t",$total,"\n"; + + if($total>$max){$max=$total; $maxline=$i;} + $totalReadsNo+=$total; + + push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; + } +####print out Aln file end + +#### express list start + my ($ms,$me,$ss,$se); + if (!(exists($pre_mature{$key}))) { + $ms=$array[$maxline][3]+1; + $me=$array[$maxline][3]+length($array[$maxline][4]); + ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); + + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + else{ + foreach my $maID (keys %{$pre_mature{$key}}) { + $ms=$pre_mature{$key}{$maID}{"mature"}[0]; + $me=$pre_mature{$key}{$maID}{"mature"}[1]; + $ss=$pre_mature{$key}{$maID}{"star"}[0]; + $se=$pre_mature{$key}{$maID}{"star"}[1]; + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + } +#### express list end + +#### analysis moRs start + my @result; my @m_texp;my $m_texp=0; ### moRs informations + + while (@not_over>0) { + my @over=@not_over; + @not_over=(); + +#丰度最高tag + my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; + for (my $i=0;$i<@over;$i++) { + my @m_array=@{$over[$i]}; + if ($m_max<$m_array[4]) { + $m_max=$m_array[4]; + $m_maxline=$i; + } + } + $m_start=$over[$m_maxline][1]; + $m_end=$over[$m_maxline][2]; + $m_exp=$m_max; + $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j]=$m_nums[$j]; + } + +#统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt + for (my $i=0;$i<@over;$i++) { + next if($i==$m_maxline); + my @m_array=@{$over[$i]}; + if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { + $m_exp+=$m_array[4]; + $m_no++; + $m_array[3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j] +=$m_nums[$j]; + } + } + elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads + } + if($m_exp>5){### 5个reads + $m_texp+=$m_exp; + for (my $j=0;$j<@m_exp;$j++) { + $m_texp[$j]+=$m_exp[$j]; + } + my $string=&subseq($pre{$key},$m_start,$m_end,"+"); + push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; + } + } + + my $str=scalar @result; + my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); + $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; + @{$moRs{$str}}=@result; + +#### analysis moRs end +} + +##### moRs print out start +foreach my $key (keys %moRs) { + my @tmp=split/\t/,$key; + next if ($tmp[4]<=2); + next if($tmp[3]<0.95); + my @over; + for (my $i=0;$i<@{$moRs{$key}};$i++) { + my @arrayi=split/\t/,$moRs{$key}[$i]; + for (my $j=0;$j<@{$moRs{$key}};$j++) { + next if($i==$j); + my @arrayj=split/\t/,$moRs{$key}[$j]; + if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { + push @over,$moRs{$key}[$i]; + } + } + } + if (@over>0) { + print MORS "$key\n"; + foreach (@{$moRs{$key}}) { + print MORS "$_\n"; + } + } +} +###### moRs print out end +close ALN; +close LIST; +close MORS; + +$"=" ";##### reset + + +################### Sub programs ################# +sub express{ + my ($ms,$me,$ss,$se,$read)=@_; + my (@mexp,@sexp,@texp); + $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; + my @numsample=split/_/,$1; + for (my $i=0;$i<@numsample;$i++) { + $mexp[$i]=0; + $sexp[$i]=0; + $texp[$i]=0; + } + + for (my $i=0;$i<@{$read};$i++) { + my $start=$$read[$i][3]+1; + my $end=$$read[$i][3]+length($$read[$i][4]); + $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; + my $expresses=$1; + my @nums=split/_/,$expresses; + + for (my $j=0;$j<@nums;$j++) { + $texp[$j]+=$nums[$j]; + } + if ($start>=$ms && $end<=$me) { + for (my $j=0;$j<@nums;$j++) { + $mexp[$j]+=$nums[$j]; + } + } + if ($start>=$ss && $end<=$se) { + for (my $j=0;$j<@nums;$j++) { + $sexp[$j]+=$nums[$j]; + } + } + } + return(\@mexp,\@sexp,\@texp); +} + +sub structure{ + foreach my $key (keys %pre_mature) { + if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} + my ($str,$mfe)=RNA::fold($pre{$key}); + $struc{$key}{"struc"}=$str; + $struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); + + foreach my $id (keys %{$pre_mature{$key}}) { + ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); + } +=cut +##### Nucleotide complementary + my @tmp=split//,$str; + my %a2b; + my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } + +##### search star position + foreach my $id (keys %{$pre_mature{$key}}) { + my $n=0; + for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; + if($a>$b){ + $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; + $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + if($a<$b{ + $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; + $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + last; + } + $n++; + } + } +=cut + } +} +sub other_pair{ + my ($start,$end,$structure)=@_; + ##### Nucleotide complementary + my @tmp=split//,$structure; + my %a2b; my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } +##### search star position + my $n=0;my $startpos; my $endpos; + for (my $i=$start;$i<=$end ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; +# if($a>$b){ +# $startpos=$b-$n+2; +# $endpos=$b-$n+2+($end-$start); +# } +# if($a<$b){ + $endpos=$b+$n+2; + if($endpos>length($structure)){$endpos=length($structure);} + $startpos=$b+$n+2-($end-$start); + if($startpos<1){$startpos=1;} +# } + last; + } + $n++; + } + return ($startpos,$endpos); +} +sub attachPre{ + open IN, "<$pre_file_name"; + my $name; + while (my $aline=) { + chomp $aline; + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $pre{$name} .=$aline; + } + close IN; +} +sub readPosOnPre{ + open IN,") { + chomp $aline; + my @tmp=split/\t/,$aline; + my $id=lc($tmp[2]); + push @{$pre_read{$tmp[2]}},[@tmp]; + } + close IN; +} +sub maturePosOnPre{ + open IN,") { + chomp $aline; + my @tmp=split/\t/,$aline; + my $mm=$tmp[0]; +# $mm=~s/\-3P|\-5P//i; + $mm=lc($mm); + my $pm=$tmp[2]; + $pm=lc($pm); + +# next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a + next if($mm!~/$pm/); +# print "$tmp[2]\t$tmp[0]\n"; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); + } + close IN; +} +sub mapping{ + my $err; +## build bowtie index + print STDERR "building bowtie index\n"; + $err = `bowtie-build $pre_file_name miRNA_precursor`; + +## map mature sequences against precursors + print STDERR "mapping mature sequences against index\n"; + $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature mature_mapped.bwt`; + +## map reads against precursors + print STDERR "mapping read sequences against index\n"; + $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa read_mapped.bwt `; + +} + +sub subseq{ + my $seq=shift; + my $beg=shift; + my $end=shift; + my $strand=shift; + + my $subseq=substr($seq,$beg-1,$end-$beg+1); + if ($strand eq "-") { + $subseq=revcom($subseq); + } + return uc $subseq; +} + +sub revcom{ + my $seq=shift; + $seq=~tr/ATCGatcg/TAGCtagc/; + $seq=reverse $seq; + return uc $seq; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -r -p -m -mis -t -e -f -tag -o -time +mandatory parameters: +-p precursor.fa miRNA precursor sequences from miRBase # must be absolute path +-m mature.fa miRNA sequences from miRBase # must be absolute path +-r reads.fa your read sequences #must be absolute path + +-o output directory + +options: +-mis [int] number of allowed mismatches when mapping reads to precursors, default 0 +-t [int] threads number,default 1 +-e [int] number of nucleotides upstream of the mature sequence to consider, default 2 +-f [int] number of nucleotides downstream of the mature sequence to consider, default 5 +-tag [string] sample marks# eg. sampleA;sampleB;sampleC +-time sting #make directory time,default is the local time +-h help +USAGE +exit(1); +} +