# HG changeset patch
# User big-tiandm
# Date 1406790504 14400
# Node ID 1a6d88b42514be7ec3039c318f2035f6af99efb1
# Parent  0979f7a58686cd4a3bc97366710d9b309fcd26f5
Uploaded

diff -r 0979f7a58686 -r 1a6d88b42514 quantify.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify.pl	Thu Jul 31 03:08:24 2014 -0400
@@ -0,0 +1,495 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use File::Path;
+use strict;
+use File::Basename;
+#use Getopt::Std;
+use Getopt::Long;
+use RNA;
+
+my %opts;
+GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h");
+if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $read=$opts{'r'};
+my $pre=$opts{'p'};
+my $mature=$opts{'m'};
+
+my $dir=$opts{'o'};
+unless ($dir=~/\/$/) {$dir .="/";}
+if (not -d $dir) {
+	mkdir $dir;
+}
+
+my $threads=defined $opts{'t'} ? $opts{'t'} : 1;
+my $mismatch=defined $opts{'mis'} ?  $opts{'mis'} : 0;
+
+my $upstream = 2;
+my $downstream = 5;
+
+$upstream = $opts{'e'} if(defined $opts{'e'});
+$downstream = $opts{'f'} if(defined $opts{'f'});
+
+my $marks=defined $opts{'tag'} ? $opts{'tag'} : "";
+
+my $time=Time();
+if (defined $opts{'time'}) { $time=$opts{'time'};}
+
+my $tmpdir="${dir}/miRNA_Express_${time}";
+if(not -d $tmpdir){
+	mkdir($tmpdir);
+}
+chdir $tmpdir;
+
+`cp $pre ./`;
+my $pre_file_name=basename($pre);
+
+&mapping(); # matures align to precursors && reads align to precursors;
+
+my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;
+&maturePosOnPre(); # acknowledge mature positions on precursor 
+
+my %pre_read;
+&readPosOnPre(); # acknowledge reads positions on precursors
+
+if(!(defined $opts{'tag'})){
+	foreach my $key (keys %pre_read) {
+		$pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/;
+		my @ss=split/_/,$1;
+		for (my $i=1;$i<=@ss;$i++) {
+			$marks .="Smp$i;";
+		}
+		last;
+	}
+}
+
+my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."
+&attachPre();
+
+my $preno=scalar (keys %pre);
+print "Total Precursor Number is $preno !!!!\n";
+
+my %struc; #mature  star  loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;
+&structure();
+
+
+##### analysis and print out  && moRs
+my $aln=$dir."known_microRNA_express.aln";
+my $list=$dir."known_microRNA_express.txt";
+my $moRs=$dir."known_microRNA_express.moRs";
+
+system("ln $mature $dir/known_microRNA_mature.fa ");
+system("ln $pre $dir/known_microRNA_precursor.fa ");
+
+open ALN,">$aln";
+open LIST,">$list";
+open MORS,">$moRs";
+
+$"="\t"; ##### @array print in \t
+
+my @marks=split/\;/,$marks;
+#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n";
+print LIST "#matueID\tpreID\tpos1\tpos2";
+for (my $i=0;$i<@marks;$i++) {
+	print LIST "\t",$marks[$i],"_matureExp";
+}
+for (my $i=0;$i<@marks;$i++) {
+	print LIST "\t",$marks[$i],"_starExp";
+}
+for (my $i=0;$i<@marks;$i++) {
+	print LIST "\t",$marks[$i],"_totalExp";
+}
+print LIST "\n";
+print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n";
+print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n";
+my %moRs;
+
+foreach my $key (keys %pre) {
+	print ALN ">$key\n$pre{$key}\n$struc{$key}{struc}  ($struc{$key}{mfe})\n";
+	next if(! (exists $pre_read{$key}));
+	my @array=@{$pre_read{$key}};
+	@array=sort{$a->[3]<=> $b->[3]} @array;
+	
+	my $length=length($pre{$key});
+
+	my $maxline=-1;my $max=0; ### storage the maxinum express read line
+	my $totalReadsNo=0; 
+	my @not_over=(); ### new read format better for moRs analysis
+
+####print out Aln file start
+	for (my $i=0;$i<@array;$i++) {
+		my $maps=$array[$i][3]+1;
+		my $mape=$array[$i][3]+length($array[$i][4]);
+		my $str="";
+		$str .= "." x ($maps-1);
+		$str .=$array[$i][4];
+		$str .="." x ($length-$mape);
+		$str .="        ";
+
+		$array[$i][0]=~/:([\d|_]+)_x(\d+)$/;
+		my @sample=split /\_/,$1;
+		my $total=$2;
+		print ALN $str,"@sample","\t",$total,"\n";
+
+		if($total>$max){$max=$total; $maxline=$i;}
+		$totalReadsNo+=$total;
+		
+		push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"];
+	}
+####print out Aln file end
+
+#### express list start
+	my ($ms,$me,$ss,$se);
+	if (!(exists($pre_mature{$key}))) {
+		$ms=$array[$maxline][3]+1;
+		$me=$array[$maxline][3]+length($array[$maxline][4]);
+		($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'});
+	
+		my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
+		print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n";
+	}
+	else{
+		foreach my $maID (keys %{$pre_mature{$key}}) {
+			$ms=$pre_mature{$key}{$maID}{"mature"}[0];
+			$me=$pre_mature{$key}{$maID}{"mature"}[1];
+			$ss=$pre_mature{$key}{$maID}{"star"}[0];
+			$se=$pre_mature{$key}{$maID}{"star"}[1];
+			my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
+			print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n";
+		}
+	}
+#### express list end
+
+#### analysis moRs start
+	my @result; my @m_texp;my $m_texp=0; ### moRs informations
+
+	while (@not_over>0) {
+		my @over=@not_over;
+		@not_over=();
+
+#丰度最高tag
+		my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1;
+		for (my $i=0;$i<@over;$i++) {
+			my @m_array=@{$over[$i]};
+			if ($m_max<$m_array[4]) {
+				$m_max=$m_array[4];
+				$m_maxline=$i;
+			}
+		}
+		$m_start=$over[$m_maxline][1];
+		$m_end=$over[$m_maxline][2];
+		$m_exp=$m_max;
+		$over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/;
+		my @m_nums=split/_/,$1;
+		for (my $j=0;$j<@m_nums;$j++) {
+			$m_exp[$j]=$m_nums[$j];
+		}
+
+#统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt
+		for (my $i=0;$i<@over;$i++) {
+			next if($i==$m_maxline);
+			my @m_array=@{$over[$i]};
+			if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) {
+				$m_exp+=$m_array[4];
+				$m_no++;
+				$m_array[3]=~/:([\d|_]+)_x(\d+)$/;
+				my @m_nums=split/_/,$1;
+				for (my $j=0;$j<@m_nums;$j++) {
+					$m_exp[$j] +=$m_nums[$j];
+				}
+			}
+			elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads
+		}
+		if($m_exp>5){### 5个reads
+			$m_texp+=$m_exp;
+			for (my $j=0;$j<@m_exp;$j++) {
+				$m_texp[$j]+=$m_exp[$j];
+			}
+			my $string=&subseq($pre{$key},$m_start,$m_end,"+");
+			push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ;
+		}
+	}
+
+	my $str=scalar @result;
+	my $percent=sprintf("%.2f",$m_texp/$totalReadsNo);
+	$str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}";
+	@{$moRs{$str}}=@result;
+
+#### analysis moRs end
+}
+
+##### moRs print out start
+foreach my $key (keys %moRs) {
+	my @tmp=split/\t/,$key;
+	next if ($tmp[4]<=2);
+	next if($tmp[3]<0.95);
+	my @over;
+	for (my $i=0;$i<@{$moRs{$key}};$i++) {
+		my @arrayi=split/\t/,$moRs{$key}[$i];
+		for (my $j=0;$j<@{$moRs{$key}};$j++) {
+			next if($i==$j);
+			my @arrayj=split/\t/,$moRs{$key}[$j];
+			if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) {
+				push @over,$moRs{$key}[$i];
+			}
+		}
+	}
+	if (@over>0) {
+		print MORS "$key\n";
+		foreach  (@{$moRs{$key}}) {
+			print MORS "$_\n";
+		}
+	}
+}
+###### moRs print out end
+close ALN;
+close LIST;
+close MORS;
+
+$"=" ";##### reset
+
+
+################### Sub programs #################
+sub express{
+	my ($ms,$me,$ss,$se,$read)=@_;
+	my (@mexp,@sexp,@texp);
+	$$read[0][0]=~/:([_|\d]+)_x(\d+)$/;
+	my @numsample=split/_/,$1;
+	for (my $i=0;$i<@numsample;$i++) {
+		$mexp[$i]=0;
+		$sexp[$i]=0;
+		$texp[$i]=0;
+	}
+
+	for (my $i=0;$i<@{$read};$i++) {
+		my $start=$$read[$i][3]+1;
+		my $end=$$read[$i][3]+length($$read[$i][4]);
+		$$read[$i][0]=~/:([_|\d]+)_x(\d+)$/;
+		my $expresses=$1;
+		my @nums=split/_/,$expresses;
+		
+		for (my $j=0;$j<@nums;$j++) {
+			$texp[$j]+=$nums[$j];
+		}
+		if ($start>=$ms && $end<=$me) {
+			for (my $j=0;$j<@nums;$j++) {
+				$mexp[$j]+=$nums[$j];
+			}
+		}
+		if ($start>=$ss && $end<=$se) {
+			for (my $j=0;$j<@nums;$j++) {
+				$sexp[$j]+=$nums[$j];
+			}
+		}
+	}
+	return(\@mexp,\@sexp,\@texp);
+}
+
+sub structure{
+	foreach my $key (keys %pre_mature) {
+		if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";}
+		my ($str,$mfe)=RNA::fold($pre{$key});
+		$struc{$key}{"struc"}=$str;
+		$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe);
+
+		foreach my $id (keys %{$pre_mature{$key}}) {
+			($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str);
+		}
+=cut
+##### Nucleotide complementary 
+		my @tmp=split//,$str;
+		my %a2b;
+		my @bps;
+		for (my $i=0;$i<@tmp;$i++) {
+			if ($tmp[$i] eq "("){push @bps,$i+1 ; next;}
+			if ($tmp[$i] eq ")") {
+				my $up=pop @bps;
+				$a2b{$i+1}=$up;
+				$a2b{$up}=$i+1;
+			}
+		}
+
+##### search star position
+		foreach my $id (keys %{$pre_mature{$key}}) {
+			my $n=0;
+			for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) {				
+				if (defined $a2b{$i}) {
+					my $a=$i; my $b=$a2b{$i};
+					if($a>$b){
+						$pre_mature{$key}{$id}{"star"}[0]=$b-$n+2;
+						$pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]);
+					}
+					if($a<$b{
+						$pre_mature{$key}{$id}{"star"}[1]=$b+$n+2;
+						$pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]);
+					}
+					last;
+				}
+				$n++;
+			}
+		}
+=cut
+	}
+}
+sub other_pair{
+	my ($start,$end,$structure)=@_;
+	##### Nucleotide complementary 
+	my @tmp=split//,$structure;
+	my %a2b;	my @bps;
+	for (my $i=0;$i<@tmp;$i++) {
+		if ($tmp[$i] eq "("){push @bps,$i+1 ; next;}
+		if ($tmp[$i] eq ")") {
+			my $up=pop @bps;
+			$a2b{$i+1}=$up;
+			$a2b{$up}=$i+1;
+		}
+	}
+##### search star position
+	my $n=0;my $startpos; my $endpos;
+	for (my $i=$start;$i<=$end ; $i++) {				
+		if (defined $a2b{$i}) {
+			my $a=$i; my $b=$a2b{$i};
+#			if($a>$b){
+#				$startpos=$b-$n+2;
+#				$endpos=$b-$n+2+($end-$start);
+#			}
+#			if($a<$b){
+				$endpos=$b+$n+2;
+				if($endpos>length($structure)){$endpos=length($structure);}
+				$startpos=$b+$n+2-($end-$start);
+				if($startpos<1){$startpos=1;}
+#			}
+			last;
+		}
+		$n++;
+	}
+	return ($startpos,$endpos);
+}
+sub attachPre{
+	open IN, "<$pre_file_name";
+	my $name;
+	while (my $aline=<IN>) {
+		chomp $aline;
+		if ($aline=~/^>(\S+)/) {
+			$name=$1;
+			next;
+		}
+		$pre{$name} .=$aline;
+	}
+	close IN;
+}
+sub readPosOnPre{
+	open IN,"<read_mapped.bwt";
+	while (my $aline=<IN>) {
+		chomp $aline;
+		my @tmp=split/\t/,$aline;
+		my $id=lc($tmp[2]);
+		push @{$pre_read{$tmp[2]}},[@tmp];
+	}
+	close IN;
+}
+sub maturePosOnPre{
+	open IN,"<mature_mapped.bwt";
+	while (my $aline=<IN>) {
+		chomp $aline;
+		my @tmp=split/\t/,$aline;
+		my $mm=$tmp[0];
+#		$mm=~s/\-3P|\-5P//i;
+		$mm=lc($mm);
+		my $pm=$tmp[2];
+		$pm=lc($pm);
+
+#		next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a
+		next if($mm!~/$pm/);
+#		print "$tmp[2]\t$tmp[0]\n";
+#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;
+#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);
+#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;
+		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;
+		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);
+	}
+	close IN;
+}
+sub mapping{
+    my $err;
+## build bowtie index
+    print STDERR "building bowtie index\n";
+    $err = `bowtie-build $pre_file_name miRNA_precursor`;
+
+## map mature sequences against precursors
+    print STDERR "mapping mature sequences against index\n";
+	$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature mature_mapped.bwt`;
+
+## map reads against precursors
+    print STDERR "mapping read sequences against index\n";
+    $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa read_mapped.bwt `;
+
+}
+
+sub subseq{
+	my $seq=shift;
+	my $beg=shift;
+	my $end=shift;
+	my $strand=shift;
+	
+	my $subseq=substr($seq,$beg-1,$end-$beg+1);
+	if ($strand eq "-") {
+		$subseq=revcom($subseq);
+	}
+	return uc $subseq;
+}
+
+sub revcom{
+	my $seq=shift;
+	$seq=~tr/ATCGatcg/TAGCtagc/;
+	$seq=reverse $seq;
+	return uc $seq;
+}
+
+sub Time{
+        my $time=time();
+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+        $month++;
+        $year+=1900;
+        if (length($sec) == 1) {$sec = "0"."$sec";}
+        if (length($min) == 1) {$min = "0"."$min";}
+        if (length($hour) == 1) {$hour = "0"."$hour";}
+        if (length($day) == 1) {$day = "0"."$day";}
+        if (length($month) == 1) {$month = "0"."$month";}
+        #print "$year-$month-$day $hour:$min:$sec\n";
+        return("$year-$month-$day-$hour-$min-$sec");
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -r -p -m -mis -t -e -f -tag -o -time
+mandatory parameters:
+-p precursor.fa  miRNA precursor sequences from miRBase # must be absolute path
+-m mature.fa     miRNA sequences from miRBase # must be absolute path
+-r reads.fa      your read sequences #must be absolute path
+
+-o output directory
+
+options:
+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0
+-t [int]     threads number,default 1
+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2
+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5
+-tag [string] sample marks# eg. sampleA;sampleB;sampleC
+-time sting #make directory time,default is the local time
+-h help
+USAGE
+exit(1);
+}
+