Mercurial > repos > bigrna > gpsrna

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description:
my $version=1.00;

use File::Path;
use strict;
use File::Basename;
#use Getopt::Std;
use Getopt::Long;
#use RNA;

my %opts;
GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h");
if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
&usage;
}

my $read=$opts{'r'};
my $pre=$opts{'p'};
my $mature=$opts{'m'};

my $dir=$opts{'o'};
unless ($dir=~/\/$/) {$dir .="/";}
if (not -d $dir) {
	mkdir $dir;
}

my $threads=defined $opts{'t'} ? $opts{'t'} : 1;
my $mismatch=defined $opts{'mis'} ?  $opts{'mis'} : 0;

my $upstream = 2;
my $downstream = 5;

$upstream = $opts{'e'} if(defined $opts{'e'});
$downstream = $opts{'f'} if(defined $opts{'f'});

my $marks=defined $opts{'tag'} ? $opts{'tag'} : "";

my $time=Time();

my $tmpdir="${dir}/known_miRNA_Express";
if(not -d $tmpdir){
	mkdir($tmpdir);
}
chdir $tmpdir;

`cp $pre ./`;
my $pre_file_name=basename($pre);

&mapping(); # matures align to precursors && reads align to precursors;

my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;
&maturePosOnPre(); # acknowledge mature positions on precursor

my %pre_read;
&readPosOnPre(); # acknowledge reads positions on precursors

if(!(defined $opts{'tag'})){
	foreach my $key (keys %pre_read) {
		$pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/;
		my @ss=split/_/,$1;
		for (my $i=1;$i<=@ss;$i++) {
			$marks .="Smp$i;";
		}
		last;
	}
}

my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."
&attachPre();

my $preno=scalar (keys %pre);
print "Total Precursor Number is $preno !!!!\n";

my %struc; #mature  star  loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;
&structure();


##### analysis and print out  && moRs
my $aln=$dir."known_microRNA_express.aln";
my $list=$dir."known_microRNA_express.txt";
my $moRs=$dir."known_microRNA_express.moRs";

system("ln -s $mature $dir/known_microRNA_mature.fa ");
system("ln -s $pre $dir/known_microRNA_precursor.fa ");

open ALN,">$aln";
open LIST,">$list";
open MORS,">$moRs";

$"="\t"; ##### @array print in \t

my @marks=split/\;/,$marks;
#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n";
print LIST "#matueID\tpreID\tpos1\tpos2";
for (my $i=0;$i<@marks;$i++) {
	print LIST "\t",$marks[$i],"_matureExp";
}
for (my $i=0;$i<@marks;$i++) {
	print LIST "\t",$marks[$i],"_starExp";
}
for (my $i=0;$i<@marks;$i++) {
	print LIST "\t",$marks[$i],"_totalExp";
}
print LIST "\n";
print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n";
print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n";
my %moRs;

foreach my $key (keys %pre) {
	print ALN ">$key\n$pre{$key}\n$struc{$key}{struc}  ($struc{$key}{mfe})\n";
	next if(! (exists $pre_read{$key}));
	my @array=@{$pre_read{$key}};
	@array=sort{$a->[3]<=> $b->[3]} @array;

	my $length=length($pre{$key});

	my $maxline=-1;my $max=0; ### storage the maxinum express read line
	my $totalReadsNo=0;
	my @not_over=(); ### new read format better for moRs analysis

####print out Aln file start
	for (my $i=0;$i<@array;$i++) {
		my $maps=$array[$i][3]+1;
		my $mape=$array[$i][3]+length($array[$i][4]);
		my $str="";
		$str .= "." x ($maps-1);
		$str .=$array[$i][4];
		$str .="." x ($length-$mape);
		$str .="        ";

		$array[$i][0]=~/:([\d|_]+)_x(\d+)$/;
		my @sample=split /\_/,$1;
		my $total=$2;
		print ALN $str,"@sample","\t",$total,"\n";

		if($total>$max){$max=$total; $maxline=$i;}
		$totalReadsNo+=$total;

		push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"];
	}
####print out Aln file end

#### express list start
	my ($ms,$me,$ss,$se);
	if (!(exists($pre_mature{$key}))) {
		$ms=$array[$maxline][3]+1;
		$me=$array[$maxline][3]+length($array[$maxline][4]);
		($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'});

		my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
		print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n";
	}
	else{
		foreach my $maID (keys %{$pre_mature{$key}}) {
			$ms=$pre_mature{$key}{$maID}{"mature"}[0];
			$me=$pre_mature{$key}{$maID}{"mature"}[1];
			$ss=$pre_mature{$key}{$maID}{"star"}[0];
			$se=$pre_mature{$key}{$maID}{"star"}[1];
			my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
			print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n";
		}
	}
#### express list end

#### analysis moRs start
	my @result; my @m_texp;my $m_texp=0; ### moRs informations

	while (@not_over>0) {
		my @over=@not_over;
		@not_over=();

#丰度最高tag
		my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1;
		for (my $i=0;$i<@over;$i++) {
			my @m_array=@{$over[$i]};
			if ($m_max<$m_array[4]) {
				$m_max=$m_array[4];
				$m_maxline=$i;
			}
		}
		$m_start=$over[$m_maxline][1];
		$m_end=$over[$m_maxline][2];
		$m_exp=$m_max;
		$over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/;
		my @m_nums=split/_/,$1;
		for (my $j=0;$j<@m_nums;$j++) {
			$m_exp[$j]=$m_nums[$j];
		}

#统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt
		for (my $i=0;$i<@over;$i++) {
			next if($i==$m_maxline);
			my @m_array=@{$over[$i]};
			if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) {
				$m_exp+=$m_array[4];
				$m_no++;
				$m_array[3]=~/:([\d|_]+)_x(\d+)$/;
				my @m_nums=split/_/,$1;
				for (my $j=0;$j<@m_nums;$j++) {
					$m_exp[$j] +=$m_nums[$j];
				}
			}
			elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads
		}
		if($m_exp>5){### 5个reads
			$m_texp+=$m_exp;
			for (my $j=0;$j<@m_exp;$j++) {
				$m_texp[$j]+=$m_exp[$j];
			}
			my $string=&subseq($pre{$key},$m_start,$m_end,"+");
			push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ;
		}
	}

	my $str=scalar @result;
	my $percent=sprintf("%.2f",$m_texp/$totalReadsNo);
	$str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}";
	@{$moRs{$str}}=@result;

#### analysis moRs end
}

##### moRs print out start
foreach my $key (keys %moRs) {
	my @tmp=split/\t/,$key;
	next if ($tmp[4]<=2);
	next if($tmp[3]<0.95);
	my @over;
	for (my $i=0;$i<@{$moRs{$key}};$i++) {
		my @arrayi=split/\t/,$moRs{$key}[$i];
		for (my $j=0;$j<@{$moRs{$key}};$j++) {
			next if($i==$j);
			my @arrayj=split/\t/,$moRs{$key}[$j];
			if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) {
				push @over,$moRs{$key}[$i];
			}
		}
	}
	if (@over>0) {
		print MORS "$key\n";
		foreach  (@{$moRs{$key}}) {
			print MORS "$_\n";
		}
	}
}
###### moRs print out end
close ALN;
close LIST;
close MORS;

$"=" ";##### reset


################### Sub programs #################
sub express{
	my ($ms,$me,$ss,$se,$read)=@_;
	my (@mexp,@sexp,@texp);
	$$read[0][0]=~/:([_|\d]+)_x(\d+)$/;
	my @numsample=split/_/,$1;
	for (my $i=0;$i<@numsample;$i++) {
		$mexp[$i]=0;
		$sexp[$i]=0;
		$texp[$i]=0;
	}

	for (my $i=0;$i<@{$read};$i++) {
		my $start=$$read[$i][3]+1;
		my $end=$$read[$i][3]+length($$read[$i][4]);
		$$read[$i][0]=~/:([_|\d]+)_x(\d+)$/;
		my $expresses=$1;
		my @nums=split/_/,$expresses;

		for (my $j=0;$j<@nums;$j++) {
			$texp[$j]+=$nums[$j];
		}
		if ($start>=$ms && $end<=$me) {
			for (my $j=0;$j<@nums;$j++) {
				$mexp[$j]+=$nums[$j];
			}
		}
		if ($start>=$ss && $end<=$se) {
			for (my $j=0;$j<@nums;$j++) {
				$sexp[$j]+=$nums[$j];
			}
		}
	}
	return(\@mexp,\@sexp,\@texp);
}

sub structure{
	foreach my $key (keys %pre_mature) {
		if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";}
		#my ($str,$mfe)=RNA::fold($pre{$key});
		my $rnafold=`perl -e 'print "$pre{$key}"' | RNAfold --noPS`;
		my @rnafolds=split/\s+/,$rnafold;
		my $str=$rnafolds[1];
		my $mfe=$rnafolds[-1];
		$mfe=~s/\(//;
		$mfe=~s/\)//;

		$struc{$key}{"struc"}=$str;
		#$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe);
		$struc{$key}{"mfe"}=$mfe;

		foreach my $id (keys %{$pre_mature{$key}}) {
			($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str);
		}
=cut
##### Nucleotide complementary
		my @tmp=split//,$str;
		my %a2b;
		my @bps;
		for (my $i=0;$i<@tmp;$i++) {
			if ($tmp[$i] eq "("){push @bps,$i+1 ; next;}
			if ($tmp[$i] eq ")") {
				my $up=pop @bps;
				$a2b{$i+1}=$up;
				$a2b{$up}=$i+1;
			}
		}

##### search star position
		foreach my $id (keys %{$pre_mature{$key}}) {
			my $n=0;
			for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) {
				if (defined $a2b{$i}) {
					my $a=$i; my $b=$a2b{$i};
					if($a>$b){
						$pre_mature{$key}{$id}{"star"}[0]=$b-$n+2;
						$pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]);
					}
					if($a<$b{
						$pre_mature{$key}{$id}{"star"}[1]=$b+$n+2;
						$pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]);
					}
					last;
				}
				$n++;
			}
		}
=cut
	}
}
sub other_pair{
	my ($start,$end,$structure)=@_;
	##### Nucleotide complementary
	my @tmp=split//,$structure;
	my %a2b;	my @bps;
	for (my $i=0;$i<@tmp;$i++) {
		if ($tmp[$i] eq "("){push @bps,$i+1 ; next;}
		if ($tmp[$i] eq ")") {
			my $up=pop @bps;
			$a2b{$i+1}=$up;
			$a2b{$up}=$i+1;
		}
	}
##### search star position
	my $n=0;my $startpos; my $endpos;
	for (my $i=$start;$i<=$end ; $i++) {
		if (defined $a2b{$i}) {
			my $a=$i; my $b=$a2b{$i};
#			if($a>$b){
#				$startpos=$b-$n+2;
#				$endpos=$b-$n+2+($end-$start);
#			}
#			if($a<$b){
				$endpos=$b+$n+2;
				if($endpos>length($structure)){$endpos=length($structure);}
				$startpos=$b+$n+2-($end-$start);
				if($startpos<1){$startpos=1;}
#			}
			last;
		}
		$n++;
	}
	return ($startpos,$endpos);
}
sub attachPre{
	open IN, "<$pre_file_name";
	my $name;
	while (my $aline=<IN>) {
		chomp $aline;
		if ($aline=~/^>(\S+)/) {
			$name=$1;
			next;
		}
		$pre{$name} .=$aline;
	}
	close IN;
}
sub readPosOnPre{
	open IN,"<read_mapped.bwt";
	while (my $aline=<IN>) {
		chomp $aline;
		my @tmp=split/\t/,$aline;
		my $id=lc($tmp[2]);
		push @{$pre_read{$tmp[2]}},[@tmp];
	}
	close IN;
}
sub maturePosOnPre{
	open IN,"<mature_mapped.bwt";
	while (my $aline=<IN>) {
		chomp $aline;
		my @tmp=split/\t/,$aline;
		my $mm=$tmp[0];
#		$mm=~s/\-3P|\-5P//i;
		$mm=lc($mm);
		my $pm=$tmp[2];
		$pm=lc($pm);

#		next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a
		next if($mm!~/$pm/);
#		print "$tmp[2]\t$tmp[0]\n";
#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;
#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);
#		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;
		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;
		$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);
	}
	close IN;
}
sub mapping{
    my $err;
## build bowtie index
    #print STDERR "building bowtie index\n";
    $err = `bowtie-build $pre_file_name miRNA_precursor`;

## map mature sequences against precursors
    #print STDERR "mapping mature sequences against index\n";
	$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`;

## map reads against precursors
    #print STDERR "mapping read sequences against index\n";
    $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`;

}

sub subseq{
	my $seq=shift;
	my $beg=shift;
	my $end=shift;
	my $strand=shift;

	my $subseq=substr($seq,$beg-1,$end-$beg+1);
	if ($strand eq "-") {
		$subseq=revcom($subseq);
	}
	return uc $subseq;
}

sub revcom{
	my $seq=shift;
	$seq=~tr/ATCGatcg/TAGCtagc/;
	$seq=reverse $seq;
	return uc $seq;
}

sub Time{
        my $time=time();
        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
        $month++;
        $year+=1900;
        if (length($sec) == 1) {$sec = "0"."$sec";}
        if (length($min) == 1) {$min = "0"."$min";}
        if (length($hour) == 1) {$hour = "0"."$hour";}
        if (length($day) == 1) {$day = "0"."$day";}
        if (length($month) == 1) {$month = "0"."$month";}
        #print "$year-$month-$day $hour:$min:$sec\n";
        return("$year-$month-$day-$hour-$min-$sec");
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -r -p -m -mis -t -e -f -tag -o -time
mandatory parameters:
-p precursor.fa  miRNA precursor sequences from miRBase # must be absolute path
-m mature.fa     miRNA sequences from miRBase # must be absolute path
-r reads.fa      your read sequences #must be absolute path

-o output directory

options:
-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0
-t [int]     threads number,default 1
-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2
-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5
-tag [string] sample marks# eg. sampleA;sampleB;sampleC
-time sting #make directory time,default is the local time
-h help
USAGE
exit(1);
}
author	bigrna
date	Sun, 04 Jan 2015 02:47:25 -0500
parents
children