Repository 'mirplant2'
hg clone https://toolshed.g2.bx.psu.edu/repos/big-tiandm/mirplant2

Changeset 18:a79212816cbc (2014-07-25)
Previous changeset 17:1131b4008650 (2014-07-25) Next changeset 19:141a337097e1 (2014-07-25)
Commit message:
Uploaded
added:
quantify.pl
b
diff -r 1131b4008650 -r a79212816cbc quantify.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify.pl Fri Jul 25 05:22:21 2014 -0400
[
b'@@ -0,0 +1,495 @@\n+#!/usr/bin/perl -w\r\n+#Filename:\r\n+#Author: Tian Dongmei\r\n+#Email: tiandm@big.ac.cn\r\n+#Date: 2013/7/19\r\n+#Modified:\r\n+#Description: \r\n+my $version=1.00;\r\n+\r\n+use File::Path;\r\n+use strict;\r\n+use File::Basename;\r\n+#use Getopt::Std;\r\n+use Getopt::Long;\r\n+use RNA;\r\n+\r\n+my %opts;\r\n+GetOptions(\\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h");\r\n+if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments\r\n+&usage;\r\n+}\r\n+\r\n+my $read=$opts{\'r\'};\r\n+my $pre=$opts{\'p\'};\r\n+my $mature=$opts{\'m\'};\r\n+\r\n+my $dir=$opts{\'o\'};\r\n+unless ($dir=~/\\/$/) {$dir .="/";}\r\n+if (not -d $dir) {\r\n+\tmkdir $dir;\r\n+}\r\n+\r\n+my $threads=defined $opts{\'t\'} ? $opts{\'t\'} : 1;\r\n+my $mismatch=defined $opts{\'mis\'} ?  $opts{\'mis\'} : 0;\r\n+\r\n+my $upstream = 2;\r\n+my $downstream = 5;\r\n+\r\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\r\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\r\n+\r\n+my $marks=defined $opts{\'tag\'} ? $opts{\'tag\'} : "";\r\n+\r\n+my $time=Time();\r\n+if (defined $opts{\'time\'}) { $time=$opts{\'time\'};}\r\n+\r\n+my $tmpdir="${dir}/miRNA_Express_${time}";\r\n+if(not -d $tmpdir){\r\n+\tmkdir($tmpdir);\r\n+}\r\n+chdir $tmpdir;\r\n+\r\n+`cp $pre ./`;\r\n+my $pre_file_name=basename($pre);\r\n+\r\n+&mapping(); # matures align to precursors && reads align to precursors;\r\n+\r\n+my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;\r\n+&maturePosOnPre(); # acknowledge mature positions on precursor \r\n+\r\n+my %pre_read;\r\n+&readPosOnPre(); # acknowledge reads positions on precursors\r\n+\r\n+if(!(defined $opts{\'tag\'})){\r\n+\tforeach my $key (keys %pre_read) {\r\n+\t\t$pre_read{$key}[0][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @ss=split/_/,$1;\r\n+\t\tfor (my $i=1;$i<=@ss;$i++) {\r\n+\t\t\t$marks .="Smp$i;";\r\n+\t\t}\r\n+\t\tlast;\r\n+\t}\r\n+}\r\n+\r\n+my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."\r\n+&attachPre();\r\n+\r\n+my $preno=scalar (keys %pre);\r\n+print "Total Precursor Number is $preno !!!!\\n";\r\n+\r\n+my %struc; #mature  star  loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;\r\n+&structure();\r\n+\r\n+\r\n+##### analysis and print out  && moRs\r\n+my $aln=$dir."known_microRNA_express.aln";\r\n+my $list=$dir."known_microRNA_express.txt";\r\n+my $moRs=$dir."known_microRNA_express.moRs";\r\n+\r\n+system("ln $mature $dir/known_microRNA_mature.fa ");\r\n+system("ln $pre $dir/known_microRNA_precursor.fa ");\r\n+\r\n+open ALN,">$aln";\r\n+open LIST,">$list";\r\n+open MORS,">$moRs";\r\n+\r\n+$"="\\t"; ##### @array print in \\t\r\n+\r\n+my @marks=split/\\;/,$marks;\r\n+#print LIST "#matueID\\tpreID\\tpos1\\tpos2\\tmatureExp\\tstarExp\\ttotalExp\\n";\r\n+print LIST "#matueID\\tpreID\\tpos1\\tpos2";\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_matureExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_starExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_totalExp";\r\n+}\r\n+print LIST "\\n";\r\n+print ALN "#>precursor ID \\n#precursor sequence\\n#precursor structure (mfe)\\n#RNA_seq\\t@marks\\ttotal\\n";\r\n+print MORS "#>precursor ID\\tstrand\\texpress_reads\\texpress_reads\\/total_reads\\tblock_number\\tprecursor_sequence\\n#\\tblock_start\\tblock_end\\t@marks\\ttotal\\ttag_number\\tsequence\\n";\r\n+my %moRs;\r\n+\r\n+foreach my $key (keys %pre) {\r\n+\tprint ALN ">$key\\n$pre{$key}\\n$struc{$key}{struc}  ($struc{$key}{mfe})\\n";\r\n+\tnext if(! (exists $pre_read{$key}));\r\n+\tmy @array=@{$pre_read{$key}};\r\n+\t@array=sort{$a->[3]<=> $b->[3]} @array;\r\n+\t\r\n+\tmy $length=length($pre{$key});\r\n+\r\n+\tmy $maxline=-1;my $max=0; ### storage the maxinum express read line\r\n+\tmy $totalReadsNo=0; \r\n+\tmy @not_over=(); ### new read format better for moRs analysis\r\n+\r\n+####print out Aln file start\r\n+\tfor (my $i=0;$i<@array;$i++) {\r\n+\t\tmy $maps=$array[$i][3]+1;\r\n+\t\tmy $mape=$array[$i][3]+length($array[$i][4]);\r\n+\t\tmy $str="";\r\n+\t\t$str .= "." x ($maps-1);\r\n+\t\t$str .=$array[$i][4];\r\n+\t\t$str .="." x ($length-$mape);\r\n+\t\t$str .="        ";\r\n+\r\n+\t\t$array[$i][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @sampl'..b') {\r\n+\t\t\tmy $a=$i; my $b=$a2b{$i};\r\n+#\t\t\tif($a>$b){\r\n+#\t\t\t\t$startpos=$b-$n+2;\r\n+#\t\t\t\t$endpos=$b-$n+2+($end-$start);\r\n+#\t\t\t}\r\n+#\t\t\tif($a<$b){\r\n+\t\t\t\t$endpos=$b+$n+2;\r\n+\t\t\t\tif($endpos>length($structure)){$endpos=length($structure);}\r\n+\t\t\t\t$startpos=$b+$n+2-($end-$start);\r\n+\t\t\t\tif($startpos<1){$startpos=1;}\r\n+#\t\t\t}\r\n+\t\t\tlast;\r\n+\t\t}\r\n+\t\t$n++;\r\n+\t}\r\n+\treturn ($startpos,$endpos);\r\n+}\r\n+sub attachPre{\r\n+\topen IN, "<$pre_file_name";\r\n+\tmy $name;\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tif ($aline=~/^>(\\S+)/) {\r\n+\t\t\t$name=$1;\r\n+\t\t\tnext;\r\n+\t\t}\r\n+\t\t$pre{$name} .=$aline;\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub readPosOnPre{\r\n+\topen IN,"<read_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $id=lc($tmp[2]);\r\n+\t\tpush @{$pre_read{$tmp[2]}},[@tmp];\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub maturePosOnPre{\r\n+\topen IN,"<mature_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $mm=$tmp[0];\r\n+#\t\t$mm=~s/\\-3P|\\-5P//i;\r\n+\t\t$mm=lc($mm);\r\n+\t\tmy $pm=$tmp[2];\r\n+\t\t$pm=lc($pm);\r\n+\r\n+#\t\tnext if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a\r\n+\t\tnext if($mm!~/$pm/);\r\n+#\t\tprint "$tmp[2]\\t$tmp[0]\\n";\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub mapping{\r\n+    my $err;\r\n+## build bowtie index\r\n+    print STDERR "building bowtie index\\n";\r\n+    $err = `bowtie-build $pre_file_name miRNA_precursor`;\r\n+\r\n+## map mature sequences against precursors\r\n+    print STDERR "mapping mature sequences against index\\n";\r\n+\t$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature mature_mapped.bwt`;\r\n+\r\n+## map reads against precursors\r\n+    print STDERR "mapping read sequences against index\\n";\r\n+    $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa read_mapped.bwt `;\r\n+\r\n+}\r\n+\r\n+sub subseq{\r\n+\tmy $seq=shift;\r\n+\tmy $beg=shift;\r\n+\tmy $end=shift;\r\n+\tmy $strand=shift;\r\n+\t\r\n+\tmy $subseq=substr($seq,$beg-1,$end-$beg+1);\r\n+\tif ($strand eq "-") {\r\n+\t\t$subseq=revcom($subseq);\r\n+\t}\r\n+\treturn uc $subseq;\r\n+}\r\n+\r\n+sub revcom{\r\n+\tmy $seq=shift;\r\n+\t$seq=~tr/ATCGatcg/TAGCtagc/;\r\n+\t$seq=reverse $seq;\r\n+\treturn uc $seq;\r\n+}\r\n+\r\n+sub Time{\r\n+        my $time=time();\r\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+        $month++;\r\n+        $year+=1900;\r\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\r\n+        if (length($min) == 1) {$min = "0"."$min";}\r\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\r\n+        if (length($day) == 1) {$day = "0"."$day";}\r\n+        if (length($month) == 1) {$month = "0"."$month";}\r\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\r\n+        return("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+\r\n+sub usage{\r\n+print <<"USAGE";\r\n+Version $version\r\n+Usage:\r\n+$0 -r -p -m -mis -t -e -f -tag -o -time\r\n+mandatory parameters:\r\n+-p precursor.fa  miRNA precursor sequences from miRBase # must be absolute path\r\n+-m mature.fa     miRNA sequences from miRBase # must be absolute path\r\n+-r reads.fa      your read sequences #must be absolute path\r\n+\r\n+-o output directory\r\n+\r\n+options:\r\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\r\n+-t [int]     threads number,default 1\r\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\r\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\r\n+-tag [string] sample marks# eg. sampleA;sampleB;sampleC\r\n+-time sting #make directory time,default is the local time\r\n+-h help\r\n+USAGE\r\n+exit(1);\r\n+}\r\n+\r\n'