Mercurial > repos > bigrna > gpsrna
changeset 0:87fe81de0931 draft default tip
Uploaded
line wrap: on
line diff
#!/usr/bin/perl
# File: Annotate.pl (added in changeset 0:87fe81de0931)
#Filename: Annotate.pl
#Author: Chentt
#Email: chentt@big.ac.cn
#Date: 2014/4/10
#Modified:
#Description: cluster annotate by priority
#
# For every cluster in the -i file (column 1 has the form "chr:start-end")
# this script lists the genes from the -g gene list that the cluster
#   * overlaps                                  -> "gene-body;ID;strand"
#   * lies within -d bases before the gene start -> "up1-kb" on "+", "down1-kb" on "-"
#   * lies within -d bases after the gene end    -> "down1-kb" on "+", "up1-kb" on "-"
# Clusters that hit nothing are labelled "intergenic".  The labels keep the
# historical "1-kb" wording even when -d is not 1000.
use strict;
use warnings;
use Getopt::Long;

my $version = 1.00;

my %opts;
# -t is still accepted for command-line compatibility but is not used.
GetOptions(\%opts, "i=s", "d=i", "g=s", "o=s", "t=s", "h");
if (!(defined $opts{i} and defined $opts{g} and defined $opts{o}) || defined $opts{h}) {    #necessary arguments
    usage();
}
my $dis = defined $opts{'d'} ? $opts{'d'} : 1000;

# %gene: chromosome -> gene ID -> [start, end, strand]
my %gene;
open my $gene_fh, '<', $opts{g} or die "Cannot open gene list '$opts{g}': $!\n";
while (my $aline = <$gene_fh>) {
    chomp $aline;
    next if $aline =~ /^\#/;
    next if $aline !~ /\S/;
    my ($id, $chr, $start, $end, $strand) = split /\t/, $aline;    #ID chr start end strand
    $gene{$chr}{$id} = [$start, $end, $strand];
}
close $gene_fh;

my $filein  = $opts{'i'};
my $fileout = $opts{'o'};

open my $in_fh,  '<', $filein  or die "Cannot open input file '$filein': $!\n";
open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
while (my $aline = <$in_fh>) {
    chomp $aline;
    if ($aline =~ /^\#/) {
        print {$out_fh} "$aline\tP_annotate\n";
        next;
    }
    next if $aline !~ /\S/;

    my ($cluster_id) = split /\t/, $aline;
    my ($chr, $range) = split /:/, $cluster_id;
    my ($c_start, $c_end) = defined $range ? split(/-/, $range) : ();
    if (!defined $c_start || !defined $c_end) {
        # Without coordinates nothing can match; keep the row, as before.
        warn "Cannot parse cluster position from '$cluster_id'\n";
        print {$out_fh} "$aline\tintergenic\n";
        next;
    }
    $chr =~ s/Chr0/Chr/;

    my @result;
    my $genes_on_chr = $gene{$chr} || {};
    # Sorted so that the order of multiple annotations is reproducible.
    for my $key (sort keys %{$genes_on_chr}) {
        my ($g_start, $g_end, $strand) = @{ $genes_on_chr->{$key} };
        if ($c_start < $g_end && $c_end > $g_start) {
            push @result, "gene-body;$key;$strand";
        }
        elsif ($c_start < $g_start && $c_end > $g_start - $dis) {
            # Cluster sits in the window before the gene's start coordinate.
            push @result, "up1-kb;$key;$strand"   if $strand eq "+";
            push @result, "down1-kb;$key;$strand" if $strand eq "-";
        }
        elsif ($c_start < $g_end + $dis && $c_end > $g_end) {
            # Cluster sits in the window after the gene's end coordinate.
            push @result, "down1-kb;$key;$strand" if $strand eq "+";
            push @result, "up1-kb;$key;$strand"   if $strand eq "-";
        }
    }

    my $result = @result ? join("\t", @result) : "intergenic";
    print {$out_fh} "$aline\t$result\n";
}
close $in_fh;
close $out_fh or die "Error while writing '$fileout': $!\n";

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -o -g -d
options:
-i input file
-g genelist file
-d int the length of the upstream and downstream,default 1000
-o output file
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: ClassAnnotate.pl (added in changeset 0:87fe81de0931)
#Filename: ClassAnnotate.pl
#Author: Chen Tingting
#Email: chentt@big.ac.cn
#Date: 2014/5/13
#Modified:
#Description: cluster annotate
#
# Steps, in order:
#   1. read the GFF (-g), keep "gene" rows and write a gene list (-l)
#   2. read the NAT pairs (-n), convert them to genomic intervals and write
#      a NAT table (-t)
#   3. read the repeat table (-r; the first three lines are skipped)
#   4. read the phase table (-p); IDs whose 6th column is >= 25 are "phase"
#   5. append Phase/Long/RepeatClass/NatClass1/NatClass2/NatID columns to
#      every cluster of the input (-i) and write the result (-o)
use strict;
use warnings;
use Getopt::Long;

my $version = 1.00;

my %opts;
GetOptions(\%opts, "i=s", "g=s", "n=s", "r=s", "p=s", "o=s", "t=s", "l=s", "h");
if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) {    #necessary arguments
    usage();
}

#========= 1. GFF -> gene list =========
# %gene1: gene ID -> list of [start, end, chr]
my %gene1;
open my $gff_fh,  '<', $opts{g} or die "Cannot open gff file '$opts{g}': $!\n";
open my $list_fh, '>', $opts{l} or die "Cannot write genelist file '$opts{l}': $!\n";
print {$list_fh} "#ID\tchr\tstart\tend\tstrand\n";
my $n = 1;    # running number for genes without an Alias attribute
while (my $aline = <$gff_fh>) {
    chomp $aline;
    next if $aline =~ /^\#/;
    my @tmp = split /\t/, $aline;
    next unless defined $tmp[2] && $tmp[2] eq "gene";
    $tmp[0] =~ s/Chr\./Chr/;

    my $ID;
    for my $infor (split /;/, defined $tmp[8] ? $tmp[8] : '') {
        if ($infor =~ /Alias\=(\S+)$/) {
            $ID = $1;
            last;
        }
    }
    # Only genes that really have no Alias consume an "unknown" number.
    if (!defined $ID) {
        $ID = "unknown$n";
        $n++;
    }
    push @{ $gene1{$ID} }, [$tmp[3], $tmp[4], $tmp[0]];
    print {$list_fh} "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
}
close $gff_fh;
close $list_fh or die "Error while writing '$opts{l}': $!\n";

#========= 2. NAT pairs =========
# %nat: chr -> list of [start, end, type1, type2, NAT id]
my %nat;
open my $tmp_fh, '>', $opts{t} or die "Cannot write nat output file '$opts{t}': $!\n";
print {$tmp_fh} "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";

open my $nat_fh, '<', $opts{n} or die "Cannot open NATs file '$opts{n}': $!\n";
my $nat_n = 1;
while (my $aline = <$nat_fh>) {
    next if $aline =~ /^\#/;    #osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
    chomp $aline;
    next if $aline !~ /\S/;
    my @arr = split /\t/, $aline;
    if (@arr < 12) {
        warn "Skipping NAT line with fewer than 12 columns: $aline\n";
        next;
    }
    # Only the first recorded location of each gene is used.
    my $sense = exists $gene1{ $arr[1] } ? $gene1{ $arr[1] }[0] : undef;
    my $anti  = exists $gene1{ $arr[3] } ? $gene1{ $arr[3] }[0] : undef;
    if (!$sense || !$anti) {
        warn "Skipping NAT pair $arr[1]/$arr[3]: gene not found in gff\n";
        next;
    }
    my ($s1, $e1, $chr1) = @{$sense};
    my ($s2, $e2, $chr2) = @{$anti};

    my ($ns, $ne, $ns2, $ne2);
    if ($arr[11] =~ /Nearby/) {
        # One interval between the two genes, reported for both partners.
        ($ns, $ne) = nearby($s1, $e1, $s2, $e2);
        ($ns2, $ne2) = ($ns, $ne);
        push @{ $nat{$chr1} }, [$ns, $ne, $arr[5], $arr[11], "NATs" . $nat_n];
    }
    else {
        # Columns 7-10 are offsets inside the two transcripts.
        $ns  = $s1 + $arr[6] - 1;
        $ne  = $s1 + $arr[7] - 1;
        $ns2 = $e2 - $arr[9] + 1;
        $ne2 = $e2 - $arr[8] + 1;
        push @{ $nat{$chr1} }, [$ns,  $ne,  $arr[5], $arr[11], "NATs$nat_n" . "_1"];    #start,end,class1,class2
        push @{ $nat{$chr2} }, [$ns2, $ne2, $arr[5], $arr[11], "NATs$nat_n" . "_2"];
    }
    print {$tmp_fh} "NATs$nat_n\t$arr[1]\t$arr[2]\t$chr1\t$s1\t$e1\t$arr[3]\t$arr[4]\t$chr2\t$s2\t$e2\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
    $nat_n++;
}
close $nat_fh;
close $tmp_fh or die "Error while writing '$opts{t}': $!\n";

#========= 3. repeats =========
# %repeat: chr -> list of [start, end, class]
my %repeat;
open my $rep_fh, '<', $opts{'r'} or die "Cannot open repeat file '$opts{r}': $!\n";
<$rep_fh> for 1 .. 3;    # skip the three header lines
while (my $aline = <$rep_fh>) {
    chomp $aline;
    $aline =~ s/^\s+//;
    my @tmp = split /\s+/, $aline;
    next if @tmp < 11;
    $tmp[4] =~ s/chr0/Chr/;
    $tmp[4] =~ s/chr/Chr/;
    push @{ $repeat{ $tmp[4] } }, [$tmp[5], $tmp[6], $tmp[10]];    #start,end,class
}
close $rep_fh;

# Sort every interval list once by start instead of once per cluster.
for my $intervals_of (\%repeat, \%nat) {
    for my $chr (keys %{$intervals_of}) {
        $intervals_of->{$chr} = [sort { $a->[0] <=> $b->[0] } @{ $intervals_of->{$chr} }];
    }
}

#========= 4. phase =========
my %phase;
open my $phase_fh, '<', $opts{'p'} or die "Cannot open phase file '$opts{p}': $!\n";
while (my $aline = <$phase_fh>) {
    chomp $aline;
    next if $aline =~ /^\#/;
    my @tmp = split /\t/, $aline;
    next unless defined $tmp[5];
    if ($tmp[5] >= 25) {
        $phase{ $tmp[0] } = $tmp[5];
    }
}
close $phase_fh;

#========= 5. annotate clusters =========
my $filein  = $opts{'i'};
my $fileout = $opts{'o'};
open my $in_fh,  '<', $filein  or die "Cannot open input file '$filein': $!\n";
open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
while (my $aline = <$in_fh>) {
    chomp $aline;
    if ($aline =~ /^\#/) {
        print {$out_fh} "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
        next;
    }
    next if $aline !~ /\S/;
    my @tmp = split /\t/, $aline;
    my ($chr, $range) = split /\:/, $tmp[0];
    my ($start, $end) = defined $range ? split(/\-/, $range) : ();
    if (!defined $start || !defined $end) {
        warn "Skipping line with unparsable cluster position: $aline\n";
        next;
    }
    $chr =~ s/Chr0/Chr/;

    my @repeat_hits = overlapping($start, $end, $repeat{$chr});
    my @nat_hits    = overlapping($start, $end, $nat{$chr});

    my $repeat = @repeat_hits ? join(";", map { $_->[2] } @repeat_hits) : "\/";
    my $nat1   = @nat_hits    ? join(";", map { $_->[2] } @nat_hits)    : "\/";
    my $nat2   = @nat_hits    ? join(";", map { $_->[3] } @nat_hits)    : "\/";
    my $nat    = @nat_hits    ? join(";", map { $_->[4] } @nat_hits)    : "\/";

    my $phase = defined $phase{ $tmp[0] } ? "phase" : "\/";

    my $long = "\/";
    if (defined $tmp[2] and $tmp[1] eq "\>30nt" and $tmp[2] >= 0.5) {
        $long = "long";
    }
    print {$out_fh} "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
}
close $in_fh;
close $out_fh or die "Error while writing '$fileout': $!\n";

# Return the intervals (array refs whose first two elements are start and
# end) that overlap [$start, $end].  $intervals must be sorted by start, or
# undef when the chromosome has no intervals.  Touching ends do not count
# as overlap, matching the strict comparisons this script has always used.
sub overlapping {
    my ($start, $end, $intervals) = @_;
    my @hits;
    for my $interval (@{ $intervals || [] }) {
        last if $end < $interval->[0];    # sorted: nothing further can overlap
        push @hits, $interval if $start < $interval->[1] && $end > $interval->[0];
    }
    return @hits;
}

# Interval for a "Nearby" pair.  Arguments: (start1, end1, start2, end2).
# Returns (end1, start2) when gene 1 ends before gene 2 starts, otherwise
# (end2, start1).
sub nearby {
    my @p = @_;
    return $p[1] < $p[2] ? ($p[1], $p[2]) : ($p[3], $p[0]);
}

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -o -g -n -r -p -t -l
options:
-i input file
 -g gff file
 -n NATs file
 -r repeat file
 -p phase file
-o output file
-t nat output file
-l genelist output file
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: DEGseq_2.pl (added in changeset 0:87fe81de0931)
#Filename: DEGseq_2.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2009-05-06
#Modified:
#Description: writes "degseq.R" into -outdir (an R script that loads the
#             DEGseq library and calls DEGexp with method "MARS" on two
#             columns of the -i table) and then runs it with "R CMD BATCH".
use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my $version = 1.00;

my %opts;
GetOptions(\%opts, "i=s", "outdir=s", "column1:i", "mark1=s", "depth1:i", "depth2:i", "column2:i", "mark2=s", "h");
if (!(defined $opts{i} and defined $opts{outdir} and defined $opts{mark1} and defined $opts{mark2}) || defined $opts{h}) {    #necessary arguments
    usage();
}

my $filein    = $opts{'i'};
my $outputdir = $opts{'outdir'};
unless ($outputdir =~ /\/$/) { $outputdir .= "/"; }
my $column1 = defined $opts{column1} ? $opts{column1} : 3;
my $column2 = defined $opts{column2} ? $opts{column2} : 4;
my $mark1   = $opts{mark1};
my $mark2   = $opts{mark2};
my $fileout = $outputdir . "degseq.R";

# The depth arguments are only passed to DEGexp when both were given.
my $depth_args = '';
if (defined $opts{'depth1'} && defined $opts{'depth2'}) {
    $depth_args = "depth1=$opts{depth1},depth2=$opts{depth2},";
}

open my $out_fh, '>', $fileout or die "Cannot write R script '$fileout': $!\n";
print {$out_fh} "library(DEGseq)\n";
print {$out_fh} "geneExpFile <- system.file(package=\"DEGseq\")\n";
print {$out_fh} "geneExpFile<-file.path(\"$filein\")\n";
print {$out_fh} "layout(matrix(c(1,2,3,4,5,6), 3, 2, byrow=TRUE))\npar(mar=c(2, 2, 2,2))\n";
print {$out_fh} "outputdir<-file.path(\"$outputdir\")\n";
print {$out_fh} "geneExpMatrix1 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column1))\n";
print {$out_fh} "geneExpMatrix2 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column2))\n";
print {$out_fh} "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",${depth_args}outputDir=outputdir,method=\"MARS\")\n";
close $out_fh or die "Error while writing '$fileout': $!\n";

# List form: the script path is handed to R as-is, without a shell, so
# spaces or shell metacharacters in -outdir cannot alter the command.
# system() already waits for R to finish; a failure is now reported.
system('R', 'CMD', 'BATCH', $fileout) == 0
    or die "R CMD BATCH $fileout failed (status $?)\n";

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -outdir -column1 -mark1 -column2 -mark2 -depth1 -depth2
options:
-i input file
-outdir output file dir
-column1 the first column for DEGseq
-mark1 the name of the column1
-depth1 depth for the first file,use for normalize
-column2 the second column for DEGseq
-mark2 the name of the column2
-depth2 depth for the second file,use for normalize

-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: Length_Distibution.pl (added in changeset 0:87fe81de0931)
#==========================================================================================
# Comment: draws two grouped bar charts into one SVG file: "Reads Length
#          Distribution" (upper panel) and "Tags Length Distribution"
#          (lower panel), one bar per sample for every row of the input.
# Input:   -i  text table.  Line 1 is a title line; the whitespace-separated
#              words after its first tab are used as sample names.  Each data
#              line is "label<TAB>values", values separated by whitespace.
#              Rows before the first line that starts with whitespace feed
#              the Tags panel; that line and the one after it are skipped,
#              and all remaining rows feed the Reads panel.
# Output:  -o  SVG file
#==========================================================================================
use strict;
use warnings;
use SVG;
use Getopt::Long;

my $version = 1.00;

my %opt;
GetOptions(\%opt, "i=s", "o=s", "h");
if (!(defined $opt{i} and defined $opt{o}) || defined $opt{h}) {
    usage();
}

#===============================Define Attribute==========================================
my %attribute = (
    text => { 'stroke' => "#000000", 'stroke-width' => 0.5 },
    line => { 'stroke' => "black",   'stroke-width' => 1 },
    font => { 'font-family' => "Arial" },
);

#===============================Layout====================================================
my $kd_number = 5;      # number of intervals on each Y axis
my $XOFFSET   = 50;
my $YOFFSET   = 60;
my $width     = 800;    # canvas size
my $heigth    = 800;
my $X_width   = 600;    # length of each X axis
my $axisL     = 300;    # length of each Y axis
my $x_margin  = 50;
my $title_gap = 30;     # vertical room reserved for a panel title

#===============================Data======================================================
open my $in_fh, '<', $opt{i} or die "cannot open the file $opt{i}: $!\n";
my $title = <$in_fh>;
die "Input file $opt{i} is empty\n" unless defined $title;
chomp $title;
my @title = split /\t/, $title;
my @mark = defined $title[1] ? split(/\s+/, $title[1]) : ();
my $sample_number = @mark;

my (@T_length, @R_length);    # rows of [label, value per sample ...]
my $in_reads = 0;
while (my $aline = <$in_fh>) {
    if (!$in_reads && $aline =~ /^\s/) {
        # Section separator: the next line is the second section's title.
        $in_reads = 1;
        my $skipped_title = <$in_fh>;
        next;
    }
    my $row = parse_row($aline) or next;
    if   ($in_reads) { push @R_length, $row; }
    else             { push @T_length, $row; }
}
close $in_fh;

my $T_max = max_value(\@T_length);
my $R_max = max_value(\@R_length);
print "Tag max: $T_max\nRead max: $R_max\n";

#===============================Drawing===================================================
my $svg = SVG->new(width => $width, height => $heigth);

my $x0       = $XOFFSET + $x_margin;
my $reads_y0 = $YOFFSET + $axisL + $title_gap;
draw_panel(
    $svg,
    x0      => $x0,
    y0      => $reads_y0,
    title_y => $YOFFSET + $title_gap,
    title   => "Reads Length Distribution",
    rows    => \@R_length,
    max     => $R_max,
    marks   => \@mark,
);

my $tags_y0 = $reads_y0 + $axisL + $title_gap + 50;
draw_panel(
    $svg,
    x0      => $x0,
    y0      => $tags_y0,
    title_y => $reads_y0 + 30 + $title_gap,
    title   => "Tags Length Distribution",
    rows    => \@T_length,
    max     => $T_max,
    marks   => \@mark,
);

open my $out_fh, '>', $opt{o} or die "cannot write the file $opt{o}: $!\n";
print {$out_fh} $svg->xmlify();
close $out_fh or die "Error while writing $opt{o}: $!\n";

# Split one data line into [label, values...]; returns nothing for lines
# that have no value column (for example a trailing blank line).
sub parse_row {
    my ($line) = @_;
    chomp $line;
    my ($label, $values) = split /\t/, $line;
    return unless defined $values;
    return [$label, split(/\s+/, $values)];
}

# Largest value found in the given rows (0 when there are none).
sub max_value {
    my ($rows) = @_;
    my $max = 0;
    for my $row (@{$rows}) {
        for my $value (@{$row}[1 .. $#{$row}]) {
            $max = $value if $max < $value;
        }
    }
    return $max;
}

# Choose the Y-axis tick step for a maximum value.
# Returns (step, label offset): the axis top is the leading digit of $max
# rounded UP times its power of ten (so no bar is taller than the axis),
# step = top / $ticks, and the label offset is 6 units per digit of $max.
sub axis_step {
    my ($max, $ticks) = @_;
    my ($scaled, $magnitude, $digits) = ($max, 1, 1);
    while ($scaled > 10) {
        $scaled    /= 10;
        $magnitude *= 10;
        $digits++;
    }
    my $top = int($scaled);
    $top++ if $top < $scaled;
    $top = 1 if $top < 1;    # all-zero data: keep the step non-zero
    return ($top / $ticks * $magnitude, 6 * $digits);
}

# Draw one bar-chart panel: Y axis with tick labels, X axis, one group of
# bars per row (one bar per sample), the panel title and the sample legend.
# Named arguments: x0/y0 (origin of the axes), title_y, title, rows
# (array ref of [label, values...]), max (largest value), marks (sample names).
sub draw_panel {
    my ($svg, %p) = @_;
    my ($x0, $y0, $title_y) = @p{qw(x0 y0 title_y)};
    my @rows    = @{ $p{rows} };
    my @marks   = @{ $p{marks} };
    my $samples = @marks;

    my ($step, $label_dx) = axis_step($p{max}, $kd_number);
    my $y_scale = $axisL / $step / $kd_number;
    my $x_scale = @rows ? $X_width / @rows / ($samples + 1) : 0;

    my @line_style = (
        'stroke',       $attribute{line}{'stroke'},
        'stroke-width', $attribute{line}{'stroke-width'},
    );
    my $label = sub {
        my ($x, $y, $size, $stroke_width, $text) = @_;
        $svg->text(
            'x', $x, 'y', $y,
            'style',        'fill:black;text-anchor:middle',
            'stroke',       $attribute{text}{'stroke'},
            'stroke-width', $stroke_width,
            'font-size',    $size,
            'font-family',  $attribute{font}{'font-family'},
            '-cdata',       $text,
        );
    };

    # Y axis with ticks below the top of the axis
    $svg->line('x1', $x0, 'y1', $y0, 'x2', $x0, 'y2', $y0 - $axisL, @line_style);
    for my $i (1 .. $kd_number - 1) {
        my $tick_y = $y0 - $i * $step * $y_scale;
        $svg->line('x1', $x0 - 5, 'y1', $tick_y, 'x2', $x0, 'y2', $tick_y, @line_style);
        $label->($x0 - $label_dx, $tick_y + 4, 6, $attribute{text}{'stroke-width'}, $i * $step);
    }

    # X axis
    $svg->line('x1', $x0, 'y1', $y0, 'x2', $x0 + $X_width, 'y2', $y0, @line_style);

    # Bars: each row takes ($samples + 1) slots, the first slot is a gap
    for my $i (0 .. $#rows) {
        for my $j (1 .. $samples) {
            my $value  = defined $rows[$i][$j] ? $rows[$i][$j] : 0;
            my $red    = $j / $samples * 255;
            my $height = $value * $y_scale;
            $svg->rect(
                'x', $x0 + ($j + $i * ($samples + 1)) * $x_scale,
                'y', $y0 - $height,
                'width', $x_scale, 'height', $height,
                'stroke', "black", 'stroke-width', "0.5",
                'fill', "rgb($red,125,0)",
            );
        }
        $label->(
            $x0 + (1 + $samples / 2 + $i * ($samples + 1)) * $x_scale,
            $y0 + 15, 6, $attribute{text}{'stroke-width'}, $rows[$i][0]
        );
    }

    # Panel title
    $label->($XOFFSET + 400, $title_y, 15, "1", $p{title});

    # Legend: one coloured box and name per sample
    for my $i (0 .. $samples - 1) {
        my $red = ($i + 1) / $samples * 255;
        $svg->rect(
            'x', $x0 + 550, 'y', $title_y + 20 * $i,
            'width', 15, 'height', 10,
            'stroke', "black", 'stroke-width', "0.5",
            'fill', "rgb($red,125,0)",
        );
        $label->($x0 + 550 + 30, $title_y + 20 * $i + 10, 10, $attribute{text}{'stroke-width'}, $marks[$i]);
    }
    return;
}

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0
options:
-i
-o svg output
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: SampleDEGseqMerge.pl (added in changeset 0:87fe81de0931)
#Filename: SampleDEGseqMerge.pl
#Author: Tian Dongmei
#Email: chentt@big.ac.cn
#Date: 2014-05-21
#Modified:
#Description: merged deg file and total information
#
# From every -i file (lines starting with a double quote are skipped) the
# 5th, 7th, 8th and last columns are collected per ID (column 1).  The
# collected columns of all files are then inserted after column (3 + n) of
# every line of the cluster file (-f), where n is the -n value.
use strict;
use warnings;
use Getopt::Long;

my $version = 1.00;

my %opts;
GetOptions(\%opts, "i:s@", "mark:s@", "f:s", "o=s", "n=s", "h");
# -i, -mark and -f are dereferenced/opened unconditionally below, so they
# are checked here together with -o instead of failing later.
if (!(defined $opts{o} and defined $opts{i} and defined $opts{mark} and defined $opts{f}) || defined $opts{h}) {    #necessary arguments
    usage();
}

my @filein  = @{ $opts{'i'} };
my @mark    = @{ $opts{'mark'} };
my $fileout = $opts{'o'};
my $number  = defined $opts{'n'} ? $opts{'n'} : 0;

# %hash: ID -> tab-joined DEG columns of all input files, in -i order
my %hash;
for my $file_idx (0 .. $#filein) {
    open my $in_fh, '<', $filein[$file_idx]
        or die "Cannot open input file '$filein[$file_idx]': $!\n";
    while (my $aline = <$in_fh>) {
        chomp $aline;
        next if $aline =~ /^\"/;
        next if $aline !~ /\S/;
        my @temp = split /\t/, $aline;
        my $deg_columns = $temp[4] . "\t" . $temp[6] . "\t" . $temp[7] . "\t" . $temp[-1];
        if ($file_idx == 0) {
            $hash{ $temp[0] } = $deg_columns;
            next;
        }
        # IDs that the first file does not contain are reported and dropped.
        # NOTE(review): an ID missing from a later file gets no placeholder
        # columns, so its merged row ends up shorter - confirm if intended.
        if (!(defined $hash{ $temp[0] })) {
            print "Not find $temp[0]in sample one!\n";
            next;
        }
        $hash{ $temp[0] } .= "\t" . $deg_columns;
    }
    close $in_fh;
}

# One group of four header names per comparison
my $deg_title = '';
foreach my $mark (@mark) {
    $deg_title .= "log2(Fold_change)\tp_value\tq_value\t" . $mark . "\t";
}
$deg_title =~ s/\s+$//;

open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
open my $f_fh, '<', $opts{f} or die "Cannot open cluster file '$opts{f}': $!\n";
while (my $aline = <$f_fh>) {
    chomp $aline;
    my @temp = split /\t/, $aline;
    if ($aline =~ /^\#/) {
        $temp[2 + $number] .= "\t" . $deg_title;
    }
    else {
        # Clusters without DEG data still get the separating tab.
        my $deg = (defined $temp[0] && defined $hash{ $temp[0] }) ? $hash{ $temp[0] } : '';
        $temp[2 + $number] .= "\t" . $deg;
    }
    # Short rows are padded with empty cells up to the insertion column.
    print {$out_fh} join("\t", map { defined $_ ? $_ : '' } @temp), "\n";
}
close $f_fh;
close $out_fh or die "Error while writing '$fileout': $!\n";

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -o -mark -f
options:
-i input file # -i output_score.txt -i output_score.txt -i output_score.txt
-mark sample name # -mark sam1_VS_sam2 -mark sam1_VS_sam3 -mark sam2_VS_sam3
-f cluster file
-n sample number
-o output file
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: collapseReads2Tags.pl (added in changeset 0:87fe81de0931)
#Filename: collapseReads2Tags.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-3-20
#Modified:
#Description: fastq file form reads cluster(the same sequence in the same cluster)
#
# Collapses identical read sequences from one or more fastq/fasta files
# into unique tags and writes them as fasta.  Every tag is named
#   <mark>_<running read total>:<count in file 1>_<count in file 2>..._x<sum>
use strict;
use warnings;
use Getopt::Long;

my $version = 1.00;

my %opts;
GetOptions(\%opts, "i:s@", "format=s", "mark:s", "qual:s", "qv:i", "o=s", "h");
if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) {    #necessary arguments
    usage();
}
my @filein  = defined $opts{i} ? @{ $opts{i} } : ();
my $name    = defined $opts{'mark'} ? $opts{'mark'} : "seq";
my $fileout = $opts{'o'};
my $pq      = defined $opts{'qv'} ? $opts{'qv'} : 33;    # quality ASCII offset

my $format = $opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    die "Parameter -format is error!\n";
}
my $is_fastq = ($format eq "fastq" || $format eq "fq");

# The quality filter is only applied to fastq input.
my ($qualT, $qualV);
if (defined $opts{'qual'} && $is_fastq) {
    ($qualT, $qualV) = split /:/, $opts{'qual'};
    # An unknown type used to score every read as 0 and so silently
    # discarded the whole input; reject it up front instead.
    if (   !defined $qualT
        || ($qualT ne "mean" && $qualT ne "min")
        || !defined $qualV
        || $qualV !~ /^-?\d+(?:\.\d+)?$/)
    {
        die "Parameter -qual must look like min:value or mean:value\n";
    }
}

# %hash: first 6 characters of a read -> per input file, the raw sequence
# lines (newline included) concatenated into one string.
my %hash;
for my $i (0 .. $#filein) {
    open my $in_fh, '<', $filein[$i] or die "Cannot open input file '$filein[$i]': $!\n";
    while (defined(my $header = <$in_fh>)) {
        my $seq = <$in_fh>;
        last unless defined $seq;    # incomplete record at end of file
        if ($is_fastq) {
            my $plus = <$in_fh>;
            my $qv   = <$in_fh>;
            last unless defined $qv;
            next if defined $qualT && !qvcheck($qv, $qualT, $qualV);
        }
        my $str = substr($seq, 0, 6);
        $hash{$str}[$i] .= $seq;
    }
    close $in_fh;
}

open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
my $count = 0;    # running total of reads written so far
# Keys are sorted so that tag order, and with it the running total inside
# each tag name, is the same on every run.
foreach my $key (sort keys %hash) {
    my %cluster;    # sequence -> read count per input file
    for my $i (0 .. $#filein) {
        next if !(defined $hash{$key}[$i]);
        foreach my $seq (split /\n/, $hash{$key}[$i]) {
            $cluster{$seq}[$i]++;
        }
    }

    foreach my $seq (sort keys %cluster) {
        my @counts = map { defined $cluster{$seq}[$_] ? $cluster{$seq}[$_] : 0 } 0 .. $#filein;
        my $ee = 0;
        $ee += $_ for @counts;
        $count += $ee;
        my $exp = join "_", @counts;
        print {$out_fh} ">$name", "_$count:$exp", "_x$ee\n$seq\n";
    }
}
close $out_fh or die "Error while writing '$fileout': $!\n";

# Return 1 when the quality string $str passes the filter, 0 otherwise.
# $t is "mean" or "min"; the matching score must be at least $v.
sub qvcheck {
    my ($str, $t, $v) = @_;
    my $qv = 0;
    if ($t eq "mean") {
        $qv = getMeanQuality($str);
    }
    elsif ($t eq "min") {
        $qv = getMinQuality($str);
    }
    return $qv < $v ? 0 : 1;
}

### This function gives back the mean Q-value of a quality string
### (0 for an empty string).
sub getMeanQuality {
    my ($quality) = @_;
    chomp $quality;
    my @bases = split(//, $quality);
    return 0 unless @bases;
    my $sum = 0;
    $sum += ord($_) - $pq for @bases;
    return $sum / @bases;
}

###
### This function gives back the Q-value of the worst base
### (1000 for an empty string).
sub getMinQuality {
    my ($quality) = @_;
    chomp $quality;
    my $worst = 1000;
    for my $base (split(//, $quality)) {
        my $num = ord($base) - $pq;
        $worst = $num if $num < $worst;
    }
    return $worst;
}

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -format -mark -qual -qv -o
options:
-i input file#fastq file ##can be multiple -i file1 -i file2 ...
-mark string#quary name,default is "seq"
-o output file
-format string # fastq|fasta|fq|fa

-qual #reads filter
 eg:(min:value/mean:value)
 This parameter just for solexa reads.
 If the input files are solid and needs filter,please do filter first .

-qv integer #Phred quality64/33,default 33
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# File: conventional.pl (added in changeset 0:87fe81de0931)
#Filename: conventional.pl
#Author: Chentt
#Email: chentt@big.ac.cn
#Date: 2014/04/09
#Modified:
#Description: islands merged of merged samples
#
# Input rows (-i) are "chr<TAB>id<TAB>start<TAB>end<TAB>expression per
# sample...".  Per chromosome the rows are sorted by start and merged into
# one cluster while the gap to the running cluster end is at most -d.
# For each cluster the most expressed tag length, its share of the cluster
# expression and the summed expression of each sample are written to -o;
# the per-tag details go to -t.
use strict;
use warnings;
use Getopt::Long;

my $version = 1.00;

my %opts;
GetOptions(\%opts, "i=s", "d=i", "o=s", "N=i", "t=s", "mark=s", "h");
if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o}) || defined $opts{h}) {    #necessary arguments
    usage();
}

my $filein   = $opts{'i'};
my $fileout  = $opts{'o'};
my $distance = $opts{'d'};
my $tempout  = $opts{'t'};
# -mark holds the sample names separated by "#"; they become quoted,
# tab-separated column titles.
my $mark = join "\"\t\"", split(/\#/, $opts{'mark'});

open my $in_fh,  '<', $filein  or die "Cannot open input file '$filein': $!\n";
open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
print {$out_fh} "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
open my $tmp_fh, '>', $tempout or die "Cannot write temp output file '$tempout': $!\n";
print {$tmp_fh} "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";

# %hash: chr -> list of [id, start, end, expression per sample...]
my %hash;
while (my $aline = <$in_fh>) {
    chomp $aline;
    next if $aline =~ /^\#/;
    next if $aline !~ /\S/;
    my @tmp = split /\t/, $aline;
    my $chr = shift @tmp;
    push @{ $hash{$chr} }, [@tmp];
}
close $in_fh;

# Chromosomes are processed in sorted order so the output is reproducible.
foreach my $key (sort keys %hash) {
    my @tag = sort { $a->[1] <=> $b->[1] } @{ $hash{$key} };

    my @members = ($tag[0]);
    my ($start, $end) = @{ $tag[0] }[1, 2];
    for my $i (1 .. $#tag) {
        if ($tag[$i][1] - $end <= $distance) {
            # Close enough: extend the running cluster.
            $end = $tag[$i][2] if $tag[$i][2] > $end;
            push @members, $tag[$i];
        }
        else {
            report_cluster($key, $start, $end, \@members);
            ($start, $end) = @{ $tag[$i] }[1, 2];
            @members = ($tag[$i]);
        }
    }
    # The last cluster of the chromosome.  This also covers a chromosome
    # with a single tag and a final tag that starts its own cluster.
    report_cluster($key, $start, $end, \@members);
}
close $out_fh or die "Error while writing '$fileout': $!\n";
close $tmp_fh or die "Error while writing '$tempout': $!\n";

# Write one cluster to both output files.  The temp file always gets the
# real major length; the main output replaces lengths above 30 by ">30".
sub report_cluster {
    my ($chr, $start, $end, $members) = @_;
    my ($max_length, $max_p, $tag_info, @cluster_exp) = Max_length($members);
    print {$tmp_fh} "$chr\:$start\-$end\t$max_length" . "nt\t$max_p\t$tag_info\n";
    my $length_label = $max_length > 30 ? "\>30" : $max_length;
    my $cluster_exp = join "\t", @cluster_exp;
    print {$out_fh} "$chr\:$start\-$end\t$length_label" . "nt\t$max_p\t$cluster_exp\n";
    return;
}

# Summarise the tags of one cluster.
# Argument: array ref of tags [id, start, end, expression per sample...].
# Returns (major length, its share formatted "%.2f",
#          "tag count<TAB>start,end,id,expression;...",
#          summed expression for each of the -N samples).
# The major length is the tag length with the largest share of the cluster
# expression; ties go to the length that sorts first as a string.
sub Max_length {
    my @exp = @{ $_[0] };
    my %sample_length;    # tag length -> summed expression
    my $total_exp = 0;
    my @each;             # summed expression per sample
    my @tag;
    for my $tag_row (@exp) {
        my $length = $tag_row->[2] - $tag_row->[1] + 1;
        my $exp    = 0;
        foreach my $sample (1 .. $opts{'N'}) {
            $exp += $tag_row->[$sample + 2];
            $each[$sample - 1] += $tag_row->[$sample + 2];
        }
        $sample_length{$length} += $exp;
        $total_exp += $exp;
        push @tag, join(",", $tag_row->[1], $tag_row->[2], $tag_row->[0], $exp);
    }

    my $max = 0;
    my $max_key;
    foreach my $key (sort keys %sample_length) {
        # A cluster without any expression gets share 0, not a division error.
        my $p = $total_exp ? $sample_length{$key} / $total_exp : 0;
        if (!defined $max_key || $p > $max) {
            $max     = $p;
            $max_key = $key;
        }
        $sample_length{$key} = sprintf("%.2f", $p);
    }
    my $tag_n = @tag;
    my $tag = $tag_n . "\t" . join(";", @tag);
    return ($max_key, $sample_length{$max_key}, $tag, @each);
}

# Print the help text and exit with status 1.
sub usage {
    print <<"USAGE";
Version $version
Usage:
$0 -i -o -d -N -t -mark
options:
-i input file
-d distance of two islands
-mark sample name;
-o output file
-N sample number
-t temp output file
-h help
USAGE
    exit(1);
}
#!/usr/bin/perl
# convert_bowtie_to_blast.pl (added by this changeset as b/convert_bowtie_to_blast.pl)

use warnings;
use strict;
use Getopt::Std;

######################################### USAGE ################################

my $usage=
"$0 file_bowtie_result file_solexa_seq file_chromosome

This is a converter which changes Bowtie output into Blast format.
The input includes three files: a Bowtie result file (default Bowtie
output file), a fasta file consisting of small Reads and a chromosome
fasta file. It outputs the alignments in blast_parsed format.

file_bowtie_result likes:

AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0

file_solexa_seq likes:

>AtFlower100010_x2
AAGGAGATTCTTTCAGTCCAG

file_chromosome contains chromosome seq in fasta format

";


####################################### INPUT FILES ############################

my $file_bowtie_result=shift or die $usage;
my $file_short_seq=shift or die $usage;
my $file_chromosome_seq=shift or die $usage;


##################################### GLOBAL VARIABLES #########################

my %short_seq_length=();     # read id       => read length
my %chromosome_length=();    # chromosome id => chromosome length


######################################### MAIN #################################

#get the short sequence id and its length
sequence_length($file_short_seq,\%short_seq_length);

#get the chromosome sequence id and its length
sequence_length($file_chromosome_seq,\%chromosome_length);

#convert bowtie result format to blast format;
change_format($file_bowtie_result);

exit;


##################################### SUBROUTINES ##############################

# Record the length of every sequence of a FASTA file.
#   $file  path of the FASTA file
#   $hash  hash ref that receives "first word of the header" => length
# Lines before the first ">" header are ignored; an empty file adds nothing.
# Dies when the file cannot be opened.
sub sequence_length{
    my ($file,$hash) = @_;

    # The message used to interpolate "$$file", which dereferences the plain
    # file name and fails under "strict refs" instead of reporting the name.
    open(my $fasta, '<', $file) or die "can not open $file: $!\n";

    my ($id, $length);
    while (my $line = <$fasta>) {
        chomp $line;
        if ($line =~ /^>(\S+)/) {
            $hash->{$id} = $length if defined $id;    # close the previous record
            $id     = $1;
            $length = 0;
        }
        elsif (defined $id) {
            $length += length $line;
        }
    }
    $hash->{$id} = $length if defined $id;            # last record of the file

    close $fasta;
    return;
}


# Change Bowtie format into blast_parsed format and print it to STDOUT.
#   $file  path of the Bowtie result file (tab-separated: read id, strand,
#          reference id, 0-based offset, ...)
# Reads the global %short_seq_length and %chromosome_length tables.
# Lines with fewer than four columns or an unknown strand are skipped with a
# warning instead of producing a partial, unterminated output row.
sub change_format{
    my $file=shift @_;
    open(my $bowtie, '<', $file)||die"can not open the bowtie result file:$!\n";

    while (my $line = <$bowtie>) {
        chomp $line;
        my @tmp = split /\t/, $line;
        if (@tmp < 4) {
            warn "skipping malformed bowtie line $.: $line\n";
            next;
        }
        my ($raw_id, $strand, $ref, $offset) = @tmp[0 .. 3];

        #Clean the reads ID
        my ($read_id) = split " ", $raw_id;
        my $read_len  = $short_seq_length{$read_id};
        my $ref_len   = $chromosome_length{$ref};

        my ($seq_bg, $seq_end, $orientation);
        if ($strand eq "+") {
            $seq_bg      = $offset + 1;              # bowtie offsets are 0-based
            $seq_end     = $offset + $read_len;
            $orientation = "Plus / Plus";
        }
        elsif ($strand eq "-") {
            # minus-strand hits are reported counted from the other end of
            # the reference
            $seq_end     = $ref_len - $offset;
            $seq_bg      = $seq_end - $read_len + 1;
            $orientation = "Plus / Minus";
        }
        else {
            warn "skipping bowtie line $. with unknown strand '$strand'\n";
            next;
        }

        print join("\t",
            $read_id, $read_len, "1..$read_len",
            $ref, $ref_len, "$seq_bg..$seq_end",
            "1e-04", "1.00", "42.1", $orientation), "\n";
    }

    close $bowtie;
    return;
}
#!/usr/bin/perl -w
#Filename: count_ref_length.pl (added by this changeset as b/count_ref_length.pl)
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2009-05-06
#Modified:
#Description: print "name<TAB>length" for every sequence of a FASTA file
#             (the former description, "delete matched reads", was left over
#             from another script)
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","o=s","h");
if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein=$opts{'i'};
my $fileout=$opts{'o'};

open my $in,  '<', $filein  or die "can not open input file $filein: $!\n";
open my $out, '>', $fileout or die "can not create output file $fileout: $!\n";

# One pass over the file: a header closes the previous record and opens a new
# one.  Lines before the first header are ignored; they used to produce a row
# with an undefined name.
my ($name, $length);
while (my $aline = <$in>) {
    chomp $aline;
    if ($aline =~ /^>(\S+)/) {
        print {$out} $name, "\t", $length, "\n" if defined $name;
        $name   = $1;
        $length = 0;
    }
    elsif (defined $name) {
        $length += length $aline;
    }
}
print {$out} $name, "\t", $length, "\n" if defined $name;    # last record

close $in;
close $out or die "error while writing $fileout: $!\n";

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file
-o output file
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: count_rfam_express.pl (added by this changeset as b/count_rfam_express.pl)
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: sum the expressed reads and count the expressed tags per RNA
#             class for every sample, from a tab-separated mapping result
#
# Column 0 of the input must end in ":<n1>_<n2>_..._x<total>", i.e. one read
# count per sample; column 2 starts with the Rfam family id ("RFxxxxx;...").
# The family id is translated into an RNA class with the table after
# __DATA__; families missing from the table are collected as "other".
my $version=1.00;

use strict;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","o=s","tag:s","h");
if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein=$opts{'i'};
my $fileout=$opts{'o'};

my $marks=defined $opts{'tag'} ? $opts{'tag'} : "";

# Without -tag the samples are named Smp1, Smp2, ... after the number of
# counts found in the first read id.  The first line is read directly instead
# of through `head -1 $filein`, which passed the unquoted path to a shell.
if(!(defined $opts{'tag'})){
    open my $peek, '<', $filein or die "can not open input file $filein: $!\n";
    my $line = <$peek>;
    close $peek;
    if (defined $line) {
        my ($read_id) = split /\t/, $line;
        if (defined $read_id and $read_id =~ /:([\d|_]+)_x(\d+)$/) {
            my @ss = split /_/, $1;
            $marks .= "Smp$_;" for 1 .. scalar @ss;
        }
    }
}

my @marks=split/\;/,$marks;

# Rfam family id => RNA class
my %rfam_key;
while(my $row = <DATA>){
    chomp $row;
    if($row =~ /^(\S+)\s+(\S+)$/){
        $rfam_key{$1}=$2;
    }
}

my %reads;    # class => [summed read count of each sample]
my %tags;     # class => [number of tags with a non-zero count in each sample]
open my $in, '<', $filein or die "can not open input file $filein: $!\n";
while (my $aline = <$in>) {
    chomp $aline;
    my @tmp = split /\t/, $aline;

    # Skip lines without a well-formed read id; using $1 after a failed match
    # would silently reuse the counts of an earlier line.
    next unless defined $tmp[0] and $tmp[0] =~ /:([\d|_]+)_x(\d+)$/;
    my @exp = split /_/, $1;

    my ($family) = defined $tmp[2] ? split(/\;/, $tmp[2]) : ();
    my $class = (defined $family and defined $rfam_key{$family})
              ? $rfam_key{$family}
              : "other";

    for my $i (0 .. $#exp) {
        $reads{$class}[$i] += $exp[$i];
        $tags{$class}[$i]++ if $exp[$i] != 0;
    }
}
close $in;

open my $out, '>', $fileout or die "can not create output file $fileout: $!\n";

my $header = "#RNAname\t" . join("\t", @marks) . "\n";
my @classes = sort keys %reads;    # same rows, same order in both tables

print {$out} "####################################\n# small RNA expressed reads number #\n####################################\n";
print {$out} $header;
foreach my $key (@classes) {
    print {$out} join("\t", $key, @{ $reads{$key} }), "\n";
}

print {$out} "\n\n####################################\n# small RNA expressed tags number #\n####################################\n";
print {$out} $header;
foreach my $key (@classes) {
    # a sample in which no tag of this class is expressed is reported as 0
    my @counts = map { defined $tags{$key}[$_] ? $tags{$key}[$_] : 0 }
                 0 .. $#{ $reads{$key} };
    print {$out} join("\t", $key, @counts), "\n";
}

close $out or die "error while writing $fileout: $!\n";

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -tag -o
options:
-i input file# rfam bowtie bwt. format mapping result
-tag [string] sample marks# eg. sampleA;sampleB;sampleC
-o output file

-h help
USAGE
exit(1);
}

__DATA__
RF00635 lncRNA
RF01868 lncRNA
RF01869 lncRNA
RF01870 lncRNA
RF01871 lncRNA
RF01872 lncRNA
RF01873 lncRNA
RF01874 lncRNA
RF01875 lncRNA
RF01876 lncRNA
RF01877 lncRNA
RF01878 lncRNA
RF01879 lncRNA
RF01880 lncRNA
RF01881 lncRNA
RF01882 lncRNA
RF01883 lncRNA
RF01884 lncRNA
RF01885 lncRNA
RF01886 lncRNA
RF01887 lncRNA
RF01888 lncRNA
RF01889 lncRNA
RF01890 lncRNA
RF01891 lncRNA
RF01892 lncRNA
RF01893 lncRNA
RF01894 lncRNA
RF01904 lncRNA
RF01905 lncRNA
RF01906 lncRNA
RF01907 lncRNA
RF01908 lncRNA
RF01909 lncRNA
RF01928 lncRNA
RF01929 lncRNA
RF01930 lncRNA
RF01931 lncRNA
RF01932 lncRNA
RF01933 lncRNA
RF01934 lncRNA
RF01935 lncRNA
RF01946 lncRNA
RF01947 lncRNA
RF01948 lncRNA
RF01950 lncRNA
RF01951 lncRNA
RF01952 lncRNA
RF01953 lncRNA
RF01954 lncRNA
RF01955 lncRNA
RF01956 lncRNA
RF01957 lncRNA
RF01958 lncRNA
RF01961 lncRNA
RF01962 lncRNA
RF01963 lncRNA
RF01964 lncRNA
RF01965 lncRNA
RF01966 lncRNA
RF01967 lncRNA
RF01968 lncRNA
RF01969 lncRNA
RF01970 lncRNA
RF01971 lncRNA
RF01972 lncRNA
RF01973 lncRNA
RF01974 lncRNA
RF01975 lncRNA
RF01976 lncRNA
RF01977 lncRNA
RF01978 lncRNA
RF01979 lncRNA
RF01980 lncRNA
RF01981 lncRNA
RF01983 lncRNA
RF01984 lncRNA
RF01985 lncRNA
RF01986 lncRNA
RF01987 lncRNA
RF01992 lncRNA
RF02038 lncRNA
RF02039 lncRNA
RF02040 lncRNA
RF02041 lncRNA
RF02042 lncRNA
RF02043 lncRNA
RF02044 lncRNA
RF02045 lncRNA
RF02046 lncRNA
RF02047 lncRNA
RF02085 lncRNA
RF02086 lncRNA
RF02087 lncRNA
RF02089 lncRNA
+RF02090 lncRNA +RF02091 lncRNA +RF02098 lncRNA +RF02101 lncRNA +RF02102 lncRNA +RF02103 lncRNA +RF02104 lncRNA +RF02105 lncRNA +RF02106 lncRNA +RF02107 lncRNA +RF02108 lncRNA +RF02109 lncRNA +RF02110 lncRNA +RF02112 lncRNA +RF02113 lncRNA +RF02114 lncRNA +RF02115 lncRNA +RF02116 lncRNA +RF02117 lncRNA +RF02118 lncRNA +RF02119 lncRNA +RF02120 lncRNA +RF02121 lncRNA +RF02122 lncRNA +RF02123 lncRNA +RF02124 lncRNA +RF02125 lncRNA +RF02126 lncRNA +RF02127 lncRNA +RF02128 lncRNA +RF02129 lncRNA +RF02130 lncRNA +RF02131 lncRNA +RF02132 lncRNA +RF02133 lncRNA +RF02134 lncRNA +RF02135 lncRNA +RF02136 lncRNA +RF02137 lncRNA +RF02138 lncRNA +RF02139 lncRNA +RF02140 lncRNA +RF02141 lncRNA +RF02142 lncRNA +RF02143 lncRNA +RF02145 lncRNA +RF02146 lncRNA +RF02147 lncRNA +RF02148 lncRNA +RF02149 lncRNA +RF02150 lncRNA +RF02152 lncRNA +RF02153 lncRNA +RF02154 lncRNA +RF02155 lncRNA +RF02156 lncRNA +RF02157 lncRNA +RF02158 lncRNA +RF02159 lncRNA +RF02160 lncRNA +RF02161 lncRNA +RF02164 lncRNA +RF02165 lncRNA +RF02166 lncRNA +RF02167 lncRNA +RF02168 lncRNA +RF02169 lncRNA +RF02170 lncRNA +RF02171 lncRNA +RF02172 lncRNA +RF02173 lncRNA +RF02174 lncRNA +RF02175 lncRNA +RF02176 lncRNA +RF02177 lncRNA +RF02178 lncRNA +RF02179 lncRNA +RF02180 lncRNA +RF02181 lncRNA +RF02182 lncRNA +RF02183 lncRNA +RF02184 lncRNA +RF02185 lncRNA +RF02186 lncRNA +RF02187 lncRNA +RF02188 lncRNA +RF02189 lncRNA +RF02190 lncRNA +RF02191 lncRNA +RF02192 lncRNA +RF02193 lncRNA +RF02195 lncRNA +RF02196 lncRNA +RF02197 lncRNA +RF02198 lncRNA +RF02199 lncRNA +RF02200 lncRNA +RF02201 lncRNA +RF02202 lncRNA +RF02203 lncRNA +RF02204 lncRNA +RF02205 lncRNA +RF02206 lncRNA +RF02207 lncRNA +RF02208 lncRNA +RF02209 lncRNA +RF02210 lncRNA +RF02211 lncRNA +RF02212 lncRNA +RF02213 lncRNA +RF02215 lncRNA +RF02216 lncRNA +RF02217 lncRNA +RF02218 lncRNA +RF02219 lncRNA +RF02220 lncRNA +RF02246 lncRNA +RF02247 lncRNA +RF02248 lncRNA +RF02249 lncRNA +RF02250 lncRNA +RF02251 lncRNA +RF02252 lncRNA +RF02255 lncRNA +RF02256 lncRNA 
+RF02257 lncRNA +RF02258 lncRNA +RF02259 lncRNA +RF02267 lncRNA +RF02272 lncRNA +RF00027 miRNA +RF00047 miRNA +RF00051 miRNA +RF00052 miRNA +RF00053 miRNA +RF00073 miRNA +RF00074 miRNA +RF00075 miRNA +RF00076 miRNA +RF00103 miRNA +RF00104 miRNA +RF00129 miRNA +RF00130 miRNA +RF00131 miRNA +RF00143 miRNA +RF00144 miRNA +RF00178 miRNA +RF00237 miRNA +RF00239 miRNA +RF00241 miRNA +RF00244 miRNA +RF00245 miRNA +RF00246 miRNA +RF00247 miRNA +RF00248 miRNA +RF00249 miRNA +RF00250 miRNA +RF00251 miRNA +RF00253 miRNA +RF00254 miRNA +RF00255 miRNA +RF00256 miRNA +RF00257 miRNA +RF00258 miRNA +RF00363 miRNA +RF00364 miRNA +RF00365 miRNA +RF00366 miRNA +RF00367 miRNA +RF00445 miRNA +RF00446 miRNA +RF00451 miRNA +RF00452 miRNA +RF00455 miRNA +RF00456 miRNA +RF00464 miRNA +RF00486 miRNA +RF00637 miRNA +RF00638 miRNA +RF00639 miRNA +RF00640 miRNA +RF00641 miRNA +RF00642 miRNA +RF00643 miRNA +RF00644 miRNA +RF00645 miRNA +RF00646 miRNA +RF00647 miRNA +RF00648 miRNA +RF00649 miRNA +RF00650 miRNA +RF00651 miRNA +RF00652 miRNA +RF00653 miRNA +RF00654 miRNA +RF00655 miRNA +RF00656 miRNA +RF00657 miRNA +RF00658 miRNA +RF00659 miRNA +RF00660 miRNA +RF00661 miRNA +RF00662 miRNA +RF00663 miRNA +RF00664 miRNA +RF00665 miRNA +RF00666 miRNA +RF00667 miRNA +RF00668 miRNA +RF00669 miRNA +RF00670 miRNA +RF00671 miRNA +RF00672 miRNA +RF00673 miRNA +RF00674 miRNA +RF00675 miRNA +RF00676 miRNA +RF00677 miRNA +RF00678 miRNA +RF00679 miRNA +RF00680 miRNA +RF00681 miRNA +RF00682 miRNA +RF00683 miRNA +RF00684 miRNA +RF00685 miRNA +RF00686 miRNA +RF00687 miRNA +RF00688 miRNA +RF00689 miRNA +RF00690 miRNA +RF00691 miRNA +RF00692 miRNA +RF00693 miRNA +RF00694 miRNA +RF00695 miRNA +RF00696 miRNA +RF00697 miRNA +RF00698 miRNA +RF00699 miRNA +RF00700 miRNA +RF00701 miRNA +RF00702 miRNA +RF00703 miRNA +RF00704 miRNA +RF00705 miRNA +RF00706 miRNA +RF00707 miRNA +RF00708 miRNA +RF00709 miRNA +RF00710 miRNA +RF00711 miRNA +RF00712 miRNA +RF00713 miRNA +RF00714 miRNA +RF00715 miRNA +RF00716 miRNA +RF00717 miRNA 
+RF00718 miRNA +RF00719 miRNA +RF00720 miRNA +RF00721 miRNA +RF00722 miRNA +RF00723 miRNA +RF00724 miRNA +RF00725 miRNA +RF00726 miRNA +RF00727 miRNA +RF00728 miRNA +RF00729 miRNA +RF00730 miRNA +RF00731 miRNA +RF00732 miRNA +RF00733 miRNA +RF00734 miRNA +RF00735 miRNA +RF00736 miRNA +RF00737 miRNA +RF00739 miRNA +RF00740 miRNA +RF00741 miRNA +RF00742 miRNA +RF00743 miRNA +RF00744 miRNA +RF00745 miRNA +RF00746 miRNA +RF00747 miRNA +RF00748 miRNA +RF00749 miRNA +RF00750 miRNA +RF00751 miRNA +RF00752 miRNA +RF00753 miRNA +RF00754 miRNA +RF00755 miRNA +RF00756 miRNA +RF00757 miRNA +RF00758 miRNA +RF00760 miRNA +RF00761 miRNA +RF00762 miRNA +RF00763 miRNA +RF00764 miRNA +RF00765 miRNA +RF00766 miRNA +RF00767 miRNA +RF00768 miRNA +RF00769 miRNA +RF00770 miRNA +RF00771 miRNA +RF00772 miRNA +RF00773 miRNA +RF00774 miRNA +RF00775 miRNA +RF00776 miRNA +RF00777 miRNA +RF00778 miRNA +RF00779 miRNA +RF00780 miRNA +RF00781 miRNA +RF00782 miRNA +RF00783 miRNA +RF00784 miRNA +RF00785 miRNA +RF00786 miRNA +RF00787 miRNA +RF00788 miRNA +RF00789 miRNA +RF00790 miRNA +RF00791 miRNA +RF00792 miRNA +RF00793 miRNA +RF00794 miRNA +RF00795 miRNA +RF00796 miRNA +RF00797 miRNA +RF00798 miRNA +RF00799 miRNA +RF00800 miRNA +RF00801 miRNA +RF00802 miRNA +RF00803 miRNA +RF00804 miRNA +RF00805 miRNA +RF00806 miRNA +RF00807 miRNA +RF00808 miRNA +RF00809 miRNA +RF00810 miRNA +RF00811 miRNA +RF00812 miRNA +RF00813 miRNA +RF00814 miRNA +RF00815 miRNA +RF00816 miRNA +RF00817 miRNA +RF00818 miRNA +RF00819 miRNA +RF00820 miRNA +RF00821 miRNA +RF00822 miRNA +RF00823 miRNA +RF00824 miRNA +RF00825 miRNA +RF00826 miRNA +RF00827 miRNA +RF00828 miRNA +RF00829 miRNA +RF00830 miRNA +RF00831 miRNA +RF00832 miRNA +RF00833 miRNA +RF00834 miRNA +RF00835 miRNA +RF00836 miRNA +RF00837 miRNA +RF00838 miRNA +RF00839 miRNA +RF00840 miRNA +RF00841 miRNA +RF00842 miRNA +RF00843 miRNA +RF00844 miRNA +RF00845 miRNA +RF00846 miRNA +RF00847 miRNA +RF00848 miRNA +RF00849 miRNA +RF00850 miRNA +RF00851 miRNA +RF00852 miRNA 
+RF00853 miRNA +RF00854 miRNA +RF00855 miRNA +RF00856 miRNA +RF00857 miRNA +RF00858 miRNA +RF00859 miRNA +RF00861 miRNA +RF00862 miRNA +RF00863 miRNA +RF00864 miRNA +RF00865 miRNA +RF00866 miRNA +RF00867 miRNA +RF00868 miRNA +RF00869 miRNA +RF00870 miRNA +RF00871 miRNA +RF00872 miRNA +RF00873 miRNA +RF00874 miRNA +RF00875 miRNA +RF00876 miRNA +RF00877 miRNA +RF00878 miRNA +RF00879 miRNA +RF00882 miRNA +RF00883 miRNA +RF00884 miRNA +RF00885 miRNA +RF00886 miRNA +RF00887 miRNA +RF00888 miRNA +RF00890 miRNA +RF00891 miRNA +RF00892 miRNA +RF00893 miRNA +RF00894 miRNA +RF00895 miRNA +RF00896 miRNA +RF00897 miRNA +RF00898 miRNA +RF00899 miRNA +RF00900 miRNA +RF00901 miRNA +RF00902 miRNA +RF00903 miRNA +RF00904 miRNA +RF00905 miRNA +RF00906 miRNA +RF00907 miRNA +RF00908 miRNA +RF00909 miRNA +RF00910 miRNA +RF00911 miRNA +RF00912 miRNA +RF00914 miRNA +RF00915 miRNA +RF00917 miRNA +RF00918 miRNA +RF00919 miRNA +RF00920 miRNA +RF00921 miRNA +RF00922 miRNA +RF00925 miRNA +RF00926 miRNA +RF00927 miRNA +RF00928 miRNA +RF00929 miRNA +RF00931 miRNA +RF00932 miRNA +RF00933 miRNA +RF00934 miRNA +RF00935 miRNA +RF00936 miRNA +RF00937 miRNA +RF00939 miRNA +RF00940 miRNA +RF00941 miRNA +RF00942 miRNA +RF00943 miRNA +RF00945 miRNA +RF00946 miRNA +RF00947 miRNA +RF00948 miRNA +RF00949 miRNA +RF00950 miRNA +RF00951 miRNA +RF00952 miRNA +RF00953 miRNA +RF00954 miRNA +RF00955 miRNA +RF00956 miRNA +RF00957 miRNA +RF00958 miRNA +RF00959 miRNA +RF00960 miRNA +RF00961 miRNA +RF00962 miRNA +RF00963 miRNA +RF00964 miRNA +RF00965 miRNA +RF00966 miRNA +RF00967 miRNA +RF00968 miRNA +RF00969 miRNA +RF00970 miRNA +RF00971 miRNA +RF00972 miRNA +RF00973 miRNA +RF00974 miRNA +RF00975 miRNA +RF00976 miRNA +RF00977 miRNA +RF00978 miRNA +RF00979 miRNA +RF00980 miRNA +RF00981 miRNA +RF00983 miRNA +RF00984 miRNA +RF00985 miRNA +RF00986 miRNA +RF00987 miRNA +RF00988 miRNA +RF00989 miRNA +RF00990 miRNA +RF00991 miRNA +RF00992 miRNA +RF00993 miRNA +RF00994 miRNA +RF00995 miRNA +RF00996 miRNA +RF00997 miRNA 
+RF00998 miRNA +RF00999 miRNA +RF01000 miRNA +RF01001 miRNA +RF01002 miRNA +RF01003 miRNA +RF01004 miRNA +RF01005 miRNA +RF01006 miRNA +RF01007 miRNA +RF01008 miRNA +RF01009 miRNA +RF01010 miRNA +RF01011 miRNA +RF01012 miRNA +RF01013 miRNA +RF01014 miRNA +RF01015 miRNA +RF01016 miRNA +RF01017 miRNA +RF01018 miRNA +RF01019 miRNA +RF01020 miRNA +RF01021 miRNA +RF01022 miRNA +RF01023 miRNA +RF01024 miRNA +RF01025 miRNA +RF01026 miRNA +RF01027 miRNA +RF01028 miRNA +RF01029 miRNA +RF01030 miRNA +RF01031 miRNA +RF01032 miRNA +RF01033 miRNA +RF01034 miRNA +RF01035 miRNA +RF01036 miRNA +RF01037 miRNA +RF01038 miRNA +RF01039 miRNA +RF01040 miRNA +RF01041 miRNA +RF01042 miRNA +RF01043 miRNA +RF01044 miRNA +RF01045 miRNA +RF01059 miRNA +RF01061 miRNA +RF01063 miRNA +RF01064 miRNA +RF01117 miRNA +RF01314 miRNA +RF01413 miRNA +RF01895 miRNA +RF01896 miRNA +RF01897 miRNA +RF01898 miRNA +RF01899 miRNA +RF01900 miRNA +RF01901 miRNA +RF01902 miRNA +RF01903 miRNA +RF01910 miRNA +RF01911 miRNA +RF01912 miRNA +RF01913 miRNA +RF01914 miRNA +RF01915 miRNA +RF01916 miRNA +RF01917 miRNA +RF01918 miRNA +RF01919 miRNA +RF01920 miRNA +RF01921 miRNA +RF01922 miRNA +RF01923 miRNA +RF01924 miRNA +RF01925 miRNA +RF01926 miRNA +RF01927 miRNA +RF01936 miRNA +RF01937 miRNA +RF01938 miRNA +RF01939 miRNA +RF01940 miRNA +RF01941 miRNA +RF01942 miRNA +RF01943 miRNA +RF01944 miRNA +RF01945 miRNA +RF01996 miRNA +RF01997 miRNA +RF02000 miRNA +RF02002 miRNA +RF02006 miRNA +RF02007 miRNA +RF02008 miRNA +RF02009 miRNA +RF02010 miRNA +RF02011 miRNA +RF02013 miRNA +RF02014 miRNA +RF02015 miRNA +RF02016 miRNA +RF02017 miRNA +RF02018 miRNA +RF02019 miRNA +RF02020 miRNA +RF02021 miRNA +RF02022 miRNA +RF02023 miRNA +RF02024 miRNA +RF02025 miRNA +RF02026 miRNA +RF02027 miRNA +RF02028 miRNA +RF02061 miRNA +RF02092 miRNA +RF02093 miRNA +RF02094 miRNA +RF02095 miRNA +RF02096 miRNA +RF02097 miRNA +RF02214 miRNA +RF02244 miRNA +RF02245 miRNA +RF02254 miRNA +RF00001 rRNA +RF00002 rRNA +RF01118 rRNA +RF01960 rRNA +RF00177 
rRNA +RF01959 rRNA +RF00003 snRNA +RF00004 snRNA +RF00007 snRNA +RF00012 snRNA +RF00015 snRNA +RF00016 snRNA +RF00020 snRNA +RF00026 snRNA +RF00045 snRNA +RF00046 snRNA +RF00049 snRNA +RF00054 snRNA +RF00055 snRNA +RF00056 snRNA +RF00065 snRNA +RF00066 snRNA +RF00067 snRNA +RF00068 snRNA +RF00069 snRNA +RF00070 snRNA +RF00071 snRNA +RF00072 snRNA +RF00085 snRNA +RF00086 snRNA +RF00087 snRNA +RF00088 snRNA +RF00089 snRNA +RF00090 snRNA +RF00091 snRNA +RF00092 snRNA +RF00093 snRNA +RF00095 snRNA +RF00096 snRNA +RF00097 snRNA +RF00099 snRNA +RF00105 snRNA +RF00108 snRNA +RF00132 snRNA +RF00133 snRNA +RF00134 snRNA +RF00135 snRNA +RF00136 snRNA +RF00137 snRNA +RF00138 snRNA +RF00139 snRNA +RF00142 snRNA +RF00145 snRNA +RF00147 snRNA +RF00149 snRNA +RF00150 snRNA +RF00151 snRNA +RF00152 snRNA +RF00153 snRNA +RF00154 snRNA +RF00155 snRNA +RF00156 snRNA +RF00157 snRNA +RF00158 snRNA +RF00159 snRNA +RF00160 snRNA +RF00181 snRNA +RF00186 snRNA +RF00187 snRNA +RF00188 snRNA +RF00189 snRNA +RF00190 snRNA +RF00191 snRNA +RF00200 snRNA +RF00201 snRNA +RF00202 snRNA +RF00203 snRNA +RF00204 snRNA +RF00205 snRNA +RF00206 snRNA +RF00208 snRNA +RF00211 snRNA +RF00212 snRNA +RF00213 snRNA +RF00217 snRNA +RF00218 snRNA +RF00221 snRNA +RF00231 snRNA +RF00263 snRNA +RF00264 snRNA +RF00265 snRNA +RF00266 snRNA +RF00267 snRNA +RF00268 snRNA +RF00270 snRNA +RF00271 snRNA +RF00272 snRNA +RF00273 snRNA +RF00274 snRNA +RF00275 snRNA +RF00276 snRNA +RF00277 snRNA +RF00278 snRNA +RF00279 snRNA +RF00280 snRNA +RF00281 snRNA +RF00282 snRNA +RF00283 snRNA +RF00284 snRNA +RF00285 snRNA +RF00286 snRNA +RF00287 snRNA +RF00288 snRNA +RF00289 snRNA +RF00291 snRNA +RF00292 snRNA +RF00293 snRNA +RF00294 snRNA +RF00295 snRNA +RF00296 snRNA +RF00300 snRNA +RF00301 snRNA +RF00302 snRNA +RF00303 snRNA +RF00304 snRNA +RF00305 snRNA +RF00306 snRNA +RF00307 snRNA +RF00309 snRNA +RF00310 snRNA +RF00311 snRNA +RF00312 snRNA +RF00313 snRNA +RF00314 snRNA +RF00315 snRNA +RF00316 snRNA +RF00317 snRNA +RF00318 snRNA 
+RF00319 snRNA +RF00320 snRNA +RF00321 snRNA +RF00322 snRNA +RF00323 snRNA +RF00324 snRNA +RF00325 snRNA +RF00326 snRNA +RF00327 snRNA +RF00328 snRNA +RF00329 snRNA +RF00330 snRNA +RF00331 snRNA +RF00332 snRNA +RF00333 snRNA +RF00334 snRNA +RF00335 snRNA +RF00336 snRNA +RF00337 snRNA +RF00338 snRNA +RF00339 snRNA +RF00340 snRNA +RF00341 snRNA +RF00342 snRNA +RF00343 snRNA +RF00344 snRNA +RF00345 snRNA +RF00348 snRNA +RF00349 snRNA +RF00350 snRNA +RF00351 snRNA +RF00352 snRNA +RF00353 snRNA +RF00355 snRNA +RF00356 snRNA +RF00357 snRNA +RF00358 snRNA +RF00359 snRNA +RF00360 snRNA +RF00361 snRNA +RF00377 snRNA +RF00392 snRNA +RF00393 snRNA +RF00394 snRNA +RF00396 snRNA +RF00397 snRNA +RF00398 snRNA +RF00399 snRNA +RF00400 snRNA +RF00401 snRNA +RF00402 snRNA +RF00403 snRNA +RF00404 snRNA +RF00405 snRNA +RF00406 snRNA +RF00407 snRNA +RF00408 snRNA +RF00409 snRNA +RF00410 snRNA +RF00411 snRNA +RF00412 snRNA +RF00413 snRNA +RF00414 snRNA +RF00415 snRNA +RF00416 snRNA +RF00417 snRNA +RF00418 snRNA +RF00419 snRNA +RF00420 snRNA +RF00421 snRNA +RF00422 snRNA +RF00423 snRNA +RF00424 snRNA +RF00425 snRNA +RF00426 snRNA +RF00427 snRNA +RF00428 snRNA +RF00429 snRNA +RF00430 snRNA +RF00431 snRNA +RF00432 snRNA +RF00438 snRNA +RF00439 snRNA +RF00440 snRNA +RF00441 snRNA +RF00443 snRNA +RF00471 snRNA +RF00472 snRNA +RF00473 snRNA +RF00474 snRNA +RF00475 snRNA +RF00476 snRNA +RF00477 snRNA +RF00478 snRNA +RF00479 snRNA +RF00482 snRNA +RF00488 snRNA +RF00492 snRNA +RF00493 snRNA +RF00494 snRNA +RF00509 snRNA +RF00526 snRNA +RF00527 snRNA +RF00528 snRNA +RF00529 snRNA +RF00530 snRNA +RF00531 snRNA +RF00532 snRNA +RF00533 snRNA +RF00535 snRNA +RF00536 snRNA +RF00537 snRNA +RF00538 snRNA +RF00539 snRNA +RF00540 snRNA +RF00541 snRNA +RF00542 snRNA +RF00543 snRNA +RF00544 snRNA +RF00545 snRNA +RF00546 snRNA +RF00548 snRNA +RF00553 snRNA +RF00554 snRNA +RF00560 snRNA +RF00561 snRNA +RF00562 snRNA +RF00563 snRNA +RF00564 snRNA +RF00565 snRNA +RF00566 snRNA +RF00567 snRNA +RF00568 snRNA 
+RF00569 snRNA +RF00570 snRNA +RF00571 snRNA +RF00572 snRNA +RF00573 snRNA +RF00574 snRNA +RF00575 snRNA +RF00576 snRNA +RF00577 snRNA +RF00578 snRNA +RF00579 snRNA +RF00580 snRNA +RF00581 snRNA +RF00582 snRNA +RF00584 snRNA +RF00586 snRNA +RF00588 snRNA +RF00591 snRNA +RF00592 snRNA +RF00593 snRNA +RF00594 snRNA +RF00598 snRNA +RF00599 snRNA +RF00600 snRNA +RF00601 snRNA +RF00602 snRNA +RF00603 snRNA +RF00604 snRNA +RF00606 snRNA +RF00607 snRNA +RF00608 snRNA +RF00609 snRNA +RF00610 snRNA +RF00611 snRNA +RF00612 snRNA +RF00613 snRNA +RF00614 snRNA +RF00618 snRNA +RF00619 snRNA +RF01119 snRNA +RF01120 snRNA +RF01121 snRNA +RF01122 snRNA +RF01123 snRNA +RF01124 snRNA +RF01125 snRNA +RF01126 snRNA +RF01127 snRNA +RF01128 snRNA +RF01129 snRNA +RF01130 snRNA +RF01131 snRNA +RF01132 snRNA +RF01133 snRNA +RF01134 snRNA +RF01135 snRNA +RF01136 snRNA +RF01137 snRNA +RF01138 snRNA +RF01139 snRNA +RF01140 snRNA +RF01141 snRNA +RF01142 snRNA +RF01143 snRNA +RF01144 snRNA +RF01145 snRNA +RF01146 snRNA +RF01147 snRNA +RF01148 snRNA +RF01149 snRNA +RF01150 snRNA +RF01151 snRNA +RF01152 snRNA +RF01153 snRNA +RF01155 snRNA +RF01156 snRNA +RF01157 snRNA +RF01158 snRNA +RF01159 snRNA +RF01160 snRNA +RF01161 snRNA +RF01162 snRNA +RF01163 snRNA +RF01164 snRNA +RF01165 snRNA +RF01166 snRNA +RF01167 snRNA +RF01168 snRNA +RF01169 snRNA +RF01170 snRNA +RF01171 snRNA +RF01172 snRNA +RF01173 snRNA +RF01174 snRNA +RF01175 snRNA +RF01176 snRNA +RF01177 snRNA +RF01178 snRNA +RF01179 snRNA +RF01180 snRNA +RF01181 snRNA +RF01182 snRNA +RF01183 snRNA +RF01184 snRNA +RF01185 snRNA +RF01186 snRNA +RF01188 snRNA +RF01189 snRNA +RF01190 snRNA +RF01191 snRNA +RF01192 snRNA +RF01193 snRNA +RF01194 snRNA +RF01195 snRNA +RF01196 snRNA +RF01197 snRNA +RF01198 snRNA +RF01199 snRNA +RF01200 snRNA +RF01201 snRNA +RF01202 snRNA +RF01203 snRNA +RF01204 snRNA +RF01205 snRNA +RF01206 snRNA +RF01207 snRNA +RF01208 snRNA +RF01209 snRNA +RF01210 snRNA +RF01211 snRNA +RF01212 snRNA +RF01213 snRNA +RF01214 snRNA 
+RF01215 snRNA +RF01216 snRNA +RF01218 snRNA +RF01219 snRNA +RF01220 snRNA +RF01221 snRNA +RF01222 snRNA +RF01223 snRNA +RF01224 snRNA +RF01225 snRNA +RF01226 snRNA +RF01227 snRNA +RF01228 snRNA +RF01229 snRNA +RF01230 snRNA +RF01231 snRNA +RF01232 snRNA +RF01233 snRNA +RF01234 snRNA +RF01235 snRNA +RF01236 snRNA +RF01237 snRNA +RF01238 snRNA +RF01239 snRNA +RF01240 snRNA +RF01241 snRNA +RF01242 snRNA +RF01243 snRNA +RF01244 snRNA +RF01245 snRNA +RF01246 snRNA +RF01247 snRNA +RF01248 snRNA +RF01249 snRNA +RF01250 snRNA +RF01251 snRNA +RF01252 snRNA +RF01253 snRNA +RF01254 snRNA +RF01255 snRNA +RF01256 snRNA +RF01257 snRNA +RF01258 snRNA +RF01259 snRNA +RF01260 snRNA +RF01261 snRNA +RF01262 snRNA +RF01263 snRNA +RF01264 snRNA +RF01265 snRNA +RF01266 snRNA +RF01267 snRNA +RF01268 snRNA +RF01269 snRNA +RF01270 snRNA +RF01271 snRNA +RF01272 snRNA +RF01273 snRNA +RF01274 snRNA +RF01275 snRNA +RF01276 snRNA +RF01277 snRNA +RF01278 snRNA +RF01279 snRNA +RF01280 snRNA +RF01281 snRNA +RF01283 snRNA +RF01284 snRNA +RF01285 snRNA +RF01286 snRNA +RF01287 snRNA +RF01288 snRNA +RF01289 snRNA +RF01290 snRNA +RF01291 snRNA +RF01292 snRNA +RF01293 snRNA +RF01294 snRNA +RF01295 snRNA +RF01296 snRNA +RF01297 snRNA +RF01298 snRNA +RF01299 snRNA +RF01300 snRNA +RF01301 snRNA +RF01302 snRNA +RF01303 snRNA +RF01304 snRNA +RF01305 snRNA +RF01306 snRNA +RF01307 snRNA +RF01308 snRNA +RF01309 snRNA +RF01310 snRNA +RF01311 snRNA +RF01312 snRNA +RF01420 snRNA +RF01421 snRNA +RF01422 snRNA +RF01423 snRNA +RF01424 snRNA +RF01425 snRNA +RF01426 snRNA +RF01427 snRNA +RF01428 snRNA +RF01429 snRNA +RF01430 snRNA +RF01431 snRNA +RF01432 snRNA +RF01433 snRNA +RF01434 snRNA +RF01435 snRNA +RF01436 snRNA +RF01437 snRNA +RF01438 snRNA +RF01439 snRNA +RF01440 snRNA +RF01441 snRNA +RF01442 snRNA +RF01443 snRNA +RF01444 snRNA +RF01445 snRNA +RF01446 snRNA +RF01447 snRNA +RF01448 snRNA +RF01449 snRNA +RF01450 snRNA +RF01451 snRNA +RF01452 snRNA +RF01498 snRNA +RF01499 snRNA +RF01500 snRNA +RF01501 snRNA 
+RF01505 snRNA +RF01506 snRNA +RF01507 snRNA +RF01509 snRNA +RF01511 snRNA +RF01513 snRNA +RF01514 snRNA +RF01515 snRNA +RF01516 snRNA +RF01522 snRNA +RF01523 snRNA +RF01524 snRNA +RF01525 snRNA +RF01526 snRNA +RF01531 snRNA +RF01532 snRNA +RF01533 snRNA +RF01534 snRNA +RF01535 snRNA +RF01536 snRNA +RF01537 snRNA +RF01538 snRNA +RF01539 snRNA +RF01540 snRNA +RF01541 snRNA +RF01542 snRNA +RF01543 snRNA +RF01544 snRNA +RF01545 snRNA +RF01546 snRNA +RF01547 snRNA +RF01548 snRNA +RF01549 snRNA +RF01550 snRNA +RF01551 snRNA +RF01552 snRNA +RF01553 snRNA +RF01554 snRNA +RF01555 snRNA +RF01556 snRNA +RF01557 snRNA +RF01558 snRNA +RF01559 snRNA +RF01560 snRNA +RF01561 snRNA +RF01562 snRNA +RF01563 snRNA +RF01564 snRNA +RF01565 snRNA +RF01566 snRNA +RF01567 snRNA +RF01568 snRNA +RF01569 snRNA +RF01570 snRNA +RF01572 snRNA +RF01573 snRNA +RF01574 snRNA +RF01575 snRNA +RF01576 snRNA +RF01583 snRNA +RF01584 snRNA +RF01585 snRNA +RF01586 snRNA +RF01587 snRNA +RF01588 snRNA +RF01589 snRNA +RF01590 snRNA +RF01591 snRNA +RF01592 snRNA +RF01593 snRNA +RF01594 snRNA +RF01595 snRNA +RF01596 snRNA +RF01597 snRNA +RF01598 snRNA +RF01599 snRNA +RF01600 snRNA +RF01601 snRNA +RF01602 snRNA +RF01603 snRNA +RF01604 snRNA +RF01605 snRNA +RF01606 snRNA +RF01607 snRNA +RF01608 snRNA +RF01609 snRNA +RF01610 snRNA +RF01611 snRNA +RF01612 snRNA +RF01613 snRNA +RF01614 snRNA +RF01615 snRNA +RF01617 snRNA +RF01618 snRNA +RF01620 snRNA +RF01621 snRNA +RF01622 snRNA +RF01624 snRNA +RF01625 snRNA +RF01626 snRNA +RF01627 snRNA +RF01628 snRNA +RF01629 snRNA +RF01630 snRNA +RF01631 snRNA +RF01632 snRNA +RF01633 snRNA +RF01634 snRNA +RF01635 snRNA +RF01636 snRNA +RF01637 snRNA +RF01638 snRNA +RF01639 snRNA +RF01640 snRNA +RF01641 snRNA +RF01642 snRNA +RF01644 snRNA +RF01645 snRNA +RF01646 snRNA +RF01647 snRNA +RF01648 snRNA +RF01649 snRNA +RF01650 snRNA +RF01651 snRNA +RF01652 snRNA +RF01653 snRNA +RF01654 snRNA +RF01655 snRNA +RF01658 snRNA +RF01659 snRNA +RF01660 snRNA +RF01661 snRNA +RF01662 snRNA 
+RF01664 snRNA +RF01802 snRNA +RF01829 snRNA +RF01844 snRNA +RF01846 snRNA +RF01847 snRNA +RF01848 snRNA +RF01860 snRNA +RF01861 snRNA +RF01862 snRNA +RF01863 snRNA +RF01864 snRNA +RF01866 snRNA +RF02163 snRNA +RF00014 sRNA +RF00018 sRNA +RF00021 sRNA +RF00034 sRNA +RF00035 sRNA +RF00057 sRNA +RF00077 sRNA +RF00078 sRNA +RF00079 sRNA +RF00081 sRNA +RF00082 sRNA +RF00083 sRNA +RF00084 sRNA +RF00101 sRNA +RF00110 sRNA +RF00111 sRNA +RF00112 sRNA +RF00113 sRNA +RF00115 sRNA +RF00116 sRNA +RF00117 sRNA +RF00118 sRNA +RF00119 sRNA +RF00121 sRNA +RF00122 sRNA +RF00124 sRNA +RF00125 sRNA +RF00126 sRNA +RF00128 sRNA +RF00166 sRNA +RF00195 sRNA +RF00368 sRNA +RF00369 sRNA +RF00370 sRNA +RF00371 sRNA +RF00372 sRNA +RF00378 sRNA +RF00444 sRNA +RF00505 sRNA +RF00519 sRNA +RF00615 sRNA +RF00616 sRNA +RF01116 sRNA +RF01385 sRNA +RF01386 sRNA +RF01387 sRNA +RF01388 sRNA +RF01389 sRNA +RF01390 sRNA +RF01391 sRNA +RF01392 sRNA +RF01393 sRNA +RF01394 sRNA +RF01395 sRNA +RF01396 sRNA +RF01397 sRNA +RF01398 sRNA +RF01399 sRNA +RF01400 sRNA +RF01401 sRNA +RF01402 sRNA +RF01403 sRNA +RF01404 sRNA +RF01405 sRNA +RF01406 sRNA +RF01407 sRNA +RF01408 sRNA +RF01409 sRNA +RF01410 sRNA +RF01411 sRNA +RF01412 sRNA +RF01457 sRNA +RF01459 sRNA +RF01460 sRNA +RF01461 sRNA +RF01462 sRNA +RF01463 sRNA +RF01464 sRNA +RF01465 sRNA +RF01466 sRNA +RF01467 sRNA +RF01468 sRNA +RF01469 sRNA +RF01470 sRNA +RF01471 sRNA +RF01472 sRNA +RF01473 sRNA +RF01474 sRNA +RF01476 sRNA +RF01477 sRNA +RF01478 sRNA +RF01479 sRNA +RF01487 sRNA +RF01488 sRNA +RF01489 sRNA +RF01492 sRNA +RF01493 sRNA +RF01494 sRNA +RF01496 sRNA +RF01503 sRNA +RF01504 sRNA +RF01512 sRNA +RF01519 sRNA +RF01520 sRNA +RF01521 sRNA +RF01527 sRNA +RF01528 sRNA +RF01529 sRNA +RF01530 sRNA +RF01571 sRNA +RF01578 sRNA +RF01579 sRNA +RF01580 sRNA +RF01581 sRNA +RF01582 sRNA +RF01619 sRNA +RF01623 sRNA +RF01643 sRNA +RF01656 sRNA +RF01663 sRNA +RF01665 sRNA +RF01668 sRNA +RF01669 sRNA +RF01670 sRNA +RF01671 sRNA +RF01672 sRNA +RF01673 sRNA +RF01674 
sRNA +RF01675 sRNA +RF01676 sRNA +RF01677 sRNA +RF01678 sRNA +RF01679 sRNA +RF01680 sRNA +RF01681 sRNA +RF01682 sRNA +RF01683 sRNA +RF01684 sRNA +RF01685 sRNA +RF01686 sRNA +RF01687 sRNA +RF01690 sRNA +RF01691 sRNA +RF01693 sRNA +RF01694 sRNA +RF01696 sRNA +RF01698 sRNA +RF01699 sRNA +RF01700 sRNA +RF01701 sRNA +RF01702 sRNA +RF01703 sRNA +RF01705 sRNA +RF01706 sRNA +RF01710 sRNA +RF01712 sRNA +RF01714 sRNA +RF01718 sRNA +RF01719 sRNA +RF01722 sRNA +RF01723 sRNA +RF01728 sRNA +RF01732 sRNA +RF01742 sRNA +RF01757 sRNA +RF01762 sRNA +RF01775 sRNA +RF01781 sRNA +RF01782 sRNA +RF01783 sRNA +RF01784 sRNA +RF01789 sRNA +RF01791 sRNA +RF01793 sRNA +RF01796 sRNA +RF01808 sRNA +RF01810 sRNA +RF01812 sRNA +RF01814 sRNA +RF01815 sRNA +RF01816 sRNA +RF01817 sRNA +RF01818 sRNA +RF01819 sRNA +RF01820 sRNA +RF01821 sRNA +RF01822 sRNA +RF01823 sRNA +RF01827 sRNA +RF01828 sRNA +RF01858 sRNA +RF01867 sRNA +RF02029 sRNA +RF02030 sRNA +RF02031 sRNA +RF02049 sRNA +RF02050 sRNA +RF02051 sRNA +RF02052 sRNA +RF02053 sRNA +RF02054 sRNA +RF02055 sRNA +RF02056 sRNA +RF02057 sRNA +RF02059 sRNA +RF02060 sRNA +RF02062 sRNA +RF02063 sRNA +RF02064 sRNA +RF02065 sRNA +RF02066 sRNA +RF02067 sRNA +RF02070 sRNA +RF02071 sRNA +RF02072 sRNA +RF02073 sRNA +RF02074 sRNA +RF02075 sRNA +RF02077 sRNA +RF02078 sRNA +RF02079 sRNA +RF02080 sRNA +RF02081 sRNA +RF02082 sRNA +RF02099 sRNA +RF02100 sRNA +RF02151 sRNA +RF02221 sRNA +RF02222 sRNA +RF02223 sRNA +RF02224 sRNA +RF02225 sRNA +RF02226 sRNA +RF02227 sRNA +RF02228 sRNA +RF02230 sRNA +RF02231 sRNA +RF02232 sRNA +RF02233 sRNA +RF02234 sRNA +RF02235 sRNA +RF02236 sRNA +RF02237 sRNA +RF02238 sRNA +RF02239 sRNA +RF02240 sRNA +RF02241 sRNA +RF02242 sRNA +RF02243 sRNA +RF02268 sRNA +RF02269 sRNA +RF00127 sRNA +RF01852 tRNA +RF00005 tRNA
#!/usr/bin/perl -w
#Filename: filterReadsByCount.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2010-01
#Modified:
#Description: Copy the collapsed reads whose total count (the "_x<N>" suffix
#   of the header line) is greater than 3 to the output file, then write
#   per-length tag/read count tables plus an R script that plots them, and
#   run that R script in the output directory.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use File::Spec;

# A read is kept only when its total count is strictly greater than this.
my $MIN_TOTAL_COUNT = 3;

my %opts;
GetOptions(\%opts,"i=s","o=s","mark:s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

# Sample names are '#'-separated; they label the table columns and the legend.
my $mark = defined $opts{'mark'} ? $opts{'mark'} : "Sample";
my @mark = split /\#/, $mark;

open my $in,  '<', $opts{i} or die "Cannot read $opts{i}: $!\n";
open my $out, '>', $opts{o} or die "Cannot write $opts{o}: $!\n";

# length => [ per-sample number of distinct tags ] / [ per-sample read total ]
my (%tags_of, %reads_of);
while (my $header = <$in>) {
    chomp $header;
    my $seq = <$in>;
    last unless defined $seq;    # dangling header without a sequence line
    chomp $seq;

    # Header ends with ":<n1>_<n2>_..._x<total>".
    next unless $header =~ /:([\d|_]+)_x(\d+)$/;
    my ($per_sample, $total) = ($1, $2);
    next unless $total > $MIN_TOTAL_COUNT;

    my $len   = length $seq;
    my @count = split /_/, $per_sample;
    for my $i (0 .. $#count) {
        # "+= 0" keeps the slot defined so every row has one value per sample.
        $tags_of{$len}[$i]  += $count[$i] > 0 ? 1 : 0;
        $reads_of{$len}[$i] += $count[$i];
    }
    print {$out} "$header\n$seq\n";
}
close $in;
close $out or die "Cannot close $opts{o}: $!\n";

# Resolve the directory BEFORE chdir: with a relative -o path the old code
# looked for "<dir>/<dir>/..." after changing into <dir>.
my $dir = File::Spec->rel2abs(dirname($opts{'o'}));
chdir $dir or die "Cannot chdir to $dir: $!\n";    # R writes the PNGs into cwd
my $lengthfile = $dir."/reads_length_distribution_after_count_filter.txt";
my $rscript    = "$dir/length_distribution_after_count_filter.R";

open my $table, '>', $lengthfile or die "Cannot write $lengthfile: $!\n";
open my $r,     '>', $rscript    or die "Cannot write $rscript: $!\n";

my $samNo   = @mark;
my @length  = sort { $a <=> $b } keys %tags_of;
my $lengths = join ",", @length;
my $marks   = join "\",\"", @mark;

print {$table} "Tags length\t@mark\n";
my $avalue = write_rows($table, \%tags_of, @length);

print {$r} <<"END_R";
a<-c($avalue)
b<-matrix(a,ncol=$samNo,byrow=T)
cl<-colors()
names=c($lengths)
legends=c("$marks")
png("Tags_length_after_count_filter.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Tags Length Distribution After Count Filter",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend="topleft")
abline(h=0)
dev.off()

END_R

print {$table} "\nReads length\t@mark\n";
$avalue = write_rows($table, \%reads_of, @length);

print {$r} <<"END_R";
a<-c($avalue)

b<-matrix(a,ncol=$samNo,byrow=T)

png("Reads_length_after_count_filter.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Reads Length Distribution After Count Filter",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend="topleft")
abline(h=0)
dev.off()

END_R

close $table or die "Cannot close $lengthfile: $!\n";
close $r     or die "Cannot close $rscript: $!\n";

# List form: the script path never passes through a shell.
system('R', 'CMD', 'BATCH', $rscript) == 0
    or warn "R CMD BATCH $rscript failed (status $?)\n";

#system ("rm $dir/length_distribution.R");
#system ("rm $dir/length_distribution.Rout");
#system ("rm $dir/.RData");

# Print one "<length>\t<count> <count> ..." row per length to $fh and return
# the same counts flattened into the comma-separated list used in R's c(...).
sub write_rows {
    my ($fh, $count_of, @lengths) = @_;
    for my $len (@lengths) {
        print {$fh} "$len\t@{ $count_of->{$len} }\n";
    }
    return join ",", map { join ", ", @{ $count_of->{$_} } } @lengths;
}

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -mark
options:

-i input file
-o output file
-mark string #sample name eg: samA#samB#samC
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: filterReadsByLength.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2010-01
#Modified:
#Description: Copy the collapsed reads whose sequence length lies within
#   [-min, -max] to the output file.  Independently of that filter, tabulate
#   per-length tag/read counts for ALL input reads and write an R script
#   that plots them (the script is written but not executed here).
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use File::Spec;

my %opts;
GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h");
if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
    usage();
}

# Sample names are ','-separated; they label the table columns and the legend.
my $mark = defined $opts{'mark'} ? $opts{'mark'} : "Sample";
my @mark = split /,/, $mark;

open my $in,  '<', $opts{i} or die "Cannot read $opts{i}: $!\n";
open my $out, '>', $opts{o} or die "Cannot write $opts{o}: $!\n";

# length => [ per-sample number of distinct tags ] / [ per-sample read total ]
my (%tags_of, %reads_of);
while (my $header = <$in>) {
    chomp $header;
    my $seq = <$in>;
    last unless defined $seq;    # dangling header without a sequence line
    chomp $seq;
    my $len = length $seq;

    # Header ends with ":<n1>_<n2>_..._x<total>"; reads without that suffix
    # are still length-filtered below but do not enter the statistics.
    if ($header =~ /:([\d|_]+)_x(\d+)$/) {
        my @count = split /_/, $1;
        for my $i (0 .. $#count) {
            # "+= 0" keeps the slot defined so every row has one value per sample.
            $tags_of{$len}[$i]  += $count[$i] > 0 ? 1 : 0;
            $reads_of{$len}[$i] += $count[$i];
        }
    }
    #else{$reads{length($seq)}+=1;}
    if ($len >= $opts{'min'} && $len <= $opts{'max'}) {
        print {$out} "$header\n$seq\n";
    }
}
close $in;
close $out or die "Cannot close $opts{o}: $!\n";

# Resolve the directory BEFORE chdir: with a relative -o path the old code
# looked for "<dir>/<dir>/..." after changing into <dir>.
my $dir = File::Spec->rel2abs(dirname($opts{'o'}));
chdir $dir or die "Cannot chdir to $dir: $!\n";
my $lengthfile = $dir."/reads_length_distribution.txt";
my $rscript    = "$dir/length_distribution.R";

open my $table, '>', $lengthfile or die "Cannot write $lengthfile: $!\n";
open my $r,     '>', $rscript    or die "Cannot write $rscript: $!\n";

my $samNo   = @mark;
my @length  = sort { $a <=> $b } keys %tags_of;
my $lengths = join ",", @length;
my $marks   = join "\",\"", @mark;

print {$table} "Tags length\t@mark\n";
my $avalue = write_rows($table, \%tags_of, @length);

print {$r} <<"END_R";
a<-c($avalue)
b<-matrix(a,ncol=$samNo,byrow=T)
cl<-colors()
names=c($lengths)
legends=c("$marks")
png("Tags_length.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Tags Length Distribution",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend="topleft")
abline(h=0)
dev.off()

END_R

print {$table} "\nReads length\t@mark\n";
$avalue = write_rows($table, \%reads_of, @length);

print {$r} <<"END_R";
a<-c($avalue)

b<-matrix(a,ncol=$samNo,byrow=T)

png("Reads_length.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Reads Length Distribution",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend="topleft")
abline(h=0)
dev.off()

END_R

close $table or die "Cannot close $lengthfile: $!\n";
close $r     or die "Cannot close $rscript: $!\n";

#system ("R CMD BATCH $dir/length_distribution.R");

#system ("rm $dir/length_distribution.R");
#system ("rm $dir/length_distribution.Rout");
#system ("rm $dir/.RData");

# Print one "<length>\t<count> <count> ..." row per length to $fh and return
# the same counts flattened into the comma-separated list used in R's c(...).
sub write_rows {
    my ($fh, $count_of, @lengths) = @_;
    for my $len (@lengths) {
        print {$fh} "$len\t@{ $count_of->{$len} }\n";
    }
    return join ",", map { join ", ", @{ $count_of->{$_} } } @lengths;
}

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -min -max -mark
options:

-i input file
-o output file
-min reads min length.
-max reads max length.
-mark string #sample name eg: samA,samB,samC
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: get_genelist.pl
#Author: chentt
#Email:
#Date: 2012-4-6
#Modified:
#Description: Extract the "gene" records of a tab-separated annotation file
#   (feature type in column 3, attributes in column 9) into a table of
#   ID / chr / start / end / strand.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","o=s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

open my $in,  '<', $opts{i} or die "Cannot read $opts{i}: $!\n";
open my $out, '>', $opts{o} or die "Cannot write $opts{o}: $!\n";
print {$out} "#ID\tchr\tstart\tend\tstrand\n";

# Serial used for genes without an Alias attribute.  It advances once per
# such gene; the old loop advanced it once per non-Alias attribute, so the
# generated IDs skipped numbers, and a gene with an empty attribute column
# got no ID at all.
my $unknown_serial = 1;

while (my $line = <$in>) {
    chomp $line;
    next if $line =~ /^\#/;
    my @field = split /\t/, $line;

    # Need at least the strand column (7th) to print a complete record.
    next unless @field >= 7 && $field[2] eq "gene";

    $field[0] =~ s/Chr\./Chr/;
    #$field[0]=~s/Chr/chr/;

    # The first attribute ending in "Alias=<id>" names the gene.
    my $attributes = defined $field[8] ? $field[8] : '';
    my $id;
    for my $attribute (split /;/, $attributes) {
        if ($attribute =~ /Alias\=(\S+)$/) {
            $id = $1;
            last;
        }
    }
    unless (defined $id) {
        $id = "unknown$unknown_serial";
        $unknown_serial++;
    }

    print {$out} "$id\t$field[0]\t$field[3]\t$field[4]\t$field[6]\n";
}
close $in;
close $out or die "Cannot close $opts{o}: $!\n";


# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -h
options:
-i input annotation file (tab-separated, "gene" records are extracted)
-o output file
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: html_miRPlant.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-5-29
#Modified:
#Description: Write the HTML analysis report: raw/trimmed/clean read counts
#   per sample, the pre-rendered length section, the Rfam non-miRNA
#   annotation tables and links to the known/novel microRNA result files.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","format=s","o=s","h");
if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments
    usage();
}

# The -i file holds one path per line, in this order:
# config, preprocess, known, rfam, genome, novel.  Only the config,
# preprocess and rfam entries are used by this report.
open my $list, '<', $opts{i} or die "Cannot read $opts{i}: $!\n";
chomp(my @path = <$list>);
close $list;
die "$opts{i}: expected at least 4 lines (config, preprocess, known, rfam paths)\n"
    if @path < 4;
my ($config, $prepath, $rfampath) = @path[0, 1, 3];

my $predir = (split /\//, $prepath)[-1];    # last path component, used in links
my $dir    = dirname($opts{'o'});

open my $out, '>', $opts{'o'} or die "Cannot write $opts{o}: $!\n";
# The font face used to be the mis-encoded bytes of a CJK font name; the
# ASCII family name "SimHei" selects the same font under any encoding.
print {$out} qq{<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD>
 <BODY bgcolor="lightgray">\n <h1 align="center">\n <font face="SimHei">\n <b>Small RNA Analysis Report</b>\n </font>\n </h1>
 <h2>1. Sequence No. and quality</h2>
 <h3>1.1 Sequece No.</h3>
};

### raw data no
# Config lines are "<raw data file>\t<sample name>".
my (@files, @marks, @rawNo);
my $lines_per_read = ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") ? 4 : 2;
open my $cfg, '<', $config or die "Cannot read $config: $!\n";
while (my $line = <$cfg>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    my ($file, $name) = split /\t/, $line;
    push @files, $file;
    push @rawNo, count_lines($file) / $lines_per_read;
    push @marks, $name;
}
close $cfg;

### preprocess
unless ($prepath =~ /\/$/) {
    $prepath .= "/";
}

my $collapsefile = $prepath."collapse_reads.fa";
my ($trimNo, $collapse) = count_collapsed($collapsefile, scalar @marks);

my $clean = $prepath."collapse_reads_19_28.fa";
my ($cleanR, $cleanT) = count_collapsed($clean, scalar @marks);

print {$out} qq{<table border="1">
<tr align="center">
<th> </th>
};
print {$out} "<th> $_ </th>\n" for @marks;
print_row($out, "Raw Reads No.",                    @rawNo);
print_row($out, "Reads No. After Trimed 3' adapter", @{$trimNo});
print_row($out, "Unique Tags No.",                  @{$collapse});
print_row($out, "Clean Reads No.",                  @{$cleanR});
print_row($out, "Clean Tags No.",                   @{$cleanT});
print {$out} "</tr>\n</table>";

print {$out} qq{<p>
Note:<br />
The raw data file path is: <b>$files[0]</b><br />
};
for my $i (1 .. $#files) {
    print {$out} "          <b>$files[$i]</b><br />";
}
print {$out} qq{The collapsed file path is: <b>$collapsefile</b><br />
The clean data file path is: <b>$clean</b><br />
</p>
<h2> 1. Sequence length count</h2>
};
print {$out} "\n";

# The length section is pre-rendered HTML that is embedded verbatim.
copy_into($out, $prepath."length.html");

print {$out} qq{<p> Note:<br />The sequence length data: <a href="./$predir/reads_length_distribution.txt"> length file</a>
</p>
};

#### rfam
unless ($rfampath =~ /\/$/) {
    $rfampath .= "/";
}
print {$out} qq{<h2>2. Rfam non-miRNA annotation</h2>
<h3>2.1 Reads count</h3>
<table border="1">
<tr align="center">
};

my ($rfamR, $rfamT) = read_rfam_annotation("$dir/rfam_non-miRNA_annotation.txt");

print {$out} "<th>RNA Name</th>\n";
print {$out} "<th> $_ </th>\n" for @marks;
print_rfam_rows($out, @{$rfamR});

print {$out} qq{</tr>\n</table>
 <h3>2.2 Tags count</h3>
 <table border="1">
 <tr align="center">
 <th>RNA Name</th>\n};
print {$out} "<th> $_ </th>\n" for @marks;
print_rfam_rows($out, @{$rfamT});

print {$out} qq{</tr>\n</table>
<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>};
print {$out} <<'END_HTML';
<b>rfam_mapped.bwt</b></p>
 <h2>3. MicroRNA result</h2>
 <h3>3.1 known microRNA</h3>
 <p>The known microRNA express list: <a href="./known_microRNA_express.txt"> known_microRNA_express.txt</a><br/>
 The known microRNA alngment file: <a href="./known_microRNA_express.aln"> known_microRNA_express.aln</a><br/>
 The known moRs file: <a href="./known_microRNA_express.moRs"> known_microRNA_express.moRs</a><br/>
 The known microRNA mature sequence file: <a href="./known_microRNA_mature.fa"> known_microRNA_mature.fa</a><br/>
 The knowm microRNA precursor sequence file: <a href="./known_microRNA_precursor.fa"> known_microRNA_precursor.fa</a>
 </p>

 <h3>3.2 novel microRNA</h3>
 <p>The novel microRNA prediction file:<a href="./microRNA_prediction.mrd"> microRNA_prediction.mrd</a><br/>
 The novel microRNA express list: <a href="./novel_microRNA_express.txt"> novel_microRNA_express.txt</a><br/>
 The novel microRNA mature sequence file: <a href="./novel_microRNA_mature.fa"> novel_microRNA_mature.fa</a><br/>
 The novel microRNA precursor sequence file: <a href="./novel_microRNA_precursor.fa"> novel_microRNA_precursor.fa</a>
 </p>
END_HTML



print {$out} "
 </BODY>
</HTML>
";
close $out or die "Cannot close $opts{o}: $!\n";

# Count the lines of $file by reading it through `less`, as the original
# backtick pipeline did, but with a list-form pipe so the path never reaches
# a shell.  Returns 0 (with a warning) when the file cannot be read.
sub count_lines {
    my ($file) = @_;
    my $pid = open my $pipe, '-|', 'less', $file;
    unless ($pid) {
        warn "Cannot run less on $file: $!\n";
        return 0;
    }
    my $n = 0;
    $n++ while <$pipe>;
    close $pipe or warn "less $file exited with status $?\n";
    return $n;
}

# Sum the per-sample counts of a collapsed FASTA file whose header lines end
# with ":<n1>_<n2>_..._x<total>".  Returns (\@reads, \@tags): per-sample read
# totals and per-sample numbers of tags with a non-zero count.  Both arrays
# start as $n_samples zeros so every sample gets a table cell even when it
# has no reads.  A missing file yields all-zero rows and a warning.
sub count_collapsed {
    my ($file, $n_samples) = @_;
    my @reads = (0) x $n_samples;
    my @tags  = (0) x $n_samples;
    open my $fh, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return (\@reads, \@tags);
    };
    while (my $header = <$fh>) {
        chomp $header;
        my $seq = <$fh>;    # the sequence line itself is not needed
        # Skip headers without the count suffix; the old code re-used the
        # captures of the previous match for them.
        next unless $header =~ /:([\d|_]+)_x(\d+)$/;
        my @count = split /_/, $1;
        for my $i (0 .. $#count) {
            next unless $count[$i] > 0;
            $reads[$i] += $count[$i];
            $tags[$i]++;
        }
    }
    close $fh;
    return (\@reads, \@tags);
}

# Read the Rfam annotation table.  Rows before the line matching
# "tags number" go to the first list, rows from that line on to the second;
# '#' comment lines and blank lines are skipped.  Each row is a
# whitespace-split array ref.  Returns (\@read_rows, \@tag_rows).
sub read_rfam_annotation {
    my ($file) = @_;
    my (@read_rows, @tag_rows);
    open my $fh, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return (\@read_rows, \@tag_rows);
    };
    my $in_reads = 1;
    while (my $line = <$fh>) {
        chomp $line;
        $in_reads = 0 if $line =~ /tags\s+number/;
        next if $line =~ /^\#/;
        next if $line =~ /^\s*$/;
        my @cell = split /\s+/, $line;
        if ($in_reads) { push @read_rows, [@cell]; }
        else           { push @tag_rows,  [@cell]; }
    }
    close $fh;
    return (\@read_rows, \@tag_rows);
}

# Close the previous summary-table row and print a new one: a left-aligned
# label cell followed by one <td> per value.
sub print_row {
    my ($fh, $label, @cells) = @_;
    print {$fh} qq{</tr>\n<tr align="center">\n<th align="left">$label </th>\n};
    print {$fh} "<td> $_ </td>\n" for @cells;
    return;
}

# Print one Rfam table row per array ref: first element is the RNA name,
# the remaining elements are the per-sample values.
sub print_rfam_rows {
    my ($fh, @rows) = @_;
    for my $row (@rows) {
        my ($name, @cells) = @{$row};
        print {$fh} qq{</tr>\n <tr align="center">\n <th align="left">$name</th>\n };
        print {$fh} "<td> $_</td>\n" for @cells;
    }
    return;
}

# Copy $file line by line into $fh, ensuring each line ends with "\n".
sub copy_into {
    my ($fh, $file) = @_;
    open my $src, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return;
    };
    while (my $line = <$src>) {
        chomp $line;
        print {$fh} "$line\n";
    }
    close $src;
    return;
}

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -o
options:
-i file listing one path per line: config, preprocess, known, rfam, genome, novel
-format raw data format; "fq" or "fastq" means 4 lines per read, anything else 2
-o output file
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: html_preprocess.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-5-29
#Modified:
#Description: Write the HTML preprocess report: raw/trimmed/clean read counts
#   per sample, the read/tag length tables, the pre-rendered length section,
#   the genome mapping counts and, when an rfam_match path is listed, the
#   Rfam non-miRNA annotation tables.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","format=s","min=i","max=i","o=s","h");
# -min/-max are part of the clean-reads file name, so they are required too.
if (!(defined $opts{o} and defined $opts{format} and defined $opts{i}
      and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
    usage();
}

# The -i file holds one path per line, in this order:
# config, preprocess, genome and (optionally) rfam.
open my $list, '<', $opts{i} or die "Cannot read $opts{i}: $!\n";
chomp(my @path = <$list>);
close $list;
die "$opts{i}: expected at least 3 lines (config, preprocess, genome paths)\n"
    if @path < 3;
my ($config, $prepath, $genomepath, $rfampath) = @path;

my $genomedir = (split /\//, $genomepath)[-1];    # last path component
my $dir       = dirname($opts{'o'});

open my $out, '>', $opts{'o'} or die "Cannot write $opts{o}: $!\n";
# The font face used to be the mis-encoded bytes of a CJK font name; the
# ASCII family name "SimHei" selects the same font under any encoding.
print {$out} qq{<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD>
 <BODY bgcolor="lightgray">\n <h1 align="center">\n <font face="SimHei">\n <b>Preprocess Report</b>\n </font>\n </h1>
 <h2>1. Sequence No. and quality</h2>
 <h3>1.1 Sequece No.</h3>
};

### raw data no
# Config lines are "<raw data file>\t<sample name>".
my (@files, @marks, @rawNo);
my $lines_per_read = ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") ? 4 : 2;
open my $cfg, '<', $config or die "Cannot read $config: $!\n";
while (my $line = <$cfg>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    my ($file, $name) = split /\t/, $line;
    push @files, $file;
    push @rawNo, count_lines($file) / $lines_per_read;
    push @marks, $name;
}
close $cfg;

### preprocess
unless ($prepath =~ /\/$/) {
    $prepath .= "/";
}

my $collapsefile = $prepath."collapse_reads.fa";
my ($trimNo, $collapse) = count_collapsed($collapsefile, scalar @marks);

my $clean = $prepath."collapse_reads_$opts{min}_$opts{max}.fa";
my ($cleanR, $cleanT) = count_collapsed($clean, scalar @marks);

print {$out} qq{<table border="1">
<tr align="center">
<th> </th>
};
print {$out} "<th> $_ </th>\n" for @marks;
print_row($out, "Raw Reads No.",                    @rawNo);
print_row($out, "Reads No. After Trimed 3' adapter", @{$trimNo});
print_row($out, "Unique Tags No.",                  @{$collapse});
print_row($out, "Clean Reads No.",                  @{$cleanR});
print_row($out, "Clean Tags No.",                   @{$cleanT});
print {$out} "</tr>\n</table>";

print {$out} qq{<p>
Note:<br />
The raw data file path is: <b>$files[0]</b><br />
};
for my $i (1 .. $#files) {
    print {$out} "          <b>$files[$i]</b><br />";
}
print {$out} qq{The collapsed file path is: <b>$collapsefile</b><br />
The clean data file path is: <b>$clean</b><br />
</p>
<h2> 1. Sequence length count</h2>
<h3> 1.1 Reads length count </h3>
};
print {$out} "\n";

# reads_length_distribution.txt holds a tags section followed by a section
# whose header line starts with "Reads".  Each line is "<label>\t<v1 v2 ...>";
# the first stored row of a section is its header.
my %length;
my $section = "Tags Length";
my $distribution = $prepath."reads_length_distribution.txt";
if (open my $dist, '<', $distribution) {
    while (my $line = <$dist>) {
        chomp $line;
        next if $line =~ /^\s*$/;
        $section = "Reads Length" if $line =~ /^Reads/;
        my ($label, $values) = split /\t/, $line;
        my @value = defined $values ? split(/\s/, $values) : ();
        push @{ $length{$section} }, [$label, @value];
    }
    close $dist;
}
else {
    warn "Cannot read $distribution: $!\n";
}

# The two tables differ only in the padding before the row label.
print_length_table($out, $length{"Reads Length"}, "");
print {$out} "<h3> 1.2 Tags length count </h3>";
print_length_table($out, $length{"Tags Length"}, " ");

print {$out} "<h2> 2. Sequence length distribution </h2>";
# The length section is pre-rendered HTML that is embedded verbatim.
copy_into($out, $prepath."length.html");

#print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution.txt\"> length file</a>
#</p>
#";

####genome map
print {$out} qq{<h2>2. Genome Alignment Result</h2>
<h3>2.1 Mapping count</h3>
};

my ($gread, $gtag) = count_collapsed("$genomepath/genome_mapped.fa", scalar @marks);

print {$out} qq{<table border="1">
<tr align="center">
<th> </th>
};
print {$out} "<th> $_ </th>\n" for @marks;

print_row($out, "Genome Mapped Reads No.", @{$gread});
print_row($out, "Genome Mapped Reads Percent");
for my $i (0 .. $#{$gread}) {
    print {$out} "<td> ", percent($gread->[$i], $cleanR->[$i]), "\%</td>\n";
}

print_row($out, "Genome Mapped Tags No.", @{$gtag});
print_row($out, "Genome Mapped Tags Percent");
for my $i (0 .. $#{$gtag}) {
    print {$out} "<td> ", percent($gtag->[$i], $cleanT->[$i]), "\%</td>\n";
}

print {$out} "</tr>\n</table>";
print {$out} qq{<p>
Note:<br />
The genome mapped bwt file path is: <b>$genomedir/genome_mapped.bwt</b><br />
The genome mapped FASTA file path is: <b>$genomedir/genome_mapped.fa</b>
<br />
};

#### rfam
# The Rfam section is emitted only when the 4th -i line names an rfam_match path.
if (defined $rfampath && $rfampath =~ /rfam_match/) {
    unless ($rfampath =~ /\/$/) {
        $rfampath .= "/";
    }
    print {$out} qq{<h2>3. Rfam non-miRNA annotation</h2>
<h3>3.1 Reads count</h3>
<table border="1">
<tr align="center">
};

    my ($rfamR, $rfamT) = read_rfam_annotation("$dir/rfam_non-miRNA_annotation.txt");

    print {$out} "<th>RNA Name</th>\n";
    print {$out} "<th> $_ </th>\n" for @marks;
    print_rfam_rows($out, @{$rfamR});

    print {$out} qq{</tr>\n</table>
 <h3>3.2 Tags count</h3>
 <table border="1">
 <tr align="center">
 <th>RNA Name</th>\n};
    print {$out} "<th> $_ </th>\n" for @marks;
    print_rfam_rows($out, @{$rfamT});

    print {$out} qq{</tr>\n</table>
<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>};
    print {$out} "<b>rfam_mapped.bwt</b></p>\n";
}


print {$out} "
 </BODY>
</HTML>
";
close $out or die "Cannot close $opts{o}: $!\n";

# Count the lines of $file by reading it through `less`, as the original
# backtick pipeline did, but with a list-form pipe so the path never reaches
# a shell.  Returns 0 (with a warning) when the file cannot be read.
sub count_lines {
    my ($file) = @_;
    my $pid = open my $pipe, '-|', 'less', $file;
    unless ($pid) {
        warn "Cannot run less on $file: $!\n";
        return 0;
    }
    my $n = 0;
    $n++ while <$pipe>;
    close $pipe or warn "less $file exited with status $?\n";
    return $n;
}

# Sum the per-sample counts of a collapsed FASTA file whose header lines end
# with ":<n1>_<n2>_..._x<total>".  Returns (\@reads, \@tags): per-sample read
# totals and per-sample numbers of tags with a non-zero count.  Both arrays
# start as $n_samples zeros so every sample gets a table cell even when it
# has no reads.  A missing file yields all-zero rows and a warning.
sub count_collapsed {
    my ($file, $n_samples) = @_;
    my @reads = (0) x $n_samples;
    my @tags  = (0) x $n_samples;
    open my $fh, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return (\@reads, \@tags);
    };
    while (my $header = <$fh>) {
        chomp $header;
        my $seq = <$fh>;    # the sequence line itself is not needed
        # Skip headers without the count suffix; the old code re-used the
        # captures of the previous match for them.
        next unless $header =~ /:([\d|_]+)_x(\d+)$/;
        my @count = split /_/, $1;
        for my $i (0 .. $#count) {
            next unless $count[$i] > 0;
            $reads[$i] += $count[$i];
            $tags[$i]++;
        }
    }
    close $fh;
    return (\@reads, \@tags);
}

# Return $part / $whole * 100 formatted with two decimals; "0.00" when the
# denominator is missing or zero (the old code died on division by zero).
sub percent {
    my ($part, $whole) = @_;
    return "0.00" unless $whole;
    return sprintf("%.2f", $part / $whole * 100);
}

# Read the Rfam annotation table.  Rows before the line matching
# "tags number" go to the first list, rows from that line on to the second;
# '#' comment lines and blank lines are skipped.  Each row is a
# whitespace-split array ref.  Returns (\@read_rows, \@tag_rows).
sub read_rfam_annotation {
    my ($file) = @_;
    my (@read_rows, @tag_rows);
    open my $fh, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return (\@read_rows, \@tag_rows);
    };
    my $in_reads = 1;
    while (my $line = <$fh>) {
        chomp $line;
        $in_reads = 0 if $line =~ /tags\s+number/;
        next if $line =~ /^\#/;
        next if $line =~ /^\s*$/;
        my @cell = split /\s+/, $line;
        if ($in_reads) { push @read_rows, [@cell]; }
        else           { push @tag_rows,  [@cell]; }
    }
    close $fh;
    return (\@read_rows, \@tag_rows);
}

# Close the previous summary-table row and print a new one: a left-aligned
# label cell followed by one <td> per value (none when @cells is empty).
sub print_row {
    my ($fh, $label, @cells) = @_;
    print {$fh} qq{</tr>\n<tr align="center">\n<th align="left">$label </th>\n};
    print {$fh} "<td> $_ </td>\n" for @cells;
    return;
}

# Print one length table.  $rows is an array ref (or undef when the section
# was not found) whose first element is the header row; $pad is the text
# placed before each row label.
sub print_length_table {
    my ($fh, $rows, $pad) = @_;
    my @rows   = defined $rows ? @{$rows} : ();
    my $header = shift @rows;

    print {$fh} qq{<table border="1">\n<tr align="center">};
    if (defined $header) {
        print {$fh} "<th> $_ </th>\n" for @{$header};
    }
    print {$fh} "</tr>";

    for my $row (@rows) {
        my ($label, @cells) = @{$row};
        print {$fh} qq{<tr align="center">\n <th >$pad$label </th>\n };
        print {$fh} "<td> $_ </td>\n" for @cells;
        print {$fh} "</tr>\n";
    }
    print {$fh} "</table>\n";
    return;
}

# Print one Rfam table row per array ref: first element is the RNA name,
# the remaining elements are the per-sample values.
sub print_rfam_rows {
    my ($fh, @rows) = @_;
    for my $row (@rows) {
        my ($name, @cells) = @{$row};
        print {$fh} qq{</tr>\n <tr align="center">\n <th align="left">$name</th>\n };
        print {$fh} "<td> $_</td>\n" for @cells;
    }
    return;
}

# Copy $file line by line into $fh, ensuring each line ends with "\n".
sub copy_into {
    my ($fh, $file) = @_;
    open my $src, '<', $file or do {
        warn "Cannot read $file: $!\n";
        return;
    };
    while (my $line = <$src>) {
        chomp $line;
        print {$fh} "$line\n";
    }
    close $src;
    return;
}

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -min -max -o
options:
-i file listing one path per line: config, preprocess, genome, [rfam]
-format raw data format; "fq" or "fastq" means 4 lines per read, anything else 2
-min reads min length (part of the clean reads file name)
-max reads max length (part of the clean reads file name)
-o output file
-h help
USAGE
exit(1);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_siRNA.pl Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,788 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-5-29 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","format=s","o=s","h"); +if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments +&usage; +} +my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$plotpath,$degpath); +my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$plotdir,$degdir); +open IN,"<$opts{i}"; +$config=<IN>; chomp $config; +$prepath=<IN>; chomp $prepath; +$rfampath=<IN>;chomp $rfampath; +$genomepath=<IN>; chomp $genomepath; +$clusterpath=<IN>; chomp $clusterpath; +$annotatepath=<IN>; chomp $annotatepath; +$plotpath=<IN>; chomp $plotpath; +my $deg_tag=1; +if(eof){$deg_tag=0;} +else{ + $degpath=<IN>; chomp $degpath; +} +close IN; +my @tmp=split/\//,$prepath; +$predir=$tmp[-1]; +@tmp=split/\//,$rfampath; +$rfamdir=$tmp[-1]; +@tmp=split/\//,$genomepath; +$genomedir=$tmp[-1]; +@tmp=split/\//,$clusterpath; +$clusterdir=$tmp[-1]; +@tmp=split/\//,$annotatepath; +$annotatedir=$tmp[-1]; +@tmp=split/\//,$plotpath; +$plotdir=$tmp[-1]; + +my $dir=dirname($opts{'o'}); + +open OUT ,">$opts{'o'}"; +print OUT "<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD> + <BODY bgcolor=\"lightgray\">\n <h1 align=\"center\">\n <font face=\"ºÚÌå\">\n <b>Small RNA Analysis Report</b>\n </font>\n </h1> + <h2>1. Sequence No. 
and quality</h2> + <h3>1.1 Sequece No.</h3> +"; + +### raw data no +open IN,"<$config"; +my @files;my @marks; my @rawNo; +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @files,$tmp[0]; + + my $no=`less $tmp[0] |wc -l `; + chomp $no; + if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { + $no=$no/4; + } + else{ + $no=$no/2; + } + push @rawNo,$no; + + push @marks,$tmp[1]; +} +close IN; + +### preprocess +unless ($prepath=~/\/$/) { + $prepath .="/"; +} + +my @trimNo;my @collapse; +my $collapsefile=$prepath."collapse_reads.fa"; +open IN,"<$collapsefile"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $trimNo[$i] +=$lng[$i]; + $collapse[$i] ++; + } + } +} +close IN; + +my @cleanR;my @cleanT; +my $clean=$prepath."collapse_reads_18-40.fa"; +open IN,"<$clean"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $cleanR[$i] +=$lng[$i]; + $cleanT[$i] ++; + } + } +} +close IN; + +my @filterR;my @filterT; +my $filter=$prepath."collapse_reads_out.fa"; +open IN,"<$filter"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $filterR[$i] +=$lng[$i]; + $filterT[$i] ++; + } + } +} +close IN; + + +print OUT "<table border=\"1\"> +<tr align=\"center\"> +<th> </th> +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Raw Reads No. </th> +"; +foreach (@rawNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Reads No. After Trimed 3\' adapter </th> +"; +foreach (@trimNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Unique Tags No. 
</th> +"; +foreach (@collapse) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Reads No. </th> +"; +foreach (@cleanR) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Tags No. </th> +"; +foreach (@cleanT) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Filter Reads No. \(reads count \>3\) </th> +"; +foreach (@filterR) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Filter Tags No. \(reads count \>3\) </th> +"; +foreach (@filterT) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +The raw data file path is: <b>$files[0]</b><br /> +"; +for (my $i=1;$i<@files;$i++) { + print OUT "          <b>$files[$i]</b><br />"; +} +print OUT "The collapsed file path is: <b>$collapsefile</b><br /> +The clean data file path is: <b>$clean</b><br /> +The filter (remain total reads>3) data file path is: <b>$filter</b><br /> +</p> +<h2> 1. Sequence length count</h2> +"; +print OUT "\n"; + +my $length=$prepath."length.html"; +open IN,"<$length"; +while (my $aline=<IN>) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + +print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution_after_count_filter.txt\"> length file</a> +</p> +"; + +#### rfam +unless ($rfampath=~/\/$/) { + $rfampath .="/"; +} +unless ($genomepath=~/\/$/) { + $genomepath .="/"; +} +print OUT "<h2>2. 
Rfam non-miRNA annotation</h2> +<h3>2.1 Reads count</h3> +<table border=\"1\"> +<tr align=\"center\"> +"; + +my @rfamR; my @rfamT; +my $tag=1; +open IN,"<$dir/rfam_match/rfam_non-miRNA_annotation.txt"; +while (my $aline=<IN>) { + chomp $aline; + $tag=0 if($aline=~/tags\s+number/); + next if($aline=~/^\#/); + next if($aline=~/^\s*$/); + my @tmp=split/\s+/,$aline; + if($tag == 1){push @rfamR,[@tmp];} + else{push @rfamT,[@tmp];} +} +close IN; + + +print OUT "<th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamR;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamR[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { + print OUT "<td> $rfamR[$i][$j]</td>\n"; + } +} + +print OUT "</tr>\n</table> + <h3>2.2 Tags count</h3> + <table border=\"1\"> + <tr align=\"center\"> + <th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamT;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamT[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { + print OUT "<td> $rfamT[$i][$j]</td>\n"; + } +} +print OUT "</tr>\n</table> +<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>"; +print OUT "<b>rfam_mapped.bwt</b></p>"; + +open IN,"<$dir/genome_match/genome_mapped.bwt"; +my @genome_r_u; +my @genome_r_m; +my @genome_t_u; +my @genome_t_m; +my $tags_map_number=0; +while (my $aline=<IN>) { + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[6]==0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_u[$i] +=$lng[$i]; + $genome_t_u[$i] ++; + } + } + $tags_map_number++; + } + if ($temp[6]>0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_m[$i] +=$lng[$i]; + $genome_t_m[$i] ++; + } + } + for (my $i=0;$i<$temp[6] ;$i++) { + my $next=<IN>; + } + $tags_map_number++; + } +} +close IN; +#<h3>3.1 Reads 
count</h3> +#<table border=\"1\"> +#<tr align=\"center\"> +print OUT "<h2>3. genome mapping result</h2> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">Map</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Uniq Map Reads No.</th> +"; +for (my $i=0;$i<@genome_r_u ;$i++) { + print OUT "<td> $genome_r_u[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Uniq Map Tags No.</th> +"; +for (my $i=0;$i<@genome_t_u ;$i++) { + print OUT "<td> $genome_t_u[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Multiple Map Reads No.</th> +"; +for (my $i=0;$i<@genome_r_m ;$i++) { + print OUT "<td> $genome_r_m[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Multiple Map Tags No.</th> +"; +for (my $i=0;$i<@genome_t_m ;$i++) { + print OUT "<td> $genome_t_m[$i]</td>\n"; +} + +print OUT "</tr>\n</table> +<p>Note:<br />The genome mapping results is: <b>$genomepath</b>"; +print OUT "<b>genome_mapped.bwt</b></p>"; + +my $cluster="$clusterpath/sample_reads.cluster"; +my $cluster_number=`less $cluster |wc -l `; +$cluster_number=$cluster_number-1; +my (%cluster_length,@exp,@rpkm); +my @exp_range=qw(0 \(0--10] \(10--100] \(100--1000] \(1000--10000] \(10000--100000] \(100000--**\)); +my @rpkm_range=qw(0 \(0--0.25] \(0.25--0.5] \(0.5--1] \(1.0-5.0] \(5--10] \(10--50] \(50--100] \(100--500] \(500--1000] \(1000--**]); + +open IN,"<$cluster"; +while (my $aline=<IN>) { + next if($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + my @id=split/:|-/,$temp[0]; + $cluster_length{$id[2]-$id[1]+1}++; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3] == 0) {$exp[0][$i]++;} + elsif ($temp[$i+3]>0 && $temp[$i+3]<= 10 ){$exp[1][$i]++;} + elsif ($temp[$i+3]>10 && $temp[$i+3]<=100){$exp[2][$i]++;} + elsif ($temp[$i+3]>100 && $temp[$i+3]<=1000){$exp[3][$i]++;} + elsif ($temp[$i+3]>1000 && 
$temp[$i+3]<=10000){$exp[4][$i]++;} + elsif ($temp[$i+3]>10000 && $temp[$i+3]<=100000){$exp[5][$i]++;} + elsif ($temp[$i+3]>100000){$exp[6][$i]++;} + } +} +close IN; + +my $cluster_rpkm="$clusterpath/sample_rpkm.cluster"; +open IN,"<$cluster_rpkm"; +while (my $aline=<IN>) { + next if($aline=~/^\#/); + chomp $aline; + my @temp=split/\t/,$aline; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3]==0) {$rpkm[0][$i]++;} + elsif($temp[$i+3]>0 && $temp[$i+3]<=0.25){$rpkm[1][$i]++;} + elsif($temp[$i+3]>0.25 && $temp[$i+3]<=0.5){$rpkm[2][$i]++;} + elsif($temp[$i+3]>0.5 && $temp[$i+3]<=1){$rpkm[3][$i]++;} + elsif($temp[$i+3]>1 && $temp[$i+3]<=5){$rpkm[4][$i]++;} + elsif($temp[$i+3]>5 && $temp[$i+3]<=10){$rpkm[5][$i]++;} + elsif($temp[$i+3]>10 && $temp[$i+3]<=50){$rpkm[6][$i]++;} + elsif($temp[$i+3]>50 && $temp[$i+3]<=100){$rpkm[7][$i]++;} + elsif($temp[$i+3]>100 && $temp[$i+3]<=500){$rpkm[8][$i]++;} + elsif($temp[$i+3]>500 && $temp[$i+3]<=1000){$rpkm[9][$i]++;} + else{$rpkm[10][$i]++;} + } +} + +close IN; + +my $cluster_length_file="$clusterpath/cluster_length.txt"; +open LEN,">$cluster_length_file"; +print LEN "\#length\tcluster_number\n"; +foreach my $key (sort keys %cluster_length) { + print LEN "$key\t$cluster_length{$key}\n"; +} +close LEN; +print OUT "<h2>4. 
cluster result</h2> +<h3>4.1 Cluster count</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\"> </th> +<td>Merged samples</td></tr> +<tr align=\"center\"> +<th align=\"left\">Tags number</th> +<td>$tags_map_number</td></tr> +<tr align=\"center\"> +<th align=\"left\">Cluster number</th> +<td>$cluster_number</td></tr>\n</table> +"; + +print OUT "<h3>4.2 Cluster length</h3> +<p> Note:<br />The clusters length data: <a href=\"./$clusterdir/cluster_length.txt\"> length file</a> +</p> +"; +print OUT "<h3>4.3 Quantify</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">Reads Range</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@exp_range;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$exp_range[$i]</th> + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($exp[$i][$j]))) { + print OUT "<td> 0</td>\n"; + } + else{print OUT "<td> $exp[$i][$j]</td>\n";} + } +} +print OUT "</tr>\n</table>"; + +print OUT "\n<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">RPKM Range</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rpkm_range;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rpkm_range[$i]</th> + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($rpkm[$i][$j]))) { + print OUT "<td> 0</td>\n"; + } + else{print OUT "<td> $rpkm[$i][$j]</td>\n";} + } +} +print OUT "</tr>\n</table>"; + +my $annotate="$annotatepath/sample_c_p.anno"; +my (%posit,%repeat,%nat1,%nat2); +my (@phase,@long,@repeat,@nat); +for (my $j=0;$j<@marks ;$j++) { + $phase[$j]=0; + $long[$j]=0; + $repeat[$j]=0; + $nat[$j]=0; +} + +my $class_anno=1; +open ANNO,"<$annotate"; +while (my $aline=<ANNO>) { + chomp $aline; + my @temp=split/\t/,$aline; + if($aline=~/^\#/){ + if (@temp != 10+@marks) { + $class_anno=0; + } + next; + } + for (my $i=3+@marks+$class_anno;$i<@temp;$i++) { + my @posit=split/\;/,$temp[$i]; + for (my $j=0;$j<@marks ;$j++) { + if 
($temp[3+$j]>0) { + $posit{$posit[0]}[$j]++; + } + else{ + if (!(defined($posit{$posit[0]}[$j]))) { + $posit{$posit[0]}[$j]=0; + } + } + } + } + if ($class_anno) { + for (my $j=0;$j<@marks ;$j++) { + if ($temp[3+$j]>0) { + if ($temp[6] eq "phase") { + $phase[$j]++; + } + if ($temp[7] eq "long") { + $long[$j]++; + } + if ($temp[8] ne "\/") { + $repeat[$j]++; + my @rr=split/\;/,$temp[8]; + foreach (@rr) { + $repeat{$_}[$j]++; + } + } + if ($temp[9] ne "\/") { + $nat[$j]++; + my @nn1=split/\;/,$temp[9]; + my @nn2=split/\;/,$temp[10]; + for (my $k=0;$k<@nn1 ;$k++) { + $nat1{$nn1[$k]}[$j]++; + $nat2{$nn2[$k]}[$j]++; + } + } + } + } + } +} +close ANNO; + +print OUT "<h2>5. Cluster Annotate</h2> +<h3>5.1 Cluster genome position annotate</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">clusters number</th>\n +"; + +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +foreach my $key (sort keys %posit) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + foreach (@{$posit{$key}}) { + print OUT "<td> $_</td>\n"; + } +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +One cluster mybe annotate to multiple genes<br /> +"; + +if ($class_anno) { + print OUT "<h3>5.2 Cluster source mechanism annotate</h3> + <table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">clusters number</th>\n + "; + + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Phase</th>\n + "; + foreach (@phase) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Long</th>\n + "; + foreach (@long) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Repeat</th>\n + "; + foreach (@repeat) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Nat</th>\n + "; + foreach (@nat) { + print OUT "<td> $_ </td>\n"; + } + print OUT 
"</tr>\n</table>"; + + print OUT "<p> + Repeat subclass annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Repeat subclass</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + + foreach my $key (sort keys %repeat) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($repeat{$key}[$i])) { + print OUT "<td> $repeat{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + + + print OUT "<p> + Nat subclass1 annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Nat subclass1</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + foreach my $key (sort keys %nat1) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat1{$key}[$i])) { + print OUT "<td> $nat1{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + + print OUT "<p> + Nat subclass2 annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Nat subclass2</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + foreach my $key (sort keys %nat2) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat2{$key}[$i])) { + print OUT "<td> $nat2{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + print OUT "<p> + Note:<br /> + One cluster mybe annotate to multiple repeats or nats<br /> + "; +} +else { + print OUT "<h3>5.2 Cluster source mechanism annotate</h3> + <br />Do not do source mechanism annotate <br />"; + +} + +print OUT "<h2>6. 
Graph of Clusters of all samples</h2> \n"; + +my $plot=$plotpath."cluster.html"; +open IN,"<$plot"; +while (my $aline=<IN>) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + + +if ($deg_tag) { + my $deg_file=`ls $degpath`; + chomp $deg_file; + my @deg_file=split/\n/,$deg_file; + my %deg; + foreach (@deg_file) { + my $output="$degpath/$_/output_score.txt"; + open IN,"<$output"; + $deg{$_}[0]=0; + $deg{$_}[1]=0; + $deg{$_}[2]=0; + while (my $aline=<IN>) { + next if ($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[9] eq "TRUE") { + $deg{$_}[0]++; + if ($temp[4] >0) { + $deg{$_}[1]++; + } + if ($temp[4] <0) { + $deg{$_}[2]++; + } + } + } + close IN; + } + + print OUT "<h2>7. DEG</h2> + <table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Genes number</th>\n + <th> DEG </th>\n + <th> UP </th>\n + <th> DOWN </th>\n + "; + + foreach my $key (sort keys %deg) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@{$deg{$key}} ;$i++) { + print OUT "<td> $deg{$key}[$i] </td>\n"; + } + } + print OUT "</tr>\n</table>"; +} +else{ + print OUT "<h2>7. DEG</h2> + <br />Do not do DE clusters <br />"; +} + +print OUT " + </BODY> +</HTML> +"; +close OUT; + + + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -o +options: +-i +-format +-o output file +-h help +USAGE +exit(1); +}
# File: install_DEG.R
#
# Installs the Bioconductor package "DEGseq".
#
# The historical installer (source("http://bioconductor.org/biocLite.R");
# biocLite("DEGseq")) has been retired by Bioconductor in favour of the
# BiocManager package, and sourcing it over plain HTTP executes remote code
# without transport security.  BiocManager requires R >= 3.5.0, so the legacy
# installer is kept (over HTTPS) only as a fallback for older R versions.

if (getRversion() >= "3.5.0") {
    # BiocManager lives on CRAN; an explicit mirror is given so the call also
    # works in non-interactive sessions where no CRAN mirror has been chosen.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager", repos = "https://cloud.r-project.org")
    }
    BiocManager::install("DEGseq")
} else {
    # Legacy path for R < 3.5.0, identical to the original script apart from
    # the HTTPS scheme.
    source("https://bioconductor.org/biocLite.R")
    biocLite("DEGseq")
}
#!/usr/bin/perl -w
#Filename: matching.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: Map a reads FASTA file against a genome with bowtie.
#
# The script creates <output directory>/genome_match, changes into it and
# runs bowtie there, producing:
#   genome_mapped.bwt          bowtie alignments (bowtie's standard output)
#   genome_mapped.fa           reads passed to --al
#   genome_not_mapped.fa       reads passed to --un
#   genome_mapped_Mlimit.fa    reads passed to --max
#   run.log                    bowtie's standard error
# When no usable index prefix is given via -index (i.e. "<prefix>.1.ebwt" is
# missing or empty), an index is built from the -g genome file with
# bowtie-build and the genome path itself is used as the index prefix.
my $version=1.00;

use strict;
use File::Spec;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

# All user-supplied paths are made absolute up front: the script later does a
# chdir into the mapping directory, after which relative paths would no
# longer point at the files the caller meant.
my $filein  = File::Spec->rel2abs($opts{'i'});
my $fileout = File::Spec->rel2abs($opts{'o'});
my $genome  = defined $opts{'g'} ? File::Spec->rel2abs($opts{'g'}) : undef;
my $mis     = defined $opts{'v'} ? $opts{'v'} : 0;
# "r:s" and "index:s" take optional values, so an empty string means
# "option given without a value"; treat that like "not given".
my $hits    = (defined $opts{'r'} && length $opts{'r'}) ? $opts{'r'} : 25;
my $index   = (defined $opts{'index'} && length $opts{'index'})
            ? File::Spec->rel2abs($opts{'index'})
            : "";
my $threads = defined $opts{'p'} ? $opts{'p'} : 1;

# Create the output directory and its genome_match sub-directory, failing
# loudly instead of silently running bowtie in the wrong place.
my $mapdir = File::Spec->catdir($fileout, "genome_match");
foreach my $dir ($fileout, $mapdir) {
    next if -d $dir;
    mkdir $dir or die "Cannot create directory '$dir': $!\n";
}
chdir $mapdir or die "Cannot change into directory '$mapdir': $!\n";

###check genome index
unless (length($index) && -s "$index.1.ebwt") {
    defined $genome
        or die "No usable bowtie index (-index) and no genome file (-g) to build one from\n";

    # Backticks are kept so bowtie-build's verbose standard output stays
    # captured (and discarded) exactly as before; only the arguments are
    # now shell-quoted and the exit status is checked.
    my $build_cmd    = join ' ', 'bowtie-build', shell_quote($genome), shell_quote($genome);
    my $build_output = `$build_cmd`;
    $? == 0 or die "bowtie-build failed (exit status " . ($? >> 8) . ")\n";
    $index = $genome;
}

### genome mapping
my $map_cmd = join ' ',
    'bowtie',
    '-v', shell_quote($mis),
    '-f',
    '-p', shell_quote($threads),
    '-m', shell_quote($hits),
    '-a', '--best', '--strata',
    shell_quote($index),
    shell_quote($filein),
    '--al genome_mapped.fa',
    '--un genome_not_mapped.fa',
    '--max genome_mapped_Mlimit.fa',
    '> genome_mapped.bwt',
    '2> run.log';
system($map_cmd) == 0
    or die "bowtie failed (exit status " . ($? >> 8) . "); see '$mapdir/run.log'\n";

#`convert_bowtie_to_blast.pl genome_mapped.bwt genome_mapped.fa $genome > genome_mapped.bst`;

# Quote one argument for safe interpolation into a POSIX shell command line:
# wrap it in single quotes and escape any embedded single quote.
sub shell_quote{
    my ($arg)=@_;
    $arg=~s/'/'\\''/g;
    return "'$arg'";
}

# Return the current local time as "YYYY-MM-DD-hh-mm-ss".
# Currently not called by this script; retained as a utility.
sub Time{
    my ($sec,$min,$hour,$day,$month,$year) = (localtime(time()))[0..5];
    # localtime reports months as 0-11 and years as an offset from 1900.
    return sprintf("%d-%02d-%02d-%02d-%02d-%02d",
        $year+1900, $month+1, $day, $hour, $min, $sec);
}

# Print the help text and terminate with exit status 1.
# The -i description says "fasta" only because bowtie is always invoked
# with -f (FASTA query input).
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file# input reads fasta file
-g input file# genome file
-index file-prefix #(must be indexed by bowtie-build) The parameter
       string must be the prefix of the bowtie index. For instance, if
       the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
       the prefix is 'h_sapiens_37_asm'.##can be null
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0;

-p/--threads <int> number of alignment threads to launch (default: 1)

-r int a read is allowed to map up to this number of positions in the genome
       default is 25

-o output directory

-h help
USAGE
exit(1);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRDeep_plant.pl Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,1622 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Getopt::Std; +#use RNA; + + +################################# MIRDEEP ################################################# + +################################## USAGE ################################################## + + +my $usage= +"$0 file_signature file_structure temp_out_directory + +This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with +information on the positions of reads aligned to potential precursor sequences (signature). +It also takes as input an RNAfold output file, giving information on the sequence, structure +and mimimum free energy of the potential precursor sequences. + +Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA +sequences that should be considered for conservation purposes. -t prints out the potential +precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that +exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors +that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences +obtained through conventional cloning. -z consider the number of base pairings in the lower +stems (this option is not well tested). + +-h print this usage +-s fasta file with known miRNAs +#-o temp directory ,maked befor running the program. 
+-t print filtered +-u limited output (only ids) +-v cut-off (default 1) +-x sensitive option for Sanger sequences +-y use Randfold +-z consider Drosha processing +"; + + + + + +############################################################################################ + +################################### INPUT ################################################## + + +#signature file in blast_parsed format +my $file_blast_parsed=shift or die $usage; + +#structure file outputted from RNAfold +my $file_struct=shift or die $usage; + +my $tmpdir=shift or die $usage; +#options +my %options=(); +getopts("hs:tuv:xyz",\%options); + + + + + + +############################################################################################# + +############################# GLOBAL VARIABLES ############################################## + + +#parameters +my $nucleus_lng=11; + +my $score_star=3.9; +my $score_star_not=-1.3; +my $score_nucleus=7.63; +my $score_nucleus_not=-1.17; +my $score_randfold=1.37; +my $score_randfold_not=-3.624; +my $score_intercept=0.3; +my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); +my $score_min=1; +if($options{v}){$score_min=$options{v};} +if($options{x}){$score_min=-5;} + +my $e=2.718281828; + +#hashes +my %hash_desc; +my %hash_seq; +my %hash_struct; +my %hash_mfe; +my %hash_nuclei; +my %hash_mirs; +my %hash_query; +my %hash_comp; +my %hash_bp; + +#other variables +my $subject_old; +my $message_filter; +my $message_score; +my $lines; +my $out_of_bound; + + + +############################################################################################## + +################################ MAIN ###################################################### + + +#print help if that option is used +if($options{h}){die $usage;} +unless ($tmpdir=~/\/$/) {$tmpdir .="/";} +if(!(-s $tmpdir)){mkdir $tmpdir;} +$tmpdir .="TMP_DIR/"; +mkdir $tmpdir; + +#parse structure file outputted from RNAfold +parse_file_struct($file_struct); + +#if conservation is 
scored, the fasta file of known miRNA sequences is parsed +if($options{s}){create_hash_nuclei($options{s})}; + +#parse signature file in blast_parsed format and resolve each potential precursor +parse_file_blast_parsed($file_blast_parsed); +#`rm -rf $tmpdir`; +exit; + + + + +############################################################################################## + +############################## SUBROUTINES ################################################### + + + +sub parse_file_blast_parsed{ + +# read through the signature blastparsed file, fills up a hash with information on queries +# (deep sequences) mapping to the current subject (potential precursor) and resolve each +# potential precursor in turn + + my $file_blast_parsed=shift; + + open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; + while (my $line=<FILE_BLAST_PARSED>){ + if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ + my $query=$1; + my $query_lng=$2; + my $query_beg=$3; + my $query_end=$4; + my $subject=$5; + my $subject_lng=$6; + my $subject_beg=$7; + my $subject_end=$8; + my $e_value=$9; + my $pid=$10; + my $bitscore=$11; + my $other=$12; + + #if the new line concerns a new subject (potential precursor) then the old subject must be resolved + if($subject_old and $subject_old ne $subject){ + resolve_potential_precursor(); + } + + #resolve the strand + my $strand=find_strand($other); + + #resolve the number of reads that the deep sequence represents + my $freq=find_freq($query); + + #read information of the query (deep sequence) into hash + $hash_query{$query}{"subject_beg"}=$subject_beg; + $hash_query{$query}{"subject_end"}=$subject_end; + $hash_query{$query}{"strand"}=$strand; + $hash_query{$query}{"freq"}=$freq; + + #save the signature information + $lines.=$line; + + $subject_old=$subject; + } + } + resolve_potential_precursor(); +} + +sub resolve_potential_precursor{ + +# dissects the 
potential precursor in parts by filling hashes, and tests if it passes the +# initial filter and the scoring filter + +# binary variable whether the potential precursor is still viable + my $ret=1; +#print STDERR ">$subject_old\n"; + + fill_structure(); +#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; + fill_pri(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_mature(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_star(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_loop(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_lower_flanks(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + +# do_test_assemble(); + +# this is the actual classification + unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} + + print_results($ret); + + reset_variables(); + + return; + +} + + + +sub print_results{ + + my $ret=shift; + +# print out if the precursor is accepted and accepted precursors should be printed out +# or if the potential precursor is discarded and discarded potential precursors should +# be printed out + + if((!$options{t} and $ret) or ($options{t} and !$ret)){ + #full output + unless($options{u}){ + if($message_filter){print $message_filter;} + if($message_score){print $message_score;} + print_hash_comp(); + print $lines,"\n\n"; + return; + } + #limited output (only ids) + my $id=$hash_comp{"pri_id"}; + print "$id\n"; + } +} + + + + + + + +sub pass_threshold_score{ + +# this is the scoring + + #minimum free energy of the potential precursor +# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); + my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); + + #count of reads that map in accordance with Dicer processing + my $score_freq=score_freq($hash_comp{"freq"}); +#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; + + #basic score + my $score=$score_mfe+$score_freq; + + #scoring of conserved nucleus/seed (optional) + 
if($options{s}){ + + #if the nucleus is conserved + if(test_nucleus_conservation()){ + + #nucleus from position 2-8 + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + + #resolve DNA/RNA ambiguities + $nucleus=~tr/[T]/[U]/; + + #print score contribution + score_s("score_nucleus\t$score_nucleus"); + + #print the ids of known miRNAs with same nucleus + score_s("$hash_mirs{$nucleus}"); +#print STDERR "score_nucleus\t$score_nucleus\n"; + + #add to score + $score+=$score_nucleus; + + #if the nucleus is not conserved + }else{ + #print (negative) score contribution + score_s("score_nucleus\t$score_nucleus_not"); + + #add (negative) score contribution + $score+=$score_nucleus_not; + } + } + + #if the majority of potential star reads fall as expected from Dicer processing + if($hash_comp{"star_read"}){ + score_s("score_star\t$score_star"); +#print STDERR "score_star\t$score_star\n"; + $score+=$score_star; + }else{ + score_s("score_star\t$score_star_not"); +#print STDERR "score_star_not\t$score_star_not\n"; + $score+=$score_star_not; + } + + #score lower stems for potential for Drosha recognition (highly optional) + if($options{z}){ + my $stem_bp=$hash_comp{"stem_bp"}; + my $score_stem=$scores_stem[$stem_bp]; + $score+=$score_stem; + score_s("score_stem\t$score_stem"); + } + +#print STDERR "score_intercept\t$score_intercept\n"; + + $score+=$score_intercept; + + #score for randfold (optional) + if($options{y}){ + +# only calculate randfold value if it can make the difference between the potential precursor +# being accepted or discarded + if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ + + #randfold value<0.05 + if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} + + #randfold value>0.05 + else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} + } + } + + #round off values to one decimal + my $round_mfe=round($score_mfe*10)/10; + my 
$round_freq=round($score_freq*10)/10; + my $round=round($score*10)/10; + + #print scores + score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); + + #return 1 if the potential precursor is accepted, return 0 if discarded + unless($score>=$score_min){return 0;} + return 1; +} + +sub test_randfold{ + + #print sequence to temporary file, test randfold value, return 1 or 0 + +# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); + #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; + #open(FILE, ">$tmpfile"); + #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; + #close FILE; + +# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; + my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p_value=($p1+$p2)/2; + wait; +# system "rm $tmpfile"; + + if($p_value<=0.05){return 1;} + + return 0; +} + +sub randfold_pvalue{ + my $cpt_sup = 0; + my $cpt_inf = 0; + my $cpt_ega = 1; + + my ($seq,$number_of_randomizations)=@_; + #my $str =$seq; + #my $mfe = RNA::fold($seq,$str); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + for (my $i=0;$i<$number_of_randomizations;$i++) { + $seq = shuffle_sequence_dinucleotide($seq); + #$str = $seq; + + #my $rand_mfe = RNA::fold($str,$str); + $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $rand_mfe=$rawfolds[-1]; + $rand_mfe=~s/\(//; + $rand_mfe=~s/\)//; + + if ($rand_mfe < $mfe) { + $cpt_inf++; + } + if ($rand_mfe == $mfe) { + $cpt_ega++; + } + if ($rand_mfe > $mfe) { + $cpt_sup++; + } + } + + my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); + + #print "$name\t$mfe\t$proba\n"; + return $proba; +} + +sub shuffle_sequence_dinucleotide { + + my ($str) = @_; + + # 
upper case and convert to ATGC + $str = uc($str); + $str =~ s/U/T/g; + + my @nuc = ('A','T','G','C'); + my $count_swap = 0; + # set maximum number of permutations + my $stop = length($str) * 10; + + while($count_swap < $stop) { + + my @pos; + + # look start and end letters + my $firstnuc = $nuc[int(rand 4)]; + my $thirdnuc = $nuc[int(rand 4)]; + + # get positions for matching nucleotides + for (my $i=0;$i<(length($str)-2);$i++) { + if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { + push (@pos,($i+1)); + $i++; + } + } + + # swap at random trinucleotides + my $max = scalar(@pos); + for (my $i=0;$i<$max;$i++) { + my $swap = int(rand($max)); + if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { + $count_swap++; + my $w1 = substr($str,$pos[$i],1); + my $w2 = substr($str,$pos[$swap],1); + substr($str,$pos[$i],1,$w2); + substr($str,$pos[$swap],1,$w1); + } + } + } + return($str); +} + +sub test_nucleus_conservation{ + + #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 + + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + $nucleus=~tr/[T]/[U]/; + if($hash_nuclei{$nucleus}){return 1;} + + return 0; +} + + + +sub pass_filtering_initial{ + + #test if the structure forms a plausible hairpin + unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} + + #test if >90% of reads map to the hairpin in consistence with Dicer processing + unless(pass_filtering_signature()){filter_p("signature problem");return 0;} + + return 1; + +} + + +sub pass_filtering_signature{ + + #number of reads that map in consistence with Dicer processing + my $consistent=0; + + #number of reads that map inconsistent with Dicer processing + my $inconsistent=0; + +# number of potential star reads map in good consistence with Drosha/Dicer processing +# (3' overhangs relative to mature product) + my $star_perfect=0; + +# number of potential star reads that do not map in good 
consistence with 3' overhang + my $star_fuzzy=0; + + + #sort queries (deep sequences) by their position on the hairpin + my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; + + foreach my $query(@queries){ + + #number of reads that the deep sequence represents + unless(defined($hash_query{$query}{"freq"})){next;} + my $query_freq=$hash_query{$query}{"freq"}; + + #test which Dicer product (if any) the deep sequence corresponds to + my $product=test_query($query); + + #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable + if($product){$consistent+=$query_freq;} + + #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable + else{$inconsistent+=$query_freq;} + + #test a potential star sequence has good 3' overhang + if($product eq "star"){ + if(test_star($query)){$star_perfect+=$query_freq;} + else{$star_fuzzy+=$query_freq;} + } + } + +# if the majority of potential star sequences map in good accordance with 3' overhang +# score for the presence of star evidence + if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} + + #total number of reads mapping to the hairpin + my $freq=$consistent+$inconsistent; + $hash_comp{"freq"}=$freq; + unless($freq>0){filter_s("read frequency too low"); return 0;} + + #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded + my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); + unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} + + #the hairpin is retained + return 1; +} + +sub test_star{ + + #test if a deep sequence maps in good consistence with 3' overhang + + my $query=shift; + + #5' begin and 3' end positions + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + + #the difference between observed and expected begin positions must be 0 or 1 + my 
$offset=$beg-$hash_comp{"star_beg"}; + if($offset==0 or $offset==1 or $offset==-1){return 1;} + + return 0; +} + + + +sub test_query{ + + #test if deep sequence maps in consistence with Dicer processing + + my $query=shift; + + #begin, end, strand and read count + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + my $strand=$hash_query{$query}{"strand"}; + my $freq=$hash_query{$query}{"freq"}; + + #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) + if($strand eq '-'){return 0;} + + #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end + my $fuzz_beg=2; + #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end + my $fuzz_end=2; + + #if in accordance with Dicer processing, return the type of Dicer product + if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} + if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} + if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} + + #if not in accordance, return 0 + return 0; +} + + +sub pass_filtering_structure{ + + #The potential precursor must form a hairpin with miRNA precursor-like characteristics + + #return value + my $ret=1; + + #potential mature, star, loop and lower flank parts must be identifiable + unless(test_components()){return 0;} + + #no bifurcations + unless(no_bifurcations_precursor()){$ret=0;} + + #minimum 14 base pairings in duplex + unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} + + #not more than 6 nt difference between mature and star length + unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") } + + return $ret; +} + + + + + + +sub test_components{ + + #tests whether potential mature, star, loop and lower flank parts are 
identifiable + + unless($hash_comp{"mature_struct"}){ + filter_s("no mature"); +# print STDERR "no mature\n"; + return 0; + } + + unless($hash_comp{"star_struct"}){ + filter_s("no star"); +# print STDERR "no star\n"; + return 0; + } + + unless($hash_comp{"loop_struct"}){ + filter_s("no loop"); +# print STDERR "no loop\n"; + return 0; + } + + unless($hash_comp{"flank_first_struct"}){ + filter_s("no flanks"); +#print STDERR "no flanks_first_struct\n"; + return 0; + } + + unless($hash_comp{"flank_second_struct"}){ + filter_s("no flanks"); +# print STDERR "no flanks_second_struct\n"; + return 0; + } + return 1; +} + + + + + +sub no_bifurcations_precursor{ + + #tests whether there are bifurcations in the hairpin + + #assembles the potential precursor sequence and structure from the expected Dicer products + #this is the expected biological precursor, in contrast with 'pri_seq' that includes + #some genomic flanks on both sides + + my $pre_struct; + my $pre_seq; + if($hash_comp{"mature_arm"} eq "first"){ + $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}; + $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"}; + }else{ + $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}; + $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"}; + } + + #read into hash + $hash_comp{"pre_struct"}=$pre_struct; + $hash_comp{"pre_seq"}=$pre_seq; + + #simple pattern matching checks for bifurcations + unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){ + filter_s("bifurcation in precursor"); +# print STDERR "bifurcation in precursor\n"; + return 0; + } + + return 1; +} + +sub bp_precursor{ + + #total number of bps in the precursor + + my $pre_struct=$hash_comp{"pre_struct"}; + + #simple pattern matching + my $pre_bps=0; + while($pre_struct=~/\(/g){ + $pre_bps++; + } + return $pre_bps; +} + + +sub bp_duplex{ + + #total number of bps in the duplex + + my 
$duplex_bps=0; + my $mature_struct=$hash_comp{"mature_struct"}; + + #simple pattern matching + while($mature_struct=~/(\(|\))/g){ + $duplex_bps++; + } + return $duplex_bps; +} + +sub diff_lng{ + + #find difference between mature and star lengths + + my $mature_lng=length $hash_comp{"mature_struct"}; + my $star_lng=length $hash_comp{"star_struct"}; + my $diff_lng=$mature_lng-$star_lng; + return $diff_lng; +} + + + +sub do_test_assemble{ + +# not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks) +# is identical to 'pri_struct' before disassembly into parts + + my $assemble_struct; + + if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){ + if($hash_comp{"mature_arm"} eq "first"){ + $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"}; + }else{ + $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"}; + } + unless($assemble_struct eq $hash_comp{"pri_struct"}){ + $hash_comp{"test_assemble"}=$assemble_struct; + print_hash_comp(); + } + } + return; + } + + + +sub fill_structure{ + + #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired + + my $struct=$hash_struct{$subject_old}; + my $lng=length $struct; + + #local stack for keeping track of basepairings + my @bps; + + for(my $pos=1;$pos<=$lng;$pos++){ + my $struct_pos=excise_struct($struct,$pos,$pos,"+"); + + if($struct_pos eq "("){ + push(@bps,$pos); + } + + if($struct_pos eq ")"){ + my $pos_prev=pop(@bps); + $hash_bp{$pos_prev}=$pos; + $hash_bp{$pos}=$pos_prev; + } + } + return; +} + + + +sub fill_star{ + + #fills specifics on the expected star strand into 'comp' hash ('component' hash) + + #if the 
mature sequence is not plausible, don't look for the star arm + my $mature_arm=$hash_comp{"mature_arm"}; + unless($mature_arm){$hash_comp{"star_arm"}=0; return;} + + #if the star sequence is not plausible, don't fill into the hash + my($star_beg,$star_end)=find_star(); + my $star_arm=arm_star($star_beg,$star_end); + unless($star_arm){return;} + + #excise expected star sequence and structure + my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+"); + my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+"); + + #fill into hash + $hash_comp{"star_beg"}=$star_beg; + $hash_comp{"star_end"}=$star_end; + $hash_comp{"star_seq"}=$star_seq; + $hash_comp{"star_struct"}=$star_struct; + $hash_comp{"star_arm"}=$star_arm; + + return; +} + + +sub find_star{ + + #uses the 'bp' hash to find the expected star begin and end positions from the mature positions + + #the -2 is for the overhang + my $mature_beg=$hash_comp{"mature_beg"}; + my $mature_end=$hash_comp{"mature_end"}-2; + my $mature_lng=$mature_end-$mature_beg+1; + + #in some cases, the last nucleotide of the mature sequence does not form a base pair, + #and therefore does not basepair with the first nucleotide of the star sequence. + #In this case, the algorithm searches for the last nucleotide of the mature sequence + #to form a base pair. The offset is the number of nucleotides searched through. 
+ my $offset_star_beg=0; + my $offset_beg=0; + + #the offset should not be longer than the length of the mature sequence, then it + #means that the mature sequence does not form any base pairs + while(!$offset_star_beg and $offset_beg<$mature_lng){ + if($hash_bp{$mature_end-$offset_beg}){ + $offset_star_beg=$hash_bp{$mature_end-$offset_beg}; + }else{ + $offset_beg++; + } + } + #when defining the beginning of the star sequence, compensate for the offset + my $star_beg=$offset_star_beg-$offset_beg; + + #same as above + my $offset_star_end=0; + my $offset_end=0; + while(!$offset_star_end and $offset_end<$mature_lng){ + if($hash_bp{$mature_beg+$offset_end}){ + $offset_star_end=$hash_bp{$mature_beg+$offset_end}; + }else{ + $offset_end++; + } + } + #the +2 is for the overhang + my $star_end=$offset_star_end+$offset_end+2; + + return($star_beg,$star_end); +} + + +sub fill_pri{ + + #fills basic specifics on the precursor into the 'comp' hash + + my $seq=$hash_seq{$subject_old}; + my $struct=$hash_struct{$subject_old}; + my $mfe=$hash_mfe{$subject_old}; + my $length=length $seq; + + $hash_comp{"pri_id"}=$subject_old; + $hash_comp{"pri_seq"}=$seq; + $hash_comp{"pri_struct"}=$struct; + $hash_comp{"pri_mfe"}=$mfe; + $hash_comp{"pri_beg"}=1; + $hash_comp{"pri_end"}=$length; + + return; +} + + +sub fill_mature{ + + #fills specifics on the mature sequence into the 'comp' hash + + my $mature_query=find_mature_query(); + my($mature_beg,$mature_end)=find_positions_query($mature_query); + my $mature_strand=find_strand_query($mature_query); + my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand); + my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand); + my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand); + + $hash_comp{"mature_query"}=$mature_query; + $hash_comp{"mature_beg"}=$mature_beg; + $hash_comp{"mature_end"}=$mature_end; + $hash_comp{"mature_strand"}=$mature_strand; + 
$hash_comp{"mature_struct"}=$mature_struct; + $hash_comp{"mature_seq"}=$mature_seq; + $hash_comp{"mature_arm"}=$mature_arm; + + return; +} + + + +sub fill_loop{ + + #fills specifics on the loop sequence into the 'comp' hash + + #unless both mature and star sequences are plausible, do not look for the loop + unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} + + my $loop_beg; + my $loop_end; + + #defining the begin and end positions of the loop from the mature and star positions + #excision depends on whether the mature or star sequence is 5' of the loop ('first') + if($hash_comp{"mature_arm"} eq "first"){ + $loop_beg=$hash_comp{"mature_end"}+1; + }else{ + $loop_end=$hash_comp{"mature_beg"}-1; + } + + if($hash_comp{"star_arm"} eq "first"){ + $loop_beg=$hash_comp{"star_end"}+1; + }else{ + $loop_end=$hash_comp{"star_beg"}-1; + } + + #unless the positions are plausible, do not fill into hash + unless(test_loop($loop_beg,$loop_end)){return;} + + my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+"); + my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+"); + + $hash_comp{"loop_beg"}=$loop_beg; + $hash_comp{"loop_end"}=$loop_end; + $hash_comp{"loop_seq"}=$loop_seq; + $hash_comp{"loop_struct"}=$loop_struct; + + return; +} + + +sub fill_lower_flanks{ + + #fills specifics on the lower flanks and unpaired strands into the 'comp' hash + + #unless both mature and star sequences are plausible, do not look for the flanks + unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} + + my $flank_first_end; + my $flank_second_beg; + + #defining the begin and end positions of the flanks from the mature and star positions + #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first') + if($hash_comp{"mature_arm"} eq "first"){ + $flank_first_end=$hash_comp{"mature_beg"}-1; + }else{ + $flank_second_beg=$hash_comp{"mature_end"}+1; + } + + if($hash_comp{"star_arm"} eq "first"){ 
+ $flank_first_end=$hash_comp{"star_beg"}-1; + }else{ + $flank_second_beg=$hash_comp{"star_end"}+1; + } + + #unless the positions are plausible, do not fill into hash + unless(test_flanks($flank_first_end,$flank_second_beg)){return;} + + $hash_comp{"flank_first_end"}=$flank_first_end; + $hash_comp{"flank_second_beg"}=$flank_second_beg; + $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); + $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); + $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); + $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); + + if($options{z}){ + fill_stems_drosha(); + } + + return; +} + + +sub fill_stems_drosha{ + + #scores the number of base pairings formed by the first ten nt of the lower stems + #in general, the more stems, the higher the score contribution + #warning: this options has not been thoroughly tested + + my $flank_first_struct=$hash_comp{"flank_first_struct"}; + my $flank_second_struct=$hash_comp{"flank_second_struct"}; + + my $stem_first=substr($flank_first_struct,-10); + my $stem_second=substr($flank_second_struct,0,10); + + my $stem_bp_first=0; + my $stem_bp_second=0; + + #find base pairings by simple pattern matching + while($stem_first=~/\(/g){ + $stem_bp_first++; + } + + while($stem_second=~/\)/g){ + $stem_bp_second++; + } + + my $stem_bp=min2($stem_bp_first,$stem_bp_second); + + $hash_comp{"stem_first"}=$stem_first; + $hash_comp{"stem_second"}=$stem_second; + $hash_comp{"stem_bp_first"}=$stem_bp_first; + $hash_comp{"stem_bp_second"}=$stem_bp_second; + $hash_comp{"stem_bp"}=$stem_bp; + + return; +} + + + + +sub arm_mature{ + + #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential 
precursor + + my ($beg,$end,$strand)=@_; + + #mature and star sequences should alway be on plus strand + if($strand eq "-"){return 0;} + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand); + if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub arm_star{ + + #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + #no overlap between the mature and the star sequence + if($hash_comp{"mature_arm"} eq "first"){ + ($hash_comp{"mature_end"}<$beg) or return 0; + }elsif($hash_comp{"mature_arm"} eq "second"){ + ($end<$hash_comp{"mature_beg"}) or return 0; + } + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); + if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub test_loop{ + + #tests the loop positions + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub test_flanks{ + + #tests the positions of the lower flanks + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub comp{ + + #subroutine to retrive from the 'comp' hash + + my $type=shift; + my 
$component=$hash_comp{$type}; + return $component; +} + + +sub find_strand_query{ + + #subroutine to find the strand for a given query + + my $query=shift; + my $strand=$hash_query{$query}{"strand"}; + return $strand; +} + + +sub find_positions_query{ + + #subroutine to find the begin and end positions for a given query + + my $query=shift; + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + return ($beg,$end); +} + + + +sub find_mature_query{ + + #finds the query with the highest frequency of reads and returns it + #is used to determine the positions of the potential mature sequence + + my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; + my $mature_query=$queries[0]; + return $mature_query; +} + + + + +sub reset_variables{ + + #resets the hashes for the next potential precursor + +# %hash_query=(); +# %hash_comp=(); +# %hash_bp=(); + foreach my $key (keys %hash_query) {delete($hash_query{$key});} + foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} + foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} + +# $message_filter=(); +# $message_score=(); +# $lines=(); + undef($message_filter); + undef($message_score); + undef($lines); + return; +} + + + +sub excise_seq{ + + #excise sub sequence from the potential precursor + + my($seq,$beg,$end,$strand)=@_; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. 
every two thousand combinations + unless($beg<=length($seq)){$out_of_bound++;return 0;} + + #if on the minus strand, the reverse complement should be excised + if($strand eq "-"){$seq=revcom($seq);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_seq=substr($seq,$beg-1,$end-$beg+1); + + return $sub_seq; + +} + +sub excise_struct{ + + #excise sub structure + + my($struct,$beg,$end,$strand)=@_; + my $lng=length $struct; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($struct)){return 0;} + + #if excising relative to minus strand, positions are reversed + if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_struct=substr($struct,$beg-1,$end-$beg+1); + + return $sub_struct; +} + + +sub create_hash_nuclei{ + #parses a fasta file with sequences of known miRNAs considered for conservation purposes + #reads the nuclei into a hash + + my ($file) = @_; + my ($id, $desc, $sequence, $nucleus) = (); + + open (FASTA, "<$file") or die "can not open $file\n"; + while (<FASTA>) + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + while (<FASTA>){ + chomp; + if (/^>(\S+)(.*)/){ + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + next; + } + $sequence .= $_; + } + } + } + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + close FASTA; +} + + +sub parse_file_struct{ + #parses the output from RNAfoldand reads it into 
hashes + my($file) = @_; + my($id,$desc,$seq,$struct,$mfe) = (); + open (FILE_STRUCT, "<$file") or die "can not open $file\n"; + while (<FILE_STRUCT>){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $id= $1; + $desc= $2; + $seq= ""; + $struct= ""; + $mfe= ""; + while (<FILE_STRUCT>){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + $id = $1; + $desc = $2; + $seq = ""; + $struct = ""; + $mfe = ""; + next; + } + if(/^\w/){ + tr/uU/tT/; + $seq .= $_; + next; + } + if(/((\.|\(|\))+)/){$struct .=$1;} + if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} + } + } + } + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + close FILE_STRUCT; + return; +} + + +sub score_s{ + + #this score message is appended to the end of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score.=$message."\n";; + return; +} + + + +sub score_p{ + + #this score message is appended to the beginning of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score=$message."\n".$message_score; + return; +} + + + +sub filter_s{ + + #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor + + my $message=shift; + $message_filter.=$message."\n"; + return; +} + + +sub filter_p{ + + #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor + + my $message=shift; + if(defined $message_filter){$message_filter=$message."\n".$message_filter;} + else{$message_filter=$message."\n";} + return; +} + + +sub find_freq{ + + #finds the frequency of a given read query from its id. 
+ + my($query)=@_; + + if($query=~/x(\d+)/i){ + my $freq=$1; + return $freq; + }else{ + #print STDERR "Problem with read format\n"; + return 0; + } +} + + +sub print_hash_comp{ + + #prints the 'comp' hash + + my @keys=sort keys %hash_comp; + foreach my $key(@keys){ + my $value=$hash_comp{$key}; + print "$key \t$value\n"; + } +} + + + +sub print_hash_bp{ + + #prints the 'bp' hash + + my @keys=sort {$a<=>$b} keys %hash_bp; + foreach my $key(@keys){ + my $value=$hash_bp{$key}; + print "$key\t$value\n"; + } + print "\n"; +} + + + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + return($strand); +} + + +sub contained{ + + #Is the stretch defined by the first positions contained in the stretch defined by the second? + + my($beg1,$end1,$beg2,$end2)=@_; + + testbeginend($beg1,$end1,$beg2,$end2); + + if($beg2<=$beg1 and $end1<=$end2){ + return 1; + }else{ + return 0; + } +} + + +sub testbeginend{ + + #Are the beginposition numerically smaller than the endposition for each pair? 
+ + my($begin1,$end1,$begin2,$end2)=@_; + + unless($begin1<=$end1 and $begin2<=$end2){ + print STDERR "beg can not be larger than end for $subject_old\n"; + exit; + } +} + + +sub rev_pos{ + +# The blast_parsed format always uses positions that are relative to the 5' of the given strand +# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with +# the n't nucleotide on the plus strand + +# This subroutine reverses the begin and end positions of positions of the minus strand so that they +# are relative to the 5' end of the plus strand + + my($beg,$end,$lng)=@_; + + my $new_end=$lng-$beg+1; + my $new_beg=$lng-$end+1; + + return($new_beg,$new_end); +} + +sub round { + + #rounds to nearest integer + + my($number) = shift; + return int($number + .5); + +} + + +sub rev{ + + #reverses the order of nucleotides in a sequence + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + #the complementary of a sequence + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + #reverse complement + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + + +sub max2 { + + #max of two numbers + + my($a, $b) = @_; + return ($a>$b ? $a : $b); +} + +sub min2 { + + #min of two numbers + + my($a, $b) = @_; + return ($a<$b ? 
$a : $b); +} + + + +sub score_freq{ + +# scores the count of reads that map to the potential precursor +# Assumes geometric distribution as described in methods section of manuscript + + my $freq=shift; + + #parameters of known precursors and background hairpins + my $parameter_test=0.999; + my $parameter_control=0.6; + + #log_odds calculated directly to avoid underflow + my $intercept=log((1-$parameter_test)/(1-$parameter_control)); + my $slope=log($parameter_test/$parameter_control); + my $log_odds=$slope*$freq+$intercept; + + #if no strong evidence for 3' overhangs, limit the score contribution to 0 + unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} + + return $log_odds; +} + + + +##sub score_mfe{ + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + +## my $mfe=shift; + + #numerical value, minimum 1 +## my $mfe_adj=max2(1,-$mfe); + + #parameters of known precursors and background hairpins, scale and location +## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); +## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + +## my $odds=$prob_test/$prob_background; +## my $log_odds=log($odds); + +## return $log_odds; +##} + +sub score_mfe{ +# use bignum; + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + + my ($mfe,$mlng)=@_; + + #numerical value, minimum 1 + my $mfe_adj=max2(1,-$mfe); +my $mfe_adj1=$mfe/$mlng; + #parameters of known precursors and background hairpins, scale and location + my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; + my $ev=$e**($mfe_adj1*$c); + #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; + my $log_odds=($a/($b+$ev)); + + + my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); + my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + + my $odds=$prob_test/$prob_background; + my 
$log_odds_2=log($odds); + #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; + return $log_odds; +} + + + +sub prob_gumbel_discretized{ + +# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol +# uses the subroutine that calculates the cdf to find the probabilities + + my ($var,$scale,$location)=@_; + + my $bound_lower=$var-0.5; + my $bound_upper=$var+0.5; + + my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); + my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); + + my $prob=$cdf_upper-$cdf_lower; + + return $prob; +} + + +sub cdf_gumbel{ + +# calculates the cumulative distribution function of the Gumbel distribution + + my ($var,$scale,$location)=@_; + + my $cdf=$e**(-($e**(-($var-$location)/$scale))); + + return $cdf; +} +
#!/usr/bin/perl -w
# miRNA_Express_and_sequence.pl (added in changeset 0:87fe81de0931)
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-6-4
#Modified:
#Description: solexa miRNA express and sequence
#
# Reads a prediction file (-i) made of records that start with a
# "score <number>" line, followed by "key<TAB>value" component lines and a
# list of read alignments terminated by a blank line.  For every record it
# writes:
#   -list  one row: miRNA id, precursor id, mature/star coordinates and the
#          per-sample mature, star and total read counts
#   -fa    the mature sequence in FASTA format
#   -pre   the primary (precursor) sequence in FASTA format
# Sample names for the count columns come from the comma-separated -tag list.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h");
if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein  = $opts{'i'};
my $fileout = $opts{'list'};
my $out     = $opts{'fa'};
my $preout  = $opts{'pre'};

# Disabled in the original (was wrapped in a =cut ... =cut pod block):
# my %hash_pri;
# open PRI,"<$opts{p}";
# while (my $aline=<PRI>) {
#     chomp $aline;
#     if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;}
# }
# close PRI;

open my $in,      '<', $filein  or die "can not open $filein: $!\n";   #input file
open my $list_fh, '>', $fileout or die "can not open $fileout: $!\n";  #output file
open my $fa_fh,   '>', $out     or die "can not open $out: $!\n";
open my $pre_fh,  '>', $preout  or die "can not open $preout: $!\n";

# Header: three groups of per-sample columns, in the same order as the counts
# printed for each record below.
my @marks = split /\,/, $opts{'tag'};
print {$list_fh} "#ID\tcoordinate\tpos1\tpos2";
for my $suffix (qw(matureExp starExp totalExp)) {
    for my $mark (@marks) {
        print {$list_fh} "\t${mark}_$suffix";
    }
}
print {$list_fh} "\n";

# Component lines are consumed strictly in this order within a record.
my @record_keys = qw(
    flank_first_end flank_second_beg
    mature_beg mature_end mature_seq
    pre_seq pri_id pri_seq
    star_beg star_end star_seq
);

my $novel = 0;
RECORD:
while (defined(my $aline = <$in>)) {
    # Advance to the next "score <number>" line; stop at end of file.
    until ($aline =~ /^score\s+[-\d\.]+/) {
        $aline = <$in>;
        last RECORD if !defined $aline or eof($in);
    }
    last RECORD if eof($in);

    ########## miRNA ID ################
    $novel++;

    ########## annotate, mature, star, loop, pre ####
    my %field;
    for my $key (@record_keys) {
        $field{$key} = read_field($in, $key);
    }

    print {$list_fh} "miR-c-$novel\t$field{pri_id}\tmature:$field{mature_beg}:$field{mature_end}\tstar:$field{star_beg}:$field{star_end}\t";
    print {$fa_fh}  ">miR-c-$novel\n$field{mature_seq}\n";
    print {$pre_fh} ">miR-c-$novel\n$field{pri_seq}\n";

    ########## reads count #############
    # The line directly after star_seq is skipped, exactly as before.
    my $skipped = <$in>;

    my (@total, @mature, @star);
    for (my $read = <$in>; defined $read; $read = <$in>) {
        chomp $read;
        last if length($read) < 1;          # a blank line ends the read list

        my @cols = split /\t/, $read;

        # Column 6 holds the mapped interval as "<beg>..<end>".
        unless (defined $cols[5] and $cols[5] =~ /(\d+)\.\.(\d+)/) {
            warn "skipping read line without positions (input line $.)\n";
            next;
        }
        my ($beg, $end) = ($1, $2);

        # The read id ends in ":<n1>_<n2>_..._x<total>": one count per sample.
        unless ($cols[0] =~ /:([\d_]+)_x\d+$/) {
            warn "skipping read line with unexpected id '$cols[0]' (input line $.)\n";
            next;
        }
        my @per_sample = split /_/, $1;

        for my $i (0 .. $#per_sample) {
            $mature[$i] = 0 unless defined $mature[$i];
            $star[$i]   = 0 unless defined $star[$i];
            $total[$i] += $per_sample[$i];
        }

        # A read counts for the mature (or star) strand when its start is
        # within 3 nt and its end within 5 nt of the annotated positions.
        if (abs($field{mature_end} - $end) <= 5 and abs($beg - $field{mature_beg}) <= 3) {
            $mature[$_] += $per_sample[$_] for 0 .. $#per_sample;
        }
        if (abs($field{star_end} - $end) <= 5 and abs($beg - $field{star_beg}) <= 3) {
            $star[$_] += $per_sample[$_] for 0 .. $#per_sample;
        }
    }

    print {$list_fh} join("\t", join("\t", @mature), join("\t", @star), join("\t", @total)), "\n";
}

close $in;
close $list_fh or die "can not close $fileout: $!\n";
close $fa_fh   or die "can not close $out: $!\n";
close $pre_fh  or die "can not close $preout: $!\n";

# read_field($fh, $key)
# Reads forward on $fh until a line containing $key is found and returns the
# second tab-separated column of that line.  Dies when the input ends first,
# instead of looping forever on a truncated record.
sub read_field {
    my ($fh, $key) = @_;
    while (defined(my $line = <$fh>)) {
        next unless $line =~ /\Q$key\E/;
        chomp $line;
        my @cols = split /\t/, $line;
        return $cols[1];
    }
    die "unexpected end of input while looking for '$key'\n";
}

# Prints the help text and exits with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -list -fa -pre -tag
options:
-i input file,predictions file
-list output file miRNA list file
-fa output file ,miRNA sequence fasta file.
-pre output file, miRNA precursor fasta file.
-tag string, sample names# eg: samA,samB,samC
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
# microRNA.pl (added in changeset 0:87fe81de0931)
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-4-22
#Modified:
#Description: plant microRNA prediction
#
# Pipeline driver.  Steps, in order:
#   1. read the sample config (-i) and sanity-check the FASTA inputs
#   2. optionally quantify known microRNAs (quantify.pl) when -pre and -mat
#      are both given
#   3. map the remaining reads to the genome (matching.pl)
#   4. excise and fold precursors, map reads to them with bowtie, run
#      miRDeep_plant.pl and summarise the novel predictions
# All helper scripts are looked up under -path.
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use threads;
#use threads::shared;
use File::Path;
use File::Basename;
use Cwd qw(getcwd);
#use RNA;
#use Term::ANSIColor;

# GetOptions removes the options it parses from @ARGV, so keep a copy for
# the "Command line" log message below.
my @command_line = @ARGV;

my %opts;
GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");
# -fa is required as well: the read file is passed to every later step.
if (!(defined $opts{i} and defined $opts{gfa} and defined $opts{fa}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time=Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @command_line\n";

my $mypath=getcwd();

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/";

unless ($dir=~/\/$/) {$dir.="/";}
if (not -d $dir) {
    mkpath($dir);       # creates missing parents too; croaks on failure
}
my $config=$opts{'i'};
my $data=$opts{'fa'};

my $script_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $t          = defined $opts{'t'}   ? $opts{'t'}   : 1;   # threads number
my $mis        = defined $opts{'mis'} ? $opts{'mis'} : 0;   # mismatch number for microRNA
my $hit        = defined $opts{'r'}   ? $opts{'r'}   : 25;  # maximum reads mapping hits in genome
my $upstream   = defined $opts{'e'}   ? $opts{'e'}   : 2;   # microRNA 5' extension
my $downstream = defined $opts{'f'}   ? $opts{'f'}   : 5;   # microRNA 3' extension

my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200;
my $flank=defined $opts{'flank'} ? $opts{'flank'} :10;
my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20;

$time=Time();
print "$time, Checking input file!\n";

my (@filein,@mark);
read_config();

checkfa($opts{pre}) if(defined $opts{pre});
checkfa($opts{mat}) if(defined $opts{mat});
checkfa($opts{gfa});

chdir $dir or die "Can not change to directory $dir: $!\n";
my $data2=$data;
my $known_result=$dir."known_miRNA_Express";
if(defined $opts{pre} and defined $opts{mat}){
    quantify(); ### known microRAN quantify
    $data2=$known_result."/mirbase_not_mapped.fa";
}

my $genome_map=$dir."genome_match";
genome($data2);

my $mapfile=$genome_map."/genome_mapped.bwt";
my $mapfa=$genome_map."/genome_mapped.fa";

predict($mapfa);

$time=Time();
print "$time: Program end!!\n";

############################## sub programs ###################################

# run_cmd($cmd)
# Runs $cmd through the shell (redirections in the command strings rely on
# it) and warns when it could not be started or exits non-zero.  Execution
# continues afterwards, as it always did; the wait status is returned.
sub run_cmd{
    my ($cmd)=@_;
    my $status=system($cmd);
    if ($status == -1) {
        warn "Failed to start command: $cmd ($!)\n";
    }
    elsif ($status != 0) {
        warn "Command exited with status ".($status >> 8).": $cmd\n";
    }
    return $status;
}

# predict($file)
# Novel microRNA prediction for the genome-mapped reads in FASTA file $file.
# Intermediate files go to <dir>/Novel_miRNA_predict, final results to <dir>.
sub predict{
    my ($file)=@_;
    $time=Time();
    print "$time: Novel microRNA prediction!\n\n";
    my $predict=$dir."Novel_miRNA_predict";
    mkdir $predict unless -d $predict;
    chdir $predict or die "Can not change to directory $predict: $!\n";

    run_cmd("perl $script_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");
    run_cmd("bowtie-build -f excised_precursor.fa excised_precursor");
    run_cmd("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");
    run_cmd("perl $script_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");
    run_cmd("sort -k 4 precursor_mapped.bst > signatures.bst");

    chdir $dir or die "Can not change to directory $dir: $!\n";
    run_cmd("perl $script_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");
    #system("rm novel_tmp_dir -rf");

    my $tag=join "," ,@mark;
    run_cmd("perl $script_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");
    run_cmd("perl $script_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");
    return;
}

# genome($file)
# Maps the reads in $file to the genome with matching.pl; a prebuilt bowtie
# index prefix is passed on when -idx was given.
sub genome{
    my ($file)=@_;
    my $cmd="perl $script_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ";
    $cmd.="-index $opts{idx} " if(defined $opts{'idx'});
    run_cmd($cmd);
    return;
}

# quantify()
# Quantifies known microRNAs with quantify.pl.  Sample names are joined with
# an escaped ';' so the shell passes them through as a single argument.
sub quantify{
    my $tag=join "\\;" ,@mark;
    run_cmd("perl $script_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");
    print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
    return;
}

# read_config()
# Fills @filein and @mark from the tab-separated config file: column 1 is a
# file name, column 2 the sample name.  Blank lines are ignored; a line that
# lacks either column is fatal.  (The former "@filein != @mark" test could
# never fail because both arrays were pushed once per line.)
sub read_config{
    open my $con,'<',$config or die "Can not open config file $config: $!\n";
    while (my $aline=<$con>) {
        chomp $aline;
        next if($aline=~/^\s*$/);
        my @tmp=split/\t/,$aline;
        unless (defined $tmp[0] and length $tmp[0] and defined $tmp[1] and length $tmp[1]) {
            warn "Line $. of $config does not have two tab-separated columns\n";
            die "Maybe config file have some wrong!!!\n";
        }
        push @filein,$tmp[0];
        push @mark,$tmp[1];
    }
    close $con;
    return;
}

# checkfa($file_reads)
# Minimal FASTA sanity check: the first line must be a '>identifier' header
# and the second line, when present, may only contain ACGTN (either case).
sub checkfa{
    my ($file_reads)=@_;
    open my $fh,'<',$file_reads or die "Can not open $file_reads: $!\n";
    my $line=<$fh>;
    if(!defined $line or $line !~ /^>\S+/){
        die "The first line of file $file_reads does not start with '>identifier'\nReads file $file_reads is not a valid fasta file\n\n";
    }
    my $seq=<$fh>;
    # A header-only file is still accepted, as it was before.
    if(defined $seq and $seq !~ /^[ACGTNacgtn]*$/){
        die "File $file_reads contains not allowed characters in sequences\nAllowed characters are ACGTN\nReads file $file_reads is not a fasta file\n\n";
    }
    close $fh;
    return;
}

# search($dir, $str)
# Returns the single entry of directory $dir whose name matches the pattern
# $str; dies unless exactly one entry matches.  Not called by this script.
sub search{
    my ($dir,$str)=@_;
    opendir my $dh,$dir or die "Can not open directory $dir: $!\n";
    my @ret=grep { /$str/ } readdir $dh;
    closedir $dh;
    if (@ret != 1) {
        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}

# Time()
# Returns the current local time as "YYYY-MM-DD hh:mm:ss".
sub Time{
    my ($sec,$min,$hour,$day,$month,$year) = (localtime(time()))[0,1,2,3,4,5];
    return sprintf("%d-%02d-%02d %02d:%02d:%02d",$year+1900,$month+1,$day,$hour,$min,$sec);
}

# Prints the help text and exits with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:

$0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path
options:
-i input files, # config

-fa ,#fasta sequence file

-path scirpt path

-gfa string, input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-pre string, input file #species specific microRNA precursor sequences
-mat string, input file #species specific microRNA mature sequences

-mis [int] number of allowed mismatches when mapping reads to precursors, default 0
-e [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f [int] number of nucleotides downstream of the mature sequence to consider, default 5
-r int a read is allowed to map up to this number of positions in the genome,default is 25

-dis <int> Maximal space between miRNA and miRNA* (200)
-flank <int> Flank sequence length of miRNA precursor (10)
-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)

-t int, number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}
<!-- microRNA.xml (added in changeset 0:87fe81de0931): Galaxy tool wrapper that runs microRNA.pl -->
<!-- The root attribute was misspelled "veision"; it is now "version" so the tool version is declared. -->
<tool id="micrornas_v1" name="microRNA" version="1.0.0">
    <description>Program for plant microRNA analysis(miRNA only) </description>

    <requirements>
        <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
        <requirement type="package" version="0.12.7">bowtie</requirement>
        <requirement type="set_environment">SCRIPT_PATH</requirement>
        <!--requirement type="package" version="3.0.1">R</requirement!-->
        <requirement type="package" version="2.59">SVG</requirement>
        <requirement type="package" version="2.1.8">ViennaRNA</requirement>
    </requirements>

    <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->

    <!-- No -o is passed, so microRNA.pl writes to its default "miRNA_out/" under the
         working directory; the from_work_dir paths in <outputs> point there. -->
    <command interpreter="perl">microRNA.pl
        ## Change this to accommodate the number of threads you have available.
        -t \${GALAXY_SLOTS:-4}
        -path \$SCRIPT_PATH


        ## Do or not annotate known microRNAs
        #if $mirbase.known_microRNA == "yes":
            -pre $mirbase.pre -mat $mirbase.mat
        #end if

        ## prepare bowtie index
        #set index_path = ''
        #if str($reference_genome.source) == "history":
            ### bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
            #set index_path = $reference_genome.own_file
            -gfa $index_path
        #else:
            #set index_path = $reference_genome.index.fields.path
            -gfa ${index_path}.fa -idx $index_path
        #end if


        -mis $mismatch -i $config -fa $reads -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log
    </command>

    <inputs>
        <param name="config" type="data" label="Raw data configs file" />
        <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />

        <!-- reference genome -->
        <conditional name="reference_genome">
            <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
                <option value="indexed">Use a built-in index</option>
                <option value="history">Use one from the history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
                    <options from_data_table="bowtie_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
            </when>
        </conditional>

        <!-- known microRNA annotation is optional; the "no" branch takes no extra inputs -->
        <conditional name="mirbase">
            <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)">
                <option value="yes" selected="true">yes</option>
                <option value="no">no</option>
            </param>
            <when value="yes">
                <param name="mat" type="data" label="mature microRNA sequence file" />
                <param name="pre" type="data" label="precursor microRNA sequence fie" />
            </when>
            <when value="no"/>
        </conditional> <!-- params -->

        <!-- NOTE(review): the default of -30 for "mfe" differs from the -20 that
             microRNA.pl uses when -mfe is omitted; confirm which one is intended. -->
        <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
        <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />
        <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />
        <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
        <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />
        <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />
        <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" />

    </inputs>

    <!-- NOTE(review): the output names below contain spaces; they are kept unchanged so
         existing references keep working. Confirm that the Galaxy version in use accepts them. -->
    <outputs>
        <data format="txt" name="known microRNA express list" from_work_dir="miRNA_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">
            <filter>(mirbase['known_microRNA'] == 'yes')</filter>
        </data>
        <data format="txt" name="known microRNA express alignment" from_work_dir="miRNA_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">
            <filter>(mirbase['known_microRNA'] == 'yes')</filter>
        </data>
        <data format="txt" name="known microRNA moRs result" from_work_dir="miRNA_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">
            <filter>(mirbase['known_microRNA'] == 'yes')</filter>
        </data>
        <data format="fasta" name="known microRNA precursor file" from_work_dir="miRNA_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">
            <filter>(mirbase['known_microRNA'] == 'yes')</filter>
        </data>
        <data format="fasta" name="known microRNA mature file" from_work_dir="miRNA_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">
            <filter>(mirbase['known_microRNA'] == 'yes')</filter>
        </data>
        <data format="txt" name="novel microRNA express list" from_work_dir="miRNA_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>
        <data format="fasta" name="novel microRNA precursor file" from_work_dir="miRNA_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>
        <data format="fasta" name="novel microRNA mature sequence file" from_work_dir="miRNA_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>
        <data format="fasta" name="non-microRNA sequence FASTA file" from_work_dir="miRNA_out/non_microRNA_sequence.fa" label="${tool.name} on ${on_string}: Sequence FASTA file of non-microRNA tags"/>

    </outputs>

    <help>

    </help>
</tool>
#!/usr/bin/perl -w
#Filename: microRNA_pipeline.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-4-22
#Modified:
#Description: plant microRNA prediction
#
# Pipeline driver.  In order it: clips adapters from every raw input file,
# collapses reads into tags, filters tags by length, quantifies known
# microRNAs, optionally maps the remaining tags against Rfam, maps tags to the
# genome and predicts novel microRNAs.  Each step is delegated to a helper
# script located in the -path directory or to an external program
# (fastx_clipper, bowtie-build, bowtie, sort).

use strict;
use warnings;
use Getopt::Long;
use Cwd qw(getcwd);
use File::Spec;
use File::Path;
use File::Basename;
# "threads" is deliberately NOT loaded here: a compile-time "use threads" dies
# on a perl built without thread support, which would make the non-threaded
# fallback in the preprocessing step unreachable.  It is loaded at run time.
#use threads::shared;
#use RNA;
#use Term::ANSIColor;

my $version=1.00;

my %opts;
GetOptions(\%opts,"i:s@","tag:s@","phred:i","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time = Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @ARGV\n";

my $format = $opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

# Quality offset handed to fastx_clipper (-Q) for fastq input.
my $phred_qv = defined $opts{'phred'} ? $opts{'phred'} : 64;

# Every input file needs exactly one sample tag: tags name the intermediate
# files and the expression columns.
my @inputfiles = @{ $opts{'i'} };
my @inputtags  = @{ $opts{'tag'} || [] };
if (@inputfiles != @inputtags) {
    die "Parameter \"-tag\" must be given exactly once for every \"-i\" input file\n";
}

my $mypath = getcwd();

# The script chdir()s later on, so the output directory must be absolute.
my $dir = defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/";
$dir = File::Spec->rel2abs($dir) unless File::Spec->file_name_is_absolute($dir);
$dir .= "/" unless $dir =~ /\/$/;
mkpath($dir) unless -d $dir;

# Record the input-file / tag pairs.
my $config = $dir."/input_config";
open my $config_fh, '>', $config or die "Can not write $config: $!\n";
for my $i (0 .. $#inputfiles) {
    print {$config_fh} $inputfiles[$i], "\t", $inputtags[$i], "\n";
}
close $config_fh or die "Can not close $config: $!\n";

my $script_path = defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $adapter     = defined $opts{'a'}     ? $opts{'a'}     : "ATCTCGTATG"; # adapter
my $min_adapter = defined $opts{'M'}     ? $opts{'M'}     : 6;            # adapter minimum mapped nt
my $t           = defined $opts{'t'}     ? $opts{'t'}     : 1;            # threads number
my $min_nt      = defined $opts{'min'}   ? $opts{'min'}   : 19;           # minimum reads length
my $max_nt      = defined $opts{'max'}   ? $opts{'max'}   : 28;           # maximum reads length
my $mis         = defined $opts{'mis'}   ? $opts{'mis'}   : 0;            # mismatch number for microRNA
my $mis_rfam    = defined $opts{'v'}     ? $opts{'v'}     : 0;            # mismatch number for rfam
my $hit         = defined $opts{'r'}     ? $opts{'r'}     : 25;           # maximum reads mapping hits in genome
my $upstream    = defined $opts{'e'}     ? $opts{'e'}     : 2;            # microRNA 5' extension
my $downstream  = defined $opts{'f'}     ? $opts{'f'}     : 5;            # microRNA 3' extension
my $maxd        = defined $opts{'dis'}   ? $opts{'dis'}   : 200;
my $flank       = defined $opts{'flank'} ? $opts{'flank'} : 10;
my $mfe         = defined $opts{'mfe'}   ? $opts{'mfe'}   : -20;

# A thread count below 1 would make the batch loop below spin forever.
$t = 1 if $t < 1;

$time = Time();
print "$time, Checking input file!\n";

my (@filein, @mark, @clean);
#read_config();
@filein = @inputfiles;
@mark   = @inputtags;

checkfa($opts{pre});
checkfa($opts{mat});
checkfa($opts{gfa});


##### clip adpter --> clean data start
$time = Time();
print "$time, Preprocess:\n trim adapter, reads collapse and filter reads by length.\n";

my $preprocess = $dir."preProcess/";
mkpath($preprocess) unless -d $preprocess;

my $can_use_threads = eval { require threads; 1 };
if ($can_use_threads) {
    # Clip the input files in batches of at most $t concurrent threads.
    print "Do processing using threads\n";
    my @pending_files = @filein;
    my @pending_tags  = @mark;
    while (@pending_files) {
        my @workers;
        for (1 .. $t) {
            last unless @pending_files;
            my $in  = shift @pending_files;
            my $out = shift @pending_tags;
            push @clean, $preprocess.$out."_clips_adapter.fq";
            push @workers, threads->create(\&clips, $in, $out);
        }
        $_->join() for @workers;
    }
}
else {
    # No thread support: clip the input files one after another.
    print "Do not processing using threads\n";
    for my $i (0 .. $#filein) {
        push @clean, $preprocess.$mark[$i]."_clips_adapter.fq";
        clips($filein[$i], $mark[$i]);
    }
}
##### clip adpter --> clean data end

my $collapsed = $preprocess."collapse_reads.fa";
my $data = $preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
my $data2; ### mirbase not mapped reads
my $data3; ### rfam not mapped reads
collapse(\@clean, $collapsed); #collapse reads to tags

filterbylength(); # filter <$min_nt && >$max_nt

print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n";

$time = Time();
print "$time: known microRNA quantify!\n\n";

chdir $dir or die "Can not change directory to $dir: $!\n";

my $known_result = $dir."known_miRNA_Express/";
quantify(); ### known microRNA quantify

$data2 = $known_result."/mirbase_not_mapped.fa";

# path.txt lists the result directories, one per line, for html_miRPlant.pl.
my $pathfile = "$dir/path.txt";
open my $path_fh, '>', $pathfile or die "Can not write $pathfile: $!\n";
print {$path_fh} "$config\n";
print {$path_fh} "$preprocess\n";
print {$path_fh} "$known_result\n";

if (defined $opts{'rfam'}) { #rfam mapping and analysis
    $time = Time();
    print "$time: RNA annotate!\n\n";
    my $rfam_exp_dir = $dir."rfam_match";
    rfam();
    $data3 = $rfam_exp_dir."/rfam_not_mapped.fa";
    print {$path_fh} "$rfam_exp_dir\n";

    run_cmd(['perl', "$script_path/count_rfam_express.pl",
             '-i', "$rfam_exp_dir/rfam_mapped.bwt",
             '-tag', join(';', @mark),
             '-o', 'rfam_non-miRNA_annotation.txt']);
}

# -D discards rfam-mapped reads, which is only possible when rfam mapping
# actually ran; otherwise continue with the reads not mapped to miRBase.
my $data4 = (defined $opts{'D'} && defined $data3) ? $data3 : $data2;

$time = Time();
print "$time: Genome alignment!\n\n";
my $genome_map = $dir."genome_match";
genome($data4);
print {$path_fh} "$genome_map\n";
my $mapfile = $genome_map."/genome_mapped.bwt";
my $mapfa   = $genome_map."/genome_mapped.fa";

predict($mapfa);

close $path_fh or die "Can not close $pathfile: $!\n";
run_cmd(['perl', "$script_path/html_miRPlant.pl", '-i', $pathfile, '-format', $format, '-o', "$dir/result.html"]);

$time = Time();
print "$time: Program end!!\n";

############################## sub programs ###################################

# shell_quote($arg): wrap one argument in single quotes so that the shell
# passes it on unchanged (spaces, ';', '$' ... included).
sub shell_quote {
    my ($arg) = @_;
    $arg = '' unless defined $arg;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}

# run_cmd(\@argv, $redirect): run a command through the shell.
#   \@argv    - program and arguments; each one is quoted individually
#   $redirect - optional, trusted shell redirection suffix (e.g. "> out.txt")
# Returns the raw status of system().  A failure is reported with a warning
# rather than aborting, because the original pipeline never stopped on a
# failing step.
sub run_cmd {
    my ($argv, $redirect) = @_;
    my $cmd = join ' ', map { shell_quote($_) } @{$argv};
    $cmd .= " $redirect" if defined $redirect;
    my $status = system($cmd);
    if ($status != 0) {
        warn "Warning: command returned non-zero status $status: $cmd\n";
    }
    return $status;
}

# predict($file): novel microRNA prediction for the genome-mapped tags in
# $file.  Works inside <output dir>/novel_miRNA_predict and leaves the final
# tables in the output directory.
sub predict {
    my ($file) = @_;
    my $now = Time();
    print "$now: Novel microRNA prediction!\n\n";
    my $predict = $dir."novel_miRNA_predict";
    print {$path_fh} "$predict\n";
    mkpath($predict) unless -d $predict;
    chdir $predict or die "Can not change directory to $predict: $!\n";

    run_cmd(['perl', "$script_path/precursors.pl", '-map', $mapfile, '-g', $opts{gfa},
             '-d', $maxd, '-f', $flank,
             '-o', "$predict/excised_precursor.fa",
             '-s', "$predict/excised_precursor_struc.txt",
             '-e', $mfe]);

    run_cmd(['bowtie-build', '-f', 'excised_precursor.fa', 'excised_precursor']);

    run_cmd(['bowtie', '-v', $mis, '-f', '-p', $t, '-m', $hit, '-a', '--best', '--strata',
             'excised_precursor', $file],
            '> precursor_mapped.bwt 2> run.log');

    run_cmd(['perl', "$script_path/convert_bowtie_to_blast.pl",
             'precursor_mapped.bwt', $file, 'excised_precursor.fa'],
            '> precursor_mapped.bst');

    run_cmd(['sort', '-k', '4', 'precursor_mapped.bst'], '> signatures.bst');

    chdir $dir or die "Can not change directory to $dir: $!\n";
    run_cmd(['perl', "$script_path/miRDeep_plant.pl", "$predict/signatures.bst",
             "$predict/excised_precursor_struc.txt", 'novel_tmp_dir', '-y'],
            '> microRNA_prediction.mrd');
    #system("rm novel_tmp_dir -rf");

    run_cmd(['perl', "$script_path/miRNA_Express_and_sequence.pl",
             '-i', 'microRNA_prediction.mrd',
             '-list', 'novel_microRNA_express.txt',
             '-fa', 'novel_microRNA_mature.fa',
             '-pre', 'novel_microRNA_precursor.fa',
             '-tag', join(',', @mark)]);
    return;
}

# genome($file): map the tags in $file to the genome with matching.pl, using
# the prebuilt bowtie index when -idx was given.
sub genome {
    my ($file) = @_;
    my @cmd = ('perl', "$script_path/matching.pl", '-i', $file, '-g', $opts{gfa},
               '-v', $mis, '-p', $t, '-r', $hit, '-o', $dir);
    push @cmd, '-index', $opts{idx} if defined $opts{'idx'} && length $opts{'idx'};
    run_cmd(\@cmd);
    return;
}

# rfam(): map the reads not mapped to miRBase ($data2) against the Rfam
# reference with rfam.pl, using the prebuilt index when -idx2 was given.
sub rfam {
    my @cmd = ('perl', "$script_path/rfam.pl", '-i', $data2, '-ref', $opts{rfam},
               '-v', $mis_rfam, '-p', $t, '-o', $dir);
    push @cmd, '-index', $opts{idx2} if defined $opts{'idx2'} && length $opts{'idx2'};
    run_cmd(\@cmd);
    return;
}

# quantify(): known microRNA quantification with quantify.pl.
sub quantify {
    run_cmd(['perl', "$script_path/quantify.pl", '-p', $opts{pre}, '-m', $opts{mat},
             '-r', $data, '-o', $dir, '-mis', $mis, '-t', $t,
             '-e', $upstream, '-f', $downstream,
             '-tag', join(';', @mark)]);
    my $tag = join "\\;", @mark;   # log line keeps the historical escaped form
    print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
    return;
}

# filterbylength(): keep collapsed tags of $min_nt..$max_nt nt and render the
# read length distribution page.
sub filterbylength {
    run_cmd(['perl', "$script_path/filterReadsByLength.pl", '-i', $collapsed, '-o', $data,
             '-min', $min_nt, '-max', $max_nt, '-mark', join(',', @mark)]);
    run_cmd(['perl', "$script_path/Length_Distibution.pl",
             '-i', "$preprocess/reads_length_distribution.txt",
             '-o', "$preprocess/length.html"]);
    return;
}

# collapse(\@files, $out_file): collapse the clipped read files into one tag
# file with collapseReads2Tags.pl.
sub collapse {
    my ($ins, $out_file) = @_;
    my @cmd = ('perl', "$script_path/collapseReads2Tags.pl");
    push @cmd, '-i', $_ for @{$ins};
    push @cmd, '-mark', 'seq', '-o', $out_file, '-format', $format;
    run_cmd(\@cmd);
    return;
}

# clips($in, $out): clip the adapter from raw file $in with fastx_clipper and
# write <preProcess>/<$out>_clips_adapter.fq.  -Q is passed for fastq only.
sub clips {
    my ($in, $out) = @_;
    my $clipped = $preprocess.$out."_clips_adapter.fq";
    if ($format eq "fq" || $format eq "fastq") {
        run_cmd(['fastx_clipper', '-a', $adapter, '-M', $min_adapter, '-Q', $phred_qv, '-i', $in, '-o', $clipped]);
        print "\nfastx_clipper -a $adapter -M $min_adapter -Q $phred_qv -i $in -o $clipped\n";
    }
    else {
        run_cmd(['fastx_clipper', '-a', $adapter, '-M', $min_adapter, '-i', $in, '-o', $clipped]);
    }
    return;
}

# read_config(): load file/tag pairs from $config into @filein/@mark and
# validate every raw file.  Currently not called by the main program.
sub read_config {
    open my $con_fh, '<', $config or die "Can not open $config: $!\n";
    while (my $aline = <$con_fh>) {
        chomp $aline;
        my @tmp = split /\t/, $aline;
        push @filein, $tmp[0];
        push @mark, $tmp[1];
        check_rawdata($tmp[0]);
    }
    close $con_fh;
    if (@filein != @mark) {
        die "Maybe config file have some wrong!!!\n";
    }
    return;
}

# check_rawdata($file): die unless $file exists, is non-empty and looks like
# the format selected with -format.
sub check_rawdata {
    my ($fileforcheck) = @_;
    if (!(-s $fileforcheck)) {
        die "Can not find $fileforcheck, or file is empty!!!\n";
    }
    if ($format eq "fasta" || $format eq "fa") {
        checkfa($fileforcheck);
    }
    if ($format eq "fastq" || $format eq "fq") {
        checkfq($fileforcheck);
    }
    return;
}

# checkfa($file): sanity-check the first record of a fasta file; dies when the
# file can not be opened, the first line is not a '>' header, or the second
# line holds characters other than ACGTN (either case).
sub checkfa {
    my ($file_reads) = @_;
    open my $fh, '<', $file_reads or die "Can not open $file_reads: $!\n";
    my $header   = <$fh>;
    my $sequence = <$fh>;
    close $fh;
    if (!defined $header || $header !~ /^>\S+/) {
        die "The first line of file $file_reads does not start with '>identifier'
Reads file $file_reads is not a valid fasta file\n\n";
    }
    if (defined $sequence && $sequence !~ /^[ACGTNacgtn]*$/) {
        die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
    }
    return;
}

# checkfq($file): sanity-check up to the first 10 records of a fastq file and
# switch $phred_qv to 33 when a quality character below '@' (ASCII 64) occurs.
sub checkfq {
    my ($file_reads) = @_;

    open my $fh, '<', $file_reads or die "Can not open $file_reads: $!\n";
    for (1 .. 10) {
        my $head = <$fh>;
        last unless defined $head;          # file has fewer than 10 records
        my $seq  = <$fh>;
        my $sep  = <$fh>;
        my $qual = <$fh>;
        if (!defined $qual) {               # truncated record
            die "$file_reads is not a fastq file\n\n";
        }
        chomp($head, $seq, $sep, $qual);
        if ($head !~ /^\@/) {
            die "$file_reads is not a fastq file\n\n";
        }
        if ($seq !~ /^[ACGTNacgtn]*$/) {
            die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
        }
        if ($sep !~ /^\@/ && $sep !~ /^\+/) {
            die "$file_reads is not a fastq file\n\n";
        }
        if ((length $seq) != (length $qual)) {
            die "$file_reads is not a fastq file\n\n";
        }
        for my $char (split //, $qual) {
            $phred_qv = 33 if ord($char) - 64 < 0;
        }
    }
    close $fh;
    return;
}

# search($dir, $str): return the single entry of directory $dir whose name
# matches the pattern $str; dies when there is not exactly one such entry.
sub search {
    my ($search_dir, $str) = @_;
    opendir my $dh, $search_dir or die "Can not open directory $search_dir: $!\n";
    my @ret = grep { /$str/ } readdir $dh;
    closedir $dh;
    if (@ret != 1) {
        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}

# Time(): current local time as "YYYY-MM-DD hh:mm:ss".
sub Time {
    my ($sec, $min, $hour, $day, $month, $year) = localtime(time());
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d", $year + 1900, $month + 1, $day, $hour, $min, $sec);
}

# usage(): print the help text and exit with status 1.
sub usage {
print <<"USAGE";
Version $version
Usage:

$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t -o -path
options:
-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...
-tag string # raw data file names, -tag xxx -tag xxx

-format string,#specific input rawdata file format : fastq|fq|fasta|fa

-path scirpt path

-gfa string, input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-pre string, input file #species specific microRNA precursor sequences
-mat string, input file #species specific microRNA mature sequences

-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-D If [-D] is specified,will discard rfam mapped reads(nead -rfam).

-a string, ADAPTER string. default is ATCTCGTATG.
-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it.
-min int, reads min length,default is 19.
-max int, reads max length,default is 28.

-mis [int] number of allowed mismatches when mapping reads to precursors, default 0
-e [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f [int] number of nucleotides downstream of the mature sequence to consider, default 5
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment
-r int a read is allowed to map up to this number of positions in the genome,default is 25

-dis <int> Maximal space between miRNA and miRNA* (200)
-flank <int> Flank sequence length of miRNA precursor (10)
-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)

-t int, number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}
<!--
  microRNA_pipeline.xml : Galaxy tool wrapper around microRNA_pipeline.pl.

  The command block is a Cheetah template: lines starting with "##" are
  template comments, lines starting with "#for/#if/#set/#else/#end" are
  template directives, and "\$" keeps a literal "$" for the shell.
-->
<tool id="plant_microRNA_v1" name="microRNA_pipeline" version="1.0.0">
  <description>Program for plant microRNA analysis (rawdata preprocess -> genome mapping -> non-coding RNA(exclude miRNAs) annotation -> microRNA analysis)</description>

  <requirements>
    <requirement type="package" version="0.0.13">fastx_toolkit</requirement>
    <requirement type="package" version="0.12.7">bowtie</requirement>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <!--requirement type="package" version="3.0.1">R</requirement!-->
    <requirement type="package" version="1.96">threads</requirement>
    <requirement type="package" version="2.59">SVG</requirement>
    <!--requirement type="package" version="0.228">parent</requirement-->
    <requirement type="package" version="2.1.8">ViennaRNA</requirement>
  </requirements>

  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->

  <command interpreter="perl">microRNA_pipeline.pl
    ## Change this to accommodate the number of threads you have available.
    -t \${GALAXY_SLOTS:-4}
    -path \$SCRIPT_PATH

    #for $j, $s in enumerate( $series )
      ##rank_of_series=$j
      -i ${s.input}
      -tag ${s.tag}
    #end for

    ## prepare bowtie index
    #set index_path = ''
    #if str($reference_genome.source) == "history":
      #####bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
      #set index_path = $reference_genome.own_file
      -gfa $index_path
    #else:
      #set index_path = $reference_genome.index.fields.path
      -gfa ${index_path}.fa -idx $index_path
    #end if


    ## Do or not annotate rfam non-miRNA RNAs
    #if $params.annotate_rfam == "yes":

      ## prepare Rfam bowtie index
      #set rfam_index_path = ''
      #if str($params.reference_rfam.source) == "history":
        ######## bowtie-build "$params.reference_rfam.own_file" rfam; ln -s "$params.reference_rfam.own_file" rfam.fa;
        #set rfam_index_path = $params.reference_rfam.own_file
        -rfam $rfam_index_path
      #else:
        #set rfam_index_path = $params.reference_rfam.index.fields.path
        -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path
      #end if

      -v $params.v
      ## Do or not delete rfam mapped tags
      #if $params.delete_rfam == "yes":
        -D
      #end if
    #end if


    ## Do or not annotate known microRNAs
    #if $mirbase.known_microRNA == "yes":
      -pre $mirbase.pre -mat $mirbase.mat
    #end if


    -format $format -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log
  </command>

  <inputs>

    <!-- one entry per raw sample: the data file and the name used for it in the results -->
    <repeat name="series" title="Raw sequence data">
      <param name="input" type="data" label="Raw data"/>
      <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>
    </repeat>

    <param name="format" type="select" label="Raw data format" multiple="false">
      <option value="fastq">Raw data is fastq. format</option>
      <option value="fasta">Raw data is fasta. format</option>
    </param>

    <param name="phred" type="select" label="Input quals are Phred+64 or Phred+33" multiple="false">
      <option value="64">Phred+64</option>
      <option value="33" selected="true">Phred+33</option>
    </param>

    <!-- reference genome -->
    <conditional name="reference_genome">
      <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
          <options from_data_table="bowtie_indexes">
            <filter type="sort_by" column="2"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
      </when>
      <when value="history">
        <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
      </when>
    </conditional>

    <!-- Rfam (non-miRNA) annotation (optional) -->
    <conditional name="params">
      <param name="annotate_rfam" type="select" label="annotate rfam nocoding RNAs(excluding miRNA)">
        <option value="yes" selected="true">yes</option>
        <option value="no">no</option>
      </param>
      <when value="yes">
        <!--param name="rfam" type="data" label="rfam sequence file" /-->
        <conditional name="reference_rfam">
          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
            <option value="indexed">Use a built-in index</option>
            <option value="history">Use one from the history</option>
          </param>
          <when value="indexed">
            <param name="index" type="select" label="Select a non-coding RNA reference" help="If your reference of interest is not listed, contact the Galaxy team">
              <options from_data_table="rfam_bowtie_indexes">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
              </options>
            </param>
          </when>
          <when value="history">
            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
          </when>
        </conditional>

        <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/>

        <param name="delete_rfam" type="select" label="delete rfam mapped reads">
          <option value="yes" selected="true">yes</option>
          <option value="no">no</option>
        </param>
      </when>
      <!-- every select option needs its own branch, even when it adds no parameters -->
      <when value="no" />
    </conditional>

    <!-- known microRNA annotation (optional) -->
    <conditional name="mirbase">
      <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)">
        <option value="yes" selected="true">yes</option>
        <option value="no">no</option>
      </param>
      <when value="yes">
        <param name="mat" type="data" label="mature microRNA sequence file" />
        <param name="pre" type="data" label="precursor microRNA sequence file" />
      </when>
      <when value="no" />
    </conditional>


    <param name="a" type="text" value="TGGAATTCTCGGGTGCCAAGG" label="3' adapter sequence" />
    <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />
    <param name="min" type="integer" value="19" label="minimum microRNA length" />
    <param name="max" type="integer" value="28" label="maximum microRNA length" />
    <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to precursors" />
    <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />
    <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />
    <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
    <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />
    <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />
    <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" />
  </inputs>

  <!--
    Outputs are collected from the miRPlant_out/ working directory; the *.fa
    files are declared as fasta, the tables as txt.
    NOTE(review): the output "name" values contain spaces; confirm that the
    target Galaxy release accepts them before relying on these names elsewhere.
  -->
  <outputs>
    <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">
      <filter>(mirbase['known_microRNA'] == 'yes')</filter>
    </data>
    <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">
      <filter>(mirbase['known_microRNA'] == 'yes')</filter>
    </data>
    <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">
      <filter>(mirbase['known_microRNA'] == 'yes')</filter>
    </data>
    <data format="fasta" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">
      <filter>(mirbase['known_microRNA'] == 'yes')</filter>
    </data>
    <data format="fasta" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">
      <filter>(mirbase['known_microRNA'] == 'yes')</filter>
    </data>
    <data format="txt" name="novel microRNA express list" from_work_dir="miRPlant_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>
    <data format="fasta" name="novel microRNA precursor file" from_work_dir="miRPlant_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>
    <data format="fasta" name="novel microRNA mature sequence file" from_work_dir="miRPlant_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>
    <data format="html" name="analysis result" from_work_dir="miRPlant_out/result.html" label="${tool.name} on ${on_string}: analysis result"/>
  </outputs>

  <help>

  </help>
</tool>
#!/usr/bin/perl
#####################################################################################################
# Filename: nibls.pl
#
#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be grateful if you would keep these acknowledgements with it.
#
# Dan MacLean
# dan.maclean@sainsbury-laboratory.ac.uk
#
# This program is free academic software; academic and non-profit
# users may redistribute it freely.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007
# see included file GPL3.txt
#
#


###Don't forget you will need ...
#####################################################################################################
# Boost::Graph
#Copyright 2005 by David Burdick
# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm
#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
#####################################################################################################



use strict;
use warnings;
use Boost::Graph;
use Getopt::Long;


my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n";

my $gff_file ;
my $min_inc = 5;
my $clus = 0.6;
my $buff = 0;
my $output_file;
my $temp;
my $mark;

GetOptions(

    'c=f'      => \$clus,
    'm=i'      => \$min_inc,
    'f|file=s' => \$gff_file,
    'b=i'      => \$buff,
    'o=s'      => \$output_file,
    't=s'      => \$temp,
    'k=s'      => \$mark
) ;


# Both result files are written unconditionally below, so they are required
# together with the input file.
die $usage unless $gff_file && defined $output_file && defined $temp;


my $starttime = time;
warn "started $starttime\n";

## load in data
## Input columns (tab separated): chr, strand, start, end, then one count
## column per sample.  The per-sample counts plus their sum are stored as a
## ';'-joined string.
my %molecules; # stores starts and ends of srnas
open my $gff_fh, '<', $gff_file or die "Can not open $gff_file: $!\n";

while (my $entry = <$gff_fh>){

    chomp $entry;
    next if($entry=~/^\#/);
    next if($entry=~/^\s*$/);           # blank lines carry no sRNA
    my @data = split(/\t/,$entry);
    if (@data < 4) {
        warn "Skipping malformed line $.: $entry\n";
        next;
    }
    my $chr=shift @data;
    my $strand=shift @data;
    my $start=shift @data;
    my $end=shift @data;

    my $total = 0;
    $total += $_ for @data;
    push @data,$total;

    $molecules{$chr}{$start}{$end}{$strand} = join ";",@data; #chr#start#end#strand#add Tags information
}

close $gff_fh;

warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n";

# -k holds the sample names separated by '#'; they become quoted,
# tab-separated column headers.
my @sample = defined $mark ? split(/\#/,$mark) : ();
$mark=join"\"\t\"",@sample;
open(OUT, '>', $output_file) or die "Can not write $output_file: $!\n";
print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
open(CLUSTER, '>', $temp) or die "Can not write $temp: $!\n";
print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";

foreach my $chromosome (keys %molecules){
    my $g = Boost::Graph->new(directed=>0);
    my @starts = sort {$a <=> $b} keys(%{$molecules{$chromosome}});

    ## work from left most sRNA to right most, add to graph if they close enough
    ## (defined() so that a start coordinate of 0 does not end the loop early)
    while (defined(my $srna_start = shift @starts)){

        foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){

            ###use new graph if the next srna is too far away from this one..
            if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){
                ##dump the info from the old graph
                _dump_if_clustered($g);
                $g = Boost::Graph->new(directed=>0);
            }

            my $this_node = $chromosome . ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA into a node name

            ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below
            foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){
                next if $e eq $srna_end;
                my $en = $chromosome. ':' . $srna_start . ':' . $e;
                $g->add_edge(node1=>"$this_node", node2=>"$en", weight=>'1');
            }

            ##build graph of overlaps; @starts is sorted, so stop at the first
            ##start that lies beyond the inclusion distance
            foreach my $start (@starts){
                last if $start - $min_inc > $srna_end;
                foreach my $end (keys %{$molecules{$chromosome}{$start}}){
                    my $end_node = $chromosome . ':' . $start . ':' . $end;
                    $g->add_edge(node1=>"$this_node", node2=>"$end_node", weight=>'1');
                }
            }
        }

        if (!(defined $starts[0])) {
            ##dump the info from the last graph
            _dump_if_clustered($g);
        }
    }
}

warn "Loci printed\nFinished\n";

my $endtime = time;

my $elapsed = $endtime - $starttime;

warn "Time elapsed = $elapsed s\n";
close OUT;
close CLUSTER;
#########################################################################################
# get_cc($graph): mean local clustering coefficient of an undirected graph.
# For every node: n = number of neighbours, k = n(n-1)/2 possible connections
# between them, e = connections that actually exist; the node scores e/k, or 0
# when k < 1.  Returns the average over all nodes (0 for a graph without
# nodes).
sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2

    my $graph = shift;

    my @component = @{$graph->get_nodes()}; #nodes of the graph
    return 0 unless @component;             #nothing to average over

    my @clustering_coefficients;

    foreach my $vertex (@component)
    {
        my @neighbours = @{$graph->neighbors($vertex)};

        my %edges_in_graph;                 #neighbour pairs already counted, both orders

        my $n = @neighbours;                #n = the number of neighbours
        my $k = ($n * ($n - 1))/2;          #k = total number of possible connections

        my $e = 0;                          #actual number of connections within sub-graph

        foreach my $neighbour (@neighbours)
        {
            foreach my $neighbour_2 (@neighbours)
            {
                my $edge1 = "$neighbour\t$neighbour_2";
                my $edge2 = "$neighbour_2\t$neighbour";
                next if exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2};
                if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))
                {
                    ++$e;
                    $edges_in_graph{$edge1}=1;
                    $edges_in_graph{$edge2}=1;
                }
            }
        }

        push @clustering_coefficients, ($k >= 1 ? $e / $k : '0');
    }

    my $graph_cc = 0;
    $graph_cc += $_ for @clustering_coefficients;

    return $graph_cc / scalar(@clustering_coefficients);
}

# _dump_if_clustered($graph): report the locus held in $graph through
# dump_locus() when the graph has more than two nodes and its clustering
# coefficient reaches the -c threshold.
sub _dump_if_clustered{
    my ($graph) = @_;
    return unless scalar(@{$graph->get_nodes()}) > 2;
    my $cluster_coeff = get_cc($graph);
    dump_locus($graph, $cluster_coeff) if $cluster_coeff >= $clus;
    return;
}

############################################################################################################

# Write one finished locus (graph of overlapping tags) to the two output files.
#
# Takes: $g  - graph whose node names have the form "chr:start:end"
#        $cc - clustering coefficient of the graph (accepted but not written)
# Reads the file-level %molecules table and prints to the file-level OUT and
# CLUSTER handles:
#   OUT:     chr:start-end, major length, its share, summed count per sample
#   CLUSTER: chr:start-end, major length, its share, number of tags, tag list
# where each tag is "start,end,strand,total".
sub dump_locus {

    my $g  = shift;
    my $cc = shift;
    my $chr;
    my $start;    # left-most start over all nodes
    my $end;      # right-most end over all nodes
    my @sample;   # per tag: "count1;...;total;length"
    my @tag;      # per tag: "start,end,strand,total"

    foreach my $node (@{ $g->get_nodes() }) {

        # Skip anything that is not a coordinate node instead of silently
        # reusing the captures of a previous match.
        next unless $node =~ m/^(\S+):(\d+):(\d+)$/;
        my ($c, $s, $e) = ($1, $2, $3);

        foreach my $str (keys %{ $molecules{$c}{$s}{$e} }) {
            my @data = split /;/, $molecules{$c}{$s}{$e}{$str};
            push @tag, join ',', $s, $e, $str, $data[-1];

            # Lengths above 30 nt are pooled into one class, encoded as 40.
            my $length = $e - $s + 1;
            $length = 40 if $length > 30;

            push @sample, join ';', @data, $length;
        }

        $chr   = $c;
        $start = $s if !defined $start || $s < $start;
        $end   = $e if !defined $end   || $e > $end;
    }

    return unless @sample;    # nothing to report for this graph

    my $tag        = join ';', @tag;
    my $tag_number = @tag;
    my ($max_length, $max_p, @cluster_exp) = Max_length(\@sample);
    $max_length = "\>30" if $max_length == 40;

    my $cluster_exp = join "\t", @cluster_exp;
    my $locus       = "${chr}:${start}-${end}";

    print CLUSTER "$locus\t${max_length}nt\t$max_p\t$tag_number\t$tag\n";
    print OUT "$locus\t${max_length}nt\t$max_p\t$cluster_exp", "\n";
    return;
}

# Find the read length carrying the largest share of a locus' expression.
#
# Takes: reference to a list of strings "count1;count2;...;total;length".
# Returns: (length with the largest share,
#           that share formatted with two decimals,
#           summed count per sample ...).
# Ties go to the length that sorts first as a string. A locus whose total
# expression is 0 yields a share of "0.00" instead of dividing by zero, and
# an empty list yields (undef, undef).
sub Max_length {
    my @exp = @{ $_[0] };
    return (undef, undef) unless @exp;

    my %sample_length;    # length => expression summed over its tags
    my $total_exp = 0;
    my @each;             # per-sample expression summed over all tags

    foreach my $record (@exp) {
        my @counts = split /;/, $record;
        my $length = pop @counts;
        my $total  = pop @counts;
        $sample_length{$length} += $total;
        $total_exp += $total;
        for my $j (0 .. $#counts) {
            $each[$j] += $counts[$j];
        }
    }

    # Start below any possible share so that a key is always selected.
    my $max = -1;
    my $max_key;
    foreach my $key (sort keys %sample_length) {
        my $p = $total_exp ? $sample_length{$key} / $total_exp : 0;
        if ($p > $max) {
            $max     = $p;
            $max_key = $key;
        }
        $sample_length{$key} = sprintf("%.2f", $p);
    }
    return ($max_key, $sample_length{$max_key}, @each);
}
#!/usr/bin/perl -w
# File: non_miRNA_reads.pl
# (hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000  +++ b/non_miRNA_reads.pl Sun Jan 04 02:47:25 2015 -0500)
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: write the FASTA records of -fa whose names are NOT listed in -i
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","fa=s","o=s","h");
# -fa is read unconditionally below, so it is a necessary argument as well.
if (!(defined $opts{i} and defined $opts{fa} and defined $opts{o}) || defined $opts{h}) {
    usage();
}

my $filein  = $opts{'fa'};
my $fileout = $opts{'o'};

# Load the FASTA file: name (header line without '>') => sequence.
# Sequences wrapped over several lines are joined; the input order of the
# records is remembered so the output is deterministic.
my %fa;
my @order;
open my $fa_fh, '<', $filein or die "Cannot open FASTA file '$filein': $!\n";
my $current;
while (my $aline = <$fa_fh>) {
    chomp $aline;
    next if $aline eq '';
    if ($aline =~ s/^>//) {
        $current = $aline;
        push @order, $current unless exists $fa{$current};
        $fa{$current} = '';
        next;
    }
    next unless defined $current;    # sequence text before the first header
    $fa{$current} .= $aline;
}
close $fa_fh;

# Every line of -i that starts with "<name>_x<count>" names a read to drop.
open my $id_fh, '<', $opts{i} or die "Cannot open input file '$opts{i}': $!\n";
while (my $aline = <$id_fh>) {
    chomp $aline;
    if ($aline =~ /^(\S+_x\d+)/) {
        delete $fa{$1};
    }
}
close $id_fh;

open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";
foreach my $key (@order) {
    next unless exists $fa{$key};
    print {$out_fh} ">$key\n$fa{$key}\n";
}
close $out_fh or die "Error closing '$fileout': $!\n";

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -fa -o
options:
-i input file, lines starting with a read name (name_xN) mark reads to remove
-fa input FASTA file of reads
-o output file
-h help
USAGE
exit(1);
}

#!/usr/bin/perl -w
# File: phased_siRNA.pl
# (hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000  +++ b/phased_siRNA.pl Sun Jan 04 02:47:25 2015 -0500)
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: append a phasing score column ("p-value") to a cluster table
my $version=1.00;

use strict;
use Getopt::Long;
#use Math::Cephes qw(:hypergeometrics);

my %opts;
GetOptions(\%opts,"i=s","o=s","h");
if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein  = $opts{'i'};
my $fileout = $opts{'o'};

open my $in_fh,  '<', $filein  or die "Cannot open input file '$filein': $!\n";
open my $out_fh, '>', $fileout or die "Cannot write output file '$fileout': $!\n";

# Input columns used here: 0 = "chr:start-end", 1 = major length ("21nt"),
# 4 = tag list "start,end,strand,count;start,end,strand,count;...".
while (my $aline = <$in_fh>) {
    chomp $aline;
    if ($aline =~ /^\#/) {
        print {$out_fh} $aline, "\tp-value\n";
        next;
    }
    my @tmp = split /\t/, $aline;

    # Anchor on the trailing "start-end" so chromosome names containing ':'
    # or '-' are parsed correctly; otherwise fall back to the plain split.
    my ($locus_start, $locus_end);
    if ($tmp[0] =~ /^.*:(\d+)-(\d+)$/) {
        ($locus_start, $locus_end) = ($1, $2);
    }
    else {
        my @pos = split /:|-/, $tmp[0];
        ($locus_start, $locus_end) = @pos[1, 2];
    }

    $tmp[1] =~ s/nt//;
    my $pv = phase($tmp[1], $locus_start, $locus_end, $tmp[4]);

    print {$out_fh} $aline, "\t", $pv, "\n";
}
close $in_fh;
close $out_fh or die "Error closing '$fileout': $!\n";

# Score one cluster.
#
# Takes: $tagL  - major tag length of the cluster
#        $start - cluster start, $end - cluster end
#        $tags  - "start,end,strand,count;..." list of the cluster's tags
# Only tags whose length equals $tagL are used. '+' tags are keyed by their
# start; '-' tags are keyed by end+3 so both strands share one register.
# Returns: the score from pvalue2(); 1 when $tagL is not a positive integer
# (for example the pooled ">30" class), which is what the numeric comparison
# below would produce anyway, minus the warnings.
sub phase{
    my ($tagL,$start,$end,$tags)=@_;

    return 1 unless defined $tagL && $tagL =~ /^\d+$/ && $tagL > 0;
    return 1 unless defined $tags;

    my %tag;    # register position => summed count
    foreach my $entry (split /\;/, $tags) {
        my @aa = split /\,/, $entry;
        next if($aa[1]-$aa[0]+1 != $tagL);
        $tag{$aa[0]}     += $aa[3] if($aa[2] eq "+");
        $tag{($aa[1]+3)} += $aa[3] if($aa[2] eq "-");
    }

    return pvalue2(\%tag,$tagL,$start,$end);
}

# Phasing score of a set of tag positions.
#
# Takes: $tag   - hash ref, position => count
#        $tagL  - tag length (register step)
#        $start, $end - cluster bounds; $end clips every window
# For each position a window of ten register steps is examined:
#   n = reads in the window, k = reads on in-register positions,
#   K = occupied in-register positions, max = largest single position.
# Windows with K < 3, with one position holding more than 80% of the reads,
# or with n == k are ignored. The result is the largest value of
#   ln( (10*k/(n-k) + 1) ** (K-2) )
# over the remaining windows, and never less than 1.
#
# The K and 80% filters used to `return` from inside the loop, which made the
# result depend on the (random) hash iteration order; they now skip only the
# offending window.
# NOTE(review): this changes the reported scores — confirm against the
# intended phasing definition before release.
sub pvalue2{
    my ($tag,$tagL,$start,$end)=@_;

    my $p = 1;
    # A non-positive step would never advance the register loop below.
    return $p unless defined $tagL && $tagL =~ /^\d+$/ && $tagL > 0;

    foreach my $cor (keys %{$tag}) {
        my $ss      = $cor;
        my $win_end = $cor + $tagL*10 - 1;
        my $ee      = $win_end < $end ? $win_end : $end;

        my $n   = 0;    # all reads in the window
        my $max = 0;    # largest count on a single position
        for (my $i=$ss; $i<=$ee; $i++) {
            next unless defined $$tag{$i};
            $max = $$tag{$i} if $max < $$tag{$i};
            $n  += $$tag{$i};
        }

        my $k = 0;      # reads on in-register positions
        my $K = 0;      # number of occupied in-register positions
        for (my $i=$ss; $i<=$ee; $i=$i+$tagL) {
            next unless defined $$tag{$i};
            $k += $$tag{$i};
            $K++;
        }

        next if($K<3);
        next if($max/$n>0.8);
        next if($n==$k);    # would divide by zero below

        my $pn = 10*$k/($n-$k)+1;
        $pn = $pn ** ($K-2);
        $pn = log($pn);
        $p = $pn if $p < $pn;
    }

    return $p;
}

# Older hypergeometric p-value. Not called anywhere in this script.
# It expects the keys of %$tag to be "position,strand", which is not what
# phase() builds, so it cannot be used without changing phase().
# Kept unchanged apart from formatting.
sub pvalue{
    my ($tag,$tagL,$start,$end)=@_;

    my $p=1;
    foreach my $ccs(keys %{$tag}){
        my $n=-1;
        my $k=-1;

        my ($cor, $str)=split(/,/, $ccs);
        if ($str eq "+") # small RNAs on the Watson strand
        {
            my $ss=$cor;
            my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
            for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
            {
                my $x=$i.","."+";
                if (defined $$tag{$x})
                {
                    $n=$n+1;
                }
            }
            for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
            {
                my $x=$i.","."+";
                if (defined $$tag{$x})
                {
                    $k=$k+1;
                }
            }

            for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
            {
                my $x=$j.","."-";
                if (defined $$tag{$x})
                {
                    $n=$n+1;
                }
            }

            for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
            {
                my $x=$j.","."-";
                if (defined $$tag{$x})
                {
                    $k=$k+1;
                }
            }
        }

        elsif ($str eq "-") # small RNAs on the Crick strand
        {
            my $ee=$cor;
            my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
            for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
            {
                my $x=$i.","."-";
                if (defined $$tag{$x})
                {
                    $n=$n+1;
                }
            }
            for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
            {
                my $x=$i.","."-";
                if (defined $$tag{$x})
                {
                    $k=$k+1;
                }
            }

            for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
            {
                my $x=$j.","."+";
                if (defined $$tag{$x})
                {
                    $n=$n+1;
                }
            }
            for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
            {
                my $x=$j.","."+";
                if (defined $$tag{$x})
                {
                    $k=$k+1;
                }
            }
        }

        next if($k<3);

        my $pn=0; my $N=$tagL*11*2-1; my $M=21;
        for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
        {
            my $c=1;
            my $rr=1;
            my $rw=1;

            for (my $j=0; $j<=$w-1; $j++)
            {
                $c=$c*($M-$j)/($j+1);
            }
            for (my $x=0; $x<=$n-$w-1; $x++)
            {
                $rr=$rr*($N-$M-$x)/($x+1);
            }
            for (my $y=0; $y<=$n-1; $y++)
            {
                $rw=$rw*($y+1)/($N-$y);
            }
            my $pr=$c*$rr*$rw;

            $pn=$pn+$pr;
        }

        $p=$pn<$p ? $pn :$p;

        if ($p<0.001) #select and output small RNA clusters with p<0.001
        {
            return $p;
        }

    }
    return $p;
}

# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file
-o output file
-h help
USAGE
exit(1);
}

#!/usr/bin/perl -w
# File: preProcess.pl
# (hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000  +++ b/preProcess.pl Sun Jan 04 02:47:25 2015 -0500)
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-12-2
#Modified:
#Description: RNA-seq data pre-process
#
# Pipeline driven by this script (every step is an external program):
#   1. fastx_clipper               - clip the 3' adapter from each input file
#   2. collapseReads2Tags.pl       - collapse the clipped reads into tags
#   3. filterReadsByLength.pl,
#      Length_Distibution.pl       - length filter and length report
#   4. matching.pl                 - genome alignment
#   5. rfam.pl,
#      count_rfam_express.pl       - optional, only when -rfam is given
#   6. html_preprocess.pl          - HTML summary
# External commands are started with the list form of system() so that file
# names and sample tags never pass through a shell.
my $version=1.00;

use strict;
use Getopt::Long;
use threads;
#use threads::shared;
use File::Path;
use File::Basename;
#use RNA;
#use Term::ANSIColor;

my %opts;
GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time=Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @ARGV\n";

my $format=$opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

my $phred_qv=64;
if (defined $opts{'phred'}) {$phred_qv=$opts{'phred'};}

my @inputfiles=@{$opts{'i'}};
# -tag may be absent; dereferencing undef would abort with an opaque error.
my @inputtags=defined $opts{'tag'} ? @{$opts{'tag'}} : ();
if (@inputfiles != @inputtags) {
    # Every output file name is derived from the tag, so each input needs one.
    die "Each input file (-i) needs exactly one sample name (-tag): got "
        . scalar(@inputfiles) . " file(s) and " . scalar(@inputtags) . " tag(s)\n";
}

my $mypath=`pwd`;
chomp $mypath;

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/";


unless ($dir=~/\/$/) {$dir.="/";}
if (not -d $dir) {
    mkdir $dir or die "Cannot create output directory '$dir': $!\n";
}
my $config=$dir."/input_config";
open my $config_fh, '>', $config or die "Cannot write '$config': $!\n";
for (my $i=0;$i<@inputfiles;$i++) {
    print {$config_fh} $inputfiles[$i],"\t",$inputtags[$i],"\n";
}
close $config_fh or die "Error closing '$config': $!\n";

my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

# Named $adapter_seq rather than $a: lexical $a/$b break sort comparators.
my $adapter_seq="ATCTCGTATG"; #adapter
if (defined $opts{'a'}) {$adapter_seq=$opts{'a'};}

my $m=6; #adapter minimum mapped nt
if (defined $opts{'M'}) {$m=$opts{'M'};}

my $t=1; #threads number
if (defined $opts{'t'}) {$t=$opts{'t'};}
# A batch size below 1 would never consume an input file (endless loop below).
$t=1 if $t<1;

my $min_nt=19; # minimum reads length
if (defined $opts{'min'}) {$min_nt=$opts{'min'};}

my $max_nt=28; #maximum reads length
if (defined $opts{'max'}) {$max_nt=$opts{'max'};}

my $mis=0; #mismatch number for microRNA
if (defined $opts{'mis'}) {$mis=$opts{'mis'};}

my $mis_rfam=0;# mismatch number for rfam
if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};}

my (@filein,@mark,@clean);
#read_config();
@filein=@inputfiles;
@mark=@inputtags;

checkfa($opts{gfa});


##### clip adpter --> clean data start
my $preprocess=$dir."preProcess_clean/";
if (not -d $preprocess) {
    mkdir $preprocess or die "Cannot create directory '$preprocess': $!\n";
}
my $can_use_threads = eval 'use threads; 1';
if ($can_use_threads) {
# Do processing using threads
    print "Do processing using threads\n";
    my @filein1=@filein; my @mark1=@mark;
    # Run the inputs in batches of at most $t concurrent threads.
    while (@filein1>0) {
        my @thrs;
        for (my $i=0;$i<$t ;$i++) {
            last if(@filein1==0);
            my $in=shift @filein1;
            my $out=shift @mark1;
            push @clean,$preprocess.$out."_clips_adapter.fq";
            $thrs[$i]=threads->create(\&clips,$in,$out);
        }
        $_->join() for @thrs;
    }
} else {
# Do not processing using threads
    print "Do not processing using threads\n";
    for (my $i=0;$i<@filein ;$i++) {
        my $in=$filein[$i];
        my $out=$mark[$i];
        push @clean,$preprocess.$out."_clips_adapter.fq";
        clips($in,$out);
    }
}

##### clip adpter --> clean data end

my $collapsed=$preprocess."collapse_reads.fa";
my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
collapse(\@clean,$collapsed); #collapse reads to tags

filterbylength(); # filter <$min_nt && >$max_nt

print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n";

my $clean_data=$preprocess."clean_data.fa";
# Same link as "ln -s $data $clean_data", without going through a shell.
symlink($data,$clean_data)
    or print "Warning: cannot create symbolic link '$clean_data': $!\n";

$time=Time();
print "$time: Genome alignment!\n\n";
my $genome_map=$dir."genome_match";
genome($data);
#my $genome_map=search($dir,"genome_match_");
my $mapfile=$genome_map."/genome_mapped.bwt";
my $mapfa=$genome_map."/genome_mapped.fa";
my $unmap=$genome_map."/genome_not_mapped.fa";

chdir $dir or die "Cannot change to directory '$dir': $!\n";
my $pathfile="$dir/path.txt";
open my $path_fh, '>', $pathfile or die "Cannot write '$pathfile': $!\n";
print {$path_fh} "$config\n";
print {$path_fh} "$preprocess\n";
print {$path_fh} "$genome_map\n";

if (defined $opts{'rfam'}) { #rfam mapping and analysis
    $time=Time();
    print "$time: RNA annotate!\n\n";
    $time=~s/:/-/g;
    $time=~s/ /-/g;
    my $rfam_exp_dir=$dir."rfam_match";
    rfam();
    #my $rfam_exp_dir=search($dir,"rfam_match_");
    print {$path_fh} "$rfam_exp_dir\n";

    # Sample names joined with ';' (no shell is involved, so no escaping).
    my $tag=join ";" ,@mark;
    run_cmd("perl","$scipt_path/count_rfam_express.pl","-i","$rfam_exp_dir/rfam_mapped.bwt","-tag",$tag,"-o","rfam_non-miRNA_annotation.txt");
}


close $path_fh or die "Error closing '$pathfile': $!\n";
run_cmd("perl","$scipt_path/html_preprocess.pl","-i",$pathfile,"-format",$format,"-min",$min_nt,"-max",$max_nt,"-o","$dir/preprocessResult.html");

$time=Time();
print "$time: Program end!!\n";

############################## sub programs ###################################

# Run an external command given as a list (program, arguments...) without a
# shell. A failure is reported on STDOUT and does not stop the pipeline,
# matching the previous behaviour of ignoring the exit status.
# Returns the raw status of system().
sub run_cmd{
    my @cmd=@_;
    my $status=system(@cmd);
    if ($status != 0) {
        print "Warning: command returned status $status: @cmd\n";
    }
    return $status;
}

# Align the collapsed reads in $file to the genome with matching.pl,
# passing a prebuilt index when -idx was given.
sub genome{
    my ($file)=@_;
    my @cmd=("perl","$scipt_path/matching.pl","-i",$file,"-g",$opts{gfa},"-r",1000,"-v",$mis,"-p",$t,"-o",$dir);
    if(defined $opts{'idx'} and length $opts{'idx'}){
        push @cmd,"-index",$opts{idx};
    }
    run_cmd(@cmd);
}

# Align the genome-mapped reads to the Rfam reference with rfam.pl,
# passing a prebuilt index when -idx2 was given.
sub rfam{
    my @cmd=("perl","$scipt_path/rfam.pl","-i",$mapfa,"-ref",$opts{rfam},"-v",$mis_rfam,"-p",$t,"-o",$dir);
    if (defined $opts{'idx2'} and length $opts{'idx2'}) {
        push @cmd,"-index",$opts{idx2};
    }
    run_cmd(@cmd);
}

# Keep collapsed reads of $min_nt..$max_nt and build the length report.
sub filterbylength{
    my $tmpmark=join ",", @mark;
    run_cmd("perl","$scipt_path/filterReadsByLength.pl","-i",$collapsed,"-o",$data,"-min",$min_nt,"-max",$max_nt,"-mark",$tmpmark);
    run_cmd("perl","$scipt_path/Length_Distibution.pl","-i","$preprocess/reads_length_distribution.txt","-o","$preprocess/length.html");
}

# Collapse the clipped read files (array ref $ins) into the tag file $data.
sub collapse{
    my ($ins,$data)=@_;
    my @inputs=map { ("-i",$_) } @{$ins};
    run_cmd("perl","$scipt_path/collapseReads2Tags.pl",@inputs,"-mark","seq","-o",$data,"-format",$format);
}

# Clip the adapter from raw file $in; the result is written to
# <preProcess_clean>/<$out>_clips_adapter.fq. -Q is only passed for fastq.
sub clips{
    my ($in,$out)=@_;
    my $clipped=$preprocess.$out."_clips_adapter.fq";
    if($format eq "fq" || $format eq "fastq"){
        run_cmd("fastx_clipper","-a",$adapter_seq,"-M",$m,"-Q",$phred_qv,"-i",$in,"-o",$clipped);
    }
    if($format eq "fa" || $format eq "fasta"){
        run_cmd("fastx_clipper","-a",$adapter_seq,"-M",$m,"-i",$in,"-o",$clipped);
    }
    #my $clean=$preprocess.$out."_clean.fq";
    #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt ");

    return;
}

# Load input files and sample names from the config file written above.
# Currently unused (the call near the top is commented out).
sub read_config{
    open my $con_fh, '<', $config or die "Cannot open '$config': $!\n";
    while (my $aline=<$con_fh>) {
        chomp $aline;
        my @tmp=split/\t/,$aline;
        push @filein,$tmp[0];
        push @mark,$tmp[1];
        check_rawdata($tmp[0]);
    }
    close $con_fh;
    if (@filein != @mark) {
        die "Maybe config file have some wrong!!!\n";
    }
}

# Die unless $fileforcheck exists, is non-empty and looks like -format.
sub check_rawdata{
    my ($fileforcheck)=@_;
    if (!(-s $fileforcheck)) {
        die "Can not find $fileforcheck, or file is empty!!!\n";
    }
    if ($format eq "fasta" || $format eq "fa") {
        checkfa($fileforcheck);
    }
    if ($format eq "fastq" || $format eq "fq") {
        checkfq($fileforcheck);
    }
}

# Sanity-check the first record of a FASTA file: the first line must start
# with ">identifier" and the second line may only contain ACGTN (any case).
sub checkfa{
    my ($file_reads)=@_;
    open my $fh, '<', $file_reads or die "Cannot open '$file_reads': $!\n";
    my $line=<$fh>;
    $line='' unless defined $line;    # empty file: fails the header test
    chomp $line;
    if($line !~ /^>\S+/){
        die "The first line of file $file_reads does not start with '>identifier'\nReads file $file_reads is not a valid fasta file\n\n";
    }
    my $seq=<$fh>;
    $seq='' unless defined $seq;      # header only: nothing to reject
    if($seq !~ /^[ACGTNacgtn]*$/){
        die "File $file_reads contains not allowed characters in sequences\nAllowed characters are ACGTN\nReads file $file_reads is not a fasta file\n\n";
    }
    close $fh;
}

# Sanity-check up to the first ten records of a FASTQ file and switch the
# global $phred_qv to 33 when a quality character below '@' (64) is seen.
# Files with fewer than ten records are accepted; a record cut off in the
# middle is rejected.
sub checkfq{
    my ($file_reads)=@_;

    open my $fh, '<', $file_reads or die "Cannot open '$file_reads': $!\n";
    for (my $i=0;$i<10;$i++) {
        my $head=<$fh>;
        last unless defined $head;    # clean end of file
        my $seq=<$fh>;
        my $plus=<$fh>;
        my $qual=<$fh>;
        if (!defined $seq || !defined $plus || !defined $qual) {
            die "$file_reads is not a fastq file\n\n";
        }
        chomp $head;
        chomp $seq;
        chomp $plus;
        chomp $qual;
        if($head!~/^\@/){
            die "$file_reads is not a fastq file\n\n";
        }
        if($seq!~ /^[ACGTNacgtn]*$/){
            die "File $file_reads contains not allowed characters in sequences\nAllowed characters are ACGTN\nReads file $file_reads is not a fasta file\n\n";
        }
        if ($plus!~/^\@/ && $plus!~/^\+/) {
            die "$file_reads is not a fastq file\n\n";
        }
        if ((length $seq) != (length $qual)) {
            die "$file_reads is not a fastq file\n\n";
        }
        foreach my $char (split //,$qual) {
            my $q=ord($char)-64;
            if($q<0){$phred_qv=33;}
        }
    }
    close $fh;
}

# Return the single entry of directory $dir whose name matches pattern $str;
# die when there is none or more than one. Currently unused.
sub search{
    my ($dir,$str)=@_;
    opendir my $dh,$dir or die "Cannot open directory '$dir': $!\n";
    my @ret;
    while (my $file=readdir $dh) {
        if ($file=~/$str/) {
            push @ret, $file;
        }
    }
    closedir $dh;
    if (@ret != 1) {
        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}

# Current local time as "YYYY-MM-DD hh:mm:ss".
sub Time{
    my ($sec,$min,$hour,$day,$month,$year) = (localtime(time()))[0..5];
    return sprintf("%d-%02d-%02d %02d:%02d:%02d",$year+1900,$month+1,$day,$hour,$min,$sec);
}


# Print the command-line help and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t -o -path
options:
-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...
-tag string # raw data file names, -tag xxx -tag xxx

-format string,#specific input rawdata file format : fastq|fq|fasta|fa
-phred int # phred quality number, default is 64

-path scirpt path

-gfa string, input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-a string, ADAPTER string. default is ATCTCGTATG.
-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it.
-min int, reads min length,default is 19.
-max int, reads max length,default is 28.

-mis [int] number of allowed mismatches when mapping reads to genome, default 0
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-t int, number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}

<!-- File: preProcess.xml
     (hg diff header: /dev/null Thu Jan 01 00:00:00 1970 +0000 to b/preProcess.xml Sun Jan 04 02:47:25 2015 -0500) -->
<tool id="preprocess" name="preProcess" version="1.0.0">
  <description>Program for Raw data preprocess analysis, including 3' adapter trimming, reads collapsing, genome mapping and rfam non-miRNA analysis</description>

  <requirements>
    <requirement type="package" version="0.0.13">fastx_toolkit</requirement>
    <requirement type="package" version="0.12.7">bowtie</requirement>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <!--requirement type="package" version="3.0.1">R</requirement!-->
    <requirement type="package" version="1.96">threads</requirement>
    <requirement type="package" version="2.59">SVG</requirement>
    <requirement type="package" version="2.1.8">ViennaRNA</requirement>
  </requirements>

  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->

  <command interpreter="perl">preProcess.pl
    ## Change this to accommodate the number of threads you have available.
    -t \${GALAXY_SLOTS:-4}
    -path \$SCRIPT_PATH

    ## One -i / -tag pair per entry of the "Raw sequence data" repeat
    #for $j, $s in enumerate( $series )
      ##rank_of_series=$j
      -i ${s.input}
      -tag ${s.tag}
    #end for

    ## Do or not annotate rfam non-miRNA RNAs
    #if $nocoding.annotate_rfam == "yes":
      ## prepare Rfam bowtie index
      #set rfam_index_path = ''
      #if str($nocoding.reference_rfam.source) == "history":
        -rfam $nocoding.reference_rfam.own_file
      #else:
        #set rfam_index_path = $nocoding.reference_rfam.index.fields.path
        -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path
      #end if
      -v $nocoding.v
    #end if

    ## prepare bowtie index
    #set index_path = ''
    #if str($reference_genome.source) == "history":
      #set index_path = 'genome'
      -gfa $reference_genome.own_file
    #else:
      #set index_path = $reference_genome.index.fields.path
      -gfa ${index_path}.fa -idx $index_path
    #end if

    -format $format -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch > run.log
  </command>

  <inputs>

    <repeat name="series" title="Raw sequence data">
      <param name="input" type="data" label="Raw data"/>
      <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>
    </repeat>

    <!--param name="input" format="tabular" type="data" label="input config file" /-->

    <param name="format" type="select" label="Raw data format" multiple="false">
      <option value="fastq">Raw data is fastq. format</option>
      <option value="fasta">Raw data is fasta. format</option>
    </param>
    <param name="phred" type="select" label="Input quals are Phred+64 or Phred+33" multiple="false">
      <option value="64">Phred+64</option>
      <option value="33" selected="true">Phred+33</option>
    </param>

    <!-- reference genome -->
    <conditional name="reference_genome">
      <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
          <options from_data_table="bowtie_indexes">
            <filter type="sort_by" column="2"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
      </when>
      <when value="history">
        <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
      </when>
    </conditional>

    <!--param name="gfa" type="data" label="genome sequence fasta file"/-->
    <!--param type="data" name="index" label="genome sequence bowtie index"/-->
    <param name="a" type="text" value="TGGAATTCTCGGGTGCCAAGG" label="3' adapter sequence" />
    <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />
    <param name="min" type="integer" value="19" label="Minimum microRNA length" />
    <param name="max" type="integer" value="28" label="Maximum microRNA length" />
    <param name="mismatch" type="integer" value="0" label="Number of allowed mismatches when mapping reads to genome" />

    <conditional name="nocoding">
      <param name="annotate_rfam" type="select" label="Annotate nocoding RNAs(excluding miRNA)">
        <option value="yes" selected="true">yes</option>
        <option value="no">no</option>
      </param>
      <when value="yes">
        <!--param name="rfam" type="data" label="rfam sequence file" /-->
        <conditional name="reference_rfam">
          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
            <option value="indexed">Use a built-in index</option>
            <option value="history">Use one from the history</option>
          </param>
          <when value="indexed">
            <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team">
              <options from_data_table="rfam_bowtie_indexes">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
              </options>
            </param>
          </when>
          <when value="history">
            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
          </when>
        </conditional>

        <param name="v" type="integer" value="0" label="Report end-to-end hits less than v mismatches for non-coding RNA annotation"/>
      </when>
    </conditional>

  </inputs>

  <outputs>
    <data format="html" name="preprocess result" from_work_dir="preProcess/preprocessResult.html" label="${tool.name} on ${on_string}: preprocess result"/>

    <data format="txt" name="clean FASTA data" from_work_dir="preProcess/preProcess_clean/clean_data.fa" label="${tool.name} on ${on_string}: clean FASTA data"/>

    <data format="txt" name="genome mapping result" from_work_dir="preProcess/genome_match/genome_mapped.bwt" label="${tool.name} on ${on_string}: genome mapping result"/>
    <data format="txt" name="genome mapped FASTA reads" from_work_dir="preProcess/genome_match/genome_mapped.fa" label="${tool.name} on ${on_string}: genome mapped FASTA reads"/>

    <data format="txt" name="Rfam mapping result" from_work_dir="preProcess/rfam_match/rfam_mapped.bwt" label="${tool.name} on ${on_string}: Rfam mapping result">
      <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
    </data>
    <data format="txt" name="Rfam mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_mapped.fa" label="${tool.name} on ${on_string}: Rfam mapped FASTA file">
      <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
    </data>
    <data format="txt" name="Rfam not mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_not_mapped.fa" label="${tool.name} on ${on_string}: Rfam not mapped FASTA file">
      <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
    </data>
    <data format="txt" name="input config" from_work_dir="preProcess/input_config" label="${tool.name} on ${on_string}: input config"/>

  </outputs>

  <help>
**What it does**

Runs preProcess.pl on one or more raw read files: clips the 3' adapter,
collapses identical reads, keeps reads between the minimum and maximum
length, maps them to the selected reference genome and, when requested,
maps the genome-mapped reads to an Rfam reference.

**Inputs**

- One raw data file and one sample name for each entry of "Raw sequence data".
- Raw data format (fastq or fasta) and the quality encoding (Phred+33 or Phred+64).
- A reference genome: a built-in index or a FASTA file from the history.
- Optionally an Rfam reference: a built-in index or a FASTA file from the history.

**Outputs**

- An HTML report, the clean FASTA reads, the genome mapping result and the genome-mapped reads.
- The Rfam mapping result and the Rfam mapped / not mapped reads (only when Rfam annotation is set to "yes").
- The input configuration file (input file and sample name per line).
  </help>
</tool>
#!/usr/bin/perl -w
#Filename: preProcess_core.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-12-2
#Modified:
#Description: RNA-seq data pre-process
#   Pipeline driven by this script:
#     1. clip adapters from every input file (fastx_clipper)
#     2. collapse reads to tags           (collapseReads2Tags.pl)
#     3. filter tags by length            (filterReadsByLength.pl, Length_Distibution.pl)
#     4. align tags to the genome         (matching.pl)
#     5. optional Rfam annotation         (rfam.pl, count_rfam_express.pl)
#     6. write the HTML report            (html_preprocess.pl)
my $version=1.00;

use strict;
use Getopt::Long;
use File::Path;
use File::Basename;
# NOTE: "threads" is loaded at run time (see the clipping step below) so that
# the sequential fallback is reachable on a perl built without thread support.

my %opts;
GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time=Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @ARGV\n";

my $format=$opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

# Quality offset handed to fastx_clipper (-Q); checkfq() may lower it to 33.
my $phred_qv=defined $opts{'phred'} ? $opts{'phred'} : 64;

my @inputfiles=@{$opts{'i'}};
# -tag is optional on the command line: fall back to Smp1, Smp2, ... instead of
# dereferencing an undefined value.
my @inputtags=defined $opts{'tag'} ? @{$opts{'tag'}} : ();
if (!@inputtags) {
    @inputtags=map {"Smp$_"} 1..scalar(@inputfiles);
}
if (@inputtags != @inputfiles) {
    # Tags name the per-sample output files, so a mismatch would make samples
    # overwrite each other.
    die "The number of -tag values (".scalar(@inputtags).") does not match the number of -i files (".scalar(@inputfiles).")\n";
}

my $mypath=`pwd`;
chomp $mypath;

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/";
$dir.="/" unless $dir=~/\/$/;
mkpath($dir) unless -d $dir;    # creates missing parents, croaks on failure

# Record which input file belongs to which sample tag.
my $config=$dir."/input_config";
open my $config_fh,'>',$config or die "Can not write $config: $!\n";
for my $i (0..$#inputfiles) {
    print {$config_fh} $inputfiles[$i],"\t",$inputtags[$i],"\n";
}
close $config_fh or die "Can not close $config: $!\n";

my $script_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $adapter_seq=defined $opts{'a'} ? $opts{'a'} : "ATCTCGTATG"; #adapter
my $m=defined $opts{'M'} ? $opts{'M'} : 6;              #adapter minimum mapped nt
my $t=defined $opts{'t'} ? $opts{'t'} : 1;              #threads number
$t=1 if $t<1;    # a batch size below 1 would never consume the work queue
my $min_nt=defined $opts{'min'} ? $opts{'min'} : 19;    #minimum reads length
my $max_nt=defined $opts{'max'} ? $opts{'max'} : 28;    #maximum reads length
my $mis=defined $opts{'mis'} ? $opts{'mis'} : 0;        #mismatch number for microRNA
my $mis_rfam=defined $opts{'v'} ? $opts{'v'} : 0;       #mismatch number for rfam

my @filein=@inputfiles;
my @mark=@inputtags;
my @clean;    # adapter-clipped files, one per sample

checkfa($opts{gfa});

##### clip adapter --> clean data start
my $preprocess=$dir."preProcess_clean/";
mkpath($preprocess) unless -d $preprocess;
my $can_use_threads=eval { require threads; threads->import(); 1 };
if ($can_use_threads) {
    # Process the samples in batches of at most $t concurrent threads.
    print "Do processing using threads\n";
    my @filein1=@filein;
    my @mark1=@mark;
    while (@filein1>0) {
        my @thrs;
        for (my $i=0;$i<$t && @filein1>0;$i++) {
            my $in=shift @filein1;
            my $out=shift @mark1;
            push @clean,$preprocess.$out."_clips_adapter.fq";
            push @thrs,threads->create(\&clips,$in,$out);
        }
        $_->join() for @thrs;
    }
}
else {
    print "Do not processing using threads\n";
    for my $i (0..$#filein) {
        push @clean,$preprocess.$mark[$i]."_clips_adapter.fq";
        clips($filein[$i],$mark[$i]);
    }
}
##### clip adapter --> clean data end

my $collapsed=$preprocess."collapse_reads.fa";
my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
collapse(\@clean,$collapsed);    #collapse reads to tags

filterbylength();    # filter <$min_nt && >$max_nt

print "The final clean data file is $data, only contains reads which length is among ${min_nt}~${max_nt}\n\n";

my $clean_data=$preprocess."clean_data.fa";
symlink($data,$clean_data) or warn "Can not link $data to $clean_data: $!\n";

$time=Time();
print "$time: Genome alignment!\n\n";
my $genome_map=$dir."genome_match";
genome($data);
my $mapfa=$genome_map."/genome_mapped.fa";    # input of the Rfam step

chdir $dir or die "Can not change directory to $dir: $!\n";
# path.txt lists the result locations consumed by html_preprocess.pl.
my $pathfile="$dir/path.txt";
open my $path_fh,'>',$pathfile or die "Can not write $pathfile: $!\n";
print {$path_fh} "$config\n";
print {$path_fh} "$preprocess\n";
print {$path_fh} "$genome_map\n";

if (defined $opts{'rfam'}) { #rfam mapping and analysis
    $time=Time();
    print "$time: RNA annotate!\n\n";
    my $rfam_exp_dir=$dir."rfam_match";
    rfam();
    print {$path_fh} "$rfam_exp_dir\n";

    # Output name is relative: it lands in $dir because of the chdir above.
    run_cmd("perl","$script_path/count_rfam_express.pl",
        "-i","$rfam_exp_dir/rfam_mapped.bwt",
        "-tag",join(";",@mark),
        "-o","rfam_non-miRNA_annotation.txt");
}

close $path_fh or die "Can not close $pathfile: $!\n";
run_cmd("perl","$script_path/html_preprocess.pl",
    "-i",$pathfile,"-format",$format,"-min",$min_nt,"-max",$max_nt,
    "-o","$dir/preprocessResult.html");

$time=Time();
print "$time: Program end!!\n";

############################## sub programs ###################################

# Run an external command without going through a shell (list-form system),
# so file names containing spaces or shell metacharacters are passed verbatim.
# A failing step is reported but does not abort the pipeline.
# Returns 1 on success, 0 otherwise.
sub run_cmd{
    my @cmd=@_;
    my $status=system(@cmd);
    if ($status != 0) {
        warn "Command failed (wait status $status): @cmd\n";
        return 0;
    }
    return 1;
}

# Align the collapsed, length-filtered tags ($file) to the genome.
sub genome{
    my ($file)=@_;
    my @cmd=("perl","$script_path/matching.pl","-i",$file,"-g",$opts{gfa},
        "-r",1000,"-v",$mis,"-p",$t,"-o",$dir);
    push @cmd,"-index",$opts{idx} if defined $opts{'idx'} && length $opts{'idx'};
    return run_cmd(@cmd);
}

# Align the genome-mapped tags to the Rfam database given with -rfam.
sub rfam{
    my @cmd=("perl","$script_path/rfam.pl","-i",$mapfa,"-ref",$opts{rfam},
        "-v",$mis_rfam,"-p",$t,"-o",$dir);
    push @cmd,"-index",$opts{idx2} if defined $opts{'idx2'} && length $opts{'idx2'};
    return run_cmd(@cmd);
}

# Keep tags of $min_nt..$max_nt nt and render the length distribution page.
sub filterbylength{
    my $tmpmark=join ",", @mark;
    run_cmd("perl","$script_path/filterReadsByLength.pl","-i",$collapsed,"-o",$data,
        "-min",$min_nt,"-max",$max_nt,"-mark",$tmpmark);
    run_cmd("perl","$script_path/Length_Distibution.pl",
        "-i","$preprocess/reads_length_distribution.txt",
        "-o","$preprocess/length.html");
    return;
}

# Collapse identical reads of all clipped files (@$ins) into one tag file.
sub collapse{
    my ($ins,$out_file)=@_;
    my @inputs=map { ("-i",$_) } @{$ins};
    return run_cmd("perl","$script_path/collapseReads2Tags.pl",@inputs,
        "-mark","seq","-o",$out_file,"-format",$format);
}

# Clip the adapter from one raw file; $out is the sample tag used to name
# the result. -Q (quality offset) is only passed for fastq input.
sub clips{
    my ($in,$out)=@_;
    my $clipped=$preprocess.$out."_clips_adapter.fq";
    my @cmd=("fastx_clipper","-a",$adapter_seq,"-M",$m);
    push @cmd,"-Q",$phred_qv if $format eq "fq" || $format eq "fastq";
    run_cmd(@cmd,"-i",$in,"-o",$clipped);
    return;
}

# Load file/tag pairs from the input_config file into @filein/@mark and
# validate every listed file. Currently not called by the main flow.
sub read_config{
    open my $con_fh,'<',$config or die "Can not open $config: $!\n";
    while (my $aline=<$con_fh>) {
        chomp $aline;
        my @tmp=split/\t/,$aline;
        push @filein,$tmp[0];
        push @mark,$tmp[1];
        check_rawdata($tmp[0]);
    }
    close $con_fh;
    if (@filein != @mark) {
        die "Maybe config file have some wrong!!!\n";
    }
    return;
}

# Die unless $fileforcheck exists, is non-empty and looks like -format says.
sub check_rawdata{
    my ($fileforcheck)=@_;
    if (!(-s $fileforcheck)) {
        die "Can not find $fileforcheck, or file is empty!!!\n";
    }
    if ($format eq "fasta" || $format eq "fa") {
        checkfa($fileforcheck);
    }
    if ($format eq "fastq" || $format eq "fq") {
        checkfq($fileforcheck);
    }
    return;
}

# Sanity-check the first record of a fasta file: a '>identifier' header
# followed by a line made only of ACGTN (either case).
sub checkfa{
    my ($file_reads)=@_;
    open my $fh,'<',$file_reads or die "Can not open $file_reads: $!\n";
    my $header=<$fh>;
    my $first_seq=<$fh>;
    close $fh;
    if (!defined $header || $header !~ /^>\S+/) {
        die "The first line of file $file_reads does not start with '>identifier'
Reads file $file_reads is not a valid fasta file\n\n";
    }
    $first_seq="" unless defined $first_seq;
    $first_seq=~s/[\r\n]+\z//;
    if ($first_seq !~ /^[ACGTNacgtn]*$/) {
        die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
    }
    return;
}

# Sanity-check up to the first 10 records of a fastq file. Side effect: when
# any quality character is below ASCII 64 the global $phred_qv is set to 33.
sub checkfq{
    my ($file_reads)=@_;

    open my $fh,'<',$file_reads or die "Can not open $file_reads: $!\n";
    for (my $i=0;$i<10;$i++) {
        my $head=<$fh>;
        if (!defined $head) {
            last if $i>0;    # fewer than 10 reads is still a valid file
            die "$file_reads is not a fastq file\n\n";
        }
        my $seq=<$fh>;
        my $plus=<$fh>;
        my $qual=<$fh>;
        if (!defined $seq || !defined $plus || !defined $qual) {
            die "$file_reads is not a fastq file\n\n";    # truncated record
        }
        s/[\r\n]+\z// for ($head,$seq,$plus,$qual);
        if ($head!~/^\@/) {
            die "$file_reads is not a fastq file\n\n";
        }
        if ($seq!~ /^[ACGTNacgtn]*$/) {
            die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fastq file\n\n";
        }
        if ($plus!~/^\@/ && $plus!~/^\+/) {
            die "$file_reads is not a fastq file\n\n";
        }
        if ((length $seq) != (length $qual)) {
            die "$file_reads is not a fastq file\n\n";
        }
        foreach my $char (split //,$qual) {
            if (ord($char)-64<0) {$phred_qv=33;}
        }
    }
    close $fh;
    return;
}

# Return the single entry of directory $dir whose name matches pattern $str;
# dies when there is no such entry or more than one.
sub search{
    my ($dir,$str)=@_;
    opendir my $dh,$dir or die "Can not open directory $dir: $!\n";
    my @ret=grep { /$str/ } readdir $dh;
    closedir $dh;
    if (@ret != 1) {
        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}

# Current local time as "YYYY-MM-DD hh:mm:ss".
sub Time{
    my ($sec,$min,$hour,$day,$month,$year)=(localtime(time()))[0..5];
    return sprintf("%d-%02d-%02d %02d:%02d:%02d",$year+1900,$month+1,$day,$hour,$min,$sec);
}

# Print the help text and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t -o -path
options:
-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...
-tag string # raw data file names, -tag xxx -tag xxx

-format string,#specific input rawdata file format : fastq|fq|fasta|fa
-phred int # phred quality number, default is 64

-path scirpt path

-gfa string, input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null

-a string, ADAPTER string. default is ATCTCGTATG.
-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it.
-min int, reads min length,default is 19.
-max int, reads max length,default is 28.

-mis [int] number of allowed mismatches when mapping reads to genome, default 0
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-t int, number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: precursors.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: excise candidate miRNA precursors around read peaks of a
#   genome alignment, fold them with RNAfold/RNAduplex and write the ones
#   that pass the hairpin filters as fasta (-o) and structure (-s) files.
my $version=1.00;

use strict;
use Getopt::Long;
use IPC::Open2;
#use RNA;

my %opts;
GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");
if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein=$opts{'map'};
my $faout=$opts{'o'};
my $strout=$opts{'s'};
my $genome=$opts{'g'};

my $maxd=defined $opts{'d'} ? $opts{'d'} : 200;
my $flank=defined $opts{'f'} ? $opts{'f'} : 10;

# Hairpin filter thresholds used by ffw1/ffw2.
my $MAX_ENERGY=defined $opts{'e'} ? $opts{'e'} : -18;
my $MAX_UNPAIR=5;      # only referenced by disabled checks
my $MIN_PAIR=15;
my $MAX_SIZEDIFF=4;
my $MAX_BULGE=2;
my $ASYMMETRY=5;
my $MIN_UNPAIR=0;      # not referenced
my $MIN_SPACE=5;
my $MAX_SPACE=$maxd;
my $FLANK=$flank;

# Output handles; opened in the main flow, written by ffw1/ffw2.
my ($fa_fh,$str_fh);

######### load in genome sequences start ########
my %genome;    # $genome{chr}=sequence
my %lng;       # $lng{chr}=sequence length
{
    open my $genome_fh,'<',$genome or die "Can not open genome file $genome: $!\n";
    my $name;
    while (my $aline=<$genome_fh>) {
        chomp $aline;
        $aline=~s/\r\z//;
        next if($aline=~/^\#/);
        if ($aline=~/^>(\S+)/) {
            $name=$1;
            next;
        }
        next unless defined $name;    # sequence text before the first header
        $genome{$name} .=$aline;
    }
    close $genome_fh;
}
foreach my $key (keys %genome) {
    $lng{$key}=length($genome{$key});
}
####### load in genome sequences end ##########

# $breaks{chr}{strand}=[[start,end,count,read_sequence],...], 0-based
# inclusive coordinates, only tags seen at least 3 times (ID suffix _xN).
my %breaks;
{
    open my $map_fh,'<',$filein or die "Can not open alignment file $filein: $!\n";
    while (my $aline=<$map_fh>) {
        chomp $aline;
        my @tmp=split/\t/,$aline;
        next if @tmp<5;
        next unless $tmp[0]=~/_x(\d+)$/;    # never reuse a stale $1
        my $no=$1;
        next if($no<3);
        my $end=$tmp[3]+length($tmp[4])-1;
        if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}
        push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base
    }
    close $map_fh;
}

# Peaks: one representative read per block of overlapping reads.
my %cites;
foreach my $chr (sort keys %breaks) {
    foreach my $strand (sort keys %{$breaks{$chr}}) {
        my @reads=sort{$a->[0]<=>$b->[0]} @{$breaks{$chr}{$strand}};
        my @blocks=group_sorted(\@reads,sub {
            my ($start,$end,$read)=@_;
            return ($start<$read->[1] && $end>$read->[0]);    ###overlap
        });
        foreach my $block (@blocks) {
            find_cites($block,$chr,$strand);
        }
    }
}

# Clusters: peaks on the same strand less than $maxd apart.
my %cluster;    # $cluster{chr}{strand}=[[peak,...],...]
foreach my $chr (sort keys %cites) {
    foreach my $strand (sort keys %{$cites{$chr}}) {
        my @peaks=sort{$a->[0]<=>$b->[0]} @{$cites{$chr}{$strand}};
        my @groups=group_sorted(\@peaks,sub {
            my ($start,$end,$peak)=@_;
            return ($end>$peak->[0]-$maxd);    ###distance less than $maxd
        });
        $cluster{$chr}{$strand}=\@groups;
    }
}

open $fa_fh,'>',$faout or die "Can not write $faout: $!\n";      #output file
open $str_fh,'>',$strout or die "Can not write $strout: $!\n";
foreach my $chr (sort keys %cluster) {
    foreach my $strand (sort keys %{$cluster{$chr}}) {
        foreach my $group (@{$cluster{$chr}{$strand}}) {
            my @array2=sort{$a->[0]<=>$b->[0]} @{$group};
            excise(\@array2,$chr,$strand);
        }
    }
}
close $fa_fh or die "Can not close $faout: $!\n";
close $str_fh or die "Can not close $strout: $!\n";

############################## sub programs ###################################

# Split a start-sorted list of [start,end,...] records into consecutive
# groups. A record joins the running group when $joins->($start,$end,$record)
# is true, where ($start,$end) is the span covered by the group so far.
# The trailing group is flushed as well.
# Returns a list of array references.
sub group_sorted{
    my ($items,$joins)=@_;
    my (@groups,@current);
    my ($start,$end);
    foreach my $item (@{$items}) {
        if (@current && $joins->($start,$end,$item)) {
            push @current,$item;
            ($start,$end)=newpos($start,$end,$item->[0],$item->[1]);
        }
        else {
            push @groups,[@current] if @current;
            @current=($item);
            ($start,$end)=($item->[0],$item->[1]);
        }
    }
    push @groups,[@current] if @current;
    return @groups;
}

# Cut the 0-based inclusive region $ss..$ee out of chromosome $chr, clamped
# to the chromosome, reverse-complemented for the "-" strand.
# Returns ($seq,$ss,$ee) with the clamped coordinates, or an empty list when
# the chromosome is unknown or the region is empty.
sub fetch_region{
    my ($chr,$strand,$ss,$ee)=@_;
    return unless exists $genome{$chr};
    $ss=0 if($ss<0);
    $ee=$lng{$chr}-1 if($ee>$lng{$chr}-1);    # last valid 0-based position
    return if($ee<$ss);
    my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
    if($strand eq "-"){$seq=revcom($seq);}
    return ($seq,$ss,$ee);
}

# Feed $input to the stdin of @cmd (no shell involved) and return everything
# it printed, or "" when the program can not be started.
sub run_filter{
    my ($input,@cmd)=@_;
    local $SIG{PIPE}='IGNORE';    # a dying child must not kill this script
    my ($from_child,$to_child);
    my $pid=eval { open2($from_child,$to_child,@cmd) };
    if (!$pid) {
        warn "Can not run $cmd[0]: $@\n";
        return "";
    }
    print {$to_child} $input;
    close $to_child;
    my $output=do { local $/; <$from_child> };
    close $from_child;
    waitpid($pid,0);
    return defined $output ? $output : "";
}

# Fold $seq with "RNAfold --noPS".
# Returns ($structure,$mfe) or an empty list when the output is unusable.
sub fold_sequence{
    my ($seq)=@_;
    my @rawfolds=split/\s+/,run_filter($seq,"RNAfold","--noPS");
    return if @rawfolds<3;    # expected: sequence, structure, "(mfe)"
    my $struct=$rawfolds[1];
    my $mfe=$rawfolds[-1];
    $mfe=~s/\(//;
    $mfe=~s/\)//;
    return unless $mfe=~/^-?\d+(?:\.\d+)?$/;
    return ($struct,$mfe);
}

# Hybridize two sequences with RNAduplex.
# Output line looks like: (.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20)
# Returns ($str1,$str2,$beg1,$end1,$beg2,$end2,$energy) or an empty list.
sub duplex_fold{
    my ($seq1,$seq2)=@_;
    return unless length($seq1) && length($seq2);
    my @tmpduplex=split/\s+/,run_filter("$seq1\n$seq2","RNAduplex");
    return if @tmpduplex<5;
    my ($str1,$str2)=split(/&/,$tmpduplex[0]);
    (my $energy=$tmpduplex[-1])=~s/[(|)]//g;
    my ($beg1,$end1)=split/,/,$tmpduplex[1];
    my ($beg2,$end2)=split/,/,$tmpduplex[3];
    return unless defined $str1 && defined $str2 && defined $end1 && defined $end2;
    return ($str1,$str2,$beg1,$end1,$beg2,$end2,$energy);
}

# Test peak $idx of @$array as a mature miRNA whose partner arm lies on the
# higher-coordinate side (window extended by $maxd past the read end).
sub oneCiteDn{
    my ($array,$idx,$chr,$strand)=@_;
    my ($seq,$ss,$ee)=fetch_region($chr,$strand,
        $$array[$idx][0]-$flank,$$array[$idx][1]+$maxd+$flank);
    return 0 unless defined $seq;
    return ffw1($seq,$$array[$idx][3],$chr,$strand,$ss,$ee);
}

# Same as oneCiteDn with the window extended by $maxd before the read start.
sub oneCiteUp{
    my ($array,$idx,$chr,$strand)=@_;
    my ($seq,$ss,$ee)=fetch_region($chr,$strand,
        $$array[$idx][0]-$maxd-$flank,$$array[$idx][1]+$flank);
    return 0 unless defined $seq;
    return ffw1($seq,$$array[$idx][3],$chr,$strand,$ss,$ee);
}

# Test peaks $first and $second of @$array (with $first starting at the
# lower coordinate) as the two arms of one hairpin.
sub twoCites{
    my ($array,$first,$second,$chr,$strand)=@_;
    my ($seq,$ss,$ee)=fetch_region($chr,$strand,
        $$array[$first][0]-$flank,$$array[$second][1]+$flank);
    return 0 unless defined $seq;
    return ffw2($seq,$$array[$first][3],$$array[$second][3],$chr,$strand,$ss,$ee);
}

# Evaluate one cluster of peaks: pair the most abundant peak with every other
# peak until a pair passes; if none does (or the cluster has a single peak)
# try the top peak alone on both sides.
# Returns true when at least one precursor was written.
sub excise{
    my ($cluster,$chr,$strand)=@_;

    my $ok=0;
    my $peak_pos=0;
    if (@{$cluster}>1) {
        for (my $i=0;$i<@{$cluster};$i++) {
            if($$cluster[$i][2]>$$cluster[$peak_pos][2]){$peak_pos=$i;}
        }
        for (my $i=0;$i<@{$cluster};$i++) {
            next if($i==$peak_pos);
            if($i<$peak_pos){$ok=twoCites($cluster,$i,$peak_pos,$chr,$strand);}
            else{$ok=twoCites($cluster,$peak_pos,$i,$chr,$strand);}
            last if($ok);
        }
    }
    if (!$ok) {
        my $dn=oneCiteDn($cluster,$peak_pos,$chr,$strand);
        my $up=oneCiteUp($cluster,$peak_pos,$chr,$strand);
        $ok=($dn || $up) ? 1 : 0;
    }
    return $ok;
}

# Decide whether $seq, which carries the two reads $tag1 and $tag2, folds
# into a miRNA-like hairpin; on success write it to the fasta and structure
# outputs. Returns 1 when written, 0 otherwise.
# NOTE(review): the header keeps the coordinates $ss..$ee of the excised
# window although $seq is re-clipped around the tags before it is printed.
sub ffw2{
    my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_;

    my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns
    if ($N_count > 5) {
        return 0;
    }

    my $seq_length=length $seq;
    # position tag1 and tag2
    my $tag1_beg=index($seq,$tag1,0)+1;
    if ($tag1_beg < 1) {
        warn "[ffw2] coordinate error.\n";
        return 0;
    }
    my $tag2_beg=index($seq,$tag2,0)+1;
    if ($tag2_beg < 1) {
        warn "[ffw2] coordinate error.\n";
        return 0;
    }
    if ($tag2_beg < $tag1_beg) {
        # swap tag1 and tag2
        ($tag1,$tag2)=($tag2,$tag1);
        ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg);
    }
    my $tag1_end=$tag1_beg+length($tag1)-1;
    my $tag2_end=$tag2_beg+length($tag2)-1;
    # re-clipping
    my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1;
    my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length;
    $seq=substr($seq,$beg-1,$end-$beg+1);
    $seq_length=length $seq;
    # re-reposition
    $tag1_beg=index($seq,$tag1,0)+1;
    if ($tag1_beg < 1) {
        warn "[ffw2] coordinate error.\n";
        return 0;
    }
    $tag2_beg=index($seq,$tag2,0)+1;
    if ($tag2_beg < 1) {
        warn "[ffw2] coordinate error.\n";
        return 0;
    }
    $tag1_end=$tag1_beg+length($tag1)-1;
    $tag2_end=$tag2_beg+length($tag2)-1;

    # fold
    my ($struct,$mfe)=fold_sequence($seq);
    return 0 unless defined $struct;
    if ($mfe > $MAX_ENERGY) {return 0;}

    # tag1
    my $tag1_length=$tag1_end-$tag1_beg+1;
    my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length);
    my $tag1_arm=which_arm($tag1_struct);
    my $tag1_unpair=$tag1_struct=~tr/.//;
    my $tag1_pair=$tag1_length-$tag1_unpair;
    my $tag1_max_bulge=biggest_bulge($tag1_struct);
    if ($tag1_arm ne "5p") {return 0;} # tag not in stem
    if ($tag1_pair < $MIN_PAIR) {return 0;}
    if ($tag1_max_bulge > $MAX_BULGE) {return 0;}

    # tag2
    my $tag2_length=$tag2_end-$tag2_beg+1;
    my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length);
    my $tag2_arm=which_arm($tag2_struct);
    my $tag2_unpair=$tag2_struct=~tr/.//;
    my $tag2_pair=$tag2_length-$tag2_unpair;
    my $tag2_max_bulge=biggest_bulge($tag2_struct);
    if ($tag2_arm ne "3p") {return 0;} # star not in stem
    if ($tag2_pair < $MIN_PAIR) {return 0;}
    if ($tag2_max_bulge > $MAX_BULGE) {return 0;}

    # space size between miR and miR*
    my $space=$tag2_beg-$tag1_end-1;
    if ($space < $MIN_SPACE) {return 0;}
    if ($space > $MAX_SPACE) {return 0;}

    # size diff of miR and miR*
    my $size_diff=abs($tag1_length-$tag2_length);
    if ($size_diff > $MAX_SIZEDIFF) {return 0;}

    # build base pairing table
    my %pairtable;
    parse_struct($struct,\%pairtable); # coords count from 1

    my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end);
    my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end);
    my $asy=($asy1 < $asy2) ? $asy1 : $asy2;
    if ($asy > $ASYMMETRY) {return 0;}

    # duplex fold, determine whether two matures look like a miR/miR* duplex
    my ($like_mir_duplex1)=likeMirDuplex1($tag1,$tag2);
    # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context
    my ($like_mir_duplex2)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end);
    if ($like_mir_duplex1==0 && $like_mir_duplex2==0) {
        return 0;
    }

    print {$fa_fh} ">$chr:$strand:$ss..$ee\n$seq\n";
    print {$str_fh} ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";

    return 1;
}

# Decide whether $seq, which carries the single read $tag, contains a
# miRNA-like hairpin: the precursor is delimited with define_precursor, folded,
# and the star region is derived from the structure. On success the
# precursor is written to the fasta and structure outputs.
# Returns 1 when written, 0 otherwise.
# NOTE(review): the header keeps the coordinates $ss..$ee of the excised
# window although $seq is re-clipped to the precursor before it is printed.
sub ffw1{
    my ($seq,$tag,$chr,$strand,$ss,$ee)=@_;
    my $pass=0;

    my $N_count=$seq=~tr/N//;
    if ($N_count > 5) {
        return 0;
    }

    my $tag_length=length $tag;

    # position
    my $tag_beg=index($seq,$tag,0)+1;
    if ($tag_beg < 1) {
        warn "[ffw1] coordinate error.\n";
        return $pass;
    }

    # define candidate precursor by hybrid short arm to long arm, not solid enough
    my ($beg,$end)=define_precursor($seq,$tag);
    return $pass unless defined $beg && defined $end;
    $seq=substr($seq,$beg-1,$end-$beg+1);
    my $seq_length=length $seq;

    # fold
    my ($struct,$mfe)=fold_sequence($seq);
    return $pass unless defined $struct;
    if ($mfe > $MAX_ENERGY) {return $pass;}

    # reposition
    $tag_beg=index($seq,$tag,0)+1;
    if ($tag_beg < 1) {
        warn "[ffw1] coordinate error.\n";
        return 0;
    }
    my $tag_end=$tag_beg+length($tag)-1;

    my $tag_struct=substr($struct,$tag_beg-1,$tag_length);
    my $tag_arm=which_arm($tag_struct);
    my $tag_unpair=$tag_struct=~tr/.//;
    my $tag_pair=$tag_length-$tag_unpair;
    my $tag_max_bulge=biggest_bulge($tag_struct);
    if ($tag_arm eq "-") {return $pass;}
    if ($tag_pair < $MIN_PAIR) {return $pass;}
    if ($tag_max_bulge > $MAX_BULGE) {return $pass;}

    # build base pairing table
    my %pairtable;
    parse_struct($struct,\%pairtable); # coords count from 1

    # get star
    my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end);
    # a star starting outside the precursor has no structure to inspect
    return $pass if $star_beg < 1 || $star_beg > $seq_length;
    my $star_length=$star_end-$star_beg+1;
    my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1);
    my $star_arm=which_arm($star_struct);
    my $star_unpair=$star_struct=~tr/.//;
    my $star_pair=$star_length-$star_unpair;
    my $star_max_bulge=biggest_bulge($star_struct);
    if ($star_arm eq "-") {return $pass;}
    if ($star_pair < $MIN_PAIR) {return $pass;}
    if ($star_max_bulge > $MAX_BULGE) {return $pass;}

    if ($tag_arm eq $star_arm) {return $pass;}

    # space size between miR and miR*
    my $space;
    if ($tag_beg < $star_beg) {
        $space=$star_beg-$tag_end-1;
    }
    else {
        $space=$tag_beg-$star_end-1;
    }
    if ($space < $MIN_SPACE) {return $pass;}
    if ($space > $MAX_SPACE) {return $pass;}

    # size diff
    my $size_diff=abs($tag_length-$star_length);
    if ($size_diff > $MAX_SIZEDIFF) {return $pass;}

    # asymmetry
    my $asy=get_asy(\%pairtable,$tag_beg,$tag_end);
    if ($asy > $ASYMMETRY) {return $pass;}

    $pass=1;
    print {$fa_fh} ">$chr:$strand:$ss..$ee\n$seq\n";
    print {$str_fh} ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
    return $pass;
}

# Derive the star region opposite to the read $beg..$end from the pairing
# table (1-based). The partner positions of the outermost paired bases are
# extended by the unpaired read ends and shifted by 2.
# Returns ($star_beg,$star_end) with $star_beg <= $star_end.
sub get_star {
    my($table,$beg,$end)=@_;

    my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2
    foreach my $i ($beg..$end) {
        if (defined $table->{$i}) {
            $s1=$i;
            $s2=$table->{$i};
            last;
        }
    }
    foreach my $i (reverse ($beg..$end)) {
        if (defined $table->{$i}) {
            $e1=$i;
            $e2=$table->{$i};
            last;
        }
    }

    # correct terminus
    my $off1=$s1-$beg;
    my $off2=$end-$e1;
    $s2+=$off1;
    $s2+=2; # 081009
    $e2-=$off2; $e2=1 if $e2 < 1;
    $e2+=2; $e2=1 if $e2 < 1; # 081009
    ($s2,$e2)=($e2,$s2) if ($s2 > $e2);
    return ($s2,$e2);
}

# Delimit a candidate precursor inside $seq around the read $tag: split the
# region into a short arm (holding the read) and a long arm, hybridize them
# with RNAduplex and extend the pairing to the arm ends.
# Returns 1-based ($beg,$end) within $seq, or an empty list when the duplex
# has fewer than $MIN_PAIR pairs or RNAduplex gave no usable output.
sub define_precursor {
    my $seq=shift;
    my $tag=shift;

    my $seq_length=length $seq;
    my $tag_length=length $tag;
    my $tag_beg=index($seq,$tag,0)+1;
    my $tag_end=$tag_beg+$tag_length-1;

    # split the candidate region into short arm and long arm
    my $tag_arm;
    my ($larm,$larm_beg,$larm_end);
    my ($sarm,$sarm_beg,$sarm_end);
    if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm
        $sarm=substr($seq,0,$tag_end);
        $larm=substr($seq,$tag_end);
        $sarm_beg=1;
        $sarm_end=$tag_end;
        $larm_beg=$tag_end+1;
        $larm_end=$seq_length;
        $tag_arm="5p";
    }
    else {
        $larm=substr($seq,0,$tag_beg-1); # on 3' arm
        $sarm=substr($seq,$tag_beg-1);
        $larm_beg=1;
        $larm_end=$tag_beg-1;
        $sarm_beg=$tag_beg;
        $sarm_end=$seq_length;
        $tag_arm="3p";
    }

    # clipping short arm
    if ($tag_arm eq "5p") {
        $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1;
    }
    else {
        $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length;
    }
    $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);

    # define the precursor by hybriding short arm to long arm
    # (RNAduplex executable instead of the RNA::duplexfold binding, 2014-10-28)
    my ($str1,$str2,$beg1,$end1,$beg2,$end2)=duplex_fold($sarm,$larm);
    return unless defined $str1;
    my $pair=$str1=~tr/(//;

    # transform coordinates
    $beg1=$beg1+$sarm_beg-1;
    $end1=$end1+$sarm_beg-1;
    $beg2=$beg2+$larm_beg-1;
    $end2=$end2+$larm_beg-1;

    my $off5p=$beg1-$sarm_beg;
    my $off3p=$sarm_end-$end1;
    $beg2-=$off3p; $beg2=1 if $beg2 < 1;
    $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length;

    my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2;
    my $end=$sarm_end > $end2 ? $sarm_end : $end2;

    return if $pair < $MIN_PAIR;
    return ($beg,$end);
}

# duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex
# Returns ($like_mir_duplex,$pair_num,$overhang1,$overhang2).
sub likeMirDuplex1 {
    my $seq1=shift;
    my $seq2=shift;
    my $like_mir_duplex=1;

    my $length1=length $seq1;
    my $length2=length $seq2;

    my ($str1,$str2,$beg1,$end1,$beg2,$end2)=duplex_fold($seq1,$seq2);
    return (0,0,0,0) unless defined $str1;

    # revise beg1, end1, beg2, end2: skip unpaired ends of the duplex
    $str1=~/^(\.*)/;
    $beg1+=length($1);
    $str1=~/(\.*)$/;
    $end1-=length($1);
    $str2=~/^(\.*)/;
    $beg2+=length($1);
    $str2=~/(\.*)$/;
    $end2-=length($1);

    my $pair_num=$str1=~tr/(//;
    my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom
    my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck
    if ($pair_num < 13) {
        $like_mir_duplex=0;
    }
    if ($overhang1 < 0 || $overhang2 < 0 ) {
        $like_mir_duplex=0;
    }
    if ($overhang1 > 4 || $overhang2 > 4) {
        $like_mir_duplex=0;
    }
    return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
}

# judge whether two matures form miR/miR* duplex, in hairpin context
#
#    s1                         e1
#  5 ----------------------------3
#     | | |||| ||| |
# 3 -------------------------------5
#    e2                         s2
#
# Returns ($like_mir_duplex,$pair_num,$overhang1,$overhang2).
sub likeMirDuplex2 {
    my ($table,$beg1,$end1,$beg2,$end2)=@_;
    my $like_mir_duplex=1;

    my $pair_num=0;
    my $overhang1=0;
    my $overhang2=0;
    my ($s1,$e1,$s2,$e2);
    # one pass: first/last base of region 1 paired into region 2, and the
    # number of such pairs
    foreach my $i ($beg1..$end1) {
        next unless defined $table->{$i};
        my $j=$table->{$i};
        next unless ($j <= $end2 && $j >= $beg2);
        ++$pair_num;
        if (!defined $s1) {
            $s1=$i;
            $e2=$j;
        }
        $e1=$i;
        $s2=$j;
    }

    if (defined $s1 && defined $e2) {
        $overhang1=($end2-$e2)-($s1-$beg1);
    }
    if (defined $e1 && defined $s2) {
        $overhang2=($end1-$e1)-($s2-$beg2);
    }

    if ($pair_num < 13) {
        $like_mir_duplex=0;
    }
    if ($overhang1 < 0 && $overhang2 < 0) {
        $like_mir_duplex=0;
    }
    return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
}

# Fill %$table with the base pairs of a dot-bracket structure (1-based,
# both directions: $table->{left}=right and $table->{right}=left).
sub parse_struct {
    my $struct=shift;
    my $table=shift;

    my @t=split('',$struct);
    my @lbs; # left brackets
    foreach my $k (0..$#t) {
        if ($t[$k] eq "(") {
            push @lbs, $k+1;
        }
        elsif ($t[$k] eq ")") {
            my $lb=pop @lbs;
            if (!defined $lb) {
                warn "unbalanced RNA struct.\n";
                next;
            }
            my $rb=$k+1;
            $table->{$lb}=$rb;
            $table->{$rb}=$lb;
        }
    }
    if (@lbs) {
        warn "unbalanced RNA struct.\n";
    }
    return;
}

# Classify a sub-structure: "-" when it holds both bracket kinds, "5p" when
# it holds only "(", otherwise "3p".
sub which_arm {
    my $substruct=shift;
    my $has_left=($substruct=~/\(/);
    my $has_right=($substruct=~/\)/);
    return "-" if $has_left && $has_right;
    return "5p" if $has_left;
    return "3p";
}

# Length of the longest run of unpaired bases (".") in a structure.
sub biggest_bulge {
    my $struct=shift;
    my $max_bulge=0;
    while ($struct=~/(\.+)/g) {
        my $bulge_size=length $1;
        if ($bulge_size > $max_bulge) {
            $max_bulge=$bulge_size;
        }
    }
    return $max_bulge;
}

# Asymmetry of region $a1..$a2: for each pair of consecutive paired bases,
# add the absolute value of (step on this strand + step on the partner strand).
sub get_asy {
    my($table,$a1,$a2)=@_;
    my ($pre_i,$pre_j);
    my $asymmetry=0;
    foreach my $i ($a1..$a2) {
        if (defined $table->{$i}) {
            my $j=$table->{$i};
            if (defined $pre_i && defined $pre_j) {
                my $diff=($i-$pre_i)+($j-$pre_j);
                $asymmetry += abs($diff);
            }
            $pre_i=$i;
            $pre_j=$j;
        }
    }
    return $asymmetry;
}

# Index of the record with the highest count (field 2) in the list given by
# reference; the first one wins on ties, -1 when no count is above 0.
sub peaks{
    my @cluster=@{$_[0]};

    return if(@cluster<1);

    my $max=0; my $index=-1;
    for (my $i=0;$i<@cluster;$i++) {
        if($cluster[$i][2]>$max){
            $max=$cluster[$i][2];
            $index=$i;
        }
    }
    return($index);
}

# Store the top read of a block of overlapping reads in %cites when more
# than 80% of the block's reads start within 2 nt of it and the top read
# itself accounts for more than half of those.
# Arguments: block (array ref), chromosome, strand.
sub find_cites{
    my @tmp=@{$_[0]};
    my $i=peaks(\@tmp);
    return unless defined $i;

    my $start=$tmp[$i][0];
    my $total=0; my $node5=0;
    for (my $j=0;$j<@tmp ;$j++) {
        $total+=$tmp[$j][2];
        $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);
    }
    return unless $total>0 && $node5>0;
    push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);
    return;
}

# Union of the intervals ($s1,$e1) and ($s2,$e2): (smaller start, larger end).
sub newpos{
    my ($s1,$e1,$s2,$e2)=@_;
    my $s=$s1>$s2 ? $s2 : $s1;
    my $e=$e1>$e2 ? $e1 : $e2;
    return($s,$e);
}

# Reverse a sequence.
sub rev{
    my($sequence)=@_;
    return scalar reverse $sequence;
}

# Complement a sequence (result is upper-case DNA).
sub com{
    my($sequence)=@_;
    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
    return $sequence;
}

# Reverse complement of a sequence.
sub revcom{
    my($sequence)=@_;
    return rev(com($sequence));
}

# Find the strand in a text field, parsing different blast formats: "-" when
# the text contains "-" or "minus" (any case), otherwise "+".
# Currently not called by the main flow.
sub find_strand{
    my($other)=@_;
    return "-" if $other=~/-/ || $other=~/minus/i;
    return "+";
}

# Print the help text and exit with status 1.
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -map -g -d -f -o -s -e
options:
 -map input file# align result # bst. format
 -g input file # genome sequence fasta format
 -d <int> Maximal space between miRNA and miRNA* (200)
 -f <int> Flank sequence length of miRNA precursor (10)
 -o output file# percursor fasta file
 -s output file# precursor structure file
 -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)

 -h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: quantify.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: quantify known miRNAs.
#   1. build a bowtie index of the precursors, map the mature sequences and
#      the reads against it (sub mapping)
#   2. record mature positions (sub maturePosOnPre) and read positions
#      (sub readPosOnPre) on each precursor
#   3. fold every precursor with RNAfold and derive the star position of
#      each mature (sub structure / other_pair)
#   4. write the read alignment (.aln), the per-sample mature/star/total
#      expression table (.txt) and the moRs candidates (.moRs)
# Read IDs must end in ":<n1>_<n2>_..._x<total>", one count per sample.
my $version=1.00;

use strict;
use File::Path;
use File::Basename;
use File::Copy;
use File::Spec;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h");
if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
    usage();
}

# The script chdir()s into its working directory below, so every path is
# made absolute first; relative paths would otherwise silently point into
# the working directory.
my $read=File::Spec->rel2abs($opts{'r'});
my $pre=File::Spec->rel2abs($opts{'p'});
my $mature=File::Spec->rel2abs($opts{'m'});

my $dir=File::Spec->rel2abs($opts{'o'});
unless ($dir=~/\/$/) {$dir .="/";}

my $threads=defined $opts{'t'} ? $opts{'t'} : 1;
my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0;

# number of nucleotides around the mature/star that still count as mature/star reads
my $upstream=defined $opts{'e'} ? $opts{'e'} : 2;
my $downstream=defined $opts{'f'} ? $opts{'f'} : 5;

my $marks=defined $opts{'tag'} ? $opts{'tag'} : "";

my $tmpdir=$dir."known_miRNA_Express";
mkpath($tmpdir) unless -d $tmpdir;   # also creates $dir; croaks on failure
chdir $tmpdir or die "Cannot change into $tmpdir: $!\n";

my $pre_file_name=basename($pre);
if (File::Spec->rel2abs($pre_file_name) ne $pre) {
    copy($pre,$pre_file_name) or die "Cannot copy $pre into $tmpdir: $!\n";
}

mapping(); # matures align to precursors && reads align to precursors

my %pre_mature; # $pre_mature{pre_id}{mature_id}{"mature"|"star"}=[start,end] (1-based)
maturePosOnPre(); # mature positions on precursor

my %pre_read; # $pre_read{pre_id}=[ [bowtie columns of one hit], ... ]
readPosOnPre(); # read positions on precursors

# without -tag, name the samples Smp1;Smp2;... after the number of counts in a read ID
if(!(defined $opts{'tag'})){
    my ($first_hits)=values %pre_read;
    if (defined $first_hits) {
        my ($counts)=read_counts($first_hits->[0][0]);
        $marks .="Smp$_;" foreach (1..scalar @$counts);
    }
}

my %pre; # precursor sequences: $pre{pre_id}="CGTA...."
attachPre();

my $preno=scalar (keys %pre);
print "Total Precursor Number is $preno !!!!\n";

my %struc; # $struc{pre_id}{"struc"}=dot-bracket string; $struc{pre_id}{"mfe"}=energy
structure();


##### analysis and print out && moRs
my $aln=$dir."known_microRNA_express.aln";
my $list=$dir."known_microRNA_express.txt";
my $moRs=$dir."known_microRNA_express.moRs";

link_output($mature,$dir."known_microRNA_mature.fa");
link_output($pre,$dir."known_microRNA_precursor.fa");

open(my $aln_fh,">",$aln) or die "Cannot write $aln: $!\n";
open(my $list_fh,">",$list) or die "Cannot write $list: $!\n";
open(my $mors_fh,">",$moRs) or die "Cannot write $moRs: $!\n";

my @marks=split/\;/,$marks;
my $mark_cols=join("\t",@marks);

print $list_fh "#matueID\tpreID\tpos1\tpos2";
foreach my $suffix ("_matureExp","_starExp","_totalExp") {
    foreach my $mark (@marks) {
        print $list_fh "\t",$mark,$suffix;
    }
}
print $list_fh "\n";
print $aln_fh "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t$mark_cols\ttotal\n";
print $mors_fh "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t$mark_cols\ttotal\ttag_number\tsequence\n";

my @moRs; # one record per precursor with reads: {header, percent, blocks=>[[start,end,line],...]}

# keys are sorted only to make the output order reproducible between runs
foreach my $key (sort keys %pre) {
    print $aln_fh ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n";
    next if(! (exists $pre_read{$key}));
    my @array=sort{$a->[3]<=> $b->[3]} @{$pre_read{$key}};

    my $length=length($pre{$key});

    my $maxline=-1;my $max=0; ### index of the most expressed read
    my $totalReadsNo=0;
    my @not_over=(); ### reads as [pre_id,start,end,read_id,total,strand] for the moRs analysis

####print out Aln file start
    foreach my $i (0..$#array) {
        my $maps=$array[$i][3]+1;
        my $mape=$array[$i][3]+length($array[$i][4]);
        # the read is padded with dots so that it lines up with the precursor
        my $str="." x ($maps-1);
        $str .=$array[$i][4];
        $str .="." x ($length-$mape);
        $str .=" ";

        my ($sample,$total)=read_counts($array[$i][0]);
        print $aln_fh $str,join("\t",@$sample),"\t",$total,"\n";

        if($total>$max){$max=$total; $maxline=$i;}
        $totalReadsNo+=$total;

        push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"];
    }
####print out Aln file end

#### express list start
    if (!(exists($pre_mature{$key}))) {
        # no annotated mature: the most expressed read is taken as the mature
        my $ms=$array[$maxline][3]+1;
        my $me=$array[$maxline][3]+length($array[$maxline][4]);
        my ($ss,$se)=other_pair($ms,$me,$struc{$key}{'struc'});

        my ($mexp,$sexp,$texp)=express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
        print $list_fh join("\t",$key,$key,"mature:$ms..$me","star:$ss..$se",@$mexp,@$sexp,@$texp),"\n";
    }
    else{
        foreach my $maID (sort keys %{$pre_mature{$key}}) {
            my ($ms,$me)=@{$pre_mature{$key}{$maID}{"mature"}}[0,1];
            my ($ss,$se)=@{$pre_mature{$key}{$maID}{"star"}}[0,1];
            my ($mexp,$sexp,$texp)=express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array);
            print $list_fh join("\t",$maID,$key,"mature:$ms..$me","star:$ss..$se",@$mexp,@$sexp,@$texp),"\n";
        }
    }
#### express list end

#### analysis moRs start
    my @blocks; my $m_texp=0; ### moRs informations

    while (@not_over>0) {
        my @over=@not_over;
        @not_over=();

        # seed the block with the most abundant tag
        my $m_max=0;my $m_maxline=-1;my $m_no=1;
        foreach my $i (0..$#over) {
            if ($m_max<$over[$i][4]) {
                $m_max=$over[$i][4];
                $m_maxline=$i;
            }
        }
        my $m_start=$over[$m_maxline][1];
        my $m_end=$over[$m_maxline][2];
        my $m_exp=$m_max;
        my ($seed_counts)=read_counts($over[$m_maxline][3]);
        my @m_exp=@$seed_counts;

        # add the reads whose two ends both lie within 3 nt of the seed tag
        foreach my $i (0..$#over) {
            next if($i==$m_maxline);
            my @m_array=@{$over[$i]};
            if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) {
                $m_exp+=$m_array[4];
                $m_no++;
                my ($m_nums)=read_counts($m_array[3]);
                foreach my $j (0..$#{$m_nums}) {
                    $m_exp[$j] +=$m_nums->[$j];
                }
            }
            # reads outside the block go to the next round; reads spanning the block are dropped
            elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@m_array];}
        }
        if($m_exp>5){ ### a block needs more than 5 reads
            $m_texp+=$m_exp;
            my $string=subseq($pre{$key},$m_start,$m_end,"+");
            push @blocks,[$m_start,$m_end,join("\t","",$m_start,$m_end,@m_exp,$m_exp,$m_no,$string)];
        }
    }

    my $percent=$totalReadsNo>0 ? sprintf("%.2f",$m_texp/$totalReadsNo) : "0.00";
    push @moRs,{
        header=>">$key\t+\t$m_texp\t$percent\t".scalar(@blocks)."\t$pre{$key}",
        percent=>$percent,
        blocks=>[@blocks],
    };
#### analysis moRs end
}

##### moRs print out start
foreach my $rec (@moRs) {
    next if (@{$rec->{blocks}}<=2);   # needs more than two blocks
    next if($rec->{percent}<0.95);    # blocks must hold (rounded) >=95% of the reads
    next unless has_neighbouring_blocks($rec->{blocks});
    print $mors_fh "$rec->{header}\n";
    foreach my $block (@{$rec->{blocks}}) {
        print $mors_fh "$block->[2]\n";
    }
}
###### moRs print out end
close($aln_fh) or die "Error while writing $aln: $!\n";
close($list_fh) or die "Error while writing $list: $!\n";
close($mors_fh) or die "Error while writing $moRs: $!\n";


################### Sub programs #################

# read_counts($read_id) -> (\@per_sample_counts,$total)
# Splits the ":<n1>_<n2>_..._x<total>" suffix of a read ID.
sub read_counts{
    my ($id)=@_;
    $id=~/:([\d|_]+)_x(\d+)$/ or die "Unexpected read ID format: $id\n";
    return ([split/_/,$1],$2);
}

# has_neighbouring_blocks(\@blocks) -> 1 if some block starts 0-3 nt or
# 18-25 nt after the end of another block, else 0.
sub has_neighbouring_blocks{
    my ($blocks)=@_;
    foreach my $i (0..$#{$blocks}) {
        foreach my $j (0..$#{$blocks}) {
            next if($i==$j);
            my $gap=$blocks->[$j][0]-$blocks->[$i][1];
            return 1 if(($gap>=0 && $gap<=3) || ($gap>=18 && $gap<=25));
        }
    }
    return 0;
}

# express($ms,$me,$ss,$se,\@hits) -> (\@mature,\@star,\@total)
# Per-sample sums of the reads lying completely inside the mature window
# [$ms,$me], inside the star window [$ss,$se], and of all reads.
sub express{
    my ($ms,$me,$ss,$se,$read)=@_;
    my ($first)=read_counts($$read[0][0]);
    my $nsample=scalar @$first;
    my @mexp=(0) x $nsample;
    my @sexp=(0) x $nsample;
    my @texp=(0) x $nsample;

    foreach my $hit (@{$read}) {
        my $start=$hit->[3]+1;
        my $end=$hit->[3]+length($hit->[4]);
        my ($nums)=read_counts($hit->[0]);
        my $in_mature=($start>=$ms && $end<=$me);
        my $in_star=($start>=$ss && $end<=$se);

        foreach my $j (0..$#{$nums}) {
            $texp[$j]+=$nums->[$j];
            $mexp[$j]+=$nums->[$j] if($in_mature);
            $sexp[$j]+=$nums->[$j] if($in_star);
        }
    }
    return(\@mexp,\@sexp,\@texp);
}

# Fold every precursor and compute the star position of every mapped mature.
# All precursors are folded because the main loop prints the structure of
# each of them and needs it for precursors that have reads but no mature.
sub structure{
    foreach my $key (keys %pre_mature) {
        if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";}
    }
    foreach my $key (keys %pre) {
        my ($str,$mfe)=fold($pre{$key});
        $struc{$key}{"struc"}=$str;
        $struc{$key}{"mfe"}=$mfe;

        next unless exists $pre_mature{$key};
        foreach my $id (keys %{$pre_mature{$key}}) {
            my ($ms,$me)=@{$pre_mature{$key}{$id}{"mature"}}[0,1];
            @{$pre_mature{$key}{$id}{"star"}}=other_pair($ms,$me,$str);
        }
    }
}

# fold($sequence) -> ($dot_bracket,$mfe) as reported by "RNAfold --noPS".
sub fold{
    my ($seq)=@_;
    my $quoted=shell_quote($seq);
    my $rnafold=`printf '%s\\n' $quoted | RNAfold --noPS`;
    die "RNAfold failed (exit status ".($?>>8).")\n" if($?!=0);
    # output fields: sequence, structure, energy; the energy may be split
    # as "(" "-5.30)", so it is always the last field
    my @rnafolds=split/\s+/,$rnafold;
    die "Unexpected RNAfold output for sequence $seq\n" if(@rnafolds<3);
    my $str=$rnafolds[1];
    my $mfe=$rnafolds[-1];
    $mfe=~s/\(//;
    $mfe=~s/\)//;
    return ($str,$mfe);
}

# other_pair($start,$end,$structure) -> ($star_start,$star_end)
# Finds the first paired base of [$start,$end] and places a window of the
# same length on its partner, shifted by the 2 nt overhang and clipped to
# the precursor. Returns undefined values if no base in the window is paired.
sub other_pair{
    my ($start,$end,$structure)=@_;
    ##### Nucleotide complementary: $a2b{position}=partner position (1-based)
    my @tmp=split//,$structure;
    my %a2b; my @bps;
    for (my $i=0;$i<@tmp;$i++) {
        if ($tmp[$i] eq "("){push @bps,$i+1 ; next;}
        if ($tmp[$i] eq ")") {
            my $up=pop @bps;
            $a2b{$i+1}=$up;
            $a2b{$up}=$i+1;
        }
    }
    ##### search star position
    my $n=0;my $startpos; my $endpos;
    for (my $i=$start;$i<=$end ; $i++) {
        if (defined $a2b{$i}) {
            my $partner=$a2b{$i};
            $endpos=$partner+$n+2;
            if($endpos>length($structure)){$endpos=length($structure);}
            $startpos=$partner+$n+2-($end-$start);
            if($startpos<1){$startpos=1;}
            last;
        }
        $n++;
    }
    return ($startpos,$endpos);
}

# Load the precursor FASTA (the copy in the working directory) into %pre.
sub attachPre{
    open(my $in,"<",$pre_file_name) or die "Cannot open $pre_file_name: $!\n";
    my $name;
    while (my $aline=<$in>) {
        chomp $aline;
        if ($aline=~/^>(\S+)/) {
            $name=$1;
            next;
        }
        next unless defined $name;   # sequence text before the first header
        $pre{$name} .=$aline;
    }
    close $in;
}

# Load the read alignments, grouped by precursor, into %pre_read.
sub readPosOnPre{
    open(my $in,"<","read_mapped.bwt") or die "Cannot open read_mapped.bwt: $!\n";
    while (my $aline=<$in>) {
        chomp $aline;
        my @tmp=split/\t/,$aline;
        next if(@tmp<5);
        # every later step parses the counts out of the read ID
        if ($tmp[0]!~/:[\d|_]+_x\d+$/) {
            warn "Skipping read with unexpected ID format: $tmp[0]\n";
            next;
        }
        push @{$pre_read{$tmp[2]}},[@tmp];
    }
    close $in;
}

# Load the mature alignments into %pre_mature. A mature is only accepted on
# a precursor whose (lower-cased) ID is contained in the mature ID.
sub maturePosOnPre{
    open(my $in,"<","mature_mapped.bwt") or die "Cannot open mature_mapped.bwt: $!\n";
    while (my $aline=<$in>) {
        chomp $aline;
        my @tmp=split/\t/,$aline;
        next if(@tmp<5);
        my $mm=lc($tmp[0]);
        my $pm=lc($tmp[2]);

        # literal substring test: IDs may contain regex metacharacters
        next if(index($mm,$pm)<0);
        $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}=[$tmp[3]+1,$tmp[3]+length($tmp[4])];
    }
    close $in;
}

# Build the bowtie index and map matures and reads against the precursors.
sub mapping{
    my $pre_q=shell_quote($pre_file_name);
    my $mature_q=shell_quote($mature);
    my $read_q=shell_quote($read);

## build bowtie index
    run_cmd("bowtie-build $pre_q miRNA_precursor > /dev/null");

## map mature sequences against precursors
    run_cmd("bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature_q > mature_mapped.bwt 2> run.log");

## map reads against precursors (appends to run.log instead of overwriting it)
    run_cmd("bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read_q --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2>> run.log");
}

# Run a shell command and stop with a clear message if it fails.
sub run_cmd{
    my ($cmd)=@_;
    system($cmd)==0 or die "Command failed (exit status ".($?>>8)."): $cmd\n";
}

# Quote one argument for /bin/sh.
sub shell_quote{
    my ($arg)=@_;
    $arg=~s/'/'\\''/g;
    return "'$arg'";
}

# Create (or refresh) a symbolic link in the output directory.
sub link_output{
    my ($target,$link)=@_;
    unlink $link if(-l $link);
    symlink($target,$link) or warn "Cannot link $link to $target: $!\n";
}

# subseq($seq,$beg,$end,$strand) -> upper-case subsequence, 1-based inclusive,
# reverse complemented for strand "-".
sub subseq{
    my ($seq,$beg,$end,$strand)=@_;

    my $subseq=substr($seq,$beg-1,$end-$beg+1);
    if ($strand eq "-") {
        $subseq=revcom($subseq);
    }
    return uc $subseq;
}

sub revcom{
    my $seq=shift;
    $seq=~tr/ATCGatcg/TAGCtagc/;
    $seq=reverse $seq;
    return uc $seq;
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -r -p -m -mis -t -e -f -tag -o -time
mandatory parameters:
-p precursor.fa miRNA precursor sequences from miRBase # must be absolute path
-m mature.fa miRNA sequences from miRBase # must be absolute path
-r reads.fa your read sequences #must be absolute path

-o output directory

options:
-mis [int] number of allowed mismatches when mapping reads to precursors, default 0
-t [int] threads number,default 1
-e [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f [int] number of nucleotides downstream of the mature sequence to consider, default 5
-tag [string] sample marks# eg. sampleA;sampleB;sampleC
-time sting #make directory time,default is the local time
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: quantify_siRNA.pl
#Author: chentt
#Email:
#Date: 2012-4-6
#Modified:
#Description: normalise per-sample cluster read counts.
#   Input rows: cluster ID "chr:start-end", two columns copied unchanged,
#   then one count column per sample.
#   Output value: count / (end-start+1) / sample depth * 1e9
my $version=1.00;

use strict;
use Getopt::Long;
use Scalar::Util qw(looks_like_number);

my %opts;
GetOptions(\%opts,"i=s","o=s","d=s","h");
if (!(defined $opts{i} and defined $opts{d} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $input=$opts{'i'};
my $output=$opts{'o'};

# -d: comma separated sequencing depths, in the order of the sample columns
my @Total=split/\,/,$opts{'d'};
foreach my $depth (@Total) {
    # a zero or non-numeric depth would otherwise end in a division by zero
    die "Invalid depth '$depth' in -d: every depth must be a positive number\n"
        unless(looks_like_number($depth) && $depth>0);
}

open(my $in,"<",$input) or die "Cannot open input file $input: $!\n";
open(my $out,">",$output) or die "Cannot open output file $output: $!\n";
while (my $aline=<$in>) {
    chomp $aline;
    next if($aline eq "");

    # header with quoted column names: strip the quotes, mark it as a comment
    if ($aline=~/^\"/){
        my @title=split/\t/,$aline;
        foreach my $name (@title) {
            $name=~s/^\"(\S+)\"$/$1/;
        }
        print $out "\#",join("\t",@title),"\n";
        next;
    }
    # header that already is a comment line: keep it as it is
    if ($aline=~/^\#/){
        print $out "$aline\n";
        next;
    }

    my @temp=split/\t/,$aline;
    my (undef,$range)=split/:/,$temp[0];
    my ($start,$end)=defined $range ? split(/-/,$range) : ();
    unless (defined $end && $start=~/^\d+$/ && $end=~/^\d+$/ && $end>=$start) {
        die "Line $.: cannot read the cluster coordinates from '$temp[0]' (expected chr:start-end)\n";
    }
    my $length=$end-$start+1;

    print $out "$temp[0]\t$temp[1]\t$temp[2]";
    for (my $i=3;$i<@temp;$i++) {
        my $depth=$Total[$i-3];
        die "Line $.: -d gives no depth for sample column ".($i-2)."\n" unless(defined $depth);
        my $rpkm=sprintf("%.2f",$temp[$i]/$length/$depth*1000000000);
        print $out "\t$rpkm";
    }
    print $out "\n";
}
close $in;
close($out) or die "Error while writing $output: $!\n";

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -d -o -h
options:
-i input cluster file
-o output file
-d depth of each sample, comma separated, in the order of the sample columns
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: rfam.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: map reads against Rfam with bowtie.
#   Works in <output directory>/rfam_match and writes rfam_mapped.bwt,
#   rfam_mapped.fa, rfam_not_mapped.fa and run.log there. If -index does
#   not name an existing bowtie index, the -ref FASTA is copied with U
#   replaced by T and indexed with bowtie-build.
my $version=1.00;

use strict;
use Getopt::Long;
use File::Basename;
use File::Path;
use File::Spec;

my %opts;
GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","h");
if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
    usage();
}

# The script chdir()s into its working directory below, so every path is
# made absolute first.
my $filein=File::Spec->rel2abs($opts{'i'});
my $dir=File::Spec->rel2abs($opts{'o'});
my $rfam=defined $opts{'ref'} ? File::Spec->rel2abs($opts{'ref'}) : undef;
my $mis=defined $opts{'v'}? $opts{'v'} : 0;
my $index=(defined $opts{'index'} && $opts{'index'} ne "") ? File::Spec->rel2abs($opts{'index'}) : "";
my $threads=defined $opts{'p'} ? $opts{'p'} : 1;

my $mapdir=File::Spec->catdir($dir,"rfam_match");
mkpath($mapdir) unless -d $mapdir;   # also creates $dir; croaks on failure
chdir $mapdir or die "Cannot change into $mapdir: $!\n";

###check genome index
unless ($index ne "" && -s $index.".1.ebwt") {
    die "No usable bowtie index (-index) and no Rfam FASTA file (-ref) given\n" unless(defined $rfam);
    $rfam=checkACGT($rfam);
    my $rfam_q=shell_quote($rfam);
    run_cmd("bowtie-build $rfam_q $rfam_q > /dev/null");
    $index=$rfam;
}
### genome mapping
run_cmd("bowtie -v $mis -f -p $threads -k 1 ".shell_quote($index)." ".shell_quote($filein)." --al rfam_mapped.fa --un rfam_not_mapped.fa > rfam_mapped.bwt 2> run.log");

# checkACGT($fasta) -> name of the converted copy in the current directory
# Copies a FASTA file, replacing every U/u of the sequence lines by T.
# The copy is written under a temporary name first, because the source may
# be the very file that is being replaced.
sub checkACGT{
    my ($source)=@_;
    my $local=basename($source);
    my $tmp="$local.tmp.$$";
    open(my $in,"<",$source) or die "Cannot open $source: $!\n";
    open(my $out,">",$tmp) or die "Cannot write $tmp: $!\n";
    while (my $aline=<$in>) {
        $aline=~tr/Uu/TT/ if($aline!~/^>/);
        print $out $aline;
    }
    close $in;
    close($out) or die "Error while writing $tmp: $!\n";
    rename($tmp,$local) or die "Cannot rename $tmp to $local: $!\n";
    return $local;
}

# Run a shell command and stop with a clear message if it fails.
sub run_cmd{
    my ($cmd)=@_;
    system($cmd)==0 or die "Command failed (exit status ".($?>>8)."): $cmd\n";
}

# Quote one argument for /bin/sh.
sub shell_quote{
    my ($arg)=@_;
    $arg=~s/'/'\\''/g;
    return "'$arg'";
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file# input reads fasta/fastq file
-ref input file# rfam file, which do not contain miRNAs
-index file-prefix #(must be indexed by bowtie-build) The parameter
 string must be the prefix of the bowtie index. For instance, if
 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
 the prefix is 'h_sapiens_37_asm'.##can be null
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0;

-p/--threads <int> number of alignment threads to launch (default: 1)

-o output directory
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#==========================================================================================
# Filename: sRNA_plot.pl
# Comment:  plot the per-sample sRNA cluster density along every chromosome
# Input:    1. -l   chromosome lengths: name <tab> length
#           2. -c   cluster table: ID "chr:start-end", two further columns,
#                   then one value column per sample
#           3. -cen centromeres (optional): name <tab> start <tab> end
# Output:   -o   SVG graph
#           -out binned density table (optional)
# The chromosomes are cut into bins of -span bp; a cluster adds its value to
# every bin it touches.
#==========================================================================================
use strict;
my $version=1.00;
use SVG;
use Getopt::Long;
my %opt;
GetOptions(\%opt,"g=s","l=s","span=s","c=s","o=s","out=s","cen:s","mark=s","h");
if (!(defined $opt{o} and defined $opt{l} and defined $opt{c}) || defined $opt{h}) {
    usage();
}
# -g is still accepted, but the gene list is not part of the plot

my $span=defined $opt{span} ? $opt{span} : 50000;
die "-span must be a positive number\n" unless($span=~/^\d+(?:\.\d+)?$/ && $span>0);

my $mark=defined $opt{mark} ? $opt{mark} : "";
my @mark=split/\#/,$mark;   # sample names, in the order of the value columns
#===============================Define Attribute==========================================
my %attribute=(
    text=>{
        'stroke'=>"#000000",
        'stroke-width'=>0.5
    },
    line=>{
        'stroke'=>"black",
        'stroke-width'=>1
    },
    intron=>{
        'stroke-width'=>1.5
    },
    font=>{
        'font-family'=>"Arial"
    },
);
#===============================chromosome lengths========================================
my %length;
open(my $length_fh,"<",$opt{l}) or die "cannot open the file $opt{l}: $!\n";
while (my $aline=<$length_fh>) {
    chomp $aline;
    next if($aline=~/^\#/);
    my @temp=split/\t/,$aline;
    next if(@temp<2);
    $temp[0]=~s/^c/C/;
    $length{$temp[0]}=$temp[1];
}
close $length_fh;

my $max_length=0;
foreach my $chr (sort keys %length) {
    if ($max_length<$length{$chr}) {
        $max_length=$length{$chr};
    }
    print "$chr\n";
}
die "No chromosome length found in $opt{l}\n" if($max_length<=0);
#===============================centromeres===============================================
my %centromere;
if (defined($opt{cen})) {
    open(my $cen_fh,"<",$opt{cen}) or die "cannot open the file $opt{cen}: $!\n";
    while (my $aline=<$cen_fh>) {
        chomp $aline;
        next if($aline=~/^\#/);
        my @temp=split/\t/,$aline;
        next if(@temp<3);
        $temp[0]=~s/^c/C/;
        $centromere{$temp[0]}=[$temp[1],$temp[2]];
    }
    close $cen_fh;
}
#====================================cluster data=======================================
my %cluster;   # $cluster{chr}=[ [ID,start,end,value of sample 1,...], ... ]
open(my $cluster_fh,"<",$opt{c}) or die "cannot open the file $opt{c}: $!\n";
while (my $aline=<$cluster_fh>) {
    next if($aline=~/^\#/);
    chomp $aline;
    my @temp=split/\t/,$aline;
    next if(@temp<1);
    my @ID=split/\:/,$temp[0];
    next unless(defined $ID[1]);
    my @posi=split/\-/,$ID[1];
    next unless(defined $posi[1]);
    # columns 2 and 3 are not sample values
    push @{$cluster{$ID[0]}},[$temp[0],$posi[0],$posi[1],@temp[3..$#temp]];
}
close $cluster_fh;
print "@mark\n$mark\n";
#====================================binned density=====================================
my %cluster_density;   # $cluster_density{chr}[bin][sample]
foreach my $chr (keys %cluster) {
    foreach my $clu (@{$cluster{$chr}}) {
        my $start=int($clu->[1]/$span);
        my $end=int($clu->[2]/$span);
        foreach my $bin ($start..$end) {
            foreach my $s (0..$#mark) {
                $cluster_density{$chr}[$bin][$s]+=defined $clu->[3+$s] ? $clu->[3+$s] : 0;
            }
        }
    }
}

my $txt_fh;
if (defined $opt{out}) {
    open($txt_fh,">",$opt{out}) or die "cannot write the file $opt{out}: $!\n";
}
my $max_all_density=0;
foreach my $chr (sort keys %cluster_density) {
    my $bins=$cluster_density{$chr};
    print "$#{$bins}\n";
    foreach my $k (0..$#{$bins}) {
        my @row=($chr,$k);
        foreach my $s (0..$#mark) {
            # bins without any cluster are drawn and reported as 0
            $bins->[$k][$s]=0 unless(defined $bins->[$k][$s]);
            if ($bins->[$k][$s]>$max_all_density) {
                $max_all_density=$bins->[$k][$s];
            }
            push @row,$bins->[$k][$s];
        }
        print {$txt_fh} join("\t",@row),"\n" if($txt_fh);
    }
}
if ($txt_fh) {
    close($txt_fh) or die "Error while writing $opt{out}: $!\n";
}
print "max density: $max_all_density\n";
#====================================layout=============================================
my $tail_margin=30;
my $XOFFSET=50;
my $YOFFSET=60;
my $chr_length=600;   # drawn length of the longest chromosome
my $Xscale=$chr_length/$max_length;   # all x coordinates are converted with this scale

# one row per chromosome of the length table
my $chr_number=scalar keys %length;
my $width=1000;
my $heigth=100+130*$chr_number;
my $svg=SVG->new(width=>$width, height=>$heigth);
my $canvas_start_x=$XOFFSET;
my $canvas_end_x=$XOFFSET+$max_length*$Xscale;
my $canvas_start_y=$YOFFSET;
my $canvas_end_y=$YOFFSET;
my $chr_heigth=$heigth-$YOFFSET-$tail_margin;
print "chr number:$chr_number\n";
my $one_chr_heigth=$chr_heigth/$chr_number;
my $Yscale=$max_all_density>0 ? ($one_chr_heigth-15)/$max_all_density : 0;
#=======================================the top ruler====================================
$svg->line(id=>'l1',x1=>$canvas_start_x,y1=>$canvas_start_y,x2=>$canvas_end_x,y2=>$canvas_end_y,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'});
my $long_scale=int ($max_length/10);   # ten major ticks
foreach my $i (0..10) {
    my $tick_x=$XOFFSET+$long_scale*$i*$Xscale;
    draw_line($tick_x,$YOFFSET,$tick_x,$YOFFSET-5);
    my $cdata=int ($long_scale*$i/1000000);
    draw_text($tick_x,$YOFFSET-10,"middle",12,$cdata."M");
}
#=======================================Y axis ticks=====================================
# two ticks per row, at half and at the whole leading part of the maximal
# density, labelled as <value>e<power of ten>
my $s10=1;
my $e10=0;
my $chr_max=$max_all_density;
while ($chr_max>10) {
    $chr_max=int($chr_max/10);
    $s10=$s10*10;
    $e10++;
}
$chr_max=$chr_max/2;
#=======================================chromosome rows==================================
my $cc=1;
foreach my $chr (sort keys %length) {
    my $chr_start_x=$XOFFSET;
    my $chr_end_x=$XOFFSET+$length{$chr}*$Xscale;
    my $chr_start_y=$YOFFSET+$cc*$one_chr_heigth;

    draw_line($chr_start_x,$chr_start_y,$chr_end_x,$chr_start_y);
    draw_text($XOFFSET-40,$chr_start_y,"left",12,$chr);
    # chromosome length in Mb, like the ruler labels
    my $m_length=int($length{$chr}/1000000);
    draw_text($chr_end_x+20,$chr_start_y,"left",12,$m_length."M");

    if (defined $centromere{$chr}) {
        my ($cen_start,$cen_end)=@{$centromere{$chr}};
        $svg->rect('x',$XOFFSET+$cen_start*$Xscale,'y',$chr_start_y-2,'width',($cen_end-$cen_start+1)*$Xscale,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue");
    }

    # one polyline per sample, joining the densities of neighbouring bins
    my $bins=$cluster_density{$chr} || [];
    foreach my $s (0..$#mark) {
        my $c_red=($s+1)/@mark*255;
        for (my $i=0;$i<$#{$bins};$i++) {
            my $x1=$XOFFSET+$i*$span*$Xscale;
            my $x2=$XOFFSET+($i+1)*$span*$Xscale;
            my $y1=$chr_start_y-$bins->[$i][$s]*$Yscale;
            my $y2=$chr_start_y-$bins->[$i+1][$s]*$Yscale;
            $svg->line('x1',$x1,'y1',$y1,'x2',$x2,'y2',$y2,'stroke',"rgb($c_red,125,0)",'stroke-width',0.3);
        }
    }
    #=======Y axis
    draw_line($chr_start_x,$chr_start_y,$chr_start_x,$chr_start_y-$one_chr_heigth+15);
    foreach my $i (1..2) {
        my $y1=$chr_start_y-$chr_max*$s10*$Yscale*$i;
        my $tick_value=$chr_max*$i;
        draw_line($chr_start_x,$y1,$chr_start_x+3,$y1);
        draw_text($chr_start_x-26,$y1+4,"left",8,$tick_value."e".$e10);
    }
    $cc++;
}
#=======================================legend===========================================
foreach my $s (0..$#mark) {
    my $c_red=($s+1)/@mark*255;
    print "**$c_red\n";
    $svg->line('x1',$canvas_end_x+100,'y1',$YOFFSET+$s*20+30,'x2',$canvas_end_x+130,'y2',$YOFFSET+$s*20+30,'stroke',"rgb($c_red,125,0)",'stroke-width',1);
    draw_text($canvas_end_x+150,$YOFFSET+$s*20+5+30,"left",10,$mark[$s]);
}
if (defined($opt{cen})) {
    $svg->rect('x',$canvas_end_x+100,'y',$YOFFSET+@mark*20+30,'width',30,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue");
    draw_text($canvas_end_x+150,$YOFFSET+@mark*20+30+5,"left",10,"centromere");
}

open(my $out_fh,">",$opt{o}) or die "cannot write the file $opt{o}: $!\n";
print $out_fh $svg->xmlify();
close($out_fh) or die "Error while writing $opt{o}: $!\n";

# draw_line($x1,$y1,$x2,$y2): line in the standard axis style
sub draw_line{
    my ($x1,$y1,$x2,$y2)=@_;
    $svg->line('x1',$x1,'y1',$y1,'x2',$x2,'y2',$y2,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'});
}

# draw_text($x,$y,$anchor,$size,$text): label in the standard text style
sub draw_text{
    my ($x,$y,$anchor,$size,$text)=@_;
    $svg->text('x',$x,'y',$y,'style',"fill:black;text-anchor:$anchor",'stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',$size,'font-family',$attribute{font}{'font-family'},'-cdata',$text);
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0
options:
-g genelist (accepted, not used for the plot)
-span bin size in bp, default 50000
-mark sample names, separated by #
-o output graph file name with html or svg extension
-c cluster file input
-out txt output
-l length of chr
-cen centromere
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
#Filename: sam2Bed_bowtie.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2011/11/7
#Modified:
#Description: sam2BED
#   Turns bowtie alignment lines into a table "chr strand start end" plus
#   one weighted count per sample. The counts are read from the read ID
#   ("<name>:<n1>_<n2>_..._x<total>") and divided by (column 7 + 1).
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","mark=s","o=s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $filein=$opts{'i'};
my $fileout=$opts{'o'};
# sample names are separated by "#" on the command line
my $mark=defined $opts{'mark'} ? $opts{'mark'} : "";
my @sample=split/\#/,$mark;
$mark=join"\t",@sample;

open(my $in,"<",$filein) or die "Cannot open input file $filein: $!\n";
open(my $out,">",$fileout) or die "Cannot open output file $fileout: $!\n";
print $out "#chr\tstrand\tstart\tend\t$mark\n";

my $Tags_num=0;
my @read_num;   # weighted read count of every sample
while (my $aline=<$in>) {
    chomp $aline;
    next if($aline=~/^\@/);
    my @tmp=split/\t/,$aline;
    next if(@tmp<5);   # not an alignment line

    my $strand=$tmp[1];
    my $start=$tmp[3]+1;   # column 4 is used as a 0-based offset
    my $end=$start+length($tmp[4])-1;
    my $hit=((defined $tmp[6] && $tmp[6] ne "") ? $tmp[6] : 0)+1;

    #======express caculate weighted===================================
    my @tempID=split/\:/,$tmp[0];
    my @exp=defined $tempID[1] ? split(/\_/,$tempID[1]) : ();
    pop @exp;   # the last field is the "x<total>" summary
    foreach my $j (0..$#exp) {
        $exp[$j]=sprintf("%.2f",$exp[$j]/$hit);
        $read_num[$j]+=$exp[$j];
    }
    print $out join("\t",$tmp[2],$strand,$start,$end,join("\t",@exp)),"\n";
    $Tags_num++;
}
print "Total Tags numer: $Tags_num\n";
my $read_number=join "\t",@read_num;
print "Each sample numer: $read_number\n";
close $in;
close($out) or die "Error while writing $fileout: $!\n";

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file
-mark sampleA sampleB sampleC.....
-o output file
-h help
USAGE
exit(1);
}
#!/usr/bin/perl -w
# File: siRNA.pl
#
# Driver script for the "siRNA only" analysis. It maps already collapsed reads
# (-i) to the genome with matching.pl and then chains the helper scripts found
# under -path: bed conversion, clustering, quantification, phased-siRNA
# prediction, annotation, plotting and (optionally) differential expression.
# All results are written below "<workdir>/cluster_runs/".
my $version=1.00;
use strict;
use warnings;
use Getopt::Long;
use Getopt::Std;
use threads;
#use threads::shared;
use Parallel::ForkManager;
#use lib '/leofs/biotrans/chentt/perl_module/';
#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq
print "
#####################################
#                                   #
#           sRNA cluster            #
#                                   #
#####################################
";
###########################################################################################
my $usage="$0
Options:
-i       input file# fasta
-config  input file
-g       genome file
-f       gff file

-o       workdir file
-path    script path
-t       int, number of threads [1]
-format  fastq, fq, fasta or fa
-idx     string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
         string must be the prefix of the bowtie index. For instance, if
         the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
         the prefix is 'h_sapiens_37_asm'.##can be null
-mis     int number of allowed mismatches when mapping reads to genome, default 0

-n       int max hits number,default 25; used in genome alignment
-d       int distance of tag to merged a cluster; default 100
-p       cluster method F :conventional default is F
                        T :NIBLES
-l       int the length of the upstream and downstream,default 1000;used in position annotate

-nat     natural antisense transcripts file
-repeat  repeat information file out of Repeatmasker
-deg     file config of de sample
-cen     centromere file input
-span    plot span, default 50000
";

my %options;
# Unknown options are only warned about by Getopt::Long (the example command
# above still passes -rfam/-idx2), so the return value is deliberately ignored.
GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h");
#print help if that option is used
if($options{h}){die $usage;}

# Every step below interpolates these values into a command line, so refuse to
# start without them instead of running commands with empty arguments.
for my $required (qw(i config g f path)) {
    if (!defined $options{$required} || $options{$required} eq '') {
        die "Missing required option -$required\n\n$usage";
    }
}

my $filein=$options{'i'};

#my $config=$options{'i'};
my $genome_fa=$options{'g'};
my $gff=$options{'f'};


##########################################################################################
my $predir=`pwd`;
chomp $predir;
my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;

my $path=$options{'path'};

my $t=defined($options{'t'})? $options{'t'}:1; #threads number

my $mis=defined $options{'mis'} ? $options{'mis'}:0;

my $hit=defined $options{'n'}?$options{'n'}:25;

my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;

my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;

my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";

# Only used by the (currently disabled) html report step.
my $format=defined $options{'format'} ? $options{'format'} : '';
#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
#    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
#}

# ---- state shared between the steps ------------------------------------------
# All of it is declared BEFORE the first step runs: a file-level "my $x = ..."
# that executes after a sub has already filled $x would reset the value.
my $sample_number;
my ($dir,$dir_tmp);
my $mark;               # sample names joined with tabs (log header)
my $sample_mark;        # sample names joined with '#', passed as -mark
my (@filein,@mark);     # columns 1 and 2 of the -config file
my @map_read;           # per-sample mapped read sums; last element is the total
my $map_tag=0;          # number of mapped tags (records in genome_mapped.fa)
my $preprocess;         # never set by this script; kept for the path.txt layout
my $cluster_file;
my $annotate_dir;
my $deg_dir;
my $plot_dir;
my %id;                 # sample name -> column number handed to DEGseq_2.pl
my @pairdir;            # "A_VS_B/" directory names, one per DEG comparison

################################ MAIN ##################################################
print "\ncluster program start:";
my $time=Time();
make_dir_tmp();

my $config=$options{'config'};
read_config();
$sample_number=@mark;
$mark=join "\t",@mark;
$sample_mark=join "\#",@mark;

my $bed=$dir."cluster/"."sample.bed";
my $read=$dir."cluster/"."sample_reads.cluster";
my $read_txt=$dir."cluster/"."cluster.txt";
my $rpkm=$dir."cluster/"."sample_rpkm.cluster";
for my $i (0..$#mark) {
    $id{$mark[$i]}=$i+4;
}

my $data3=$filein; ### rfam not mapped reads
genome();

bwt2bed();

cluster();

quantify();

phase();

if (defined($options{'nat'})&&defined($options{'repeat'})) {
    class();
}
else{
    get_genelist();
}

annotate();

genome_length();

plot();

if (defined($options{'deg'})) {
    dec();
    infor_merge();
}
else{infor_merge_no_dec()}
html();
print "\ncluster program end:";
Time();
############################sub program###################################################

# Run a shell command with backticks (the child's STDOUT is captured and
# discarded, as the original code did), echo the command to the log and warn
# when it exits with a non-zero status. Returns the captured output.
sub run_cmd{
    my ($cmd)=@_;
    my $output=`$cmd`;
    print "$cmd\n\n";
    if ($? != 0) {
        warn "WARNING: command exited with status ".($? >> 8).": $cmd\n";
    }
    return $output;
}

# Create "<workdir>/cluster_runs/" and its "ref/" sub directory (each one only
# if missing) and store the run directory, with trailing slash, in $dir.
sub make_dir_tmp{
    my $run_dir="$workdir/cluster_runs";
    for my $wanted ($run_dir,"$run_dir/ref/") {
        next if -d $wanted;
        mkdir($wanted) or die "Cannot create directory $wanted: $!\n";
    }

    $dir="$run_dir/";
    #print STDERR "mkdir $dir\n\n";
    return;
}

# Map the input reads to the genome with matching.pl, then sum the per-sample
# counts encoded in the header lines of genome_match/genome_mapped.fa into
# @map_read and count the records in $map_tag.
sub genome{
    my $cmd="perl $path/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir";
    $cmd.=" -index $options{idx}" if defined $options{'idx'};
    if (system($cmd) != 0) {
        warn "WARNING: command exited with status ".($? >> 8).": $cmd\n";
    }
    #=================== mapping sta ===================================================
    my $map_file=$dir."genome_match/genome_mapped.fa";
    open(my $map_fh,'<',$map_file) or die "Cannot open $map_file: $!\n";
    print "\n#each sample mapping reads sta:\n\n";
    print "#$mark\ttotal\n";
    while (my $ID=<$map_fh>) {
        chomp $ID;
        my $seq=<$map_fh>;              # records are header/sequence line pairs
        $map_tag++;
        # The text after the first ':' holds '_'-separated counts; the last
        # one carries an 'x' prefix.
        my @tmp=split/\:/,$ID;
        next unless defined $tmp[1];
        my @exp=split/\_/,$tmp[1];
        next unless @exp;
        $exp[-1] =~ s/^x//;
        for my $i (0..$#exp) {
            $map_read[$i]+=$exp[$i];
        }
    }
    close $map_fh;
    my $map_read=join"\t",@map_read;
    print "$map_read\n\n";
    print "#total mapped tags:$map_tag\n\n";
    return 0;
}

# Convert the bowtie alignment file into the bed file used for clustering.
sub bwt2bed{
    $cluster_file=$dir."cluster/";
    mkdir ($cluster_file) unless -d $cluster_file;
    print "sam file changed to bed file\n";
    my $file=$dir."genome_match/genome_mapped.bwt";

    run_cmd("perl $path/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed");
    return 0;
}

# Merge tags into clusters with the method selected by -p (F or T).
sub cluster{
    print "tags is ready to merged clusters\n\n";
    my $file=$bed;
    if ($cluster_mothod eq "F") {
        print "Using converntional method\n";
        run_cmd("perl $path/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt");
    }
    elsif($cluster_mothod eq "T"){
        print "Using nibls method\n";
        run_cmd("perl $path/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark");
    }
    else{print "\-p is wrong!\n\n";}
    return 0;
}


# Quantify the clusters; the sequencing depth passed with -d is the per-sample
# mapped read sum collected by genome().
sub quantify{
    print "clusters is ready to quantified\n\n";
    my @depth=@map_read;
    pop @depth;                         # drop the trailing "total" column
    my $depth=join ",",@depth;
    run_cmd("perl $path/quantify_siRNA.pl -i $read -d $depth -o $rpkm");
    return 0;
}

# Predict phased siRNAs from the cluster text file.
sub phase{
    $annotate_dir=$dir."annotate/";
    mkdir ($annotate_dir) unless -d $annotate_dir;
    print "clusters is to predict phase siRNA\n";
    run_cmd("perl $path/phased_siRNA.pl -i $read_txt -o $annotate_dir/phase.out");
    return 0;
}

# Annotate clusters by source (needs -nat and -repeat); ClassAnnotate.pl is also
# asked to write ref/genelist.txt via its -l option.
sub class{
    print "clusters is ready to annotate by sources\n\n";
    my $nat=$options{'nat'};
    my $repeat=$options{'repeat'};
    run_cmd("perl $path/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir/phase.out -o $annotate_dir/sample_class.anno -t $annotate_dir/nat.out -l $dir/ref/genelist.txt");
    return;
}

# Position annotation against ref/genelist.txt. The input is the class
# annotation when class() ran, otherwise the plain rpkm table.
sub annotate{
    print "clusters is ready to annotate by gff file\n\n";
    my $file;
    if (defined($options{'nat'})&&defined($options{'repeat'})) {
        $file="$annotate_dir/sample_class.anno";
    }
    else{
        $file=$rpkm;
    }
    run_cmd("perl $path/Annotate.pl -i $file -g $dir/ref/genelist.txt -d $up_down_dis -o $annotate_dir/sample_c_p.anno");
    return 0;
}

# Build ref/genelist.txt from the gff file (used when class() is skipped).
sub get_genelist{
    run_cmd("perl $path/get_genelist.pl -i $gff -o $dir/ref/genelist.txt");
    return;
}

# Read the DEG config (two whitespace-separated sample names per line, '#'
# lines are comments) and run one DEGseq comparison per line in up to 10
# parallel child processes.
sub dec{
    print "deg reading\n\n";
    my $deg_file=$options{'deg'};
    open(my $deg_fh,'<',$deg_file) or die "Cannot open deg config file '$deg_file': $!\n";
    my @deg;
    while (my $aline=<$deg_fh>) {
        chomp $aline;
        $aline=~s/\r$//;
        next if($aline=~/^\#/);
        next if($aline!~/\S/);          # blank lines used to become "_VS_/"
        my @ea=split ' ',$aline;
        if (@ea < 2) {
            warn "WARNING: skipping deg config line without two sample names: $aline\n";
            next;
        }
        push @deg,$aline;
        push @pairdir,"$ea[0]_VS_$ea[1]/";
    }
    close $deg_fh;
    $deg_dir=$dir."deg/";
    mkdir ($deg_dir) unless -d $deg_dir;
    my $max_process = 10;
    my $pm = Parallel::ForkManager->new( $max_process );
    for my $pair (@deg) {
        $pm->start and next;            # parent: go on with the next comparison
        dec_pel($pair);
        $pm->finish;
    }
    $pm->wait_all_children;
    return;
}

# Run DEGseq_2.pl for one "sampleA sampleB" line of the DEG config. Runs inside
# a forked child of dec().
sub dec_pel{
    print "\n******************\nstart:\n";
    Time();
    my $sample=shift(@_);
    my @each=split ' ',$sample;
    print "$each[0]\t$each[1]\n";
    for my $name (@each[0,1]) {
        if (!defined $id{$name}) {
            warn "WARNING: sample '$name' from the deg config is not in the sample config; comparison skipped\n";
            return;
        }
    }
    my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]/";
    mkdir ($deg_sample_dir) unless -d $deg_sample_dir;
    print "read: $read\n";
    print "deg_sample_dir: $deg_sample_dir\n";
    print "$id{$each[0]}\t$each[0]\n";
    print "$id{$each[1]}\t$each[1]\n";
    run_cmd("perl $path/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]"); #-depth1 -depth2
    print "end:\n*************************\n";
    Time();
    sleep 1;
    return;
}

# Merge the annotation table with the score file of every DEG comparison into
# total.result.
sub infor_merge{
    my ($input,$marks)=('','');
    foreach my $pair (@pairdir) {
        $marks.=" -mark $pair ";
        $input.=" -i $dir/deg/$pair/output_score.txt ";
    }
    print "@pairdir\n$input\n$marks\n";
    run_cmd("perl $path/SampleDEGseqMerge.pl $input $marks -f $annotate_dir/sample_c_p.anno -n $sample_number -o $dir/total.result");
    return;
}

# Without DEG analysis the annotation table itself is the final result.
sub infor_merge_no_dec{
    my ($from,$to)=("$annotate_dir/sample_c_p.anno","$dir/total.result");
    if (system('cp',$from,$to) != 0) {
        warn "WARNING: could not copy $from to $to\n";
    }
    return;
}

# Write the reference sequence lengths to ref/genome.length.
sub genome_length{
    run_cmd("perl $path/count_ref_length.pl -i $genome_fa -o $dir/ref/genome.length");
    return;
}

# Draw the cluster distribution plot; -span and -cen are forwarded when given.
sub plot{
    $plot_dir="$dir/plot/";
    mkdir ($plot_dir) unless -d $plot_dir;
    my $span=defined($options{span})?$options{span}:50000;
    my $cen="";
    if (defined $options{cen}) {
        $cen="-cen $options{cen}";
    }
    run_cmd("perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span $span -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt");
    return;
}

# Write path.txt, the list of result locations read by the html report script.
# One entry per line; an unset entry is written as an empty line so the line
# positions do not shift.
sub html{
    my $pathfile="$dir/path.txt";
    open(my $pa_fh,'>',$pathfile) or die "Cannot write $pathfile: $!\n";
    print $pa_fh "$config\n";
    print $pa_fh (defined $preprocess ? $preprocess : ''),"\n";
    print $pa_fh "$dir"."rfam_match\n";
    print $pa_fh "$dir"."genome_match\n";
    print $pa_fh "$cluster_file\n";
    print $pa_fh "$annotate_dir\n";
    print $pa_fh "$plot_dir\n";
    if (defined($deg_dir)) {
        print $pa_fh "$deg_dir\n";
    }
    close $pa_fh or die "Cannot close $pathfile: $!\n";
    # NOTE(review): the report step is disabled here, so result.html is not
    # produced by this script - confirm whether that is intended.
    #my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
    return;
}

# Print the local time as "YYYY-MM-DD hh:mm:ss" and return it as
# "YYYY-MM-DD-hh-mm-ss".
sub Time{
    my ($sec,$min,$hour,$day,$month,$year) = (localtime(time()))[0..5];
    my @stamp=($year+1900,$month+1,$day,$hour,$min,$sec);
    printf "%d-%02d-%02d %02d:%02d:%02d\n",@stamp;
    return sprintf("%d-%02d-%02d-%02d-%02d-%02d",@stamp);
}
#################################################################################
# Read the -config file: one "file<TAB>sample name" pair per line, filling
# @filein and @mark. Dies on a line that lacks either field.
sub read_config{
    open(my $con_fh,'<',$config) or die "Cannot open config file '$config': $!\n";
    while (my $aline=<$con_fh>) {
        chomp $aline;
        $aline=~s/\r$//;
        next if($aline!~/\S/);
        my @tmp=split/\t/,$aline;
        if (@tmp < 2 || $tmp[0] eq '' || $tmp[1] eq '') {
            #&printErr();
            die "Maybe config file have some wrong!!! (line $. of $config)\n";
        }
        push @filein,$tmp[0];
        push @mark,$tmp[1];
        #&check_rawdata($tmp[0]);
    }
    close $con_fh;
    if (!@mark) {
        die "Maybe config file have some wrong!!! (no samples in $config)\n";
    }
    return;
}
<!-- File: siRNA.xml - Galaxy tool wrapper for siRNA.pl (siRNA analysis on reads that are already collapsed). -->
<tool id="plant_sirna_v1" name="siRNA" version="1.0.0">
    <description>Program for plant siRNA analysis (siRNA only)</description>

    <requirements>
        <requirement type="set_environment">SCRIPT_PATH</requirement>
        <requirement type="package" version="0.12.7">bowtie</requirement>
        <requirement type="package" version="3.0.1">R</requirement>
        <requirement type="package" version="0.0.13">fastx_toolkit</requirement>
        <requirement type="package" version="1.96">threads</requirement>
        <requirement type="package" version="1.06">Parallel-ForkManager</requirement>
        <requirement type="package" version="2.59">SVG</requirement>
        <requirement type="package" version="1.4_001">Boost-Graph</requirement>
    </requirements>

    <command interpreter="perl">siRNA.pl
        ## Change this to accommodate the number of threads you have available.
        -t \${GALAXY_SLOTS:-4}

        -path \$SCRIPT_PATH

        ## Reference genome: a fasta from the history is passed as -g only;
        ## a built-in entry supplies the bowtie index prefix (-idx) and "prefix.fa" (-g).
        #set index_path = ''
        #if str($reference_genome.source) == "history":
            #### bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
            #set index_path = $reference_genome.own_file
            -g $index_path

        #else:
            #set index_path = $reference_genome.index.fields.path
            -g ${index_path}.fa -idx $index_path
        #end if


        ## Do or not annotate siRNAs by function
        #if str($params.function_anno) == "yes":

            ## natural antisense transcripts file
            #set nat_path = ''
            #if str($params.nat_file.source) == "history":
                #set nat_path = $params.nat_file.nat

            #else:
                #set nat_path = $params.nat_file.index.fields.path
            #end if

            ## repeat information file
            #set repeat_path = ''
            #if str($params.repeat_file.source) == "history":
                #set repeat_path = $params.repeat_file.repeat

            #else:
                #set repeat_path = $params.repeat_file.index.fields.path
            #end if


            -nat $nat_path -repeat $repeat_path
        #end if

        ## Do or not DEG
        #if str($degseq.degseq_analysis) == "yes":
            -deg $degseq.deg
        #end if

        -i $reads -config $config -n $mapnt -format $format -f $gff -mis $mis -d $d -p $p -l $l -cen $cen -span $span > run.log

    </command>

    <inputs>

        <param name="config" type="data" label="Raw data configs file" />
        <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />

        <param name="format" type="select" label=" Data format" multiple="false">
            <option value="fastq">Input data is fastq. format</option>
            <option value="fasta">Input data is fasta. format</option>
        </param>


        <conditional name="reference_genome">
            <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
                <option value="indexed">Use a built-in index</option>
                <option value="history">Use one from the history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
                    <options from_data_table="bowtie_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
            </when>
        </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->


        <param name="gff" type="data" label="gff file" />
        <param name="mis" type="integer" value="0" label="Number of allowed mismatches when mapping reads to genome" />
        <param name="mapnt" type="integer" value="25" label="A read is allowed to map up to this number of positions in the genome" />
        <param name="d" type="integer" value="100" label="Distance of tag to merged a cluster" />

        <param name="p" type="select" label="siRNA cluster method" multiple="false">
            <option value="F">conventional</option>
            <option value="T">NIBLES</option>
        </param>
        <param name="l" type="integer" value="1000" label="The length of the upstream and downstream,used in position annotate" />


        <conditional name="params">
            <param name="function_anno" type="select" label="Do or not annotate siRNAs by function">
                <option value="no" selected="true">no</option>
                <option value="yes">yes</option>
            </param>
            <when value="yes">


                <conditional name="nat_file">
                    <param name="source" type="select" label="Will you select a natural antisense transcripts file from your history ?" help="down load from ***">
                        <option value="indexed">Use a built-in file</option>
                        <option value="history">Use one from the history</option>
                    </param>
                    <when value="indexed">
                        <param name="index" type="select" label="Select a natural antisense transcripts file" help="If your species of interest is not listed, contact the Galaxy team">
                            <options from_data_table="nat_annotate">
                                <filter type="sort_by" column="2"/>
                                <validator type="no_options" message="No files are available for the selected input dataset"/>
                            </options>
                        </param>
                    </when>
                    <when value="history">
                        <param name="nat" type="data" format="txt" label="natural antisense transcripts file" />
                    </when>
                </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->

                <conditional name="repeat_file">
                    <param name="source" type="select" label="Will you select a repeat information file from your history ?" help="down load from ***">
                        <option value="indexed">Use a built-in file</option>
                        <option value="history">Use one from the history</option>
                    </param>
                    <when value="indexed">
                        <param name="index" type="select" label="Select a repeat information file" help="If your species of interest is not listed, contact the Galaxy team">
                            <options from_data_table="repeat_annotate">
                                <filter type="sort_by" column="2"/>
                                <validator type="no_options" message="No files are available for the selected input dataset"/>
                            </options>
                        </param>
                    </when>
                    <when value="history">
                        <param name="repeat" type="data" label="repeat information file out of Repeatmasker" />
                    </when>
                </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->
            </when>
        </conditional> <!-- params -->

        <param name="cen" type="data" label="centromere file input" />
        <param name="span" type="integer" value="50000" label="plot span" />

        <conditional name="degseq">
            <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">
                <option value="no" selected="true">no</option>
                <option value="yes">yes</option>
            </param>
            <when value="yes">
                <param name="deg" type="data" label="file config of de sample" />
            </when>
        </conditional>


    </inputs>

    <outputs>
        <!-- NOTE(review): the output names contain spaces; kept unchanged so existing references keep working, confirm Galaxy accepts them. -->
        <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>
        <!-- NOTE(review): confirm that siRNA.pl actually writes cluster_runs/result.html. -->
        <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>

    </outputs>

    <help>

    </help>
</tool>
#!/usr/bin/perl -w
# File: siRNA_pipeline.pl
#
# Driver script for the complete siRNA pipeline: adapter clipping of the raw
# reads (fastx_clipper), collapsing and filtering, Rfam matching, genome
# mapping, clustering, quantification, phased-siRNA prediction, annotation,
# plotting, optional differential expression and the html report. The helper
# scripts are taken from -path; results go below "<workdir>/cluster_runs/".
my $version=1.00;
use strict;
use warnings;
use Getopt::Long;
use Getopt::Std;
use threads;
#use threads::shared;
use Parallel::ForkManager;
#use lib '/leofs/biotrans/chentt/perl_module/';
#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq
print "
#####################################
#                                   #
#           sRNA cluster            #
#                                   #
#####################################
";
###########################################################################################
my $usage="$0
Options:
-i       input file# raw data file
-tag     string #raw data sample name
-g       genome file
-f       gff file

-o       workdir file
-path    script path
-t       int, number of threads [1]
-format  fastq, fq, fasta or fa
-idx     string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
         string must be the prefix of the bowtie index. For instance, if
         the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
         the prefix is 'h_sapiens_37_asm'.##can be null
-mis     int number of allowed mismatches when mapping reads to genome, default 0
-rfam    string, input file# rfam database file.
-idx2    string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
         string must be the prefix of the bowtie index. For instance, if
         the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
         the prefix is 'h_sapiens_37_asm'.##can be null

-v       int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-a       string, ADAPTER string. default is ATCTCGTATG.
-n       int max hits number,default 25; used in genome alignment
-d       int distance of tag to merged a cluster; default 100
-p       cluster method F :conventional default is F
                        T :NIBLES
-l       int the length of the upstream and downstream,default 1000;used in position annotate

-nat     natural antisense transcripts file
-repeat  repeat information file out of Repeatmasker
-deg     file config of de sample
-cen     centromere file input
-span    plot span, default 50000
";

my %options;
GetOptions(\%options,"i:s@","tag:s@","g=s","phred:i","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");
#print help if that option is used
if($options{h}){die $usage;}

# Every step below interpolates these values into a command line, so refuse to
# start without them instead of running commands with empty arguments.
for my $required (qw(g f path rfam)) {
    if (!defined $options{$required} || $options{$required} eq '') {
        die "Missing required option -$required\n\n$usage";
    }
}

# -i and -tag are repeatable; fall back to an empty list so that a missing
# option is reported below instead of dying on an undefined array reference.
my @filein=@{ $options{'i'}   || [] };
my @mark  =@{ $options{'tag'} || [] };
if (!@filein) {
    die "At least one -i raw data file is required\n\n$usage";
}
if (@filein != @mark) {
    die "Maybe config file have some wrong!!!\n";
}

#my $config=$options{'i'};
my $genome_fa=$options{'g'};
my $gff=$options{'f'};


##########################################################################################
my $predir=`pwd`;
chomp $predir;
my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;

my $path=$options{'path'};

my $t=defined($options{'t'})? $options{'t'}:1; #threads number

my $mis=defined $options{'mis'} ? $options{'mis'}:0;

my $mis_rfam=defined $options{'v'} ? $options{'v'}:0;

my $hit=defined $options{'n'}?$options{'n'}:25;

my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;

my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;

my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";

my $format=defined $options{'format'} ? $options{'format'} : '';
#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
#    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
#}

my $adpter="ATCTCGTATG"; #adapter
if (defined $options{'a'}) {$adpter=$options{'a'};}


my $phred_qv=64;
if(defined $options{'phred'}){$phred_qv=$options{'phred'};}

# ---- state shared between the steps ------------------------------------------
# All of it is declared BEFORE the first step runs: a file-level "my $x = ..."
# that executes after a sub has already filled $x would reset the value.
my $sample_number;
my ($dir,$dir_tmp);
my @clip;               # clipped read files, one per sample
my $mark;               # sample names joined with tabs (log header)
my $sample_mark;        # sample names joined with '#', passed as -mark
my @map_read;           # per-sample mapped read sums; last element is the total
my $map_tag=0;          # number of mapped tags (records in genome_mapped.fa)
my $preprocess;
my $cluster_file;
my $annotate_dir;
my $deg_dir;
my $plot_dir;
my %id;                 # sample name -> column number handed to DEGseq_2.pl
my @pairdir;            # "A_VS_B/" directory names, one per DEG comparison

################################ MAIN ##################################################
print "\ncluster program start:";
my $time=Time();
make_dir_tmp();

# Record the file/sample pairs given on the command line.
my $config=$dir."/input_config";
open(my $config_fh,'>',$config) or die "Cannot write $config: $!\n";
for my $i (0..$#filein) {
    print $config_fh $filein[$i],"\t",$mark[$i],"\n";
}
close $config_fh or die "Cannot close $config: $!\n";

$sample_number=@mark;
$mark=join "\t",@mark;
$sample_mark=join "\#",@mark;

my $filter_out=$dir."preProcess/"."collapse_reads_out.fa";## raw clean data
my $data2=$filter_out; ### mirbase not mapped reads
my $data3=$dir."/rfam_match/rfam_not_mapped.fa"; ### rfam not mapped reads
my $bed=$dir."cluster/"."sample.bed";
my $read=$dir."cluster/"."sample_reads.cluster";
my $read_txt=$dir."cluster/"."cluster.txt";
my $rpkm=$dir."cluster/"."sample_rpkm.cluster";
for my $i (0..$#mark) {
    $id{$mark[$i]}=$i+4;
}

#read_config();

trim_adapter_and_filter();

print "\n######## tiandm test start ###########\n";
print "\@mark: @mark\n\%id keys number:";
print scalar keys %id;
print "\n";
foreach my $kyess (keys %id){
    print $kyess," --> $id{$kyess}\n";
}
print "\n######## tiandm test end ############\n";
group_and_filter(); #collapse reads to tags

rfam();

genome();

bwt2bed();

cluster();

quantify();

phase();

if (defined($options{'nat'})&&defined($options{'repeat'})) {
    class();
}
else{
    get_genelist();
}

annotate();

genome_length();

plot();

if (defined($options{'deg'})) {
    dec();
    infor_merge();
}
else{infor_merge_no_dec()}
html();
print "\ncluster program end:";
Time();
############################sub program###################################################

# Run a shell command with backticks (the child's STDOUT is captured and
# discarded, as the original code did), echo the command to the log and warn
# when it exits with a non-zero status. Returns the captured output.
sub run_cmd{
    my ($cmd)=@_;
    my $output=`$cmd`;
    print "$cmd\n\n";
    if ($? != 0) {
        warn "WARNING: command exited with status ".($? >> 8).": $cmd\n";
    }
    return $output;
}

# Create "<workdir>/cluster_runs/" and its "ref/" sub directory (each one only
# if missing) and store the run directory, with trailing slash, in $dir.
sub make_dir_tmp{
    my $run_dir="$workdir/cluster_runs";
    for my $wanted ($run_dir,"$run_dir/ref/") {
        next if -d $wanted;
        mkdir($wanted) or die "Cannot create directory $wanted: $!\n";
    }

    $dir="$run_dir/";
    #print STDERR "mkdir $dir\n\n";
    return;
}

#sub read_config{
#    open IN,"<$config";
#    while (my $aline=<IN>) {
#        chomp $aline;
#        my @tmp=split/\t/,$aline;
#        push @filein,$tmp[0];
#        push @mark,$tmp[1];
#    }
#    close IN;
#    if (@filein != @mark) {
#        die "Maybe config file have some wrong!!!\n";
#    }
#    $sample_number=@mark;
#    $mark=join "\t",@mark;
#    $sample_mark=join "\#",@mark;
#}


# Clip the adapter from every raw data file, at most $t files at a time (one
# thread per file). The output file names are collected in @clip.
sub trim_adapter_and_filter{
    $preprocess=$dir."preProcess/";
    mkdir $preprocess unless -d $preprocess;
    my $can_use_threads = eval 'use threads; 1';
    if ($can_use_threads) {
        # Do processing using threads
        my @filein1=@filein; my @mark1=@mark;
        # A batch size below 1 would never consume the queue (endless loop).
        my $batch=$t >= 1 ? $t : 1;
        while (@filein1>0) {
            my @thrs;
            for (my $i=0;$i<$batch ;$i++) {
                last if(@filein1==0);
                my $in=shift @filein1;
                my $out=shift @mark1;
                push @clip,$dir."preProcess/$out\_clip.fq";
                push @thrs,threads->create(\&clips,$in,$out);
            }
            # Wait for the whole batch before starting the next one.
            $_->join() for @thrs;
        }
    }
    else {
        # Do not processing using threads
        for my $i (0..$#filein) {
            my $in=$filein[$i];
            my $out=$mark[$i];
            push @clip,$dir."preProcess/$out\_clip.fq";
            clips($in,$out);
        }
    }
    return;
}

# Clip the adapter from one raw file with fastx_clipper and write
# preProcess/<sample>_clip.fq. The -Q quality offset is only passed for fastq
# input. Returns the sample name.
sub clips{
    my ($filein,$fileout)=@_;
    my $adapter=$dir."preProcess/$fileout\_clip.fq";
    if($format eq "fq" || $format eq "fastq"){
        my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`;
    }
    if($format eq "fa" || $format eq "fasta"){
        my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`;
    }
    #my $clean=$dir."preProcess\/$fileout\_clean.fq";
    #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `;
    return $fileout;
}

# Collapse the clipped reads of all samples into tags, keep tags of length
# 18-40, apply the count filter and plot the length distribution.
sub group_and_filter{
    my $group_out_file=$dir."preProcess/"."collapse_reads.fa";
    my $str="";
    for my $clipped (@clip) {
        $str .="-i $clipped ";
    }
    run_cmd("perl $path/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format");

    my $l_out=$dir."preProcess/"."collapse_reads_18-40.fa";
    my $tmpmark=join ",", @mark;

    run_cmd("perl $path/filterReadsByLength.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $tmpmark");
    run_cmd("perl $path/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark");
    run_cmd("perl $path/Length_Distibution.pl -i $dir/preProcess/reads_length_distribution_after_count_filter.txt -o $dir/preProcess/length.html");
    return 0;
}

# Match the clean tags against the Rfam database and summarise the hits.
sub rfam{
    my $cmd="perl $path/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir";
    $cmd.=" -index $options{idx2}" if defined $options{'idx2'};
    if (system($cmd) != 0) {
        warn "WARNING: command exited with status ".($? >> 8).": $cmd\n";
    }
    # Sample names are joined with an escaped ';' so the shell passes them on
    # as one argument.
    my $tag=join "\\;" ,@mark;
    run_cmd("perl $path/count_rfam_express.pl -i $dir/rfam_match/rfam_mapped.bwt -tag $tag -o $dir/rfam_match/rfam_non-miRNA_annotation.txt");
    return 0;
}

# Map the reads without Rfam hit to the genome with matching.pl, then sum the
# per-sample counts encoded in the header lines of
# genome_match/genome_mapped.fa into @map_read and count the records in
# $map_tag.
sub genome{
    my $cmd="perl $path/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir";
    $cmd.=" -index $options{idx}" if defined $options{'idx'};
    if (system($cmd) != 0) {
        warn "WARNING: command exited with status ".($? >> 8).": $cmd\n";
    }
    #=================== mapping sta ===================================================
    my $map_file=$dir."genome_match/genome_mapped.fa";
    open(my $map_fh,'<',$map_file) or die "Cannot open $map_file: $!\n";
    print "\n#each sample mapping reads sta:\n\n";
    print "#$mark\ttotal\n";
    while (my $ID=<$map_fh>) {
        chomp $ID;
        my $seq=<$map_fh>;              # records are header/sequence line pairs
        $map_tag++;
        # The text after the first ':' holds '_'-separated counts; the last
        # one carries an 'x' prefix.
        my @tmp=split/\:/,$ID;
        next unless defined $tmp[1];
        my @exp=split/\_/,$tmp[1];
        next unless @exp;
        $exp[-1] =~ s/^x//;
        for my $i (0..$#exp) {
            $map_read[$i]+=$exp[$i];
        }
    }
    close $map_fh;
    my $map_read=join"\t",@map_read;
    print "$map_read\n\n";
    print "#total mapped tags:$map_tag\n\n";
    return 0;
}

# Convert the bowtie alignment file into the bed file used for clustering.
sub bwt2bed{
    $cluster_file=$dir."cluster/";
    mkdir ($cluster_file) unless -d $cluster_file;
    print "sam file changed to bed file\n";
    my $file=$dir."genome_match/genome_mapped.bwt";

    run_cmd("perl $path/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed");
    return 0;
}

# Merge tags into clusters with the method selected by -p (F or T).
sub cluster{
    print "tags is ready to merged clusters\n\n";
    my $file=$bed;
    if ($cluster_mothod eq "F") {
        print "Using converntional method\n";
        run_cmd("perl $path/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt");
    }
    elsif($cluster_mothod eq "T"){
        print "Using nibls method\n";
        run_cmd("perl $path/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark");
    }
    else{print "\-p is wrong!\n\n";}
    return 0;
}


# Quantify the clusters; the sequencing depth passed with -d is the per-sample
# mapped read sum collected by genome().
sub quantify{
    print "clusters is ready to quantified\n\n";
    my @depth=@map_read;
    pop @depth;                         # drop the trailing "total" column
    my $depth=join ",",@depth;
    run_cmd("perl $path/quantify_siRNA.pl -i $read -d $depth -o $rpkm");
    return 0;
}

# Predict phased siRNAs from the cluster text file.
sub phase{
    $annotate_dir=$dir."annotate/";
    mkdir ($annotate_dir) unless -d $annotate_dir;
    print "clusters is to predict phase siRNA\n";
    run_cmd("perl $path/phased_siRNA.pl -i $read_txt -o $annotate_dir/phase.out");
    return 0;
}

# Annotate clusters by source (needs -nat and -repeat); ClassAnnotate.pl is also
# asked to write ref/genelist.txt via its -l option.
sub class{
    print "clusters is ready to annotate by sources\n\n";
    my $nat=$options{'nat'};
    my $repeat=$options{'repeat'};
    run_cmd("perl $path/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir/phase.out -o $annotate_dir/sample_class.anno -t $annotate_dir/nat.out -l $dir/ref/genelist.txt");
    return;
}

# Position annotation against ref/genelist.txt. The input is the class
# annotation when class() ran, otherwise the plain rpkm table.
sub annotate{
    print "clusters is ready to annotate by gff file\n\n";
    my $file;
    if (defined($options{'nat'})&&defined($options{'repeat'})) {
        $file="$annotate_dir/sample_class.anno";
    }
    else{
        $file=$rpkm;
    }
    run_cmd("perl $path/Annotate.pl -i $file -g $dir/ref/genelist.txt -d $up_down_dis -o $annotate_dir/sample_c_p.anno");
    return 0;
}

# Build ref/genelist.txt from the gff file (used when class() is skipped).
sub get_genelist{
    run_cmd("perl $path/get_genelist.pl -i $gff -o $dir/ref/genelist.txt");
    return;
}

# Read the DEG config (two whitespace-separated sample names per line, '#'
# lines are comments) and run one DEGseq comparison per line in up to 10
# parallel child processes.
sub dec{
    print "deg reading\n\n";
    my $deg_file=$options{'deg'};
    open(my $deg_fh,'<',$deg_file) or die "Cannot open deg config file '$deg_file': $!\n";
    my @deg;
    while (my $aline=<$deg_fh>) {
        chomp $aline;
        $aline=~s/\r$//;
        next if($aline=~/^\#/);
        next if($aline!~/\S/);          # blank lines used to become "_VS_/"
        my @ea=split ' ',$aline;
        if (@ea < 2) {
            warn "WARNING: skipping deg config line without two sample names: $aline\n";
            next;
        }
        push @deg,$aline;
        push @pairdir,"$ea[0]_VS_$ea[1]/";
    }
    close $deg_fh;
    $deg_dir=$dir."deg/";
    mkdir ($deg_dir) unless -d $deg_dir;
    my $max_process = 10;
    my $pm = Parallel::ForkManager->new( $max_process );
    for my $pair (@deg) {
        $pm->start and next;            # parent: go on with the next comparison
        dec_pel($pair);
        $pm->finish;
    }
    $pm->wait_all_children;
    return;
}

# Run DEGseq_2.pl for one "sampleA sampleB" line of the DEG config. Runs inside
# a forked child of dec().
sub dec_pel{
    print "\n******************\nstart:\n";
    Time();
    my $sample=shift(@_);
    my @each=split ' ',$sample;
    print "$each[0]\t$each[1]\n";
    for my $name (@each[0,1]) {
        if (!defined $id{$name}) {
            warn "WARNING: sample '$name' from the deg config was not given with -tag; comparison skipped\n";
            return;
        }
    }
    my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]/";
    mkdir ($deg_sample_dir) unless -d $deg_sample_dir;
    print "read: $read\n";
    print "deg_sample_dir: $deg_sample_dir\n";
    print "$id{$each[0]}\t$each[0]\n";
    print "$id{$each[1]}\t$each[1]\n";
    run_cmd("perl $path/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]"); #-depth1 -depth2
    print "end:\n*************************\n";
    Time();
    sleep 1;
    return;
}

# Merge the annotation table with the score file of every DEG comparison into
# total.result.
sub infor_merge{
    my ($input,$marks)=('','');
    foreach my $pair (@pairdir) {
        $marks.=" -mark $pair ";
        $input.=" -i $dir/deg/$pair/output_score.txt ";
    }
    print "@pairdir\n$input\n$marks\n";
    run_cmd("perl $path/SampleDEGseqMerge.pl $input $marks -f $annotate_dir/sample_c_p.anno -n $sample_number -o $dir/total.result");
    return;
}

# Without DEG analysis the annotation table itself is the final result.
sub infor_merge_no_dec{
    my ($from,$to)=("$annotate_dir/sample_c_p.anno","$dir/total.result");
    if (system('cp',$from,$to) != 0) {
        warn "WARNING: could not copy $from to $to\n";
    }
    return;
}

# Write the reference sequence lengths to ref/genome.length.
sub genome_length{
    run_cmd("perl $path/count_ref_length.pl -i $genome_fa -o $dir/ref/genome.length");
    return;
}

# Draw the cluster distribution plot; -span and -cen are forwarded when given.
sub plot{
    $plot_dir="$dir/plot/";
    mkdir ($plot_dir) unless -d $plot_dir;
    my $span=defined($options{span})?$options{span}:50000;
    my $cen="";
    if (defined $options{cen}) {
        $cen="-cen $options{cen}";
    }
    run_cmd("perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span $span -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt");
    return;
}

# Write path.txt (one result location per line) and build result.html from it
# with html_siRNA.pl.
sub html{
    my $pathfile="$dir/path.txt";
    open(my $pa_fh,'>',$pathfile) or die "Cannot write $pathfile: $!\n";
    print $pa_fh "$config\n";
    print $pa_fh (defined $preprocess ? $preprocess : ''),"\n";
    print $pa_fh "$dir"."rfam_match\n";
    print $pa_fh "$dir"."genome_match\n";
    print $pa_fh "$cluster_file\n";
    print $pa_fh "$annotate_dir\n";
    print $pa_fh "$plot_dir\n";
    if (defined($deg_dir)) {
        print $pa_fh "$deg_dir\n";
    }
    close $pa_fh or die "Cannot close $pathfile: $!\n";
    run_cmd("perl $path/html_siRNA.pl -i $pathfile -format $format -o $dir/result.html");
    return;
}

# Print the local time as "YYYY-MM-DD hh:mm:ss" and return it as
# "YYYY-MM-DD-hh-mm-ss".
sub Time{
    my ($sec,$min,$hour,$day,$month,$year) = (localtime(time()))[0..5];
    my @stamp=($year+1900,$month+1,$day,$hour,$min,$sec);
    printf "%d-%02d-%02d %02d:%02d:%02d\n",@stamp;
    return sprintf("%d-%02d-%02d-%02d-%02d-%02d",@stamp);
}
#################################################################################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA_pipeline.xml Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,221 @@
<!--
  Galaxy wrapper for siRNA_pipeline.pl.
  Inputs : one or more raw read files with sample tags, a genome and an Rfam
           reference (built-in Bowtie index or history FASTA), a GFF file, a
           centromere file, and optional NAT / repeat annotation and DEG
           configuration.
  Outputs: cluster_runs/total.result and cluster_runs/result.html, collected
           from the job working directory.
-->
<tool id="plant_sirna_pipeline_v1" name="siRNA_pipeline" version="1.0.0">
  <description>Program for plant siRNA analysis (rawdata preprocess -> genome alignment -> non-coding annotate -> siRNA analysis)</description>

  <requirements>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <requirement type="package" version="0.12.7">bowtie</requirement>
    <requirement type="package" version="3.0.1">R</requirement>
    <requirement type="package" version="0.0.13">fastx_toolkit</requirement>
    <requirement type="package" version="1.96">threads</requirement>
    <requirement type="package" version="1.06">Parallel-ForkManager</requirement>
    <requirement type="package" version="2.59">SVG</requirement>
    <requirement type="package" version="1.4_001">Boost-Graph</requirement>
  </requirements>

  <command interpreter="perl">siRNA_pipeline.pl
    ## Change this to accommodate the number of threads you have available.
    -t \${GALAXY_SLOTS:-4}

    -path \$SCRIPT_PATH

    ## one -i/-tag pair per raw data file
    #for $j, $s in enumerate( $series )
    ##rank_of_series=$j
    -i ${s.input}
    -tag ${s.tag}
    #end for

    ## prepare bowtie index
    #set index_path = ''
    #if str($reference_genome.source) == "history":
    #set index_path = $reference_genome.own_file
    -g $index_path
    #else:
    #set index_path = $reference_genome.index.fields.path
    -g ${index_path}.fa -idx $index_path
    #end if


    ## prepare Rfam bowtie index
    #set rfam_index_path = ''
    #if str($reference_rfam.source) == "history":
    #set rfam_index_path = $reference_rfam.own_file
    -rfam $rfam_index_path -v $v
    #else:
    #set rfam_index_path = $reference_rfam.index.fields.path
    -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v
    #end if



    ## Do or not annotate siRNAs by function
    #if $params.function_anno == "yes":

    ## resolve the natural antisense transcripts (NAT) file
    #set nat_path = ''
    #if str($params.nat_file.source) == "history":
    #set nat_path = $params.nat_file.nat

    #else:
    #set nat_path = $params.nat_file.index.fields.path
    #end if

    ## resolve the repeat information file
    #set repeat_path = ''
    #if str($params.repeat_file.source) == "history":
    #set repeat_path = $params.repeat_file.repeat

    #else:
    #set repeat_path = $params.repeat_file.index.fields.path
    #end if


    -nat $nat_path -repeat $repeat_path
    #end if

    ## Do or not DEG
    #if $degseq.degseq_analysis == "yes" :
    -deg $degseq.deg
    #end if

    -format $format -phred $phred -f $gff -mis $mis -a $a -n $mapnt -d $d -p $p -l $l -cen $cen -span $span > run.log

  </command>

  <inputs>

    <repeat name="series" title="Raw sequence data file">
      <param name="input" type="data" label="Raw data file"/>
      <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>
    </repeat>

    <param name="format" type="select" label="raw data format" multiple="false">
      <option value="fastq">Raw data is fastq. format</option>
      <option value="fasta">Raw data is fasta. format</option>
    </param>

    <param name="phred" type="select" label="input quals are Phred+64 or Phred+33" multiple="false">
      <option value="64">Phred+64</option>
      <option value="33" selected="true">Phred+33</option>
    </param>

    <conditional name="reference_genome">
      <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
          <options from_data_table="bowtie_indexes">
            <filter type="sort_by" column="2"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
      </when>
      <when value="history">
        <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
      </when>
    </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->

    <conditional name="reference_rfam">
      <param name="source" type="select" label="Will you select a rfam reference from your history or use a built-in index?" help="Built-ins were indexed using default options">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="index" type="select" label="Select a non-coding reference " help="If your rfam of interest is not listed, contact the Galaxy team">
          <options from_data_table="rfam_bowtie_indexes">
            <filter type="sort_by" column="2"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
      </when>
      <when value="history">
        <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
      </when>
    </conditional>

    <param name="gff" type="data" label="gff file" />
    <param name="mis" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
    <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for non-coding alignment"/>
    <param name="a" type="text" value="TGGAATTCTCGGGTGCCAAGG" label="3' adapter sequence" />
    <param name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
    <param name="d" type="integer" value="100" label="distance of tag to merged a cluster" />

    <param name="p" type="select" label="siRNA cluster identify method" multiple="false">
      <option value="F">conventional</option>
      <option value="T">NIBLES</option>
    </param>
    <param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" />


    <conditional name="params">
      <param name="function_anno" type="select" label="Do or not annotate siRNAs by function">
        <option value="no" selected="true">no</option>
        <option value="yes">yes</option>
      </param>
      <!-- explicit empty case so every option of the select has a matching <when> -->
      <when value="no" />
      <when value="yes">


        <conditional name="nat_file">
          <param name="source" type="select" label="Will you select a natural antisense transcripts file from your history ?" help="down load from ***">
            <option value="indexed">Use a built-in file</option>
            <option value="history">Use one from the history</option>
          </param>
          <when value="indexed">
            <param name="index" type="select" label="Select a natural antisense transcripts file" help="If your species of interest is not listed, contact the Galaxy team">
              <options from_data_table="nat_annotate">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No files are available for the selected input dataset"/>
              </options>
            </param>
          </when>
          <when value="history">
            <param name="nat" type="data" format="txt" metadata_name="dbkey" label="natural antisense transcripts file" />
          </when>
        </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->

        <conditional name="repeat_file">
          <param name="source" type="select" label="Will you select a repeat information file from your history ?" help="down load from ***">
            <option value="indexed">Use a built-in file</option>
            <option value="history">Use one from the history</option>
          </param>
          <when value="indexed">
            <param name="index" type="select" label="Select a repeat information file" help="If your species of interest is not listed, contact the Galaxy team">
              <options from_data_table="repeat_annotate">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No files are available for the selected input dataset"/>
              </options>
            </param>
          </when>
          <when value="history">
            <param name="repeat" type="data" metadata_name="dbkey" label="repeat information file out of Repeatmasker" />
          </when>
        </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->
      </when>
    </conditional> <!-- params -->

    <param name="cen" type="data" label="centromere file input" />
    <param name="span" type="integer" value="50000" label="plot span" />

    <conditional name="degseq">
      <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">
        <option value="no" selected="true">no</option>
        <option value="yes">yes</option>
      </param>
      <!-- explicit empty case so every option of the select has a matching <when> -->
      <when value="no" />
      <when value="yes">
        <param name="deg" type="data" label="file config of de sample" />
      </when>
    </conditional>

  </inputs>

  <outputs>
    <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>
    <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>

  </outputs>

  <help>

  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/bowtie_indices.loc.sample Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/nat_annotate.loc.sample Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use built-in natural antisense transcript (NAT) annotation files. You will +#need to create these data files and then create a nat_annotate.loc +#file similar to this one (store it in this directory) that points to +#the locations in which those files are stored. The nat_annotate.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example (taken from the Bowtie index template), if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your nat_annotate.loc file should include an entry per line for each +#annotation file you have stored. In the Bowtie examples, the "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/repeat_annotate.loc.sample Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use built-in repeat information (RepeatMasker output) files. You will +#need to create these data files and then create a repeat_annotate.loc +#file similar to this one (store it in this directory) that points to +#the locations in which those files are stored. The repeat_annotate.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example (taken from the Bowtie index template), if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your repeat_annotate.loc file should include an entry per line for each +#annotation file you have stored. In the Bowtie examples, the "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rfam_bowtie_indices.loc.sample Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,24 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of reference genome indexes in the Bowtie mapper format --> + <table name="bowtie_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bowtie_indices.loc" /> + </table> + <!-- Locations of Rfam (non-coding reference) indexes in the Bowtie mapper format --> + <table name="rfam_bowtie_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/rfam_bowtie_indices.loc" /> + </table> + <!-- Locations of repeat annotation files --> + <table name="repeat_annotate" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/repeat_annotate.loc" /> + </table> + <!-- Locations of natural antisense transcript (NAT) annotation files --> + <table name="nat_annotate" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/nat_annotate.loc" /> + </table> + +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,134 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="fastx_toolkit" version="0.0.13"> + <repository changeset_revision="ec66ae4c269b" name="package_fastx_toolkit_0_0_13" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="bowtie" version="0.12.7"> + <repository changeset_revision="9f9f38617a98" name="package_bowtie_0_12_7" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <set_environment version="1.0"> + <environment_variable action="set_to" name="SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> + <!--package name="R" version="3.0.1"> + <repository name="package_r_3_0_1" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package!--> + + <package name="ViennaRNA" version="2.1.8"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.tbi.univie.ac.at/RNA/packages/source/ViennaRNA-2.1.8.tar.gz</action> + <action type="shell_command">./configure --prefix=$INSTALL_DIR </action> + <action type="shell_command">make</action> + <action type="shell_command">make install</action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> + </action> + </actions> + </install> + </package> + <package name="R" version="3.0.1"> + <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> + + <install version="1.0"> + <actions> + <action type="set_environment_for_install"> + <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.0.1" /> + </repository> + </action> + <action type="shell_command">R CMD BATCH 
$REPOSITORY_INSTALL_DIR/install_DEG.R </action> + <action type="shell_command">echo "export PATH=$PATH" > $INSTALL_DIR/env.sh </action> + <action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action> + + </actions> + </install> + </package> + + <package name="threads" version="1.96"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-1.96.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + + <package name="Parallel-ForkManager" version="1.06"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/Parallel-ForkManager-1.06.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + <package name="Boost-Graph" version="1.4_001"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/D/DU/DUFFEE/Boost-Graph-1.4_001.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + ls && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" 
name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + <package name="SVG" version="2.59"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/SVG-2.59.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + <!--package name="parent" version="0.228"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/C/CO/CORION/parent-0.228.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package--> + +</tool_dependency>