Repository 'mirplant2'
hg clone https://toolshed.g2.bx.psu.edu/repos/big-tiandm/mirplant2

Changeset 50:7b5a48b972e9 (2014-12-05)
Previous changeset 49:f008ab2cadc6 (2014-12-03) Next changeset 51:3202911efdae (2014-12-05)
Commit message:
Uploaded
modified:
filterReadsByLength.pl
miRPlant.pl
miRPlant.xml
preProcess.xml
tool_dependencies.xml
added:
Annotate.pl
ClassAnnotate.pl
DEGseq_2.pl
SampleDEGseqMerge.pl
conventional.pl
convert_bowtie_to_blast.pl
count_ref_length.pl
filterReadsByCount.pl
get_genelist.pl
html_siRNA.pl
install_DEG.R
miRDeep_plant.pl
miRNA_Express_and_sequence.pl
microRNA.pl
microRNA.xml
nibls.pl
non_miRNA_reads.pl
phased_siRNA.pl
precursors.pl
quantify.pl
quantify_siRNA.pl
sRNA_plot.pl
sam2Bed_bowtie.pl
siRNA.pl
siRNA.xml
siRNA_pipeline.pl
siRNA_pipeline.xml
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 Annotate.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Annotate.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,178 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/4/10
+#Modified:
+#Description: cluster annotate by priority
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+#my $genelistout=$opts{'t'};
+my $dis=defined $opts{'d'}? $opts{'d'}:1000;
+my %gene;
+
+#open OUT,">$genelistout"; #output file  
+#print OUT "#ID\tchr\tstart\tend\tstrand\ns";
+open IN,"<$opts{g}";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;#ID chr start end strand
+ #push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+ $gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]];
+}
+#while (my $aline=<IN>) {
+# chomp $aline;
+# next if($aline=~/^\#/);
+# my @tmp=split/\t/,$aline;
+# my $ID;
+# if ($tmp[2] eq "gene") {
+# $tmp[0]=~s/Chr\./Chr/;
+# $tmp[0]=~s/Chr/chr/;
+# my @infor=split/;/,$tmp[8];
+# for (my $i=0;$i<@infor ;$i++) {
+# if ($infor[$i]=~/Alias\=(\S+)$/) {
+# $ID=$1;
+# last;
+# }
+# }
+# $gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+# print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+# }
+#}
+close IN;
+#close OUT;
+
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+ chomp $aline;
+ my @tmp=split/\t/,$aline;
+ if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next}
+ my @result;
+ #shift @tmp;
+ my @id=split/:/,$tmp[0];
+ $id[0]=~s/Chr0/Chr/;
+ my @posi=split/-/,$id[1];
+ foreach my $key (keys %{$gene{$id[0]}}) {
+ if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) {
+ push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}";
+ next;
+ }
+ #if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) {
+ if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) {
+ push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+ push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+ next;
+ }
+ #if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) {
+ if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) {
+ push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+ push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+ next;
+ }
+ }
+ my $result;
+ if (!(@result)) {
+ $result="intergenic";
+ }
+ elsif($#result==0){
+ $result=$result[0];
+
+ }
+ else{
+ $result=join "\t",@result;
+ }
+# else{
+# my $te_num=0;
+# my @te_overlap;
+# my @te_up_down;
+# my @non_overlap;
+# my @non_up_down;
+# for (my $k=0;$k<@result ;$k++) {
+# my @rr=split/\;/,$result[$k];
+# if ($rr[3] eq "Y") {
+# $te_num++;
+# if ($rr[0] eq "overlap") {
+# push @te_overlap,$result[$k];
+# }
+# else{
+# push @te_up_down,$result[$k];
+# }
+# }
+# else{
+# if ($rr[0] eq "overlap") {
+# push @non_overlap,$result[$k];
+# }
+# else{
+# push @non_up_down,$result[$k];
+# }
+# }
+# }
+# if ($te_num==0) {#non TE 
+# if (!(@te_overlap)) {#down up
+# if ($#non_up_down==0) {
+# $result=$non_up_down[0];
+# }
+# else{#overlap
+# my $all_2=join "\t",@non_up_down;
+# $result="up&down1-kb\t".$all_2;
+# }
+# }
+# else{
+# $result=join "\t",@non_overlap;
+# if ($#non_overlap>=1) {
+# print "$aline\t$result\n";
+# }
+# }
+# }
+# else{#TE
+# if (!(@te_overlap)) {#down up
+# if ($#te_up_down==0) {
+# $result=$te_up_down[0];
+# }
+# else{#overlap
+# my $all_2=join "\t",@te_up_down;
+# $result="up&down1-kb\t".$all_2;
+# }
+# }
+# else{
+# $result=join "\t",@te_overlap;
+# if ($#te_overlap>=1) {
+# print "$aline\t$result\n";
+# }
+# }
+# }
+# }
+ print OUT "$aline\t$result\n";
+}
+
+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -d 
+options:
+-i input file
+-g genelist file
+-d int the length of the upstream and downstream,default 1000
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 ClassAnnotate.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ClassAnnotate.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,251 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chen Tingting
+#Email: chentt@big.ac.cn
+#Date: 2014/5/13
+#Modified:
+#Description: cluster annotate
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t}  and defined $opts{l}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+#my %gene;
+my %gene1;
+open IN,"<$opts{g}";
+open OUT ,">$opts{l}";
+print OUT "#ID\tchr\tstart\tend\tstrand\n";
+my $n=1;
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ my $ID;
+ if ($tmp[2] eq "gene") {
+ $tmp[0]=~s/Chr\./Chr/;
+ #$tmp[0]=~s/Chr/chr/;
+ my @infor=split/;/,$tmp[8];
+ for (my $i=0;$i<@infor ;$i++) {
+ if ($infor[$i]=~/Alias\=(\S+)$/) {
+ $ID=$1;
+ last;
+ }
+ else {
+ $ID="unknown$n";
+ $n++;
+ }
+ }
+ #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+ push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+ print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+ }
+}
+#while (my $aline=<IN>) {
+# chomp $aline;
+# next if($aline=~/^\#/);
+# my @tmp=split/\t/,$aline;#ID chr start end strand
+# push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+# #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
+#}
+close IN;
+close OUT;
+
+my %nat;
+open TMP,">$opts{t}";
+print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
+
+open IN,"<$opts{n}";
+my $nat_n=1;
+while (my $aline=<IN>) {
+ next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
+ chomp $aline;
+ my @arr=split /\t/,$aline;
+ my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
+ if ($arr[11]=~/Nearby/) {
+ ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
+ push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
+ print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
+ $nat_n++;
+ }else{
+ $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
+ $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
+ $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
+ $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
+ push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
+ push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
+ print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
+ $nat_n++;
+ }
+}
+close IN;
+close TMP;
+
+my %repeat;
+open IN,"<$opts{'r'}";
+my $first=<IN>;
+$first=<IN>;
+$first=<IN>;
+while (my $aline=<IN>) {
+ chomp $aline;
+ $aline=~s/^\s+//;
+ my @tmp=split/\s+/,$aline;
+ $tmp[4]=~s/chr0/Chr/;
+ $tmp[4]=~s/chr/Chr/;
+ push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
+ #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
+}
+close IN;
+
+my %phase;
+open IN,"<$opts{'p'}";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ if ($tmp[5]>=25) {
+ $phase{$tmp[0]}=$tmp[5];
+ }
+}
+close IN;
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my @inf=split/\:/,$tmp[0];
+ my @pos=split/\-/,$inf[1];
+ my $chr=$inf[0];
+ $chr=~s/Chr0/Chr/;
+ my $start=$pos[0];
+ my $end=$pos[1];
+ #=========Repeat============
+ my @repeat;
+ if (defined(@{$repeat{$chr}})) {
+ my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
+ for (my $i=0;$i<@r_array ;$i++) {
+ if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
+ push @repeat,$r_array[$i][2];
+ }
+ elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
+ push @repeat,$r_array[$i][2];
+
+ }
+ elsif($end<$r_array[$i][0]){
+ last;
+ }
+ else{next;}
+ }
+ }
+ my $repeat;
+ if (@repeat==0) {
+ $repeat="\/";
+ }
+ else{
+ $repeat=join ";",@repeat;
+ }
+ #=========nat===============
+ my @nat1;#class 1
+ my @nat2;#class 2
+ my @nat;#nat ID
+ #foreach my $chr (keys %nat) {
+ my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
+ for (my $i=0;$i<@n_array ;$i++) {
+ if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
+ push @nat1,$n_array[$i][2];
+ push @nat2,$n_array[$i][3];
+ push @nat,$n_array[$i][4];
+ }
+ elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
+ push @nat1,$n_array[$i][2];
+ push @nat2,$n_array[$i][3];
+ push @nat,$n_array[$i][4];
+ }
+ elsif($end<$n_array[$i][0]){
+ last;
+ }
+ else{next;}
+ }
+ #}
+
+ my $nat1;
+ my $nat2;
+ my $nat;
+ if (@nat1==0) {
+ $nat1="\/";
+ }
+ else{
+ $nat1=join ";",@nat1;
+ }
+ if (@nat2==0) {
+ $nat2="\/";
+ }
+ else{
+ $nat2=join ";",@nat2;
+ }
+ if (@nat==0) {
+ $nat="\/";
+ }
+ else{
+ $nat=join ";",@nat;
+ }
+ #========phase==============
+ my $phase="\/";
+ if (defined($phase{$tmp[0]})) {
+ $phase="phase";
+ }
+ #=========long===============
+ my $long="\/";
+ if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
+ $long="long";
+ }
+ print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
+}
+
+close IN;
+close OUT;
+
+sub nearby{
+ my @p=@_;
+ my ($s,$e)=(0,0);
+ if ($p[1]<$p[2]) {
+ $s=$p[1];
+ $e=$p[2];
+ }else{
+ $s=$p[3];
+ $e=$p[0];
+ }
+ return ($s,$e);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -n -r -p -t -l
+options:
+-i input file
+ -g gff file
+ -n NATs file
+ -r repeat file
+ -p phase file
+-o output file
+-t nat output file
+-l genelist output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 DEGseq_2.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DEGseq_2.pl Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2009-05-06
+#Modified:
+#Description: ɾ��matched reads 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","outdir=s","column1:i","mark1=s","depth1:i","depth2:i","column2:i","mark2=s","h");
+if (!(defined $opts{i} and defined $opts{outdir} and defined $opts{mark1} and defined $opts{mark2}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $outputdir=$opts{'outdir'};
+unless ($outputdir=~/\/$/) {$outputdir .="/";}
+my $column1=defined $opts{column1} ? $opts{column1} : 3;
+my $column2=defined $opts{column2} ? $opts{column2} : 4;
+my $mark1=$opts{mark1};
+my $mark2=$opts{mark2};
+my $fileout=$outputdir."degseq.R";
+
+open OUT,">$fileout"; #output file  
+#my ($name,$dir);
+#$name=basename($filein);
+print OUT "library(DEGseq)\n";
+print OUT "geneExpFile <- system.file(package=\"DEGseq\")\n";
+print OUT "geneExpFile<-file.path(\"$filein\")\n";
+print OUT "layout(matrix(c(1,2,3,4,5,6), 3, 2, byrow=TRUE))\npar(mar=c(2, 2, 2,2))\n";
+print OUT "outputdir<-file.path(\"$outputdir\")\n";
+print OUT "geneExpMatrix1 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column1))\n";
+print OUT "geneExpMatrix2 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column2))\n";
+if(defined $opts{'depth1'} && defined $opts{'depth2'}){
+print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",depth1=$opts{depth1},depth2=$opts{depth2},outputDir=outputdir,method=\"MARS\")\n";
+}
+else{
+print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",outputDir=outputdir,method=\"MARS\")\n";
+}
+close OUT;
+
+
+system("R CMD BATCH $fileout");
+
+wait;
+
+
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -outdir -column1 -mark1 -column2 -mark2 -depth1 -depth2
+options:
+-i input file
+-outdir output file dir
+-column1 the first column for DEGseq
+-mark1 the name of the column1
+-depth1 depth for the first file,use for normalize
+-column2 the second column for DEGseq
+-mark2 the name of the column2
+-depth2 depth for the second file,use for normalize
+
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 SampleDEGseqMerge.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SampleDEGseqMerge.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,94 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: chentt@big.ac.cn
+#Date: 2014-05-21
+#Modified:
+#Description: merged deg file and total information
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i:s@","mark:s@","f:s","o=s","n=s","h");
+if (!(defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my @filein=@{$opts{'i'}};
+my @mark=@{$opts{'mark'}};
+my $fileout=$opts{'o'};
+my $number=$opts{'n'};
+
+my %hash;
+open IN,"<$filein[0]"; #input file  
+
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\"/);
+ my @temp=split/\t/,$aline;
+ $hash{$temp[0]}=$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1];
+}
+close IN;
+
+for (my $i=1;$i<=$#filein;$i++) {
+ open IN,"<$filein[$i]"; #input file  
+
+ while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\"/);
+ my @temp=split/\t/,$aline;
+ if (!(defined $hash{$temp[0]})) {
+ print "Not find $temp[0]in sample one!\n";
+ next;
+ }
+ $hash{$temp[0]} .="\t".$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1];
+ }
+ close IN;
+}
+
+open OUT,">$fileout"; #output file  
+my $deg_title;
+foreach (@mark) {
+ $deg_title.="log2(Fold_change)\tp_value\tq_value\t".$_."\t";
+}
+$deg_title=~s/\s+$//;
+my %function;
+my $title;
+open F,"<$opts{f}";
+while (my $aline=<F>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ my $title=$aline;
+ my @title=split/\t/,$aline;
+ $title[2+$number].="\t".$deg_title;
+ $title=join"\t",@title;
+ print OUT "$title\n";
+ next;
+ }
+ my @temp=split/\t/,$aline;
+ $temp[2+$number].="\t".$hash{$temp[0]};
+ my $temp=join"\t",@temp;
+ print OUT "$temp\n";
+
+}
+close F;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -mark -f
+options:
+-i input file # -i output_score.txt -i output_score.txt -i output_score.txt
+-mark  sample name  # -mark sam1_VS_sam2 -mark sam1_VS_sam3 -mark sam2_VS_sam3
+-f cluster file
+-n sample number
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 conventional.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conventional.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,156 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/04/09
+#Modified:
+#Description: islands merged of merged samples 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h");
+if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+my $distance=$opts{'d'};
+my $tempout=$opts{'t'};
+my $mark=$opts{'mark'};
+my @sample=split/\#/,$mark;
+$mark=join"\"\t\"",@sample;
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
+open TMP,">$tempout";
+print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
+my %hash;
+while (my $aline=<IN>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ #print OUT "$aline\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my $chr=shift @tmp;
+ #shift @tmp;
+ push @{$hash{$chr}},[@tmp];
+}
+
+close IN;
+
+foreach my $key (keys %hash) {
+ my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}};
+ my @sample;
+ my $start=$tag[0][1];
+ my $end=$tag[0][2];
+ push @sample,[@{$tag[0]}];
+ for (my $i=1;$i<@tag-1;$i++) {
+ if ($tag[$i][1]-$end<=$distance) {
+ if ($tag[$i][2]>$end) {
+ $end=$tag[$i][2];
+ }
+ push @sample,[@{$tag[$i]}];
+ }
+ else{
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ $max_length="\>30";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+ $start=$tag[$i][1];
+ $end=$tag[$i][2];
+
+ @sample=();
+ push @sample,[@{$tag[$i]}];
+ }
+ }
+ if ($tag[$#tag][1]-$end<=$distance) {
+ if ($tag[$#tag][2]>$end) {
+ $end=$tag[$#tag][2];
+ }
+ push @sample,[@{$tag[$#tag]}];
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ $max_length="\>30";
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+ }
+ else{
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ $max_length="\>30";
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+
+ }
+}
+close OUT;
+close TMP;
+sub Max_length{
+ my @exp=@{$_[0]};
+ my %sample_length;
+ my $total_exp;
+ my @each;
+ my @tag;
+ for (my $i=0;$i<=$#exp ;$i++) {
+ my $length=$exp[$i][2]-$exp[$i][1]+1;
+ #if ($length>30) {
+ # $length=40;
+ #}
+ my $exp=0;
+ foreach  (1..$opts{'N'}) {
+ $exp+=$exp[$i][$_+2];
+ $each[$_-1]+=$exp[$i][$_+2];
+ }
+ $sample_length{$length}+=$exp;
+ $total_exp+=$exp;
+ push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp);
+ }
+ my $max=0;
+ my $max_key;
+ foreach my $key (sort keys %sample_length) {
+ my $p=$sample_length{$key}/$total_exp;
+ if ($p>$max) {
+ $max=$p;
+ $max_key=$key;
+ }
+ $sample_length{$key}=sprintf("%.2f",$p);
+ }
+ my $tag_n=@tag;
+ my $tag=join";",@tag;
+ $tag=$tag_n."\t".$tag;
+ return($max_key,$sample_length{$max_key},$tag,@each);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -d -N -t -mark
+options:
+-i input file
+-d distance of two islands
+-mark sample name;
+-o output file
+-N sample number
+-t temp output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 convert_bowtie_to_blast.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bowtie_to_blast.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,126 @@
+#!/usr/bin/perl 
+
+
+use warnings;
+use strict;
+use Getopt::Std;
+
+######################################### USAGE ################################
+
+my $usage=
+"$0 file_bowtie_result file_solexa_seq file_chromosome
+
+This is a converter which changes Bowtie output into Blast format.
+The input includes three files: a Bowtie result file (default Bowtie
+output file), a fasta file consisting of small Reads and a chromosome
+fasta file. It outputs the alignments in blast_parsed format.
+
+file_bowtie_result likes:
+
+AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
+AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0
+
+file_solexa_seq likes:
+
+>AtFlower100010_x2
+AAGGAGATTCTTTCAGTCCAG
+
+file_chromosome contains chromosome seq in fasta format
+
+";
+
+
+####################################### INPUT FILES ############################
+
+my $file_bowtie_result=shift or die $usage;
+my $file_short_seq=shift or die $usage;
+my $file_chromosome_seq=shift or die $usage;
+
+
+##################################### GLOBAL VARIBALES #########################
+
+my %short_seq_length=();
+my %chromosome_length=();
+
+
+######################################### MAIN ################################# 
+
+#get the short sequence id and its length
+sequence_length($file_short_seq,\%short_seq_length);
+
+#get the chromosome sequence id and its length
+sequence_length($file_chromosome_seq,\%chromosome_length);
+
+#convert bowtie result format to blast format;
+change_format($file_bowtie_result);
+
+exit;
+
+
+##################################### SUBROUTINES ##############################
+
+sub sequence_length{
+    my ($file,$hash) = @_;
+    my ($id, $desc, $sequence, $seq_length) = ();
+
+    open (FASTA, "<$file") or die "can not open $$file\n";
+    while (<FASTA>)
+    {
+        chomp;
+        if (/^>(\S+)(.*)/)
+ {
+     $id       = $1;
+     $desc     = $2;
+     $sequence = "";
+     while (<FASTA>){
+                chomp;
+                if (/^>(\S+)(.*)/){
+     $$hash{$id}  = length $sequence;
+     $id         = $1;
+     $desc       = $2;
+     $sequence   = "";
+     next;
+                }
+ $sequence .= $_;
+            }
+        }
+    }
+    $seq_length=length($sequence);
+    $$hash{$id} = $seq_length;
+    close FASTA;
+}
+
+
+
+
+
+sub change_format{
+ #Change Bowtie format into blast format
+ my $file=shift @_;
+ open(FILE,"<$file")||die"can not open the bowtie result file:$!\n";
+  #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n";
+
+ while(<FILE>){
+ chomp;
+ my @tmp=split("\t",$_);
+ #Clean the reads ID
+ my @tmp1=split(" ",$tmp[0]);
+ print  "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t";
+ if($tmp[1] eq "+"){
+ my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]};
+ my $seq_bg=$tmp[3] + 1;
+ print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n";
+ }
+ if($tmp[1] eq "-"){
+ my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3];
+ my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1;
+ print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n";
+ }
+ }
+
+# close BLASTOUT;
+
+}
+
+
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 count_ref_length.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/count_ref_length.pl Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2009-05-06
+#Modified:
+#Description: ɾ��matched reads 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+
+my ($name,$seq);
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^>(\S+)/) {
+ $name=$1;
+ while (my $new=<IN>) {
+ chomp $new;
+ if ($new=~/^>(\S+)/) {
+ print OUT $name,"\t",length($seq),"\n";
+ $seq ="";
+ $name=$1;
+ next;
+ }
+ else{$seq .=$new;}
+ }
+ }
+ print OUT $name,"\t",length($seq),"\n";
+}

+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 filterReadsByCount.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByCount.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /\#/,$mark;
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+ chomp $aline;
+ my $seq=<IN>;
+ chomp $seq;
+ if($aline=~/:([\d|_]+)_x(\d+)$/){
+ if ($2>3) {
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss;$i++) {
+ $hash{length($seq)}[$i]++ if($ss[$i]>0);
+ $hash{length($seq)}[$i] +=0 if($ss[$i]==0);
+ $reads{length($seq)}[$i]+=$ss[$i];
+ }
+ print OUT "$aline\n$seq\n";
+ }
+ }
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution_after_count_filter.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+ print OUT $_,"\t@{$hash{$_}}\n";
+ my $vv=join ", ",@{$hash{$_}};
+ $avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+ print OUT $_,"\t@{$reads{$_}}\n";
+ my $vv=join ", ", @{$reads{$_}};
+ $avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+system ("R CMD BATCH $dir/length_distribution_after_count_filter.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-mark string #sample name eg: samA#samB#samC
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 filterReadsByLength.pl
--- a/filterReadsByLength.pl Wed Dec 03 02:03:27 2014 -0500
+++ b/filterReadsByLength.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -33,7 +33,7 @@
  my @ss=split/_/,$1;
  for (my $i=0;$i<@ss;$i++) {
  $hash{length($seq)}[$i]++ if($ss[$i]>0);
- $hash{length($seq)}[$i] +=0 if($ss[$i]>0);
+ $hash{length($seq)}[$i] +=0 if($ss[$i]==0);
  $reads{length($seq)}[$i]+=$ss[$i];
  }
  }
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 get_genelist.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_genelist.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: chentt
+#Email:
+#Date: 2012-4-6
+#Modified:
+#Description:
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+open IN,"<$opts{i}";
+open OUT ,">$opts{o}";
+print OUT "#ID\tchr\tstart\tend\tstrand\n";
+my $n=1;
+my %gene1;
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ my $ID;
+ if ($tmp[2] eq "gene") {
+ $tmp[0]=~s/Chr\./Chr/;
+ #$tmp[0]=~s/Chr/chr/;
+ my @infor=split/;/,$tmp[8];
+ for (my $i=0;$i<@infor ;$i++) {
+ if ($infor[$i]=~/Alias\=(\S+)$/) {
+ $ID=$1;
+ last;
+ }
+ else {
+ $ID="unknown$n";
+ $n++;
+ }
+ }
+ #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+ push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+ print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+ }
+}
+close IN;
+close OUT;
+
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -h
+options:
+-i input cluster file
+-o output file
+-h help
+USAGE
+exit(1);
+}
\ No newline at end of file
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 html_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/html_siRNA.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,788 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-5-29\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use File::Basename;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","format=s","o=s","h");\n+if (!(defined $opts{o}  and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$plotpath,$degpath);\n+my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$plotdir,$degdir);\n+open IN,"<$opts{i}";\n+$config=<IN>; chomp $config;\n+$prepath=<IN>; chomp $prepath;\n+$rfampath=<IN>;chomp $rfampath;\n+$genomepath=<IN>; chomp $genomepath;\n+$clusterpath=<IN>; chomp $clusterpath;\n+$annotatepath=<IN>; chomp $annotatepath;\n+$plotpath=<IN>; chomp $plotpath;\n+my $deg_tag=1;\n+if(eof){$deg_tag=0;}\n+else{\n+\t$degpath=<IN>; chomp $degpath;\n+}\n+close IN;\n+my @tmp=split/\\//,$prepath;\n+$predir=$tmp[-1];\n+@tmp=split/\\//,$rfampath;\n+$rfamdir=$tmp[-1];\n+@tmp=split/\\//,$genomepath;\n+$genomedir=$tmp[-1];\n+@tmp=split/\\//,$clusterpath;\n+$clusterdir=$tmp[-1];\n+@tmp=split/\\//,$annotatepath;\n+$annotatedir=$tmp[-1];\n+@tmp=split/\\//,$plotpath;\n+$plotdir=$tmp[-1];\n+\n+my $dir=dirname($opts{\'o\'});\n+\n+open OUT ,">$opts{\'o\'}";\n+print OUT "<HTML>\\n  <HEAD>\\n  <TITLE> Analysis Report </TITLE>\\n </HEAD>\n+ <BODY bgcolor=\\"lightgray\\">\\n  <h1 align=\\"center\\">\\n    <font face=\\"\xba\xda\xcc\xe5\\">\\n\t<b>Small RNA Analysis Report</b>\\n  </font>\\n  </h1>\n+  <h2>1. Sequence No. and quality</h2>\n+  <h3>1.1 Sequece No.</h3>\n+";\n+\n+### raw data no\n+open IN,"<$config";\n+my @files;my @marks; my @rawNo;\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\tpush @files,$tmp[0];\n+\t\n+\tmy $no=`less $tmp[0] |wc -l `;\n+\tchomp $no;\n+\tif ($opts{\'format\'} eq "fq" || $opts{\'format\'} eq "fastq") {\n+\t\t$no=$no/4;\n+\t}\n+\telse{\n+\t\t$no=$no/2;\n+\t}\n+\tpush @rawNo,$no;\n+\n+\tpush @marks,$tmp[1];\n+}\n+close IN;\n+\n+### preprocess \n+unless ($prepath=~/\\/$/) {\n+\t$prepath .="/";\n+}\n+\n+my @trimNo;my @collapse;\n+my $collapsefile=$prepath."collapse_reads.fa";\n+open IN,"<$collapsefile";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$trimNo[$i] +=$lng[$i];\n+\t\t\t$collapse[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @cleanR;my @cleanT;\n+my $clean=$prepath."collapse_reads_18-40.fa";\n+open IN,"<$clean";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$cleanR[$i] +=$lng[$i];\n+\t\t\t$cleanT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @filterR;my @filterT;\n+my $filter=$prepath."collapse_reads_out.fa";\n+open IN,"<$filter";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$filterR[$i] +=$lng[$i];\n+\t\t\t$filterT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th>&nbsp;</th>\n+";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print  OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Raw Reads No. </th>\n+";\n+foreach  (@rawNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Reads No. After Trimed 3\\\' adapter </th>\n+";\n+foreach  (@trimNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Unique Tags No. </th>\n+";\n+foreach  (@collapse) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT  "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Reads No. </th>\n+";\n+foreach  (@cleanR) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Tags No. </th>\n+";\n+foreach  (@cleanT) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT  "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Filter Reads No. \\(reads count \\>3\\) </th>\n+";\n+forea'..b'ters number</th>\\n\n+\t";\n+\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Phase</th>\\n\n+\t";\n+\tforeach  (@phase) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Long</th>\\n\n+\t";\n+\tforeach  (@long) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Repeat</th>\\n\n+\t";\n+\tforeach  (@repeat) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat</th>\\n\n+\t";\n+\tforeach  (@nat) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\tprint OUT "<p>\n+\tRepeat subclass annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Repeat subclass</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\n+\tforeach my $key (sort keys %repeat) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($repeat{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $repeat{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\n+\tprint OUT "<p>\n+\tNat subclass1 annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat subclass1</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tforeach my $key (sort keys %nat1) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($nat1{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $nat1{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\tprint OUT "<p>\n+\tNat subclass2 annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat subclass2</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tforeach my $key (sort keys %nat2) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($nat2{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $nat2{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\tprint OUT "<p>\n+\tNote:<br />\n+\tOne cluster mybe annotate to multiple repeats or nats<br />\n+\t";\n+}\n+else {\n+\tprint OUT "<h3>5.2 Cluster source mechanism annotate</h3>\n+\t<br />Do not do source mechanism annotate <br />";\n+\n+}\n+\n+print OUT "<h2>6. Graph of Clusters of all samples</h2> \\n";\n+\n+my $plot=$plotpath."cluster.html";\n+open IN,"<$plot";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tprint OUT "$aline\\n";\n+}\n+close IN;\n+\n+\n+if ($deg_tag) {\n+\tmy $deg_file=`ls $degpath`;\n+\tchomp $deg_file;\n+\tmy @deg_file=split/\\n/,$deg_file;\n+\tmy %deg;\n+\tforeach  (@deg_file) {\n+\t\tmy $output="$degpath/$_/output_score.txt";\n+\t\topen IN,"<$output";\n+\t\t$deg{$_}[0]=0;\n+\t\t$deg{$_}[1]=0;\n+\t\t$deg{$_}[2]=0;\n+\t\twhile (my $aline=<IN>) {\n+\t\t\tnext if ($aline=~/^\\"/);\n+\t\t\tchomp $aline;\n+\t\t\tmy @temp=split/\\t/,$aline;\n+\t\t\tif ($temp[9] eq "TRUE") {\n+\t\t\t\t$deg{$_}[0]++;\n+\t\t\t\tif ($temp[4] >0) {\n+\t\t\t\t\t$deg{$_}[1]++;\n+\t\t\t\t}\n+\t\t\t\tif ($temp[4] <0) {\n+\t\t\t\t\t$deg{$_}[2]++;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\tclose IN;\n+\t}\n+\n+\tprint OUT "<h2>7. DEG</h2>\n+\t<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Genes number</th>\\n\n+\t<th> DEG </th>\\n\n+\t<th> UP </th>\\n\n+\t<th> DOWN </th>\\n\n+\t";\n+\n+\tforeach my $key (sort keys %deg) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@{$deg{$key}} ;$i++) {\n+\t\t\tprint OUT "<td> $deg{$key}[$i] </td>\\n";\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+}\n+else{\n+\tprint OUT "<h2>7. DEG</h2>\n+\t<br />Do not do DE clusters <br />";\n+}\n+\n+print OUT "\n+ </BODY>\n+</HTML>\n+";\n+close OUT;\n+\n+\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -o\n+options:\n+-i \n+-format\n+-o output file\n+-h help\n+USAGE\n+exit(1);\n+}\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 install_DEG.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/install_DEG.R Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,2 @@
+source("http://bioconductor.org/biocLite.R")
+biocLite("DEGseq")
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 miRDeep_plant.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/miRDeep_plant.pl Fri Dec 05 00:11:02 2014 -0500
b
b'@@ -0,0 +1,1622 @@\n+#!/usr/bin/perl\n+\n+use warnings;\n+use strict;\n+use Getopt::Std;\n+#use RNA;\n+\n+\n+################################# MIRDEEP #################################################\n+\n+################################## USAGE ##################################################\n+\n+\n+my $usage=\n+"$0 file_signature file_structure temp_out_directory\n+\n+This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with\n+information on the positions of reads aligned to potential precursor sequences (signature).\n+It also takes as input an RNAfold output file, giving information on the sequence, structure\n+and mimimum free energy of the potential precursor sequences.\n+\n+Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA\n+sequences that should be considered for conservation purposes. -t prints out the potential\n+precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that\n+exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors\n+that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences\n+obtained through conventional cloning. -z consider the number of base pairings in the lower\n+stems (this option is not well tested).\n+\n+-h print this usage\n+-s fasta file with known miRNAs\n+#-o temp directory ,maked befor running the program.\n+-t print filtered\n+-u limited output (only ids)\n+-v cut-off (default 1)\n+-x sensitive option for Sanger sequences\n+-y use Randfold\n+-z consider Drosha processing\n+";\n+\n+\n+\n+\n+\n+############################################################################################\n+\n+################################### INPUT ##################################################\n+\n+\n+#signature file in blast_parsed format\n+my $file_blast_parsed=shift or die $usage;\n+\n+#structure file outputted from RNAfold\n+my $file_struct=shift or die $usage;\n+\n+my $tmpdir=shift or die $usage;\n+#options\n+my %options=();\n+getopts("hs:tuv:xyz",\\%options);\n+\n+\n+\n+\n+\n+\n+#############################################################################################\n+\n+############################# GLOBAL VARIABLES ##############################################\n+\n+\n+#parameters\n+my $nucleus_lng=11;\n+\n+my $score_star=3.9;\n+my $score_star_not=-1.3;\n+my $score_nucleus=7.63;\n+my $score_nucleus_not=-1.17;\n+my $score_randfold=1.37;\n+my $score_randfold_not=-3.624;\n+my $score_intercept=0.3;\n+my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);\n+my $score_min=1;\n+if($options{v}){$score_min=$options{v};}\n+if($options{x}){$score_min=-5;}\n+\n+my $e=2.718281828;\n+\n+#hashes\n+my %hash_desc;\n+my %hash_seq;\n+my %hash_struct;\n+my %hash_mfe;\n+my %hash_nuclei;\n+my %hash_mirs;\n+my %hash_query;\n+my %hash_comp;\n+my %hash_bp;\n+\n+#other variables\n+my $subject_old;\n+my $message_filter;\n+my $message_score;\n+my $lines;\n+my $out_of_bound;\n+\n+\n+\n+##############################################################################################\n+\n+################################  MAIN  ###################################################### \n+\n+\n+#print help if that option is used\n+if($options{h}){die $usage;}\n+unless ($tmpdir=~/\\/$/) {$tmpdir .="/";}\n+if(!(-s $tmpdir)){mkdir $tmpdir;}\n+$tmpdir .="TMP_DIR/";\n+mkdir $tmpdir;\n+\n+#parse structure file outputted from RNAfold\n+parse_file_struct($file_struct);\n+\n+#if conservation is scored, the fasta file of known miRNA sequences is parsed\n+if($options{s}){create_hash_nuclei($options{s})};\n+\n+#parse signature file in blast_parsed format and resolve each potential precursor\n+parse_file_blast_parsed($file_blast_parsed);\n+#`rm -rf $tmpdir`;\n+exit;\n+\n+\n+\n+\n+##############################################################################################\n+\n+############################## SUBROUTINES ###################################################\n+\n+\n+\n+sub parse_file_blast_parsed{\n+\n+#    read through the signature blastparsed file, fills up a hash with infor'..b'th\n+#   the n\'t nucleotide on the plus strand\n+\n+#   This subroutine reverses the begin and end positions of positions of the minus strand so that they\n+#   are relative to the 5\' end of the plus strand\t\n+   \n+    my($beg,$end,$lng)=@_;\n+    \n+    my $new_end=$lng-$beg+1;\n+    my $new_beg=$lng-$end+1;\n+    \n+    return($new_beg,$new_end);\n+}\n+\n+sub round {\n+\n+    #rounds to nearest integer\n+   \n+    my($number) = shift;\n+    return int($number + .5);\n+    \n+}\n+\n+\n+sub rev{\n+\n+    #reverses the order of nucleotides in a sequence\n+\n+    my($sequence)=@_;\n+\n+    my $rev=reverse $sequence;   \n+\n+    return $rev;\n+}\n+\n+sub com{\n+\n+    #the complementary of a sequence\n+\n+    my($sequence)=@_;\n+\n+    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;   \n+ \n+    return $sequence;\n+}\n+\n+sub revcom{\n+    \n+    #reverse complement\n+\n+    my($sequence)=@_;\n+\n+    my $revcom=rev(com($sequence));\n+\n+    return $revcom;\n+}\n+\n+\n+sub max2 {\n+\n+    #max of two numbers\n+    \n+    my($a, $b) = @_;\n+    return ($a>$b ? $a : $b);\n+}\n+\n+sub min2  {\n+\n+    #min of two numbers\n+\n+    my($a, $b) = @_;\n+    return ($a<$b ? $a : $b);\n+}\n+\n+\n+\n+sub score_freq{\n+\n+#   scores the count of reads that map to the potential precursor\n+#   Assumes geometric distribution as described in methods section of manuscript\n+\n+    my $freq=shift;\n+\n+    #parameters of known precursors and background hairpins\n+    my $parameter_test=0.999;\n+    my $parameter_control=0.6;\n+\n+    #log_odds calculated directly to avoid underflow\n+    my $intercept=log((1-$parameter_test)/(1-$parameter_control));\n+    my $slope=log($parameter_test/$parameter_control);\n+    my $log_odds=$slope*$freq+$intercept;\n+\n+    #if no strong evidence for 3\' overhangs, limit the score contribution to 0\n+    unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}\n+   \n+    return $log_odds;\n+}\n+\n+\n+\n+##sub score_mfe{\n+\n+#   scores the minimum free energy in kCal/mol of the potential precursor\n+#   Assumes Gumbel distribution as described in methods section of manuscript\n+\n+##    my $mfe=shift;\n+\n+    #numerical value, minimum 1\n+##    my $mfe_adj=max2(1,-$mfe);\n+\n+    #parameters of known precursors and background hairpins, scale and location\n+##    my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n+##    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n+\n+##    my $odds=$prob_test/$prob_background;\n+##    my $log_odds=log($odds);\n+\n+##    return $log_odds;\n+##}\n+\n+sub score_mfe{\n+#\tuse bignum;\n+\n+#   scores the minimum free energy in kCal/mol of the potential precursor\n+#   Assumes Gumbel distribution as described in methods section of manuscript\n+\n+    my ($mfe,$mlng)=@_;\n+\n+    #numerical value, minimum 1\n+    my $mfe_adj=max2(1,-$mfe);\n+my $mfe_adj1=$mfe/$mlng;\n+    #parameters of known precursors and background hairpins, scale and location\n+\tmy $a=1.339e-12;my $b=2.778e-13;my $c=45.834;\n+\tmy $ev=$e**($mfe_adj1*$c);\n+\t#print STDERR "\\n***",$ev,"**\\t",$ev+$b,"\\t";\n+\tmy $log_odds=($a/($b+$ev));\n+\n+\n+\tmy $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n+    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n+\n+    my $odds=$prob_test/$prob_background;\n+    my $log_odds_2=log($odds);\n+\t#print STDERR "log_odds :",$log_odds,"\\t",$log_odds_2,"\\n";\n+   return $log_odds;\n+}\n+\n+\n+\n+sub prob_gumbel_discretized{\n+\n+#   discretized Gumbel distribution, probabilities within windows of 1 kCal/mol\n+#   uses the subroutine that calculates the cdf to find the probabilities\n+\n+    my ($var,$scale,$location)=@_;\n+\n+    my $bound_lower=$var-0.5;\n+    my $bound_upper=$var+0.5;\n+\n+    my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);\n+    my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);\n+\n+    my $prob=$cdf_upper-$cdf_lower;\n+\n+    return $prob;\n+}\n+\n+\n+sub cdf_gumbel{\n+\n+#   calculates the cumulative distribution function of the Gumbel distribution\n+\n+    my ($var,$scale,$location)=@_;\n+\n+    my $cdf=$e**(-($e**(-($var-$location)/$scale)));\n+\n+    return $cdf;\n+}\n+\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 miRNA_Express_and_sequence.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/miRNA_Express_and_sequence.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,173 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-6-4
+#Modified:
+#Description: solexa miRNA express and sequence
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h");
+if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'list'};
+my $out=$opts{'fa'};
+my $preout=$opts{'pre'};
+
+=cut
+my %hash_pri;
+open PRI,"<$opts{p}";
+while (my $aline=<PRI>) {
+ chomp $aline;
+ if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;}
+}
+close PRI;
+=cut
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+open FA ,">$out";
+open PRE,">$preout";
+
+print OUT "#ID\tcoordinate\tpos1\tpos2";
+my @marks=split/\,/,$opts{'tag'};
+foreach  (@marks) {
+ print OUT "\t",$_,"_matureExp";
+}
+foreach  (@marks) {
+ print OUT "\t",$_,"_starExp";
+}
+foreach  (@marks) {
+ print OUT "\t",$_,"_totalExp";
+}
+
+print OUT "\n";
+
+my (%uniq_id,$novel);
+while (my $aline=<IN>) {
+ chomp $aline;
+ until ($aline =~ /^score\s+[-\d\.]+/){
+ $aline = <IN>;
+ if (eof) {last;}
+ }
+ if (eof) {last;}
+########## miRNA ID ################
+ $novel++;
+########### annotate####################
+ do {$aline=<IN>;} until($aline=~/flank_first_end/) ;
+ chomp $aline;
+ my @flank1=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/flank_second_beg/) ;
+ chomp $aline;
+ my @flank2=split/\t/,$aline;
+#
+########## mature start loop pre ####
+ do {$aline=<IN>;} until($aline=~/mature_beg/) ;
+ chomp $aline;
+ my @start=split/\t/,$aline;
+# $start[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/mature_end/) ;
+ chomp $aline;
+ my @end=split/\t/,$aline;
+# $end[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/mature_seq/) ;
+ chomp $aline;
+ my @arr1=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pre_seq/) ;
+ chomp $aline;
+ my @arr2=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pri_id/) ;
+ chomp $aline;
+ my @pri_id=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pri_seq/) ;
+ chomp $aline;
+ my @pri_seq=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/star_beg/) ;
+ chomp $aline;
+ my @star_start=split/\t/,$aline;
+# $star_start[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/star_end/) ;
+ chomp $aline;
+ my @star_end=split/\t/,$aline;
+# $star_end[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/star_seq/) ;
+ chomp $aline;
+ my @arr3=split/\t/,$aline;
+ print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t";
+ #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t";
+ print FA ">miR-c-$novel\n$arr1[1]\n";
+ print PRE ">miR-c-$novel\n$pri_seq[1]\n";
+########## reads count #############
+ <IN>;
+ my @count1;my @count2;my @count3;my @count4;
+ $aline=<IN>;
+ do {
+ chomp $aline;
+ my @reads=split/\t/,$aline;
+ my @pos=();
+ $reads[5]=~/(\d+)\.\.(\d+)/;
+# $pos[0] =$1-$flank1[1];
+# $pos[1] =$2-$flank1[1];
+ $pos[0]=$1;
+ $pos[1]=$2;
+ $reads[0]=~/:([\d|_]+)_x(\d+)$/;
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss ;$i++) {
+ if (!(defined $count3[$i])) {
+ $count3[$i]=0;
+ }
+ if (!(defined $count4[$i])) {
+ $count4[$i]=0;
+ }
+ $count2[$i]+=$ss[$i];
+
+ }
+# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 );
+# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 );
+# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1);
+# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1);
+ if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 )
+ {
+ for (my $i=0;$i<@ss;$i++) {
+ $count3[$i]+=$ss[$i];
+ }
+ }
+ if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){
+ for (my $i=0;$i<@ss;$i++) {
+ $count4[$i]+=$ss[$i];
+ }
+ } 
+ $aline=<IN>;
+ chomp $aline;
+ } until(length $aline < 1) ;
+ $"="\t";
+ print OUT "@count3\t@count4\t@count2\n";
+ $"=" ";
+}
+
+close IN;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -list -fa -pre -tag
+options:
+-i input file,predictions file
+-list output file miRNA list file
+-fa output file ,miRNA sequence fasta file.
+-pre output file, miRNA precursor fasta file.
+-tag string, sample names# eg: samA,samB,samC
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 miRPlant.pl
--- a/miRPlant.pl Wed Dec 03 02:03:27 2014 -0500
+++ b/miRPlant.pl Fri Dec 05 00:11:02 2014 -0500
b
@@ -17,7 +17,7 @@
 #use Term::ANSIColor;
 
 my %opts;
-GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");
+GetOptions(\%opts,"i:s@","tag:s@","phred:i","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");
 if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments
 &usage;
 }
@@ -33,7 +33,9 @@
 }
 
 my $phred_qv=64;
-
+if (defined $opts{'phred'}) {
+ $phred_qv=$opts{'phred'};
+}
 
 my @inputfiles=@{$opts{'i'}};
 my @inputtags=@{$opts{'tag'}};
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 miRPlant.xml
--- a/miRPlant.xml Wed Dec 03 02:03:27 2014 -0500
+++ b/miRPlant.xml Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -15,10 +15,6 @@\n   <command interpreter="perl">miRPlant.pl \r\n    ## Change this to accommodate the number of threads you have available.\r\n         -t \\${GALAXY_SLOTS:-4}\r\n-   ## Do or not delet rfam mapped tags\r\n-    #if $params.delet_rfam == "yes":\r\n-\t-D \r\n-\t#end if\r\n \t-path \\$SCRIPT_PATH\r\n \r\n     #for $j, $s in enumerate( $series )\r\n@@ -27,7 +23,43 @@\n     -tag ${s.tag}\r\n     #end for\r\n \r\n-    -format $format -gfa $gfa -pre $pre -mat $mat -rfam $rfam  -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe > run.log\r\n+      ## prepare bowtie index\r\n+      #set index_path = \'\'\r\n+      #if str($reference_genome.source) == "history":\r\n+          bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;\r\n+          #set index_path = \'genome\'\r\n+      #else:\r\n+          #set index_path = $reference_genome.index.fields.path\r\n+      #end if\r\n+\r\n+\r\n+   ## Do or not annotate rfam non-miRNA RNAs\r\n+    #if $params.annotate_rfam == "yes":\r\n+\r\n+\t\t  ## prepare Rfam bowtie index\r\n+\t\t  #set rfam_index_path = \'\'\r\n+\t\t  #if str($params.annotate_rfam.reference_rfam.source) == "history":\r\n+\t\t\t  bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa;\r\n+\t\t\t  #set rfam_index_path = \'rfam\'\r\n+\t\t  #else:\r\n+\t\t\t  #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path\r\n+\t\t  #end if\r\n+\r\n+\t\t-rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v \r\n+\t   ## Do or not delet rfam mapped tags\r\n+\t\t#if $params.annotate_rfam.rfamresult.delet_rfam == "yes":\r\n+\t\t-D \r\n+\t\t#end if\r\n+\t#end if\r\n+\r\n+\r\n+   ## Do or not annotate known microRNAs\r\n+    #if $params.known_microRNA == "yes":\r\n+\t-pre $pre -mat $mat \r\n+\t#end if\r\n+\r\n+\r\n+    -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred   -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log\r\n   </command>\r\n \r\n   <inputs>\r\n@@ -37,25 +69,136 @@\n      <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>\r\n    </repeat>\r\n \r\n+        <conditional name="reference_genome">\r\n+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">\r\n+            <option value="indexed">Use a built-in index</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="bowtie_indexes">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />\r\n+          </when>\r\n+        </conditional>\r\n+\r\n \t<conditional name="params">\r\n-\t\t<param name="delet_rfam" type="select" label="delet rfam mapped reads">\r\n+\t\t<param name="annotate_rfam" type="select" label="annotate rfam nocoding RNAs(excluding miRNA)">\r\n \t\t  <option value="yes" selected="true">yes</option>\r\n \t\t  <option value="no">no</option>\r\n \t\t </param>\r\n+\t\t <when value="yes">\r\n+\t\t\t<!--param name="rfam" type="data" label="rfam sequence file" /-->\r\n+\t\t\t<conditional name="reference_rfam">\r\n+\t\t\t  <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">\r\n+\t\t\t\t<option value="indexed">Use a built-in index</option>\r\n+\t\t\t\t<option value="history">Use one from the history</option>\r\n+\t\t\t  '..b'abel="report end-to-end hits less than v mismatches for rfam mapping"/>\r\n+\t\t </when>\r\n+    </conditional> <!-- params -->\r\n+\r\n \t<!--param type="data" name="idx2" label="rfam sequence bowtie index " -->\r\n \t<param name="a" type="text" value="ATCTCGTATG" label="3\' adapter sequence" />\r\n \t<param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />\r\n@@ -64,7 +207,6 @@\n \t<param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to precursors" />\r\n \t<param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />\r\n \t<param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />\r\n-\t<param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches"/>\r\n \t<param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />\r\n \t<param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />\r\n \t<param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />\r\n@@ -72,11 +214,21 @@\n   </inputs>\r\n \r\n   <outputs>\r\n-   <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list"/>\r\n-   <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment"/>\r\n-   <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result"/>\r\n-   <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file"/>\r\n-   <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file"/>\r\n+   <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">\r\n+   <filter>(params[\'known_microRNA\'] == \'Yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">\r\n+   <filter>(params[\'known_microRNA\'] == \'Yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">\r\n+   <filter>(params[\'known_microRNA\'] == \'Yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">\r\n+   <filter>(params[\'known_microRNA\'] == \'Yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">\r\n+   <filter>(params[\'known_microRNA\'] == \'Yes\')</filter>\r\n+   </data>\r\n    <data format="txt" name="novel microRNA express list" from_work_dir="miRPlant_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>\r\n    <data format="txt" name="novel microRNA precursor file" from_work_dir="miRPlant_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>\r\n    <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRPlant_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>\r\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 microRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,253 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-4-22\n+#Modified:\n+#Description: plant microRNA prediction\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");\n+if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n   The time is $time!\\n";\n+print "Command line:\\n   $0 @ARGV\\n";\n+\n+my  $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/miRNA_out/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$opts{\'i\'};\n+my $data=$opts{\'fa\'};\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $hit=25; # maximum reads mapping hits in genome\n+if (defined $opts{\'r\'}) {$hit=$opts{\'r\'};}\n+\n+my $upstream = 2; # microRNA 5\' extension\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\n+\n+my $downstream = 5;# microRNA 3\' extension\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\n+\n+my $maxd=defined $opts{\'dis\'} ? $opts{\'dis\'} : 200;\n+my $flank=defined $opts{\'flank\'} ? $opts{\'flank\'} :10;\n+my $mfe=defined $opts{\'mfe\'} ? $opts{\'mfe\'} : -20;\n+\n+$time=&Time();\n+print "$time, Checking input file!\\n";\n+\n+my (@filein,@mark);\n+&read_config();\n+\n+&checkfa($opts{pre}) if(defined $opts{pre});\n+&checkfa($opts{mat}) if(defined $opts{mat});\n+&checkfa($opts{gfa});\n+\n+chdir $dir;\n+my $data2=$data;\n+my $known_result=$dir."known_miRNA_Express";\n+if(defined $opts{pre} and defined $opts{mat}){\n+\t&quantify(); ### known microRAN quantify\n+\t$data2=$known_result."/mirbase_not_mapped.fa";\n+}\n+\n+my $genome_map=$dir."genome_match";\n+&genome($data2);\n+\n+#my $genome_map=&search($dir,"genome_match_");\n+my $mapfile=$genome_map."/genome_mapped.bwt";\n+my $mapfa=$genome_map."/genome_mapped.fa";\n+my $unmap=$genome_map."/genome_not_mapped.fa";\n+\n+#$time=Time();\n+#print "$time: Novel microRNA prediction!\\n\\n";\n+\n+&predict($mapfa);\n+\n+$time=Time();\n+print "$time: Program end!!\\n";\n+\n+############################## sub programs ###################################\n+sub predict{\n+\tmy ($file)=@_;\n+\t$time=&Time();\n+\tprint "$time: Novel microRNA prediction!\\n\\n";\n+\tmy $predict=$dir."Novel_miRNA_predict";\n+\tmkdir $predict;\n+\tchdir $predict;\n+\tsystem("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");\n+#\tprint "\\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\\n";\n+\t\n+\tsystem("bowtie-build -f excised_precursor.fa excised_precursor");\n+#\tprint "\\nbowtie-build -f excised_precursor.fa excised_precursor\\n";\n+\t\n+\tsystem("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");\n+#\tprint "\\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\\n";\n+\t\n+\tsystem("perl $scipt_path/convert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");\n+#\tprint "\\nconvert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\\n";\n+\n+\tsystem("sort -k 4  precursor_mapped.bst  > signatures.bst");\n+#\tprint "\\nsort +3 -25  precursor_mapped.bst  > ../signatures.bst\\n";\n+\n+\tchdir $dir;\n+\tsystem("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");\n+#\tpri'..b'ipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");\n+\n+}\n+\n+sub genome{\n+\tmy ($file)=@_;\n+\tif(defined $opts{\'idx\'}){\n+\t\tsystem("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ;\n+#\t\tprint "\\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\\n";\n+\t}else{\n+\t\tsystem("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ;\n+#\t\tprint "\\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\\n";\n+\t}\n+}\n+\n+sub quantify{\n+\tmy $tag=join "\\\\;" ,@mark;\n+\tsystem("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");\n+\tprint "\\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\\n";\n+}\n+\n+sub read_config{\n+\topen CON,"<$config";\n+\twhile (my $aline=<CON>) {\n+\t\tchomp $aline;\n+\t\tmy @tmp=split/\\t/,$aline;\n+\t\tpush @filein,$tmp[0];\n+\t\tpush @mark,$tmp[1];\n+\t\t#&check_rawdata($tmp[0]);\n+\t}\n+\tclose CON;\n+\tif (@filein != @mark) {\n+\t\t#&printErr();\n+\t\tdie "Maybe config file have some wrong!!!\\n";\n+\t}\n+}\n+sub checkfa{\n+\tmy ($file_reads)=@_;\n+\topen N,"<$file_reads";\n+\tmy $line=<N>;\n+\tchomp $line;\n+    if($line !~ /^>\\S+/){\n+        #printErr();\n+        die "The first line of file $file_reads does not start with \'>identifier\'\n+Reads file $file_reads is not a valid fasta file\\n\\n";\n+    }\n+    if(<N> !~ /^[ACGTNacgtn]*$/){\n+        #printErr();\n+        die "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+    }\n+\tclose N;\n+}\n+sub search{\n+\tmy ($dir,$str)=@_;\n+\topendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+\n+$0 -i -fa -gfa -idx -pre -mat -mis -e -f  -t  -o  -path\n+options:\n+-i input files, # config\n+\n+-fa ,#fasta sequence file\n+\n+-path scirpt path\n+\n+-gfa string,  input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-pre string,  input file #species specific microRNA precursor sequences\n+-mat string,  input file #species specific microRNA mature sequences\n+\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\n+-r int       a read is allowed to map up to this number of positions in the genome,default is 25 \n+\n+-dis <int>   Maximal space between miRNA and miRNA* (200)\n+-flank <int>   Flank sequence length of miRNA precursor (10)\n+-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)\n+\n+-t int,    number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 microRNA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA.xml Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,111 @@
+<tool id="micrornas" name="microRNA" veision="1.0.0">
+  <description>Plant microRNA analysis </description>
+
+  <requirements>
+ <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
+    <requirement type="package" version="0.12.7">bowtie</requirement>
+    <requirement type="set_environment">SCRIPT_PATH</requirement>
+    <!--requirement type="package" version="3.0.1">R</requirement!-->
+ <requirement type="package" version="2.59">SVG</requirement>
+ <requirement type="package" version="2.1.8">ViennaRNA</requirement>
+  </requirements>
+
+  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->
+
+  <command interpreter="perl">microRNA.pl 
+   ## Change this to accommodate the number of threads you have available.
+        -t \${GALAXY_SLOTS:-4}
+ -path \$SCRIPT_PATH
+
+
+   ## Do or not annotate known microRNAs
+    #if $params.known_microRNA == "yes":
+ -pre $pre -mat $mat 
+ #end if
+
+        ## prepare bowtie index
+        #set index_path = ''
+        #if str($reference_genome.source) == "history":
+            bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
+            #set index_path = 'genome'
+        #else:
+            #set index_path = $reference_genome.index.fields.path
+ #end if
+
+
+    -gfa ${index_path}.fa -idx $index_path -mis $mismatch  -i $config -fa $reads -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe  > run.log
+  </command>
+
+  <inputs>
+
+         <!-- reference genome -->
+        <conditional name="reference_genome">
+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+              <options from_data_table="bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+          </when>
+        </conditional>
+
+ <conditional name="params">
+ <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)">
+   <option value="yes" selected="true">yes</option>
+   <option value="no">no</option>
+  </param>
+  <when value="yes">
+ <param name="mat" type="data" label="mature microRNA sequence file" />
+ <param name="pre" type="data" label="precursor microRNA sequence fie" />
+  </when>
+    </conditional> <!-- params -->
+
+ <param name="config" type="data" label="Raw data configs file" />
+ <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />
+
+
+ <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
+ <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />
+ <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />
+ <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
+ <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />
+ <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />
+ <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" />
+
+  </inputs>
+
+  <outputs>
+   <data format="txt" name="known microRNA express list" from_work_dir="miRNA_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">
+   <filter>(params['known_microRNA'] == 'Yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA express alignment" from_work_dir="miRNA_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">
+   <filter>(params['known_microRNA'] == 'Yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA moRs result" from_work_dir="miRNA_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">
+   <filter>(params['known_microRNA'] == 'Yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA precursor file" from_work_dir="miRNA_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">
+   <filter>(params['known_microRNA'] == 'Yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA mature file" from_work_dir="miRNA_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">
+   <filter>(params['known_microRNA'] == 'Yes')</filter>
+   </data>
+   <data format="txt" name="novel microRNA express list" from_work_dir="miRNA_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>
+   <data format="txt" name="novel microRNA precursor file" from_work_dir="miRNA_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>
+   <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRNA_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>
+   <data format="txt" name="non-microRNA sequence FASTA file" from_work_dir="miRNA_out/non_miRNA_reads.fa" label="${tool.name} on ${on_string}: Sequence FASTA file of non-microRNA tags"/>
+
+  </outputs>
+
+ <help>
+
+ </help>
+ </tool>
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 nibls.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nibls.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,319 @@\n+#!/usr/bin/perl\n+#####################################################################################################\n+#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. \n+#\n+# Dan MacLean\n+# dan.maclean@sainsbury-laboratory.ac.uk\n+#\n+# This program is free academic software; academic and non-profit\n+# users may redistribute it freely.\n+#\n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  \n+#\n+# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007\n+# see included file GPL3.txt\n+#\n+#\n+\n+\n+###Dont forget you will need ... \n+#####################################################################################################\n+# Boost::Graph \n+#Copyright 2005 by David Burdick\n+# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm\n+#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.\n+#####################################################################################################\n+\n+\n+\n+use strict;\n+use warnings;\n+use Boost::Graph;\n+use Getopt::Long;\n+\n+\n+my $usage = "usage: $0 -f GFF_FILE [options]\\n\\n -m minimum inclusion distance (default 5)\\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\\n";\n+\n+my $gff_file ;\n+my $min_inc = 5;\n+my $clus = 0.6;\n+my $buff = 0;\n+my $output_file;\n+my $temp;\n+my $mark;\n+\n+GetOptions(\n+\n+\t\'c=f\'          => \\$clus,\n+\t\'m=i\' => \\$min_inc,\n+\t\'f|file=s\'     => \\$gff_file,\n+\t\'b=i\' => \\$buff,\n+\t\'o=s\' => \\$output_file,\n+\t\'t=s\' => \\$temp,\n+\t\'k=s\' => \\$mark\n+) ;\n+\n+\n+die $usage unless $gff_file;\n+\n+\n+my $starttime = time;\n+warn "started $starttime\\n";\n+\n+## load in data\n+my %molecules; # stores starts and ends of srnas\n+open GFF, "<$gff_file";\n+\n+while (my $entry = <GFF>){\n+\n+\tchomp $entry;\n+\tnext if($entry=~/^\\#/);\n+\tmy @data = split(/\\t/,$entry);\n+\tmy $chr=shift @data;\n+\tmy $strand=shift @data;\n+\tmy $start=shift @data;\n+\tmy $end=shift @data;\n+#\tmy $length1=$end-$start+1;\n+#\tif ($length1>30) {\n+#\t\t$length1=40;\n+#\t}\n+\tmy $total;\n+\tfor (my $s=0;$s<@data ;$s++) {\n+\t\t$total+=$data[$s];\n+\t}\n+\tpush @data,$total;\n+#\tpush @data,$length1;\n+#\tif (defined $molecules{$chr}{$start}{$end}{$strand}) {\n+#\t\tmy @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});\n+#\t\tfor (my $i=0;$i<$#old_data ;$i++) {\n+#\t\t\t$data[$i]+=$old_data[$i];\n+#\t\t}\n+#\t}\n+\tmy $data=join ";",@data;\n+\t$molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information\n+\t#print "$chr\\t$start\\t$end\\n";\n+}\n+\n+close GFF;\n+\n+warn "Data loaded...\\nBuilding graphs and finding loci\\nPlease be patient, this can take a while...\\n";\n+\n+my @sample=split/\\#/,$mark;\n+$mark=join"\\"\\t\\"",@sample;\n+open OUT, ">$output_file";\n+print OUT "\\"Chr\\"\\t\\"MajorLength\\"\\t\\"Percent\\"\\t\\"$mark\\"\\n";\n+open CLUSTER,">$temp";\n+print CLUSTER "\\#Chr\\tMajorLength\\tPercent\\tTagsNumber\\tTagsInfor\\n";\n+foreach my $chromosome (keys %molecules){\n+\tmy $g = new Boost::Graph(directed=>0);\n+\tmy @starts = keys(%{$molecules{$chromosome}} );\n+\t@starts = sort {$a <=> $b} @starts;  \n+\n+\twhile (my $srna_start = shift @starts){   ## work from left most sRNA to right most, add to graph if they close enough\n+\n+\n+\t\tforeach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){\n+\n+\n+\t\t\t###use new graph if the next srna is too far away from this one.. \n+\t\t\tif(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){\n+\n+\n+\t\t\t\t##dump the info from the old graph\n+\t\t\t\tif (scalar(@{$g->get_nodes()}) > 2){\n+\n+\t\t\t\t\tmy $cluster_coeff = get_cc($g);\n+\t\t\t\t\tif ($cluster_coeff >= $clus){\n+\t\t\t\t\t dump_locus($g, $cluster_coeff);\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\n+\n+\t\t\t\t$g = new Boost::Graph(directed=>0);\n+\n+\t\t\t}'..b'start}}){\n+\n+\t\t\t\t\t\tmy $end_node = $chromosome . \':\' . $start . \':\' . $end;\n+\t\t\t\t\t\t$g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>\'1\');\n+\t\t\t\t\t}\n+\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\n+\t\tif (!(defined $starts[0])) {\n+\t\t\t##dump the info from the last graph\n+\t\t\tif (scalar(@{$g->get_nodes()}) > 2){\n+\n+\t\t\t\tmy $cluster_coeff = get_cc($g);\n+\t\t\t\tif ($cluster_coeff >= $clus){\n+\t\t\t\t dump_locus($g, $cluster_coeff);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+\n+warn "Loci printed\\nFinished\\n";\n+\n+my $endtime = time;\n+\n+my $elapsed = $endtime - $starttime;\n+\n+warn "Time elapsed = $elapsed s\\n";\t\n+close OUT;\n+close CLUSTER;\n+#########################################################################################\n+sub get_cc{   ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2\n+\n+\tmy $graph = shift;\n+\n+\tmy @component = @{$graph->get_nodes()}; #number of nodes\n+\tmy @clustering_coefficients;\n+\n+\tforeach my $vertex (@component)\n+\t{\n+\n+\t\tmy @neighbours = @{$graph->neighbors($vertex)};\n+\n+\t\tmy %edges_in_graph;\n+\n+\t\tmy $n = @neighbours; #n = the number of neighbours\n+\t\tmy $k = ($n * ($n - 1))/2;\t#k = total number of possible connections\n+\n+\t\tmy $e= 0; #actual number of connections within sub-graph\n+\n+\t\tforeach my $neighbour (@neighbours)\n+\t\t{\n+\t\t\tforeach my $neighbour_2 (@neighbours)\n+\t\t\t{\n+\t\t\tmy $edge1 = "$neighbour\\t$neighbour_2";\n+\t\t\tmy $edge2 = "$neighbour_2\\t$neighbour";\n+\t\t\t\tunless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})\n+\t\t\t\t{\n+\t\t\t\t\tif ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))\n+\t\t\t\t\t{\n+\t\t\t\t\t\t++$e;\n+\t\t\t\t\t\t$edges_in_graph{$edge1}=1;\n+\t\t\t\t\t\t$edges_in_graph{$edge2}=1;\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\n+\t\tif ($k >= 1)\n+\t\t{\t\n+\t\tmy $c = $e / $k;\n+\t\tpush @clustering_coefficients, $c;\n+\t\t}\n+\t\telse {push @clustering_coefficients, \'0\';}\n+\t}\n+\n+\tmy $graph_n = scalar(@clustering_coefficients);\n+\tmy $graph_cc = 0;\n+\tforeach my $cc (@clustering_coefficients){ \n+\n+\t\t$graph_cc = $graph_cc + $cc;\n+\n+\t}\n+\t$graph_cc = $graph_cc / $graph_n;\n+\n+\treturn $graph_cc;\n+}\n+\n+############################################################################################################\n+\n+sub dump_locus{\n+\n+\tmy $g = shift;\n+\tmy $cc = shift;\n+\tmy $chr;\n+\tmy $start = 1000000000000000000000000000000000000000000000; \n+\tmy $end = -1;\n+\tmy @sample;\n+\tmy @tag;\n+\tforeach my $node (@{$g->get_nodes()}){\n+\n+\t\t$node =~ m/^(\\S+):(\\d+):(\\d+)$/;\n+\t\tmy $c=$1;\n+\t\tmy $s=$2;\n+\t\tmy $e=$3;\n+\t#\tmy @data;\n+\t\tforeach my $str (keys %{$molecules{$c}{$s}{$e}}) {\n+\t\t\tmy @data=split(/;/,$molecules{$c}{$s}{$e}{$str});\n+\t\t\tpush @tag,($s.",".$e.",".$str.",".$data[-1]);\n+#\t\t\tfor (my $i=0;$i<$#old_data ;$i++) {\n+#\t\t\t\t$data[$i]+=$old_data[$i];\n+#\t\t\t}\n+\t\t\tmy $length=$e-$s+1;\n+\t\t\tif ($length>30) {\n+\t\t\t\t$length=40;\n+\t\t\t}\n+\t\t\tpush @data,$length;\n+\t\t\tmy $data=join ";",@data;#sample_exp/total_exp/length;\n+\t\t\tpush @sample,$data;\n+\t\t}\n+\n+\t\t$chr = $c;\n+\t\t$start = $s if $s < $start;\n+\t\t$end = $e if $e > $end;\n+\t}\n+\tmy $tag=join";",@tag;\n+\tmy $tag_number=@tag;\n+\tmy ($max_length,$max_p,@cluster_exp)=Max_length(\\@sample);\n+\tif ($max_length==40) {\n+\t\t$max_length="\\>30";\n+\t}\n+\tmy $cluster_exp=join"\\t",@cluster_exp;\n+\tmy $gff = $chr."\\:$start\\-$end\\t".$max_length."nt\\t".$max_p."\\t" . $cluster_exp;\n+\tprint CLUSTER "$chr\\:$start\\-$end\\t$max_length"."nt\\t$max_p\\t$tag_number\\t$tag\\n";\n+\tprint OUT $gff, "\\n";  \n+}\n+\n+sub Max_length{\n+\tmy @exp=@{$_[0]};\n+\tmy %sample_length;\n+\tmy $total_exp;\n+\tmy @each;\n+\tfor (my $i=0;$i<=$#exp ;$i++) {\n+\t\tmy @tag=split/;/,$exp[$i];\n+\t\tmy $length=pop(@tag);\n+\t\tmy $exp=pop(@tag);\n+\t\t$sample_length{$length}+=$exp;\n+\t\t$total_exp+=$exp;\n+\t\tfor (my $j=0;$j<=$#tag ;$j++) {\n+\t\t\t$each[$j]+=$tag[$j];\n+\t\t}\n+\t}\n+\tmy $max=0;\n+\tmy $max_key;\n+\tforeach my $key (sort keys %sample_length) {\n+\t\tmy $p=$sample_length{$key}/$total_exp;\n+\t\tif ($p>$max) {\n+\t\t\t$max=$p;\n+\t\t\t$max_key=$key;\n+\t\t}\n+\t\t$sample_length{$key}=sprintf("%.2f",$p);\n+\t}\n+\treturn($max_key,$sample_length{$max_key},@each);\n+}\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 non_miRNA_reads.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/non_miRNA_reads.pl Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","fa=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'fa'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+my (%fa,%seq);
+while (my $aline=<IN>){
+ chomp $aline;
+ $aline=~s/^>//;
+ my $seq=<IN>;
+ chomp $seq;
+ #$seq{$seq}=$aline;
+ $fa{$aline}=$seq;
+}
+close IN;
+
+open IN,"<$opts{i}";
+while(my $aline=<IN>){
+ chomp $aline;
+ if($aline=~/^\S+_x\d+/){
+ $aline=~/^(\S+_x\d+)/;
+ my $name=$1;
+ delete($fa{$name});
+ }
+}
+close IN;
+
+open OUT,">$fileout"; #output file  
+foreach my $key (keys %fa) {
+ print OUT ">$key\n$fa{$key}\n";
+}
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 phased_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phased_siRNA.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,254 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+#use Math::Cephes qw(:hypergeometrics); 
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^\#/) {
+ print OUT $aline,"\tp-value\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my @pos=split/:|-/,$tmp[0];
+ $tmp[1]=~s/nt//;
+ my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]);
+
+ print OUT $aline,"\t",$pv,"\n";
+}
+close IN;
+close OUT;
+
+sub phase{
+ my ($tagL,$start,$end,$tags)=@_;
+ my @tmp=split/\;/,$tags;
+ my %tag;
+ for (my $i=0;$i<@tmp;$i++) {
+ my @aa=split/\,/,$tmp[$i];
+ next if($aa[1]-$aa[0]+1 != $tagL);
+# $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+");
+# $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-");
+ $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+");
+ $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-");
+ }
+
+ my $pv=&pvalue2(\%tag,$tagL,$start,$end);
+
+ return $pv;
+}
+
+sub pvalue2{
+ my ($tag,$tagL,$start,$end)=@_;
+
+ my $p=1; my $pp=1;
+ foreach my $ccs(keys %{$tag}){
+ my $n=0;
+ my $k=0;
+ my $K=0;
+ my $N=0;
+
+ my $cor= $ccs;
+ my $ss=$cor;
+ my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end;
+
+ my $max=0;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i;
+ if (defined $$tag{$x})
+ {
+ if ($max<$$tag{$x}) {$max=$$tag{$x};}
+ $n +=$$tag{$x};
+ $N++;
+ }
+ }
+ for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i;
+ if (defined $$tag{$x})
+ {
+ $k +=$$tag{$x};
+ $K++;
+ }
+ }
+
+
+ return $p if($K<3);
+ return $p if($max/$n>0.8);
+
+ my $pn=0; 
+ next if($n==$k);
+ $pn=10*$k/($n-$k)+1;
+ $pn = $pn ** ($K-2);
+ $pn = log($pn);
+ if ($p<$pn) {
+ $p=$pn;
+ }
+
+ }
+
+ return $p;
+
+}
+
+sub pvalue{
+ my ($tag,$tagL,$start,$end)=@_;
+
+ my $p=1;
+ foreach my $ccs(keys %{$tag}){
+ my $n=-1;
+ my $k=-1;
+
+ my ($cor, $str)=split(/,/, $ccs);
+ if ($str eq "+") # small RNAs on the Watson strand
+ {
+ my $ss=$cor;
+ my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i.","."+";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i.","."+";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+
+ for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
+ {
+ my $x=$j.","."-";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+
+ for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
+ {
+ my $x=$j.","."-";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+ }
+
+ elsif ($str eq "-") # small RNAs on the Crick strand
+ {
+ my $ee=$cor;
+ my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i.","."-";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i.","."-";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+
+ for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
+ {
+ my $x=$j.","."+";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
+ {
+ my $x=$j.","."+";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+ }
+
+ next if($k<3);
+
+ my $pn=0; my $N=$tagL*11*2-1; my $M=21;
+ for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
+ {
+ my $c=1;
+ my $rr=1;
+ my $rw=1;
+
+ for (my $j=0; $j<=$w-1; $j++)
+ {
+ $c=$c*($M-$j)/($j+1);
+ }
+ for (my $x=0; $x<=$n-$w-1; $x++)
+ {
+ $rr=$rr*($N-$M-$x)/($x+1);
+ }
+ for (my $y=0; $y<=$n-1; $y++)
+ {
+ $rw=$rw*($y+1)/($N-$y);
+ }
+ my $pr=$c*$rr*$rw;
+
+ $pn=$pn+$pr;
+ }
+
+ $p=$pn<$p ? $pn :$p;
+
+ if ($p<0.001) #select and output small RNA clusters with p<0.001
+
+ {
+
+ return $p;
+
+ }
+
+ }
+ return $p;
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 preProcess.xml
--- a/preProcess.xml Wed Dec 03 02:03:27 2014 -0500
+++ b/preProcess.xml Fri Dec 05 00:11:02 2014 -0500
[
@@ -12,7 +12,7 @@
 
   <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->
 
-  <command interpreter="perl">miRPlant.pl 
+  <command interpreter="perl">preProcess.pl 
    ## Change this to accommodate the number of threads you have available.
         -t \${GALAXY_SLOTS:-4}
  -path \$SCRIPT_PATH
@@ -25,7 +25,16 @@
 
    ## Do or not annotate rfam non-miRNA RNAs
     #if $params.annotate_rfam == "yes":
- -rfam $rfam 
+   ## prepare Rfam bowtie index
+   #set rfam_index_path = ''
+   #if str($params.annotate_rfam.reference_rfam.source) == "history":
+   bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa;
+   #set rfam_index_path = 'rfam'
+   #else:
+   #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path
+   #end if
+
+ -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v 
  #end if
 
         ## prepare bowtie index
@@ -37,7 +46,7 @@
             #set index_path = $reference_genome.index.fields.path
  #end if
 
-    -format $format -gfa ${index_path}.fa -idx $index_path -a $a -M $mapnt -min $min -max $max -mis $mismatch -v $v > run.log
+    -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch  > run.log
   </command>
 
   <inputs>
@@ -75,6 +84,10 @@
 
  <!--param name="gfa"  type="data" label="genome sequence fasta file"/-->
  <!--param type="data" name="index" label="genome sequence bowtie index"/-->
+ <param name="phred" type="select" lable="input quals are Phred+64 or Phred+33" multiple="false">
+   <option value="64">Phred+64</option>
+   <option value="33" selected="true">Phred+33</option>
+ </param>
  <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" />
  <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />
  <param name="min" type="integer" value="19" label="minimum microRNA length" />
@@ -87,8 +100,27 @@
    <option value="no">no</option>
   </param>
   <when value="yes">
- <param name="rfam" type="data" label="rfam sequence file" />
- <!--param type="data" name="idx2" label="rfam sequence bowtie index " -->
+ <!--param name="rfam" type="data" label="rfam sequence file" /-->
+  <when value="yes">
+ <!--param name="rfam" type="data" label="rfam sequence file" /-->
+ <conditional name="reference_rfam">
+   <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+ <option value="indexed">Use a built-in index</option>
+ <option value="history">Use one from the history</option>
+   </param>
+   <when value="indexed">
+ <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team">
+   <options from_data_table="rfam_bowtie_indexes">
+ <filter type="sort_by" column="2"/>
+ <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+   </options>
+ </param>
+   </when>
+   <when value="history">
+ <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
+   </when>
+ </conditional>
+
  <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/>
   </when>
     </conditional> <!-- params -->
@@ -112,6 +144,8 @@
    <data format="txt" name="Rfam not mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_not_mapped.fa" label="${tool.name} on ${on_string}: Rfam not mapped FASTA file">
    <filter>(params['annotate_rfam'] == 'Yes')</filter>
    </data>
+   <data format="html" name="input config" from_work_dir="preProcess/input_config" label="${tool.name} on ${on_string}: input config"/>
+
   </outputs>
 
  <help>
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 precursors.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/precursors.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,829 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2013/7/19\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+#use RNA;\n+\n+my %opts;\n+GetOptions(\\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");\n+if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $checkno=1;\n+my $filein=$opts{\'map\'};\n+my $faout=$opts{\'o\'};\n+my $strout=$opts{\'s\'};\n+my $genome= $opts{\'g\'};\n+\n+my $maxd=defined $opts{\'d\'} ? $opts{\'d\'} : 200;\n+my $flank=defined $opts{\'f\'}? $opts{\'f\'} : 10;\n+\n+my $MAX_ENERGY=-18;\n+if (defined $opts{\'e\'}) {$MAX_ENERGY=$opts{\'e\'};}\n+my $MAX_UNPAIR=5;\n+my $MIN_PAIR=15;\n+my $MAX_SIZEDIFF=4;\n+my $MAX_BULGE=2;\n+my $ASYMMETRY=5;\n+my $MIN_UNPAIR=0;\n+my $MIN_SPACE=5;\n+my $MAX_SPACE=$maxd;\n+my $FLANK=$flank;\n+\n+######### load in genome sequences start ########\n+my %genome;\n+my %lng;\n+my $name;\n+open IN,"<$genome";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\#/);\n+\tif ($aline=~/^>(\\S+)/) {\n+\t\t$name=$1;\n+\t\tnext;\n+\t}\n+\t$genome{$name} .=$aline;\n+}\n+close IN;\n+foreach my $key (keys %genome) {\n+\t$lng{$key}=length($genome{$key});\n+}\n+####### load in genome sequences end ##########\n+\n+my %breaks; ### reads number bigger than 3\n+open IN,"<$filein"; #input file  \n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\t$tmp[0]=~/_x(\\d+)$/;\n+\tmy $no=$1;\n+\tnext if($no<3);\n+\t#my $trand=&find_strand($tmp[9]);\n+\t#my @pos=split/\\.\\./,$tmp[5];\n+\tmy $end=$tmp[3]+length($tmp[4])-1;\n+\tif($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}\n+\tpush @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base\n+}\n+close IN;\n+\n+my %cites; ### peaks\n+foreach my $chr (keys %breaks) {\n+\tforeach my $strand (keys %{$breaks{$chr}}) {\n+\t\tmy @array=@{$breaks{$chr}{$strand}};\n+\t\t@array=sort{$a->[0]<=>$b->[0]} @array;\n+\t\tfor (my $i=0;$i<@array;$i++) {\n+\t\t\tmy $start=$array[$i][0];my $end=$array[$i][1];\n+\t\t\tmy @subarray=();\n+\t\t\tpush @subarray,$array[$i];\n+\n+\t\t\tfor (my $j=$i+1;$j<@array;$j++) {\n+\t\t\t\tif ($start<$array[$j][1] && $end>$array[$j][0]) {\n+\t\t\t\t\tpush @subarray,$array[$j];\n+\t\t\t\t\t($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);\n+\t\t\t\t}\n+\t\t\t\telse{\n+\t\t\t\t\t$i=$j;\n+\t\t\t\t\t&find_cites(\\@subarray,$chr,$strand);\n+\t\t\t\t\tlast;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+\n+open FA,">$faout"; #output file \n+open STR,">$strout";\n+foreach my $chr (keys %cites) {\n+\tforeach my $strand (keys %{$cites{$chr}}) {\n+\n+\t\tmy @array2=@{$cites{$chr}{$strand}};\n+\t\t@array2=sort{$a->[0]<=>$b->[0]} @array2;\n+\t\t&excise(\\@array2,$chr,$strand);\n+\t}\n+}\n+close FA;\n+close STR;\n+sub oneCiteDn{\n+\tmy ($array,$a,$chr,$strand)=@_;\n+\n+\tmy $ss=$$array[$a][0]-$flank;\n+\t$ss=0 if($ss<0);\n+\tmy $ee=$$array[$a][1]+$maxd+$flank;\n+\t$ee=$lng{$chr} if($ee>$lng{$chr});\n+\t\n+\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n+\tif($strand eq "-"){$seq=revcom($seq);}\n+\n+\tmy $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);\n+\treturn $val;\n+}\n+sub oneCiteUp{\n+\tmy ($array,$a,$chr,$strand)=@_;\n+\n+\tmy $ss=$$array[$a][0]-$maxd-$flank;\n+\t$ss=0 if($ss<0);\n+\tmy $ee=$$array[$a][1]+$flank;\n+\t$ee=$lng{$chr} if($ee>$lng{$chr});\n+\t\n+\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n+\tif($strand eq "-"){$seq=revcom($seq);}\n+\n+\tmy $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);\n+\treturn $val;\n+\n+}\n+\n+sub twoCites{\n+\tmy ($array,$a,$b,$chr,$strand)=@_;\n+\n+\tmy $ss=$$array[$a][0]-$flank;\n+\t$ss=0 if($ss<0);\n+\tmy $ee=$$array[$b][1]+$flank;\n+\t$ee=$lng{$chr} if($ee>$lng{$chr});\n+\t\n+\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n+\tif($strand eq "-"){$seq=revcom($seq);}\n+\n+#\tmy( $str,$mfe)=RNA::fold($seq);\n+#\treturn 0 if($mfe>$MAX_ENERGY); ### minimum mfe\n+\tmy $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee);\n+\t\n+\treturn $val;\n+\n+}\n+sub excise{\n+\tmy ($cluster,$chr,$strand)=@_;\n+\n+\tmy $last_pos=0;\n+\tfor (my $i=0;$i<@{$cluster};$i++) {\n+\t\tnext if($$cluster[$i][0]<$last_pos);\n+\t\tmy $ok=0;\n+\t\tfor (my $j=$i+1;$j<@{$cluster} '..b'        s2\n+\n+\tmy $pair_num=0;\n+\tmy $overhang1=0;\n+\tmy $overhang2=0;\n+\tmy ($s1,$e1,$s2,$e2);\n+\tforeach my $i ($beg1..$end1) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t$s1=$i;\n+\t\t\t\t$e2=$j;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tforeach my $i (reverse ($beg1..$end1)) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t$e1=$i;\n+\t\t\t\t$s2=$j;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t}\n+\t}\n+\n+#\tprint "$beg1,$end1 $s1,$e1\\n";\n+#\tprint "$beg2,$end2 $s2,$e2\\n";\n+\n+\tforeach my $i ($beg1..$end1) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t++$pair_num;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tif (defined $s1 && defined $e2) {\n+\t\t$overhang1=($end2-$e2)-($s1-$beg1);\n+\t}\n+\tif (defined $e1 && defined $s2) {\n+\t\t$overhang2=($end1-$e1)-($s2-$beg2);\n+\t}\n+\t\n+\tif ($pair_num < 13) {\n+\t\t$like_mir_duplex=0;\n+\t}\n+\tif ($overhang1 < 0 && $overhang2 < 0) {\n+\t\t$like_mir_duplex=0;\n+\t}\n+\treturn ($like_mir_duplex,$pair_num,$overhang1,$overhang2);\n+}\n+sub parse_struct {\n+\tmy $struct=shift;\n+\tmy $table=shift;\n+\n+\tmy @t=split(\'\',$struct);\n+\tmy @lbs; # left brackets\n+\tforeach my $k (0..$#t) {\n+\t\tif ($t[$k] eq "(") {\n+\t\t\tpush @lbs, $k+1;\n+\t\t}\n+\t\telsif ($t[$k] eq ")") {\n+\t\t\tmy $lb=pop @lbs;\n+\t\t\tmy $rb=$k+1;\n+\t\t\t$table->{$lb}=$rb;\n+\t\t\t$table->{$rb}=$lb;\n+\t\t}\n+\t}\n+\tif (@lbs) {\n+\t\twarn "unbalanced RNA struct.\\n";\n+\t}\n+}\n+sub which_arm {\n+\tmy $substruct=shift;\n+\tmy $arm;\n+\tif ($substruct=~/\\(/ && $substruct=~/\\)/) {\n+\t\t$arm="-";\n+\t}\n+\telsif ($substruct=~/\\(/) {\n+\t\t$arm="5p";\n+\t}\n+\telse {\n+\t\t$arm="3p";\n+\t}\n+\treturn $arm;\n+}\n+sub biggest_bulge {\n+\tmy $struct=shift;\n+\tmy $bulge_size=0;\n+\tmy $max_bulge=0;\n+\twhile ($struct=~/(\\.+)/g) {\n+\t\t$bulge_size=length $1;\n+\t\tif ($bulge_size > $max_bulge) {\n+\t\t\t$max_bulge=$bulge_size;\n+\t\t}\n+\t}\n+\treturn $max_bulge;\n+}\n+sub get_asy {\n+\tmy($table,$a1,$a2)=@_;\n+\tmy ($pre_i,$pre_j);\n+\tmy $asymmetry=0;\n+\tforeach my $i ($a1..$a2) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif (defined $pre_i && defined $pre_j) {\n+\t\t\t\tmy $diff=($i-$pre_i)+($j-$pre_j);\n+\t\t\t\t$asymmetry += abs($diff);\n+\t\t\t}\n+\t\t\t$pre_i=$i;\n+\t\t\t$pre_j=$j;\n+\t\t}\n+\t}\n+\treturn $asymmetry;\n+}\n+\n+sub peaks{\n+\tmy @cluster=@{$_[0]};\n+\n+\treturn if(@cluster<1);\n+\n+\tmy $max=0; my $index=-1;\n+\tfor (my $i=0;$i<@cluster;$i++) {\n+\t\tif($cluster[$i][2]>$max){\n+\t\t\t$max=$cluster[$i][2];\n+\t\t\t$index=$i;\n+\t\t}\n+\t}\n+#\t&excise(\\@cluster,$index,$_[1],$_[2]);\n+\treturn($index);\n+}\n+\n+sub find_cites{\n+\tmy @tmp=@{$_[0]};\n+\tmy $i=&peaks(\\@tmp);\n+\t\n+\tmy $start=$tmp[$i][0];\n+\tmy $total=0; my $node5=0;\n+\tfor (my $j=0;$j<@tmp ;$j++) {\n+\t\t$total+=$tmp[$j][2];\n+\t\t$node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);\n+\t}\n+\tpush @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);\n+}\n+\n+sub newpos{\n+\tmy ($a,$b,$c,$d)=@_;\n+\tmy $s= $a>$c ? $c : $a;\n+\tmy $e=$b>$d ? $b : $d;\n+\treturn($s,$e);\n+}\n+\n+sub rev{\n+\n+    my($sequence)=@_;\n+\n+    my $rev=reverse $sequence;   \n+\n+    return $rev;\n+}\n+\n+sub com{\n+\n+    my($sequence)=@_;\n+\n+    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;   \n+ \n+    return $sequence;\n+}\n+\n+sub revcom{\n+\n+    my($sequence)=@_;\n+\n+    my $revcom=rev(com($sequence));\n+\n+    return $revcom;\n+}\n+\n+sub find_strand{\n+\n+    #A subroutine to find the strand, parsing different blast formats\n+    my($other)=@_;\n+\n+    my $strand="+";\n+\n+    if($other=~/-/){\n+\t$strand="-";\n+    }\n+\n+    if($other=~/minus/i){\n+\t$strand="-";\n+    }\n+\n+    return($strand);\n+}\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -map -g -d -f -o -s -e\n+options:\n+  -map input file# align result # bst. format\n+  -g input file # genome sequence fasta format\n+  -d <int>   Maximal space between miRNA and miRNA* (200)\n+  -f <int>   Flank sequence length of miRNA precursor (10)\n+  -o output file# percursor fasta file\n+  -s output file# precursor structure file\n+  -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)\n+\n+  -h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 quantify.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,502 @@\n+#!/usr/bin/perl -w\r\n+#Filename:\r\n+#Author: Tian Dongmei\r\n+#Email: tiandm@big.ac.cn\r\n+#Date: 2013/7/19\r\n+#Modified:\r\n+#Description: \r\n+my $version=1.00;\r\n+\r\n+use File::Path;\r\n+use strict;\r\n+use File::Basename;\r\n+#use Getopt::Std;\r\n+use Getopt::Long;\r\n+#use RNA;\r\n+\r\n+my %opts;\r\n+GetOptions(\\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h");\r\n+if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments\r\n+&usage;\r\n+}\r\n+\r\n+my $read=$opts{\'r\'};\r\n+my $pre=$opts{\'p\'};\r\n+my $mature=$opts{\'m\'};\r\n+\r\n+my $dir=$opts{\'o\'};\r\n+unless ($dir=~/\\/$/) {$dir .="/";}\r\n+if (not -d $dir) {\r\n+\tmkdir $dir;\r\n+}\r\n+\r\n+my $threads=defined $opts{\'t\'} ? $opts{\'t\'} : 1;\r\n+my $mismatch=defined $opts{\'mis\'} ?  $opts{\'mis\'} : 0;\r\n+\r\n+my $upstream = 2;\r\n+my $downstream = 5;\r\n+\r\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\r\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\r\n+\r\n+my $marks=defined $opts{\'tag\'} ? $opts{\'tag\'} : "";\r\n+\r\n+my $time=Time();\r\n+\r\n+my $tmpdir="${dir}/known_miRNA_Express";\r\n+if(not -d $tmpdir){\r\n+\tmkdir($tmpdir);\r\n+}\r\n+chdir $tmpdir;\r\n+\r\n+`cp $pre ./`;\r\n+my $pre_file_name=basename($pre);\r\n+\r\n+&mapping(); # matures align to precursors && reads align to precursors;\r\n+\r\n+my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;\r\n+&maturePosOnPre(); # acknowledge mature positions on precursor \r\n+\r\n+my %pre_read;\r\n+&readPosOnPre(); # acknowledge reads positions on precursors\r\n+\r\n+if(!(defined $opts{\'tag\'})){\r\n+\tforeach my $key (keys %pre_read) {\r\n+\t\t$pre_read{$key}[0][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @ss=split/_/,$1;\r\n+\t\tfor (my $i=1;$i<=@ss;$i++) {\r\n+\t\t\t$marks .="Smp$i;";\r\n+\t\t}\r\n+\t\tlast;\r\n+\t}\r\n+}\r\n+\r\n+my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."\r\n+&attachPre();\r\n+\r\n+my $preno=scalar (keys %pre);\r\n+print "Total Precursor Number is $preno !!!!\\n";\r\n+\r\n+my %struc; #mature  star  loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;\r\n+&structure();\r\n+\r\n+\r\n+##### analysis and print out  && moRs\r\n+my $aln=$dir."known_microRNA_express.aln";\r\n+my $list=$dir."known_microRNA_express.txt";\r\n+my $moRs=$dir."known_microRNA_express.moRs";\r\n+\r\n+system("ln -s $mature $dir/known_microRNA_mature.fa ");\r\n+system("ln -s $pre $dir/known_microRNA_precursor.fa ");\r\n+\r\n+open ALN,">$aln";\r\n+open LIST,">$list";\r\n+open MORS,">$moRs";\r\n+\r\n+$"="\\t"; ##### @array print in \\t\r\n+\r\n+my @marks=split/\\;/,$marks;\r\n+#print LIST "#matueID\\tpreID\\tpos1\\tpos2\\tmatureExp\\tstarExp\\ttotalExp\\n";\r\n+print LIST "#matueID\\tpreID\\tpos1\\tpos2";\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_matureExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_starExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_totalExp";\r\n+}\r\n+print LIST "\\n";\r\n+print ALN "#>precursor ID \\n#precursor sequence\\n#precursor structure (mfe)\\n#RNA_seq\\t@marks\\ttotal\\n";\r\n+print MORS "#>precursor ID\\tstrand\\texpress_reads\\texpress_reads\\/total_reads\\tblock_number\\tprecursor_sequence\\n#\\tblock_start\\tblock_end\\t@marks\\ttotal\\ttag_number\\tsequence\\n";\r\n+my %moRs;\r\n+\r\n+foreach my $key (keys %pre) {\r\n+\tprint ALN ">$key\\n$pre{$key}\\n$struc{$key}{struc}  ($struc{$key}{mfe})\\n";\r\n+\tnext if(! (exists $pre_read{$key}));\r\n+\tmy @array=@{$pre_read{$key}};\r\n+\t@array=sort{$a->[3]<=> $b->[3]} @array;\r\n+\t\r\n+\tmy $length=length($pre{$key});\r\n+\r\n+\tmy $maxline=-1;my $max=0; ### storage the maxinum express read line\r\n+\tmy $totalReadsNo=0; \r\n+\tmy @not_over=(); ### new read format better for moRs analysis\r\n+\r\n+####print out Aln file start\r\n+\tfor (my $i=0;$i<@array;$i++) {\r\n+\t\tmy $maps=$array[$i][3]+1;\r\n+\t\tmy $mape=$array[$i][3]+length($array[$i][4]);\r\n+\t\tmy $str="";\r\n+\t\t$str .= "." x ($maps-1);\r\n+\t\t$str .=$array[$i][4];\r\n+\t\t$str .="." x ($length-$mape);\r\n+\t\t$str .="        ";\r\n+\r\n+\t\t$array[$i][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @sample=split /\\_/,$1;\r\n+\t\tmy $total=$2;\r\n+\t\tprint ALN $str,"@s'..b'b{$i};\r\n+#\t\t\tif($a>$b){\r\n+#\t\t\t\t$startpos=$b-$n+2;\r\n+#\t\t\t\t$endpos=$b-$n+2+($end-$start);\r\n+#\t\t\t}\r\n+#\t\t\tif($a<$b){\r\n+\t\t\t\t$endpos=$b+$n+2;\r\n+\t\t\t\tif($endpos>length($structure)){$endpos=length($structure);}\r\n+\t\t\t\t$startpos=$b+$n+2-($end-$start);\r\n+\t\t\t\tif($startpos<1){$startpos=1;}\r\n+#\t\t\t}\r\n+\t\t\tlast;\r\n+\t\t}\r\n+\t\t$n++;\r\n+\t}\r\n+\treturn ($startpos,$endpos);\r\n+}\r\n+sub attachPre{\r\n+\topen IN, "<$pre_file_name";\r\n+\tmy $name;\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tif ($aline=~/^>(\\S+)/) {\r\n+\t\t\t$name=$1;\r\n+\t\t\tnext;\r\n+\t\t}\r\n+\t\t$pre{$name} .=$aline;\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub readPosOnPre{\r\n+\topen IN,"<read_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $id=lc($tmp[2]);\r\n+\t\tpush @{$pre_read{$tmp[2]}},[@tmp];\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub maturePosOnPre{\r\n+\topen IN,"<mature_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $mm=$tmp[0];\r\n+#\t\t$mm=~s/\\-3P|\\-5P//i;\r\n+\t\t$mm=lc($mm);\r\n+\t\tmy $pm=$tmp[2];\r\n+\t\t$pm=lc($pm);\r\n+\r\n+#\t\tnext if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a\r\n+\t\tnext if($mm!~/$pm/);\r\n+#\t\tprint "$tmp[2]\\t$tmp[0]\\n";\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub mapping{\r\n+    my $err;\r\n+## build bowtie index\r\n+    #print STDERR "building bowtie index\\n";\r\n+    $err = `bowtie-build $pre_file_name miRNA_precursor`;\r\n+\r\n+## map mature sequences against precursors\r\n+    #print STDERR "mapping mature sequences against index\\n";\r\n+\t$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`;\r\n+\r\n+## map reads against precursors\r\n+    #print STDERR "mapping read sequences against index\\n";\r\n+    $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`;\r\n+\r\n+}\r\n+\r\n+sub subseq{\r\n+\tmy $seq=shift;\r\n+\tmy $beg=shift;\r\n+\tmy $end=shift;\r\n+\tmy $strand=shift;\r\n+\t\r\n+\tmy $subseq=substr($seq,$beg-1,$end-$beg+1);\r\n+\tif ($strand eq "-") {\r\n+\t\t$subseq=revcom($subseq);\r\n+\t}\r\n+\treturn uc $subseq;\r\n+}\r\n+\r\n+sub revcom{\r\n+\tmy $seq=shift;\r\n+\t$seq=~tr/ATCGatcg/TAGCtagc/;\r\n+\t$seq=reverse $seq;\r\n+\treturn uc $seq;\r\n+}\r\n+\r\n+sub Time{\r\n+        my $time=time();\r\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+        $month++;\r\n+        $year+=1900;\r\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\r\n+        if (length($min) == 1) {$min = "0"."$min";}\r\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\r\n+        if (length($day) == 1) {$day = "0"."$day";}\r\n+        if (length($month) == 1) {$month = "0"."$month";}\r\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\r\n+        return("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+\r\n+sub usage{\r\n+print <<"USAGE";\r\n+Version $version\r\n+Usage:\r\n+$0 -r -p -m -mis -t -e -f -tag -o -time\r\n+mandatory parameters:\r\n+-p precursor.fa  miRNA precursor sequences from miRBase # must be absolute path\r\n+-m mature.fa     miRNA sequences from miRBase # must be absolute path\r\n+-r reads.fa      your read sequences #must be absolute path\r\n+\r\n+-o output directory\r\n+\r\n+options:\r\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\r\n+-t [int]     threads number,default 1\r\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\r\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\r\n+-tag [string] sample marks# eg. sampleA;sampleB;sampleC\r\n+-time sting #make directory time,default is the local time\r\n+-h help\r\n+USAGE\r\n+exit(1);\r\n+}\r\n+\r\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 quantify_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify_siRNA.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,64 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: chentt
+#Email:
+#Date: 2012-4-6
+#Modified:
+#Description:
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","d=s","h");
+if (!(defined $opts{i} and defined $opts{d} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $input=$opts{'i'};
+my $output=$opts{'o'};
+my $depth=$opts{'d'};
+
+open (IN,"<$input")||die"$!";
+open OUT,">$output";
+#my @Total=qw(15797079 18042650 17455254 17295526 18791753 16719596 15150009 18451484 17402501 17729362 19347595 17518516 15699663 16589265 15442892 14012264 14190746 17280260 13213117 12390121 14874304 );
+my @Total=split/\,/,$depth;
+#print OUT "#clusterID\tmajor_length\tpercent\n";
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^\"/){
+ my @title=split/\t/,$aline;
+ for (my $i=0;$i<@title ;$i++) {
+ $title[$i]=~s/^\"(\S+)\"$/$1/;
+ }
+ my $title=join "\t",@title;
+ print OUT "\#$title\n";
+ next;
+ }
+ my @temp=split/\t/,$aline;
+ print OUT "$temp[0]\t$temp[1]\t$temp[2]";
+ my @id=split/:/,$temp[0];
+ my @posi=split/-/,$id[1];
+ for (my $i=3;$i<@temp;$i++) {
+ my $rpkm=sprintf("%.2f",$temp[$i]/($posi[1]-$posi[0]+1)/$Total[$i-3]*1000000000);
+ print OUT "\t$rpkm";
+ }
+ print OUT "\n";
+}
+close IN;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -h
+options:
+-i input cluster file
+-o output file
+-d depth
+-h help
+USAGE
+exit(1);
+}
\ No newline at end of file
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 sRNA_plot.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRNA_plot.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,411 @@\n+#!/usr/bin/perl -w\n+#==========================================================================================\n+# Date: \n+# Title: \n+# Comment: Program to plot gene structure\n+# Input: 1. \n+#        2. \n+#        3. \n+# Output: output file of gene structure graph by html or svg formt\n+# Test Usage: \n+#========================================================================================\n+#use strict;\n+my $version=1.00;\n+use SVG;\n+use Getopt::Long;\n+my %opt;\n+GetOptions(\\%opt,"g=s","l=s","span=s","c=s","o=s","out=s","cen:s","mark=s","h");\n+if (!( defined $opt{o}) || defined $opt{h}) {\n+&usage;\n+}\n+my $span=$opt{span};\n+#my $sample_cloumn=$opt{n};\n+my $mark=$opt{mark};\n+my @mark=split/\\#/,$mark;\n+my $genelist=$opt{g};\n+#===============================Define Attribute==========================================\n+my %attribute=(\n+\tcanvas=>{\n+\t\t\'width\'=>1500,\n+\t\t\'height\'=>1800\n+\t},\n+\ttext=>{\n+\t\t\'stroke\'=>"#000000",\n+\t\t\'fill\'=>"none",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\tline=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1\n+\t},\n+\tcsv=>{\n+\t\t\'stroke\'=>"red",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\texon=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1\n+\t},\n+\tintron=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1.5\n+\t},\n+\tfont=>{\n+\t\t\'fill\'=>"#000000",\n+\t\t\'font-size\'=>12,\n+\t\t\'font-size2\'=>10,\n+\t\t#\'font-weight\'=>\'bold\',\n+\t\t\'font-family\'=>"Arial"\n+\t\t#\'font-family\'=>"ArialNarrow-bold"\n+\t},\n+\trect=>{\n+\t\t\'fill\'=>"lightgreen",\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\treadwidth=>0.5\n+);\n+#############################s#define start coordinate and scale\n+open(TXT,">$opt{out}");\n+open(LENGTH,"$opt{l}")||die"cannot open the file $opt{l}";\n+my %length;\n+while (my $aline=<LENGTH>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\#/);\n+\tmy @temp=split/\\t/,$aline;\n+\t$temp[0]=~s/^c/C/;\n+\t$length{$temp[0]}=$temp[1];\n+}\n+close LENGTH;\n+#---------------------------------------------------------------\n+open(GENE,"$opt{g}")||die"cannot open the file $opt{g}";\n+my %genelist;\n+while (my $aline=<GENE>) {\n+\tchomp $aline;#LOC_Os01g01280  Chr1    133291  134685  +\n+\tnext if($aline=~/^\\#/);\n+\tmy @temp=split/\\t/,$aline;\n+\tif ($temp[1]=~/^Chr(\\d)$/) {\n+\t\t$temp[1]="Chr0$1";\n+\t}\n+\tpush @{$genelist{$temp[1]}},[$temp[0],$temp[2],$temp[3]];\n+\n+}\n+close GENE;\n+#my %have_gene;\n+#foreach my $chr (sort keys %genelist) {\n+#\tmy @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}};\n+#\tmy $start=$genelist[0][1];\n+#\tmy $end=$genelist[0][2];\n+#\tfor (my $i=0;$i<@genelist ;$i++) {\n+#\t\tif ($gene) {\n+#\t\t}\n+#\t}\n+#}\n+\n+my %gene_desity;\n+foreach my $chr (sort keys %genelist) {\n+\tmy @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}};\n+\tfor (my $i=0;$i<@genelist ;$i++) {\n+\t\tmy $start=int($genelist[$i][1]/$span);\n+\t\tmy $end=int($genelist[$i][2]/$span);\n+\t\t#my @t_rpkm=split/\\t/,$target_rpkm{$genelist[$i][0]};\n+\t\tif ($start==$end) {\n+\t\t\t$gene_desity{$chr}[$start]++;\n+\t\t}\n+\t\telse{\n+\t\t\tfor (my $k=$start;$k<=$end ;$k++) {\n+\t\t\t\t$gene_desity{$chr}[$k]++;\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+#------------------------------------------region_gene_number-------------------------\n+my $max_gene_number=0;\n+my $total=0;\n+foreach my $chr (sort keys %genelist) {\n+\tfor (my $i=0;$i<@{$gene_desity{$chr}} ;$i++) {\n+\t\tif (!(defined($gene_desity{$chr}[$i]))) {\n+\t\t\t$gene_desity{$chr}[$i]=0;\n+\t\t}\n+\t\tif ($gene_desity{$chr}[$i]>$max_gene_number) {\n+\t\t\t$max_gene_number=$gene_desity{$chr}[$i];\n+\t\t\t#print "$gene_desity{$chr}[$i]\\n";\n+\t\t}\n+\t\t#print TXT "$i\\t$gene_desity[$i]\\n";\n+\t\t$total+=$gene_desity{$chr}[$i];\n+\t\t#print "$chr\\t$i\\t$gene_desity{$chr}[$i]\\n";\n+\t}\n+}\n+#print "Gene max:$max_gene_number\\ntotal:$total\\n";\n+\n+#---------------------------------------------------------------\n+my %centromere;\n+if (defined($opt{cen})) {\n+\topen CEN,"$opt{cen}";\n+\twhile (my $aline=<CEN>) {\n+\t\tchomp $aline;\n+\t\tnext if($aline=~/^\\#/);\n+\t\tmy @temp=split/\\t/,$aline;\n+\t\t$temp[0]=~s/^c/C/;\n+\t\t$centromere{$temp[0]}[0]=$temp[1];\n+\t\t$centromere{$temp[0]}[1]=$temp[2];\n+\t}\n+\tclose CEN;\n+}\n+\n+#---------------------------------------------------------------\n+m'..b',$attribute{font}{\'font-family\'},\'-cdata\',$chr);\n+\tmy $m_length=$length{$chr}%1000000;\n+\t$svg->text(\'x\',$chr_end_x+20,\'y\',$chr_start_y,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',12,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$m_length."M");\n+\t\n+\n+\tif (defined($centromere{$chr}[0])) {\n+\t\t$svg->rect(\'x\',$XOFFSET+$centromere{$chr}[0]*$Xscale,\'y\',$chr_start_y-2,\'width\',($centromere{$chr}[1]-$centromere{$chr}[0]+1)*$Xscale,\'height\',5,\'stroke\',"blue",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"blue");\n+\t}\n+\tfor (my $s=0;$s<@mark ;$s++) {\n+\t\tfor (my $i=0;$i<$#{$cluster_density{$chr}}-1 ;$i++) {\n+\t\t\t#if ($cluster_density{$chr}[$i]*$Yscale>40) {\n+\t\t\t\t#$cluster_density{$chr}[$i]=40/$Yscale;\n+\t\t\t\t#$svg->rect(\'x\',$XOFFSET+$i*$span*$Xscale,\'y\',$chr_start_y-45,\'width\',$span*$Xscale,\'height\',5,\'stroke\',"green",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"green");\n+\t\t\t#}\n+\t\t\t#print "$i\\t$cluster_density{$chr}[$i][$s]\\t$cluster_density{$chr}[$i+1][$s]\\n";\n+\t\t\tmy $cluster_density_start_x=$XOFFSET+$i*$span*$Xscale;\n+\t\t\tmy $cluster_density_end_x=$XOFFSET+($i+1)*$span*$Xscale;\n+\t\t\tmy $cluster_density_start_y=$chr_start_y-$cluster_density{$chr}[$i][$s]*$Yscale;\n+\t\t\tmy $cluster_density_end_y=$chr_start_y-$cluster_density{$chr}[$i+1][$s]*$Yscale;\n+\t\t\tmy $c_red=($s+1)/@mark*255;\n+\t\t\t$svg->line(\'x1\',$cluster_density_start_x,\'y1\',$cluster_density_start_y,\'x2\',$cluster_density_end_x,\'y2\',$cluster_density_end_y,\'stroke\',"rgb($c_red,125,0)",\'stroke-width\',0.3);\n+\t\t}\n+\n+\t}\n+\t#=======Y axis\n+\t$svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_start_x,y2=>$chr_start_y-$one_chr_heigth+15,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\t#=======Y axis ===>3 xiaoge\n+\tmy $s10=1;\n+\tmy $e10=0;\n+\tmy $chr_max=$max_all_density;\n+\twhile ($chr_max>10) {\n+\t\t$chr_max=int($chr_max/10);\n+\t\t$s10=$s10*10;\n+\t\t$e10++;\n+\t}\n+\t$chr_max=$chr_max/2;\n+\t#print "*****$max_all_density\\t$chr_max\\t$s10\\n";\n+\tfor (my $i=1;$i<3 ;$i++) {\n+\t\tmy $y1=$chr_start_y-$chr_max*$s10*$Yscale*$i;\n+\t\tmy $xiaoge_Y=$chr_max*$i;\n+\t\t$svg->line(\'x1\',$chr_start_x,\'y1\',$y1,\'x2\',$chr_start_x+3,\'y2\',$y1,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\t\t$svg->text(\'x\',$chr_start_x-26,\'y\',$y1+4,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',8,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$xiaoge_Y."e".$e10);\n+\t}\n+\t$cc++;\n+}\n+\n+for (my $s=0;$s<@mark ;$s++) {\n+\tmy $c_red=($s+1)/@mark*255;\n+\tprint "**$c_red\\n";\n+\t$svg->line(\'x1\',$canvas_end_x+100,\'y1\',$YOFFSET+$s*20+30,\'x2\',$canvas_end_x+130,\'y2\',$YOFFSET+$s*20+30,\'stroke\',"rgb($c_red,125,0)",\'stroke-width\',1);\n+\t$svg->text(\'x\',$canvas_end_x+150,\'y\',$YOFFSET+$s*20+5+30,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$mark[$s]);\n+}\n+#\n+#\n+if (defined($opt{cen})) {\n+\t$svg->rect(\'x\',$canvas_end_x+100,\'y\',$YOFFSET+@mark*20+30,\'width\',30,\'height\',5,\'stroke\',"blue",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"blue");\n+\t$svg->text(\'x\',$canvas_end_x+150,\'y\',$YOFFSET+@mark*20+30+5,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',"centromere");\n+}\n+\n+close TXT;\n+\n+open (OUT,">$opt{o}");\n+print OUT $svg->xmlify();\n+\n+sub log2 {\n+\tmy $n = shift;\n+\treturn log($n)/log(2);\n+}\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 \n+options:\n+-g genelist\n+-span\n+-n sample cloumn\n+-mark sample name\n+-o output graph file name with html or svg extension\n+-c cluster file input\n+-out txt output\n+-l length of chr\n+-cen centromere\n+-h help\n+USAGE\n+exit(1);\n+}\n\\ No newline at end of file\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 sam2Bed_bowtie.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2Bed_bowtie.pl Fri Dec 05 00:11:02 2014 -0500
[
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2011/11/7
+#Modified:
+#Description: sam2BED 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","mark=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+my $mark=$opts{'mark'};
+my @sample=split/\#/,$mark;
+$mark=join"\t",@sample;
+open OUT,">$fileout"; #output file  
+print OUT "#chr\tstrand\tstart\tend\t$mark\n";
+
+open IN,"<$filein"; #input file  
+my $Tags_num=0;
+my @read_num;
+#print OUT "#chr\tstart\tend\tnum\t<=20\t21\t22\t23\t24\t>=25\n";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\@/);
+ my @tmp=split/\t/,$aline;
+ my $strand=$tmp[1];
+ my $start=$tmp[3]+1;
+ my $length=length($tmp[4]);
+ my $end=$start+$length-1;
+ my $hit=$tmp[6]+1;
+ #======express caculate weighted===================================
+ my $exp;
+ my @tempID=split/\:/,$tmp[0];
+ my @exp=split/\_/,$tempID[1];
+ pop @exp;
+ for (my $j=0;$j<@exp ;$j++) {
+ #my @tempID1=split/\=/,$tempID[$j];
+ $exp[$j]=sprintf("%.2f",$exp[$j]/$hit);
+ $read_num[$j]+=$exp[$j];
+ #print OUT "\t$exp";
+ }
+ $exp=join "\t",@exp;
+ print OUT $tmp[2],"\t",$strand,"\t",$start,"\t",$end,"\t",$exp,"\n";
+ $Tags_num++;
+
+}
+print "Total Tags numer: $Tags_num\n";
+my $read_number=join "\t",@read_num;
+print "Each sample numer: $read_number\n";
+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-mark sampleA sampleB sampleC.....
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,403 @@\n+#!/usr/bin/perl -w\r\n+my $version=1.00;\r\n+use strict;\r\n+use warnings;\r\n+use Getopt::Long;\r\n+use Getopt::Std;\r\n+use threads;\r\n+use threads::shared;\r\n+use Parallel::ForkManager;\r\n+use lib \'/leofs/biotrans/chentt/perl_module/\';\r\n+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq \r\n+print "\r\n+#####################################\r\n+#                                   # \r\n+#          sRNA cluster             # \r\n+#                                   #\r\n+#####################################\r\n+";\r\n+###########################################################################################\r\n+my $usage="$0\r\n+Options:\r\n+-i input file# fasta\r\n+-config input file \r\n+-g  genome file\r\n+-f  gff file\r\n+\r\n+-o  workdir file\r\n+-path  script path\r\n+-t  int,    number of threads [1]\r\n+-format  fastq, fq, fasta or fa\r\n+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+-mis  int     number of allowed mismatches when mapping reads to genome, default 0\r\n+\r\n+-n  int max hits number,default 25; used in genome alignment\r\n+-d  int distance of tag to merged a cluster; default 100\r\n+-p  cluster method F :conventional default is F\r\n+\t\t\t\t   T :NIBLES \r\n+-l  int the length of the upstream and downstream,default 1000;used in position annotate\r\n+\r\n+-nat  natural antisense transcripts file\r\n+-repeat  repeat information file out of Repeatmasker\r\n+-deg  file config of de sample\r\n+-cen  centromere file input\r\n+-span  plot span, default 50000\r\n+";\r\n+\r\n+my %options;\r\n+GetOptions(\\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h");\r\n+#print help if that option is used\r\n+if($options{h}){die $usage;}\r\n+\r\n+my $filein=$options{\'i\'};\r\n+\r\n+#my $config=$options{\'i\'};\r\n+my $genome_fa=$options{\'g\'};\r\n+my $gff=$options{\'f\'};\r\n+\r\n+\r\n+##########################################################################################\r\n+my $predir=`pwd`;\r\n+chomp $predir;\r\n+my $workdir=defined($options{\'o\'}) ? $options{\'o\'}:$predir;\r\n+\r\n+my $path=$options{\'path\'};\r\n+\r\n+my $t=defined($options{\'t\'})? $options{\'t\'}:1; #threads number\r\n+\r\n+my $mis=defined $options{\'mis\'} ? $options{\'mis\'}:0;\r\n+\r\n+\r\n+my $hit=defined $options{\'n\'}?$options{\'n\'}:25;\r\n+\r\n+my $distance_of_merged_tag=defined $options{\'d\'} ? $options{\'d\'}:100;\r\n+\r\n+my $up_down_dis=defined $options{\'l\'} ?$options{\'l\'}:1000;\r\n+\r\n+my $cluster_mothod=defined $options{\'p\'}?$options{\'p\'}:"F";\r\n+\r\n+my $format=$options{\'format\'};\r\n+#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \r\n+#\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\r\n+#}\r\n+\r\n+\r\n+\r\n+my $sample_number;\r\n+my ($dir,$dir_tmp);\r\n+################################  MAIN  ##################################################\r\n+print "\\ncluster program start:";\r\n+my $time=Time();\r\n+make_dir_tmp();\r\n+\r\n+my $mark;\r\n+my $sample_mark;\r\n+\r\n+my $config=$opts{\'config\'};\r\n+my (@filein,@mark);\r\n+&read_config();\r\n+$sample_number=@mark;\r\n+$mark=join "\\t",@mark;\r\n+$sample_mark=join "\\#",@mark;\r\n+\r\n+\r\n+\r\n+my $data3=$filein;   ### rfam not mapped reads\r\n+genome();\r\n+\r\n+my $bed=$dir."c'..b'ding\\n\\n";\r\n+\tmy $deg_file=$options{\'deg\'};\r\n+\topen IN,"<$deg_file";\r\n+\tmy @deg;\r\n+\tmy $s=0;\r\n+\t\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tnext if($aline=~/^\\#/);\r\n+\t\t$deg[$s]=$aline;\r\n+\t\tmy @ea=split/\\s+/,$aline;\r\n+\t\tpush @pairdir,"$ea[0]_VS_$ea[1]\\/";\r\n+\t\t#print "$deg[$s]\\n";\r\n+\t\t$s++;\r\n+\t}\r\n+\tclose IN;\r\n+\t$deg_dir=$dir."deg\\/";\r\n+\tmkdir ("$deg_dir");\r\n+\tmy $max_process = 10;\r\n+\tmy $pm = new Parallel::ForkManager( $max_process );\r\n+\tmy $number=@deg-1;\r\n+\tforeach(0..$number){\r\n+\t\t$pm->start and next;\r\n+\t\t&dec_pel($deg[$_]);\r\n+\t\t$pm->finish;\r\n+\t}\r\n+\t$pm->wait_all_children;\r\n+}\r\n+\r\n+sub dec_pel{\r\n+\tprint "\\n******************\\nstart:\\n";\r\n+\tTime();\r\n+\tmy $sample=shift(@_);\r\n+\tmy @each=split/\\s+/,$sample;\r\n+\tprint "$each[0]\\t$each[1]\\n";\r\n+\tmy $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\\/";\r\n+\tmkdir ("$deg_sample_dir");\r\n+\tprint "read: $read\\n";\r\n+\tprint "deg_sample_dir: $deg_sample_dir\\n";\r\n+\tprint "$id{$each[0]}\\t$each[0]\\n";\r\n+\tprint "$id{$each[1]}\\t$each[1]\\n";\r\n+\tmy $deg=`perl $path\\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2\r\n+\tmy $time2=time();\r\n+\tprint "end:\\n*************************\\n";\r\n+\tTime();\r\n+\tsleep 1;\r\n+}\r\n+\r\n+sub infor_merge{\r\n+\tmy ($input,$mark);\r\n+\tforeach (@pairdir) {\r\n+\t\tprint "@pairdir\\n";\r\n+\t\t$mark.=" -mark $_ ";\r\n+\t\t$input.=" -i $dir/deg\\/$_\\/output_score\\.txt ";\r\n+\t\tprint "$input\\n$mark\\n";\r\n+\t}\r\n+\tmy $infor_merge=`perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result `;\r\n+\tprint "perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result\\n\\n";\r\n+}\r\n+\r\n+sub infor_merge_no_dec{\r\n+\tmy $infor_merge_no_dec=`cp $annotate_dir\\/sample_c_p.anno $dir\\/total.result`;\r\n+}\r\n+\r\n+sub genome_length{\r\n+\tmy $length=`perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length`;\r\n+\tprint "perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length\\n\\n"\r\n+\r\n+}\r\n+\r\n+sub plot{\r\n+\t$plot_dir="$dir\\/plot\\/";\r\n+\tmkdir ("$plot_dir");\r\n+\tmy $span=defined($options{span})?$options{span}:50000;\r\n+\tmy $cen="";\r\n+\tif (defined $options{cen}) {\r\n+\t\t$cen="-cen $options{cen}";\r\n+\t}\r\n+\tmy $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;\r\n+\t"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \\n";\r\n+\r\n+}\r\n+\r\n+sub html{\r\n+\tmy $pathfile="$dir/path.txt";\r\n+\topen PA,">$pathfile";\r\n+\tprint PA "$config\\n";\r\n+\tprint PA "$preprocess\\n";\r\n+\tprint PA "$dir"."rfam_match\\n";\r\n+\tprint PA "$dir"."genome_match\\n";\r\n+\tprint PA "$cluster_file\\n";\r\n+\tprint PA "$annotate_dir\\n";\r\n+\tprint PA "$plot_dir\\n";\r\n+\tif (defined($deg_dir)) {\r\n+\t\t\tprint PA "$deg_dir\\n";\r\n+\t}\r\n+\tclose PA;\r\n+\tmy $html=`perl $path\\/html.pl -i $pathfile -format $format -o $dir/result.html`;\r\n+}\r\n+\r\n+sub Time{\r\n+\tmy $time=time();\r\n+\tmy ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+\t$month++;\r\n+\t$year+=1900;\r\n+\tif (length($sec) == 1) {$sec = "0"."$sec";}\r\n+\tif (length($min) == 1) {$min = "0"."$min";}\r\n+\tif (length($hour) == 1) {$hour = "0"."$hour";}\r\n+\tif (length($day) == 1) {$day = "0"."$day";}\r\n+\tif (length($month) == 1) {$month = "0"."$month";}\r\n+\tprint "$year-$month-$day $hour:$min:$sec\\n";\r\n+\treturn("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+#################################################################################\r\n+sub read_config{\r\n+\topen CON,"<$config";\r\n+\twhile (my $aline=<CON>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tpush @filein,$tmp[0];\r\n+\t\tpush @mark,$tmp[1];\r\n+\t\t#&check_rawdata($tmp[0]);\r\n+\t}\r\n+\tclose CON;\r\n+\tif (@filein != @mark) {\r\n+\t\t#&printErr();\r\n+\t\tdie "Maybe config file have some wrong!!!\\n";\r\n+\t}\r\n+}\r\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.xml Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,122 @@
+<tool id="plant_sirna_v1" name="siRNA" veision="1.0.0">
+  <description>tool for plant siRNA analisis</description>
+
+  <requirements>
+    <requirement type="set_environment">SCRIPT_PATH</requirement>
+    <requirement type="package" version="0.12.7">bowtie</requirement>
+    <requirement type="package" version="3.0.1">R</requirement>
+ <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
+ <requirement type="package" version="1.96">threads</requirement>
+ <requirement type="package" version="1.06">Parallel-ForkManager</requirement>
+ <requirement type="package" version="2.59">SVG</requirement>
+ <requirement type="package" version="1.4_001">Boost-Graph</requirement>
+  </requirements>
+
+  <command interpreter="perl">siRNA_pipeline.pl 
+   ## Change this to accommodate the number of threads you have available.
+        -t \${GALAXY_SLOTS:-4}
+
+ -path \$SCRIPT_PATH
+
+      ## prepare bowtie index
+      #set index_path = ''
+      #if str($reference_genome.source) == "history":
+          bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
+          #set index_path = 'genome'
+      #else:
+          #set index_path = $reference_genome.index.fields.path
+      #end if
+
+
+   ## Do or not annotate siRNAs by function
+    #if $params.function_anno == "yes":
+     -nat $params.nat -repeat $params.repeat
+ #end if
+   
+   ## Do or not DEG
+    #if $degseq.degseq_analysis == "yes" :
+    -deg $degseq.deg
+    #end if
+
+  -i $reads -config $config -n $hits -format $format -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -d $d -p $p -l $l  -cen $cen -span $span > run.log
+
+  </command>
+
+  <inputs>
+
+ <param name="config" type="data" label="Raw data configs file" />
+ <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />
+
+ <param name="format" type="select" lable=" data format" multiple="false">
+   <option value="fastq">Raw data is fastq. format</option>
+   <option value="fasta">Raw data is fasta. format</option>
+ </param>
+
+
+        <conditional name="reference_genome">
+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+              <options from_data_table="bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+          </when>
+        </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->
+
+
+ <param name="gff" type="data" label="gff file" />
+ <param name="mis" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
+ <param name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
+ <param name="d" type="integer" value="100" label="distance of tag to merged a cluster" />
+
+ <param name="p" type="select" lable="cluster method" multiple="false">
+   <option value="F">conventional</option>
+   <option value="T">NIBLES</option>
+ </param>
+ <param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" />
+
+
+ <conditional name="params">
+ <param name="function_anno" type="select" label="Do or not annotate siRNAs by function">
+   <option value="no" selected="true">no</option>
+   <option value="yes">yes</option>
+  </param>
+  <when value="yes">
+ <param name="nat" type="data" label="atural antisense transcripts file" />
+ <param name="repeat" type="data" label="repeat information file out of Repeatmasker" />
+  </when>
+    </conditional> <!-- params -->
+
+ <param name="cen" type="data" label="centromere file input" />
+ <param name="span" type="integer" value="50000" label="plot span" />
+
+    <conditional name="degseq">
+       <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">
+   <option value="no" selected="true">no</option>
+   <option value="yes">yes</option>
+       </param>
+       <when value="yes">
+ <param name="deg" type="data" label="file config of de sample" />
+        </when>
+    </conditional>
+    
+  </inputs>
+
+  <outputs>
+   <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>
+   <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>
+
+  </outputs>
+
+ <help>
+
+ </help>
+ </tool>
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA_pipeline.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA_pipeline.pl Fri Dec 05 00:11:02 2014 -0500
[
b'@@ -0,0 +1,524 @@\n+#!/usr/bin/perl -w\r\n+my $version=1.00;\r\n+use strict;\r\n+use warnings;\r\n+use Getopt::Long;\r\n+use Getopt::Std;\r\n+use threads;\r\n+use threads::shared;\r\n+use Parallel::ForkManager;\r\n+use lib \'/leofs/biotrans/chentt/perl_module/\';\r\n+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq \r\n+print "\r\n+#####################################\r\n+#                                   # \r\n+#          sRNA cluster             # \r\n+#                                   #\r\n+#####################################\r\n+";\r\n+###########################################################################################\r\n+my $usage="$0\r\n+Options:\r\n+-i input file# raw data file\r\n+-tag string #raw data sample name\r\n+-g  genome file\r\n+-f  gff file\r\n+\r\n+-o  workdir file\r\n+-path  script path\r\n+-t  int,    number of threads [1]\r\n+-format  fastq, fq, fasta or fa\r\n+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+-mis  int     number of allowed mismatches when mapping reads to genome, default 0\r\n+-rfam  string,  input file# rfam database file.\r\n+-idx2  string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+\r\n+-v  int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\r\n+\r\n+-a string,  ADAPTER string. default is ATCTCGTATG.\r\n+-n  int max hits number,default 25; used in genome alignment\r\n+-d  int distance of tag to merged a cluster; default 100\r\n+-p  cluster method F :conventional default is F\r\n+\t\t\t\t   T :NIBLES \r\n+-l  int the length of the upstream and downstream,default 1000;used in position annotate\r\n+\r\n+-nat  natural antisense transcripts file\r\n+-repeat  repeat information file out of Repeatmasker\r\n+-deg  file config of de sample\r\n+-cen  centromere file input\r\n+-span  plot span, default 50000\r\n+";\r\n+\r\n+my %options;\r\n+GetOptions(\\%options,"i:s@","tag:s@","g=s","phed:i","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");\r\n+#print help if that option is used\r\n+if($options{h}){die $usage;}\r\n+\r\n+my @filein=@{$options{\'i\'}};\r\n+my @mark=@{$options{\'tag\'}};\r\n+\r\n+#my $config=$options{\'i\'};\r\n+my $genome_fa=$options{\'g\'};\r\n+my $gff=$options{\'f\'};\r\n+\r\n+\r\n+##########################################################################################\r\n+my $predir=`pwd`;\r\n+chomp $predir;\r\n+my $workdir=defined($options{\'o\'}) ? $options{\'o\'}:$predir;\r\n+\r\n+my $path=$options{\'path\'};\r\n+\r\n+my $t=defined($options{\'t\'})? $options{\'t\'}:1; #threads number\r\n+\r\n+my $mis=defined $options{\'mis\'} ? $options{\'mis\'}:0;\r\n+\r\n+my $mis_rfam=defined $options{\'v\'} ? $options{\'v\'}:0;\r\n+\r\n+my $hit=defined $options{\'n\'}?$options{\'n\'}:25;\r\n+\r\n+my $distance_of_merged_tag=defined $options{\'d\'} ? $options{\'d\'}:100;\r\n+\r\n+my $up_down_dis=defined $options{\'l\'} ?$options{\'l\'}:1000;\r\n+\r\n+my $cluster_mothod=defined $options{\'p\'}?$options{\'p\'}:"F";\r\n+\r\n+my $format=$options{\'format\'};\r\n+#if ($format ne "fastq" && $'..b'ile -g $dir\\/ref\\/genelist.txt -d $up_down_dis -o $annotate_dir\\/sample_c_p.anno\\n\\n";\r\n+\treturn 0;\r\n+}\r\n+sub get_genelist{\r\n+\r\n+\tmy $get_genelist=`perl $path\\/get_genelist.pl -i $gff -o $dir\\/ref\\/genelist.txt`;\r\n+\tprint "perl $path\\/get_genelist.pl -i $gff -o $dir\\/ref\\/genelist.txt";\r\n+}\r\n+\r\n+sub dec{\r\n+\tprint "deg reading\\n\\n";\r\n+\tmy $deg_file=$options{\'deg\'};\r\n+\topen IN,"<$deg_file";\r\n+\tmy @deg;\r\n+\tmy $s=0;\r\n+\t\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tnext if($aline=~/^\\#/);\r\n+\t\t$deg[$s]=$aline;\r\n+\t\tmy @ea=split/\\s+/,$aline;\r\n+\t\tpush @pairdir,"$ea[0]_VS_$ea[1]\\/";\r\n+\t\t#print "$deg[$s]\\n";\r\n+\t\t$s++;\r\n+\t}\r\n+\tclose IN;\r\n+\t$deg_dir=$dir."deg\\/";\r\n+\tmkdir ("$deg_dir");\r\n+\tmy $max_process = 10;\r\n+\tmy $pm = new Parallel::ForkManager( $max_process );\r\n+\tmy $number=@deg-1;\r\n+\tforeach(0..$number){\r\n+\t\t$pm->start and next;\r\n+\t\t&dec_pel($deg[$_]);\r\n+\t\t$pm->finish;\r\n+\t}\r\n+\t$pm->wait_all_children;\r\n+}\r\n+\r\n+sub dec_pel{\r\n+\tprint "\\n******************\\nstart:\\n";\r\n+\tTime();\r\n+\tmy $sample=shift(@_);\r\n+\tmy @each=split/\\s+/,$sample;\r\n+\tprint "$each[0]\\t$each[1]\\n";\r\n+\tmy $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\\/";\r\n+\tmkdir ("$deg_sample_dir");\r\n+\tprint "read: $read\\n";\r\n+\tprint "deg_sample_dir: $deg_sample_dir\\n";\r\n+\tprint "$id{$each[0]}\\t$each[0]\\n";\r\n+\tprint "$id{$each[1]}\\t$each[1]\\n";\r\n+\tmy $deg=`perl $path\\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2\r\n+\tmy $time2=time();\r\n+\tprint "end:\\n*************************\\n";\r\n+\tTime();\r\n+\tsleep 1;\r\n+}\r\n+\r\n+sub infor_merge{\r\n+\tmy ($input,$mark);\r\n+\tforeach (@pairdir) {\r\n+\t\tprint "@pairdir\\n";\r\n+\t\t$mark.=" -mark $_ ";\r\n+\t\t$input.=" -i $dir/deg\\/$_\\/output_score\\.txt ";\r\n+\t\tprint "$input\\n$mark\\n";\r\n+\t}\r\n+\tmy $infor_merge=`perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result `;\r\n+\tprint "perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result\\n\\n";\r\n+}\r\n+\r\n+sub infor_merge_no_dec{\r\n+\tmy $infor_merge_no_dec=`cp $annotate_dir\\/sample_c_p.anno $dir\\/total.result`;\r\n+}\r\n+\r\n+sub genome_length{\r\n+\tmy $length=`perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length`;\r\n+\tprint "perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length\\n\\n"\r\n+\r\n+}\r\n+\r\n+sub plot{\r\n+\t$plot_dir="$dir\\/plot\\/";\r\n+\tmkdir ("$plot_dir");\r\n+\tmy $span=defined($options{span})?$options{span}:50000;\r\n+\tmy $cen="";\r\n+\tif (defined $options{cen}) {\r\n+\t\t$cen="-cen $options{cen}";\r\n+\t}\r\n+\tmy $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;\r\n+\t"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \\n";\r\n+\r\n+}\r\n+\r\n+sub html{\r\n+\tmy $pathfile="$dir/path.txt";\r\n+\topen PA,">$pathfile";\r\n+\tprint PA "$config\\n";\r\n+\tprint PA "$preprocess\\n";\r\n+\tprint PA "$dir"."rfam_match\\n";\r\n+\tprint PA "$dir"."genome_match\\n";\r\n+\tprint PA "$cluster_file\\n";\r\n+\tprint PA "$annotate_dir\\n";\r\n+\tprint PA "$plot_dir\\n";\r\n+\tif (defined($deg_dir)) {\r\n+\t\t\tprint PA "$deg_dir\\n";\r\n+\t}\r\n+\tclose PA;\r\n+\tmy $html=`perl $path\\/html.pl -i $pathfile -format $format -o $dir/result.html`;\r\n+}\r\n+\r\n+sub Time{\r\n+\tmy $time=time();\r\n+\tmy ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+\t$month++;\r\n+\t$year+=1900;\r\n+\tif (length($sec) == 1) {$sec = "0"."$sec";}\r\n+\tif (length($min) == 1) {$min = "0"."$min";}\r\n+\tif (length($hour) == 1) {$hour = "0"."$hour";}\r\n+\tif (length($day) == 1) {$day = "0"."$day";}\r\n+\tif (length($month) == 1) {$month = "0"."$month";}\r\n+\tprint "$year-$month-$day $hour:$min:$sec\\n";\r\n+\treturn("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+#################################################################################\r\n'
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA_pipeline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA_pipeline.xml Fri Dec 05 00:11:02 2014 -0500
b
@@ -0,0 +1,164 @@
+<tool id="plant_sirna_v1" name="siRNA" veision="1.0.0">
+  <description>tool for plant siRNA analisis</description>
+
+  <requirements>
+    <requirement type="set_environment">SCRIPT_PATH</requirement>
+    <requirement type="package" version="0.12.7">bowtie</requirement>
+    <requirement type="package" version="3.0.1">R</requirement>
+ <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
+ <requirement type="package" version="1.96">threads</requirement>
+ <requirement type="package" version="1.06">Parallel-ForkManager</requirement>
+ <requirement type="package" version="2.59">SVG</requirement>
+ <requirement type="package" version="1.4_001">Boost-Graph</requirement>
+  </requirements>
+
+  <command interpreter="perl">siRNA_pipeline.pl 
+   ## Change this to accommodate the number of threads you have available.
+        -t \${GALAXY_SLOTS:-4}
+
+ -path \$SCRIPT_PATH
+
+    #for $j, $s in enumerate( $series )
+    ##rank_of_series=$j
+    -i ${s.input}
+    -tag ${s.tag}
+    #end for
+
+      ## prepare bowtie index
+      #set index_path = ''
+      #if str($reference_genome.source) == "history":
+          bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
+          #set index_path = 'genome'
+      #else:
+          #set index_path = $reference_genome.index.fields.path
+      #end if
+
+
+   ## prepare Rfam bowtie index
+   #set rfam_index_path = ''
+   #if str($reference_rfam.source) == "history":
+   bowtie-build "$reference_rfam.own_file" rfam; ln -s $reference_rfam.own_file" rfam.fa;
+   #set rfam_index_path = 'rfam'
+  #else:
+   #set rfam_index_path = $reference_rfam.index.fields.path
+  #end if
+
+
+
+   ## Do or not annotate siRNAs by function
+    #if $params.function_anno == "yes":
+     -nat $params.nat -repeat $params.repeat
+ #end if
+   
+   ## Do or not DEG
+    #if $degseq.degseq_analysis == "yes" :
+    -deg $degseq.deg
+    #end if
+
+  -format $format -phred $phred -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v -a $a -n $mapnt -d $d -p $p -l $l  -cen $cen -span $span > run.log
+
+  </command>
+
+  <inputs>
+
+   <repeat name="series" title="Series">
+     <param name="input" type="data" label="Raw data file"/>
+     <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>
+   </repeat>
+
+ <param name="format" type="select" lable="raw data format" multiple="false">
+   <option value="fastq">Raw data is fastq. format</option>
+   <option value="fasta">Raw data is fasta. format</option>
+ </param>
+
+ <param name="phred" type="select" lable="input quals are Phred+64 or Phred+33" multiple="false">
+   <option value="64">Phred+64</option>
+   <option value="33" selected="true">Phred+33</option>
+ </param>
+
+        <conditional name="reference_genome">
+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+              <options from_data_table="bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+          </when>
+        </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/-->
+
+         <conditional name="reference_rfam">
+          <param name="source" type="select" label="Will you select a rfam reference  from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference " help="If your rfam of interest is not listed, contact the Galaxy team">
+              <options from_data_table="rfam_bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
+          </when>
+        </conditional>
+
+ <param name="gff" type="data" label="gff file" />
+ <param name="mis" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
+ <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches"/>
+ <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" />
+ <param name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
+ <param name="d" type="integer" value="100" label="distance of tag to merged a cluster" />
+
+ <param name="p" type="select" lable="cluster method" multiple="false">
+   <option value="F">conventional</option>
+   <option value="T">NIBLES</option>
+ </param>
+ <param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" />
+
+
+ <conditional name="params">
+ <param name="function_anno" type="select" label="Do or not annotate siRNAs by function">
+   <option value="no" selected="true">no</option>
+   <option value="yes">yes</option>
+  </param>
+  <when value="yes">
+ <param name="nat" type="data" label="atural antisense transcripts file" />
+ <param name="repeat" type="data" label="repeat information file out of Repeatmasker" />
+  </when>
+    </conditional> <!-- params -->
+
+ <param name="cen" type="data" label="centromere file input" />
+ <param name="span" type="integer" value="50000" label="plot span" />
+
+    <conditional name="degseq">
+       <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">
+   <option value="no" selected="true">no</option>
+   <option value="yes">yes</option>
+       </param>
+       <when value="yes">
+ <param name="deg" type="data" label="file config of de sample" />
+        </when>
+    </conditional>
+    
+  </inputs>
+
+  <outputs>
+   <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>
+   <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>
+
+  </outputs>
+
+ <help>
+
+ </help>
+ </tool>
b
diff -r f008ab2cadc6 -r 7b5a48b972e9 tool_dependencies.xml
--- a/tool_dependencies.xml Wed Dec 03 02:03:27 2014 -0500
+++ b/tool_dependencies.xml Fri Dec 05 00:11:02 2014 -0500
b
@@ -26,6 +26,76 @@
     </actions>
     </install>
  </package>
+ <package name="R" version="3.0.1">
+    <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+
+ <install version="1.0">
+    <actions>
+       <action type="set_environment_for_install">
+                    <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu">
+                        <package name="R" version="3.0.1" />
+                    </repository>
+              </action>
+       <action type="shell_command">R CMD BATCH $REPOSITORY_INSTALL_DIR/install_DEG.R </action>
+       <action type="shell_command">echo "export PATH=$PATH" &gt; $INSTALL_DIR/env.sh </action>
+   <action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action>
+
+  </actions>
+ </install>
+ </package>
+
+ <package name="threads" version="1.96">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-1.96.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+
+ <package name="Parallel-ForkManager" version="1.06">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/Parallel-ForkManager-1.06.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+ <package name="Boost-Graph" version="1.4_001">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/D/DU/DUFFEE/Boost-Graph-1.4_001.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ ls &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
 
  <package name="SVG" version="2.59">
     <install version="1.0">