Repository 'gpsrna'
hg clone https://toolshed.g2.bx.psu.edu/repos/bigrna/gpsrna

Changeset 0:87fe81de0931 (2015-01-04)
Commit message:
Uploaded
added:
Annotate.pl
ClassAnnotate.pl
DEGseq_2.pl
Length_Distibution.pl
SampleDEGseqMerge.pl
collapseReads2Tags.pl
conventional.pl
convert_bowtie_to_blast.pl
count_ref_length.pl
count_rfam_express.pl
filterReadsByCount.pl
filterReadsByLength.pl
get_genelist.pl
html_miRPlant.pl
html_preprocess.pl
html_siRNA.pl
install_DEG.R
matching.pl
miRDeep_plant.pl
miRNA_Express_and_sequence.pl
microRNA.pl
microRNA.xml
microRNA_pipeline.pl
microRNA_pipeline.xml
nibls.pl
non_miRNA_reads.pl
phased_siRNA.pl
preProcess.pl
preProcess.xml
preProcess_core.pl
precursors.pl
quantify.pl
quantify_siRNA.pl
rfam.pl
sRNA_plot.pl
sam2Bed_bowtie.pl
siRNA.pl
siRNA.xml
siRNA_pipeline.pl
siRNA_pipeline.xml
tool-data/bowtie_indices.loc.sample
tool-data/nat_annotate.loc.sample
tool-data/repeat_annotate.loc.sample
tool-data/rfam_bowtie_indices.loc.sample
tool_data_table_conf.xml.sample
tool_dependencies.xml
b
diff -r 000000000000 -r 87fe81de0931 Annotate.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Annotate.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,178 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/4/10
+#Modified:
+#Description: cluster annotate by priority
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+#my $genelistout=$opts{'t'};
+my $dis=defined $opts{'d'}? $opts{'d'}:1000;
+my %gene;
+
+#open OUT,">$genelistout"; #output file  
+#print OUT "#ID\tchr\tstart\tend\tstrand\ns";
+open IN,"<$opts{g}";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;#ID chr start end strand
+ #push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+ $gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]];
+}
+#while (my $aline=<IN>) {
+# chomp $aline;
+# next if($aline=~/^\#/);
+# my @tmp=split/\t/,$aline;
+# my $ID;
+# if ($tmp[2] eq "gene") {
+# $tmp[0]=~s/Chr\./Chr/;
+# $tmp[0]=~s/Chr/chr/;
+# my @infor=split/;/,$tmp[8];
+# for (my $i=0;$i<@infor ;$i++) {
+# if ($infor[$i]=~/Alias\=(\S+)$/) {
+# $ID=$1;
+# last;
+# }
+# }
+# $gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+# print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+# }
+#}
+close IN;
+#close OUT;
+
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+ chomp $aline;
+ my @tmp=split/\t/,$aline;
+ if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next}
+ my @result;
+ #shift @tmp;
+ my @id=split/:/,$tmp[0];
+ $id[0]=~s/Chr0/Chr/;
+ my @posi=split/-/,$id[1];
+ foreach my $key (keys %{$gene{$id[0]}}) {
+ if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) {
+ push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}";
+ next;
+ }
+ #if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) {
+ if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) {
+ push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+ push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+ next;
+ }
+ #if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) {
+ if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) {
+ push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+");
+ push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-");
+ next;
+ }
+ }
+ my $result;
+ if (!(@result)) {
+ $result="intergenic";
+ }
+ elsif($#result==0){
+ $result=$result[0];
+
+ }
+ else{
+ $result=join "\t",@result;
+ }
+# else{
+# my $te_num=0;
+# my @te_overlap;
+# my @te_up_down;
+# my @non_overlap;
+# my @non_up_down;
+# for (my $k=0;$k<@result ;$k++) {
+# my @rr=split/\;/,$result[$k];
+# if ($rr[3] eq "Y") {
+# $te_num++;
+# if ($rr[0] eq "overlap") {
+# push @te_overlap,$result[$k];
+# }
+# else{
+# push @te_up_down,$result[$k];
+# }
+# }
+# else{
+# if ($rr[0] eq "overlap") {
+# push @non_overlap,$result[$k];
+# }
+# else{
+# push @non_up_down,$result[$k];
+# }
+# }
+# }
+# if ($te_num==0) {#non TE 
+# if (!(@te_overlap)) {#down up
+# if ($#non_up_down==0) {
+# $result=$non_up_down[0];
+# }
+# else{#overlap
+# my $all_2=join "\t",@non_up_down;
+# $result="up&down1-kb\t".$all_2;
+# }
+# }
+# else{
+# $result=join "\t",@non_overlap;
+# if ($#non_overlap>=1) {
+# print "$aline\t$result\n";
+# }
+# }
+# }
+# else{#TE
+# if (!(@te_overlap)) {#down up
+# if ($#te_up_down==0) {
+# $result=$te_up_down[0];
+# }
+# else{#overlap
+# my $all_2=join "\t",@te_up_down;
+# $result="up&down1-kb\t".$all_2;
+# }
+# }
+# else{
+# $result=join "\t",@te_overlap;
+# if ($#te_overlap>=1) {
+# print "$aline\t$result\n";
+# }
+# }
+# }
+# }
+ print OUT "$aline\t$result\n";
+}
+
+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -d 
+options:
+-i input file
+-g genelist file
+-d int the length of the upstream and downstream,default 1000
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 ClassAnnotate.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ClassAnnotate.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,251 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chen Tingting
+#Email: chentt@big.ac.cn
+#Date: 2014/5/13
+#Modified:
+#Description: cluster annotate
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
+if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t}  and defined $opts{l}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+#my %gene;
+my %gene1;
+open IN,"<$opts{g}";
+open OUT ,">$opts{l}";
+print OUT "#ID\tchr\tstart\tend\tstrand\n";
+my $n=1;
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ my $ID;
+ if ($tmp[2] eq "gene") {
+ $tmp[0]=~s/Chr\./Chr/;
+ #$tmp[0]=~s/Chr/chr/;
+ my @infor=split/;/,$tmp[8];
+ for (my $i=0;$i<@infor ;$i++) {
+ if ($infor[$i]=~/Alias\=(\S+)$/) {
+ $ID=$1;
+ last;
+ }
+ else {
+ $ID="unknown$n";
+ $n++;
+ }
+ }
+ #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+ push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+ print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+ }
+}
+#while (my $aline=<IN>) {
+# chomp $aline;
+# next if($aline=~/^\#/);
+# my @tmp=split/\t/,$aline;#ID chr start end strand
+# push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+# #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
+#}
+close IN;
+close OUT;
+
+my %nat;
+open TMP,">$opts{t}";
+print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
+
+open IN,"<$opts{n}";
+my $nat_n=1;
+while (my $aline=<IN>) {
+ next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
+ chomp $aline;
+ my @arr=split /\t/,$aline;
+ my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
+ if ($arr[11]=~/Nearby/) {
+ ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
+ push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
+ print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
+ $nat_n++;
+ }else{
+ $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
+ $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
+ $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
+ $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
+ push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
+ push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
+ print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
+ $nat_n++;
+ }
+}
+close IN;
+close TMP;
+
+my %repeat;
+open IN,"<$opts{'r'}";
+my $first=<IN>;
+$first=<IN>;
+$first=<IN>;
+while (my $aline=<IN>) {
+ chomp $aline;
+ $aline=~s/^\s+//;
+ my @tmp=split/\s+/,$aline;
+ $tmp[4]=~s/chr0/Chr/;
+ $tmp[4]=~s/chr/Chr/;
+ push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
+ #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
+}
+close IN;
+
+my %phase;
+open IN,"<$opts{'p'}";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ if ($tmp[5]>=25) {
+ $phase{$tmp[0]}=$tmp[5];
+ }
+}
+close IN;
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+while (my $aline=<IN>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my @inf=split/\:/,$tmp[0];
+ my @pos=split/\-/,$inf[1];
+ my $chr=$inf[0];
+ $chr=~s/Chr0/Chr/;
+ my $start=$pos[0];
+ my $end=$pos[1];
+ #=========Repeat============
+ my @repeat;
+ if (defined(@{$repeat{$chr}})) {
+ my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
+ for (my $i=0;$i<@r_array ;$i++) {
+ if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
+ push @repeat,$r_array[$i][2];
+ }
+ elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
+ push @repeat,$r_array[$i][2];
+
+ }
+ elsif($end<$r_array[$i][0]){
+ last;
+ }
+ else{next;}
+ }
+ }
+ my $repeat;
+ if (@repeat==0) {
+ $repeat="\/";
+ }
+ else{
+ $repeat=join ";",@repeat;
+ }
+ #=========nat===============
+ my @nat1;#class 1
+ my @nat2;#class 2
+ my @nat;#nat ID
+ #foreach my $chr (keys %nat) {
+ my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
+ for (my $i=0;$i<@n_array ;$i++) {
+ if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
+ push @nat1,$n_array[$i][2];
+ push @nat2,$n_array[$i][3];
+ push @nat,$n_array[$i][4];
+ }
+ elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
+ push @nat1,$n_array[$i][2];
+ push @nat2,$n_array[$i][3];
+ push @nat,$n_array[$i][4];
+ }
+ elsif($end<$n_array[$i][0]){
+ last;
+ }
+ else{next;}
+ }
+ #}
+
+ my $nat1;
+ my $nat2;
+ my $nat;
+ if (@nat1==0) {
+ $nat1="\/";
+ }
+ else{
+ $nat1=join ";",@nat1;
+ }
+ if (@nat2==0) {
+ $nat2="\/";
+ }
+ else{
+ $nat2=join ";",@nat2;
+ }
+ if (@nat==0) {
+ $nat="\/";
+ }
+ else{
+ $nat=join ";",@nat;
+ }
+ #========phase==============
+ my $phase="\/";
+ if (defined($phase{$tmp[0]})) {
+ $phase="phase";
+ }
+ #=========long===============
+ my $long="\/";
+ if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
+ $long="long";
+ }
+ print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
+}
+
+close IN;
+close OUT;
+
+sub nearby{
+ my @p=@_;
+ my ($s,$e)=(0,0);
+ if ($p[1]<$p[2]) {
+ $s=$p[1];
+ $e=$p[2];
+ }else{
+ $s=$p[3];
+ $e=$p[0];
+ }
+ return ($s,$e);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -g -n -r -p -t -l
+options:
+-i input file
+ -g gff file
+ -n NATs file
+ -r repeat file
+ -p phase file
+-o output file
+-t nat output file
+-l genelist output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 DEGseq_2.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DEGseq_2.pl Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2009-05-06
+#Modified:
+#Description: ɾ��matched reads 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","outdir=s","column1:i","mark1=s","depth1:i","depth2:i","column2:i","mark2=s","h");
+if (!(defined $opts{i} and defined $opts{outdir} and defined $opts{mark1} and defined $opts{mark2}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $outputdir=$opts{'outdir'};
+unless ($outputdir=~/\/$/) {$outputdir .="/";}
+my $column1=defined $opts{column1} ? $opts{column1} : 3;
+my $column2=defined $opts{column2} ? $opts{column2} : 4;
+my $mark1=$opts{mark1};
+my $mark2=$opts{mark2};
+my $fileout=$outputdir."degseq.R";
+
+open OUT,">$fileout"; #output file  
+#my ($name,$dir);
+#$name=basename($filein);
+print OUT "library(DEGseq)\n";
+print OUT "geneExpFile <- system.file(package=\"DEGseq\")\n";
+print OUT "geneExpFile<-file.path(\"$filein\")\n";
+print OUT "layout(matrix(c(1,2,3,4,5,6), 3, 2, byrow=TRUE))\npar(mar=c(2, 2, 2,2))\n";
+print OUT "outputdir<-file.path(\"$outputdir\")\n";
+print OUT "geneExpMatrix1 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column1))\n";
+print OUT "geneExpMatrix2 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column2))\n";
+if(defined $opts{'depth1'} && defined $opts{'depth2'}){
+print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",depth1=$opts{depth1},depth2=$opts{depth2},outputDir=outputdir,method=\"MARS\")\n";
+}
+else{
+print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",outputDir=outputdir,method=\"MARS\")\n";
+}
+close OUT;
+
+
+system("R CMD BATCH $fileout");
+
+wait;
+
+
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -outdir -column1 -mark1 -column2 -mark2 -depth1 -depth2
+options:
+-i input file
+-outdir output file dir
+-column1 the first column for DEGseq
+-mark1 the name of the column1
+-depth1 depth for the first file,use for normalize
+-column2 the second column for DEGseq
+-mark2 the name of the column2
+-depth2 depth for the second file,use for normalize
+
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 Length_Distibution.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Length_Distibution.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,219 @@\n+#!/usr/bin/perl -w\n+#==========================================================================================\n+# Date: \n+# Title: \n+# Comment: Program to plot gene structure\n+# Input: 1. input file of Gene region annotation which format like GenePred\n+#        2. input file of Transcripts region annotation which format like GenePred\n+#        3. input file of gene snp detail info\n+# Output: output file of gene structure graph by html or svg formt\n+# Test Usage: \n+#========================================================================================\n+#use strict;\n+my $version=1.00;\n+use SVG;\n+use Getopt::Long;\n+my %opt;\n+GetOptions(\\%opt,"i=s","o=s",,"h");\n+if (!(defined $opt{i} and defined $opt{o}) || defined $opt{h}) {\n+&usage;\n+}\n+#===============================Define Attribute==========================================\n+my %attribute=(\n+\tcanvas=>{\n+\t\t\'width\'=>1500,\n+\t\t\'height\'=>1800\n+\t},\n+\ttext=>{\n+\t\t\'stroke\'=>"#000000",\n+\t\t\'fill\'=>"none",\n+\t\t\'stroke-width\'=>0.5\n+\t\t#\'stroke-width2\'=>1\n+\t},\n+\tline=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1\n+\t},\n+\tfont=>{\n+\t\t\'fill\'=>"#000000",\n+\t\t\'font-size\'=>12,\n+\t\t\'font-size2\'=>10,\n+\t\t\'font-weight\'=>\'bold\',\n+\t\t\'font-family\'=>"Arial"\n+\t\t#\'font-family2\'=>"ArialNarrow-bold"\n+\t},\n+\trect=>{\n+\t\t\'fill\'=>"lightgreen",\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\treadwidth=>0.5\n+);\n+#my $Xscale=600/$length;#\xb6\xa8\xd2\xe5X\xd6\xe1\xb1\xc8\xc0\xfd\xb3\xdf 1:1000 x\xd6\xe1\xb5\xc4\xd7\xf8\xb1\xea\xb3\xa4\xb6\xc8\xb6\xbc\xd2\xaa\xb0\xb4\xd5\xd5\xb4\xcb\xb1\xc8\xc0\xfd\xb3\xdf\xbb\xbb\xcb\xe3\n+#========================================data============================\n+open(IN,"$opt{i}")||die"cannot open the file $opt{i}";\n+my @R_length;\n+my @T_length;\n+my $R_number=0;\n+my $T_number=0;\n+my $R_max=0;\n+my $T_max=0;\n+\n+my $title=<IN>;\n+chomp $title;\n+my @title=split/\\t/,$title;\n+my @mark=split/\\s+/,$title[1];\n+my $sample_number=@mark;\n+while (my $aline=<IN>) {\n+\tif ($aline=~/^\\s/) {\n+\t\tmy $T_title=<IN>;\n+\t\tchomp $T_title;\n+\t\twhile (my $a_aline=<IN>) {\n+\t\t\tchomp $a_aline;\n+\t\t\tmy @temp=split/\\t/,$a_aline;\n+\t\t\tmy @number=split/\\s+/,$temp[1];\n+\t\t\tfor (my $i=0;$i<@number ;$i++) {\n+\t\t\t\tif ($R_max<$number[$i]) {\n+\t\t\t\t\t$R_max=$number[$i];\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tpush @R_length,[$temp[0],@number];\n+\t\t\t$R_number++;\n+\t\t}\n+\t}\n+\telse {\n+\t\tchomp $aline;\n+\t\tmy @temp=split/\\t/,$aline;\n+\t\tmy @number=split/\\s+/,$temp[1];\n+\t\tfor (my $i=0;$i<@number ;$i++) {\n+\t\t\tif ($T_max<$number[$i]) {\n+\t\t\t\t$T_max=$number[$i];\n+\t\t\t}\n+\t\t}\n+\t\tpush @T_length,[$temp[0],@number];\n+\t\t$T_number++;\n+\t}\n+}\n+close IN;\n+print "Tag max: $T_max\\nRead max: $R_max\\n";\n+my $kd_number=5;\n+##=======================Reads \xd7\xdd\xd7\xf8\xb1\xea\xbf\xcc\xb6\xc8==========================\n+my $r=1;\n+my $rr=1;\n+my $R=$R_max;\n+while ($R>10) {\n+\t$R=$R/10;\n+\t$r=$r*10;\n+\t$rr++;\n+}\n+$R=int($R+0.5);\n+my $R_xg=$R/$kd_number*$r;#\xd7\xdd\xd7\xf8\xb1\xea\xd2\xbb\xd0\xa1\xb8\xf1\xb4\xf3\xd0\xa1\xa3\xa8\xd2\xbb\xb9\xb210\xb8\xf1\xa3\xa9\n+my $R_kedu_scale_x=6*$rr;#\xd7\xdd\xd7\xf8\xb1\xea\xbf\xcc\xb6\xc8\xce\xc4\xd7\xd6\n+##=======================Tags \xd7\xdd\xd7\xf8\xb1\xea\xbf\xcc\xb6\xc8==========================\n+my $t=1;\n+my $tt=1;\n+my $T=$T_max;\n+while ($T>10) {\n+\t$T=$T/10;\n+\t$t=$t*10;\n+\t$tt++;\n+}\n+$T=int($T+0.5);\n+my $T_xg=$T/$kd_number*$t;#\xd7\xdd\xd7\xf8\xb1\xea\xd2\xbb\xd0\xa1\xb8\xf1\xb4\xf3\xd0\xa1\xa3\xa8\xd2\xbb\xb9\xb210\xb8\xf1\xa3\xa9\n+my $T_kedu_scale_x=6*$tt;#\xd7\xdd\xd7\xf8\xb1\xea\xbf\xcc\xb6\xc8\xce\xc4\xd7\xd6\n+\n+#############################s#define start coordinate and scale\n+my $XOFFSET=50;\n+my $YOFFSET=60;\n+my $width=800;\n+my $heigth=800;\n+my $X_width=600;\n+#my $height=1600;\n+####    Starting    ####\n+#\xd0\xc2\xbd\xa8\xbb\xad\xb2\xbc\n+my $svg=SVG->new(width=>$width,height=>$heigth);\n+####\xd7\xf8\xb1\xea\xd6\xe1\n+my $axisL=300;#read \xd7\xdd\xd7\xf8\xb1\xea\xb3\xa4\xb6\xc8\n+my $x_margin = 50;\n+#=========Reads number setting==========================================\n+my $Y_R_title=30;#\xb1\xea\xcc\xe2\xb5\xc4\xd7\xdd\xcf\xf2\xbf\xed\xb6\xc8\n+my $Y_R_0=$YOFFSET+$axisL+$Y_R_title;\n+my $X_R_0=$XOFFSET+$x_margin;\n+my $R_Yscale=$axisL/$R_xg/$kd_number;\n+my $R_Xscale=$X_width/$R_number/($sample_number+1);\n+#=====================================Reads Y axis======================\n+$svg->line(\'x1\',$X_R_0,\'y1\',$Y_R_0,\'x2\',$X_R_0,\'y2\',$Y_R_0-$axisL,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+for (my $i=1;$i<$kd_number ;$i++) {\n+\t$svg->line(\'x1\',$X_R_0-5,\'y1\',$Y_R_0-$i*$R_xg*$R_Yscale,\'x2\',$X_R_0,\'y2\',$Y_R_0-$i*$R_xg*$R_Ys'..b'*$R_Xscale,\'y\',$Y_R_0+15,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',6,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$R_length[$i][0]);\n+}\n+#===Reads number title\n+$svg->text(\'x\',$XOFFSET+400,\'y\',$YOFFSET+$Y_R_title,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',"1",\'font-size\',15,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',"Reads Length Distribution");\n+#===Reads \n+for (my $i=0;$i<$sample_number ;$i++) {\n+\tmy $red=($i+1)/$sample_number*255;\n+\t$svg->rect(\'x\',$X_R_0+550,\'y\',$YOFFSET+$Y_R_title+20*$i,\'width\',15,\'height\',10,\'stroke\',"black",\'stroke-width\',"0.5",\'fill\',"rgb($red,125,0)");\n+\t$svg->text(\'x\',$X_R_0+550+30,\'y\',$YOFFSET+$Y_R_title+20*$i+10,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$mark[$i]);\n+}\n+####==================================================================================\n+#=========================================Tag s\n+my $Y_T_title=30;#\xb1\xea\xcc\xe2\xb5\xc4\xd7\xdd\xcf\xf2\xbf\xed\xb6\xc8\n+my $Y_T_0=$Y_R_0+$axisL+$Y_R_title+50;#length size\n+my $X_T_0=$XOFFSET+$x_margin;\n+my $T_Yscale=$axisL/$T_xg/$kd_number;\n+my $T_Xscale=$X_width/$T_number/($sample_number+1);\n+#=====================================Tags Y axis======================\n+$svg->line(\'x1\',$X_T_0,\'y1\',$Y_T_0,\'x2\',$X_T_0,\'y2\',$Y_T_0-$axisL,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+for (my $i=1;$i<$kd_number ;$i++) {\n+\t$svg->line(\'x1\',$X_T_0-5,\'y1\',$Y_T_0-$i*$T_xg*$T_Yscale,\'x2\',$X_T_0,\'y2\',$Y_T_0-$i*$T_xg*$T_Yscale,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\t$svg->text(\'x\',$X_T_0-$T_kedu_scale_x,\'y\',$Y_T_0-$i*$T_xg*$T_Yscale+4,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',6,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$i*$T_xg);\n+}\n+#=====================================Tags X axis======================\n+$svg->line(\'x1\',$X_T_0,\'y1\',$Y_T_0,\'x2\',$X_T_0+$X_width,\'y2\',$Y_T_0,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\n+#print "$R_number\\t$sample_number\\n";\n+for ($i=0;$i<$T_number ;$i++) {\n+\tfor (my $j=1;$j<$sample_number+1 ;$j++) {\n+\t\tmy $red=$j/$sample_number*255;\n+\t\t$svg->rect(\'x\',$X_T_0+($j+$i*($sample_number+1))*$T_Xscale,\'y\',$Y_T_0-$T_length[$i][$j]*$T_Yscale,\'width\',$T_Xscale,\'height\',$T_length[$i][$j]*$T_Yscale,\'stroke\',"black",\'stroke-width\',"0.5",\'fill\',"rgb($red,125,0)");\n+\t}\n+\t$svg->text(\'x\',$X_T_0+(1+$sample_number/2+$i*($sample_number+1))*$T_Xscale,\'y\',$Y_T_0+15,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',6,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$T_length[$i][0]);\n+}\n+#===Reads number title\n+$svg->text(\'x\',$XOFFSET+400,\'y\',$Y_R_0+30+$Y_T_title,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',"1",\'font-size\',15,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',"Tags Length Distribution");\n+#===Reads \n+for (my $i=0;$i<$sample_number ;$i++) {\n+\tmy $red=($i+1)/$sample_number*255;\n+\t$svg->rect(\'x\',$X_T_0+550,\'y\',$Y_R_0+30+$Y_T_title+20*$i,\'width\',15,\'height\',10,\'stroke\',"black",\'stroke-width\',"0.5",\'fill\',"rgb($red,125,0)");\n+\t$svg->text(\'x\',$X_T_0+550+30,\'y\',$Y_R_0+30+$Y_T_title+20*$i+10,\'style\',\'fill:black;text-anchor:middle\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$mark[$i]);\n+}\n+\n+\n+\n+\n+open (OUT,">$opt{o}");\n+print OUT $svg->xmlify();\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 \n+options:\n+-i \n+-o svg output\n+-h help\n+USAGE\n+exit(1);\n+}\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r 87fe81de0931 SampleDEGseqMerge.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SampleDEGseqMerge.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,94 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: chentt@big.ac.cn
+#Date: 2014-05-21
+#Modified:
+#Description: merged deg file and total information
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i:s@","mark:s@","f:s","o=s","n=s","h");
+if (!(defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my @filein=@{$opts{'i'}};
+my @mark=@{$opts{'mark'}};
+my $fileout=$opts{'o'};
+my $number=$opts{'n'};
+
+my %hash;
+open IN,"<$filein[0]"; #input file  
+
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\"/);
+ my @temp=split/\t/,$aline;
+ $hash{$temp[0]}=$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1];
+}
+close IN;
+
+for (my $i=1;$i<=$#filein;$i++) {
+ open IN,"<$filein[$i]"; #input file  
+
+ while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\"/);
+ my @temp=split/\t/,$aline;
+ if (!(defined $hash{$temp[0]})) {
+ print "Not find $temp[0]in sample one!\n";
+ next;
+ }
+ $hash{$temp[0]} .="\t".$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1];
+ }
+ close IN;
+}
+
+open OUT,">$fileout"; #output file  
+my $deg_title;
+foreach (@mark) {
+ $deg_title.="log2(Fold_change)\tp_value\tq_value\t".$_."\t";
+}
+$deg_title=~s/\s+$//;
+my %function;
+my $title;
+open F,"<$opts{f}";
+while (my $aline=<F>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ my $title=$aline;
+ my @title=split/\t/,$aline;
+ $title[2+$number].="\t".$deg_title;
+ $title=join"\t",@title;
+ print OUT "$title\n";
+ next;
+ }
+ my @temp=split/\t/,$aline;
+ $temp[2+$number].="\t".$hash{$temp[0]};
+ my $temp=join"\t",@temp;
+ print OUT "$temp\n";
+
+}
+close F;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -mark -f
+options:
+-i input file # -i output_score.txt -i output_score.txt -i output_score.txt
+-mark  sample name  # -mark sam1_VS_sam2 -mark sam1_VS_sam3 -mark sam2_VS_sam3
+-f cluster file
+-n sample number
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 collapseReads2Tags.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseReads2Tags.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,170 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-3-20
+#Modified:
+#Description: fastq file form reads cluster(the same sequence in the same cluster)
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
+if (!(defined $opts{o} and defined $opts{'format'})  || defined $opts{h}) { #necessary arguments
+&usage;
+}
+my @filein=@{$opts{i}} if(defined $opts{i});
+my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
+my $fileout=$opts{'o'};
+my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
+my %hash;##�ֿ���ԭʼ����
+
+my $format=$opts{'format'};
+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
+ die "Parameter -format is error!\n";
+}
+
+my ($qualT,$qualV);
+if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) {  #quality filter
+ my @temp=split /:/,$opts{'qual'};
+ $qualT=$temp[0];
+ $qualV=$temp[1];
+
+ for (my $i=0;$i<@filein;$i++) {
+ open IN,"<$filein[$i]";
+ while (my $aline=<IN>) {
+ my $seq=<IN>;
+ my $n=<IN>;
+ my $qv=<IN>;
+ my $tag=&qvcheck($qv,$qualT,$qualV);
+ next if(!$tag);
+ my $str=substr($seq,0,6);
+ $hash{$str}[$i].=$seq;
+ }
+ close IN;
+ }
+}
+elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
+ for (my $i=0;$i<@filein;$i++) {
+ open IN,"<$filein[$i]";
+ while (my $aline=<IN>) {
+ my $seq=<IN>;
+ my $n=<IN>;
+ my $qv=<IN>;
+ my $str=substr($seq,0,6);
+ $hash{$str}[$i].=$seq;
+ }
+ close IN;
+ }
+
+}
+elsif($format eq "fasta" || $format eq "fa"){
+ for (my $i=0;$i<@filein;$i++) {
+ open IN,"<$filein[$i]";
+ while (my $aline=<IN>) {
+ my $seq=<IN>;
+ my $str=substr($seq,0,6);
+ $hash{$str}[$i].=$seq;
+ }
+ close IN;
+ }
+}
+
+open OUT,">$fileout"; #output file  
+my $count=0;
+foreach my $key (keys %hash) {
+ my %cluster;
+ for (my $i=0;$i<@filein;$i++) {
+ next if(!(defined $hash{$key}[$i]));
+ my @tmp=split/\n/,$hash{$key}[$i];
+ foreach  (@tmp) {
+ $cluster{$_}[$i]++;
+ }
+ }
+
+ foreach my $seq (keys %cluster) {
+ my $exp=""; my $ee=0;
+ for (my $i=0;$i<@filein;$i++) {
+ if (defined $cluster{$seq}[$i]) {
+ $exp.="_$cluster{$seq}[$i]";
+ $ee+=$cluster{$seq}[$i];
+ }else{
+ $exp.="_0";
+ }
+ }
+ $count+=$ee;
+ $exp=~s/^_//;
+ print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
+ }
+}
+close OUT;
+
+
+sub qvcheck{
+ my ($str,$t,$v)=@_;
+ my $qv=0;
+ if($t eq "mean"){ 
+ $qv=&getMeanQuality($str);
+ }
+ elsif($t eq "min"){
+ $qv=&getMinQuality($str);
+ }
+ if ($qv<$v) {
+ return 0;
+ }
+ return 1;
+}
+
+sub getMeanQuality(){
+ chomp $_[0];
+ my @bases = split(//,$_[0]);
+ my $sum = 0;
+ for(my $i = 0; $i <= $#bases; $i++){
+ my $num = ord($bases[$i]) - $pq;
+ $sum += $num;
+ }
+
+ return $sum/($#bases+1);
+
+}
+
+###
+### This function gives back the Q-value of the worst base
+sub getMinQuality(){
+ chomp $_[0];
+ my @bases = split(//,$_[0]);
+ my $worst = 1000;
+ for(my $i = 0; $i <= $#bases; $i++){
+# printf ("base: $bases[$i]  --> %d\n",ord($bases[$i]));
+ my $num = ord($bases[$i]) - $pq;
+ if($num < $worst){
+ $worst = $num;
+ }
+ }
+ return $worst;
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -format -mark -qual -qv -o 
+options:
+-i input file#fastq file ##can be multiple -i file1 -i file2 ...
+-mark string#quary name,default is "seq"
+-o output file
+-format string # fastq|fasta|fq|fa
+
+-qual #reads filter
+   eg:(min:value/mean:value)
+   This parameter just for solexa reads.
+   If the input files are solid and needs filter,please do filter first .
+   
+-qv  integer #Phred quality64/33,default 33
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 conventional.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conventional.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,156 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/04/09
+#Modified:
+#Description: islands merged of merged samples 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h");
+if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+my $distance=$opts{'d'};
+my $tempout=$opts{'t'};
+my $mark=$opts{'mark'};
+my @sample=split/\#/,$mark;
+$mark=join"\"\t\"",@sample;
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
+open TMP,">$tempout";
+print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
+my %hash;
+while (my $aline=<IN>) {
+ chomp $aline;
+ if($aline=~/^\#/){
+ #print OUT "$aline\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my $chr=shift @tmp;
+ #shift @tmp;
+ push @{$hash{$chr}},[@tmp];
+}
+
+close IN;
+
+foreach my $key (keys %hash) {
+ my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}};
+ my @sample;
+ my $start=$tag[0][1];
+ my $end=$tag[0][2];
+ push @sample,[@{$tag[0]}];
+ for (my $i=1;$i<@tag-1;$i++) {
+ if ($tag[$i][1]-$end<=$distance) {
+ if ($tag[$i][2]>$end) {
+ $end=$tag[$i][2];
+ }
+ push @sample,[@{$tag[$i]}];
+ }
+ else{
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ $max_length="\>30";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+ $start=$tag[$i][1];
+ $end=$tag[$i][2];
+
+ @sample=();
+ push @sample,[@{$tag[$i]}];
+ }
+ }
+ if ($tag[$#tag][1]-$end<=$distance) {
+ if ($tag[$#tag][2]>$end) {
+ $end=$tag[$#tag][2];
+ }
+ push @sample,[@{$tag[$#tag]}];
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ $max_length="\>30";
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+ }
+ else{
+ my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+ my $cluster_exp=join"\t",@cluster_exp;
+ if ($max_length>30) {
+ $max_length="\>30";
+ print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+ }
+ else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+ print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+
+ }
+}
+close OUT;
+close TMP;
+sub Max_length{
+ my @exp=@{$_[0]};
+ my %sample_length;
+ my $total_exp;
+ my @each;
+ my @tag;
+ for (my $i=0;$i<=$#exp ;$i++) {
+ my $length=$exp[$i][2]-$exp[$i][1]+1;
+ #if ($length>30) {
+ # $length=40;
+ #}
+ my $exp=0;
+ foreach  (1..$opts{'N'}) {
+ $exp+=$exp[$i][$_+2];
+ $each[$_-1]+=$exp[$i][$_+2];
+ }
+ $sample_length{$length}+=$exp;
+ $total_exp+=$exp;
+ push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp);
+ }
+ my $max=0;
+ my $max_key;
+ foreach my $key (sort keys %sample_length) {
+ my $p=$sample_length{$key}/$total_exp;
+ if ($p>$max) {
+ $max=$p;
+ $max_key=$key;
+ }
+ $sample_length{$key}=sprintf("%.2f",$p);
+ }
+ my $tag_n=@tag;
+ my $tag=join";",@tag;
+ $tag=$tag_n."\t".$tag;
+ return($max_key,$sample_length{$max_key},$tag,@each);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -d -N -t -mark
+options:
+-i input file
+-d distance of two islands
+-mark sample name;
+-o output file
+-N sample number
+-t temp output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 convert_bowtie_to_blast.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bowtie_to_blast.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,126 @@
+#!/usr/bin/perl 
+
+
+use warnings;
+use strict;
+use Getopt::Std;
+
+######################################### USAGE ################################
+
+my $usage=
+"$0 file_bowtie_result file_solexa_seq file_chromosome
+
+This is a converter which changes Bowtie output into Blast format.
+The input includes three files: a Bowtie result file (default Bowtie
+output file), a fasta file consisting of small Reads and a chromosome
+fasta file. It outputs the alignments in blast_parsed format.
+
+file_bowtie_result likes:
+
+AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
+AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0
+
+file_solexa_seq likes:
+
+>AtFlower100010_x2
+AAGGAGATTCTTTCAGTCCAG
+
+file_chromosome contains chromosome seq in fasta format
+
+";
+
+
+####################################### INPUT FILES ############################
+
+my $file_bowtie_result=shift or die $usage;
+my $file_short_seq=shift or die $usage;
+my $file_chromosome_seq=shift or die $usage;
+
+
+##################################### GLOBAL VARIBALES #########################
+
+my %short_seq_length=();
+my %chromosome_length=();
+
+
+######################################### MAIN ################################# 
+
+#get the short sequence id and its length
+sequence_length($file_short_seq,\%short_seq_length);
+
+#get the chromosome sequence id and its length
+sequence_length($file_chromosome_seq,\%chromosome_length);
+
+#convert bowtie result format to blast format;
+change_format($file_bowtie_result);
+
+exit;
+
+
+##################################### SUBROUTINES ##############################
+
+sub sequence_length{
+    my ($file,$hash) = @_;
+    my ($id, $desc, $sequence, $seq_length) = ();
+
+    open (FASTA, "<$file") or die "can not open $$file\n";
+    while (<FASTA>)
+    {
+        chomp;
+        if (/^>(\S+)(.*)/)
+ {
+     $id       = $1;
+     $desc     = $2;
+     $sequence = "";
+     while (<FASTA>){
+                chomp;
+                if (/^>(\S+)(.*)/){
+     $$hash{$id}  = length $sequence;
+     $id         = $1;
+     $desc       = $2;
+     $sequence   = "";
+     next;
+                }
+ $sequence .= $_;
+            }
+        }
+    }
+    $seq_length=length($sequence);
+    $$hash{$id} = $seq_length;
+    close FASTA;
+}
+
+
+
+
+
+sub change_format{
+ #Change Bowtie format into blast format
+ my $file=shift @_;
+ open(FILE,"<$file")||die"can not open the bowtie result file:$!\n";
+  #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n";
+
+ while(<FILE>){
+ chomp;
+ my @tmp=split("\t",$_);
+ #Clean the reads ID
+ my @tmp1=split(" ",$tmp[0]);
+ print  "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t";
+ if($tmp[1] eq "+"){
+ my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]};
+ my $seq_bg=$tmp[3] + 1;
+ print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n";
+ }
+ if($tmp[1] eq "-"){
+ my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3];
+ my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1;
+ print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n";
+ }
+ }
+
+# close BLASTOUT;
+
+}
+
+
+
b
diff -r 000000000000 -r 87fe81de0931 count_ref_length.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/count_ref_length.pl Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2009-05-06
+#Modified:
+#Description: ɾ��matched reads 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+
+my ($name,$seq);
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^>(\S+)/) {
+ $name=$1;
+ while (my $new=<IN>) {
+ chomp $new;
+ if ($new=~/^>(\S+)/) {
+ print OUT $name,"\t",length($seq),"\n";
+ $seq ="";
+ $name=$1;
+ next;
+ }
+ else{$seq .=$new;}
+ }
+ }
+ print OUT $name,"\t",length($seq),"\n";
+}

+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 count_rfam_express.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/count_rfam_express.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,1800 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2013/7/19\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use File::Basename;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","o=s","tag:s","h");\n+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $filein=$opts{\'i\'};\n+my $fileout=$opts{\'o\'};\n+\n+my $marks=defined $opts{\'tag\'} ? $opts{\'tag\'} : "";\n+\n+if(!(defined $opts{\'tag\'})){\n+\tmy $line=`head -1 $filein`;\n+\tmy @tmp=split/\\t/,$line;\n+\t$tmp[0]=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @ss=split/_/,$1;\n+\tfor (my $i=1;$i<=@ss;$i++) {\n+\t\t$marks .="Smp$i;";\n+\t}\n+}\n+\n+my @marks=split/\\;/,$marks;\n+\n+my %rfam_key;\n+while(<DATA>){\n+    chomp;\n+    if(/^(\\S+)\\s+(\\S+)$/){\n+\t\t$rfam_key{$1}=$2;\n+    }\n+}\n+\n+\n+my %reads;\n+my %tags;\n+open IN,"<$filein";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\t$tmp[0]=~/:([\\d|_]+)_x(\\d+)$/;\n+\n+\tmy @exp=split/_/,$1;\n+\tmy @tag=split/\\;/,$tmp[2];\n+\n+\tif (defined $rfam_key{$tag[0]}) {\n+\t\tfor (my $i=0;$i<@exp;$i++) {\n+\t\t\t$reads{$rfam_key{$tag[0]}}[$i]+=$exp[$i];\n+\t\t\t$tags{$rfam_key{$tag[0]}}[$i]++ if($exp[$i]!=0);\n+\t\t}\n+\t}else{\n+\t\tfor (my $i=0;$i<@exp;$i++) {\n+\t\t\t$reads{other}[$i]+=$exp[$i];\n+\t\t\t$tags{other}[$i]++ if($exp[$i]!=0);\n+\t\t}\n+\t}\n+\n+}\n+close IN;\n+\n+$"="\\t"; ##### @array print in \\t\n+open OUT,">$fileout";\n+print OUT "####################################\\n# small RNA expressed reads number #\\n####################################\\n";\n+print OUT "#RNAname\\t@marks\\n";\n+foreach my $key (keys %reads) {\n+\tprint OUT $key;\n+\tfor (my $i=0;$i<@{$reads{$key}} ;$i++) {\n+\t\tprint OUT "\\t",$reads{$key}[$i];\n+\t}\n+\tprint OUT "\\n";\n+}\n+\n+print OUT "\\n\\n####################################\\n# small RNA expressed tags number #\\n####################################\\n";\n+print OUT "#RNAname\\t@marks\\n";\n+\n+foreach my $key (keys %tags) {\n+\tprint OUT  $key;\n+\tfor (my $i=0;$i<@{$reads{$key}} ;$i++) {\n+\t\tif(defined $tags{$key}[$i]){print OUT "\\t",$tags{$key}[$i];}\n+\t\telse{print OUT "\\t0";}\n+\t}\n+\tprint OUT  "\\n";\n+}\n+\n+close OUT;\n+$"=" "; ##### @array print in \\t\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -i -tag -o\n+options:\n+-i input file# rfam bowtie bwt. format mapping result\n+-tag [string] sample marks# eg. sampleA;sampleB;sampleC\n+-o output file\n+\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n+__DATA__\n+RF00635\tlncRNA\n+RF01868\tlncRNA\n+RF01869\tlncRNA\n+RF01870\tlncRNA\n+RF01871\tlncRNA\n+RF01872\tlncRNA\n+RF01873\tlncRNA\n+RF01874\tlncRNA\n+RF01875\tlncRNA\n+RF01876\tlncRNA\n+RF01877\tlncRNA\n+RF01878\tlncRNA\n+RF01879\tlncRNA\n+RF01880\tlncRNA\n+RF01881\tlncRNA\n+RF01882\tlncRNA\n+RF01883\tlncRNA\n+RF01884\tlncRNA\n+RF01885\tlncRNA\n+RF01886\tlncRNA\n+RF01887\tlncRNA\n+RF01888\tlncRNA\n+RF01889\tlncRNA\n+RF01890\tlncRNA\n+RF01891\tlncRNA\n+RF01892\tlncRNA\n+RF01893\tlncRNA\n+RF01894\tlncRNA\n+RF01904\tlncRNA\n+RF01905\tlncRNA\n+RF01906\tlncRNA\n+RF01907\tlncRNA\n+RF01908\tlncRNA\n+RF01909\tlncRNA\n+RF01928\tlncRNA\n+RF01929\tlncRNA\n+RF01930\tlncRNA\n+RF01931\tlncRNA\n+RF01932\tlncRNA\n+RF01933\tlncRNA\n+RF01934\tlncRNA\n+RF01935\tlncRNA\n+RF01946\tlncRNA\n+RF01947\tlncRNA\n+RF01948\tlncRNA\n+RF01950\tlncRNA\n+RF01951\tlncRNA\n+RF01952\tlncRNA\n+RF01953\tlncRNA\n+RF01954\tlncRNA\n+RF01955\tlncRNA\n+RF01956\tlncRNA\n+RF01957\tlncRNA\n+RF01958\tlncRNA\n+RF01961\tlncRNA\n+RF01962\tlncRNA\n+RF01963\tlncRNA\n+RF01964\tlncRNA\n+RF01965\tlncRNA\n+RF01966\tlncRNA\n+RF01967\tlncRNA\n+RF01968\tlncRNA\n+RF01969\tlncRNA\n+RF01970\tlncRNA\n+RF01971\tlncRNA\n+RF01972\tlncRNA\n+RF01973\tlncRNA\n+RF01974\tlncRNA\n+RF01975\tlncRNA\n+RF01976\tlncRNA\n+RF01977\tlncRNA\n+RF01978\tlncRNA\n+RF01979\tlncRNA\n+RF01980\tlncRNA\n+RF01981\tlncRNA\n+RF01983\tlncRNA\n+RF01984\tlncRNA\n+RF01985\tlncRNA\n+RF01986\tlncRNA\n+RF01987\tlncRNA\n+RF01992\tlncRNA\n+RF02038\tlncRNA\n+RF02039\tlncRNA\n+RF02040\tlncRNA\n+RF02041\tlncRNA\n+RF02042\tlncRNA\n+RF02043\tlncRNA\n+RF02044\tlncRNA\n+RF02045\tlncRNA\n+RF02046\tlncRNA\n+RF02047\tlncRNA\n+RF02085\tlncRNA\n+RF02086\tlncRNA\n+RF02087\tlncRNA\n+RF02089\tlncRNA\n+RF02090\tlncRNA\n+RF02091\tlncRNA\n+RF02098\tlncRNA\n+RF02101\tlncRNA\n+RF02102\tlncR'..b'5\tsnRNA\n+RF01646\tsnRNA\n+RF01647\tsnRNA\n+RF01648\tsnRNA\n+RF01649\tsnRNA\n+RF01650\tsnRNA\n+RF01651\tsnRNA\n+RF01652\tsnRNA\n+RF01653\tsnRNA\n+RF01654\tsnRNA\n+RF01655\tsnRNA\n+RF01658\tsnRNA\n+RF01659\tsnRNA\n+RF01660\tsnRNA\n+RF01661\tsnRNA\n+RF01662\tsnRNA\n+RF01664\tsnRNA\n+RF01802\tsnRNA\n+RF01829\tsnRNA\n+RF01844\tsnRNA\n+RF01846\tsnRNA\n+RF01847\tsnRNA\n+RF01848\tsnRNA\n+RF01860\tsnRNA\n+RF01861\tsnRNA\n+RF01862\tsnRNA\n+RF01863\tsnRNA\n+RF01864\tsnRNA\n+RF01866\tsnRNA\n+RF02163\tsnRNA\n+RF00014\tsRNA\n+RF00018\tsRNA\n+RF00021\tsRNA\n+RF00034\tsRNA\n+RF00035\tsRNA\n+RF00057\tsRNA\n+RF00077\tsRNA\n+RF00078\tsRNA\n+RF00079\tsRNA\n+RF00081\tsRNA\n+RF00082\tsRNA\n+RF00083\tsRNA\n+RF00084\tsRNA\n+RF00101\tsRNA\n+RF00110\tsRNA\n+RF00111\tsRNA\n+RF00112\tsRNA\n+RF00113\tsRNA\n+RF00115\tsRNA\n+RF00116\tsRNA\n+RF00117\tsRNA\n+RF00118\tsRNA\n+RF00119\tsRNA\n+RF00121\tsRNA\n+RF00122\tsRNA\n+RF00124\tsRNA\n+RF00125\tsRNA\n+RF00126\tsRNA\n+RF00128\tsRNA\n+RF00166\tsRNA\n+RF00195\tsRNA\n+RF00368\tsRNA\n+RF00369\tsRNA\n+RF00370\tsRNA\n+RF00371\tsRNA\n+RF00372\tsRNA\n+RF00378\tsRNA\n+RF00444\tsRNA\n+RF00505\tsRNA\n+RF00519\tsRNA\n+RF00615\tsRNA\n+RF00616\tsRNA\n+RF01116\tsRNA\n+RF01385\tsRNA\n+RF01386\tsRNA\n+RF01387\tsRNA\n+RF01388\tsRNA\n+RF01389\tsRNA\n+RF01390\tsRNA\n+RF01391\tsRNA\n+RF01392\tsRNA\n+RF01393\tsRNA\n+RF01394\tsRNA\n+RF01395\tsRNA\n+RF01396\tsRNA\n+RF01397\tsRNA\n+RF01398\tsRNA\n+RF01399\tsRNA\n+RF01400\tsRNA\n+RF01401\tsRNA\n+RF01402\tsRNA\n+RF01403\tsRNA\n+RF01404\tsRNA\n+RF01405\tsRNA\n+RF01406\tsRNA\n+RF01407\tsRNA\n+RF01408\tsRNA\n+RF01409\tsRNA\n+RF01410\tsRNA\n+RF01411\tsRNA\n+RF01412\tsRNA\n+RF01457\tsRNA\n+RF01459\tsRNA\n+RF01460\tsRNA\n+RF01461\tsRNA\n+RF01462\tsRNA\n+RF01463\tsRNA\n+RF01464\tsRNA\n+RF01465\tsRNA\n+RF01466\tsRNA\n+RF01467\tsRNA\n+RF01468\tsRNA\n+RF01469\tsRNA\n+RF01470\tsRNA\n+RF01471\tsRNA\n+RF01472\tsRNA\n+RF01473\tsRNA\n+RF01474\tsRNA\n+RF01476\tsRNA\n+RF01477\tsRNA\n+RF01478\tsRNA\n+RF01479\tsRNA\n+RF01487\tsRNA\n+RF01488\tsRNA\n+RF01489\tsRNA\n+RF01492\tsRNA\n+RF01493\tsRNA\n+RF01494\tsRNA\n+RF01496\tsRNA\n+RF01503\tsRNA\n+RF01504\tsRNA\n+RF01512\tsRNA\n+RF01519\tsRNA\n+RF01520\tsRNA\n+RF01521\tsRNA\n+RF01527\tsRNA\n+RF01528\tsRNA\n+RF01529\tsRNA\n+RF01530\tsRNA\n+RF01571\tsRNA\n+RF01578\tsRNA\n+RF01579\tsRNA\n+RF01580\tsRNA\n+RF01581\tsRNA\n+RF01582\tsRNA\n+RF01619\tsRNA\n+RF01623\tsRNA\n+RF01643\tsRNA\n+RF01656\tsRNA\n+RF01663\tsRNA\n+RF01665\tsRNA\n+RF01668\tsRNA\n+RF01669\tsRNA\n+RF01670\tsRNA\n+RF01671\tsRNA\n+RF01672\tsRNA\n+RF01673\tsRNA\n+RF01674\tsRNA\n+RF01675\tsRNA\n+RF01676\tsRNA\n+RF01677\tsRNA\n+RF01678\tsRNA\n+RF01679\tsRNA\n+RF01680\tsRNA\n+RF01681\tsRNA\n+RF01682\tsRNA\n+RF01683\tsRNA\n+RF01684\tsRNA\n+RF01685\tsRNA\n+RF01686\tsRNA\n+RF01687\tsRNA\n+RF01690\tsRNA\n+RF01691\tsRNA\n+RF01693\tsRNA\n+RF01694\tsRNA\n+RF01696\tsRNA\n+RF01698\tsRNA\n+RF01699\tsRNA\n+RF01700\tsRNA\n+RF01701\tsRNA\n+RF01702\tsRNA\n+RF01703\tsRNA\n+RF01705\tsRNA\n+RF01706\tsRNA\n+RF01710\tsRNA\n+RF01712\tsRNA\n+RF01714\tsRNA\n+RF01718\tsRNA\n+RF01719\tsRNA\n+RF01722\tsRNA\n+RF01723\tsRNA\n+RF01728\tsRNA\n+RF01732\tsRNA\n+RF01742\tsRNA\n+RF01757\tsRNA\n+RF01762\tsRNA\n+RF01775\tsRNA\n+RF01781\tsRNA\n+RF01782\tsRNA\n+RF01783\tsRNA\n+RF01784\tsRNA\n+RF01789\tsRNA\n+RF01791\tsRNA\n+RF01793\tsRNA\n+RF01796\tsRNA\n+RF01808\tsRNA\n+RF01810\tsRNA\n+RF01812\tsRNA\n+RF01814\tsRNA\n+RF01815\tsRNA\n+RF01816\tsRNA\n+RF01817\tsRNA\n+RF01818\tsRNA\n+RF01819\tsRNA\n+RF01820\tsRNA\n+RF01821\tsRNA\n+RF01822\tsRNA\n+RF01823\tsRNA\n+RF01827\tsRNA\n+RF01828\tsRNA\n+RF01858\tsRNA\n+RF01867\tsRNA\n+RF02029\tsRNA\n+RF02030\tsRNA\n+RF02031\tsRNA\n+RF02049\tsRNA\n+RF02050\tsRNA\n+RF02051\tsRNA\n+RF02052\tsRNA\n+RF02053\tsRNA\n+RF02054\tsRNA\n+RF02055\tsRNA\n+RF02056\tsRNA\n+RF02057\tsRNA\n+RF02059\tsRNA\n+RF02060\tsRNA\n+RF02062\tsRNA\n+RF02063\tsRNA\n+RF02064\tsRNA\n+RF02065\tsRNA\n+RF02066\tsRNA\n+RF02067\tsRNA\n+RF02070\tsRNA\n+RF02071\tsRNA\n+RF02072\tsRNA\n+RF02073\tsRNA\n+RF02074\tsRNA\n+RF02075\tsRNA\n+RF02077\tsRNA\n+RF02078\tsRNA\n+RF02079\tsRNA\n+RF02080\tsRNA\n+RF02081\tsRNA\n+RF02082\tsRNA\n+RF02099\tsRNA\n+RF02100\tsRNA\n+RF02151\tsRNA\n+RF02221\tsRNA\n+RF02222\tsRNA\n+RF02223\tsRNA\n+RF02224\tsRNA\n+RF02225\tsRNA\n+RF02226\tsRNA\n+RF02227\tsRNA\n+RF02228\tsRNA\n+RF02230\tsRNA\n+RF02231\tsRNA\n+RF02232\tsRNA\n+RF02233\tsRNA\n+RF02234\tsRNA\n+RF02235\tsRNA\n+RF02236\tsRNA\n+RF02237\tsRNA\n+RF02238\tsRNA\n+RF02239\tsRNA\n+RF02240\tsRNA\n+RF02241\tsRNA\n+RF02242\tsRNA\n+RF02243\tsRNA\n+RF02268\tsRNA\n+RF02269\tsRNA\n+RF00127\tsRNA\n+RF01852\ttRNA\n+RF00005\ttRNA\n'
b
diff -r 000000000000 -r 87fe81de0931 filterReadsByCount.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByCount.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /\#/,$mark;
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+ chomp $aline;
+ my $seq=<IN>;
+ chomp $seq;
+ if($aline=~/:([\d|_]+)_x(\d+)$/){
+ if ($2>3) {
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss;$i++) {
+ $hash{length($seq)}[$i]++ if($ss[$i]>0);
+ $hash{length($seq)}[$i] +=0 if($ss[$i]==0);
+ $reads{length($seq)}[$i]+=$ss[$i];
+ }
+ print OUT "$aline\n$seq\n";
+ }
+ }
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution_after_count_filter.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+ print OUT $_,"\t@{$hash{$_}}\n";
+ my $vv=join ", ",@{$hash{$_}};
+ $avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+ print OUT $_,"\t@{$reads{$_}}\n";
+ my $vv=join ", ", @{$reads{$_}};
+ $avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+system ("R CMD BATCH $dir/length_distribution_after_count_filter.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-mark string #sample name eg: samA#samB#samC
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 filterReadsByLength.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByLength.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /,/,$mark;
+
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+ chomp $aline;
+ my $seq=<IN>;
+ chomp $seq;
+
+ if($aline=~/:([\d|_]+)_x(\d+)$/){
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss;$i++) {
+ $hash{length($seq)}[$i]++ if($ss[$i]>0);
+ $hash{length($seq)}[$i] +=0 if($ss[$i]==0);
+ $reads{length($seq)}[$i]+=$ss[$i];
+ }
+ }
+ #else{$reads{length($seq)}+=1;}
+ if (length ($seq)>=$opts{'min'}  && length ($seq) <=$opts{'max'}) {
+ print OUT "$aline\n$seq\n";
+ }
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+ print OUT $_,"\t@{$hash{$_}}\n";
+ my $vv=join ", ",@{$hash{$_}};
+ $avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+ print OUT $_,"\t@{$reads{$_}}\n";
+ my $vv=join ", ", @{$reads{$_}};
+ $avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+#system ("R CMD BATCH $dir/length_distribution.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-min reads min length.
+-max reads max length.
+-mark string #sample name eg: samA,samB,samC
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 get_genelist.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_genelist.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: chentt
+#Email:
+#Date: 2012-4-6
+#Modified:
+#Description:
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+open IN,"<$opts{i}";
+open OUT ,">$opts{o}";
+print OUT "#ID\tchr\tstart\tend\tstrand\n";
+my $n=1;
+my %gene1;
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\#/);
+ my @tmp=split/\t/,$aline;
+ my $ID;
+ if ($tmp[2] eq "gene") {
+ $tmp[0]=~s/Chr\./Chr/;
+ #$tmp[0]=~s/Chr/chr/;
+ my @infor=split/;/,$tmp[8];
+ for (my $i=0;$i<@infor ;$i++) {
+ if ($infor[$i]=~/Alias\=(\S+)$/) {
+ $ID=$1;
+ last;
+ }
+ else {
+ $ID="unknown$n";
+ $n++;
+ }
+ }
+ #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+ push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+ print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+ }
+}
+close IN;
+close OUT;
+
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -h
+options:
+-i input cluster file
+-o output file
+-h help
+USAGE
+exit(1);
+}
\ No newline at end of file
b
diff -r 000000000000 -r 87fe81de0931 html_miRPlant.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/html_miRPlant.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,273 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-5-29
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","format=s","o=s","h");
+if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath);
+my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir);
+open IN,"<$opts{i}";
+$config=<IN>; chomp $config;
+$prepath=<IN>; chomp $prepath;
+$knownpath=<IN>; chomp $knownpath;
+$rfampath=<IN>;chomp $rfampath;
+$genomepath=<IN>; chomp $genomepath;
+$novelpath=<IN>; chomp $novelpath;
+close IN;
+my @tmp=split/\//,$prepath;
+$predir=$tmp[-1];
+@tmp=split/\//,$rfampath;
+$rfamdir=$tmp[-1];
+@tmp=split/\//,$knownpath;
+$knowndir=$tmp[-1];
+@tmp=split/\//,$genomepath;
+$genomedir=$tmp[-1];
+@tmp=split/\//,$novelpath;
+$noveldir=$tmp[-1];
+
+my $dir=dirname($opts{'o'});
+
+open OUT ,">$opts{'o'}";
+print OUT "<HTML>\n  <HEAD>\n  <TITLE> Analysis Report </TITLE>\n </HEAD>
+ <BODY bgcolor=\"lightgray\">\n  <h1 align=\"center\">\n    <font face=\"����\">\n <b>Small RNA Analysis Report</b>\n  </font>\n  </h1>
+  <h2>1. Sequence No. and quality</h2>
+  <h3>1.1 Sequece No.</h3>
+";
+
+### raw data no
+open IN,"<$config";
+my @files;my @marks; my @rawNo;
+while (my $aline=<IN>) {
+ chomp $aline;
+ my @tmp=split/\t/,$aline;
+ push @files,$tmp[0];
+
+ my $no=`less $tmp[0] |wc -l `;
+ chomp $no;
+ if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") {
+ $no=$no/4;
+ }
+ else{
+ $no=$no/2;
+ }
+ push @rawNo,$no;
+
+ push @marks,$tmp[1];
+}
+close IN;
+
+### preprocess 
+unless ($prepath=~/\/$/) {
+ $prepath .="/";
+}
+
+my @trimNo;my @collapse;
+my $collapsefile=$prepath."collapse_reads.fa";
+open IN,"<$collapsefile";
+while (my $aline=<IN>) {
+ chomp $aline;
+ <IN>;
+ $aline=~/:([\d|_]+)_x(\d+)$/;
+ my @lng=split/_/,$1;
+ for (my $i=0;$i<@lng;$i++) {
+ if ($lng[$i]>0) {
+ $trimNo[$i] +=$lng[$i];
+ $collapse[$i] ++;
+ }
+ }
+}
+close IN;
+
+my @cleanR;my @cleanT;
+my $clean=$prepath."collapse_reads_19_28.fa";
+open IN,"<$clean";
+while (my $aline=<IN>) {
+ chomp $aline;
+ <IN>;
+ $aline=~/:([\d|_]+)_x(\d+)$/;
+ my @lng=split/_/,$1;
+ for (my $i=0;$i<@lng;$i++) {
+ if ($lng[$i]>0) {
+ $cleanR[$i] +=$lng[$i];
+ $cleanT[$i] ++;
+ }
+ }
+}
+close IN;
+
+print OUT "<table border=\"1\">
+<tr align=\"center\">
+<th>&nbsp;</th>
+";
+foreach  (@marks) {
+ print OUT "<th> $_ </th>\n";
+}
+print  OUT "</tr>
+<tr align=\"center\">
+<th align=\"left\">Raw Reads No. </th>
+";
+foreach  (@rawNo) {
+ print OUT "<td> $_ </td>\n";
+}
+print OUT "</tr>
+<tr align=\"center\">
+<th align=\"left\">Reads No. After Trimed 3\' adapter </th>
+";
+foreach  (@trimNo) {
+ print OUT "<td> $_ </td>\n";
+}
+print OUT "</tr>
+<tr align=\"center\">
+<th align=\"left\">Unique Tags No. </th>
+";
+foreach  (@collapse) {
+ print OUT "<td> $_ </td>\n";
+}
+print OUT  "</tr>
+<tr align=\"center\">
+<th align=\"left\">Clean Reads No. </th>
+";
+foreach  (@cleanR) {
+ print OUT "<td> $_ </td>\n";
+}
+print OUT "</tr>
+<tr align=\"center\">
+<th align=\"left\">Clean Tags No. </th>
+";
+foreach  (@cleanT) {
+ print OUT "<td> $_ </td>\n";
+}
+print OUT "</tr>\n</table>";
+print OUT "<p>
+Note:<br />
+The raw data file path is: <b>$files[0]</b><br />
+";
+for (my $i=1;$i<@files;$i++) {
+ print OUT "&nbsp;&nbsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;<b>$files[$i]</b><br />";
+}
+print OUT "The collapsed file path is: <b>$collapsefile</b><br />
+The clean data file path is: <b>$clean</b><br />
+</p>
+<h2> 1. Sequence length count</h2>
+";
+print OUT "\n";
+
+my $length=$prepath."length.html";
+open IN,"<$length";
+while (my $aline=<IN>) {
+ chomp $aline;
+ print OUT "$aline\n";
+}
+
+print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution.txt\"> length file</a>
+</p>
+";
+
+#### rfam
+unless ($rfampath=~/\/$/) {
+ $rfampath .="/";
+}
+print OUT "<h2>2. Rfam non-miRNA annotation</h2>
+<h3>2.1 Reads count</h3>
+<table border=\"1\">
+<tr align=\"center\">
+";
+
+my @rfamR; my @rfamT;
+my $tag=1;
+open IN,"<$dir/rfam_non-miRNA_annotation.txt";
+while (my $aline=<IN>) {
+ chomp $aline;
+ $tag=0 if($aline=~/tags\s+number/);
+ next if($aline=~/^\#/);
+ next if($aline=~/^\s*$/);
+ my @tmp=split/\s+/,$aline;
+ if($tag == 1){push @rfamR,[@tmp];}
+ else{push @rfamT,[@tmp];}
+}
+close IN;
+
+
+print OUT "<th>RNA Name</th>\n";
+foreach  (@marks) {
+ print OUT "<th> $_ </th>\n";
+}
+for (my $i=0;$i<@rfamR;$i++) {
+ print  OUT "</tr>
+ <tr align=\"center\">
+ <th align=\"left\">$rfamR[$i][0]</th>
+ ";
+ for (my $j=1;$j<@{$rfamR[$i]} ;$j++) {
+ print OUT "<td> $rfamR[$i][$j]</td>\n";
+ }
+}
+
+print OUT "</tr>\n</table>
+  <h3>2.2 Tags count</h3>
+  <table border=\"1\">
+   <tr align=\"center\">
+   <th>RNA Name</th>\n";
+foreach  (@marks) {
+ print OUT "<th> $_ </th>\n";
+}
+for (my $i=0;$i<@rfamT;$i++) {
+ print  OUT "</tr>
+ <tr align=\"center\">
+ <th align=\"left\">$rfamT[$i][0]</th>
+ ";
+ for (my $j=1;$j<@{$rfamT[$i]} ;$j++) {
+ print OUT "<td> $rfamT[$i][$j]</td>\n";
+ }
+}
+print OUT "</tr>\n</table>
+<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>";
+print OUT "<b>rfam_mapped.bwt</b></p>
+  <h2>3. MicroRNA result</h2>
+  <h3>3.1 known microRNA</h3>
+  <p>The known microRNA express list: <a href=\"./known_microRNA_express.txt\"> known_microRNA_express.txt</a><br/>
+  The known microRNA alngment file: <a href=\"./known_microRNA_express.aln\"> known_microRNA_express.aln</a><br/>
+  The known moRs file: <a href=\"./known_microRNA_express.moRs\"> known_microRNA_express.moRs</a><br/>
+  The known microRNA mature sequence file: <a href=\"./known_microRNA_mature.fa\"> known_microRNA_mature.fa</a><br/>
+  The knowm microRNA precursor sequence file: <a href=\"./known_microRNA_precursor.fa\"> known_microRNA_precursor.fa</a>
+  </p>
+
+  <h3>3.2 novel microRNA</h3>
+  <p>The novel microRNA prediction file:<a href=\"./microRNA_prediction.mrd\"> microRNA_prediction.mrd</a><br/>
+  The novel microRNA express list: <a href=\"./novel_microRNA_express.txt\"> novel_microRNA_express.txt</a><br/>
+  The novel microRNA mature sequence file: <a href=\"./novel_microRNA_mature.fa\"> novel_microRNA_mature.fa</a><br/>
+  The novel microRNA precursor sequence file: <a href=\"./novel_microRNA_precursor.fa\"> novel_microRNA_precursor.fa</a>
+  </p>
+";
+
+
+
+print OUT "
+ </BODY>
+</HTML>
+";
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -o
+options:
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 html_preprocess.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/html_preprocess.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,388 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-5-29\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use File::Basename;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","format=s","min=i","max=i","o=s","h");\n+if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath);\n+my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir);\n+open IN,"<$opts{i}";\n+$config=<IN>; chomp $config;\n+$prepath=<IN>; chomp $prepath;\n+$genomepath=<IN>; chomp $genomepath;\n+$rfampath=<IN>;\n+close IN;\n+my @tmp=split/\\//,$prepath;\n+$predir=$tmp[-1];\n+@tmp=split/\\//,$genomepath;\n+$genomedir=$tmp[-1];\n+\n+my $dir=dirname($opts{\'o\'});\n+\n+open OUT ,">$opts{\'o\'}";\n+print OUT "<HTML>\\n  <HEAD>\\n  <TITLE> Analysis Report </TITLE>\\n </HEAD>\n+ <BODY bgcolor=\\"lightgray\\">\\n  <h1 align=\\"center\\">\\n    <font face=\\"\xba\xda\xcc\xe5\\">\\n\t<b>Preprocess Report</b>\\n  </font>\\n  </h1>\n+  <h2>1. Sequence No. and quality</h2>\n+  <h3>1.1 Sequece No.</h3>\n+";\n+\n+### raw data no\n+open IN,"<$config";\n+my @files;my @marks; my @rawNo;\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\tpush @files,$tmp[0];\n+\t\n+\tmy $no=`less $tmp[0] |wc -l `;\n+\tchomp $no;\n+\tif ($opts{\'format\'} eq "fq" || $opts{\'format\'} eq "fastq") {\n+\t\t$no=$no/4;\n+\t}\n+\telse{\n+\t\t$no=$no/2;\n+\t}\n+\tpush @rawNo,$no;\n+\n+\tpush @marks,$tmp[1];\n+}\n+close IN;\n+\n+### preprocess \n+unless ($prepath=~/\\/$/) {\n+\t$prepath .="/";\n+}\n+\n+my @trimNo;my @collapse;\n+my $collapsefile=$prepath."collapse_reads.fa";\n+open IN,"<$collapsefile";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$trimNo[$i] +=$lng[$i];\n+\t\t\t$collapse[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @cleanR;my @cleanT;\n+my $clean=$prepath."collapse_reads_$opts{min}_$opts{max}.fa";\n+open IN,"<$clean";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$cleanR[$i] +=$lng[$i];\n+\t\t\t$cleanT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th>&nbsp;</th>\n+";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print  OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Raw Reads No. </th>\n+";\n+foreach  (@rawNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Reads No. After Trimed 3\\\' adapter </th>\n+";\n+foreach  (@trimNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Unique Tags No. </th>\n+";\n+foreach  (@collapse) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT  "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Reads No. </th>\n+";\n+foreach  (@cleanR) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Tags No. </th>\n+";\n+foreach  (@cleanT) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\\n</table>";\n+print OUT "<p>\n+Note:<br />\n+The raw data file path is: <b>$files[0]</b><br />\n+";\n+for (my $i=1;$i<@files;$i++) {\n+\tprint OUT "&nbsp;&nbsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;<b>$files[$i]</b><br />";\n+}\n+print OUT "The collapsed file path is: <b>$collapsefile</b><br />\n+The clean data file path is: <b>$clean</b><br />\n+</p>\n+<h2> 1. Sequence length count</h2>\n+<h3> 1.1 Reads length count </h3>\n+";\n+print OUT "\\n";\n+\n+my (%length); my $key="Tags Length";\n+open IN,"<$prepath/reads_length_distribution.txt";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\s*$/);\n+\tif ($aline=~/^Reads/) { $key="Reads Length";}\n+\tmy @tmp=split/\\t/,$aline;\n+\tmy @array=split/\\s/,$tmp[1];\n+\tpush @{$length{$key}},[$tmp[0],@array];\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">";\n+my $hashkey="Reads Length'..b'+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">";\n+$hashkey="Tags Length";\n+foreach  (@{$length{$hashkey}[0]}) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print  OUT "</tr>";\n+\n+for (my $i=1;$i<@{$length{$hashkey}};$i++) {\n+\tprint OUT "<tr align=\\"center\\">\n+\t<th > $length{$hashkey}[$i][0] </th>\n+\t";\n+\tfor(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) {\n+\t\tprint OUT "<td> $length{$hashkey}[$i][$j] </td>\\n";\n+\t}\n+\tprint OUT "</tr>\\n";\n+}\n+\n+print OUT "</table>\\n";\n+\n+print OUT "<h2> 2. Sequence length distribution </h2>";\n+my $length=$prepath."length.html";\n+open IN,"<$length";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tprint OUT "$aline\\n";\n+}\n+\n+#print OUT "<p> Note:<br />The sequence length data: <a href=\\"./$predir/reads_length_distribution.txt\\"> length file</a>\n+#</p>\n+#";\n+\n+\n+\n+\n+####genome map\n+#unless ($genomedir=~/\\/$/) {\n+#\t$genomedir .="/";\n+#}\n+\n+print OUT "<h2>2. Genome Alignment Result</h2>\n+<h3>2.1 Mapping count</h3>\n+";\n+\n+open IN,"<$genomepath/genome_mapped.fa";\n+my (@gread,@gtag);\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @sss=split/_/,$1;\n+\tfor (my $i=0;$i<@sss;$i++) {\n+\t\tif ($sss[$i]>0) {\n+\t\t\t$gread[$i] +=$sss[$i];\n+\t\t\t$gtag[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th>&nbsp;</th>\n+";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print  OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Reads No. </th>\n+";\n+foreach  (@gread) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Reads Percent </th>\n+";\n+\n+for (my $i=0;$i<@gread;$i++) {\n+\tmy $per=sprintf ("%.2f",$gread[$i]/$cleanR[$i]*100);\n+\tprint OUT "<td> $per\\%</td>\\n";\n+}\n+\n+print  OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Tags No. </th>\n+";\n+foreach  (@gtag) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Tags Percent </th>\n+";\n+\n+for (my $i=0;$i<@gtag;$i++) {\n+\tmy $per=sprintf ("%.2f",$gtag[$i]/$cleanT[$i]*100);\n+\tprint OUT "<td> $per\\%</td>\\n";\n+}\n+print OUT "</tr>\\n</table>";\n+print OUT "<p>\n+Note:<br />\n+The genome mapped bwt file path is: <b>$genomedir/genome_mapped.bwt</b><br />\n+The genome mapped FASTA file path is: <b>$genomedir/genome_mapped.fa</b>\n+<br />\n+";\n+\n+\n+\n+#### rfam\n+if(defined $rfampath && $rfampath=~/rfam_match/){\n+chomp $rfampath;\n+@tmp=split/\\//,$rfampath;\n+$rfamdir=$tmp[-1];\n+\n+unless ($rfampath=~/\\/$/) {\n+\t$rfampath .="/";\n+}\n+print OUT "<h2>3. Rfam non-miRNA annotation</h2>\n+<h3>3.1 Reads count</h3>\n+<table border=\\"1\\">\n+<tr align=\\"center\\">\n+";\n+\n+my @rfamR; my @rfamT;\n+my $tag=1;\n+open IN,"<$dir/rfam_non-miRNA_annotation.txt";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t$tag=0 if($aline=~/tags\\s+number/);\n+\tnext if($aline=~/^\\#/);\n+\tnext if($aline=~/^\\s*$/);\n+\tmy @tmp=split/\\s+/,$aline;\n+\tif($tag == 1){push @rfamR,[@tmp];}\n+\telse{push @rfamT,[@tmp];}\n+}\n+close IN;\n+\n+\n+print OUT "<th>RNA Name</th>\\n";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+for (my $i=0;$i<@rfamR;$i++) {\n+\tprint  OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">$rfamR[$i][0]</th>\n+\t";\n+\tfor (my $j=1;$j<@{$rfamR[$i]} ;$j++) {\n+\tprint OUT "<td> $rfamR[$i][$j]</td>\\n";\n+\t}\n+}\n+\n+print OUT "</tr>\\n</table>\n+  <h3>3.2 Tags count</h3>\n+  <table border=\\"1\\">\n+   <tr align=\\"center\\">\n+   <th>RNA Name</th>\\n";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+for (my $i=0;$i<@rfamT;$i++) {\n+\tprint  OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">$rfamT[$i][0]</th>\n+\t";\n+\tfor (my $j=1;$j<@{$rfamT[$i]} ;$j++) {\n+\tprint OUT "<td> $rfamT[$i][$j]</td>\\n";\n+\t}\n+}\n+print OUT "</tr>\\n</table>\n+<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>";\n+print OUT "<b>rfam_mapped.bwt</b></p>\n+";\n+}\n+\n+\n+print OUT "\n+ </BODY>\n+</HTML>\n+";\n+close OUT;\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -o\n+options:\n+-o output file\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 html_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/html_siRNA.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,788 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-5-29\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use File::Basename;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","format=s","o=s","h");\n+if (!(defined $opts{o}  and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$plotpath,$degpath);\n+my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$plotdir,$degdir);\n+open IN,"<$opts{i}";\n+$config=<IN>; chomp $config;\n+$prepath=<IN>; chomp $prepath;\n+$rfampath=<IN>;chomp $rfampath;\n+$genomepath=<IN>; chomp $genomepath;\n+$clusterpath=<IN>; chomp $clusterpath;\n+$annotatepath=<IN>; chomp $annotatepath;\n+$plotpath=<IN>; chomp $plotpath;\n+my $deg_tag=1;\n+if(eof){$deg_tag=0;}\n+else{\n+\t$degpath=<IN>; chomp $degpath;\n+}\n+close IN;\n+my @tmp=split/\\//,$prepath;\n+$predir=$tmp[-1];\n+@tmp=split/\\//,$rfampath;\n+$rfamdir=$tmp[-1];\n+@tmp=split/\\//,$genomepath;\n+$genomedir=$tmp[-1];\n+@tmp=split/\\//,$clusterpath;\n+$clusterdir=$tmp[-1];\n+@tmp=split/\\//,$annotatepath;\n+$annotatedir=$tmp[-1];\n+@tmp=split/\\//,$plotpath;\n+$plotdir=$tmp[-1];\n+\n+my $dir=dirname($opts{\'o\'});\n+\n+open OUT ,">$opts{\'o\'}";\n+print OUT "<HTML>\\n  <HEAD>\\n  <TITLE> Analysis Report </TITLE>\\n </HEAD>\n+ <BODY bgcolor=\\"lightgray\\">\\n  <h1 align=\\"center\\">\\n    <font face=\\"\xba\xda\xcc\xe5\\">\\n\t<b>Small RNA Analysis Report</b>\\n  </font>\\n  </h1>\n+  <h2>1. Sequence No. and quality</h2>\n+  <h3>1.1 Sequece No.</h3>\n+";\n+\n+### raw data no\n+open IN,"<$config";\n+my @files;my @marks; my @rawNo;\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\tpush @files,$tmp[0];\n+\t\n+\tmy $no=`less $tmp[0] |wc -l `;\n+\tchomp $no;\n+\tif ($opts{\'format\'} eq "fq" || $opts{\'format\'} eq "fastq") {\n+\t\t$no=$no/4;\n+\t}\n+\telse{\n+\t\t$no=$no/2;\n+\t}\n+\tpush @rawNo,$no;\n+\n+\tpush @marks,$tmp[1];\n+}\n+close IN;\n+\n+### preprocess \n+unless ($prepath=~/\\/$/) {\n+\t$prepath .="/";\n+}\n+\n+my @trimNo;my @collapse;\n+my $collapsefile=$prepath."collapse_reads.fa";\n+open IN,"<$collapsefile";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$trimNo[$i] +=$lng[$i];\n+\t\t\t$collapse[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @cleanR;my @cleanT;\n+my $clean=$prepath."collapse_reads_18-40.fa";\n+open IN,"<$clean";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$cleanR[$i] +=$lng[$i];\n+\t\t\t$cleanT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @filterR;my @filterT;\n+my $filter=$prepath."collapse_reads_out.fa";\n+open IN,"<$filter";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$filterR[$i] +=$lng[$i];\n+\t\t\t$filterT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th>&nbsp;</th>\n+";\n+foreach  (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print  OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Raw Reads No. </th>\n+";\n+foreach  (@rawNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Reads No. After Trimed 3\\\' adapter </th>\n+";\n+foreach  (@trimNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Unique Tags No. </th>\n+";\n+foreach  (@collapse) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT  "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Reads No. </th>\n+";\n+foreach  (@cleanR) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Tags No. </th>\n+";\n+foreach  (@cleanT) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT  "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Filter Reads No. \\(reads count \\>3\\) </th>\n+";\n+forea'..b'ters number</th>\\n\n+\t";\n+\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Phase</th>\\n\n+\t";\n+\tforeach  (@phase) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Long</th>\\n\n+\t";\n+\tforeach  (@long) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Repeat</th>\\n\n+\t";\n+\tforeach  (@repeat) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat</th>\\n\n+\t";\n+\tforeach  (@nat) {\n+\t\tprint OUT "<td> $_ </td>\\n";\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\tprint OUT "<p>\n+\tRepeat subclass annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Repeat subclass</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\n+\tforeach my $key (sort keys %repeat) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($repeat{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $repeat{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\n+\tprint OUT "<p>\n+\tNat subclass1 annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat subclass1</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tforeach my $key (sort keys %nat1) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($nat1{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $nat1{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\n+\tprint OUT "<p>\n+\tNat subclass2 annotate:\n+\t";\n+\n+\tprint OUT "<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Nat subclass2</th>\\n\n+\t";\n+\tforeach  (@marks) {\n+\t\tprint OUT "<th> $_ </th>\\n";\n+\t}\n+\tforeach my $key (sort keys %nat2) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@marks ;$i++) {\n+\t\t\tif (defined($nat2{$key}[$i])) {\n+\t\t\t\tprint OUT "<td> $nat2{$key}[$i] </td>\\n";\n+\t\t\t}\n+\t\t\telse{print OUT "<td> 0 </td>\\n";}\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+\tprint OUT "<p>\n+\tNote:<br />\n+\tOne cluster mybe annotate to multiple repeats or nats<br />\n+\t";\n+}\n+else {\n+\tprint OUT "<h3>5.2 Cluster source mechanism annotate</h3>\n+\t<br />Do not do source mechanism annotate <br />";\n+\n+}\n+\n+print OUT "<h2>6. Graph of Clusters of all samples</h2> \\n";\n+\n+my $plot=$plotpath."cluster.html";\n+open IN,"<$plot";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tprint OUT "$aline\\n";\n+}\n+close IN;\n+\n+\n+if ($deg_tag) {\n+\tmy $deg_file=`ls $degpath`;\n+\tchomp $deg_file;\n+\tmy @deg_file=split/\\n/,$deg_file;\n+\tmy %deg;\n+\tforeach  (@deg_file) {\n+\t\tmy $output="$degpath/$_/output_score.txt";\n+\t\topen IN,"<$output";\n+\t\t$deg{$_}[0]=0;\n+\t\t$deg{$_}[1]=0;\n+\t\t$deg{$_}[2]=0;\n+\t\twhile (my $aline=<IN>) {\n+\t\t\tnext if ($aline=~/^\\"/);\n+\t\t\tchomp $aline;\n+\t\t\tmy @temp=split/\\t/,$aline;\n+\t\t\tif ($temp[9] eq "TRUE") {\n+\t\t\t\t$deg{$_}[0]++;\n+\t\t\t\tif ($temp[4] >0) {\n+\t\t\t\t\t$deg{$_}[1]++;\n+\t\t\t\t}\n+\t\t\t\tif ($temp[4] <0) {\n+\t\t\t\t\t$deg{$_}[2]++;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\tclose IN;\n+\t}\n+\n+\tprint OUT "<h2>7. DEG</h2>\n+\t<table border=\\"1\\">\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">Genes number</th>\\n\n+\t<th> DEG </th>\\n\n+\t<th> UP </th>\\n\n+\t<th> DOWN </th>\\n\n+\t";\n+\n+\tforeach my $key (sort keys %deg) {\n+\t\tprint  OUT "</tr>\n+\t\t\t<tr align=\\"center\\">\n+\t\t\t<th align=\\"left\\">$key</th>\n+\t\t";\n+\t\tfor (my $i=0;$i<@{$deg{$key}} ;$i++) {\n+\t\t\tprint OUT "<td> $deg{$key}[$i] </td>\\n";\n+\t\t}\n+\t}\n+\tprint OUT "</tr>\\n</table>";\n+}\n+else{\n+\tprint OUT "<h2>7. DEG</h2>\n+\t<br />Do not do DE clusters <br />";\n+}\n+\n+print OUT "\n+ </BODY>\n+</HTML>\n+";\n+close OUT;\n+\n+\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -o\n+options:\n+-i \n+-format\n+-o output file\n+-h help\n+USAGE\n+exit(1);\n+}\n'
b
diff -r 000000000000 -r 87fe81de0931 install_DEG.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/install_DEG.R Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,2 @@
+source("http://bioconductor.org/biocLite.R")
+biocLite("DEGseq")
b
diff -r 000000000000 -r 87fe81de0931 matching.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matching.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,86 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+unless ($fileout=~/\/$/) {$fileout.="/";}
+my $genome=$opts{'g'};
+my $mis=defined $opts{'v'}? $opts{'v'} : 0;
+my $hits=defined $opts{'r'}? $opts{'r'} : 25;
+my $index=defined $opts{'index'} ? $opts{'index'} : "";
+my $threads=defined $opts{'p'} ? $opts{'p'} : 1;
+
+
+my $time=&Time();
+my $mapdir=$fileout."/genome_match";
+if(not -d $mapdir){
+ mkdir $mapdir;
+}
+chdir $mapdir;
+###check genome index
+if (-s $index.".1.ebwt") {
+}else{
+ `bowtie-build $genome $genome`;
+ $index="$genome";
+}
+
+### genome mapping
+`bowtie -v $mis -f -p $threads -m $hits -a --best --strata $index $filein --al genome_mapped.fa --un genome_not_mapped.fa --max genome_mapped_Mlimit.fa > genome_mapped.bwt  2> run.log`;
+
+#`convert_bowtie_to_blast.pl  genome_mapped.bwt genome_mapped.fa $genome > genome_mapped.bst`;
+
+sub Time{
+        my $time=time();
+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+        $month++;
+        $year+=1900;
+        if (length($sec) == 1) {$sec = "0"."$sec";}
+        if (length($min) == 1) {$min = "0"."$min";}
+        if (length($hour) == 1) {$hour = "0"."$hour";}
+        if (length($day) == 1) {$day = "0"."$day";}
+        if (length($month) == 1) {$month = "0"."$month";}
+        #print "$year-$month-$day $hour:$min:$sec\n";
+        return("$year-$month-$day-$hour-$min-$sec");
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file# input reads fasta/fastq file
+-g input file# genome file
+-index file-prefix #(must be indexed by bowtie-build) The parameter
+                string must be the prefix of the bowtie index. For instance, if
+                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+                the prefix is 'h_sapiens_37_asm'.##can be null
+-v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities,default 0;
+
+-p/--threads <int> number of alignment threads to launch (default: 1)
+
+-r int          a read is allowed to map up to this number of positions in the genome
+                default is 25 
+
+-o output directory 
+
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 miRDeep_plant.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/miRDeep_plant.pl Sun Jan 04 02:47:25 2015 -0500
b
b'@@ -0,0 +1,1622 @@\n+#!/usr/bin/perl\n+\n+use warnings;\n+use strict;\n+use Getopt::Std;\n+#use RNA;\n+\n+\n+################################# MIRDEEP #################################################\n+\n+################################## USAGE ##################################################\n+\n+\n+my $usage=\n+"$0 file_signature file_structure temp_out_directory\n+\n+This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with\n+information on the positions of reads aligned to potential precursor sequences (signature).\n+It also takes as input an RNAfold output file, giving information on the sequence, structure\n+and mimimum free energy of the potential precursor sequences.\n+\n+Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA\n+sequences that should be considered for conservation purposes. -t prints out the potential\n+precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that\n+exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors\n+that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences\n+obtained through conventional cloning. -z consider the number of base pairings in the lower\n+stems (this option is not well tested).\n+\n+-h print this usage\n+-s fasta file with known miRNAs\n+#-o temp directory ,maked befor running the program.\n+-t print filtered\n+-u limited output (only ids)\n+-v cut-off (default 1)\n+-x sensitive option for Sanger sequences\n+-y use Randfold\n+-z consider Drosha processing\n+";\n+\n+\n+\n+\n+\n+############################################################################################\n+\n+################################### INPUT ##################################################\n+\n+\n+#signature file in blast_parsed format\n+my $file_blast_parsed=shift or die $usage;\n+\n+#structure file outputted from RNAfold\n+my $file_struct=shift or die $usage;\n+\n+my $tmpdir=shift or die $usage;\n+#options\n+my %options=();\n+getopts("hs:tuv:xyz",\\%options);\n+\n+\n+\n+\n+\n+\n+#############################################################################################\n+\n+############################# GLOBAL VARIABLES ##############################################\n+\n+\n+#parameters\n+my $nucleus_lng=11;\n+\n+my $score_star=3.9;\n+my $score_star_not=-1.3;\n+my $score_nucleus=7.63;\n+my $score_nucleus_not=-1.17;\n+my $score_randfold=1.37;\n+my $score_randfold_not=-3.624;\n+my $score_intercept=0.3;\n+my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);\n+my $score_min=1;\n+if($options{v}){$score_min=$options{v};}\n+if($options{x}){$score_min=-5;}\n+\n+my $e=2.718281828;\n+\n+#hashes\n+my %hash_desc;\n+my %hash_seq;\n+my %hash_struct;\n+my %hash_mfe;\n+my %hash_nuclei;\n+my %hash_mirs;\n+my %hash_query;\n+my %hash_comp;\n+my %hash_bp;\n+\n+#other variables\n+my $subject_old;\n+my $message_filter;\n+my $message_score;\n+my $lines;\n+my $out_of_bound;\n+\n+\n+\n+##############################################################################################\n+\n+################################  MAIN  ###################################################### \n+\n+\n+#print help if that option is used\n+if($options{h}){die $usage;}\n+unless ($tmpdir=~/\\/$/) {$tmpdir .="/";}\n+if(!(-s $tmpdir)){mkdir $tmpdir;}\n+$tmpdir .="TMP_DIR/";\n+mkdir $tmpdir;\n+\n+#parse structure file outputted from RNAfold\n+parse_file_struct($file_struct);\n+\n+#if conservation is scored, the fasta file of known miRNA sequences is parsed\n+if($options{s}){create_hash_nuclei($options{s})};\n+\n+#parse signature file in blast_parsed format and resolve each potential precursor\n+parse_file_blast_parsed($file_blast_parsed);\n+#`rm -rf $tmpdir`;\n+exit;\n+\n+\n+\n+\n+##############################################################################################\n+\n+############################## SUBROUTINES ###################################################\n+\n+\n+\n+sub parse_file_blast_parsed{\n+\n+#    read through the signature blastparsed file, fills up a hash with infor'..b'th\n+#   the n\'t nucleotide on the plus strand\n+\n+#   This subroutine reverses the begin and end positions of positions of the minus strand so that they\n+#   are relative to the 5\' end of the plus strand\t\n+   \n+    my($beg,$end,$lng)=@_;\n+    \n+    my $new_end=$lng-$beg+1;\n+    my $new_beg=$lng-$end+1;\n+    \n+    return($new_beg,$new_end);\n+}\n+\n+sub round {\n+\n+    #rounds to nearest integer\n+   \n+    my($number) = shift;\n+    return int($number + .5);\n+    \n+}\n+\n+\n+sub rev{\n+\n+    #reverses the order of nucleotides in a sequence\n+\n+    my($sequence)=@_;\n+\n+    my $rev=reverse $sequence;   \n+\n+    return $rev;\n+}\n+\n+sub com{\n+\n+    #the complementary of a sequence\n+\n+    my($sequence)=@_;\n+\n+    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;   \n+ \n+    return $sequence;\n+}\n+\n+sub revcom{\n+    \n+    #reverse complement\n+\n+    my($sequence)=@_;\n+\n+    my $revcom=rev(com($sequence));\n+\n+    return $revcom;\n+}\n+\n+\n+sub max2 {\n+\n+    #max of two numbers\n+    \n+    my($a, $b) = @_;\n+    return ($a>$b ? $a : $b);\n+}\n+\n+sub min2  {\n+\n+    #min of two numbers\n+\n+    my($a, $b) = @_;\n+    return ($a<$b ? $a : $b);\n+}\n+\n+\n+\n+sub score_freq{\n+\n+#   scores the count of reads that map to the potential precursor\n+#   Assumes geometric distribution as described in methods section of manuscript\n+\n+    my $freq=shift;\n+\n+    #parameters of known precursors and background hairpins\n+    my $parameter_test=0.999;\n+    my $parameter_control=0.6;\n+\n+    #log_odds calculated directly to avoid underflow\n+    my $intercept=log((1-$parameter_test)/(1-$parameter_control));\n+    my $slope=log($parameter_test/$parameter_control);\n+    my $log_odds=$slope*$freq+$intercept;\n+\n+    #if no strong evidence for 3\' overhangs, limit the score contribution to 0\n+    unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}\n+   \n+    return $log_odds;\n+}\n+\n+\n+\n+##sub score_mfe{\n+\n+#   scores the minimum free energy in kCal/mol of the potential precursor\n+#   Assumes Gumbel distribution as described in methods section of manuscript\n+\n+##    my $mfe=shift;\n+\n+    #numerical value, minimum 1\n+##    my $mfe_adj=max2(1,-$mfe);\n+\n+    #parameters of known precursors and background hairpins, scale and location\n+##    my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n+##    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n+\n+##    my $odds=$prob_test/$prob_background;\n+##    my $log_odds=log($odds);\n+\n+##    return $log_odds;\n+##}\n+\n+sub score_mfe{\n+#\tuse bignum;\n+\n+#   scores the minimum free energy in kCal/mol of the potential precursor\n+#   Assumes Gumbel distribution as described in methods section of manuscript\n+\n+    my ($mfe,$mlng)=@_;\n+\n+    #numerical value, minimum 1\n+    my $mfe_adj=max2(1,-$mfe);\n+my $mfe_adj1=$mfe/$mlng;\n+    #parameters of known precursors and background hairpins, scale and location\n+\tmy $a=1.339e-12;my $b=2.778e-13;my $c=45.834;\n+\tmy $ev=$e**($mfe_adj1*$c);\n+\t#print STDERR "\\n***",$ev,"**\\t",$ev+$b,"\\t";\n+\tmy $log_odds=($a/($b+$ev));\n+\n+\n+\tmy $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n+    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n+\n+    my $odds=$prob_test/$prob_background;\n+    my $log_odds_2=log($odds);\n+\t#print STDERR "log_odds :",$log_odds,"\\t",$log_odds_2,"\\n";\n+   return $log_odds;\n+}\n+\n+\n+\n+sub prob_gumbel_discretized{\n+\n+#   discretized Gumbel distribution, probabilities within windows of 1 kCal/mol\n+#   uses the subroutine that calculates the cdf to find the probabilities\n+\n+    my ($var,$scale,$location)=@_;\n+\n+    my $bound_lower=$var-0.5;\n+    my $bound_upper=$var+0.5;\n+\n+    my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);\n+    my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);\n+\n+    my $prob=$cdf_upper-$cdf_lower;\n+\n+    return $prob;\n+}\n+\n+\n+sub cdf_gumbel{\n+\n+#   calculates the cumulative distribution function of the Gumbel distribution\n+\n+    my ($var,$scale,$location)=@_;\n+\n+    my $cdf=$e**(-($e**(-($var-$location)/$scale)));\n+\n+    return $cdf;\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 miRNA_Express_and_sequence.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/miRNA_Express_and_sequence.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,173 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-6-4
+#Modified:
+#Description: solexa miRNA express and sequence
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h");
+if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'list'};
+my $out=$opts{'fa'};
+my $preout=$opts{'pre'};
+
+=cut
+my %hash_pri;
+open PRI,"<$opts{p}";
+while (my $aline=<PRI>) {
+ chomp $aline;
+ if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;}
+}
+close PRI;
+=cut
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+open FA ,">$out";
+open PRE,">$preout";
+
+print OUT "#ID\tcoordinate\tpos1\tpos2";
+my @marks=split/\,/,$opts{'tag'};
+foreach  (@marks) {
+ print OUT "\t",$_,"_matureExp";
+}
+foreach  (@marks) {
+ print OUT "\t",$_,"_starExp";
+}
+foreach  (@marks) {
+ print OUT "\t",$_,"_totalExp";
+}
+
+print OUT "\n";
+
+my (%uniq_id,$novel);
+while (my $aline=<IN>) {
+ chomp $aline;
+ until ($aline =~ /^score\s+[-\d\.]+/){
+ $aline = <IN>;
+ if (eof) {last;}
+ }
+ if (eof) {last;}
+########## miRNA ID ################
+ $novel++;
+########### annotate####################
+ do {$aline=<IN>;} until($aline=~/flank_first_end/) ;
+ chomp $aline;
+ my @flank1=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/flank_second_beg/) ;
+ chomp $aline;
+ my @flank2=split/\t/,$aline;
+#
+########## mature start loop pre ####
+ do {$aline=<IN>;} until($aline=~/mature_beg/) ;
+ chomp $aline;
+ my @start=split/\t/,$aline;
+# $start[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/mature_end/) ;
+ chomp $aline;
+ my @end=split/\t/,$aline;
+# $end[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/mature_seq/) ;
+ chomp $aline;
+ my @arr1=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pre_seq/) ;
+ chomp $aline;
+ my @arr2=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pri_id/) ;
+ chomp $aline;
+ my @pri_id=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/pri_seq/) ;
+ chomp $aline;
+ my @pri_seq=split/\t/,$aline;
+ do {$aline=<IN>;} until($aline=~/star_beg/) ;
+ chomp $aline;
+ my @star_start=split/\t/,$aline;
+# $star_start[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/star_end/) ;
+ chomp $aline;
+ my @star_end=split/\t/,$aline;
+# $star_end[1] -=$flank1[1];
+ do {$aline=<IN>;} until($aline=~/star_seq/) ;
+ chomp $aline;
+ my @arr3=split/\t/,$aline;
+ print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t";
+ #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t";
+ print FA ">miR-c-$novel\n$arr1[1]\n";
+ print PRE ">miR-c-$novel\n$pri_seq[1]\n";
+########## reads count #############
+ <IN>;
+ my @count1;my @count2;my @count3;my @count4;
+ $aline=<IN>;
+ do {
+ chomp $aline;
+ my @reads=split/\t/,$aline;
+ my @pos=();
+ $reads[5]=~/(\d+)\.\.(\d+)/;
+# $pos[0] =$1-$flank1[1];
+# $pos[1] =$2-$flank1[1];
+ $pos[0]=$1;
+ $pos[1]=$2;
+ $reads[0]=~/:([\d|_]+)_x(\d+)$/;
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss ;$i++) {
+ if (!(defined $count3[$i])) {
+ $count3[$i]=0;
+ }
+ if (!(defined $count4[$i])) {
+ $count4[$i]=0;
+ }
+ $count2[$i]+=$ss[$i];
+
+ }
+# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 );
+# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 );
+# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1);
+# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1);
+ if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 )
+ {
+ for (my $i=0;$i<@ss;$i++) {
+ $count3[$i]+=$ss[$i];
+ }
+ }
+ if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){
+ for (my $i=0;$i<@ss;$i++) {
+ $count4[$i]+=$ss[$i];
+ }
+ } 
+ $aline=<IN>;
+ chomp $aline;
+ } until(length $aline < 1) ;
+ $"="\t";
+ print OUT "@count3\t@count4\t@count2\n";
+ $"=" ";
+}
+
+close IN;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -list -fa -pre -tag
+options:
+-i input file,predictions file
+-list output file miRNA list file
+-fa output file ,miRNA sequence fasta file.
+-pre output file, miRNA precursor fasta file.
+-tag string, sample names# eg: samA,samB,samC
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 microRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,253 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-4-22\n+#Modified:\n+#Description: plant microRNA prediction\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");\n+if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n   The time is $time!\\n";\n+print "Command line:\\n   $0 @ARGV\\n";\n+\n+my  $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/miRNA_out/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$opts{\'i\'};\n+my $data=$opts{\'fa\'};\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $hit=25; # maximum reads mapping hits in genome\n+if (defined $opts{\'r\'}) {$hit=$opts{\'r\'};}\n+\n+my $upstream = 2; # microRNA 5\' extension\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\n+\n+my $downstream = 5;# microRNA 3\' extension\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\n+\n+my $maxd=defined $opts{\'dis\'} ? $opts{\'dis\'} : 200;\n+my $flank=defined $opts{\'flank\'} ? $opts{\'flank\'} :10;\n+my $mfe=defined $opts{\'mfe\'} ? $opts{\'mfe\'} : -20;\n+\n+$time=&Time();\n+print "$time, Checking input file!\\n";\n+\n+my (@filein,@mark);\n+&read_config();\n+\n+&checkfa($opts{pre}) if(defined $opts{pre});\n+&checkfa($opts{mat}) if(defined $opts{mat});\n+&checkfa($opts{gfa});\n+\n+chdir $dir;\n+my $data2=$data;\n+my $known_result=$dir."known_miRNA_Express";\n+if(defined $opts{pre} and defined $opts{mat}){\n+\t&quantify(); ### known microRAN quantify\n+\t$data2=$known_result."/mirbase_not_mapped.fa";\n+}\n+\n+my $genome_map=$dir."genome_match";\n+&genome($data2);\n+\n+#my $genome_map=&search($dir,"genome_match_");\n+my $mapfile=$genome_map."/genome_mapped.bwt";\n+my $mapfa=$genome_map."/genome_mapped.fa";\n+my $unmap=$genome_map."/genome_not_mapped.fa";\n+\n+#$time=Time();\n+#print "$time: Novel microRNA prediction!\\n\\n";\n+\n+&predict($mapfa);\n+\n+$time=Time();\n+print "$time: Program end!!\\n";\n+\n+############################## sub programs ###################################\n+sub predict{\n+\tmy ($file)=@_;\n+\t$time=&Time();\n+\tprint "$time: Novel microRNA prediction!\\n\\n";\n+\tmy $predict=$dir."Novel_miRNA_predict";\n+\tmkdir $predict;\n+\tchdir $predict;\n+\tsystem("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");\n+#\tprint "\\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\\n";\n+\t\n+\tsystem("bowtie-build -f excised_precursor.fa excised_precursor");\n+#\tprint "\\nbowtie-build -f excised_precursor.fa excised_precursor\\n";\n+\t\n+\tsystem("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");\n+#\tprint "\\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\\n";\n+\t\n+\tsystem("perl $scipt_path/convert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");\n+#\tprint "\\nconvert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\\n";\n+\n+\tsystem("sort -k 4  precursor_mapped.bst  > signatures.bst");\n+#\tprint "\\nsort +3 -25  precursor_mapped.bst  > ../signatures.bst\\n";\n+\n+\tchdir $dir;\n+\tsystem("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");\n+#\tpri'..b'ipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");\n+\n+}\n+\n+sub genome{\n+\tmy ($file)=@_;\n+\tif(defined $opts{\'idx\'}){\n+\t\tsystem("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ;\n+#\t\tprint "\\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\\n";\n+\t}else{\n+\t\tsystem("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ;\n+#\t\tprint "\\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\\n";\n+\t}\n+}\n+\n+sub quantify{\n+\tmy $tag=join "\\\\;" ,@mark;\n+\tsystem("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");\n+\tprint "\\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\\n";\n+}\n+\n+sub read_config{\n+\topen CON,"<$config";\n+\twhile (my $aline=<CON>) {\n+\t\tchomp $aline;\n+\t\tmy @tmp=split/\\t/,$aline;\n+\t\tpush @filein,$tmp[0];\n+\t\tpush @mark,$tmp[1];\n+\t\t#&check_rawdata($tmp[0]);\n+\t}\n+\tclose CON;\n+\tif (@filein != @mark) {\n+\t\t#&printErr();\n+\t\tdie "Maybe config file have some wrong!!!\\n";\n+\t}\n+}\n+sub checkfa{\n+\tmy ($file_reads)=@_;\n+\topen N,"<$file_reads";\n+\tmy $line=<N>;\n+\tchomp $line;\n+    if($line !~ /^>\\S+/){\n+        #printErr();\n+        die "The first line of file $file_reads does not start with \'>identifier\'\n+Reads file $file_reads is not a valid fasta file\\n\\n";\n+    }\n+    if(<N> !~ /^[ACGTNacgtn]*$/){\n+        #printErr();\n+        die "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+    }\n+\tclose N;\n+}\n+sub search{\n+\tmy ($dir,$str)=@_;\n+\topendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+\n+$0 -i -fa -gfa -idx -pre -mat -mis -e -f  -t  -o  -path\n+options:\n+-i input files, # config\n+\n+-fa ,#fasta sequence file\n+\n+-path scirpt path\n+\n+-gfa string,  input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-pre string,  input file #species specific microRNA precursor sequences\n+-mat string,  input file #species specific microRNA mature sequences\n+\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\n+-r int       a read is allowed to map up to this number of positions in the genome,default is 25 \n+\n+-dis <int>   Maximal space between miRNA and miRNA* (200)\n+-flank <int>   Flank sequence length of miRNA precursor (10)\n+-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)\n+\n+-t int,    number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 microRNA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA.xml Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,113 @@
+<tool id="micrornas_v1" name="microRNA" veision="1.0.0">
+  <description>Program for plant microRNA analysis(miRNA only) </description>
+
+  <requirements>
+ <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
+    <requirement type="package" version="0.12.7">bowtie</requirement>
+    <requirement type="set_environment">SCRIPT_PATH</requirement>
+    <!--requirement type="package" version="3.0.1">R</requirement!-->
+ <requirement type="package" version="2.59">SVG</requirement>
+ <requirement type="package" version="2.1.8">ViennaRNA</requirement>
+  </requirements>
+
+  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->
+
+  <command interpreter="perl">microRNA.pl 
+   ## Change this to accommodate the number of threads you have available.
+        -t \${GALAXY_SLOTS:-4}
+ -path \$SCRIPT_PATH
+
+
+   ## Do or not annotate known microRNAs
+    #if $mirbase.known_microRNA == "yes":
+ -pre $mirbase.pre -mat $mirbase.mat 
+ #end if
+
+        ## prepare bowtie index
+        #set index_path = ''
+        #if str($reference_genome.source) == "history":
+           ### bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;
+            #set index_path = $reference_genome.own_file
+ -gfa $index_path
+        #else:
+            #set index_path = $reference_genome.index.fields.path
+ -gfa ${index_path}.fa -idx $index_path
+ #end if
+
+
+     -mis $mismatch  -i $config -fa $reads -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe  > run.log
+  </command>
+
+  <inputs>
+ <param name="config" type="data" label="Raw data configs file" />
+ <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />
+
+         <!-- reference genome -->
+        <conditional name="reference_genome">
+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+              <options from_data_table="bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+          </when>
+        </conditional>
+
+ <conditional name="mirbase">
+ <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)">
+   <option value="yes" selected="true">yes</option>
+   <option value="no">no</option>
+  </param>
+  <when value="yes">
+ <param name="mat" type="data" label="mature microRNA sequence file" />
+ <param name="pre" type="data" label="precursor microRNA sequence fie" />
+  </when>
+    </conditional> <!-- params -->
+
+
+
+ <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" />
+ <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />
+ <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />
+ <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />
+ <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />
+ <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />
+ <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" />
+
+  </inputs>
+
+  <outputs>
+   <data format="txt" name="known microRNA express list" from_work_dir="miRNA_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">
+   <filter>(mirbase['known_microRNA'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA express alignment" from_work_dir="miRNA_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">
+   <filter>(mirbase['known_microRNA'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="known microRNA moRs result" from_work_dir="miRNA_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">
+   <filter>(mirbase['known_microRNA'] == 'yes')</filter>
+   </data>
+   <data format="fasta" name="known microRNA precursor file" from_work_dir="miRNA_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">
+   <filter>(mirbase['known_microRNA'] == 'yes')</filter>
+   </data>
+   <data format="fasta" name="known microRNA mature file" from_work_dir="miRNA_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">
+   <filter>(mirbase['known_microRNA'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="novel microRNA express list" from_work_dir="miRNA_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>
+   <data format="fasta" name="novel microRNA precursor file" from_work_dir="miRNA_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>
+   <data format="fasta" name="novel microRNA mature sequence file" from_work_dir="miRNA_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>
+   <data format="fasta" name="non-microRNA sequence FASTA file" from_work_dir="miRNA_out/non_microRNA_sequence.fa" label="${tool.name} on ${on_string}: Sequence FASTA file of non-microRNA tags"/>
+
+  </outputs>
+
+ <help>
+
+ </help>
+ </tool>
b
diff -r 000000000000 -r 87fe81de0931 microRNA_pipeline.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA_pipeline.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,508 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-4-22\n+#Modified:\n+#Description: plant microRNA prediction\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i:s@","tag:s@","phred:i","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");\n+if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n   The time is $time!\\n";\n+print "Command line:\\n   $0 @ARGV\\n";\n+\n+my $format=$opts{\'format\'};\n+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \n+\t#&printErr();\n+\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\n+}\n+\n+my $phred_qv=64;\n+if (defined $opts{\'phred\'}) {\n+\t$phred_qv=$opts{\'phred\'};\n+}\n+\n+my @inputfiles=@{$opts{\'i\'}};\n+my @inputtags=@{$opts{\'tag\'}};\n+\n+my  $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/miRPlant_out/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$dir."/input_config";\n+open CONFIG,">$config";\n+\tfor (my $i=0;$i<@inputfiles;$i++) {\n+\t\tprint CONFIG $inputfiles[$i],"\\t",$inputtags[$i],"\\n";\n+\t}\n+close CONFIG;\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $a="ATCTCGTATG";  #adapter\n+if (defined $opts{\'a\'}) {$a=$opts{\'a\'};}\n+\n+my $m=6;  #adapter minimum mapped nt\n+if (defined $opts{\'M\'}) {$m=$opts{\'M\'};}\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $min_nt=19; # minimum reads length\n+if (defined $opts{\'min\'}) {$min_nt=$opts{\'min\'};}\n+\n+my $max_nt=28; #maximum reads length\n+if (defined $opts{\'max\'}) {$max_nt=$opts{\'max\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $mis_rfam=0;# mismatch number for rfam\n+if (defined $opts{\'v\'}) {$mis_rfam=$opts{\'v\'};}\n+\n+my $hit=25; # maximum reads mapping hits in genome\n+if (defined $opts{\'r\'}) {$hit=$opts{\'r\'};}\n+\n+my $upstream = 2; # microRNA 5\' extension\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\n+\n+my $downstream = 5;# microRNA 3\' extension\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\n+\n+my $maxd=defined $opts{\'dis\'} ? $opts{\'dis\'} : 200;\n+my $flank=defined $opts{\'flank\'} ? $opts{\'flank\'} :10;\n+my $mfe=defined $opts{\'mfe\'} ? $opts{\'mfe\'} : -20;\n+\n+$time=&Time();\n+print "$time, Checking input file!\\n";\n+\n+my (@filein,@mark,@clean);\n+#&read_config();\n+@filein=@inputfiles;\n+@mark=@inputtags;\n+\n+&checkfa($opts{pre});\n+&checkfa($opts{mat});\n+&checkfa($opts{gfa});\n+\n+\n+##### clip adpter --> clean data  start\n+$time=&Time();\n+print "$time, Preprocess:\\n   trim adapter, reads collapse and filter reads by length.\\n";\n+\n+$time=~s/:/-/g;\n+$time=~s/ /-/g;\n+my $preprocess=$dir."preProcess/";\n+mkdir $preprocess;\n+my $can_use_threads = eval \'use threads; 1\';\n+if ($can_use_threads) {\n+# Do processing using threads\n+\tprint "Do processing using threads\\n";\n+\tmy @filein1=@filein; my @mark1=@mark;\n+\twhile (@filein1>0) {\n+\t\tmy @thrs; my @res;\n+\t\tfor (my $i=0;$i<$t ;$i++) {\n+\t\t\tlast if(@filein1==0);\n+\t\t\tmy $in=shift @filein1;\n+\t\t\tmy $out=shift @mark1;\n+\t\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t\t$thrs[$i]=threads->create(\\&clips,$in,$out);\n+\t\t}\n+\t\tfor (my $i=0;$i<@thrs;$i++) {\n+\t\t\t$res[$i]=$thrs[$i]->join();\n+\t\t}\n+\t}\n+} else {\n+# Do not processing using threads\n+\tprint "Do not processing using threads\\n";\n+\tfor (my $i=0;$i<@filein ;$i++) {\n+\t\tmy $in=$filein[$i];\n+\t\tmy $out=$mark[$i];\n+\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t&clips($in,$out);\n+\t}\n+}\n+\n+##### clip adpter --> clean data '..b'opendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+=cut\n+\n+sub printErr{\n+    print STDERR color \'bold red\';\n+    print STDERR "Error: ";\n+    print STDERR color \'reset\';\n+}\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day-$hour-$min-$sec");\n+}\n+=cut\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+\n+$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t  -o  -path\n+options:\n+-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...\n+-tag string # raw data file names, -tag xxx -tag xxx\n+\n+-format string,#specific input rawdata file format : fastq|fq|fasta|fa\n+\n+-path scirpt path\n+\n+-gfa string,  input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-pre string,  input file #species specific microRNA precursor sequences\n+-mat string,  input file #species specific microRNA mature sequences\n+\n+-rfam string,  input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.\n+-idx2 string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-D      If [-D] is specified,will discard rfam mapped reads(nead -rfam).\n+\n+-a string,  ADAPTER string. default is ATCTCGTATG.\n+-M int,    require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don\'t clip it. \n+-min int,  reads min length,default is 19.\n+-max int,  reads max length,default is 28.\n+\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\n+-v <int>     report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\n+-r int       a read is allowed to map up to this number of positions in the genome,default is 25 \n+\n+-dis <int>   Maximal space between miRNA and miRNA* (200)\n+-flank <int>   Flank sequence length of miRNA precursor (10)\n+-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)\n+\n+-t int,    number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 microRNA_pipeline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA_pipeline.xml Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,192 @@\n+<tool id="plant_microRNA_v1" name="microRNA_pipeline" veision="1.0.0">\r\n+  <description>Program for plant microRNA analysis (rawdata preprocess -> genome mapping -> non-coding RNA(exclude miRNAs) annotation -> microRNA analysis)</description>\r\n+\r\n+  <requirements>\r\n+\t<requirement type="package" version="0.0.13">fastx_toolkit </requirement>\r\n+    <requirement type="package" version="0.12.7">bowtie</requirement>\r\n+    <requirement type="set_environment">SCRIPT_PATH</requirement>\r\n+    <!--requirement type="package" version="3.0.1">R</requirement!-->\r\n+\t<requirement type="package" version="1.96">threads</requirement>\r\n+\t<requirement type="package" version="2.59">SVG</requirement>\r\n+\t<!--requirement type="package" version="0.228">parent</requirement-->\r\n+\t<requirement type="package" version="2.1.8">ViennaRNA</requirement>\r\n+  </requirements>\r\n+\r\n+  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->\r\n+\r\n+  <command interpreter="perl">microRNA_pipeline.pl \r\n+   ## Change this to accommodate the number of threads you have available.\r\n+        -t \\${GALAXY_SLOTS:-4}\r\n+\t-path \\$SCRIPT_PATH\r\n+\r\n+    #for $j, $s in enumerate( $series )\r\n+    ##rank_of_series=$j\r\n+    -i ${s.input}\r\n+    -tag ${s.tag}\r\n+    #end for\r\n+\r\n+      ## prepare bowtie index\r\n+      #set index_path = \'\'\r\n+      #if str($reference_genome.source) == "history":\r\n+          #####bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;\r\n+          #set index_path = $reference_genome.own_file\r\n+\t\t  -gfa $index_path \r\n+\t  #else:\r\n+          #set index_path = $reference_genome.index.fields.path\r\n+\t\t  -gfa ${index_path}.fa -idx $index_path \r\n+      #end if\r\n+\r\n+\r\n+   ## Do or not annotate rfam non-miRNA RNAs\r\n+    #if $params.annotate_rfam == "yes":\r\n+\r\n+\t\t  ## prepare Rfam bowtie index\r\n+\t\t  #set rfam_index_path = \'\'\r\n+\t\t  #if str($params.reference_rfam.source) == "history":\r\n+\t\t\t ######## bowtie-build "$params.reference_rfam.own_file" rfam; ln -s "$params.reference_rfam.own_file" rfam.fa;\r\n+\t\t\t  #set rfam_index_path = $params.reference_rfam.own_file\r\n+\t\t\t -rfam $rfam_index_path \r\n+\t\t  #else:\r\n+\t\t\t#set rfam_index_path = $params.reference_rfam.index.fields.path\r\n+\t\t\t-rfam ${rfam_index_path}.fa -idx2 $rfam_index_path \r\n+\t\t  #end if\r\n+\r\n+\t\t-v $params.v \r\n+\t   ## Do or not delete rfam mapped tags\r\n+\t\t#if $params.delete_rfam == "yes":\r\n+\t\t-D \r\n+\t\t#end if\r\n+\t#end if\r\n+\r\n+\r\n+   ## Do or not annotate known microRNAs\r\n+    #if $mirbase.known_microRNA == "yes":\r\n+\t-pre $mirbase.pre -mat $mirbase.mat \r\n+\t#end if\r\n+\r\n+\r\n+    -format $format -phred $phred   -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log\r\n+  </command>\r\n+\r\n+  <inputs>\r\n+\r\n+   <repeat name="series" title="Raw sequence data">\r\n+     <param name="input" type="data" label="Raw data"/>\r\n+     <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>\r\n+   </repeat>\r\n+\r\n+\t<param name="format" type="select" label="Raw data format" multiple="false">\r\n+\t  <option value="fastq">Raw data is fastq. format</option>\r\n+\t  <option value="fasta">Raw data is fasta. format</option>\r\n+\t</param>\r\n+\r\n+\t<param name="phred" type="select" label="Input quals are Phred+64 or Phred+33" multiple="false">\r\n+\t  <option value="64">Phred+64</option>\r\n+\t  <option value="33" selected="true">Phred+33</option>\r\n+\t</param>\r\n+\r\n+        <conditional name="reference_genome">\r\n+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">\r\n+            <option value="indexed">Use a built-in index</option>\r\n+            <option value="history">Use one from the history</option>\r\n+  '..b'e="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/>\r\n+\r\n+\t\t\t<param name="delete_rfam" type="select" label="delet rfam mapped reads">\r\n+\t\t\t  <option value="yes" selected="true">yes</option>\r\n+\t\t\t  <option value="no">no</option>\r\n+\t\t\t</param>\r\n+\t\t </when>\r\n+\r\n+    </conditional> \r\n+\r\n+\t<conditional name="mirbase">\r\n+\t\t<param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)">\r\n+\t\t  <option value="yes" selected="true">yes</option>\r\n+\t\t  <option value="no">no</option>\r\n+\t\t </param>\r\n+\t\t <when value="yes">\r\n+\t\t\t<param name="mat" type="data" label="mature microRNA sequence file" />\r\n+\t\t\t<param name="pre" type="data" label="precursor microRNA sequence file" />\r\n+\t\t </when>\r\n+\r\n+\t</conditional>\r\n+\t\r\n+\r\n+\t<param name="a" type="text" value="TGGAATTCTCGGGTGCCAAGG" label="3\' adapter sequence" />\r\n+\t<param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />\r\n+\t<param name="min" type="integer" value="19" label="minimum microRNA length" />\r\n+\t<param name="max" type="integer" value="28" label="maximum microRNA length" />\r\n+\t<param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to precursors" />\r\n+\t<param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" />\r\n+\t<param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" />\r\n+\t<param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />\r\n+\t<param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" />\r\n+\t<param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" />\r\n+\t<param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" />\r\n+  </inputs>\r\n+\r\n+  <outputs>\r\n+   <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list">\r\n+   <filter>(mirbase[\'known_microRNA\'] == \'yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment">\r\n+   <filter>(mirbase[\'known_microRNA\'] == \'yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result">\r\n+   <filter>(mirbase[\'known_microRNA\'] == \'yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file">\r\n+   <filter>(mirbase[\'known_microRNA\'] == \'yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file">\r\n+   <filter>(mirbase[\'known_microRNA\'] == \'yes\')</filter>\r\n+   </data>\r\n+   <data format="txt" name="novel microRNA express list" from_work_dir="miRPlant_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/>\r\n+   <data format="txt" name="novel microRNA precursor file" from_work_dir="miRPlant_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/>\r\n+   <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRPlant_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>\r\n+   <data format="html" name="analysis result" from_work_dir="miRPlant_out/result.html" label="${tool.name} on ${on_string}: analysis result"/>\r\n+  </outputs>\r\n+\r\n+ <help>\r\n+\r\n+ </help>\r\n+ </tool>\r\n'
b
diff -r 000000000000 -r 87fe81de0931 nibls.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nibls.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,319 @@\n+#!/usr/bin/perl\n+#####################################################################################################\n+#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. \n+#\n+# Dan MacLean\n+# dan.maclean@sainsbury-laboratory.ac.uk\n+#\n+# This program is free academic software; academic and non-profit\n+# users may redistribute it freely.\n+#\n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  \n+#\n+# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007\n+# see included file GPL3.txt\n+#\n+#\n+\n+\n+###Dont forget you will need ... \n+#####################################################################################################\n+# Boost::Graph \n+#Copyright 2005 by David Burdick\n+# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm\n+#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.\n+#####################################################################################################\n+\n+\n+\n+use strict;\n+use warnings;\n+use Boost::Graph;\n+use Getopt::Long;\n+\n+\n+my $usage = "usage: $0 -f GFF_FILE [options]\\n\\n -m minimum inclusion distance (default 5)\\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\\n";\n+\n+my $gff_file ;\n+my $min_inc = 5;\n+my $clus = 0.6;\n+my $buff = 0;\n+my $output_file;\n+my $temp;\n+my $mark;\n+\n+GetOptions(\n+\n+\t\'c=f\'          => \\$clus,\n+\t\'m=i\' => \\$min_inc,\n+\t\'f|file=s\'     => \\$gff_file,\n+\t\'b=i\' => \\$buff,\n+\t\'o=s\' => \\$output_file,\n+\t\'t=s\' => \\$temp,\n+\t\'k=s\' => \\$mark\n+) ;\n+\n+\n+die $usage unless $gff_file;\n+\n+\n+my $starttime = time;\n+warn "started $starttime\\n";\n+\n+## load in data\n+my %molecules; # stores starts and ends of srnas\n+open GFF, "<$gff_file";\n+\n+while (my $entry = <GFF>){\n+\n+\tchomp $entry;\n+\tnext if($entry=~/^\\#/);\n+\tmy @data = split(/\\t/,$entry);\n+\tmy $chr=shift @data;\n+\tmy $strand=shift @data;\n+\tmy $start=shift @data;\n+\tmy $end=shift @data;\n+#\tmy $length1=$end-$start+1;\n+#\tif ($length1>30) {\n+#\t\t$length1=40;\n+#\t}\n+\tmy $total;\n+\tfor (my $s=0;$s<@data ;$s++) {\n+\t\t$total+=$data[$s];\n+\t}\n+\tpush @data,$total;\n+#\tpush @data,$length1;\n+#\tif (defined $molecules{$chr}{$start}{$end}{$strand}) {\n+#\t\tmy @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});\n+#\t\tfor (my $i=0;$i<$#old_data ;$i++) {\n+#\t\t\t$data[$i]+=$old_data[$i];\n+#\t\t}\n+#\t}\n+\tmy $data=join ";",@data;\n+\t$molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information\n+\t#print "$chr\\t$start\\t$end\\n";\n+}\n+\n+close GFF;\n+\n+warn "Data loaded...\\nBuilding graphs and finding loci\\nPlease be patient, this can take a while...\\n";\n+\n+my @sample=split/\\#/,$mark;\n+$mark=join"\\"\\t\\"",@sample;\n+open OUT, ">$output_file";\n+print OUT "\\"Chr\\"\\t\\"MajorLength\\"\\t\\"Percent\\"\\t\\"$mark\\"\\n";\n+open CLUSTER,">$temp";\n+print CLUSTER "\\#Chr\\tMajorLength\\tPercent\\tTagsNumber\\tTagsInfor\\n";\n+foreach my $chromosome (keys %molecules){\n+\tmy $g = new Boost::Graph(directed=>0);\n+\tmy @starts = keys(%{$molecules{$chromosome}} );\n+\t@starts = sort {$a <=> $b} @starts;  \n+\n+\twhile (my $srna_start = shift @starts){   ## work from left most sRNA to right most, add to graph if they close enough\n+\n+\n+\t\tforeach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){\n+\n+\n+\t\t\t###use new graph if the next srna is too far away from this one.. \n+\t\t\tif(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){\n+\n+\n+\t\t\t\t##dump the info from the old graph\n+\t\t\t\tif (scalar(@{$g->get_nodes()}) > 2){\n+\n+\t\t\t\t\tmy $cluster_coeff = get_cc($g);\n+\t\t\t\t\tif ($cluster_coeff >= $clus){\n+\t\t\t\t\t dump_locus($g, $cluster_coeff);\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\n+\n+\t\t\t\t$g = new Boost::Graph(directed=>0);\n+\n+\t\t\t}'..b'start}}){\n+\n+\t\t\t\t\t\tmy $end_node = $chromosome . \':\' . $start . \':\' . $end;\n+\t\t\t\t\t\t$g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>\'1\');\n+\t\t\t\t\t}\n+\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\n+\t\tif (!(defined $starts[0])) {\n+\t\t\t##dump the info from the last graph\n+\t\t\tif (scalar(@{$g->get_nodes()}) > 2){\n+\n+\t\t\t\tmy $cluster_coeff = get_cc($g);\n+\t\t\t\tif ($cluster_coeff >= $clus){\n+\t\t\t\t dump_locus($g, $cluster_coeff);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+\n+warn "Loci printed\\nFinished\\n";\n+\n+my $endtime = time;\n+\n+my $elapsed = $endtime - $starttime;\n+\n+warn "Time elapsed = $elapsed s\\n";\t\n+close OUT;\n+close CLUSTER;\n+#########################################################################################\n+sub get_cc{   ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2\n+\n+\tmy $graph = shift;\n+\n+\tmy @component = @{$graph->get_nodes()}; #number of nodes\n+\tmy @clustering_coefficients;\n+\n+\tforeach my $vertex (@component)\n+\t{\n+\n+\t\tmy @neighbours = @{$graph->neighbors($vertex)};\n+\n+\t\tmy %edges_in_graph;\n+\n+\t\tmy $n = @neighbours; #n = the number of neighbours\n+\t\tmy $k = ($n * ($n - 1))/2;\t#k = total number of possible connections\n+\n+\t\tmy $e= 0; #actual number of connections within sub-graph\n+\n+\t\tforeach my $neighbour (@neighbours)\n+\t\t{\n+\t\t\tforeach my $neighbour_2 (@neighbours)\n+\t\t\t{\n+\t\t\tmy $edge1 = "$neighbour\\t$neighbour_2";\n+\t\t\tmy $edge2 = "$neighbour_2\\t$neighbour";\n+\t\t\t\tunless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})\n+\t\t\t\t{\n+\t\t\t\t\tif ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))\n+\t\t\t\t\t{\n+\t\t\t\t\t\t++$e;\n+\t\t\t\t\t\t$edges_in_graph{$edge1}=1;\n+\t\t\t\t\t\t$edges_in_graph{$edge2}=1;\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\n+\t\tif ($k >= 1)\n+\t\t{\t\n+\t\tmy $c = $e / $k;\n+\t\tpush @clustering_coefficients, $c;\n+\t\t}\n+\t\telse {push @clustering_coefficients, \'0\';}\n+\t}\n+\n+\tmy $graph_n = scalar(@clustering_coefficients);\n+\tmy $graph_cc = 0;\n+\tforeach my $cc (@clustering_coefficients){ \n+\n+\t\t$graph_cc = $graph_cc + $cc;\n+\n+\t}\n+\t$graph_cc = $graph_cc / $graph_n;\n+\n+\treturn $graph_cc;\n+}\n+\n+############################################################################################################\n+\n+sub dump_locus{\n+\n+\tmy $g = shift;\n+\tmy $cc = shift;\n+\tmy $chr;\n+\tmy $start = 1000000000000000000000000000000000000000000000; \n+\tmy $end = -1;\n+\tmy @sample;\n+\tmy @tag;\n+\tforeach my $node (@{$g->get_nodes()}){\n+\n+\t\t$node =~ m/^(\\S+):(\\d+):(\\d+)$/;\n+\t\tmy $c=$1;\n+\t\tmy $s=$2;\n+\t\tmy $e=$3;\n+\t#\tmy @data;\n+\t\tforeach my $str (keys %{$molecules{$c}{$s}{$e}}) {\n+\t\t\tmy @data=split(/;/,$molecules{$c}{$s}{$e}{$str});\n+\t\t\tpush @tag,($s.",".$e.",".$str.",".$data[-1]);\n+#\t\t\tfor (my $i=0;$i<$#old_data ;$i++) {\n+#\t\t\t\t$data[$i]+=$old_data[$i];\n+#\t\t\t}\n+\t\t\tmy $length=$e-$s+1;\n+\t\t\tif ($length>30) {\n+\t\t\t\t$length=40;\n+\t\t\t}\n+\t\t\tpush @data,$length;\n+\t\t\tmy $data=join ";",@data;#sample_exp/total_exp/length;\n+\t\t\tpush @sample,$data;\n+\t\t}\n+\n+\t\t$chr = $c;\n+\t\t$start = $s if $s < $start;\n+\t\t$end = $e if $e > $end;\n+\t}\n+\tmy $tag=join";",@tag;\n+\tmy $tag_number=@tag;\n+\tmy ($max_length,$max_p,@cluster_exp)=Max_length(\\@sample);\n+\tif ($max_length==40) {\n+\t\t$max_length="\\>30";\n+\t}\n+\tmy $cluster_exp=join"\\t",@cluster_exp;\n+\tmy $gff = $chr."\\:$start\\-$end\\t".$max_length."nt\\t".$max_p."\\t" . $cluster_exp;\n+\tprint CLUSTER "$chr\\:$start\\-$end\\t$max_length"."nt\\t$max_p\\t$tag_number\\t$tag\\n";\n+\tprint OUT $gff, "\\n";  \n+}\n+\n+sub Max_length{\n+\tmy @exp=@{$_[0]};\n+\tmy %sample_length;\n+\tmy $total_exp;\n+\tmy @each;\n+\tfor (my $i=0;$i<=$#exp ;$i++) {\n+\t\tmy @tag=split/;/,$exp[$i];\n+\t\tmy $length=pop(@tag);\n+\t\tmy $exp=pop(@tag);\n+\t\t$sample_length{$length}+=$exp;\n+\t\t$total_exp+=$exp;\n+\t\tfor (my $j=0;$j<=$#tag ;$j++) {\n+\t\t\t$each[$j]+=$tag[$j];\n+\t\t}\n+\t}\n+\tmy $max=0;\n+\tmy $max_key;\n+\tforeach my $key (sort keys %sample_length) {\n+\t\tmy $p=$sample_length{$key}/$total_exp;\n+\t\tif ($p>$max) {\n+\t\t\t$max=$p;\n+\t\t\t$max_key=$key;\n+\t\t}\n+\t\t$sample_length{$key}=sprintf("%.2f",$p);\n+\t}\n+\treturn($max_key,$sample_length{$max_key},@each);\n+}\n'
b
diff -r 000000000000 -r 87fe81de0931 non_miRNA_reads.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/non_miRNA_reads.pl Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","fa=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'fa'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+my (%fa,%seq);
+while (my $aline=<IN>){
+ chomp $aline;
+ $aline=~s/^>//;
+ my $seq=<IN>;
+ chomp $seq;
+ #$seq{$seq}=$aline;
+ $fa{$aline}=$seq;
+}
+close IN;
+
+open IN,"<$opts{i}";
+while(my $aline=<IN>){
+ chomp $aline;
+ if($aline=~/^\S+_x\d+/){
+ $aline=~/^(\S+_x\d+)/;
+ my $name=$1;
+ delete($fa{$name});
+ }
+}
+close IN;
+
+open OUT,">$fileout"; #output file  
+foreach my $key (keys %fa) {
+ print OUT ">$key\n$fa{$key}\n";
+}
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 phased_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phased_siRNA.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,254 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+#use Math::Cephes qw(:hypergeometrics); 
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^\#/) {
+ print OUT $aline,"\tp-value\n";
+ next;
+ }
+ my @tmp=split/\t/,$aline;
+ my @pos=split/:|-/,$tmp[0];
+ $tmp[1]=~s/nt//;
+ my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]);
+
+ print OUT $aline,"\t",$pv,"\n";
+}
+close IN;
+close OUT;
+
+sub phase{
+ my ($tagL,$start,$end,$tags)=@_;
+ my @tmp=split/\;/,$tags;
+ my %tag;
+ for (my $i=0;$i<@tmp;$i++) {
+ my @aa=split/\,/,$tmp[$i];
+ next if($aa[1]-$aa[0]+1 != $tagL);
+# $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+");
+# $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-");
+ $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+");
+ $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-");
+ }
+
+ my $pv=&pvalue2(\%tag,$tagL,$start,$end);
+
+ return $pv;
+}
+
+sub pvalue2{
+ my ($tag,$tagL,$start,$end)=@_;
+
+ my $p=1; my $pp=1;
+ foreach my $ccs(keys %{$tag}){
+ my $n=0;
+ my $k=0;
+ my $K=0;
+ my $N=0;
+
+ my $cor= $ccs;
+ my $ss=$cor;
+ my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end;
+
+ my $max=0;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i;
+ if (defined $$tag{$x})
+ {
+ if ($max<$$tag{$x}) {$max=$$tag{$x};}
+ $n +=$$tag{$x};
+ $N++;
+ }
+ }
+ for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i;
+ if (defined $$tag{$x})
+ {
+ $k +=$$tag{$x};
+ $K++;
+ }
+ }
+
+
+ return $p if($K<3);
+ return $p if($max/$n>0.8);
+
+ my $pn=0; 
+ next if($n==$k);
+ $pn=10*$k/($n-$k)+1;
+ $pn = $pn ** ($K-2);
+ $pn = log($pn);
+ if ($p<$pn) {
+ $p=$pn;
+ }
+
+ }
+
+ return $p;
+
+}
+
+sub pvalue{
+ my ($tag,$tagL,$start,$end)=@_;
+
+ my $p=1;
+ foreach my $ccs(keys %{$tag}){
+ my $n=-1;
+ my $k=-1;
+
+ my ($cor, $str)=split(/,/, $ccs);
+ if ($str eq "+") # small RNAs on the Watson strand
+ {
+ my $ss=$cor;
+ my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i.","."+";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i.","."+";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+
+ for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
+ {
+ my $x=$j.","."-";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+
+ for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
+ {
+ my $x=$j.","."-";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+ }
+
+ elsif ($str eq "-") # small RNAs on the Crick strand
+ {
+ my $ee=$cor;
+ my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
+ for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+ {
+ my $x=$i.","."-";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+ {
+ my $x=$i.","."-";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+
+ for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
+ {
+ my $x=$j.","."+";
+ if (defined $$tag{$x})
+ {
+ $n=$n+1;
+ }
+ }
+ for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
+ {
+ my $x=$j.","."+";
+ if (defined $$tag{$x})
+ {
+ $k=$k+1;
+ }
+ }
+ }
+
+ next if($k<3);
+
+ my $pn=0; my $N=$tagL*11*2-1; my $M=21;
+ for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
+ {
+ my $c=1;
+ my $rr=1;
+ my $rw=1;
+
+ for (my $j=0; $j<=$w-1; $j++)
+ {
+ $c=$c*($M-$j)/($j+1);
+ }
+ for (my $x=0; $x<=$n-$w-1; $x++)
+ {
+ $rr=$rr*($N-$M-$x)/($x+1);
+ }
+ for (my $y=0; $y<=$n-1; $y++)
+ {
+ $rw=$rw*($y+1)/($N-$y);
+ }
+ my $pr=$c*$rr*$rw;
+
+ $pn=$pn+$pr;
+ }
+
+ $p=$pn<$p ? $pn :$p;
+
+ if ($p<0.001) #select and output small RNA clusters with p<0.001
+
+ {
+
+ return $p;
+
+ }
+
+ }
+ return $p;
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 preProcess.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preProcess.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,385 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-12-2\n+#Modified:\n+#Description: RNA-seq data pre-process\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");\n+if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n   The time is $time!\\n";\n+print "Command line:\\n   $0 @ARGV\\n";\n+\n+my $format=$opts{\'format\'};\n+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \n+\t#&printErr();\n+\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\n+}\n+\n+my $phred_qv=64;\n+if (defined $opts{\'phred\'}) {$phred_qv=$opts{\'phred\'};}\n+\n+my @inputfiles=@{$opts{\'i\'}};\n+my @inputtags=@{$opts{\'tag\'}};\n+\n+my  $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/preProcess/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$dir."/input_config";\n+open CONFIG,">$config";\n+\tfor (my $i=0;$i<@inputfiles;$i++) {\n+\t\tprint CONFIG $inputfiles[$i],"\\t",$inputtags[$i],"\\n";\n+\t}\n+close CONFIG;\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $a="ATCTCGTATG";  #adapter\n+if (defined $opts{\'a\'}) {$a=$opts{\'a\'};}\n+\n+my $m=6;  #adapter minimum mapped nt\n+if (defined $opts{\'M\'}) {$m=$opts{\'M\'};}\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $min_nt=19; # minimum reads length\n+if (defined $opts{\'min\'}) {$min_nt=$opts{\'min\'};}\n+\n+my $max_nt=28; #maximum reads length\n+if (defined $opts{\'max\'}) {$max_nt=$opts{\'max\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $mis_rfam=0;# mismatch number for rfam\n+if (defined $opts{\'v\'}) {$mis_rfam=$opts{\'v\'};}\n+\n+my (@filein,@mark,@clean);\n+#&read_config();\n+@filein=@inputfiles;\n+@mark=@inputtags;\n+\n+&checkfa($opts{gfa});\n+\n+\n+##### clip adpter --> clean data  start\n+my $preprocess=$dir."preProcess_clean/";\n+mkdir $preprocess;\n+my $can_use_threads = eval \'use threads; 1\';\n+if ($can_use_threads) {\n+# Do processing using threads\n+\tprint "Do processing using threads\\n";\n+\tmy @filein1=@filein; my @mark1=@mark;\n+\twhile (@filein1>0) {\n+\t\tmy @thrs; my @res;\n+\t\tfor (my $i=0;$i<$t ;$i++) {\n+\t\t\tlast if(@filein1==0);\n+\t\t\tmy $in=shift @filein1;\n+\t\t\tmy $out=shift @mark1;\n+\t\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t\t$thrs[$i]=threads->create(\\&clips,$in,$out);\n+\t\t}\n+\t\tfor (my $i=0;$i<@thrs;$i++) {\n+\t\t\t$res[$i]=$thrs[$i]->join();\n+\t\t}\n+\t}\n+} else {\n+# Do not processing using threads\n+\tprint "Do not processing using threads\\n";\n+\tfor (my $i=0;$i<@filein ;$i++) {\n+\t\tmy $in=$filein[$i];\n+\t\tmy $out=$mark[$i];\n+\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t&clips($in,$out);\n+\t}\n+}\n+\n+##### clip adpter --> clean data  end\n+\n+my $collapsed=$preprocess."collapse_reads.fa";\n+my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data\n+&collapse(\\@clean,$collapsed);   #collapse reads to tags\n+\n+&filterbylength();  # filter <$min_nt  && >$max_nt\n+\n+print "The final clean data file is $data, only contains reads which length is among $min_nt\\~$max_nt\\n\\n";\n+\n+my $clean_data=$preprocess."clean_data.fa";\n+system("ln -s $data $clean_data");\n+\n+$time=Time();\n+print "$time: Genome alignment!\\n\\n";\n+my $genome_map=$dir."genome_match";\n+&genome($data);\n+#my $genome_map=&search($dir,"genome_match_");\n+my $mapfile=$genome_map."/genome_mapped.bwt";\n+my $mapfa=$genome_map."/genome_mapped.fa";\n+my $unmap=$genome_map."/genome_not_mapped.fa";\n+\n+chdir $dir;\n+my $pathfile="$dir/path.txt";\n+open PA,">$pathfile";\n+'..b'<$file_reads";\n+\tmy $line=<N>;\n+\tchomp $line;\n+    if($line !~ /^>\\S+/){\n+        #printErr();\n+        die "The first line of file $file_reads does not start with \'>identifier\'\n+Reads file $file_reads is not a valid fasta file\\n\\n";\n+    }\n+    if(<N> !~ /^[ACGTNacgtn]*$/){\n+        #printErr();\n+        die "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+    }\n+\tclose N;\n+}\n+sub checkfq{\n+\tmy ($file_reads)=@_;\n+\n+\topen N,"<$file_reads";\n+\tfor (my $i=0;$i<10;$i++) {\n+\t\tmy $a=<N>;\n+\t\tmy $b=<N>;\n+\t\tmy $c=<N>;\n+\t\tmy $d=<N>;\n+\t\tchomp $a;\n+\t\tchomp $b;\n+\t\tchomp $c;\n+\t\tchomp $d;\n+\t\tif($a!~/^\\@/){\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif($b!~ /^[ACGTNacgtn]*$/){\n+\t\t\t#&printErr();\n+\t\t\tdie "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+\t\t}\n+\t\tif ($c!~/^\\@/ && $c!~/^\\+/) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif ((length $b) != (length $d)) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tmy @qv=split //,$d;\n+\t\tfor (my $j=0;$j<@qv ;$j++) {\n+\t\t\tmy $q=ord($qv[$j])-64;\n+\t\t\tif($q<0){$phred_qv=33;}\n+\t\t}\n+\t}\n+\tclose N;\n+}\n+\n+sub search{\n+\tmy ($dir,$str)=@_;\n+\topendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t  -o  -path\n+options:\n+-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...\n+-tag string # raw data file names, -tag xxx -tag xxx\n+\n+-format string,#specific input rawdata file format : fastq|fq|fasta|fa\n+-phred  int # phred quality number, default is 64\n+\n+-path scirpt path\n+\n+-gfa string,  input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-rfam string,  input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.\n+-idx2 string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-a string,  ADAPTER string. default is ATCTCGTATG.\n+-M int,    require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don\'t clip it. \n+-min int,  reads min length,default is 19.\n+-max int,  reads max length,default is 28.\n+\n+-mis [int]     number of allowed mismatches when mapping reads to genome, default 0\n+-v <int>     report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\n+\n+-t int,    number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 preProcess.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preProcess.xml Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,155 @@
+<tool id="preprocess" name="preProcess" veision="1.0.0">
+  <description>Program for Raw data preprocess analysis, including 3' adapter triming, reads collaping, genome mapping and  rfam non-miRNA analysis </description>
+
+  <requirements>
+ <requirement type="package" version="0.0.13">fastx_toolkit </requirement>
+    <requirement type="package" version="0.12.7">bowtie</requirement>
+    <requirement type="set_environment">SCRIPT_PATH</requirement>
+    <!--requirement type="package" version="3.0.1">R</requirement!-->
+ <requirement type="package" version="1.96">threads</requirement>
+ <requirement type="package" version="2.59">SVG</requirement>
+ <requirement type="package" version="2.1.8">ViennaRNA</requirement>
+  </requirements>
+
+  <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command-->
+
+   <command interpreter="perl">preProcess.pl 
+   ## Change this to accommodate the number of threads you have available.
+        -t \${GALAXY_SLOTS:-4}
+ -path \$SCRIPT_PATH
+
+    #for $j, $s in enumerate( $series )
+    ##rank_of_series=$j
+    -i ${s.input}
+    -tag ${s.tag}
+    #end for
+
+     ## Do or not annotate rfam non-miRNA RNAs
+    #if $nocoding.annotate_rfam == "yes":
+   ## prepare Rfam bowtie index
+   #set rfam_index_path = ''
+   #if str($nocoding.reference_rfam.source) == "history":
+   -rfam $nocoding.reference_rfam.own_file 
+   #else:
+   #set rfam_index_path = $nocoding.reference_rfam.index.fields.path
+       -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path
+   #end if
+  -v $nocoding.v
+ #end if
+
+        ## prepare bowtie index
+        #set index_path = ''
+        #if str($reference_genome.source) == "history":
+            #set index_path = 'genome'
+            -gfa $reference_genome.own_file
+ #else:
+            #set index_path = $reference_genome.index.fields.path
+    -gfa ${index_path}.fa -idx $index_path
+ #end if
+
+    -format $format -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch  > run.log
+  </command>
+
+  <inputs>
+
+   <repeat name="series" title="Raw sequence data">
+     <param name="input" type="data" label="Raw data"/>
+     <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>
+   </repeat>
+
+ <!--param name="input" format="tabular"  type="data" label="input config file" /-->
+
+ <param name="format" type="select" label="Raw data format" multiple="false">
+   <option value="fastq">Raw data is fastq. format</option>
+   <option value="fasta">Raw data is fasta. format</option>
+ </param>
+ <param name="phred" type="select" label="Input quals are Phred+64 or Phred+33" multiple="false">
+   <option value="64">Phred+64</option>
+   <option value="33" selected="true">Phred+33</option>
+ </param>
+
+         <!-- reference genome -->
+        <conditional name="reference_genome">
+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+              <options from_data_table="bowtie_indexes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+          </when>
+        </conditional>
+
+ <!--param name="gfa"  type="data" label="genome sequence fasta file"/-->
+ <!--param type="data" name="index" label="genome sequence bowtie index"/-->
+ <param name="a" type="text" value="TGGAATTCTCGGGTGCCAAGG" label="3' adapter sequence" />
+ <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" />
+ <param name="min" type="integer" value="19" label="Minimum microRNA length" />
+ <param name="max" type="integer" value="28" label="Maximum microRNA length" />
+ <param name="mismatch" type="integer" value="0" label="Number of allowed mismatches when mapping reads to genome" />
+
+ <conditional name="nocoding">
+ <param name="annotate_rfam" type="select" label="Annotate nocoding RNAs(excluding miRNA)">
+   <option value="yes" selected="true">yes</option>
+   <option value="no">no</option>
+  </param>
+  <when value="yes">
+ <!--param name="rfam" type="data" label="rfam sequence file" /-->
+ <conditional name="reference_rfam">
+   <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+ <option value="indexed">Use a built-in index</option>
+ <option value="history">Use one from the history</option>
+   </param>
+   <when value="indexed">
+ <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team">
+   <options from_data_table="rfam_bowtie_indexes">
+ <filter type="sort_by" column="2"/>
+ <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+   </options>
+ </param>
+   </when>
+   <when value="history">
+ <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" />
+   </when>
+ </conditional>
+
+ <param name="v" type="integer" value="0" label="Report end-to-end hits less than v mismatches for non-coding RNA annotation"/>
+  </when>
+    </conditional> 
+
+
+
+  </inputs>
+
+  <outputs>
+   <data format="html" name="preprocess result" from_work_dir="preProcess/preprocessResult.html" label="${tool.name} on ${on_string}: preprocess result"/>
+
+   <data format="txt" name="clean FASTA data" from_work_dir="preProcess/preProcess_clean/clean_data.fa" label="${tool.name} on ${on_string}: clean FASTA data"/>
+
+   <data format="txt" name="genome mapping result" from_work_dir="preProcess/genome_match/genome_mapped.bwt" label="${tool.name} on ${on_string}: genome mapping result"/>
+   <data format="txt" name="genome mapped FASTA reads" from_work_dir="preProcess/genome_match/genome_mapped.fa" label="${tool.name} on ${on_string}: genome mapped FASTA reads"/>
+
+   <data format="txt" name="Rfam mapping result" from_work_dir="preProcess/rfam_match/rfam_mapped.bwt" label="${tool.name} on ${on_string}: Rfam mapping result">
+   <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="Rfam mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_mapped.fa" label="${tool.name} on ${on_string}: Rfam mapped FASTA file">
+   <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="Rfam not mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_not_mapped.fa" label="${tool.name} on ${on_string}: Rfam not mapped FASTA file">
+   <filter>(nocoding['annotate_rfam'] == 'yes')</filter>
+   </data>
+   <data format="txt" name="input config" from_work_dir="preProcess/input_config" label="${tool.name} on ${on_string}: input config"/>
+
+  </outputs>
+
+ <help>
+
+ </help>
+ </tool>
b
diff -r 000000000000 -r 87fe81de0931 preProcess_core.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preProcess_core.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,385 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-12-2\n+#Modified:\n+#Description: RNA-seq data pre-process\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");\n+if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n   The time is $time!\\n";\n+print "Command line:\\n   $0 @ARGV\\n";\n+\n+my $format=$opts{\'format\'};\n+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \n+\t#&printErr();\n+\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\n+}\n+\n+my $phred_qv=64;\n+if (defined $opts{\'phred\'}) {$phred_qv=$opts{\'phred\'};}\n+\n+my @inputfiles=@{$opts{\'i\'}};\n+my @inputtags=@{$opts{\'tag\'}};\n+\n+my  $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/preProcess/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$dir."/input_config";\n+open CONFIG,">$config";\n+\tfor (my $i=0;$i<@inputfiles;$i++) {\n+\t\tprint CONFIG $inputfiles[$i],"\\t",$inputtags[$i],"\\n";\n+\t}\n+close CONFIG;\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $a="ATCTCGTATG";  #adapter\n+if (defined $opts{\'a\'}) {$a=$opts{\'a\'};}\n+\n+my $m=6;  #adapter minimum mapped nt\n+if (defined $opts{\'M\'}) {$m=$opts{\'M\'};}\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $min_nt=19; # minimum reads length\n+if (defined $opts{\'min\'}) {$min_nt=$opts{\'min\'};}\n+\n+my $max_nt=28; #maximum reads length\n+if (defined $opts{\'max\'}) {$max_nt=$opts{\'max\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $mis_rfam=0;# mismatch number for rfam\n+if (defined $opts{\'v\'}) {$mis_rfam=$opts{\'v\'};}\n+\n+my (@filein,@mark,@clean);\n+#&read_config();\n+@filein=@inputfiles;\n+@mark=@inputtags;\n+\n+&checkfa($opts{gfa});\n+\n+\n+##### clip adpter --> clean data  start\n+my $preprocess=$dir."preProcess_clean/";\n+mkdir $preprocess;\n+my $can_use_threads = eval \'use threads; 1\';\n+if ($can_use_threads) {\n+# Do processing using threads\n+\tprint "Do processing using threads\\n";\n+\tmy @filein1=@filein; my @mark1=@mark;\n+\twhile (@filein1>0) {\n+\t\tmy @thrs; my @res;\n+\t\tfor (my $i=0;$i<$t ;$i++) {\n+\t\t\tlast if(@filein1==0);\n+\t\t\tmy $in=shift @filein1;\n+\t\t\tmy $out=shift @mark1;\n+\t\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t\t$thrs[$i]=threads->create(\\&clips,$in,$out);\n+\t\t}\n+\t\tfor (my $i=0;$i<@thrs;$i++) {\n+\t\t\t$res[$i]=$thrs[$i]->join();\n+\t\t}\n+\t}\n+} else {\n+# Do not processing using threads\n+\tprint "Do not processing using threads\\n";\n+\tfor (my $i=0;$i<@filein ;$i++) {\n+\t\tmy $in=$filein[$i];\n+\t\tmy $out=$mark[$i];\n+\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t&clips($in,$out);\n+\t}\n+}\n+\n+##### clip adpter --> clean data  end\n+\n+my $collapsed=$preprocess."collapse_reads.fa";\n+my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data\n+&collapse(\\@clean,$collapsed);   #collapse reads to tags\n+\n+&filterbylength();  # filter <$min_nt  && >$max_nt\n+\n+print "The final clean data file is $data, only contains reads which length is among $min_nt\\~$max_nt\\n\\n";\n+\n+my $clean_data=$preprocess."clean_data.fa";\n+system("ln -s $data $clean_data");\n+\n+$time=Time();\n+print "$time: Genome alignment!\\n\\n";\n+my $genome_map=$dir."genome_match";\n+&genome($data);\n+#my $genome_map=&search($dir,"genome_match_");\n+my $mapfile=$genome_map."/genome_mapped.bwt";\n+my $mapfa=$genome_map."/genome_mapped.fa";\n+my $unmap=$genome_map."/genome_not_mapped.fa";\n+\n+chdir $dir;\n+my $pathfile="$dir/path.txt";\n+open PA,">$pathfile";\n+'..b'<$file_reads";\n+\tmy $line=<N>;\n+\tchomp $line;\n+    if($line !~ /^>\\S+/){\n+        #printErr();\n+        die "The first line of file $file_reads does not start with \'>identifier\'\n+Reads file $file_reads is not a valid fasta file\\n\\n";\n+    }\n+    if(<N> !~ /^[ACGTNacgtn]*$/){\n+        #printErr();\n+        die "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+    }\n+\tclose N;\n+}\n+sub checkfq{\n+\tmy ($file_reads)=@_;\n+\n+\topen N,"<$file_reads";\n+\tfor (my $i=0;$i<10;$i++) {\n+\t\tmy $a=<N>;\n+\t\tmy $b=<N>;\n+\t\tmy $c=<N>;\n+\t\tmy $d=<N>;\n+\t\tchomp $a;\n+\t\tchomp $b;\n+\t\tchomp $c;\n+\t\tchomp $d;\n+\t\tif($a!~/^\\@/){\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif($b!~ /^[ACGTNacgtn]*$/){\n+\t\t\t#&printErr();\n+\t\t\tdie "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+\t\t}\n+\t\tif ($c!~/^\\@/ && $c!~/^\\+/) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif ((length $b) != (length $d)) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tmy @qv=split //,$d;\n+\t\tfor (my $j=0;$j<@qv ;$j++) {\n+\t\t\tmy $q=ord($qv[$j])-64;\n+\t\t\tif($q<0){$phred_qv=33;}\n+\t\t}\n+\t}\n+\tclose N;\n+}\n+\n+sub search{\n+\tmy ($dir,$str)=@_;\n+\topendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+sub Time{\n+        my $time=time();\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+        $month++;\n+        $year+=1900;\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\n+        if (length($min) == 1) {$min = "0"."$min";}\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\n+        if (length($day) == 1) {$day = "0"."$day";}\n+        if (length($month) == 1) {$month = "0"."$month";}\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\n+        return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t  -o  -path\n+options:\n+-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...\n+-tag string # raw data file names, -tag xxx -tag xxx\n+\n+-format string,#specific input rawdata file format : fastq|fq|fasta|fa\n+-phred  int # phred quality number, default is 64\n+\n+-path scirpt path\n+\n+-gfa string,  input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-rfam string,  input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.\n+-idx2 string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+                string must be the prefix of the bowtie index. For instance, if\n+                the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+                the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-a string,  ADAPTER string. default is ATCTCGTATG.\n+-M int,    require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don\'t clip it. \n+-min int,  reads min length,default is 19.\n+-max int,  reads max length,default is 28.\n+\n+-mis [int]     number of allowed mismatches when mapping reads to genome, default 0\n+-v <int>     report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\n+\n+-t int,    number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 precursors.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/precursors.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,858 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2013/7/19\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+#use RNA;\n+\n+my %opts;\n+GetOptions(\\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");\n+if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $checkno=1;\n+my $filein=$opts{\'map\'};\n+my $faout=$opts{\'o\'};\n+my $strout=$opts{\'s\'};\n+my $genome= $opts{\'g\'};\n+\n+my $maxd=defined $opts{\'d\'} ? $opts{\'d\'} : 200;\n+my $flank=defined $opts{\'f\'}? $opts{\'f\'} : 10;\n+\n+my $MAX_ENERGY=-18;\n+if (defined $opts{\'e\'}) {$MAX_ENERGY=$opts{\'e\'};}\n+my $MAX_UNPAIR=5;\n+my $MIN_PAIR=15;\n+my $MAX_SIZEDIFF=4;\n+my $MAX_BULGE=2;\n+my $ASYMMETRY=5;\n+my $MIN_UNPAIR=0;\n+my $MIN_SPACE=5;\n+my $MAX_SPACE=$maxd;\n+my $FLANK=$flank;\n+\n+######### load in genome sequences start ########\n+my %genome;\n+my %lng;\n+my $name;\n+open IN,"<$genome";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\#/);\n+\tif ($aline=~/^>(\\S+)/) {\n+\t\t$name=$1;\n+\t\tnext;\n+\t}\n+\t$genome{$name} .=$aline;\n+}\n+close IN;\n+foreach my $key (keys %genome) {\n+\t$lng{$key}=length($genome{$key});\n+}\n+####### load in genome sequences end ##########\n+\n+my %breaks; ### reads number bigger than 3\n+open IN,"<$filein"; #input file  \n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\t$tmp[0]=~/_x(\\d+)$/;\n+\tmy $no=$1;\n+\tnext if($no<3);\n+\t#my $trand=&find_strand($tmp[9]);\n+\t#my @pos=split/\\.\\./,$tmp[5];\n+\tmy $end=$tmp[3]+length($tmp[4])-1;\n+\tif($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}\n+\tpush @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base\n+}\n+close IN;\n+\n+my %cites; ### peaks\n+foreach my $chr (keys %breaks) {\n+\tforeach my $strand (keys %{$breaks{$chr}}) {\n+\t\tmy @array=@{$breaks{$chr}{$strand}};\n+\t\t@array=sort{$a->[0]<=>$b->[0]} @array;\n+\t\tfor (my $i=0;$i<@array;$i++) {\n+\t\t\tmy $start=$array[$i][0];my $end=$array[$i][1];\n+\t\t\tmy @subarray=();\n+\t\t\tpush @subarray,$array[$i];\n+\n+\t\t\tfor (my $j=$i+1;$j<@array;$j++) {\n+\t\t\t\tif ($start<$array[$j][1] && $end>$array[$j][0]) {   ###overlap\n+\t\t\t\t\tpush @subarray,$array[$j];\n+\t\t\t\t\t($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);\n+\t\t\t\t}\n+\t\t\t\telse{\n+\t\t\t\t\t$i=$j-1;\n+\t\t\t\t\t&find_cites(\\@subarray,$chr,$strand);\n+\t\t\t\t\tlast;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+\n+my %cluster;\n+foreach my $chr (keys %cites) {\n+\tforeach my $strand (keys %{$cites{$chr}}) {\n+\t\tmy @array=@{$sites{$chr}{$strand}};\n+\t\t@array=sort{$a->[0]<=>$b->[0]} @array;\n+\t\tfor (my $i=0;$i<@array;$i++) {\n+\t\t\tmy $start=$array[$i][0];my $end=$array[$i][1];\n+\t\t\tmy @subarray=();\n+\t\t\tpush @subarray,$array[$i];\n+\n+\t\t\tfor (my $j=$i+1;$j<@array;$j++) {\n+\t\t\t\tif ($end>$array[$j][0]-$maxd) {   ###distance less than 200bp\n+\t\t\t\t\tpush @subarray,$array[$j];\n+\t\t\t\t\t($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);\n+\t\t\t\t}\n+\t\t\t\telse{\n+\t\t\t\t\t@{$cluster{$chr}{$strand}{$i}}=@subarray;\n+\t\t\t\t\t$i=$j-1;\n+\t\t\t\t\tlast;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\n+\t}\n+}\n+\n+\n+open FA,">$faout"; #output file \n+open STR,">$strout";\n+foreach my $chr (keys %cluster) {\n+\tforeach my $strand (keys %{$cluster{$chr}}) {\n+\t\tforeach my $no (keys %{$cluster{$chr}{$strand}}) {\n+\t\t\tmy @array2=@{$cluster{$chr}{$strand}{$no}};\n+\t\t\t@array2=sort{$a->[0]<=>$b->[0]} @array2;\n+\t\t\t&excise(\\@array2,$chr,$strand);\n+\t\t}\n+\t}\n+}\n+close FA;\n+close STR;\n+sub oneCiteDn{\n+\tmy ($array,$a,$chr,$strand)=@_;\n+\n+\tmy $ss=$$array[$a][0]-$flank;\n+\t$ss=0 if($ss<0);\n+\tmy $ee=$$array[$a][1]+$maxd+$flank;\n+\t$ee=$lng{$chr} if($ee>$lng{$chr});\n+\t\n+\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n+\tif($strand eq "-"){$seq=revcom($seq);}\n+\n+\tmy $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);\n+\treturn $val;\n+}\n+sub oneCiteUp{\n+\tmy ($array,$a,$chr,$strand)=@_;\n+\n+\tmy $ss=$$array[$a][0]-$maxd-$flank;\n+\t$ss=0 if($ss<0);\n+\tmy $ee=$$array[$a][1]+$flank;\n+\t$ee=$lng{$chr} if($ee>$lng{$chr});\n+\t\n+\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n+\tif($strand eq "-"){$seq=revcom('..b'        s2\n+\n+\tmy $pair_num=0;\n+\tmy $overhang1=0;\n+\tmy $overhang2=0;\n+\tmy ($s1,$e1,$s2,$e2);\n+\tforeach my $i ($beg1..$end1) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t$s1=$i;\n+\t\t\t\t$e2=$j;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tforeach my $i (reverse ($beg1..$end1)) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t$e1=$i;\n+\t\t\t\t$s2=$j;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t}\n+\t}\n+\n+#\tprint "$beg1,$end1 $s1,$e1\\n";\n+#\tprint "$beg2,$end2 $s2,$e2\\n";\n+\n+\tforeach my $i ($beg1..$end1) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif ($j <= $end2 && $j >= $beg2) {\n+\t\t\t\t++$pair_num;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tif (defined $s1 && defined $e2) {\n+\t\t$overhang1=($end2-$e2)-($s1-$beg1);\n+\t}\n+\tif (defined $e1 && defined $s2) {\n+\t\t$overhang2=($end1-$e1)-($s2-$beg2);\n+\t}\n+\t\n+\tif ($pair_num < 13) {\n+\t\t$like_mir_duplex=0;\n+\t}\n+\tif ($overhang1 < 0 && $overhang2 < 0) {\n+\t\t$like_mir_duplex=0;\n+\t}\n+\treturn ($like_mir_duplex,$pair_num,$overhang1,$overhang2);\n+}\n+sub parse_struct {\n+\tmy $struct=shift;\n+\tmy $table=shift;\n+\n+\tmy @t=split(\'\',$struct);\n+\tmy @lbs; # left brackets\n+\tforeach my $k (0..$#t) {\n+\t\tif ($t[$k] eq "(") {\n+\t\t\tpush @lbs, $k+1;\n+\t\t}\n+\t\telsif ($t[$k] eq ")") {\n+\t\t\tmy $lb=pop @lbs;\n+\t\t\tmy $rb=$k+1;\n+\t\t\t$table->{$lb}=$rb;\n+\t\t\t$table->{$rb}=$lb;\n+\t\t}\n+\t}\n+\tif (@lbs) {\n+\t\twarn "unbalanced RNA struct.\\n";\n+\t}\n+}\n+sub which_arm {\n+\tmy $substruct=shift;\n+\tmy $arm;\n+\tif ($substruct=~/\\(/ && $substruct=~/\\)/) {\n+\t\t$arm="-";\n+\t}\n+\telsif ($substruct=~/\\(/) {\n+\t\t$arm="5p";\n+\t}\n+\telse {\n+\t\t$arm="3p";\n+\t}\n+\treturn $arm;\n+}\n+sub biggest_bulge {\n+\tmy $struct=shift;\n+\tmy $bulge_size=0;\n+\tmy $max_bulge=0;\n+\twhile ($struct=~/(\\.+)/g) {\n+\t\t$bulge_size=length $1;\n+\t\tif ($bulge_size > $max_bulge) {\n+\t\t\t$max_bulge=$bulge_size;\n+\t\t}\n+\t}\n+\treturn $max_bulge;\n+}\n+sub get_asy {\n+\tmy($table,$a1,$a2)=@_;\n+\tmy ($pre_i,$pre_j);\n+\tmy $asymmetry=0;\n+\tforeach my $i ($a1..$a2) {\n+\t\tif (defined $table->{$i}) {\n+\t\t\tmy $j=$table->{$i};\n+\t\t\tif (defined $pre_i && defined $pre_j) {\n+\t\t\t\tmy $diff=($i-$pre_i)+($j-$pre_j);\n+\t\t\t\t$asymmetry += abs($diff);\n+\t\t\t}\n+\t\t\t$pre_i=$i;\n+\t\t\t$pre_j=$j;\n+\t\t}\n+\t}\n+\treturn $asymmetry;\n+}\n+\n+sub peaks{\n+\tmy @cluster=@{$_[0]};\n+\n+\treturn if(@cluster<1);\n+\n+\tmy $max=0; my $index=-1;\n+\tfor (my $i=0;$i<@cluster;$i++) {\n+\t\tif($cluster[$i][2]>$max){\n+\t\t\t$max=$cluster[$i][2];\n+\t\t\t$index=$i;\n+\t\t}\n+\t}\n+#\t&excise(\\@cluster,$index,$_[1],$_[2]);\n+\treturn($index);\n+}\n+\n+sub find_cites{\n+\tmy @tmp=@{$_[0]};\n+\tmy $i=&peaks(\\@tmp);\n+\t\n+\tmy $start=$tmp[$i][0];\n+\tmy $total=0; my $node5=0;\n+\tfor (my $j=0;$j<@tmp ;$j++) {\n+\t\t$total+=$tmp[$j][2];\n+\t\t$node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);\n+\t}\n+\tpush @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);\n+}\n+\n+sub newpos{\n+\tmy ($a,$b,$c,$d)=@_;\n+\tmy $s= $a>$c ? $c : $a;\n+\tmy $e=$b>$d ? $b : $d;\n+\treturn($s,$e);\n+}\n+\n+sub rev{\n+\n+    my($sequence)=@_;\n+\n+    my $rev=reverse $sequence;   \n+\n+    return $rev;\n+}\n+\n+sub com{\n+\n+    my($sequence)=@_;\n+\n+    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;   \n+ \n+    return $sequence;\n+}\n+\n+sub revcom{\n+\n+    my($sequence)=@_;\n+\n+    my $revcom=rev(com($sequence));\n+\n+    return $revcom;\n+}\n+\n+sub find_strand{\n+\n+    #A subroutine to find the strand, parsing different blast formats\n+    my($other)=@_;\n+\n+    my $strand="+";\n+\n+    if($other=~/-/){\n+\t$strand="-";\n+    }\n+\n+    if($other=~/minus/i){\n+\t$strand="-";\n+    }\n+\n+    return($strand);\n+}\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -map -g -d -f -o -s -e\n+options:\n+  -map input file# align result # bst. format\n+  -g input file # genome sequence fasta format\n+  -d <int>   Maximal space between miRNA and miRNA* (200)\n+  -f <int>   Flank sequence length of miRNA precursor (10)\n+  -o output file# percursor fasta file\n+  -s output file# precursor structure file\n+  -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)\n+\n+  -h help\n+USAGE\n+exit(1);\n+}\n+\n'
b
diff -r 000000000000 -r 87fe81de0931 quantify.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,502 @@\n+#!/usr/bin/perl -w\r\n+#Filename:\r\n+#Author: Tian Dongmei\r\n+#Email: tiandm@big.ac.cn\r\n+#Date: 2013/7/19\r\n+#Modified:\r\n+#Description: \r\n+my $version=1.00;\r\n+\r\n+use File::Path;\r\n+use strict;\r\n+use File::Basename;\r\n+#use Getopt::Std;\r\n+use Getopt::Long;\r\n+#use RNA;\r\n+\r\n+my %opts;\r\n+GetOptions(\\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h");\r\n+if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments\r\n+&usage;\r\n+}\r\n+\r\n+my $read=$opts{\'r\'};\r\n+my $pre=$opts{\'p\'};\r\n+my $mature=$opts{\'m\'};\r\n+\r\n+my $dir=$opts{\'o\'};\r\n+unless ($dir=~/\\/$/) {$dir .="/";}\r\n+if (not -d $dir) {\r\n+\tmkdir $dir;\r\n+}\r\n+\r\n+my $threads=defined $opts{\'t\'} ? $opts{\'t\'} : 1;\r\n+my $mismatch=defined $opts{\'mis\'} ?  $opts{\'mis\'} : 0;\r\n+\r\n+my $upstream = 2;\r\n+my $downstream = 5;\r\n+\r\n+$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\r\n+$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\r\n+\r\n+my $marks=defined $opts{\'tag\'} ? $opts{\'tag\'} : "";\r\n+\r\n+my $time=Time();\r\n+\r\n+my $tmpdir="${dir}/known_miRNA_Express";\r\n+if(not -d $tmpdir){\r\n+\tmkdir($tmpdir);\r\n+}\r\n+chdir $tmpdir;\r\n+\r\n+`cp $pre ./`;\r\n+my $pre_file_name=basename($pre);\r\n+\r\n+&mapping(); # matures align to precursors && reads align to precursors;\r\n+\r\n+my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;\r\n+&maturePosOnPre(); # acknowledge mature positions on precursor \r\n+\r\n+my %pre_read;\r\n+&readPosOnPre(); # acknowledge reads positions on precursors\r\n+\r\n+if(!(defined $opts{\'tag\'})){\r\n+\tforeach my $key (keys %pre_read) {\r\n+\t\t$pre_read{$key}[0][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @ss=split/_/,$1;\r\n+\t\tfor (my $i=1;$i<=@ss;$i++) {\r\n+\t\t\t$marks .="Smp$i;";\r\n+\t\t}\r\n+\t\tlast;\r\n+\t}\r\n+}\r\n+\r\n+my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."\r\n+&attachPre();\r\n+\r\n+my $preno=scalar (keys %pre);\r\n+print "Total Precursor Number is $preno !!!!\\n";\r\n+\r\n+my %struc; #mature  star  loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;\r\n+&structure();\r\n+\r\n+\r\n+##### analysis and print out  && moRs\r\n+my $aln=$dir."known_microRNA_express.aln";\r\n+my $list=$dir."known_microRNA_express.txt";\r\n+my $moRs=$dir."known_microRNA_express.moRs";\r\n+\r\n+system("ln -s $mature $dir/known_microRNA_mature.fa ");\r\n+system("ln -s $pre $dir/known_microRNA_precursor.fa ");\r\n+\r\n+open ALN,">$aln";\r\n+open LIST,">$list";\r\n+open MORS,">$moRs";\r\n+\r\n+$"="\\t"; ##### @array print in \\t\r\n+\r\n+my @marks=split/\\;/,$marks;\r\n+#print LIST "#matueID\\tpreID\\tpos1\\tpos2\\tmatureExp\\tstarExp\\ttotalExp\\n";\r\n+print LIST "#matueID\\tpreID\\tpos1\\tpos2";\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_matureExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_starExp";\r\n+}\r\n+for (my $i=0;$i<@marks;$i++) {\r\n+\tprint LIST "\\t",$marks[$i],"_totalExp";\r\n+}\r\n+print LIST "\\n";\r\n+print ALN "#>precursor ID \\n#precursor sequence\\n#precursor structure (mfe)\\n#RNA_seq\\t@marks\\ttotal\\n";\r\n+print MORS "#>precursor ID\\tstrand\\texpress_reads\\texpress_reads\\/total_reads\\tblock_number\\tprecursor_sequence\\n#\\tblock_start\\tblock_end\\t@marks\\ttotal\\ttag_number\\tsequence\\n";\r\n+my %moRs;\r\n+\r\n+foreach my $key (keys %pre) {\r\n+\tprint ALN ">$key\\n$pre{$key}\\n$struc{$key}{struc}  ($struc{$key}{mfe})\\n";\r\n+\tnext if(! (exists $pre_read{$key}));\r\n+\tmy @array=@{$pre_read{$key}};\r\n+\t@array=sort{$a->[3]<=> $b->[3]} @array;\r\n+\t\r\n+\tmy $length=length($pre{$key});\r\n+\r\n+\tmy $maxline=-1;my $max=0; ### storage the maxinum express read line\r\n+\tmy $totalReadsNo=0; \r\n+\tmy @not_over=(); ### new read format better for moRs analysis\r\n+\r\n+####print out Aln file start\r\n+\tfor (my $i=0;$i<@array;$i++) {\r\n+\t\tmy $maps=$array[$i][3]+1;\r\n+\t\tmy $mape=$array[$i][3]+length($array[$i][4]);\r\n+\t\tmy $str="";\r\n+\t\t$str .= "." x ($maps-1);\r\n+\t\t$str .=$array[$i][4];\r\n+\t\t$str .="." x ($length-$mape);\r\n+\t\t$str .="        ";\r\n+\r\n+\t\t$array[$i][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n+\t\tmy @sample=split /\\_/,$1;\r\n+\t\tmy $total=$2;\r\n+\t\tprint ALN $str,"@s'..b'b{$i};\r\n+#\t\t\tif($a>$b){\r\n+#\t\t\t\t$startpos=$b-$n+2;\r\n+#\t\t\t\t$endpos=$b-$n+2+($end-$start);\r\n+#\t\t\t}\r\n+#\t\t\tif($a<$b){\r\n+\t\t\t\t$endpos=$b+$n+2;\r\n+\t\t\t\tif($endpos>length($structure)){$endpos=length($structure);}\r\n+\t\t\t\t$startpos=$b+$n+2-($end-$start);\r\n+\t\t\t\tif($startpos<1){$startpos=1;}\r\n+#\t\t\t}\r\n+\t\t\tlast;\r\n+\t\t}\r\n+\t\t$n++;\r\n+\t}\r\n+\treturn ($startpos,$endpos);\r\n+}\r\n+sub attachPre{\r\n+\topen IN, "<$pre_file_name";\r\n+\tmy $name;\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tif ($aline=~/^>(\\S+)/) {\r\n+\t\t\t$name=$1;\r\n+\t\t\tnext;\r\n+\t\t}\r\n+\t\t$pre{$name} .=$aline;\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub readPosOnPre{\r\n+\topen IN,"<read_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $id=lc($tmp[2]);\r\n+\t\tpush @{$pre_read{$tmp[2]}},[@tmp];\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub maturePosOnPre{\r\n+\topen IN,"<mature_mapped.bwt";\r\n+\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tmy $mm=$tmp[0];\r\n+#\t\t$mm=~s/\\-3P|\\-5P//i;\r\n+\t\t$mm=lc($mm);\r\n+\t\tmy $pm=$tmp[2];\r\n+\t\t$pm=lc($pm);\r\n+\r\n+#\t\tnext if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a\r\n+\t\tnext if($mm!~/$pm/);\r\n+#\t\tprint "$tmp[2]\\t$tmp[0]\\n";\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);\r\n+#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;\r\n+\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);\r\n+\t}\r\n+\tclose IN;\r\n+}\r\n+sub mapping{\r\n+    my $err;\r\n+## build bowtie index\r\n+    #print STDERR "building bowtie index\\n";\r\n+    $err = `bowtie-build $pre_file_name miRNA_precursor`;\r\n+\r\n+## map mature sequences against precursors\r\n+    #print STDERR "mapping mature sequences against index\\n";\r\n+\t$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`;\r\n+\r\n+## map reads against precursors\r\n+    #print STDERR "mapping read sequences against index\\n";\r\n+    $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`;\r\n+\r\n+}\r\n+\r\n+sub subseq{\r\n+\tmy $seq=shift;\r\n+\tmy $beg=shift;\r\n+\tmy $end=shift;\r\n+\tmy $strand=shift;\r\n+\t\r\n+\tmy $subseq=substr($seq,$beg-1,$end-$beg+1);\r\n+\tif ($strand eq "-") {\r\n+\t\t$subseq=revcom($subseq);\r\n+\t}\r\n+\treturn uc $subseq;\r\n+}\r\n+\r\n+sub revcom{\r\n+\tmy $seq=shift;\r\n+\t$seq=~tr/ATCGatcg/TAGCtagc/;\r\n+\t$seq=reverse $seq;\r\n+\treturn uc $seq;\r\n+}\r\n+\r\n+sub Time{\r\n+        my $time=time();\r\n+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+        $month++;\r\n+        $year+=1900;\r\n+        if (length($sec) == 1) {$sec = "0"."$sec";}\r\n+        if (length($min) == 1) {$min = "0"."$min";}\r\n+        if (length($hour) == 1) {$hour = "0"."$hour";}\r\n+        if (length($day) == 1) {$day = "0"."$day";}\r\n+        if (length($month) == 1) {$month = "0"."$month";}\r\n+        #print "$year-$month-$day $hour:$min:$sec\\n";\r\n+        return("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+\r\n+sub usage{\r\n+print <<"USAGE";\r\n+Version $version\r\n+Usage:\r\n+$0 -r -p -m -mis -t -e -f -tag -o -time\r\n+mandatory parameters:\r\n+-p precursor.fa  miRNA precursor sequences from miRBase # must be absolute path\r\n+-m mature.fa     miRNA sequences from miRBase # must be absolute path\r\n+-r reads.fa      your read sequences #must be absolute path\r\n+\r\n+-o output directory\r\n+\r\n+options:\r\n+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0\r\n+-t [int]     threads number,default 1\r\n+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2\r\n+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5\r\n+-tag [string] sample marks# eg. sampleA;sampleB;sampleC\r\n+-time sting #make directory time,default is the local time\r\n+-h help\r\n+USAGE\r\n+exit(1);\r\n+}\r\n+\r\n'
b
diff -r 000000000000 -r 87fe81de0931 quantify_siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quantify_siRNA.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,64 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: chentt
+#Email:
+#Date: 2012-4-6
+#Modified:
+#Description:
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","d=s","h");
+if (!(defined $opts{i} and defined $opts{d} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $input=$opts{'i'};
+my $output=$opts{'o'};
+my $depth=$opts{'d'};
+
+open (IN,"<$input")||die"$!";
+open OUT,">$output";
+#my @Total=qw(15797079 18042650 17455254 17295526 18791753 16719596 15150009 18451484 17402501 17729362 19347595 17518516 15699663 16589265 15442892 14012264 14190746 17280260 13213117 12390121 14874304 );
+my @Total=split/\,/,$depth;
+#print OUT "#clusterID\tmajor_length\tpercent\n";
+while (my $aline=<IN>) {
+ chomp $aline;
+ if ($aline=~/^\"/){
+ my @title=split/\t/,$aline;
+ for (my $i=0;$i<@title ;$i++) {
+ $title[$i]=~s/^\"(\S+)\"$/$1/;
+ }
+ my $title=join "\t",@title;
+ print OUT "\#$title\n";
+ next;
+ }
+ my @temp=split/\t/,$aline;
+ print OUT "$temp[0]\t$temp[1]\t$temp[2]";
+ my @id=split/:/,$temp[0];
+ my @posi=split/-/,$id[1];
+ for (my $i=3;$i<@temp;$i++) {
+ my $rpkm=sprintf("%.2f",$temp[$i]/($posi[1]-$posi[0]+1)/$Total[$i-3]*1000000000);
+ print OUT "\t$rpkm";
+ }
+ print OUT "\n";
+}
+close IN;
+close OUT;
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -h
+options:
+-i input cluster file
+-o output file
+-d depth
+-h help
+USAGE
+exit(1);
+}
\ No newline at end of file
b
diff -r 000000000000 -r 87fe81de0931 rfam.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rfam.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,100 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $dir=$opts{'o'};
+unless ($dir=~/\/$/) {$dir.="/";}
+my $rfam=$opts{'ref'};
+my $mis=defined $opts{'v'}? $opts{'v'} : 0;
+my $index=defined $opts{'index'} ? $opts{'index'} : "";
+my $threads=defined $opts{'p'} ? $opts{'p'} : 1;
+
+if (not -d $dir) {
+ mkdir $dir;
+}
+
+
+my $time=Time();
+
+my $mapdir=$dir."/rfam_match";
+if(not -d $mapdir){
+ mkdir $mapdir;
+}
+chdir $mapdir;
+###check genome index
+if (-s $index.".1.ebwt") {
+}else{
+ &checkACGT($rfam);
+ `bowtie-build $rfam $rfam`;
+ $index="$rfam";
+}
+### genome mapping
+`bowtie -v $mis -f -p $threads -k 1 $index $filein --al rfam_mapped.fa --un rfam_not_mapped.fa > rfam_mapped.bwt 2> run.log`;
+
+sub checkACGT{
+ my $string;
+ open IN,"<$rfam";
+ while (my $aline=<IN>) {
+ if ($aline!~/^>/) {
+ $aline=~s/U/T/gi;
+ }
+ $string .=$aline;
+ }
+ close IN;
+ $rfam=basename($rfam);
+ open OUT, ">$rfam";
+ print OUT $string;
+ close OUT;
+}
+
+sub Time{
+        my $time=time();
+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+        $month++;
+        $year+=1900;
+        if (length($sec) == 1) {$sec = "0"."$sec";}
+        if (length($min) == 1) {$min = "0"."$min";}
+        if (length($hour) == 1) {$hour = "0"."$hour";}
+        if (length($day) == 1) {$day = "0"."$day";}
+        if (length($month) == 1) {$month = "0"."$month";}
+        #print "$year-$month-$day $hour:$min:$sec\n";
+        return("$year-$month-$day-$hour-$min-$sec");
+}
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file# input reads fasta/fastq file
+-ref input file# rfam file, which do not contain miRNAs
+-index file-prefix #(must be indexed by bowtie-build) The parameter
+                string must be the prefix of the bowtie index. For instance, if
+                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+                the prefix is 'h_sapiens_37_asm'.##can be null
+-v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities,default 0;
+
+-p/--threads <int> number of alignment threads to launch (default: 1)
+
+-o output directory 
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 sRNA_plot.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRNA_plot.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,411 @@\n+#!/usr/bin/perl -w\n+#==========================================================================================\n+# Date: \n+# Title: \n+# Comment: Program to plot gene structure\n+# Input: 1. \n+#        2. \n+#        3. \n+# Output: output file of gene structure graph by html or svg formt\n+# Test Usage: \n+#========================================================================================\n+#use strict;\n+my $version=1.00;\n+use SVG;\n+use Getopt::Long;\n+my %opt;\n+GetOptions(\\%opt,"g=s","l=s","span=s","c=s","o=s","out=s","cen:s","mark=s","h");\n+if (!( defined $opt{o}) || defined $opt{h}) {\n+&usage;\n+}\n+my $span=$opt{span};\n+#my $sample_cloumn=$opt{n};\n+my $mark=$opt{mark};\n+my @mark=split/\\#/,$mark;\n+my $genelist=$opt{g};\n+#===============================Define Attribute==========================================\n+my %attribute=(\n+\tcanvas=>{\n+\t\t\'width\'=>1500,\n+\t\t\'height\'=>1800\n+\t},\n+\ttext=>{\n+\t\t\'stroke\'=>"#000000",\n+\t\t\'fill\'=>"none",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\tline=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1\n+\t},\n+\tcsv=>{\n+\t\t\'stroke\'=>"red",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\texon=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1\n+\t},\n+\tintron=>{\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>1.5\n+\t},\n+\tfont=>{\n+\t\t\'fill\'=>"#000000",\n+\t\t\'font-size\'=>12,\n+\t\t\'font-size2\'=>10,\n+\t\t#\'font-weight\'=>\'bold\',\n+\t\t\'font-family\'=>"Arial"\n+\t\t#\'font-family\'=>"ArialNarrow-bold"\n+\t},\n+\trect=>{\n+\t\t\'fill\'=>"lightgreen",\n+\t\t\'stroke\'=>"black",\n+\t\t\'stroke-width\'=>0.5\n+\t},\n+\treadwidth=>0.5\n+);\n+#############################s#define start coordinate and scale\n+open(TXT,">$opt{out}");\n+open(LENGTH,"$opt{l}")||die"cannot open the file $opt{l}";\n+my %length;\n+while (my $aline=<LENGTH>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\#/);\n+\tmy @temp=split/\\t/,$aline;\n+\t$temp[0]=~s/^c/C/;\n+\t$length{$temp[0]}=$temp[1];\n+}\n+close LENGTH;\n+#---------------------------------------------------------------\n+open(GENE,"$opt{g}")||die"cannot open the file $opt{g}";\n+my %genelist;\n+while (my $aline=<GENE>) {\n+\tchomp $aline;#LOC_Os01g01280  Chr1    133291  134685  +\n+\tnext if($aline=~/^\\#/);\n+\tmy @temp=split/\\t/,$aline;\n+\tif ($temp[1]=~/^Chr(\\d)$/) {\n+\t\t$temp[1]="Chr0$1";\n+\t}\n+\tpush @{$genelist{$temp[1]}},[$temp[0],$temp[2],$temp[3]];\n+\n+}\n+close GENE;\n+#my %have_gene;\n+#foreach my $chr (sort keys %genelist) {\n+#\tmy @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}};\n+#\tmy $start=$genelist[0][1];\n+#\tmy $end=$genelist[0][2];\n+#\tfor (my $i=0;$i<@genelist ;$i++) {\n+#\t\tif ($gene) {\n+#\t\t}\n+#\t}\n+#}\n+\n+my %gene_desity;\n+foreach my $chr (sort keys %genelist) {\n+\tmy @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}};\n+\tfor (my $i=0;$i<@genelist ;$i++) {\n+\t\tmy $start=int($genelist[$i][1]/$span);\n+\t\tmy $end=int($genelist[$i][2]/$span);\n+\t\t#my @t_rpkm=split/\\t/,$target_rpkm{$genelist[$i][0]};\n+\t\tif ($start==$end) {\n+\t\t\t$gene_desity{$chr}[$start]++;\n+\t\t}\n+\t\telse{\n+\t\t\tfor (my $k=$start;$k<=$end ;$k++) {\n+\t\t\t\t$gene_desity{$chr}[$k]++;\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+#------------------------------------------region_gene_number-------------------------\n+my $max_gene_number=0;\n+my $total=0;\n+foreach my $chr (sort keys %genelist) {\n+\tfor (my $i=0;$i<@{$gene_desity{$chr}} ;$i++) {\n+\t\tif (!(defined($gene_desity{$chr}[$i]))) {\n+\t\t\t$gene_desity{$chr}[$i]=0;\n+\t\t}\n+\t\tif ($gene_desity{$chr}[$i]>$max_gene_number) {\n+\t\t\t$max_gene_number=$gene_desity{$chr}[$i];\n+\t\t\t#print "$gene_desity{$chr}[$i]\\n";\n+\t\t}\n+\t\t#print TXT "$i\\t$gene_desity[$i]\\n";\n+\t\t$total+=$gene_desity{$chr}[$i];\n+\t\t#print "$chr\\t$i\\t$gene_desity{$chr}[$i]\\n";\n+\t}\n+}\n+#print "Gene max:$max_gene_number\\ntotal:$total\\n";\n+\n+#---------------------------------------------------------------\n+my %centromere;\n+if (defined($opt{cen})) {\n+\topen CEN,"$opt{cen}";\n+\twhile (my $aline=<CEN>) {\n+\t\tchomp $aline;\n+\t\tnext if($aline=~/^\\#/);\n+\t\tmy @temp=split/\\t/,$aline;\n+\t\t$temp[0]=~s/^c/C/;\n+\t\t$centromere{$temp[0]}[0]=$temp[1];\n+\t\t$centromere{$temp[0]}[1]=$temp[2];\n+\t}\n+\tclose CEN;\n+}\n+\n+#---------------------------------------------------------------\n+m'..b',$attribute{font}{\'font-family\'},\'-cdata\',$chr);\n+\tmy $m_length=$length{$chr}%1000000;\n+\t$svg->text(\'x\',$chr_end_x+20,\'y\',$chr_start_y,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',12,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$m_length."M");\n+\t\n+\n+\tif (defined($centromere{$chr}[0])) {\n+\t\t$svg->rect(\'x\',$XOFFSET+$centromere{$chr}[0]*$Xscale,\'y\',$chr_start_y-2,\'width\',($centromere{$chr}[1]-$centromere{$chr}[0]+1)*$Xscale,\'height\',5,\'stroke\',"blue",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"blue");\n+\t}\n+\tfor (my $s=0;$s<@mark ;$s++) {\n+\t\tfor (my $i=0;$i<$#{$cluster_density{$chr}}-1 ;$i++) {\n+\t\t\t#if ($cluster_density{$chr}[$i]*$Yscale>40) {\n+\t\t\t\t#$cluster_density{$chr}[$i]=40/$Yscale;\n+\t\t\t\t#$svg->rect(\'x\',$XOFFSET+$i*$span*$Xscale,\'y\',$chr_start_y-45,\'width\',$span*$Xscale,\'height\',5,\'stroke\',"green",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"green");\n+\t\t\t#}\n+\t\t\t#print "$i\\t$cluster_density{$chr}[$i][$s]\\t$cluster_density{$chr}[$i+1][$s]\\n";\n+\t\t\tmy $cluster_density_start_x=$XOFFSET+$i*$span*$Xscale;\n+\t\t\tmy $cluster_density_end_x=$XOFFSET+($i+1)*$span*$Xscale;\n+\t\t\tmy $cluster_density_start_y=$chr_start_y-$cluster_density{$chr}[$i][$s]*$Yscale;\n+\t\t\tmy $cluster_density_end_y=$chr_start_y-$cluster_density{$chr}[$i+1][$s]*$Yscale;\n+\t\t\tmy $c_red=($s+1)/@mark*255;\n+\t\t\t$svg->line(\'x1\',$cluster_density_start_x,\'y1\',$cluster_density_start_y,\'x2\',$cluster_density_end_x,\'y2\',$cluster_density_end_y,\'stroke\',"rgb($c_red,125,0)",\'stroke-width\',0.3);\n+\t\t}\n+\n+\t}\n+\t#=======Y axis\n+\t$svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_start_x,y2=>$chr_start_y-$one_chr_heigth+15,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\t#=======Y axis ===>3 xiaoge\n+\tmy $s10=1;\n+\tmy $e10=0;\n+\tmy $chr_max=$max_all_density;\n+\twhile ($chr_max>10) {\n+\t\t$chr_max=int($chr_max/10);\n+\t\t$s10=$s10*10;\n+\t\t$e10++;\n+\t}\n+\t$chr_max=$chr_max/2;\n+\t#print "*****$max_all_density\\t$chr_max\\t$s10\\n";\n+\tfor (my $i=1;$i<3 ;$i++) {\n+\t\tmy $y1=$chr_start_y-$chr_max*$s10*$Yscale*$i;\n+\t\tmy $xiaoge_Y=$chr_max*$i;\n+\t\t$svg->line(\'x1\',$chr_start_x,\'y1\',$y1,\'x2\',$chr_start_x+3,\'y2\',$y1,\'stroke\',$attribute{line}{\'stroke\'},\'stroke-width\',$attribute{line}{\'stroke-width\'});\n+\t\t$svg->text(\'x\',$chr_start_x-26,\'y\',$y1+4,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',8,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$xiaoge_Y."e".$e10);\n+\t}\n+\t$cc++;\n+}\n+\n+for (my $s=0;$s<@mark ;$s++) {\n+\tmy $c_red=($s+1)/@mark*255;\n+\tprint "**$c_red\\n";\n+\t$svg->line(\'x1\',$canvas_end_x+100,\'y1\',$YOFFSET+$s*20+30,\'x2\',$canvas_end_x+130,\'y2\',$YOFFSET+$s*20+30,\'stroke\',"rgb($c_red,125,0)",\'stroke-width\',1);\n+\t$svg->text(\'x\',$canvas_end_x+150,\'y\',$YOFFSET+$s*20+5+30,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',$mark[$s]);\n+}\n+#\n+#\n+if (defined($opt{cen})) {\n+\t$svg->rect(\'x\',$canvas_end_x+100,\'y\',$YOFFSET+@mark*20+30,\'width\',30,\'height\',5,\'stroke\',"blue",\'stroke-width\',$attribute{intron}{\'stroke-width\'},\'fill\',"blue");\n+\t$svg->text(\'x\',$canvas_end_x+150,\'y\',$YOFFSET+@mark*20+30+5,\'style\',\'fill:black;text-anchor:left\',\'stroke\',$attribute{text}{\'stroke\'},\'stroke-width\',$attribute{text}{\'stroke-width\'},\'font-size\',10,\'font-family\',$attribute{font}{\'font-family\'},\'-cdata\',"centromere");\n+}\n+\n+close TXT;\n+\n+open (OUT,">$opt{o}");\n+print OUT $svg->xmlify();\n+\n+sub log2 {\n+\tmy $n = shift;\n+\treturn log($n)/log(2);\n+}\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 \n+options:\n+-g genelist\n+-span\n+-n sample cloumn\n+-mark sample name\n+-o output graph file name with html or svg extension\n+-c cluster file input\n+-out txt output\n+-l length of chr\n+-cen centromere\n+-h help\n+USAGE\n+exit(1);\n+}\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r 87fe81de0931 sam2Bed_bowtie.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2Bed_bowtie.pl Sun Jan 04 02:47:25 2015 -0500
[
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2011/11/7
+#Modified:
+#Description: sam2BED 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","mark=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+my $mark=$opts{'mark'};
+my @sample=split/\#/,$mark;
+$mark=join"\t",@sample;
+open OUT,">$fileout"; #output file  
+print OUT "#chr\tstrand\tstart\tend\t$mark\n";
+
+open IN,"<$filein"; #input file  
+my $Tags_num=0;
+my @read_num;
+#print OUT "#chr\tstart\tend\tnum\t<=20\t21\t22\t23\t24\t>=25\n";
+while (my $aline=<IN>) {
+ chomp $aline;
+ next if($aline=~/^\@/);
+ my @tmp=split/\t/,$aline;
+ my $strand=$tmp[1];
+ my $start=$tmp[3]+1;
+ my $length=length($tmp[4]);
+ my $end=$start+$length-1;
+ my $hit=$tmp[6]+1;
+ #======express caculate weighted===================================
+ my $exp;
+ my @tempID=split/\:/,$tmp[0];
+ my @exp=split/\_/,$tempID[1];
+ pop @exp;
+ for (my $j=0;$j<@exp ;$j++) {
+ #my @tempID1=split/\=/,$tempID[$j];
+ $exp[$j]=sprintf("%.2f",$exp[$j]/$hit);
+ $read_num[$j]+=$exp[$j];
+ #print OUT "\t$exp";
+ }
+ $exp=join "\t",@exp;
+ print OUT $tmp[2],"\t",$strand,"\t",$start,"\t",$end,"\t",$exp,"\n";
+ $Tags_num++;
+
+}
+print "Total Tags numer: $Tags_num\n";
+my $read_number=join "\t",@read_num;
+print "Each sample numer: $read_number\n";
+close IN;
+close OUT;
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-mark sampleA sampleB sampleC.....
+-o output file
+-h help
+USAGE
+exit(1);
+}
+
b
diff -r 000000000000 -r 87fe81de0931 siRNA.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,402 @@\n+#!/usr/bin/perl -w\r\n+my $version=1.00;\r\n+use strict;\r\n+use warnings;\r\n+use Getopt::Long;\r\n+use Getopt::Std;\r\n+use threads;\r\n+#use threads::shared;\r\n+use Parallel::ForkManager;\r\n+#use lib \'/leofs/biotrans/chentt/perl_module/\';\r\n+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq \r\n+print "\r\n+#####################################\r\n+#                                   # \r\n+#          sRNA cluster             # \r\n+#                                   #\r\n+#####################################\r\n+";\r\n+###########################################################################################\r\n+my $usage="$0\r\n+Options:\r\n+-i input file# fasta\r\n+-config input file \r\n+-g  genome file\r\n+-f  gff file\r\n+\r\n+-o  workdir file\r\n+-path  script path\r\n+-t  int,    number of threads [1]\r\n+-format  fastq, fq, fasta or fa\r\n+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+-mis  int     number of allowed mismatches when mapping reads to genome, default 0\r\n+\r\n+-n  int max hits number,default 25; used in genome alignment\r\n+-d  int distance of tag to merged a cluster; default 100\r\n+-p  cluster method F :conventional default is F\r\n+\t\t\t\t   T :NIBLES \r\n+-l  int the length of the upstream and downstream,default 1000;used in position annotate\r\n+\r\n+-nat  natural antisense transcripts file\r\n+-repeat  repeat information file out of Repeatmasker\r\n+-deg  file config of de sample\r\n+-cen  centromere file input\r\n+-span  plot span, default 50000\r\n+";\r\n+\r\n+my %options;\r\n+GetOptions(\\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h");\r\n+#print help if that option is used\r\n+if($options{h}){die $usage;}\r\n+\r\n+my $filein=$options{\'i\'};\r\n+\r\n+#my $config=$options{\'i\'};\r\n+my $genome_fa=$options{\'g\'};\r\n+my $gff=$options{\'f\'};\r\n+\r\n+\r\n+##########################################################################################\r\n+my $predir=`pwd`;\r\n+chomp $predir;\r\n+my $workdir=defined($options{\'o\'}) ? $options{\'o\'}:$predir;\r\n+\r\n+my $path=$options{\'path\'};\r\n+\r\n+my $t=defined($options{\'t\'})? $options{\'t\'}:1; #threads number\r\n+\r\n+my $mis=defined $options{\'mis\'} ? $options{\'mis\'}:0;\r\n+\r\n+\r\n+my $hit=defined $options{\'n\'}?$options{\'n\'}:25;\r\n+\r\n+my $distance_of_merged_tag=defined $options{\'d\'} ? $options{\'d\'}:100;\r\n+\r\n+my $up_down_dis=defined $options{\'l\'} ?$options{\'l\'}:1000;\r\n+\r\n+my $cluster_mothod=defined $options{\'p\'}?$options{\'p\'}:"F";\r\n+\r\n+my $format=$options{\'format\'};\r\n+#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \r\n+#\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\r\n+#}\r\n+\r\n+\r\n+\r\n+my $sample_number;\r\n+my ($dir,$dir_tmp);\r\n+################################  MAIN  ##################################################\r\n+print "\\ncluster program start:";\r\n+my $time=Time();\r\n+make_dir_tmp();\r\n+\r\n+my $mark;\r\n+my $sample_mark;\r\n+\r\n+my $config=$options{\'config\'};\r\n+my (@filein,@mark);\r\n+&read_config();\r\n+$sample_number=@mark;\r\n+$mark=join "\\t",@mark;\r\n+$sample_mark=join "\\#",@mark;\r\n+\r\n+\r\n+\r\n+my $data3=$filein;   ### rfam not mapped reads\r\n+genome();\r\n+\r\n+my $bed=$d'..b'ing\\n\\n";\r\n+\tmy $deg_file=$options{\'deg\'};\r\n+\topen IN,"<$deg_file";\r\n+\tmy @deg;\r\n+\tmy $s=0;\r\n+\t\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tnext if($aline=~/^\\#/);\r\n+\t\t$deg[$s]=$aline;\r\n+\t\tmy @ea=split/\\s+/,$aline;\r\n+\t\tpush @pairdir,"$ea[0]_VS_$ea[1]\\/";\r\n+\t\t#print "$deg[$s]\\n";\r\n+\t\t$s++;\r\n+\t}\r\n+\tclose IN;\r\n+\t$deg_dir=$dir."deg\\/";\r\n+\tmkdir ("$deg_dir");\r\n+\tmy $max_process = 10;\r\n+\tmy $pm = new Parallel::ForkManager( $max_process );\r\n+\tmy $number=@deg-1;\r\n+\tforeach(0..$number){\r\n+\t\t$pm->start and next;\r\n+\t\t&dec_pel($deg[$_]);\r\n+\t\t$pm->finish;\r\n+\t}\r\n+\t$pm->wait_all_children;\r\n+}\r\n+\r\n+sub dec_pel{\r\n+\tprint "\\n******************\\nstart:\\n";\r\n+\tTime();\r\n+\tmy $sample=shift(@_);\r\n+\tmy @each=split/\\s+/,$sample;\r\n+\tprint "$each[0]\\t$each[1]\\n";\r\n+\tmy $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\\/";\r\n+\tmkdir ("$deg_sample_dir");\r\n+\tprint "read: $read\\n";\r\n+\tprint "deg_sample_dir: $deg_sample_dir\\n";\r\n+\tprint "$id{$each[0]}\\t$each[0]\\n";\r\n+\tprint "$id{$each[1]}\\t$each[1]\\n";\r\n+\tmy $deg=`perl $path\\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2\r\n+\tmy $time2=time();\r\n+\tprint "end:\\n*************************\\n";\r\n+\tTime();\r\n+\tsleep 1;\r\n+}\r\n+\r\n+sub infor_merge{\r\n+\tmy ($input,$mark);\r\n+\tforeach (@pairdir) {\r\n+\t\tprint "@pairdir\\n";\r\n+\t\t$mark.=" -mark $_ ";\r\n+\t\t$input.=" -i $dir/deg\\/$_\\/output_score\\.txt ";\r\n+\t\tprint "$input\\n$mark\\n";\r\n+\t}\r\n+\tmy $infor_merge=`perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result `;\r\n+\tprint "perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result\\n\\n";\r\n+}\r\n+\r\n+sub infor_merge_no_dec{\r\n+\tmy $infor_merge_no_dec=`cp $annotate_dir\\/sample_c_p.anno $dir\\/total.result`;\r\n+}\r\n+\r\n+sub genome_length{\r\n+\tmy $length=`perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length`;\r\n+\tprint "perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length\\n\\n"\r\n+\r\n+}\r\n+\r\n+sub plot{\r\n+\t$plot_dir="$dir\\/plot\\/";\r\n+\tmkdir ("$plot_dir");\r\n+\tmy $span=defined($options{span})?$options{span}:50000;\r\n+\tmy $cen="";\r\n+\tif (defined $options{cen}) {\r\n+\t\t$cen="-cen $options{cen}";\r\n+\t}\r\n+\tmy $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;\r\n+\t"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \\n";\r\n+\r\n+}\r\n+\r\n+sub html{\r\n+\tmy $pathfile="$dir/path.txt";\r\n+\topen PA,">$pathfile";\r\n+\tprint PA "$config\\n";\r\n+\tprint PA "$preprocess\\n";\r\n+\tprint PA "$dir"."rfam_match\\n";\r\n+\tprint PA "$dir"."genome_match\\n";\r\n+\tprint PA "$cluster_file\\n";\r\n+\tprint PA "$annotate_dir\\n";\r\n+\tprint PA "$plot_dir\\n";\r\n+\tif (defined($deg_dir)) {\r\n+\t\t\tprint PA "$deg_dir\\n";\r\n+\t}\r\n+\tclose PA;\r\n+\t#my $html=`perl $path\\/html.pl -i $pathfile -format $format -o $dir/result.html`;\r\n+}\r\n+\r\n+sub Time{\r\n+\tmy $time=time();\r\n+\tmy ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+\t$month++;\r\n+\t$year+=1900;\r\n+\tif (length($sec) == 1) {$sec = "0"."$sec";}\r\n+\tif (length($min) == 1) {$min = "0"."$min";}\r\n+\tif (length($hour) == 1) {$hour = "0"."$hour";}\r\n+\tif (length($day) == 1) {$day = "0"."$day";}\r\n+\tif (length($month) == 1) {$month = "0"."$month";}\r\n+\tprint "$year-$month-$day $hour:$min:$sec\\n";\r\n+\treturn("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+#################################################################################\r\n+sub read_config{\r\n+\topen CON,"<$config";\r\n+\twhile (my $aline=<CON>) {\r\n+\t\tchomp $aline;\r\n+\t\tmy @tmp=split/\\t/,$aline;\r\n+\t\tpush @filein,$tmp[0];\r\n+\t\tpush @mark,$tmp[1];\r\n+\t\t#&check_rawdata($tmp[0]);\r\n+\t}\r\n+\tclose CON;\r\n+\tif (@filein != @mark) {\r\n+\t\t#&printErr();\r\n+\t\tdie "Maybe config file have some wrong!!!\\n";\r\n+\t}\r\n+}\r\n'
b
diff -r 000000000000 -r 87fe81de0931 siRNA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.xml Sun Jan 04 02:47:25 2015 -0500
b
b'@@ -0,0 +1,181 @@\n+<tool id="plant_sirna_v1" name="siRNA" veision="1.0.0">\r\n+  <description>Program for plant siRNA analysis (siRNA only)</description>\r\n+\r\n+  <requirements>\r\n+    <requirement type="set_environment">SCRIPT_PATH</requirement>\r\n+    <requirement type="package" version="0.12.7">bowtie</requirement>\r\n+    <requirement type="package" version="3.0.1">R</requirement>\r\n+\t<requirement type="package" version="0.0.13">fastx_toolkit </requirement>\r\n+\t<requirement type="package" version="1.96">threads</requirement>\r\n+\t<requirement type="package" version="1.06">Parallel-ForkManager</requirement>\r\n+\t<requirement type="package" version="2.59">SVG</requirement>\r\n+\t<requirement type="package" version="1.4_001">Boost-Graph</requirement>\r\n+  </requirements>\r\n+\r\n+  <command interpreter="perl">siRNA.pl \r\n+   ## Change this to accommodate the number of threads you have available.\r\n+        -t \\${GALAXY_SLOTS:-4}\r\n+\r\n+\t-path \\$SCRIPT_PATH\r\n+\r\n+      ## prepare bowtie index\r\n+      #set index_path = \'\'\r\n+      #if str($reference_genome.source) == "history":\r\n+         #### bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa;\r\n+          #set index_path = $reference_genome.own_file\r\n+\t\t-g $index_path \r\n+\r\n+\t  #else:\r\n+          #set index_path = $reference_genome.index.fields.path\r\n+\t\t-g ${index_path}.fa -idx $index_path \r\n+\t  #end if\r\n+\r\n+\r\n+   ## Do or not annotate siRNAs by function\r\n+    #if $params.function_anno == "yes":\r\n+\r\n+      ## prepare bowtie index\r\n+      #set nat_path = \'\'\r\n+      #if str($params.nat_file.source) == "history":\r\n+          #set nat_path = $params.nat_file.nat\r\n+\r\n+\t  #else:\r\n+          #set nat_path = $params.nat_file.index.fields.path\r\n+\t  #end if\r\n+\r\n+      ## prepare bowtie index\r\n+      #set repeat_path = \'\'\r\n+      #if str($params.repeat_file.source) == "history":\r\n+          #set repeat_path = $params.repeat_file.repeat\r\n+\r\n+\t  #else:\r\n+          #set repeat_path = $params.repeat_file.index.fields.path\r\n+\t  #end if\r\n+\r\n+\r\n+\t -nat $nat_path -repeat $repeat_path\r\n+\t#end if\r\n+   \r\n+   ## Do or not DEG\r\n+    #if $degseq.degseq_analysis == "yes" :\r\n+    -deg $degseq.deg\r\n+    #end if\r\n+\r\n+  -i $reads -config $config -n $mapnt -format $format -f $gff -mis $mis -d $d -p $p -l $l  -cen $cen -span $span > run.log\r\n+\r\n+  </command>\r\n+\r\n+  <inputs>\r\n+\r\n+\t<param name="config" type="data" label="Raw data configs file" />\r\n+\t<param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" />\r\n+\r\n+\t<param name="format" type="select" lable=" Data format" multiple="false">\r\n+\t  <option value="fastq">Input data is fastq. format</option>\r\n+\t  <option value="fasta">Input data is fasta. format</option>\r\n+\t</param>\r\n+\t\r\n+\r\n+        <conditional name="reference_genome">\r\n+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">\r\n+            <option value="indexed">Use a built-in index</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="bowtie_indexes">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+            <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />\r\n+          </when>\r\n+        </conditional>\t<!--param type="data" name="index" label="genome sequence bowtie index"/-->\r\n+\r\n+\r\n+\t<param name="gff" type="data" label="gff file" />\r\n+\t<param name="mis" type="integ'..b'd mismatches when mapping reads to genome" />\r\n+\t<param name="mapnt" type="integer" value="25" label="A read is allowed to map up to this number of positions in the genome" />\r\n+\t<param name="d" type="integer" value="100" label="Distance of tag to merged a cluster" />\r\n+\r\n+\t<param name="p" type="select" lable="siRNA cluster method" multiple="false">\r\n+\t  <option value="F">conventional</option>\r\n+\t  <option value="T">NIBLES</option>\r\n+\t</param>\r\n+\t<param name="l" type="integer" value="1000" label="The length of the upstream and downstream,used in position annotate" />\r\n+\r\n+\r\n+\t<conditional name="params">\r\n+\t\t<param name="function_anno" type="select" label="Do or not annotate siRNAs by function">\r\n+\t\t  <option value="no" selected="true">no</option>\r\n+\t\t  <option value="yes">yes</option>\r\n+\t\t </param>\r\n+\t\t <when value="yes">\r\n+\r\n+\r\n+        <conditional name="nat_file">\r\n+          <param name="source" type="select" label="Will you select a atural antisense transcripts file from your history ?" help="down load from ***">\r\n+            <option value="indexed">Use a built-in file</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a atural antisense transcripts file" help="If your species of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="nat_annotate">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No files are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+            <param name="nat" type="data" format="txt" label="atural antisense transcripts file" />\r\n+\t\t  </when>\r\n+        </conditional>\t<!--param type="data" name="index" label="genome sequence bowtie index"/-->\r\n+\r\n+        <conditional name="repeat_file">\r\n+          <param name="source" type="select" label="Will you select a repeat information file from your history ?" help="down load from ***">\r\n+            <option value="indexed">Use a built-in file</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a repeat information file" help="If your species of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="repeat_annotate">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No files are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+\t\t\t<param name="repeat" type="data" label="repeat information file out of Repeatmasker" />\r\n+\t\t  </when>\r\n+        </conditional>\t<!--param type="data" name="index" label="genome sequence bowtie index"/-->\r\n+\t\t </when>\r\n+    </conditional> <!-- params -->\r\n+\t\r\n+\t<param name="cen" type="data" label="centromere file input" />\r\n+\t<param name="span" type="integer" value="50000" label="plot span" />\r\n+\t\n+    <conditional name="degseq">\n+       <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">\n+\t\t  <option value="no" selected="true">no</option>\n+\t\t  <option value="yes">yes</option>\n+       </param>\n+       <when value="yes">\n+\t\t\t<param name="deg" type="data" label="file config of de sample" />\n+        </when>\r\n+    </conditional>\n+\r\n+\t\r\n+\t</inputs>\r\n+\r\n+  <outputs>\r\n+   <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>\r\n+   <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>\r\n+\r\n+  </outputs>\r\n+\r\n+ <help>\r\n+\r\n+ </help>\r\n+ </tool>\r\n'
b
diff -r 000000000000 -r 87fe81de0931 siRNA_pipeline.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA_pipeline.pl Sun Jan 04 02:47:25 2015 -0500
[
b'@@ -0,0 +1,524 @@\n+#!/usr/bin/perl -w\r\n+my $version=1.00;\r\n+use strict;\r\n+use warnings;\r\n+use Getopt::Long;\r\n+use Getopt::Std;\r\n+use threads;\r\n+#use threads::shared;\r\n+use Parallel::ForkManager;\r\n+#use lib \'/leofs/biotrans/chentt/perl_module/\';\r\n+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq \r\n+print "\r\n+#####################################\r\n+#                                   # \r\n+#          sRNA cluster             # \r\n+#                                   #\r\n+#####################################\r\n+";\r\n+###########################################################################################\r\n+my $usage="$0\r\n+Options:\r\n+-i input file# raw data file\r\n+-tag string #raw data sample name\r\n+-g  genome file\r\n+-f  gff file\r\n+\r\n+-o  workdir file\r\n+-path  script path\r\n+-t  int,    number of threads [1]\r\n+-format  fastq, fq, fasta or fa\r\n+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+-mis  int     number of allowed mismatches when mapping reads to genome, default 0\r\n+-rfam  string,  input file# rfam database file.\r\n+-idx2  string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\r\n+\t\tstring must be the prefix of the bowtie index. For instance, if\r\n+\t\tthe first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\r\n+\t\tthe prefix is \'h_sapiens_37_asm\'.##can be null\r\n+\r\n+-v  int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\r\n+\r\n+-a string,  ADAPTER string. default is ATCTCGTATG.\r\n+-n  int max hits number,default 25; used in genome alignment\r\n+-d  int distance of tag to merged a cluster; default 100\r\n+-p  cluster method F :conventional default is F\r\n+\t\t\t\t   T :NIBLES \r\n+-l  int the length of the upstream and downstream,default 1000;used in position annotate\r\n+\r\n+-nat  natural antisense transcripts file\r\n+-repeat  repeat information file out of Repeatmasker\r\n+-deg  file config of de sample\r\n+-cen  centromere file input\r\n+-span  plot span, default 50000\r\n+";\r\n+\r\n+my %options;\r\n+GetOptions(\\%options,"i:s@","tag:s@","g=s","phred:i","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");\r\n+#print help if that option is used\r\n+if($options{h}){die $usage;}\r\n+\r\n+my @filein=@{$options{\'i\'}};\r\n+my @mark=@{$options{\'tag\'}};\r\n+\r\n+#my $config=$options{\'i\'};\r\n+my $genome_fa=$options{\'g\'};\r\n+my $gff=$options{\'f\'};\r\n+\r\n+\r\n+##########################################################################################\r\n+my $predir=`pwd`;\r\n+chomp $predir;\r\n+my $workdir=defined($options{\'o\'}) ? $options{\'o\'}:$predir;\r\n+\r\n+my $path=$options{\'path\'};\r\n+\r\n+my $t=defined($options{\'t\'})? $options{\'t\'}:1; #threads number\r\n+\r\n+my $mis=defined $options{\'mis\'} ? $options{\'mis\'}:0;\r\n+\r\n+my $mis_rfam=defined $options{\'v\'} ? $options{\'v\'}:0;\r\n+\r\n+my $hit=defined $options{\'n\'}?$options{\'n\'}:25;\r\n+\r\n+my $distance_of_merged_tag=defined $options{\'d\'} ? $options{\'d\'}:100;\r\n+\r\n+my $up_down_dis=defined $options{\'l\'} ?$options{\'l\'}:1000;\r\n+\r\n+my $cluster_mothod=defined $options{\'p\'}?$options{\'p\'}:"F";\r\n+\r\n+my $format=$options{\'format\'};\r\n+#if ($format ne "fastq" &'..b' $dir\\/ref\\/genelist.txt -d $up_down_dis -o $annotate_dir\\/sample_c_p.anno\\n\\n";\r\n+\treturn 0;\r\n+}\r\n+sub get_genelist{\r\n+\r\n+\tmy $get_genelist=`perl $path\\/get_genelist.pl -i $gff -o $dir\\/ref\\/genelist.txt`;\r\n+\tprint "perl $path\\/get_genelist.pl -i $gff -o $dir\\/ref\\/genelist.txt";\r\n+}\r\n+\r\n+sub dec{\r\n+\tprint "deg reading\\n\\n";\r\n+\tmy $deg_file=$options{\'deg\'};\r\n+\topen IN,"<$deg_file";\r\n+\tmy @deg;\r\n+\tmy $s=0;\r\n+\t\twhile (my $aline=<IN>) {\r\n+\t\tchomp $aline;\r\n+\t\tnext if($aline=~/^\\#/);\r\n+\t\t$deg[$s]=$aline;\r\n+\t\tmy @ea=split/\\s+/,$aline;\r\n+\t\tpush @pairdir,"$ea[0]_VS_$ea[1]\\/";\r\n+\t\t#print "$deg[$s]\\n";\r\n+\t\t$s++;\r\n+\t}\r\n+\tclose IN;\r\n+\t$deg_dir=$dir."deg\\/";\r\n+\tmkdir ("$deg_dir");\r\n+\tmy $max_process = 10;\r\n+\tmy $pm = new Parallel::ForkManager( $max_process );\r\n+\tmy $number=@deg-1;\r\n+\tforeach(0..$number){\r\n+\t\t$pm->start and next;\r\n+\t\t&dec_pel($deg[$_]);\r\n+\t\t$pm->finish;\r\n+\t}\r\n+\t$pm->wait_all_children;\r\n+}\r\n+\r\n+sub dec_pel{\r\n+\tprint "\\n******************\\nstart:\\n";\r\n+\tTime();\r\n+\tmy $sample=shift(@_);\r\n+\tmy @each=split/\\s+/,$sample;\r\n+\tprint "$each[0]\\t$each[1]\\n";\r\n+\tmy $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\\/";\r\n+\tmkdir ("$deg_sample_dir");\r\n+\tprint "read: $read\\n";\r\n+\tprint "deg_sample_dir: $deg_sample_dir\\n";\r\n+\tprint "$id{$each[0]}\\t$each[0]\\n";\r\n+\tprint "$id{$each[1]}\\t$each[1]\\n";\r\n+\tmy $deg=`perl $path\\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2\r\n+\tmy $time2=time();\r\n+\tprint "end:\\n*************************\\n";\r\n+\tTime();\r\n+\tsleep 1;\r\n+}\r\n+\r\n+sub infor_merge{\r\n+\tmy ($input,$mark);\r\n+\tforeach (@pairdir) {\r\n+\t\tprint "@pairdir\\n";\r\n+\t\t$mark.=" -mark $_ ";\r\n+\t\t$input.=" -i $dir/deg\\/$_\\/output_score\\.txt ";\r\n+\t\tprint "$input\\n$mark\\n";\r\n+\t}\r\n+\tmy $infor_merge=`perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result `;\r\n+\tprint "perl $path\\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\\/sample_c_p.anno -n $sample_number -o $dir\\/total.result\\n\\n";\r\n+}\r\n+\r\n+sub infor_merge_no_dec{\r\n+\tmy $infor_merge_no_dec=`cp $annotate_dir\\/sample_c_p.anno $dir\\/total.result`;\r\n+}\r\n+\r\n+sub genome_length{\r\n+\tmy $length=`perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length`;\r\n+\tprint "perl $path\\/count_ref_length.pl -i $genome_fa -o $dir\\/ref\\/genome\\.length\\n\\n"\r\n+\r\n+}\r\n+\r\n+sub plot{\r\n+\t$plot_dir="$dir\\/plot\\/";\r\n+\tmkdir ("$plot_dir");\r\n+\tmy $span=defined($options{span})?$options{span}:50000;\r\n+\tmy $cen="";\r\n+\tif (defined $options{cen}) {\r\n+\t\t$cen="-cen $options{cen}";\r\n+\t}\r\n+\tmy $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;\r\n+\t"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \\n";\r\n+\r\n+}\r\n+\r\n+sub html{\r\n+\tmy $pathfile="$dir/path.txt";\r\n+\topen PA,">$pathfile";\r\n+\tprint PA "$config\\n";\r\n+\tprint PA "$preprocess\\n";\r\n+\tprint PA "$dir"."rfam_match\\n";\r\n+\tprint PA "$dir"."genome_match\\n";\r\n+\tprint PA "$cluster_file\\n";\r\n+\tprint PA "$annotate_dir\\n";\r\n+\tprint PA "$plot_dir\\n";\r\n+\tif (defined($deg_dir)) {\r\n+\t\t\tprint PA "$deg_dir\\n";\r\n+\t}\r\n+\tclose PA;\r\n+\tmy $html=`perl $path\\/html_siRNA.pl -i $pathfile -format $format -o $dir/result.html`;\r\n+}\r\n+\r\n+sub Time{\r\n+\tmy $time=time();\r\n+\tmy ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n+\t$month++;\r\n+\t$year+=1900;\r\n+\tif (length($sec) == 1) {$sec = "0"."$sec";}\r\n+\tif (length($min) == 1) {$min = "0"."$min";}\r\n+\tif (length($hour) == 1) {$hour = "0"."$hour";}\r\n+\tif (length($day) == 1) {$day = "0"."$day";}\r\n+\tif (length($month) == 1) {$month = "0"."$month";}\r\n+\tprint "$year-$month-$day $hour:$min:$sec\\n";\r\n+\treturn("$year-$month-$day-$hour-$min-$sec");\r\n+}\r\n+#################################################################################\r\n'
b
diff -r 000000000000 -r 87fe81de0931 siRNA_pipeline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA_pipeline.xml Sun Jan 04 02:47:25 2015 -0500
b
b'@@ -0,0 +1,221 @@\n+<tool id="plant_sirna_pipeline_v1" name="siRNA_pipeline" veision="1.0.0">\r\n+  <description>Program for plant siRNA analysis (rawdata preprocess -> genome alignment -> non-coding annotate ->  siRNA analysis)</description>\r\n+\r\n+  <requirements>\r\n+    <requirement type="set_environment">SCRIPT_PATH</requirement>\r\n+    <requirement type="package" version="0.12.7">bowtie</requirement>\r\n+    <requirement type="package" version="3.0.1">R</requirement>\r\n+\t<requirement type="package" version="0.0.13">fastx_toolkit </requirement>\r\n+\t<requirement type="package" version="1.96">threads</requirement>\r\n+\t<requirement type="package" version="1.06">Parallel-ForkManager</requirement>\r\n+\t<requirement type="package" version="2.59">SVG</requirement>\r\n+\t<requirement type="package" version="1.4_001">Boost-Graph</requirement>\r\n+  </requirements>\r\n+\r\n+  <command interpreter="perl">siRNA_pipeline.pl \r\n+   ## Change this to accommodate the number of threads you have available.\r\n+        -t \\${GALAXY_SLOTS:-4}\r\n+\r\n+\t-path \\$SCRIPT_PATH\r\n+\r\n+    #for $j, $s in enumerate( $series )\r\n+    ##rank_of_series=$j\r\n+    -i ${s.input}\r\n+    -tag ${s.tag}\r\n+    #end for\r\n+\r\n+      ## prepare bowtie index\r\n+      #set index_path = \'\'\r\n+      #if str($reference_genome.source) == "history":\r\n+          #set index_path = $reference_genome.own_file\r\n+\t\t-g  $index_path \r\n+\t  #else:\r\n+          #set index_path = $reference_genome.index.fields.path\r\n+\t\t-g ${index_path}.fa -idx $index_path \r\n+\t  #end if\r\n+\r\n+\r\n+\t  ## prepare Rfam bowtie index\r\n+\t  #set rfam_index_path = \'\'\r\n+\t  #if str($reference_rfam.source) == "history":\r\n+\t\t  #set rfam_index_path = $reference_rfam.own_file\r\n+\t\t-rfam $rfam_index_path -v $v \r\n+\t  #else:\r\n+\t\t  #set rfam_index_path = $reference_rfam.index.fields.path\r\n+\t\t-rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v \r\n+\t  #end if\r\n+\r\n+\r\n+\r\n+   ## Do or not annotate siRNAs by function\r\n+    #if $params.function_anno == "yes":\r\n+\r\n+      ## prepare bowtie index\r\n+      #set nat_path = \'\'\r\n+      #if str($params.nat_file.source) == "history":\r\n+          #set nat_path = $params.nat_file.nat\r\n+\r\n+\t  #else:\r\n+          #set nat_path = $params.nat_file.index.fields.path\r\n+\t  #end if\r\n+\r\n+      ## prepare bowtie index\r\n+      #set repeat_path = \'\'\r\n+      #if str($params.repeat_file.source) == "history":\r\n+          #set repeat_path = $params.repeat_file.repeat\r\n+\r\n+\t  #else:\r\n+          #set repeat_path = $params.repeat_file.index.fields.path\r\n+\t  #end if\r\n+\r\n+\r\n+\t -nat $nat_path -repeat $repeat_path\r\n+\t#end if\r\n+   \r\n+   ## Do or not DEG\r\n+    #if $degseq.degseq_analysis == "yes" :\r\n+    -deg $degseq.deg\r\n+    #end if\r\n+\r\n+  -format $format -phred $phred -f $gff -mis $mis -a $a -n $mapnt -d $d -p $p -l $l  -cen $cen -span $span > run.log\r\n+\r\n+  </command>\r\n+\r\n+  <inputs>\r\n+\r\n+   <repeat name="series" title="Raw sequence data file">\r\n+     <param name="input" type="data" label="Raw data file"/>\r\n+     <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/>\r\n+   </repeat>\r\n+\r\n+\t<param name="format" type="select" label="raw data format" multiple="false">\r\n+\t  <option value="fastq">Raw data is fastq. format</option>\r\n+\t  <option value="fasta">Raw data is fasta. format</option>\r\n+\t</param>\r\n+\t\r\n+\t<param name="phred" type="select" label="input quals are Phred+64 or Phred+33" multiple="false">\r\n+\t  <option value="64">Phred+64</option>\r\n+\t  <option value="33" selected="true">Phred+33</option>\r\n+\t</param>\r\n+\r\n+        <conditional name="reference_genome">\r\n+          <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">\r\n+            <option value="indexed">Use a built-in index</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a reference genome" help="'..b'am name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" />\r\n+\t<param name="d" type="integer" value="100" label="distance of tag to merged a cluster" />\r\n+\r\n+\t<param name="p" type="select" label="siRNA cluster identify method" multiple="false">\r\n+\t  <option value="F">conventional</option>\r\n+\t  <option value="T">NIBLES</option>\r\n+\t</param>\r\n+\t<param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" />\r\n+\r\n+\r\n+\t<conditional name="params">\r\n+\t\t<param name="function_anno" type="select" label="Do or not annotate siRNAs by function">\r\n+\t\t  <option value="no" selected="true">no</option>\r\n+\t\t  <option value="yes">yes</option>\r\n+\t\t </param>\r\n+\t\t <when value="yes">\r\n+\r\n+\r\n+        <conditional name="nat_file">\r\n+          <param name="source" type="select" label="Will you select a atural antisense transcripts file from your history ?" help="down load from ***">\r\n+            <option value="indexed">Use a built-in file</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a atural antisense transcripts file" help="If your species of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="nat_annotate">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No files are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+            <param name="nat" type="data" format="txt" metadata_name="dbkey" label="atural antisense transcripts file" />\r\n+\t\t  </when>\r\n+        </conditional>\t<!--param type="data" name="index" label="genome sequence bowtie index"/-->\r\n+\r\n+        <conditional name="repeat_file">\r\n+          <param name="source" type="select" label="Will you select a repeat information file from your history ?" help="down load from ***">\r\n+            <option value="indexed">Use a built-in file</option>\r\n+            <option value="history">Use one from the history</option>\r\n+          </param>\r\n+          <when value="indexed">\r\n+            <param name="index" type="select" label="Select a repeat information file" help="If your species of interest is not listed, contact the Galaxy team">\r\n+              <options from_data_table="repeat_annotate">\r\n+                <filter type="sort_by" column="2"/>\r\n+                <validator type="no_options" message="No files are available for the selected input dataset"/>\r\n+              </options>\r\n+            </param>\r\n+          </when>\r\n+          <when value="history">\r\n+\t\t\t<param name="repeat" type="data" metadata_name="dbkey" label="repeat information file out of Repeatmasker" />\r\n+\t\t  </when>\r\n+        </conditional>\t<!--param type="data" name="index" label="genome sequence bowtie index"/-->\r\n+\t\t </when>\r\n+    </conditional> <!-- params -->\r\n+\t\r\n+\t<param name="cen" type="data" label="centromere file input" />\r\n+\t<param name="span" type="integer" value="50000" label="plot span" />\r\n+\t\n+    <conditional name="degseq">\n+       <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters">\n+\t\t  <option value="no" selected="true">no</option>\n+\t\t  <option value="yes">yes</option>\n+       </param>\n+       <when value="yes">\n+\t\t\t<param name="deg" type="data" label="file config of de sample" />\n+        </when>\r\n+    </conditional>\n+    \n+  </inputs>\r\n+\r\n+  <outputs>\r\n+   <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/>\r\n+   <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/>\r\n+\r\n+  </outputs>\r\n+\r\n+ <help>\r\n+\r\n+ </help>\r\n+ </tool>\r\n'
b
diff -r 000000000000 -r 87fe81de0931 tool-data/bowtie_indices.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/bowtie_indices.loc.sample Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
b
diff -r 000000000000 -r 87fe81de0931 tool-data/nat_annotate.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/nat_annotate.loc.sample Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
b
diff -r 000000000000 -r 87fe81de0931 tool-data/repeat_annotate.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/repeat_annotate.loc.sample Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
b
diff -r 000000000000 -r 87fe81de0931 tool-data/rfam_bowtie_indices.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/rfam_bowtie_indices.loc.sample Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
b
diff -r 000000000000 -r 87fe81de0931 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,24 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of indexes in the Bowtie mapper format -->
+    <table name="bowtie_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bowtie_indices.loc" />
+    </table>
+    <!-- Locations of indexes in the Bowtie mapper format -->
+    <table name="rfam_bowtie_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/rfam_bowtie_indices.loc" />
+    </table>
+    <!-- Locations of repeat annotation file -->
+    <table name="repeat_annotate" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/repeat_annotate.loc" />
+    </table>
+    <!-- Locations of nat annotation file -->
+    <table name="nat_annotate" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/nat_annotate.loc" />
+    </table>
+
+</tables>
b
diff -r 000000000000 -r 87fe81de0931 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sun Jan 04 02:47:25 2015 -0500
b
@@ -0,0 +1,134 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="fastx_toolkit" version="0.0.13">
+        <repository changeset_revision="ec66ae4c269b" name="package_fastx_toolkit_0_0_13" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bowtie" version="0.12.7">
+        <repository changeset_revision="9f9f38617a98" name="package_bowtie_0_12_7" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <set_environment version="1.0">
+        <environment_variable action="set_to" name="SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment>
+ <!--package name="R" version="3.0.1">
+    <repository name="package_r_3_0_1" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+ </package!-->
+
+ <package name="ViennaRNA" version="2.1.8">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.tbi.univie.ac.at/RNA/packages/source/ViennaRNA-2.1.8.tar.gz</action>
+  <action type="shell_command">./configure --prefix=$INSTALL_DIR </action>
+  <action type="shell_command">make</action>
+  <action type="shell_command">make install</action>
+  <action type="set_environment">
+    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
+  </action>
+    </actions>
+    </install>
+ </package>
+ <package name="R" version="3.0.1">
+    <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+
+ <install version="1.0">
+    <actions>
+       <action type="set_environment_for_install">
+                    <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu">
+                        <package name="R" version="3.0.1" />
+                    </repository>
+              </action>
+       <action type="shell_command">R CMD BATCH $REPOSITORY_INSTALL_DIR/install_DEG.R </action>
+       <action type="shell_command">echo "export PATH=$PATH" &gt; $INSTALL_DIR/env.sh </action>
+   <action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action>
+
+  </actions>
+ </install>
+ </package>
+
+ <package name="threads" version="1.96">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-1.96.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+
+ <package name="Parallel-ForkManager" version="1.06">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/Parallel-ForkManager-1.06.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+ <package name="Boost-Graph" version="1.4_001">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/D/DU/DUFFEE/Boost-Graph-1.4_001.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ ls &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+ <package name="SVG" version="2.59">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/SVG-2.59.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package>
+
+ <!--package name="parent" version="0.228">
+    <install version="1.0">
+    <actions>
+      <action type="download_by_url">http://www.cpan.org/authors/id/C/CO/CORION/parent-0.228.tar.gz</action>
+ <action type="make_directory">$INSTALL_DIR/lib/perl5</action>
+ <action type="shell_command">
+ perl Makefile.PL INSTALL_BASE=$INSTALL_DIR &amp;&amp;
+ make &amp;&amp;
+ make install 
+ </action>
+ <action type="set_environment">
+ <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable>
+ </action>
+    </actions>
+    </install>
+ </package-->
+
+</tool_dependency>