Mercurial > repos > big-tiandm > mirplant2
changeset 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | f008ab2cadc6 |
children | 3202911efdae |
files | Annotate.pl ClassAnnotate.pl DEGseq_2.pl SampleDEGseqMerge.pl conventional.pl convert_bowtie_to_blast.pl count_ref_length.pl filterReadsByCount.pl filterReadsByLength.pl get_genelist.pl html_siRNA.pl install_DEG.R miRDeep_plant.pl miRNA_Express_and_sequence.pl miRPlant.pl miRPlant.xml microRNA.pl microRNA.xml nibls.pl non_miRNA_reads.pl phased_siRNA.pl preProcess.xml precursors.pl quantify.pl quantify_siRNA.pl sRNA_plot.pl sam2Bed_bowtie.pl siRNA.pl siRNA.xml siRNA_pipeline.pl siRNA_pipeline.xml tool_dependencies.xml |
diffstat | 32 files changed, 8074 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Annotate.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,178 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chentt +#Email: chentt@big.ac.cn +#Date: 2014/4/10 +#Modified: +#Description: cluster annotate by priority +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h"); +if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} +#my $genelistout=$opts{'t'}; +my $dis=defined $opts{'d'}? $opts{'d'}:1000; +my %gene; + +#open OUT,">$genelistout"; #output file +#print OUT "#ID\tchr\tstart\tend\tstrand\ns"; +open IN,"<$opts{g}"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline;#ID chr start end strand + #push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; + $gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]]; +} +#while (my $aline=<IN>) { +# chomp $aline; +# next if($aline=~/^\#/); +# my @tmp=split/\t/,$aline; +# my $ID; +# if ($tmp[2] eq "gene") { +# $tmp[0]=~s/Chr\./Chr/; +# $tmp[0]=~s/Chr/chr/; +# my @infor=split/;/,$tmp[8]; +# for (my $i=0;$i<@infor ;$i++) { +# if ($infor[$i]=~/Alias\=(\S+)$/) { +# $ID=$1; +# last; +# } +# } +# $gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] +# print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; +# } +#} +close IN; +#close OUT; + + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next} + my @result; + #shift @tmp; + my @id=split/:/,$tmp[0]; + $id[0]=~s/Chr0/Chr/; + my @posi=split/-/,$id[1]; + foreach my $key (keys %{$gene{$id[0]}}) { + if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) { + push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}"; + next; + } + #if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) { + if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) { + push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+"); + push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-"); + next; + } + #if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) { + if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) { + push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+"); + push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-"); + next; + } + } + my $result; + if (!(@result)) { + $result="intergenic"; + } + elsif($#result==0){ + $result=$result[0]; + + } + else{ + $result=join "\t",@result; + } +# else{ +# my $te_num=0; +# my @te_overlap; +# my @te_up_down; +# my @non_overlap; +# my @non_up_down; +# for (my $k=0;$k<@result ;$k++) { +# my @rr=split/\;/,$result[$k]; +# if ($rr[3] eq "Y") { +# $te_num++; +# if ($rr[0] eq "overlap") { +# push @te_overlap,$result[$k]; +# } +# else{ +# push @te_up_down,$result[$k]; +# } +# } +# else{ +# if ($rr[0] eq "overlap") { +# push @non_overlap,$result[$k]; +# } +# else{ +# push @non_up_down,$result[$k]; +# } +# } +# } +# if ($te_num==0) {#non TE +# if (!(@te_overlap)) {#down up +# if ($#non_up_down==0) { +# $result=$non_up_down[0]; +# } +# else{#overlap +# my $all_2=join "\t",@non_up_down; +# $result="up&down1-kb\t".$all_2; +# } +# } +# else{ +# $result=join "\t",@non_overlap; +# if ($#non_overlap>=1) { +# print "$aline\t$result\n"; +# } +# } +# } +# else{#TE +# if (!(@te_overlap)) {#down up +# if ($#te_up_down==0) { +# $result=$te_up_down[0]; +# } +# else{#overlap +# my $all_2=join "\t",@te_up_down; +# $result="up&down1-kb\t".$all_2; +# } +# } +# else{ +# $result=join "\t",@te_overlap; +# if ($#te_overlap>=1) { +# print "$aline\t$result\n"; +# } +# } +# } +# } + print OUT "$aline\t$result\n"; +} + +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -g -d +options: +-i input file +-g genelist file +-d int the length of the upstream and downstream,default 1000 +-o output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ClassAnnotate.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,251 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chen Tingting +#Email: chentt@big.ac.cn +#Date: 2014/5/13 +#Modified: +#Description: cluster annotate +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h"); +if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments +&usage; +} + +#my %gene; +my %gene1; +open IN,"<$opts{g}"; +open OUT ,">$opts{l}"; +print OUT "#ID\tchr\tstart\tend\tstrand\n"; +my $n=1; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + my $ID; + if ($tmp[2] eq "gene") { + $tmp[0]=~s/Chr\./Chr/; + #$tmp[0]=~s/Chr/chr/; + my @infor=split/;/,$tmp[8]; + for (my $i=0;$i<@infor ;$i++) { + if ($infor[$i]=~/Alias\=(\S+)$/) { + $ID=$1; + last; + } + else { + $ID="unknown$n"; + $n++; + } + } + #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] + push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; + print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; + } +} +#while (my $aline=<IN>) { +# chomp $aline; +# next if($aline=~/^\#/); +# my @tmp=split/\t/,$aline;#ID chr start end strand +# push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; +# #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]]; +#} +close IN; +close OUT; + +my %nat; +open TMP,">$opts{t}"; +print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n"; + +open IN,"<$opts{n}"; +my $nat_n=1; +while (my $aline=<IN>) { + next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt - + chomp $aline; + my @arr=split /\t/,$aline; + my ($ns,$ne,$ns2,$ne2)=(0,0,0,0); + if ($arr[11]=~/Nearby/) { + ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]); + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n"; + $nat_n++; + }else{ + $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1; + $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1; + $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1; + $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1; + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2 + push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n"; + $nat_n++; + } +} +close IN; +close TMP; + +my %repeat; +open IN,"<$opts{'r'}"; +my $first=<IN>; +$first=<IN>; +$first=<IN>; +while (my $aline=<IN>) { + chomp $aline; + $aline=~s/^\s+//; + my @tmp=split/\s+/,$aline; + $tmp[4]=~s/chr0/Chr/; + $tmp[4]=~s/chr/Chr/; + push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class + #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n"; +} +close IN; + +my %phase; +open IN,"<$opts{'p'}"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + if ($tmp[5]>=25) { + $phase{$tmp[0]}=$tmp[5]; + } +} +close IN; + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +while (my $aline=<IN>) { + chomp $aline; + if($aline=~/^\#/){ + print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n"; + next; + } + my @tmp=split/\t/,$aline; + my @inf=split/\:/,$tmp[0]; + my @pos=split/\-/,$inf[1]; + my $chr=$inf[0]; + $chr=~s/Chr0/Chr/; + my $start=$pos[0]; + my $end=$pos[1]; + #=========Repeat============ + my @repeat; + if (defined(@{$repeat{$chr}})) { + my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}}; + for (my $i=0;$i<@r_array ;$i++) { + if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) { + push @repeat,$r_array[$i][2]; + } + elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){ + push @repeat,$r_array[$i][2]; + + } + elsif($end<$r_array[$i][0]){ + last; + } + else{next;} + } + } + my $repeat; + if (@repeat==0) { + $repeat="\/"; + } + else{ + $repeat=join ";",@repeat; + } + #=========nat=============== + my @nat1;#class 1 + my @nat2;#class 2 + my @nat;#nat ID + #foreach my $chr (keys %nat) { + my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}}; + for (my $i=0;$i<@n_array ;$i++) { + if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) { + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){ + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($end<$n_array[$i][0]){ + last; + } + else{next;} + } + #} + + my $nat1; + my $nat2; + my $nat; + if (@nat1==0) { + $nat1="\/"; + } + else{ + $nat1=join ";",@nat1; + } + if (@nat2==0) { + $nat2="\/"; + } + else{ + $nat2=join ";",@nat2; + } + if (@nat==0) { + $nat="\/"; + } + else{ + $nat=join ";",@nat; + } + #========phase============== + my $phase="\/"; + if (defined($phase{$tmp[0]})) { + $phase="phase"; + } + #=========long=============== + my $long="\/"; + if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) { + $long="long"; + } + print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n"; +} + +close IN; +close OUT; + +sub nearby{ + my @p=@_; + my ($s,$e)=(0,0); + if ($p[1]<$p[2]) { + $s=$p[1]; + $e=$p[2]; + }else{ + $s=$p[3]; + $e=$p[0]; + } + return ($s,$e); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -g -n -r -p -t -l +options: +-i input file + -g gff file + -n NATs file + -r repeat file + -p phase file +-o output file +-t nat output file +-l genelist output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DEGseq_2.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2009-05-06 +#Modified: +#Description: ɾ³ýmatched reads +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","outdir=s","column1:i","mark1=s","depth1:i","depth2:i","column2:i","mark2=s","h"); +if (!(defined $opts{i} and defined $opts{outdir} and defined $opts{mark1} and defined $opts{mark2}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $outputdir=$opts{'outdir'}; +unless ($outputdir=~/\/$/) {$outputdir .="/";} +my $column1=defined $opts{column1} ? $opts{column1} : 3; +my $column2=defined $opts{column2} ? $opts{column2} : 4; +my $mark1=$opts{mark1}; +my $mark2=$opts{mark2}; +my $fileout=$outputdir."degseq.R"; + +open OUT,">$fileout"; #output file +#my ($name,$dir); +#$name=basename($filein); +print OUT "library(DEGseq)\n"; +print OUT "geneExpFile <- system.file(package=\"DEGseq\")\n"; +print OUT "geneExpFile<-file.path(\"$filein\")\n"; +print OUT "layout(matrix(c(1,2,3,4,5,6), 3, 2, byrow=TRUE))\npar(mar=c(2, 2, 2,2))\n"; +print OUT "outputdir<-file.path(\"$outputdir\")\n"; +print OUT "geneExpMatrix1 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column1))\n"; +print OUT "geneExpMatrix2 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column2))\n"; +if(defined $opts{'depth1'} && defined $opts{'depth2'}){ +print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",depth1=$opts{depth1},depth2=$opts{depth2},outputDir=outputdir,method=\"MARS\")\n"; +} +else{ +print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",outputDir=outputdir,method=\"MARS\")\n"; +} +close OUT; + + +system("R CMD BATCH $fileout"); + +wait; + + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -outdir -column1 -mark1 -column2 -mark2 -depth1 -depth2 +options: +-i input file +-outdir output file dir +-column1 the first column for DEGseq +-mark1 the name of the column1 +-depth1 depth for the first file,use for normalize +-column2 the second column for DEGseq +-mark2 the name of the column2 +-depth2 depth for the second file,use for normalize + +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SampleDEGseqMerge.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: chentt@big.ac.cn +#Date: 2014-05-21 +#Modified: +#Description: merged deg file and total information +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i:s@","mark:s@","f:s","o=s","n=s","h"); +if (!(defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my @filein=@{$opts{'i'}}; +my @mark=@{$opts{'mark'}}; +my $fileout=$opts{'o'}; +my $number=$opts{'n'}; + +my %hash; +open IN,"<$filein[0]"; #input file + +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\"/); + my @temp=split/\t/,$aline; + $hash{$temp[0]}=$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1]; +} +close IN; + +for (my $i=1;$i<=$#filein;$i++) { + open IN,"<$filein[$i]"; #input file + + while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\"/); + my @temp=split/\t/,$aline; + if (!(defined $hash{$temp[0]})) { + print "Not find $temp[0]in sample one!\n"; + next; + } + $hash{$temp[0]} .="\t".$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1]; + } + close IN; +} + +open OUT,">$fileout"; #output file +my $deg_title; +foreach (@mark) { + $deg_title.="log2(Fold_change)\tp_value\tq_value\t".$_."\t"; +} +$deg_title=~s/\s+$//; +my %function; +my $title; +open F,"<$opts{f}"; +while (my $aline=<F>) { + chomp $aline; + if($aline=~/^\#/){ + my $title=$aline; + my @title=split/\t/,$aline; + $title[2+$number].="\t".$deg_title; + $title=join"\t",@title; + print OUT "$title\n"; + next; + } + my @temp=split/\t/,$aline; + $temp[2+$number].="\t".$hash{$temp[0]}; + my $temp=join"\t",@temp; + print OUT "$temp\n"; + +} +close F; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -mark -f +options: +-i input file # -i output_score.txt -i output_score.txt -i output_score.txt +-mark sample name # -mark sam1_VS_sam2 -mark sam1_VS_sam3 -mark sam2_VS_sam3 +-f cluster file +-n sample number +-o output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/conventional.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,156 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chentt +#Email: chentt@big.ac.cn +#Date: 2014/04/09 +#Modified: +#Description: islands merged of merged samples +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h"); +if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +my $distance=$opts{'d'}; +my $tempout=$opts{'t'}; +my $mark=$opts{'mark'}; +my @sample=split/\#/,$mark; +$mark=join"\"\t\"",@sample; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n"; +open TMP,">$tempout"; +print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n"; +my %hash; +while (my $aline=<IN>) { + chomp $aline; + if($aline=~/^\#/){ + #print OUT "$aline\n"; + next; + } + my @tmp=split/\t/,$aline; + my $chr=shift @tmp; + #shift @tmp; + push @{$hash{$chr}},[@tmp]; +} + +close IN; + +foreach my $key (keys %hash) { + my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}}; + my @sample; + my $start=$tag[0][1]; + my $end=$tag[0][2]; + push @sample,[@{$tag[0]}]; + for (my $i=1;$i<@tag-1;$i++) { + if ($tag[$i][1]-$end<=$distance) { + if ($tag[$i][2]>$end) { + $end=$tag[$i][2]; + } + push @sample,[@{$tag[$i]}]; + } + else{ + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + $max_length="\>30"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + $start=$tag[$i][1]; + $end=$tag[$i][2]; + + @sample=(); + push @sample,[@{$tag[$i]}]; + } + } + if ($tag[$#tag][1]-$end<=$distance) { + if ($tag[$#tag][2]>$end) { + $end=$tag[$#tag][2]; + } + push @sample,[@{$tag[$#tag]}]; + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + $max_length="\>30"; + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + } + else{ + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + $max_length="\>30"; + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + + } +} +close OUT; +close TMP; +sub Max_length{ + my @exp=@{$_[0]}; + my %sample_length; + my $total_exp; + my @each; + my @tag; + for (my $i=0;$i<=$#exp ;$i++) { + my $length=$exp[$i][2]-$exp[$i][1]+1; + #if ($length>30) { + # $length=40; + #} + my $exp=0; + foreach (1..$opts{'N'}) { + $exp+=$exp[$i][$_+2]; + $each[$_-1]+=$exp[$i][$_+2]; + } + $sample_length{$length}+=$exp; + $total_exp+=$exp; + push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp); + } + my $max=0; + my $max_key; + foreach my $key (sort keys %sample_length) { + my $p=$sample_length{$key}/$total_exp; + if ($p>$max) { + $max=$p; + $max_key=$key; + } + $sample_length{$key}=sprintf("%.2f",$p); + } + my $tag_n=@tag; + my $tag=join";",@tag; + $tag=$tag_n."\t".$tag; + return($max_key,$sample_length{$max_key},$tag,@each); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -d -N -t -mark +options: +-i input file +-d distance of two islands +-mark sample name; +-o output file +-N sample number +-t temp output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_bowtie_to_blast.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,126 @@ +#!/usr/bin/perl + + +use warnings; +use strict; +use Getopt::Std; + +######################################### USAGE ################################ + +my $usage= +"$0 file_bowtie_result file_solexa_seq file_chromosome + +This is a converter which changes Bowtie output into Blast format. +The input includes three files: a Bowtie result file (default Bowtie +output file), a fasta file consisting of small Reads and a chromosome +fasta file. It outputs the alignments in blast_parsed format. + +file_bowtie_result likes: + +AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0 +AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0 + +file_solexa_seq likes: + +>AtFlower100010_x2 +AAGGAGATTCTTTCAGTCCAG + +file_chromosome contains chromosome seq in fasta format + +"; + + +####################################### INPUT FILES ############################ + +my $file_bowtie_result=shift or die $usage; +my $file_short_seq=shift or die $usage; +my $file_chromosome_seq=shift or die $usage; + + +##################################### GLOBAL VARIBALES ######################### + +my %short_seq_length=(); +my %chromosome_length=(); + + +######################################### MAIN ################################# + +#get the short sequence id and its length +sequence_length($file_short_seq,\%short_seq_length); + +#get the chromosome sequence id and its length +sequence_length($file_chromosome_seq,\%chromosome_length); + +#convert bowtie result format to blast format; +change_format($file_bowtie_result); + +exit; + + +##################################### SUBROUTINES ############################## + +sub sequence_length{ + my ($file,$hash) = @_; + my ($id, $desc, $sequence, $seq_length) = (); + + open (FASTA, "<$file") or die "can not open $$file\n"; + while (<FASTA>) + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + while (<FASTA>){ + chomp; + if (/^>(\S+)(.*)/){ + $$hash{$id} = length $sequence; + $id = $1; + $desc = $2; + $sequence = ""; + next; + } + $sequence .= $_; + } + } + } + $seq_length=length($sequence); + $$hash{$id} = $seq_length; + close FASTA; +} + + + + + +sub change_format{ + #Change Bowtie format into blast format + my $file=shift @_; + open(FILE,"<$file")||die"can not open the bowtie result file:$!\n"; + #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n"; + + while(<FILE>){ + chomp; + my @tmp=split("\t",$_); + #Clean the reads ID + my @tmp1=split(" ",$tmp[0]); + print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t"; + if($tmp[1] eq "+"){ + my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]}; + my $seq_bg=$tmp[3] + 1; + print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n"; + } + if($tmp[1] eq "-"){ + my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3]; + my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1; + print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n"; + } + } + +# close BLASTOUT; + +} + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/count_ref_length.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,58 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2009-05-06 +#Modified: +#Description: ɾ³ýmatched reads +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file + +my ($name,$seq); +while (my $aline=<IN>) { + chomp $aline; + if ($aline=~/^>(\S+)/) { + $name=$1; + while (my $new=<IN>) { + chomp $new; + if ($new=~/^>(\S+)/) { + print OUT $name,"\t",length($seq),"\n"; + $seq =""; + $name=$1; + next; + } + else{$seq .=$new;} + } + } + print OUT $name,"\t",length($seq),"\n"; +} + +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filterReadsByCount.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2010-01 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","o=s","mark:s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample"; +my @mark=split /\#/,$mark; + +open OUT,">$opts{o}"; +open IN,"<$opts{i}"; +my %hash;my %reads; +while (my $aline=<IN>) { + chomp $aline; + my $seq=<IN>; + chomp $seq; + if($aline=~/:([\d|_]+)_x(\d+)$/){ + if ($2>3) { + my @ss=split/_/,$1; + for (my $i=0;$i<@ss;$i++) { + $hash{length($seq)}[$i]++ if($ss[$i]>0); + $hash{length($seq)}[$i] +=0 if($ss[$i]==0); + $reads{length($seq)}[$i]+=$ss[$i]; + } + print OUT "$aline\n$seq\n"; + } + } +} +close IN; +close OUT; + +my $dir=dirname($opts{'o'}); +chdir $dir; +my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt"; +open OUT, ">$lengthfile"; +open R,">$dir/length_distribution_after_count_filter.R"; + +print OUT "Tags length\t@mark\n"; + +my $samNo=@mark; +my $avalue=""; +my @length=sort{$a<=>$b} keys %hash; +foreach (@length) { + print OUT $_,"\t@{$hash{$_}}\n"; + my $vv=join ", ",@{$hash{$_}}; + $avalue .="$vv,"; +} +$avalue =~s/,$//; +my $lengths=join ",",@length; +my $marks=join "\",\"",@mark; + +print R "a<-c($avalue) +b<-matrix(a,ncol=$samNo,byrow=T) +cl<-colors() +names=c($lengths) +legends=c(\"$marks\") +png(\"Tags_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +$avalue=""; +print OUT "\nReads length\t@mark\n"; +foreach (@length) { + print OUT $_,"\t@{$reads{$_}}\n"; + my $vv=join ", ", @{$reads{$_}}; + $avalue .= "$vv,"; +} +$avalue =~s/,$//; + +print R "a<-c($avalue)\n +b<-matrix(a,ncol=$samNo,byrow=T) + +png(\"Reads_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +close OUT; +close R; + +system ("R CMD BATCH $dir/length_distribution_after_count_filter.R"); + +#system ("rm $dir/length_distribution.R"); +#system ("rm $dir/length_distribution.Rout"); +#system ("rm $dir/.RData"); +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -min -max -mark +options: + +-i input file +-o output file +-mark string #sample name eg: samA#samB#samC +-h help +USAGE +exit(1); +} +
--- a/filterReadsByLength.pl Wed Dec 03 02:03:27 2014 -0500 +++ b/filterReadsByLength.pl Fri Dec 05 00:11:02 2014 -0500 @@ -33,7 +33,7 @@ my @ss=split/_/,$1; for (my $i=0;$i<@ss;$i++) { $hash{length($seq)}[$i]++ if($ss[$i]>0); - $hash{length($seq)}[$i] +=0 if($ss[$i]>0); + $hash{length($seq)}[$i] +=0 if($ss[$i]==0); $reads{length($seq)}[$i]+=$ss[$i]; } }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_genelist.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,62 @@ +#!/usr/bin/perl -w +#Filename: +#Author: chentt +#Email: +#Date: 2012-4-6 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} +open IN,"<$opts{i}"; +open OUT ,">$opts{o}"; +print OUT "#ID\tchr\tstart\tend\tstrand\n"; +my $n=1; +my %gene1; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + my $ID; + if ($tmp[2] eq "gene") { + $tmp[0]=~s/Chr\./Chr/; + #$tmp[0]=~s/Chr/chr/; + my @infor=split/;/,$tmp[8]; + for (my $i=0;$i<@infor ;$i++) { + if ($infor[$i]=~/Alias\=(\S+)$/) { + $ID=$1; + last; + } + else { + $ID="unknown$n"; + $n++; + } + } + #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] + push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; + print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; + } +} +close IN; +close OUT; + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -h +options: +-i input cluster file +-o output file +-h help +USAGE +exit(1); +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,788 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-5-29 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","format=s","o=s","h"); +if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments +&usage; +} +my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$plotpath,$degpath); +my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$plotdir,$degdir); +open IN,"<$opts{i}"; +$config=<IN>; chomp $config; +$prepath=<IN>; chomp $prepath; +$rfampath=<IN>;chomp $rfampath; +$genomepath=<IN>; chomp $genomepath; +$clusterpath=<IN>; chomp $clusterpath; +$annotatepath=<IN>; chomp $annotatepath; +$plotpath=<IN>; chomp $plotpath; +my $deg_tag=1; +if(eof){$deg_tag=0;} +else{ + $degpath=<IN>; chomp $degpath; +} +close IN; +my @tmp=split/\//,$prepath; +$predir=$tmp[-1]; +@tmp=split/\//,$rfampath; +$rfamdir=$tmp[-1]; +@tmp=split/\//,$genomepath; +$genomedir=$tmp[-1]; +@tmp=split/\//,$clusterpath; +$clusterdir=$tmp[-1]; +@tmp=split/\//,$annotatepath; +$annotatedir=$tmp[-1]; +@tmp=split/\//,$plotpath; +$plotdir=$tmp[-1]; + +my $dir=dirname($opts{'o'}); + +open OUT ,">$opts{'o'}"; +print OUT "<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD> + <BODY bgcolor=\"lightgray\">\n <h1 align=\"center\">\n <font face=\"ºÚÌå\">\n <b>Small RNA Analysis Report</b>\n </font>\n </h1> + <h2>1. Sequence No. and quality</h2> + <h3>1.1 Sequece No.</h3> +"; + +### raw data no +open IN,"<$config"; +my @files;my @marks; my @rawNo; +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @files,$tmp[0]; + + my $no=`less $tmp[0] |wc -l `; + chomp $no; + if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { + $no=$no/4; + } + else{ + $no=$no/2; + } + push @rawNo,$no; + + push @marks,$tmp[1]; +} +close IN; + +### preprocess +unless ($prepath=~/\/$/) { + $prepath .="/"; +} + +my @trimNo;my @collapse; +my $collapsefile=$prepath."collapse_reads.fa"; +open IN,"<$collapsefile"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $trimNo[$i] +=$lng[$i]; + $collapse[$i] ++; + } + } +} +close IN; + +my @cleanR;my @cleanT; +my $clean=$prepath."collapse_reads_18-40.fa"; +open IN,"<$clean"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $cleanR[$i] +=$lng[$i]; + $cleanT[$i] ++; + } + } +} +close IN; + +my @filterR;my @filterT; +my $filter=$prepath."collapse_reads_out.fa"; +open IN,"<$filter"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $filterR[$i] +=$lng[$i]; + $filterT[$i] ++; + } + } +} +close IN; + + +print OUT "<table border=\"1\"> +<tr align=\"center\"> +<th> </th> +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Raw Reads No. </th> +"; +foreach (@rawNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Reads No. After Trimed 3\' adapter </th> +"; +foreach (@trimNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Unique Tags No. </th> +"; +foreach (@collapse) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Reads No. </th> +"; +foreach (@cleanR) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Tags No. </th> +"; +foreach (@cleanT) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Filter Reads No. \(reads count \>3\) </th> +"; +foreach (@filterR) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Filter Tags No. \(reads count \>3\) </th> +"; +foreach (@filterT) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +The raw data file path is: <b>$files[0]</b><br /> +"; +for (my $i=1;$i<@files;$i++) { + print OUT "          <b>$files[$i]</b><br />"; +} +print OUT "The collapsed file path is: <b>$collapsefile</b><br /> +The clean data file path is: <b>$clean</b><br /> +The filter (remain total reads>3) data file path is: <b>$filter</b><br /> +</p> +<h2> 1. Sequence length count</h2> +"; +print OUT "\n"; + +my $length=$prepath."length.html"; +open IN,"<$length"; +while (my $aline=<IN>) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + +print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution_after_count_filter.txt\"> length file</a> +</p> +"; + +#### rfam +unless ($rfampath=~/\/$/) { + $rfampath .="/"; +} +unless ($genomepath=~/\/$/) { + $genomepath .="/"; +} +print OUT "<h2>2. Rfam non-miRNA annotation</h2> +<h3>2.1 Reads count</h3> +<table border=\"1\"> +<tr align=\"center\"> +"; + +my @rfamR; my @rfamT; +my $tag=1; +open IN,"<$dir/rfam_match/rfam_non-miRNA_annotation.txt"; +while (my $aline=<IN>) { + chomp $aline; + $tag=0 if($aline=~/tags\s+number/); + next if($aline=~/^\#/); + next if($aline=~/^\s*$/); + my @tmp=split/\s+/,$aline; + if($tag == 1){push @rfamR,[@tmp];} + else{push @rfamT,[@tmp];} +} +close IN; + + +print OUT "<th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamR;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamR[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { + print OUT "<td> $rfamR[$i][$j]</td>\n"; + } +} + +print OUT "</tr>\n</table> + <h3>2.2 Tags count</h3> + <table border=\"1\"> + <tr align=\"center\"> + <th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamT;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamT[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { + print OUT "<td> $rfamT[$i][$j]</td>\n"; + } +} +print OUT "</tr>\n</table> +<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>"; +print OUT "<b>rfam_mapped.bwt</b></p>"; + +open IN,"<$dir/genome_match/genome_mapped.bwt"; +my @genome_r_u; +my @genome_r_m; +my @genome_t_u; +my @genome_t_m; +my $tags_map_number=0; +while (my $aline=<IN>) { + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[6]==0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_u[$i] +=$lng[$i]; + $genome_t_u[$i] ++; + } + } + $tags_map_number++; + } + if ($temp[6]>0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_m[$i] +=$lng[$i]; + $genome_t_m[$i] ++; + } + } + for (my $i=0;$i<$temp[6] ;$i++) { + my $next=<IN>; + } + $tags_map_number++; + } +} +close IN; +#<h3>3.1 Reads count</h3> +#<table border=\"1\"> +#<tr align=\"center\"> +print OUT "<h2>3. genome mapping result</h2> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">Map</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Uniq Map Reads No.</th> +"; +for (my $i=0;$i<@genome_r_u ;$i++) { + print OUT "<td> $genome_r_u[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Uniq Map Tags No.</th> +"; +for (my $i=0;$i<@genome_t_u ;$i++) { + print OUT "<td> $genome_t_u[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Multiple Map Reads No.</th> +"; +for (my $i=0;$i<@genome_r_m ;$i++) { + print OUT "<td> $genome_r_m[$i]</td>\n"; +} + +print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Multiple Map Tags No.</th> +"; +for (my $i=0;$i<@genome_t_m ;$i++) { + print OUT "<td> $genome_t_m[$i]</td>\n"; +} + +print OUT "</tr>\n</table> +<p>Note:<br />The genome mapping results is: <b>$genomepath</b>"; +print OUT "<b>genome_mapped.bwt</b></p>"; + +my $cluster="$clusterpath/sample_reads.cluster"; +my $cluster_number=`less $cluster |wc -l `; +$cluster_number=$cluster_number-1; +my (%cluster_length,@exp,@rpkm); +my @exp_range=qw(0 \(0--10] \(10--100] \(100--1000] \(1000--10000] \(10000--100000] \(100000--**\)); +my @rpkm_range=qw(0 \(0--0.25] \(0.25--0.5] \(0.5--1] \(1.0-5.0] \(5--10] \(10--50] \(50--100] \(100--500] \(500--1000] \(1000--**]); + +open IN,"<$cluster"; +while (my $aline=<IN>) { + next if($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + my @id=split/:|-/,$temp[0]; + $cluster_length{$id[2]-$id[1]+1}++; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3] == 0) {$exp[0][$i]++;} + elsif ($temp[$i+3]>0 && $temp[$i+3]<= 10 ){$exp[1][$i]++;} + elsif ($temp[$i+3]>10 && $temp[$i+3]<=100){$exp[2][$i]++;} + elsif ($temp[$i+3]>100 && $temp[$i+3]<=1000){$exp[3][$i]++;} + elsif ($temp[$i+3]>1000 && $temp[$i+3]<=10000){$exp[4][$i]++;} + elsif ($temp[$i+3]>10000 && $temp[$i+3]<=100000){$exp[5][$i]++;} + elsif ($temp[$i+3]>100000){$exp[6][$i]++;} + } +} +close IN; + +my $cluster_rpkm="$clusterpath/sample_rpkm.cluster"; +open IN,"<$cluster_rpkm"; +while (my $aline=<IN>) { + next if($aline=~/^\#/); + chomp $aline; + my @temp=split/\t/,$aline; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3]==0) {$rpkm[0][$i]++;} + elsif($temp[$i+3]>0 && $temp[$i+3]<=0.25){$rpkm[1][$i]++;} + elsif($temp[$i+3]>0.25 && $temp[$i+3]<=0.5){$rpkm[2][$i]++;} + elsif($temp[$i+3]>0.5 && $temp[$i+3]<=1){$rpkm[3][$i]++;} + elsif($temp[$i+3]>1 && $temp[$i+3]<=5){$rpkm[4][$i]++;} + elsif($temp[$i+3]>5 && $temp[$i+3]<=10){$rpkm[5][$i]++;} + elsif($temp[$i+3]>10 && $temp[$i+3]<=50){$rpkm[6][$i]++;} + elsif($temp[$i+3]>50 && $temp[$i+3]<=100){$rpkm[7][$i]++;} + elsif($temp[$i+3]>100 && $temp[$i+3]<=500){$rpkm[8][$i]++;} + elsif($temp[$i+3]>500 && $temp[$i+3]<=1000){$rpkm[9][$i]++;} + else{$rpkm[10][$i]++;} + } +} + +close IN; + +my $cluster_length_file="$clusterpath/cluster_length.txt"; +open LEN,">$cluster_length_file"; +print LEN "\#length\tcluster_number\n"; +foreach my $key (sort keys %cluster_length) { + print LEN "$key\t$cluster_length{$key}\n"; +} +close LEN; +print OUT "<h2>4. cluster result</h2> +<h3>4.1 Cluster count</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\"> </th> +<td>Merged samples</td></tr> +<tr align=\"center\"> +<th align=\"left\">Tags number</th> +<td>$tags_map_number</td></tr> +<tr align=\"center\"> +<th align=\"left\">Cluster number</th> +<td>$cluster_number</td></tr>\n</table> +"; + +print OUT "<h3>4.2 Cluster length</h3> +<p> Note:<br />The clusters length data: <a href=\"./$clusterdir/cluster_length.txt\"> length file</a> +</p> +"; +print OUT "<h3>4.3 Quantify</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">Reads Range</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@exp_range;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$exp_range[$i]</th> + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($exp[$i][$j]))) { + print OUT "<td> 0</td>\n"; + } + else{print OUT "<td> $exp[$i][$j]</td>\n";} + } +} +print OUT "</tr>\n</table>"; + +print OUT "\n<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">RPKM Range</th>\n +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rpkm_range;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rpkm_range[$i]</th> + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($rpkm[$i][$j]))) { + print OUT "<td> 0</td>\n"; + } + else{print OUT "<td> $rpkm[$i][$j]</td>\n";} + } +} +print OUT "</tr>\n</table>"; + +my $annotate="$annotatepath/sample_c_p.anno"; +my (%posit,%repeat,%nat1,%nat2); +my (@phase,@long,@repeat,@nat); +for (my $j=0;$j<@marks ;$j++) { + $phase[$j]=0; + $long[$j]=0; + $repeat[$j]=0; + $nat[$j]=0; +} + +my $class_anno=1; +open ANNO,"<$annotate"; +while (my $aline=<ANNO>) { + chomp $aline; + my @temp=split/\t/,$aline; + if($aline=~/^\#/){ + if (@temp != 10+@marks) { + $class_anno=0; + } + next; + } + for (my $i=3+@marks+$class_anno;$i<@temp;$i++) { + my @posit=split/\;/,$temp[$i]; + for (my $j=0;$j<@marks ;$j++) { + if ($temp[3+$j]>0) { + $posit{$posit[0]}[$j]++; + } + else{ + if (!(defined($posit{$posit[0]}[$j]))) { + $posit{$posit[0]}[$j]=0; + } + } + } + } + if ($class_anno) { + for (my $j=0;$j<@marks ;$j++) { + if ($temp[3+$j]>0) { + if ($temp[6] eq "phase") { + $phase[$j]++; + } + if ($temp[7] eq "long") { + $long[$j]++; + } + if ($temp[8] ne "\/") { + $repeat[$j]++; + my @rr=split/\;/,$temp[8]; + foreach (@rr) { + $repeat{$_}[$j]++; + } + } + if ($temp[9] ne "\/") { + $nat[$j]++; + my @nn1=split/\;/,$temp[9]; + my @nn2=split/\;/,$temp[10]; + for (my $k=0;$k<@nn1 ;$k++) { + $nat1{$nn1[$k]}[$j]++; + $nat2{$nn2[$k]}[$j]++; + } + } + } + } + } +} +close ANNO; + +print OUT "<h2>5. Cluster Annotate</h2> +<h3>5.1 Cluster genome position annotate</h3> +<table border=\"1\"> +<tr align=\"center\"> +<th align=\"left\">clusters number</th>\n +"; + +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +foreach my $key (sort keys %posit) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + foreach (@{$posit{$key}}) { + print OUT "<td> $_</td>\n"; + } +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +One cluster mybe annotate to multiple genes<br /> +"; + +if ($class_anno) { + print OUT "<h3>5.2 Cluster source mechanism annotate</h3> + <table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">clusters number</th>\n + "; + + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Phase</th>\n + "; + foreach (@phase) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Long</th>\n + "; + foreach (@long) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Repeat</th>\n + "; + foreach (@repeat) { + print OUT "<td> $_ </td>\n"; + } + + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">Nat</th>\n + "; + foreach (@nat) { + print OUT "<td> $_ </td>\n"; + } + print OUT "</tr>\n</table>"; + + print OUT "<p> + Repeat subclass annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Repeat subclass</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + + foreach my $key (sort keys %repeat) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($repeat{$key}[$i])) { + print OUT "<td> $repeat{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + + + print OUT "<p> + Nat subclass1 annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Nat subclass1</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + foreach my $key (sort keys %nat1) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat1{$key}[$i])) { + print OUT "<td> $nat1{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + + print OUT "<p> + Nat subclass2 annotate: + "; + + print OUT "<table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Nat subclass2</th>\n + "; + foreach (@marks) { + print OUT "<th> $_ </th>\n"; + } + foreach my $key (sort keys %nat2) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat2{$key}[$i])) { + print OUT "<td> $nat2{$key}[$i] </td>\n"; + } + else{print OUT "<td> 0 </td>\n";} + } + } + print OUT "</tr>\n</table>"; + print OUT "<p> + Note:<br /> + One cluster mybe annotate to multiple repeats or nats<br /> + "; +} +else { + print OUT "<h3>5.2 Cluster source mechanism annotate</h3> + <br />Do not do source mechanism annotate <br />"; + +} + +print OUT "<h2>6. Graph of Clusters of all samples</h2> \n"; + +my $plot=$plotpath."cluster.html"; +open IN,"<$plot"; +while (my $aline=<IN>) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + + +if ($deg_tag) { + my $deg_file=`ls $degpath`; + chomp $deg_file; + my @deg_file=split/\n/,$deg_file; + my %deg; + foreach (@deg_file) { + my $output="$degpath/$_/output_score.txt"; + open IN,"<$output"; + $deg{$_}[0]=0; + $deg{$_}[1]=0; + $deg{$_}[2]=0; + while (my $aline=<IN>) { + next if ($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[9] eq "TRUE") { + $deg{$_}[0]++; + if ($temp[4] >0) { + $deg{$_}[1]++; + } + if ($temp[4] <0) { + $deg{$_}[2]++; + } + } + } + close IN; + } + + print OUT "<h2>7. DEG</h2> + <table border=\"1\"> + <tr align=\"center\"> + <th align=\"left\">Genes number</th>\n + <th> DEG </th>\n + <th> UP </th>\n + <th> DOWN </th>\n + "; + + foreach my $key (sort keys %deg) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$key</th> + "; + for (my $i=0;$i<@{$deg{$key}} ;$i++) { + print OUT "<td> $deg{$key}[$i] </td>\n"; + } + } + print OUT "</tr>\n</table>"; +} +else{ + print OUT "<h2>7. DEG</h2> + <br />Do not do DE clusters <br />"; +} + +print OUT " + </BODY> +</HTML> +"; +close OUT; + + + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -o +options: +-i +-format +-o output file +-h help +USAGE +exit(1); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/install_DEG.R Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,2 @@ +source("http://bioconductor.org/biocLite.R") +biocLite("DEGseq")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRDeep_plant.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,1622 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Getopt::Std; +#use RNA; + + +################################# MIRDEEP ################################################# + +################################## USAGE ################################################## + + +my $usage= +"$0 file_signature file_structure temp_out_directory + +This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with +information on the positions of reads aligned to potential precursor sequences (signature). +It also takes as input an RNAfold output file, giving information on the sequence, structure +and mimimum free energy of the potential precursor sequences. + +Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA +sequences that should be considered for conservation purposes. -t prints out the potential +precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that +exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors +that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences +obtained through conventional cloning. -z consider the number of base pairings in the lower +stems (this option is not well tested). + +-h print this usage +-s fasta file with known miRNAs +#-o temp directory ,maked befor running the program. +-t print filtered +-u limited output (only ids) +-v cut-off (default 1) +-x sensitive option for Sanger sequences +-y use Randfold +-z consider Drosha processing +"; + + + + + +############################################################################################ + +################################### INPUT ################################################## + + +#signature file in blast_parsed format +my $file_blast_parsed=shift or die $usage; + +#structure file outputted from RNAfold +my $file_struct=shift or die $usage; + +my $tmpdir=shift or die $usage; +#options +my %options=(); +getopts("hs:tuv:xyz",\%options); + + + + + + +############################################################################################# + +############################# GLOBAL VARIABLES ############################################## + + +#parameters +my $nucleus_lng=11; + +my $score_star=3.9; +my $score_star_not=-1.3; +my $score_nucleus=7.63; +my $score_nucleus_not=-1.17; +my $score_randfold=1.37; +my $score_randfold_not=-3.624; +my $score_intercept=0.3; +my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); +my $score_min=1; +if($options{v}){$score_min=$options{v};} +if($options{x}){$score_min=-5;} + +my $e=2.718281828; + +#hashes +my %hash_desc; +my %hash_seq; +my %hash_struct; +my %hash_mfe; +my %hash_nuclei; +my %hash_mirs; +my %hash_query; +my %hash_comp; +my %hash_bp; + +#other variables +my $subject_old; +my $message_filter; +my $message_score; +my $lines; +my $out_of_bound; + + + +############################################################################################## + +################################ MAIN ###################################################### + + +#print help if that option is used +if($options{h}){die $usage;} +unless ($tmpdir=~/\/$/) {$tmpdir .="/";} +if(!(-s $tmpdir)){mkdir $tmpdir;} +$tmpdir .="TMP_DIR/"; +mkdir $tmpdir; + +#parse structure file outputted from RNAfold +parse_file_struct($file_struct); + +#if conservation is scored, the fasta file of known miRNA sequences is parsed +if($options{s}){create_hash_nuclei($options{s})}; + +#parse signature file in blast_parsed format and resolve each potential precursor +parse_file_blast_parsed($file_blast_parsed); +#`rm -rf $tmpdir`; +exit; + + + + +############################################################################################## + +############################## SUBROUTINES ################################################### + + + +sub parse_file_blast_parsed{ + +# read through the signature blastparsed file, fills up a hash with information on queries +# (deep sequences) mapping to the current subject (potential precursor) and resolve each +# potential precursor in turn + + my $file_blast_parsed=shift; + + open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; + while (my $line=<FILE_BLAST_PARSED>){ + if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ + my $query=$1; + my $query_lng=$2; + my $query_beg=$3; + my $query_end=$4; + my $subject=$5; + my $subject_lng=$6; + my $subject_beg=$7; + my $subject_end=$8; + my $e_value=$9; + my $pid=$10; + my $bitscore=$11; + my $other=$12; + + #if the new line concerns a new subject (potential precursor) then the old subject must be resolved + if($subject_old and $subject_old ne $subject){ + resolve_potential_precursor(); + } + + #resolve the strand + my $strand=find_strand($other); + + #resolve the number of reads that the deep sequence represents + my $freq=find_freq($query); + + #read information of the query (deep sequence) into hash + $hash_query{$query}{"subject_beg"}=$subject_beg; + $hash_query{$query}{"subject_end"}=$subject_end; + $hash_query{$query}{"strand"}=$strand; + $hash_query{$query}{"freq"}=$freq; + + #save the signature information + $lines.=$line; + + $subject_old=$subject; + } + } + resolve_potential_precursor(); +} + +sub resolve_potential_precursor{ + +# dissects the potential precursor in parts by filling hashes, and tests if it passes the +# initial filter and the scoring filter + +# binary variable whether the potential precursor is still viable + my $ret=1; +#print STDERR ">$subject_old\n"; + + fill_structure(); +#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; + fill_pri(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_mature(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_star(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_loop(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_lower_flanks(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + +# do_test_assemble(); + +# this is the actual classification + unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} + + print_results($ret); + + reset_variables(); + + return; + +} + + + +sub print_results{ + + my $ret=shift; + +# print out if the precursor is accepted and accepted precursors should be printed out +# or if the potential precursor is discarded and discarded potential precursors should +# be printed out + + if((!$options{t} and $ret) or ($options{t} and !$ret)){ + #full output + unless($options{u}){ + if($message_filter){print $message_filter;} + if($message_score){print $message_score;} + print_hash_comp(); + print $lines,"\n\n"; + return; + } + #limited output (only ids) + my $id=$hash_comp{"pri_id"}; + print "$id\n"; + } +} + + + + + + + +sub pass_threshold_score{ + +# this is the scoring + + #minimum free energy of the potential precursor +# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); + my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); + + #count of reads that map in accordance with Dicer processing + my $score_freq=score_freq($hash_comp{"freq"}); +#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; + + #basic score + my $score=$score_mfe+$score_freq; + + #scoring of conserved nucleus/seed (optional) + if($options{s}){ + + #if the nucleus is conserved + if(test_nucleus_conservation()){ + + #nucleus from position 2-8 + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + + #resolve DNA/RNA ambiguities + $nucleus=~tr/[T]/[U]/; + + #print score contribution + score_s("score_nucleus\t$score_nucleus"); + + #print the ids of known miRNAs with same nucleus + score_s("$hash_mirs{$nucleus}"); +#print STDERR "score_nucleus\t$score_nucleus\n"; + + #add to score + $score+=$score_nucleus; + + #if the nucleus is not conserved + }else{ + #print (negative) score contribution + score_s("score_nucleus\t$score_nucleus_not"); + + #add (negative) score contribution + $score+=$score_nucleus_not; + } + } + + #if the majority of potential star reads fall as expected from Dicer processing + if($hash_comp{"star_read"}){ + score_s("score_star\t$score_star"); +#print STDERR "score_star\t$score_star\n"; + $score+=$score_star; + }else{ + score_s("score_star\t$score_star_not"); +#print STDERR "score_star_not\t$score_star_not\n"; + $score+=$score_star_not; + } + + #score lower stems for potential for Drosha recognition (highly optional) + if($options{z}){ + my $stem_bp=$hash_comp{"stem_bp"}; + my $score_stem=$scores_stem[$stem_bp]; + $score+=$score_stem; + score_s("score_stem\t$score_stem"); + } + +#print STDERR "score_intercept\t$score_intercept\n"; + + $score+=$score_intercept; + + #score for randfold (optional) + if($options{y}){ + +# only calculate randfold value if it can make the difference between the potential precursor +# being accepted or discarded + if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ + + #randfold value<0.05 + if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} + + #randfold value>0.05 + else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} + } + } + + #round off values to one decimal + my $round_mfe=round($score_mfe*10)/10; + my $round_freq=round($score_freq*10)/10; + my $round=round($score*10)/10; + + #print scores + score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); + + #return 1 if the potential precursor is accepted, return 0 if discarded + unless($score>=$score_min){return 0;} + return 1; +} + +sub test_randfold{ + + #print sequence to temporary file, test randfold value, return 1 or 0 + +# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); + #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; + #open(FILE, ">$tmpfile"); + #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; + #close FILE; + +# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; + my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p_value=($p1+$p2)/2; + wait; +# system "rm $tmpfile"; + + if($p_value<=0.05){return 1;} + + return 0; +} + +sub randfold_pvalue{ + my $cpt_sup = 0; + my $cpt_inf = 0; + my $cpt_ega = 1; + + my ($seq,$number_of_randomizations)=@_; + #my $str =$seq; + #my $mfe = RNA::fold($seq,$str); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + for (my $i=0;$i<$number_of_randomizations;$i++) { + $seq = shuffle_sequence_dinucleotide($seq); + #$str = $seq; + + #my $rand_mfe = RNA::fold($str,$str); + $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $rand_mfe=$rawfolds[-1]; + $rand_mfe=~s/\(//; + $rand_mfe=~s/\)//; + + if ($rand_mfe < $mfe) { + $cpt_inf++; + } + if ($rand_mfe == $mfe) { + $cpt_ega++; + } + if ($rand_mfe > $mfe) { + $cpt_sup++; + } + } + + my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); + + #print "$name\t$mfe\t$proba\n"; + return $proba; +} + +sub shuffle_sequence_dinucleotide { + + my ($str) = @_; + + # upper case and convert to ATGC + $str = uc($str); + $str =~ s/U/T/g; + + my @nuc = ('A','T','G','C'); + my $count_swap = 0; + # set maximum number of permutations + my $stop = length($str) * 10; + + while($count_swap < $stop) { + + my @pos; + + # look start and end letters + my $firstnuc = $nuc[int(rand 4)]; + my $thirdnuc = $nuc[int(rand 4)]; + + # get positions for matching nucleotides + for (my $i=0;$i<(length($str)-2);$i++) { + if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { + push (@pos,($i+1)); + $i++; + } + } + + # swap at random trinucleotides + my $max = scalar(@pos); + for (my $i=0;$i<$max;$i++) { + my $swap = int(rand($max)); + if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { + $count_swap++; + my $w1 = substr($str,$pos[$i],1); + my $w2 = substr($str,$pos[$swap],1); + substr($str,$pos[$i],1,$w2); + substr($str,$pos[$swap],1,$w1); + } + } + } + return($str); +} + +sub test_nucleus_conservation{ + + #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 + + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + $nucleus=~tr/[T]/[U]/; + if($hash_nuclei{$nucleus}){return 1;} + + return 0; +} + + + +sub pass_filtering_initial{ + + #test if the structure forms a plausible hairpin + unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} + + #test if >90% of reads map to the hairpin in consistence with Dicer processing + unless(pass_filtering_signature()){filter_p("signature problem");return 0;} + + return 1; + +} + + +sub pass_filtering_signature{ + + #number of reads that map in consistence with Dicer processing + my $consistent=0; + + #number of reads that map inconsistent with Dicer processing + my $inconsistent=0; + +# number of potential star reads map in good consistence with Drosha/Dicer processing +# (3' overhangs relative to mature product) + my $star_perfect=0; + +# number of potential star reads that do not map in good consistence with 3' overhang + my $star_fuzzy=0; + + + #sort queries (deep sequences) by their position on the hairpin + my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; + + foreach my $query(@queries){ + + #number of reads that the deep sequence represents + unless(defined($hash_query{$query}{"freq"})){next;} + my $query_freq=$hash_query{$query}{"freq"}; + + #test which Dicer product (if any) the deep sequence corresponds to + my $product=test_query($query); + + #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable + if($product){$consistent+=$query_freq;} + + #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable + else{$inconsistent+=$query_freq;} + + #test a potential star sequence has good 3' overhang + if($product eq "star"){ + if(test_star($query)){$star_perfect+=$query_freq;} + else{$star_fuzzy+=$query_freq;} + } + } + +# if the majority of potential star sequences map in good accordance with 3' overhang +# score for the presence of star evidence + if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} + + #total number of reads mapping to the hairpin + my $freq=$consistent+$inconsistent; + $hash_comp{"freq"}=$freq; + unless($freq>0){filter_s("read frequency too low"); return 0;} + + #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded + my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); + unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} + + #the hairpin is retained + return 1; +} + +sub test_star{ + + #test if a deep sequence maps in good consistence with 3' overhang + + my $query=shift; + + #5' begin and 3' end positions + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + + #the difference between observed and expected begin positions must be 0 or 1 + my $offset=$beg-$hash_comp{"star_beg"}; + if($offset==0 or $offset==1 or $offset==-1){return 1;} + + return 0; +} + + + +sub test_query{ + + #test if deep sequence maps in consistence with Dicer processing + + my $query=shift; + + #begin, end, strand and read count + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + my $strand=$hash_query{$query}{"strand"}; + my $freq=$hash_query{$query}{"freq"}; + + #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) + if($strand eq '-'){return 0;} + + #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end + my $fuzz_beg=2; + #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end + my $fuzz_end=2; + + #if in accordance with Dicer processing, return the type of Dicer product + if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} + if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} + if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} + + #if not in accordance, return 0 + return 0; +} + + +sub pass_filtering_structure{ + + #The potential precursor must form a hairpin with miRNA precursor-like characteristics + + #return value + my $ret=1; + + #potential mature, star, loop and lower flank parts must be identifiable + unless(test_components()){return 0;} + + #no bifurcations + unless(no_bifurcations_precursor()){$ret=0;} + + #minimum 14 base pairings in duplex + unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} + + #not more than 6 nt difference between mature and star length + unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") } + + return $ret; +} + + + + + + +sub test_components{ + + #tests whether potential mature, star, loop and lower flank parts are identifiable + + unless($hash_comp{"mature_struct"}){ + filter_s("no mature"); +# print STDERR "no mature\n"; + return 0; + } + + unless($hash_comp{"star_struct"}){ + filter_s("no star"); +# print STDERR "no star\n"; + return 0; + } + + unless($hash_comp{"loop_struct"}){ + filter_s("no loop"); +# print STDERR "no loop\n"; + return 0; + } + + unless($hash_comp{"flank_first_struct"}){ + filter_s("no flanks"); +#print STDERR "no flanks_first_struct\n"; + return 0; + } + + unless($hash_comp{"flank_second_struct"}){ + filter_s("no flanks"); +# print STDERR "no flanks_second_struct\n"; + return 0; + } + return 1; +} + + + + + +sub no_bifurcations_precursor{ + + #tests whether there are bifurcations in the hairpin + + #assembles the potential precursor sequence and structure from the expected Dicer products + #this is the expected biological precursor, in contrast with 'pri_seq' that includes + #some genomic flanks on both sides + + my $pre_struct; + my $pre_seq; + if($hash_comp{"mature_arm"} eq "first"){ + $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}; + $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"}; + }else{ + $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}; + $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"}; + } + + #read into hash + $hash_comp{"pre_struct"}=$pre_struct; + $hash_comp{"pre_seq"}=$pre_seq; + + #simple pattern matching checks for bifurcations + unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){ + filter_s("bifurcation in precursor"); +# print STDERR "bifurcation in precursor\n"; + return 0; + } + + return 1; +} + +sub bp_precursor{ + + #total number of bps in the precursor + + my $pre_struct=$hash_comp{"pre_struct"}; + + #simple pattern matching + my $pre_bps=0; + while($pre_struct=~/\(/g){ + $pre_bps++; + } + return $pre_bps; +} + + +sub bp_duplex{ + + #total number of bps in the duplex + + my $duplex_bps=0; + my $mature_struct=$hash_comp{"mature_struct"}; + + #simple pattern matching + while($mature_struct=~/(\(|\))/g){ + $duplex_bps++; + } + return $duplex_bps; +} + +sub diff_lng{ + + #find difference between mature and star lengths + + my $mature_lng=length $hash_comp{"mature_struct"}; + my $star_lng=length $hash_comp{"star_struct"}; + my $diff_lng=$mature_lng-$star_lng; + return $diff_lng; +} + + + +sub do_test_assemble{ + +# not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks) +# is identical to 'pri_struct' before disassembly into parts + + my $assemble_struct; + + if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){ + if($hash_comp{"mature_arm"} eq "first"){ + $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"}; + }else{ + $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"}; + } + unless($assemble_struct eq $hash_comp{"pri_struct"}){ + $hash_comp{"test_assemble"}=$assemble_struct; + print_hash_comp(); + } + } + return; + } + + + +sub fill_structure{ + + #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired + + my $struct=$hash_struct{$subject_old}; + my $lng=length $struct; + + #local stack for keeping track of basepairings + my @bps; + + for(my $pos=1;$pos<=$lng;$pos++){ + my $struct_pos=excise_struct($struct,$pos,$pos,"+"); + + if($struct_pos eq "("){ + push(@bps,$pos); + } + + if($struct_pos eq ")"){ + my $pos_prev=pop(@bps); + $hash_bp{$pos_prev}=$pos; + $hash_bp{$pos}=$pos_prev; + } + } + return; +} + + + +sub fill_star{ + + #fills specifics on the expected star strand into 'comp' hash ('component' hash) + + #if the mature sequence is not plausible, don't look for the star arm + my $mature_arm=$hash_comp{"mature_arm"}; + unless($mature_arm){$hash_comp{"star_arm"}=0; return;} + + #if the star sequence is not plausible, don't fill into the hash + my($star_beg,$star_end)=find_star(); + my $star_arm=arm_star($star_beg,$star_end); + unless($star_arm){return;} + + #excise expected star sequence and structure + my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+"); + my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+"); + + #fill into hash + $hash_comp{"star_beg"}=$star_beg; + $hash_comp{"star_end"}=$star_end; + $hash_comp{"star_seq"}=$star_seq; + $hash_comp{"star_struct"}=$star_struct; + $hash_comp{"star_arm"}=$star_arm; + + return; +} + + +sub find_star{ + + #uses the 'bp' hash to find the expected star begin and end positions from the mature positions + + #the -2 is for the overhang + my $mature_beg=$hash_comp{"mature_beg"}; + my $mature_end=$hash_comp{"mature_end"}-2; + my $mature_lng=$mature_end-$mature_beg+1; + + #in some cases, the last nucleotide of the mature sequence does not form a base pair, + #and therefore does not basepair with the first nucleotide of the star sequence. + #In this case, the algorithm searches for the last nucleotide of the mature sequence + #to form a base pair. The offset is the number of nucleotides searched through. + my $offset_star_beg=0; + my $offset_beg=0; + + #the offset should not be longer than the length of the mature sequence, then it + #means that the mature sequence does not form any base pairs + while(!$offset_star_beg and $offset_beg<$mature_lng){ + if($hash_bp{$mature_end-$offset_beg}){ + $offset_star_beg=$hash_bp{$mature_end-$offset_beg}; + }else{ + $offset_beg++; + } + } + #when defining the beginning of the star sequence, compensate for the offset + my $star_beg=$offset_star_beg-$offset_beg; + + #same as above + my $offset_star_end=0; + my $offset_end=0; + while(!$offset_star_end and $offset_end<$mature_lng){ + if($hash_bp{$mature_beg+$offset_end}){ + $offset_star_end=$hash_bp{$mature_beg+$offset_end}; + }else{ + $offset_end++; + } + } + #the +2 is for the overhang + my $star_end=$offset_star_end+$offset_end+2; + + return($star_beg,$star_end); +} + + +sub fill_pri{ + + #fills basic specifics on the precursor into the 'comp' hash + + my $seq=$hash_seq{$subject_old}; + my $struct=$hash_struct{$subject_old}; + my $mfe=$hash_mfe{$subject_old}; + my $length=length $seq; + + $hash_comp{"pri_id"}=$subject_old; + $hash_comp{"pri_seq"}=$seq; + $hash_comp{"pri_struct"}=$struct; + $hash_comp{"pri_mfe"}=$mfe; + $hash_comp{"pri_beg"}=1; + $hash_comp{"pri_end"}=$length; + + return; +} + + +sub fill_mature{ + + #fills specifics on the mature sequence into the 'comp' hash + + my $mature_query=find_mature_query(); + my($mature_beg,$mature_end)=find_positions_query($mature_query); + my $mature_strand=find_strand_query($mature_query); + my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand); + my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand); + my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand); + + $hash_comp{"mature_query"}=$mature_query; + $hash_comp{"mature_beg"}=$mature_beg; + $hash_comp{"mature_end"}=$mature_end; + $hash_comp{"mature_strand"}=$mature_strand; + $hash_comp{"mature_struct"}=$mature_struct; + $hash_comp{"mature_seq"}=$mature_seq; + $hash_comp{"mature_arm"}=$mature_arm; + + return; +} + + + +sub fill_loop{ + + #fills specifics on the loop sequence into the 'comp' hash + + #unless both mature and star sequences are plausible, do not look for the loop + unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} + + my $loop_beg; + my $loop_end; + + #defining the begin and end positions of the loop from the mature and star positions + #excision depends on whether the mature or star sequence is 5' of the loop ('first') + if($hash_comp{"mature_arm"} eq "first"){ + $loop_beg=$hash_comp{"mature_end"}+1; + }else{ + $loop_end=$hash_comp{"mature_beg"}-1; + } + + if($hash_comp{"star_arm"} eq "first"){ + $loop_beg=$hash_comp{"star_end"}+1; + }else{ + $loop_end=$hash_comp{"star_beg"}-1; + } + + #unless the positions are plausible, do not fill into hash + unless(test_loop($loop_beg,$loop_end)){return;} + + my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+"); + my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+"); + + $hash_comp{"loop_beg"}=$loop_beg; + $hash_comp{"loop_end"}=$loop_end; + $hash_comp{"loop_seq"}=$loop_seq; + $hash_comp{"loop_struct"}=$loop_struct; + + return; +} + + +sub fill_lower_flanks{ + + #fills specifics on the lower flanks and unpaired strands into the 'comp' hash + + #unless both mature and star sequences are plausible, do not look for the flanks + unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} + + my $flank_first_end; + my $flank_second_beg; + + #defining the begin and end positions of the flanks from the mature and star positions + #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first') + if($hash_comp{"mature_arm"} eq "first"){ + $flank_first_end=$hash_comp{"mature_beg"}-1; + }else{ + $flank_second_beg=$hash_comp{"mature_end"}+1; + } + + if($hash_comp{"star_arm"} eq "first"){ + $flank_first_end=$hash_comp{"star_beg"}-1; + }else{ + $flank_second_beg=$hash_comp{"star_end"}+1; + } + + #unless the positions are plausible, do not fill into hash + unless(test_flanks($flank_first_end,$flank_second_beg)){return;} + + $hash_comp{"flank_first_end"}=$flank_first_end; + $hash_comp{"flank_second_beg"}=$flank_second_beg; + $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); + $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); + $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); + $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); + + if($options{z}){ + fill_stems_drosha(); + } + + return; +} + + +sub fill_stems_drosha{ + + #scores the number of base pairings formed by the first ten nt of the lower stems + #in general, the more stems, the higher the score contribution + #warning: this options has not been thoroughly tested + + my $flank_first_struct=$hash_comp{"flank_first_struct"}; + my $flank_second_struct=$hash_comp{"flank_second_struct"}; + + my $stem_first=substr($flank_first_struct,-10); + my $stem_second=substr($flank_second_struct,0,10); + + my $stem_bp_first=0; + my $stem_bp_second=0; + + #find base pairings by simple pattern matching + while($stem_first=~/\(/g){ + $stem_bp_first++; + } + + while($stem_second=~/\)/g){ + $stem_bp_second++; + } + + my $stem_bp=min2($stem_bp_first,$stem_bp_second); + + $hash_comp{"stem_first"}=$stem_first; + $hash_comp{"stem_second"}=$stem_second; + $hash_comp{"stem_bp_first"}=$stem_bp_first; + $hash_comp{"stem_bp_second"}=$stem_bp_second; + $hash_comp{"stem_bp"}=$stem_bp; + + return; +} + + + + +sub arm_mature{ + + #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor + + my ($beg,$end,$strand)=@_; + + #mature and star sequences should alway be on plus strand + if($strand eq "-"){return 0;} + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand); + if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub arm_star{ + + #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + #no overlap between the mature and the star sequence + if($hash_comp{"mature_arm"} eq "first"){ + ($hash_comp{"mature_end"}<$beg) or return 0; + }elsif($hash_comp{"mature_arm"} eq "second"){ + ($end<$hash_comp{"mature_beg"}) or return 0; + } + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); + if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub test_loop{ + + #tests the loop positions + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub test_flanks{ + + #tests the positions of the lower flanks + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub comp{ + + #subroutine to retrive from the 'comp' hash + + my $type=shift; + my $component=$hash_comp{$type}; + return $component; +} + + +sub find_strand_query{ + + #subroutine to find the strand for a given query + + my $query=shift; + my $strand=$hash_query{$query}{"strand"}; + return $strand; +} + + +sub find_positions_query{ + + #subroutine to find the begin and end positions for a given query + + my $query=shift; + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + return ($beg,$end); +} + + + +sub find_mature_query{ + + #finds the query with the highest frequency of reads and returns it + #is used to determine the positions of the potential mature sequence + + my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; + my $mature_query=$queries[0]; + return $mature_query; +} + + + + +sub reset_variables{ + + #resets the hashes for the next potential precursor + +# %hash_query=(); +# %hash_comp=(); +# %hash_bp=(); + foreach my $key (keys %hash_query) {delete($hash_query{$key});} + foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} + foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} + +# $message_filter=(); +# $message_score=(); +# $lines=(); + undef($message_filter); + undef($message_score); + undef($lines); + return; +} + + + +sub excise_seq{ + + #excise sub sequence from the potential precursor + + my($seq,$beg,$end,$strand)=@_; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($seq)){$out_of_bound++;return 0;} + + #if on the minus strand, the reverse complement should be excised + if($strand eq "-"){$seq=revcom($seq);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_seq=substr($seq,$beg-1,$end-$beg+1); + + return $sub_seq; + +} + +sub excise_struct{ + + #excise sub structure + + my($struct,$beg,$end,$strand)=@_; + my $lng=length $struct; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($struct)){return 0;} + + #if excising relative to minus strand, positions are reversed + if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_struct=substr($struct,$beg-1,$end-$beg+1); + + return $sub_struct; +} + + +sub create_hash_nuclei{ + #parses a fasta file with sequences of known miRNAs considered for conservation purposes + #reads the nuclei into a hash + + my ($file) = @_; + my ($id, $desc, $sequence, $nucleus) = (); + + open (FASTA, "<$file") or die "can not open $file\n"; + while (<FASTA>) + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + while (<FASTA>){ + chomp; + if (/^>(\S+)(.*)/){ + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + next; + } + $sequence .= $_; + } + } + } + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + close FASTA; +} + + +sub parse_file_struct{ + #parses the output from RNAfoldand reads it into hashes + my($file) = @_; + my($id,$desc,$seq,$struct,$mfe) = (); + open (FILE_STRUCT, "<$file") or die "can not open $file\n"; + while (<FILE_STRUCT>){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $id= $1; + $desc= $2; + $seq= ""; + $struct= ""; + $mfe= ""; + while (<FILE_STRUCT>){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + $id = $1; + $desc = $2; + $seq = ""; + $struct = ""; + $mfe = ""; + next; + } + if(/^\w/){ + tr/uU/tT/; + $seq .= $_; + next; + } + if(/((\.|\(|\))+)/){$struct .=$1;} + if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} + } + } + } + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + close FILE_STRUCT; + return; +} + + +sub score_s{ + + #this score message is appended to the end of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score.=$message."\n";; + return; +} + + + +sub score_p{ + + #this score message is appended to the beginning of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score=$message."\n".$message_score; + return; +} + + + +sub filter_s{ + + #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor + + my $message=shift; + $message_filter.=$message."\n"; + return; +} + + +sub filter_p{ + + #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor + + my $message=shift; + if(defined $message_filter){$message_filter=$message."\n".$message_filter;} + else{$message_filter=$message."\n";} + return; +} + + +sub find_freq{ + + #finds the frequency of a given read query from its id. + + my($query)=@_; + + if($query=~/x(\d+)/i){ + my $freq=$1; + return $freq; + }else{ + #print STDERR "Problem with read format\n"; + return 0; + } +} + + +sub print_hash_comp{ + + #prints the 'comp' hash + + my @keys=sort keys %hash_comp; + foreach my $key(@keys){ + my $value=$hash_comp{$key}; + print "$key \t$value\n"; + } +} + + + +sub print_hash_bp{ + + #prints the 'bp' hash + + my @keys=sort {$a<=>$b} keys %hash_bp; + foreach my $key(@keys){ + my $value=$hash_bp{$key}; + print "$key\t$value\n"; + } + print "\n"; +} + + + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + return($strand); +} + + +sub contained{ + + #Is the stretch defined by the first positions contained in the stretch defined by the second? + + my($beg1,$end1,$beg2,$end2)=@_; + + testbeginend($beg1,$end1,$beg2,$end2); + + if($beg2<=$beg1 and $end1<=$end2){ + return 1; + }else{ + return 0; + } +} + + +sub testbeginend{ + + #Are the beginposition numerically smaller than the endposition for each pair? + + my($begin1,$end1,$begin2,$end2)=@_; + + unless($begin1<=$end1 and $begin2<=$end2){ + print STDERR "beg can not be larger than end for $subject_old\n"; + exit; + } +} + + +sub rev_pos{ + +# The blast_parsed format always uses positions that are relative to the 5' of the given strand +# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with +# the n't nucleotide on the plus strand + +# This subroutine reverses the begin and end positions of positions of the minus strand so that they +# are relative to the 5' end of the plus strand + + my($beg,$end,$lng)=@_; + + my $new_end=$lng-$beg+1; + my $new_beg=$lng-$end+1; + + return($new_beg,$new_end); +} + +sub round { + + #rounds to nearest integer + + my($number) = shift; + return int($number + .5); + +} + + +sub rev{ + + #reverses the order of nucleotides in a sequence + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + #the complementary of a sequence + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + #reverse complement + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + + +sub max2 { + + #max of two numbers + + my($a, $b) = @_; + return ($a>$b ? $a : $b); +} + +sub min2 { + + #min of two numbers + + my($a, $b) = @_; + return ($a<$b ? $a : $b); +} + + + +sub score_freq{ + +# scores the count of reads that map to the potential precursor +# Assumes geometric distribution as described in methods section of manuscript + + my $freq=shift; + + #parameters of known precursors and background hairpins + my $parameter_test=0.999; + my $parameter_control=0.6; + + #log_odds calculated directly to avoid underflow + my $intercept=log((1-$parameter_test)/(1-$parameter_control)); + my $slope=log($parameter_test/$parameter_control); + my $log_odds=$slope*$freq+$intercept; + + #if no strong evidence for 3' overhangs, limit the score contribution to 0 + unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} + + return $log_odds; +} + + + +##sub score_mfe{ + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + +## my $mfe=shift; + + #numerical value, minimum 1 +## my $mfe_adj=max2(1,-$mfe); + + #parameters of known precursors and background hairpins, scale and location +## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); +## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + +## my $odds=$prob_test/$prob_background; +## my $log_odds=log($odds); + +## return $log_odds; +##} + +sub score_mfe{ +# use bignum; + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + + my ($mfe,$mlng)=@_; + + #numerical value, minimum 1 + my $mfe_adj=max2(1,-$mfe); +my $mfe_adj1=$mfe/$mlng; + #parameters of known precursors and background hairpins, scale and location + my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; + my $ev=$e**($mfe_adj1*$c); + #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; + my $log_odds=($a/($b+$ev)); + + + my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); + my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + + my $odds=$prob_test/$prob_background; + my $log_odds_2=log($odds); + #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; + return $log_odds; +} + + + +sub prob_gumbel_discretized{ + +# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol +# uses the subroutine that calculates the cdf to find the probabilities + + my ($var,$scale,$location)=@_; + + my $bound_lower=$var-0.5; + my $bound_upper=$var+0.5; + + my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); + my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); + + my $prob=$cdf_upper-$cdf_lower; + + return $prob; +} + + +sub cdf_gumbel{ + +# calculates the cumulative distribution function of the Gumbel distribution + + my ($var,$scale,$location)=@_; + + my $cdf=$e**(-($e**(-($var-$location)/$scale))); + + return $cdf; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRNA_Express_and_sequence.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,173 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-6-4 +#Modified: +#Description: solexa miRNA express and sequence +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h"); +if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'list'}; +my $out=$opts{'fa'}; +my $preout=$opts{'pre'}; + +=cut +my %hash_pri; +open PRI,"<$opts{p}"; +while (my $aline=<PRI>) { + chomp $aline; + if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;} +} +close PRI; +=cut + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +open FA ,">$out"; +open PRE,">$preout"; + +print OUT "#ID\tcoordinate\tpos1\tpos2"; +my @marks=split/\,/,$opts{'tag'}; +foreach (@marks) { + print OUT "\t",$_,"_matureExp"; +} +foreach (@marks) { + print OUT "\t",$_,"_starExp"; +} +foreach (@marks) { + print OUT "\t",$_,"_totalExp"; +} + +print OUT "\n"; + +my (%uniq_id,$novel); +while (my $aline=<IN>) { + chomp $aline; + until ($aline =~ /^score\s+[-\d\.]+/){ + $aline = <IN>; + if (eof) {last;} + } + if (eof) {last;} +########## miRNA ID ################ + $novel++; +########### annotate#################### + do {$aline=<IN>;} until($aline=~/flank_first_end/) ; + chomp $aline; + my @flank1=split/\t/,$aline; + do {$aline=<IN>;} until($aline=~/flank_second_beg/) ; + chomp $aline; + my @flank2=split/\t/,$aline; +# +########## mature start loop pre #### + do {$aline=<IN>;} until($aline=~/mature_beg/) ; + chomp $aline; + my @start=split/\t/,$aline; +# $start[1] -=$flank1[1]; + do {$aline=<IN>;} until($aline=~/mature_end/) ; + chomp $aline; + my @end=split/\t/,$aline; +# $end[1] -=$flank1[1]; + do {$aline=<IN>;} until($aline=~/mature_seq/) ; + chomp $aline; + my @arr1=split/\t/,$aline; + do {$aline=<IN>;} until($aline=~/pre_seq/) ; + chomp $aline; + my @arr2=split/\t/,$aline; + do {$aline=<IN>;} until($aline=~/pri_id/) ; + chomp $aline; + my @pri_id=split/\t/,$aline; + do {$aline=<IN>;} until($aline=~/pri_seq/) ; + chomp $aline; + my @pri_seq=split/\t/,$aline; + do {$aline=<IN>;} until($aline=~/star_beg/) ; + chomp $aline; + my @star_start=split/\t/,$aline; +# $star_start[1] -=$flank1[1]; + do {$aline=<IN>;} until($aline=~/star_end/) ; + chomp $aline; + my @star_end=split/\t/,$aline; +# $star_end[1] -=$flank1[1]; + do {$aline=<IN>;} until($aline=~/star_seq/) ; + chomp $aline; + my @arr3=split/\t/,$aline; + print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t"; + #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t"; + print FA ">miR-c-$novel\n$arr1[1]\n"; + print PRE ">miR-c-$novel\n$pri_seq[1]\n"; +########## reads count ############# + <IN>; + my @count1;my @count2;my @count3;my @count4; + $aline=<IN>; + do { + chomp $aline; + my @reads=split/\t/,$aline; + my @pos=(); + $reads[5]=~/(\d+)\.\.(\d+)/; +# $pos[0] =$1-$flank1[1]; +# $pos[1] =$2-$flank1[1]; + $pos[0]=$1; + $pos[1]=$2; + $reads[0]=~/:([\d|_]+)_x(\d+)$/; + my @ss=split/_/,$1; + for (my $i=0;$i<@ss ;$i++) { + if (!(defined $count3[$i])) { + $count3[$i]=0; + } + if (!(defined $count4[$i])) { + $count4[$i]=0; + } + $count2[$i]+=$ss[$i]; + + } +# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 ); +# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 ); +# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1); +# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1); + if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 ) + { + for (my $i=0;$i<@ss;$i++) { + $count3[$i]+=$ss[$i]; + } + } + if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){ + for (my $i=0;$i<@ss;$i++) { + $count4[$i]+=$ss[$i]; + } + } + $aline=<IN>; + chomp $aline; + } until(length $aline < 1) ; + $"="\t"; + print OUT "@count3\t@count4\t@count2\n"; + $"=" "; +} + +close IN; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -list -fa -pre -tag +options: +-i input file,predictions file +-list output file miRNA list file +-fa output file ,miRNA sequence fasta file. +-pre output file, miRNA precursor fasta file. +-tag string, sample names# eg: samA,samB,samC +-h help +USAGE +exit(1); +} +
--- a/miRPlant.pl Wed Dec 03 02:03:27 2014 -0500 +++ b/miRPlant.pl Fri Dec 05 00:11:02 2014 -0500 @@ -17,7 +17,7 @@ #use Term::ANSIColor; my %opts; -GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); +GetOptions(\%opts,"i:s@","tag:s@","phred:i","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments &usage; } @@ -33,7 +33,9 @@ } my $phred_qv=64; - +if (defined $opts{'phred'}) { + $phred_qv=$opts{'phred'}; +} my @inputfiles=@{$opts{'i'}}; my @inputtags=@{$opts{'tag'}};
--- a/miRPlant.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/miRPlant.xml Fri Dec 05 00:11:02 2014 -0500 @@ -15,10 +15,6 @@ <command interpreter="perl">miRPlant.pl ## Change this to accommodate the number of threads you have available. -t \${GALAXY_SLOTS:-4} - ## Do or not delet rfam mapped tags - #if $params.delet_rfam == "yes": - -D - #end if -path \$SCRIPT_PATH #for $j, $s in enumerate( $series ) @@ -27,7 +23,43 @@ -tag ${s.tag} #end for - -format $format -gfa $gfa -pre $pre -mat $mat -rfam $rfam -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe > run.log + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## Do or not annotate rfam non-miRNA RNAs + #if $params.annotate_rfam == "yes": + + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($params.annotate_rfam.reference_rfam.source) == "history": + bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path + #end if + + -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v + ## Do or not delet rfam mapped tags + #if $params.annotate_rfam.rfamresult.delet_rfam == "yes": + -D + #end if + #end if + + + ## Do or not annotate known microRNAs + #if $params.known_microRNA == "yes": + -pre $pre -mat $mat + #end if + + + -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log </command> <inputs> @@ -37,25 +69,136 @@ <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/> </repeat> + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> + <conditional name="params"> - <param name="delet_rfam" type="select" label="delet rfam mapped reads"> + <param name="annotate_rfam" type="select" label="annotate rfam nocoding RNAs(excluding miRNA)"> <option value="yes" selected="true">yes</option> <option value="no">no</option> </param> + <when value="yes"> + <!--param name="rfam" type="data" label="rfam sequence file" /--> + <conditional name="reference_rfam"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team"> + <options from_data_table="rfam_bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" /> + </when> + </conditional> + + <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/> + + <conditional name="rfamresult"> + <param name="delet_rfam" type="select" label="delet rfam mapped reads"> + <option value="yes" selected="true">yes</option> + <option value="no">no</option> + </param> + </conditional> <!-- params --> + + + </when> </conditional> <!-- params --> + <!--param name="input" format="tabular" type="data" label="input config file" /--> <param name="format" type="select" lable="raw data format" multiple="false"> <option value="fastq">Raw data is fastq. format</option> <option value="fasta">Raw data is fasta. format</option> </param> - - <param name="gfa" type="data" label="genome sequence fasta file"/> + + <param name="phred" type="select" lable="input quals are Phred+64 or Phred+33" multiple="false"> + <option value="64">Phred+64</option> + <option value="33" selected="true">Phred+33</option> + </param> + + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> + <!--param type="data" name="index" label="genome sequence bowtie index"/--> - <param name="mat" type="data" label="mature microRNA sequence file" /> - <param name="pre" type="data" label="precursor microRNA sequence fie" /> - <param name="rfam" type="data" label="rfam sequence file" /> + <conditional name="params"> + <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)"> + <option value="yes" selected="true">yes</option> + <option value="no">no</option> + </param> + <when value="yes"> + <param name="mat" type="data" label="mature microRNA sequence file" /> + <param name="pre" type="data" label="precursor microRNA sequence fie" /> + </when> + </conditional> <!-- params --> + + <conditional name="params"> + <param name="annotate_rfam" type="select" label="annotate rfam nocoding RNAs(excluding miRNA)"> + <option value="yes" selected="true">yes</option> + <option value="no">no</option> + </param> + <when value="yes"> + <!--param name="rfam" type="data" label="rfam sequence file" /--> + <when value="yes"> + <!--param name="rfam" type="data" label="rfam sequence file" /--> + <conditional name="reference_rfam"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team"> + <options from_data_table="rfam_bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" /> + </when> + </conditional> + + <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/> + </when> + </conditional> <!-- params --> + <!--param type="data" name="idx2" label="rfam sequence bowtie index " --> <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" /> <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" /> @@ -64,7 +207,6 @@ <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to precursors" /> <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" /> <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" /> - <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches"/> <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" /> <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" /> <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" /> @@ -72,11 +214,21 @@ </inputs> <outputs> - <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list"/> - <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment"/> - <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result"/> - <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file"/> - <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file"/> + <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> <data format="txt" name="novel microRNA express list" from_work_dir="miRPlant_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/> <data format="txt" name="novel microRNA precursor file" from_work_dir="miRPlant_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/> <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRPlant_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,253 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-4-22 +#Modified: +#Description: plant microRNA prediction +my $version=1.00; + +use strict; +use Getopt::Long; +use threads; +#use threads::shared; +use File::Path; +use File::Basename; +#use RNA; +#use Term::ANSIColor; + +my %opts; +GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h"); +if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $time=&Time(); +print "miPlant program start:\n The time is $time!\n"; +print "Command line:\n $0 @ARGV\n"; + +my $mypath=`pwd`; +chomp $mypath; + +my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/"; + + +unless ($dir=~/\/$/) {$dir.="/";} +if (not -d $dir) { + mkdir $dir; +} +my $config=$opts{'i'}; +my $data=$opts{'fa'}; + +my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; + +my $t=1; #threads number +if (defined $opts{'t'}) {$t=$opts{'t'};} + +my $mis=0; #mismatch number for microRNA +if (defined $opts{'mis'}) {$mis=$opts{'mis'};} + +my $hit=25; # maximum reads mapping hits in genome +if (defined $opts{'r'}) {$hit=$opts{'r'};} + +my $upstream = 2; # microRNA 5' extension +$upstream = $opts{'e'} if(defined $opts{'e'}); + +my $downstream = 5;# microRNA 3' extension +$downstream = $opts{'f'} if(defined $opts{'f'}); + +my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; +my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; +my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; + +$time=&Time(); +print "$time, Checking input file!\n"; + +my (@filein,@mark); +&read_config(); + +&checkfa($opts{pre}) if(defined $opts{pre}); +&checkfa($opts{mat}) if(defined $opts{mat}); +&checkfa($opts{gfa}); + +chdir $dir; +my $data2=$data; +my $known_result=$dir."known_miRNA_Express"; +if(defined $opts{pre} and defined $opts{mat}){ + &quantify(); ### known microRAN quantify + $data2=$known_result."/mirbase_not_mapped.fa"; +} + +my $genome_map=$dir."genome_match"; +&genome($data2); + +#my $genome_map=&search($dir,"genome_match_"); +my $mapfile=$genome_map."/genome_mapped.bwt"; +my $mapfa=$genome_map."/genome_mapped.fa"; +my $unmap=$genome_map."/genome_not_mapped.fa"; + +#$time=Time(); +#print "$time: Novel microRNA prediction!\n\n"; + +&predict($mapfa); + +$time=Time(); +print "$time: Program end!!\n"; + +############################## sub programs ################################### +sub predict{ + my ($file)=@_; + $time=&Time(); + print "$time: Novel microRNA prediction!\n\n"; + my $predict=$dir."Novel_miRNA_predict"; + mkdir $predict; + chdir $predict; + system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); +# print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; + + system("bowtie-build -f excised_precursor.fa excised_precursor"); +# print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; + + system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log"); +# print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; + + system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); +# print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; + + system("sort -k 4 precursor_mapped.bst > signatures.bst"); +# print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; + + chdir $dir; + system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); +# print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; + #system("rm novel_tmp_dir -rf"); + my $tag=join "," ,@mark; + system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); + + system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa"); + +} + +sub genome{ + my ($file)=@_; + if(defined $opts{'idx'}){ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; + }else{ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; + } +} + +sub quantify{ + my $tag=join "\\;" ,@mark; + system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); + print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; +} + +sub read_config{ + open CON,"<$config"; + while (my $aline=<CON>) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @filein,$tmp[0]; + push @mark,$tmp[1]; + #&check_rawdata($tmp[0]); + } + close CON; + if (@filein != @mark) { + #&printErr(); + die "Maybe config file have some wrong!!!\n"; + } +} +sub checkfa{ + my ($file_reads)=@_; + open N,"<$file_reads"; + my $line=<N>; + chomp $line; + if($line !~ /^>\S+/){ + #printErr(); + die "The first line of file $file_reads does not start with '>identifier' +Reads file $file_reads is not a valid fasta file\n\n"; + } + if(<N> !~ /^[ACGTNacgtn]*$/){ + #printErr(); + die "File $file_reads contains not allowed characters in sequences +Allowed characters are ACGTN +Reads file $file_reads is not a fasta file\n\n"; + } + close N; +} +sub search{ + my ($dir,$str)=@_; + opendir I,$dir; + my @ret; + while (my $file=readdir I) { + if ($file=~/$str/) { + push @ret, $file; + } + } + closedir I; + if (@ret != 1) { + #&printErr(); + + die "Can not find directory or file which name has string: $str !!!\n"; + } + return $ret[0]; +} + + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day $hour:$min:$sec"); +} + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: + +$0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path +options: +-i input files, # config + +-fa ,#fasta sequence file + +-path scirpt path + +-gfa string, input file # genome fasta. sequence file +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-pre string, input file #species specific microRNA precursor sequences +-mat string, input file #species specific microRNA mature sequences + +-mis [int] number of allowed mismatches when mapping reads to precursors, default 0 +-e [int] number of nucleotides upstream of the mature sequence to consider, default 2 +-f [int] number of nucleotides downstream of the mature sequence to consider, default 5 +-r int a read is allowed to map up to this number of positions in the genome,default is 25 + +-dis <int> Maximal space between miRNA and miRNA* (200) +-flank <int> Flank sequence length of miRNA precursor (10) +-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20) + +-t int, number of threads [1] + +-o output directory# absolute path +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microRNA.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,111 @@ +<tool id="micrornas" name="microRNA" veision="1.0.0"> + <description>Plant microRNA analysis </description> + + <requirements> + <requirement type="package" version="0.0.13">fastx_toolkit </requirement> + <requirement type="package" version="0.12.7">bowtie</requirement> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <!--requirement type="package" version="3.0.1">R</requirement!--> + <requirement type="package" version="2.59">SVG</requirement> + <requirement type="package" version="2.1.8">ViennaRNA</requirement> + </requirements> + + <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command--> + + <command interpreter="perl">microRNA.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + -path \$SCRIPT_PATH + + + ## Do or not annotate known microRNAs + #if $params.known_microRNA == "yes": + -pre $pre -mat $mat + #end if + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + -gfa ${index_path}.fa -idx $index_path -mis $mismatch -i $config -fa $reads -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log + </command> + + <inputs> + + <!-- reference genome --> + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> + + <conditional name="params"> + <param name="known_microRNA" type="select" label="Analysis known microRNAs(eg. from mirbase)"> + <option value="yes" selected="true">yes</option> + <option value="no">no</option> + </param> + <when value="yes"> + <param name="mat" type="data" label="mature microRNA sequence file" /> + <param name="pre" type="data" label="precursor microRNA sequence fie" /> + </when> + </conditional> <!-- params --> + + <param name="config" type="data" label="Raw data configs file" /> + <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" /> + + + <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" /> + <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" /> + <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" /> + <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" /> + <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" /> + <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" /> + <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" /> + + </inputs> + + <outputs> + <data format="txt" name="known microRNA express list" from_work_dir="miRNA_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA express alignment" from_work_dir="miRNA_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA moRs result" from_work_dir="miRNA_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA precursor file" from_work_dir="miRNA_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="known microRNA mature file" from_work_dir="miRNA_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file"> + <filter>(params['known_microRNA'] == 'Yes')</filter> + </data> + <data format="txt" name="novel microRNA express list" from_work_dir="miRNA_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/> + <data format="txt" name="novel microRNA precursor file" from_work_dir="miRNA_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/> + <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRNA_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/> + <data format="txt" name="non-microRNA sequence FASTA file" from_work_dir="miRNA_out/non_miRNA_reads.fa" label="${tool.name} on ${on_string}: Sequence FASTA file of non-microRNA tags"/> + + </outputs> + + <help> + + </help> + </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nibls.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,319 @@ +#!/usr/bin/perl +##################################################################################################### +#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. +# +# Dan MacLean +# dan.maclean@sainsbury-laboratory.ac.uk +# +# This program is free academic software; academic and non-profit +# users may redistribute it freely. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 +# see included file GPL3.txt +# +# + + +###Dont forget you will need ... +##################################################################################################### +# Boost::Graph +#Copyright 2005 by David Burdick +# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm +#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself. +##################################################################################################### + + + +use strict; +use warnings; +use Boost::Graph; +use Getopt::Long; + + +my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n"; + +my $gff_file ; +my $min_inc = 5; +my $clus = 0.6; +my $buff = 0; +my $output_file; +my $temp; +my $mark; + +GetOptions( + + 'c=f' => \$clus, + 'm=i' => \$min_inc, + 'f|file=s' => \$gff_file, + 'b=i' => \$buff, + 'o=s' => \$output_file, + 't=s' => \$temp, + 'k=s' => \$mark +) ; + + +die $usage unless $gff_file; + + +my $starttime = time; +warn "started $starttime\n"; + +## load in data +my %molecules; # stores starts and ends of srnas +open GFF, "<$gff_file"; + +while (my $entry = <GFF>){ + + chomp $entry; + next if($entry=~/^\#/); + my @data = split(/\t/,$entry); + my $chr=shift @data; + my $strand=shift @data; + my $start=shift @data; + my $end=shift @data; +# my $length1=$end-$start+1; +# if ($length1>30) { +# $length1=40; +# } + my $total; + for (my $s=0;$s<@data ;$s++) { + $total+=$data[$s]; + } + push @data,$total; +# push @data,$length1; +# if (defined $molecules{$chr}{$start}{$end}{$strand}) { +# my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand}); +# for (my $i=0;$i<$#old_data ;$i++) { +# $data[$i]+=$old_data[$i]; +# } +# } + my $data=join ";",@data; + $molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information + #print "$chr\t$start\t$end\n"; +} + +close GFF; + +warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n"; + +my @sample=split/\#/,$mark; +$mark=join"\"\t\"",@sample; +open OUT, ">$output_file"; +print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n"; +open CLUSTER,">$temp"; +print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n"; +foreach my $chromosome (keys %molecules){ + my $g = new Boost::Graph(directed=>0); + my @starts = keys(%{$molecules{$chromosome}} ); + @starts = sort {$a <=> $b} @starts; + + while (my $srna_start = shift @starts){ ## work from left most sRNA to right most, add to graph if they close enough + + + foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){ + + + ###use new graph if the next srna is too far away from this one.. + if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){ + + + ##dump the info from the old graph + if (scalar(@{$g->get_nodes()}) > 2){ + + my $cluster_coeff = get_cc($g); + if ($cluster_coeff >= $clus){ + dump_locus($g, $cluster_coeff); + } + } + + + $g = new Boost::Graph(directed=>0); + + } + + foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){ ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below + + unless ($e eq $srna_end){ + my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA inro a node name + my $en = $chromosome. ':' . $srna_start . ':' . $e; + $g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1'); + } + + } + + foreach my $start (@starts){ ##build graph of overlaps + my $new = 0; + last if $start - $min_inc > $srna_end; + if ($start - $min_inc <= $srna_end){ + + my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end; + foreach my $end (keys %{$molecules{$chromosome}{$start}}){ + + my $end_node = $chromosome . ':' . $start . ':' . $end; + $g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1'); + } + + } + } + } + + if (!(defined $starts[0])) { + ##dump the info from the last graph + if (scalar(@{$g->get_nodes()}) > 2){ + + my $cluster_coeff = get_cc($g); + if ($cluster_coeff >= $clus){ + dump_locus($g, $cluster_coeff); + } + } + } + } +} + +warn "Loci printed\nFinished\n"; + +my $endtime = time; + +my $elapsed = $endtime - $starttime; + +warn "Time elapsed = $elapsed s\n"; +close OUT; +close CLUSTER; +######################################################################################### +sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2 + + my $graph = shift; + + my @component = @{$graph->get_nodes()}; #number of nodes + my @clustering_coefficients; + + foreach my $vertex (@component) + { + + my @neighbours = @{$graph->neighbors($vertex)}; + + my %edges_in_graph; + + my $n = @neighbours; #n = the number of neighbours + my $k = ($n * ($n - 1))/2; #k = total number of possible connections + + my $e= 0; #actual number of connections within sub-graph + + foreach my $neighbour (@neighbours) + { + foreach my $neighbour_2 (@neighbours) + { + my $edge1 = "$neighbour\t$neighbour_2"; + my $edge2 = "$neighbour_2\t$neighbour"; + unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2}) + { + if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour)) + { + ++$e; + $edges_in_graph{$edge1}=1; + $edges_in_graph{$edge2}=1; + } + } + } + } + + if ($k >= 1) + { + my $c = $e / $k; + push @clustering_coefficients, $c; + } + else {push @clustering_coefficients, '0';} + } + + my $graph_n = scalar(@clustering_coefficients); + my $graph_cc = 0; + foreach my $cc (@clustering_coefficients){ + + $graph_cc = $graph_cc + $cc; + + } + $graph_cc = $graph_cc / $graph_n; + + return $graph_cc; +} + +############################################################################################################ + +sub dump_locus{ + + my $g = shift; + my $cc = shift; + my $chr; + my $start = 1000000000000000000000000000000000000000000000; + my $end = -1; + my @sample; + my @tag; + foreach my $node (@{$g->get_nodes()}){ + + $node =~ m/^(\S+):(\d+):(\d+)$/; + my $c=$1; + my $s=$2; + my $e=$3; + # my @data; + foreach my $str (keys %{$molecules{$c}{$s}{$e}}) { + my @data=split(/;/,$molecules{$c}{$s}{$e}{$str}); + push @tag,($s.",".$e.",".$str.",".$data[-1]); +# for (my $i=0;$i<$#old_data ;$i++) { +# $data[$i]+=$old_data[$i]; +# } + my $length=$e-$s+1; + if ($length>30) { + $length=40; + } + push @data,$length; + my $data=join ";",@data;#sample_exp/total_exp/length; + push @sample,$data; + } + + $chr = $c; + $start = $s if $s < $start; + $end = $e if $e > $end; + } + my $tag=join";",@tag; + my $tag_number=@tag; + my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample); + if ($max_length==40) { + $max_length="\>30"; + } + my $cluster_exp=join"\t",@cluster_exp; + my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp; + print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag_number\t$tag\n"; + print OUT $gff, "\n"; +} + +sub Max_length{ + my @exp=@{$_[0]}; + my %sample_length; + my $total_exp; + my @each; + for (my $i=0;$i<=$#exp ;$i++) { + my @tag=split/;/,$exp[$i]; + my $length=pop(@tag); + my $exp=pop(@tag); + $sample_length{$length}+=$exp; + $total_exp+=$exp; + for (my $j=0;$j<=$#tag ;$j++) { + $each[$j]+=$tag[$j]; + } + } + my $max=0; + my $max_key; + foreach my $key (sort keys %sample_length) { + my $p=$sample_length{$key}/$total_exp; + if ($p>$max) { + $max=$p; + $max_key=$key; + } + $sample_length{$key}=sprintf("%.2f",$p); + } + return($max_key,$sample_length{$max_key},@each); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/non_miRNA_reads.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,62 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","fa=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'fa'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +my (%fa,%seq); +while (my $aline=<IN>){ + chomp $aline; + $aline=~s/^>//; + my $seq=<IN>; + chomp $seq; + #$seq{$seq}=$aline; + $fa{$aline}=$seq; +} +close IN; + +open IN,"<$opts{i}"; +while(my $aline=<IN>){ + chomp $aline; + if($aline=~/^\S+_x\d+/){ + $aline=~/^(\S+_x\d+)/; + my $name=$1; + delete($fa{$name}); + } +} +close IN; + +open OUT,">$fileout"; #output file +foreach my $key (keys %fa) { + print OUT ">$key\n$fa{$key}\n"; +} +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phased_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,254 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use Math::Cephes qw(:hypergeometrics); + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file + +while (my $aline=<IN>) { + chomp $aline; + if ($aline=~/^\#/) { + print OUT $aline,"\tp-value\n"; + next; + } + my @tmp=split/\t/,$aline; + my @pos=split/:|-/,$tmp[0]; + $tmp[1]=~s/nt//; + my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]); + + print OUT $aline,"\t",$pv,"\n"; +} +close IN; +close OUT; + +sub phase{ + my ($tagL,$start,$end,$tags)=@_; + my @tmp=split/\;/,$tags; + my %tag; + for (my $i=0;$i<@tmp;$i++) { + my @aa=split/\,/,$tmp[$i]; + next if($aa[1]-$aa[0]+1 != $tagL); +# $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+"); +# $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-"); + $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+"); + $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-"); + } + + my $pv=&pvalue2(\%tag,$tagL,$start,$end); + + return $pv; +} + +sub pvalue2{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; my $pp=1; + foreach my $ccs(keys %{$tag}){ + my $n=0; + my $k=0; + my $K=0; + my $N=0; + + my $cor= $ccs; + my $ss=$cor; + my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end; + + my $max=0; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + if ($max<$$tag{$x}) {$max=$$tag{$x};} + $n +=$$tag{$x}; + $N++; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + $k +=$$tag{$x}; + $K++; + } + } + + + return $p if($K<3); + return $p if($max/$n>0.8); + + my $pn=0; + next if($n==$k); + $pn=10*$k/($n-$k)+1; + $pn = $pn ** ($K-2); + $pn = log($pn); + if ($p<$pn) { + $p=$pn; + } + + } + + return $p; + +} + +sub pvalue{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; + foreach my $ccs(keys %{$tag}){ + my $n=-1; + my $k=-1; + + my ($cor, $str)=split(/,/, $ccs); + if ($str eq "+") # small RNAs on the Watson strand + { + my $ss=$cor; + my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + + for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + elsif ($str eq "-") # small RNAs on the Crick strand + { + my $ee=$cor; + my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + next if($k<3); + + my $pn=0; my $N=$tagL*11*2-1; my $M=21; + for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k + { + my $c=1; + my $rr=1; + my $rw=1; + + for (my $j=0; $j<=$w-1; $j++) + { + $c=$c*($M-$j)/($j+1); + } + for (my $x=0; $x<=$n-$w-1; $x++) + { + $rr=$rr*($N-$M-$x)/($x+1); + } + for (my $y=0; $y<=$n-1; $y++) + { + $rw=$rw*($y+1)/($N-$y); + } + my $pr=$c*$rr*$rw; + + $pn=$pn+$pr; + } + + $p=$pn<$p ? $pn :$p; + + if ($p<0.001) #select and output small RNA clusters with p<0.001 + + { + + return $p; + + } + + } + return $p; +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} +
--- a/preProcess.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/preProcess.xml Fri Dec 05 00:11:02 2014 -0500 @@ -12,7 +12,7 @@ <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command--> - <command interpreter="perl">miRPlant.pl + <command interpreter="perl">preProcess.pl ## Change this to accommodate the number of threads you have available. -t \${GALAXY_SLOTS:-4} -path \$SCRIPT_PATH @@ -25,7 +25,16 @@ ## Do or not annotate rfam non-miRNA RNAs #if $params.annotate_rfam == "yes": - -rfam $rfam + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($params.annotate_rfam.reference_rfam.source) == "history": + bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path + #end if + + -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v #end if ## prepare bowtie index @@ -37,7 +46,7 @@ #set index_path = $reference_genome.index.fields.path #end if - -format $format -gfa ${index_path}.fa -idx $index_path -a $a -M $mapnt -min $min -max $max -mis $mismatch -v $v > run.log + -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch > run.log </command> <inputs> @@ -75,6 +84,10 @@ <!--param name="gfa" type="data" label="genome sequence fasta file"/--> <!--param type="data" name="index" label="genome sequence bowtie index"/--> + <param name="phred" type="select" lable="input quals are Phred+64 or Phred+33" multiple="false"> + <option value="64">Phred+64</option> + <option value="33" selected="true">Phred+33</option> + </param> <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" /> <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" /> <param name="min" type="integer" value="19" label="minimum microRNA length" /> @@ -87,8 +100,27 @@ <option value="no">no</option> </param> <when value="yes"> - <param name="rfam" type="data" label="rfam sequence file" /> - <!--param type="data" name="idx2" label="rfam sequence bowtie index " --> + <!--param name="rfam" type="data" label="rfam sequence file" /--> + <when value="yes"> + <!--param name="rfam" type="data" label="rfam sequence file" /--> + <conditional name="reference_rfam"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference" help="If your reference of interest is not listed, contact the Galaxy team"> + <options from_data_table="rfam_bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" /> + </when> + </conditional> + <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/> </when> </conditional> <!-- params --> @@ -112,6 +144,8 @@ <data format="txt" name="Rfam not mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_not_mapped.fa" label="${tool.name} on ${on_string}: Rfam not mapped FASTA file"> <filter>(params['annotate_rfam'] == 'Yes')</filter> </data> + <data format="html" name="input config" from_work_dir="preProcess/input_config" label="${tool.name} on ${on_string}: input config"/> + </outputs> <help>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/precursors.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,829 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use RNA; + +my %opts; +GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); +if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $checkno=1; +my $filein=$opts{'map'}; +my $faout=$opts{'o'}; +my $strout=$opts{'s'}; +my $genome= $opts{'g'}; + +my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; +my $flank=defined $opts{'f'}? $opts{'f'} : 10; + +my $MAX_ENERGY=-18; +if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} +my $MAX_UNPAIR=5; +my $MIN_PAIR=15; +my $MAX_SIZEDIFF=4; +my $MAX_BULGE=2; +my $ASYMMETRY=5; +my $MIN_UNPAIR=0; +my $MIN_SPACE=5; +my $MAX_SPACE=$maxd; +my $FLANK=$flank; + +######### load in genome sequences start ######## +my %genome; +my %lng; +my $name; +open IN,"<$genome"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $genome{$name} .=$aline; +} +close IN; +foreach my $key (keys %genome) { + $lng{$key}=length($genome{$key}); +} +####### load in genome sequences end ########## + +my %breaks; ### reads number bigger than 3 +open IN,"<$filein"; #input file +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + $tmp[0]=~/_x(\d+)$/; + my $no=$1; + next if($no<3); + #my $trand=&find_strand($tmp[9]); + #my @pos=split/\.\./,$tmp[5]; + my $end=$tmp[3]+length($tmp[4])-1; + if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} + push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base +} +close IN; + +my %cites; ### peaks +foreach my $chr (keys %breaks) { + foreach my $strand (keys %{$breaks{$chr}}) { + my @array=@{$breaks{$chr}{$strand}}; + @array=sort{$a->[0]<=>$b->[0]} @array; + for (my $i=0;$i<@array;$i++) { + my $start=$array[$i][0];my $end=$array[$i][1]; + my @subarray=(); + push @subarray,$array[$i]; + + for (my $j=$i+1;$j<@array;$j++) { + if ($start<$array[$j][1] && $end>$array[$j][0]) { + push @subarray,$array[$j]; + ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); + } + else{ + $i=$j; + &find_cites(\@subarray,$chr,$strand); + last; + } + } + } + } +} + +open FA,">$faout"; #output file +open STR,">$strout"; +foreach my $chr (keys %cites) { + foreach my $strand (keys %{$cites{$chr}}) { + + my @array2=@{$cites{$chr}{$strand}}; + @array2=sort{$a->[0]<=>$b->[0]} @array2; + &excise(\@array2,$chr,$strand); + } +} +close FA; +close STR; +sub oneCiteDn{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$maxd+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; +} +sub oneCiteUp{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$maxd-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; + +} + +sub twoCites{ + my ($array,$a,$b,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$b][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + +# my( $str,$mfe)=RNA::fold($seq); +# return 0 if($mfe>$MAX_ENERGY); ### minimum mfe + my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); + + return $val; + +} +sub excise{ + my ($cluster,$chr,$strand)=@_; + + my $last_pos=0; + for (my $i=0;$i<@{$cluster};$i++) { + next if($$cluster[$i][0]<$last_pos); + my $ok=0; + for (my $j=$i+1;$j<@{$cluster} ;$j++) { + if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){ + $i=$j; + last; + }else{ + $ok=&twoCites($cluster,$i,$j,$chr,$strand); + if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;} + } + } + next if($ok); + + $ok=&oneCiteDn($cluster,$i,$chr,$strand); + if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;} + $ok=&oneCiteUp($cluster,$i,$chr,$strand); + if($ok){$last_pos=$$cluster[$i][1]+$flank;next;} + } + + +} + +sub ffw2{ + my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; + + my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + # position tag1 and tag2 + my $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + my $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + if ($tag2_beg < $tag1_beg) { + # swap tag1 and tag2 + ($tag1,$tag2)=($tag2,$tag1); + ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); + } + my $tag1_end=$tag1_beg+length($tag1)-1; + my $tag2_end=$tag2_beg+length($tag2)-1; + # re-clipping + my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; + my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + # re-reposition + $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + + $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + $tag1_end=$tag1_beg+length($tag1)-1; + $tag2_end=$tag2_beg+length($tag2)-1; + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + #$mfe=sprintf "%.2f", $mfe; + if ($mfe > $MAX_ENERGY) {return 0;} + + # tag1 + my $tag1_length=$tag1_end-$tag1_beg+1; + my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); + my $tag1_arm=which_arm($tag1_struct); + my $tag1_unpair=$tag1_struct=~tr/.//; + my $tag1_pair=$tag1_length-$tag1_unpair; + my $tag1_max_bulge=biggest_bulge($tag1_struct); + if ($tag1_arm ne "5p") { return 0;} # tag not in stem +# if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag1_pair < $MIN_PAIR) {return 0;} + if ($tag1_max_bulge > $MAX_BULGE) {return 0;} + + # tag2 + my $tag2_length=$tag2_end-$tag2_beg+1; + my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); + my $tag2_arm=which_arm($tag2_struct); + my $tag2_unpair=$tag2_struct=~tr/.//; + my $tag2_pair=$tag2_length-$tag2_unpair; + my $tag2_max_bulge=biggest_bulge($tag2_struct); + if ($tag2_arm ne "3p") {return 0;} # star not in stem +# if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag2_pair < $MIN_PAIR) {return 0;} + if ($tag2_max_bulge > $MAX_BULGE) {return 0;} + + # space size between miR and miR* + my $space=$tag2_beg-$tag1_end-1; + if ($space < $MIN_SPACE) {return 0;} + if ($space > $MAX_SPACE) {return 0;} + + # size diff of miR and miR* + my $size_diff=abs($tag1_length-$tag2_length); + if ($size_diff > $MAX_SIZEDIFF) {return 0;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); + my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); + my $asy=($asy1 < $asy2) ? $asy1 : $asy2; + if ($asy > $ASYMMETRY) {return 0} + + # duplex fold, determine whether two matures like a miR/miR* ike duplex + my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); + # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context + my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); + if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { + return 0; + } + + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + + return 1; +} + +sub ffw1{ + my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; + my $pass=0; + + my $N_count=$seq=~tr/N//; + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + my $tag_length=length $tag; + + # position + my $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return $pass; + } + my $tag_end=$tag_beg+length($tag)-1; + + + # define candidate precursor by hybrid short arm to long arm, not solid enough + my($beg,$end)=define_precursor($seq,$tag); + if (not defined $beg) { + return $pass; + } + if (not defined $end) { + return $pass; + } + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + if ($mfe > $MAX_ENERGY) { + $pass=0; + return $pass; + } + + # reposition + $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return 0; + } + $tag_end=$tag_beg+length($tag)-1; + + my $tag_struct=substr($struct,$tag_beg-1,$tag_length); + my $tag_arm=which_arm($tag_struct); + my $tag_unpair=$tag_struct=~tr/.//; + my $tag_pair=$tag_length-$tag_unpair; + my $tag_max_bulge=biggest_bulge($tag_struct); + if ($tag_arm eq "-") { return $pass;} +# if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag_pair < $MIN_PAIR) { return $pass;} + if ($tag_max_bulge > $MAX_BULGE) {return $pass;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + # get star + my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); + my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); + my $star_length=$star_end-$star_beg+1; + my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); + my $star_arm=which_arm($star_struct); + my $star_unpair=$star_struct=~tr/.//; + my $star_pair=$star_length-$star_unpair; + my $star_max_bulge=biggest_bulge($star_struct); + if ($star_arm eq "-") { return $pass;} +# if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} + if ($star_pair < $MIN_PAIR) {return $pass;} + if ($star_max_bulge > $MAX_BULGE) {return $pass;} + + if ($tag_arm eq $star_arm) {return $pass;} + + # space size between miR and miR* + my $space; + if ($tag_beg < $star_beg) { + $space=$star_beg-$tag_end-1; + } + else { + $space=$tag_beg-$star_end-1; + } + if ($space < $MIN_SPACE) { return $pass;} + if ($space > $MAX_SPACE) { return $pass;} + + # size diff + my $size_diff=abs($tag_length-$star_length); + if ($size_diff > $MAX_SIZEDIFF) { return $pass;} + + # asymmetry + my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); + if ($asy > $ASYMMETRY) {return $pass;} + + $pass=1; + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + return $pass; + +} +sub get_star { + my($table,$beg,$end)=@_; + + my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 + foreach my $i ($beg..$end) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $s1=$i; + $s2=$j; + last; + } + } + foreach my $i (reverse ($beg..$end)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $e1=$i; + $e2=$j; + last; + } + } +# print "$s1,$e1 $s2,$e2\n"; + + # correct terminus + my $off1=$s1-$beg; + my $off2=$end-$e1; + $s2+=$off1; + $s2+=2; # 081009 + $e2-=$off2; $e2=1 if $e2 < 1; + $e2+=2; $e2=1 if $e2 < 1; # 081009 + ($s2,$e2)=($e2,$s2) if ($s2 > $e2); + return ($s2,$e2); +} + +sub define_precursor { + my $seq=shift; + my $tag=shift; + + my $seq_length=length $seq; + my $tag_length=length $tag; + my $tag_beg=index($seq,$tag,0)+1; + my $tag_end=$tag_beg+$tag_length-1; + + # split the candidate region into short arm and long arm + my $tag_arm; + my ($larm,$larm_beg,$larm_end); + my ($sarm,$sarm_beg,$sarm_end); + if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm + $sarm=substr($seq,0,$tag_end); + $larm=substr($seq,$tag_end); + $sarm_beg=1; + $sarm_end=$tag_end; + $larm_beg=$tag_end+1; + $larm_end=$seq_length; + $tag_arm="5p"; + } + else { + $larm=substr($seq,0,$tag_beg-1); # on 3' arm + $sarm=substr($seq,$tag_beg-1); + $larm_beg=1; + $larm_end=$tag_beg-1; + $sarm_beg=$tag_beg; + $sarm_end=$seq_length; + $tag_arm="3p"; + } + +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # clipping short arm + if ($tag_arm eq "5p") { + $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } + else { + $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # define the precursor by hybriding short arm to long arm +=cut #modify in 2014-10-28 + my $duplex=RNA::duplexfold($sarm,$larm); + my $struct=$duplex->{structure}; + my $energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; +# print "pair=$pair\n"; + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut +###### new codes begin + my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; +######## new codes end + +# print "$beg1:$end1 $beg2:$end2\n"; + # transform coordinates + $beg1=$beg1+$sarm_beg-1; + $end1=$end1+$sarm_beg-1; + $beg2=$beg2+$larm_beg-1; + $end2=$end2+$larm_beg-1; +# print "$beg1:$end1 $beg2:$end2\n"; + + my $off5p=$beg1-$sarm_beg; + my $off3p=$sarm_end-$end1; + $beg2-=$off3p; $beg2=1 if $beg2 < 1; + $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; + +# print "$beg1:$end1 $beg2:$end2\n"; + + my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; + my $end=$sarm_end > $end2 ? $sarm_end : $end2; + + return if $pair < $MIN_PAIR; +# print "$beg,$end\n"; + return ($beg,$end); +} + + +# duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex +sub likeMirDuplex1 { + my $seq1=shift; + my $seq2=shift; + my $like_mir_duplex=1; + + my $length1=length $seq1; + my $length2=length $seq2; +=cut + my $duplex=RNA::duplexfold($seq1, $seq2); + my $duplex_struct=$duplex->{structure}; + my $duplex_energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$duplex_struct); + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut + my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $duplex_struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $duplex_energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$duplex_struct); + #my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; + + # revise beg1, end1, beg2, end2 + $str1=~/^(\.*)/; + $beg1+=length($1); + $str1=~/(\.*)$/; + $end1-=length($1); + $str2=~/^(\.*)/; + $beg2+=length($1); + $str2=~/(\.*)$/; + $end2-=length($1); + + my $pair_num=$str1=~tr/(//; + my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom + my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck +# print $pair_num,"\n"; +# print $overhang1,"\n"; +# print $overhang2,"\n"; + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 || $overhang2 < 0 ) { + $like_mir_duplex=0; + } + if ($overhang1 > 4 || $overhang2 > 4) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} + +# judge whether two matures form miR/miR* duplex, in hairpin context +sub likeMirDuplex2 { + my ($table,$beg1,$end1,$beg2,$end2)=@_; + my $like_mir_duplex=1; + +# s1 e1 +# 5 ----------------------------3 +# | | |||| ||| | +#3 -------------------------------5 +# e2 s2 + + my $pair_num=0; + my $overhang1=0; + my $overhang2=0; + my ($s1,$e1,$s2,$e2); + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $s1=$i; + $e2=$j; + last; + } + } + } + foreach my $i (reverse ($beg1..$end1)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $e1=$i; + $s2=$j; + last; + } + } + } + +# print "$beg1,$end1 $s1,$e1\n"; +# print "$beg2,$end2 $s2,$e2\n"; + + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + ++$pair_num; + } + } + } + if (defined $s1 && defined $e2) { + $overhang1=($end2-$e2)-($s1-$beg1); + } + if (defined $e1 && defined $s2) { + $overhang2=($end1-$e1)-($s2-$beg2); + } + + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 && $overhang2 < 0) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} +sub parse_struct { + my $struct=shift; + my $table=shift; + + my @t=split('',$struct); + my @lbs; # left brackets + foreach my $k (0..$#t) { + if ($t[$k] eq "(") { + push @lbs, $k+1; + } + elsif ($t[$k] eq ")") { + my $lb=pop @lbs; + my $rb=$k+1; + $table->{$lb}=$rb; + $table->{$rb}=$lb; + } + } + if (@lbs) { + warn "unbalanced RNA struct.\n"; + } +} +sub which_arm { + my $substruct=shift; + my $arm; + if ($substruct=~/\(/ && $substruct=~/\)/) { + $arm="-"; + } + elsif ($substruct=~/\(/) { + $arm="5p"; + } + else { + $arm="3p"; + } + return $arm; +} +sub biggest_bulge { + my $struct=shift; + my $bulge_size=0; + my $max_bulge=0; + while ($struct=~/(\.+)/g) { + $bulge_size=length $1; + if ($bulge_size > $max_bulge) { + $max_bulge=$bulge_size; + } + } + return $max_bulge; +} +sub get_asy { + my($table,$a1,$a2)=@_; + my ($pre_i,$pre_j); + my $asymmetry=0; + foreach my $i ($a1..$a2) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if (defined $pre_i && defined $pre_j) { + my $diff=($i-$pre_i)+($j-$pre_j); + $asymmetry += abs($diff); + } + $pre_i=$i; + $pre_j=$j; + } + } + return $asymmetry; +} + +sub peaks{ + my @cluster=@{$_[0]}; + + return if(@cluster<1); + + my $max=0; my $index=-1; + for (my $i=0;$i<@cluster;$i++) { + if($cluster[$i][2]>$max){ + $max=$cluster[$i][2]; + $index=$i; + } + } +# &excise(\@cluster,$index,$_[1],$_[2]); + return($index); +} + +sub find_cites{ + my @tmp=@{$_[0]}; + my $i=&peaks(\@tmp); + + my $start=$tmp[$i][0]; + my $total=0; my $node5=0; + for (my $j=0;$j<@tmp ;$j++) { + $total+=$tmp[$j][2]; + $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); + } + push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); +} + +sub newpos{ + my ($a,$b,$c,$d)=@_; + my $s= $a>$c ? $c : $a; + my $e=$b>$d ? $b : $d; + return($s,$e); +} + +sub rev{ + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + + return($strand); +} +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -map -g -d -f -o -s -e +options: + -map input file# align result # bst. format + -g input file # genome sequence fasta format + -d <int> Maximal space between miRNA and miRNA* (200) + -f <int> Flank sequence length of miRNA precursor (10) + -o output file# percursor fasta file + -s output file# precursor structure file + -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) + + -h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantify.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,502 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use File::Path; +use strict; +use File::Basename; +#use Getopt::Std; +use Getopt::Long; +#use RNA; + +my %opts; +GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h"); +if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $read=$opts{'r'}; +my $pre=$opts{'p'}; +my $mature=$opts{'m'}; + +my $dir=$opts{'o'}; +unless ($dir=~/\/$/) {$dir .="/";} +if (not -d $dir) { + mkdir $dir; +} + +my $threads=defined $opts{'t'} ? $opts{'t'} : 1; +my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; + +my $upstream = 2; +my $downstream = 5; + +$upstream = $opts{'e'} if(defined $opts{'e'}); +$downstream = $opts{'f'} if(defined $opts{'f'}); + +my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; + +my $time=Time(); + +my $tmpdir="${dir}/known_miRNA_Express"; +if(not -d $tmpdir){ + mkdir($tmpdir); +} +chdir $tmpdir; + +`cp $pre ./`; +my $pre_file_name=basename($pre); + +&mapping(); # matures align to precursors && reads align to precursors; + +my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; +&maturePosOnPre(); # acknowledge mature positions on precursor + +my %pre_read; +&readPosOnPre(); # acknowledge reads positions on precursors + +if(!(defined $opts{'tag'})){ + foreach my $key (keys %pre_read) { + $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; + my @ss=split/_/,$1; + for (my $i=1;$i<=@ss;$i++) { + $marks .="Smp$i;"; + } + last; + } +} + +my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." +&attachPre(); + +my $preno=scalar (keys %pre); +print "Total Precursor Number is $preno !!!!\n"; + +my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; +&structure(); + + +##### analysis and print out && moRs +my $aln=$dir."known_microRNA_express.aln"; +my $list=$dir."known_microRNA_express.txt"; +my $moRs=$dir."known_microRNA_express.moRs"; + +system("ln -s $mature $dir/known_microRNA_mature.fa "); +system("ln -s $pre $dir/known_microRNA_precursor.fa "); + +open ALN,">$aln"; +open LIST,">$list"; +open MORS,">$moRs"; + +$"="\t"; ##### @array print in \t + +my @marks=split/\;/,$marks; +#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; +print LIST "#matueID\tpreID\tpos1\tpos2"; +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_matureExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_starExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_totalExp"; +} +print LIST "\n"; +print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; +print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; +my %moRs; + +foreach my $key (keys %pre) { + print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; + next if(! (exists $pre_read{$key})); + my @array=@{$pre_read{$key}}; + @array=sort{$a->[3]<=> $b->[3]} @array; + + my $length=length($pre{$key}); + + my $maxline=-1;my $max=0; ### storage the maxinum express read line + my $totalReadsNo=0; + my @not_over=(); ### new read format better for moRs analysis + +####print out Aln file start + for (my $i=0;$i<@array;$i++) { + my $maps=$array[$i][3]+1; + my $mape=$array[$i][3]+length($array[$i][4]); + my $str=""; + $str .= "." x ($maps-1); + $str .=$array[$i][4]; + $str .="." x ($length-$mape); + $str .=" "; + + $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; + my @sample=split /\_/,$1; + my $total=$2; + print ALN $str,"@sample","\t",$total,"\n"; + + if($total>$max){$max=$total; $maxline=$i;} + $totalReadsNo+=$total; + + push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; + } +####print out Aln file end + +#### express list start + my ($ms,$me,$ss,$se); + if (!(exists($pre_mature{$key}))) { + $ms=$array[$maxline][3]+1; + $me=$array[$maxline][3]+length($array[$maxline][4]); + ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); + + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + else{ + foreach my $maID (keys %{$pre_mature{$key}}) { + $ms=$pre_mature{$key}{$maID}{"mature"}[0]; + $me=$pre_mature{$key}{$maID}{"mature"}[1]; + $ss=$pre_mature{$key}{$maID}{"star"}[0]; + $se=$pre_mature{$key}{$maID}{"star"}[1]; + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + } +#### express list end + +#### analysis moRs start + my @result; my @m_texp;my $m_texp=0; ### moRs informations + + while (@not_over>0) { + my @over=@not_over; + @not_over=(); + +#·á¶È×î¸ßtag + my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; + for (my $i=0;$i<@over;$i++) { + my @m_array=@{$over[$i]}; + if ($m_max<$m_array[4]) { + $m_max=$m_array[4]; + $m_maxline=$i; + } + } + $m_start=$over[$m_maxline][1]; + $m_end=$over[$m_maxline][2]; + $m_exp=$m_max; + $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j]=$m_nums[$j]; + } + +#ͳ¼ÆÒÔ·á¶È×î¸ßtagΪ×ø±êµÄreads, Á½¶ËλÖòîÒì²»³¬¹ý3nt + for (my $i=0;$i<@over;$i++) { + next if($i==$m_maxline); + my @m_array=@{$over[$i]}; + if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { + $m_exp+=$m_array[4]; + $m_no++; + $m_array[3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j] +=$m_nums[$j]; + } + } + elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #È¥³ý¿çÔ½blockµÄreads + } + if($m_exp>5){### 5¸öreads + $m_texp+=$m_exp; + for (my $j=0;$j<@m_exp;$j++) { + $m_texp[$j]+=$m_exp[$j]; + } + my $string=&subseq($pre{$key},$m_start,$m_end,"+"); + push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; + } + } + + my $str=scalar @result; + my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); + $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; + @{$moRs{$str}}=@result; + +#### analysis moRs end +} + +##### moRs print out start +foreach my $key (keys %moRs) { + my @tmp=split/\t/,$key; + next if ($tmp[4]<=2); + next if($tmp[3]<0.95); + my @over; + for (my $i=0;$i<@{$moRs{$key}};$i++) { + my @arrayi=split/\t/,$moRs{$key}[$i]; + for (my $j=0;$j<@{$moRs{$key}};$j++) { + next if($i==$j); + my @arrayj=split/\t/,$moRs{$key}[$j]; + if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { + push @over,$moRs{$key}[$i]; + } + } + } + if (@over>0) { + print MORS "$key\n"; + foreach (@{$moRs{$key}}) { + print MORS "$_\n"; + } + } +} +###### moRs print out end +close ALN; +close LIST; +close MORS; + +$"=" ";##### reset + + +################### Sub programs ################# +sub express{ + my ($ms,$me,$ss,$se,$read)=@_; + my (@mexp,@sexp,@texp); + $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; + my @numsample=split/_/,$1; + for (my $i=0;$i<@numsample;$i++) { + $mexp[$i]=0; + $sexp[$i]=0; + $texp[$i]=0; + } + + for (my $i=0;$i<@{$read};$i++) { + my $start=$$read[$i][3]+1; + my $end=$$read[$i][3]+length($$read[$i][4]); + $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; + my $expresses=$1; + my @nums=split/_/,$expresses; + + for (my $j=0;$j<@nums;$j++) { + $texp[$j]+=$nums[$j]; + } + if ($start>=$ms && $end<=$me) { + for (my $j=0;$j<@nums;$j++) { + $mexp[$j]+=$nums[$j]; + } + } + if ($start>=$ss && $end<=$se) { + for (my $j=0;$j<@nums;$j++) { + $sexp[$j]+=$nums[$j]; + } + } + } + return(\@mexp,\@sexp,\@texp); +} + +sub structure{ + foreach my $key (keys %pre_mature) { + if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} + #my ($str,$mfe)=RNA::fold($pre{$key}); + my $rnafold=`perl -e 'print "$pre{$key}"' | RNAfold --noPS`; + my @rnafolds=split/\s+/,$rnafold; + my $str=$rnafolds[1]; + my $mfe=$rnafolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + $struc{$key}{"struc"}=$str; + #$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); + $struc{$key}{"mfe"}=$mfe; + + foreach my $id (keys %{$pre_mature{$key}}) { + ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); + } +=cut +##### Nucleotide complementary + my @tmp=split//,$str; + my %a2b; + my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } + +##### search star position + foreach my $id (keys %{$pre_mature{$key}}) { + my $n=0; + for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; + if($a>$b){ + $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; + $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + if($a<$b{ + $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; + $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + last; + } + $n++; + } + } +=cut + } +} +sub other_pair{ + my ($start,$end,$structure)=@_; + ##### Nucleotide complementary + my @tmp=split//,$structure; + my %a2b; my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } +##### search star position + my $n=0;my $startpos; my $endpos; + for (my $i=$start;$i<=$end ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; +# if($a>$b){ +# $startpos=$b-$n+2; +# $endpos=$b-$n+2+($end-$start); +# } +# if($a<$b){ + $endpos=$b+$n+2; + if($endpos>length($structure)){$endpos=length($structure);} + $startpos=$b+$n+2-($end-$start); + if($startpos<1){$startpos=1;} +# } + last; + } + $n++; + } + return ($startpos,$endpos); +} +sub attachPre{ + open IN, "<$pre_file_name"; + my $name; + while (my $aline=<IN>) { + chomp $aline; + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $pre{$name} .=$aline; + } + close IN; +} +sub readPosOnPre{ + open IN,"<read_mapped.bwt"; + while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + my $id=lc($tmp[2]); + push @{$pre_read{$tmp[2]}},[@tmp]; + } + close IN; +} +sub maturePosOnPre{ + open IN,"<mature_mapped.bwt"; + while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + my $mm=$tmp[0]; +# $mm=~s/\-3P|\-5P//i; + $mm=lc($mm); + my $pm=$tmp[2]; + $pm=lc($pm); + +# next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a + next if($mm!~/$pm/); +# print "$tmp[2]\t$tmp[0]\n"; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); + } + close IN; +} +sub mapping{ + my $err; +## build bowtie index + #print STDERR "building bowtie index\n"; + $err = `bowtie-build $pre_file_name miRNA_precursor`; + +## map mature sequences against precursors + #print STDERR "mapping mature sequences against index\n"; + $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`; + +## map reads against precursors + #print STDERR "mapping read sequences against index\n"; + $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`; + +} + +sub subseq{ + my $seq=shift; + my $beg=shift; + my $end=shift; + my $strand=shift; + + my $subseq=substr($seq,$beg-1,$end-$beg+1); + if ($strand eq "-") { + $subseq=revcom($subseq); + } + return uc $subseq; +} + +sub revcom{ + my $seq=shift; + $seq=~tr/ATCGatcg/TAGCtagc/; + $seq=reverse $seq; + return uc $seq; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -r -p -m -mis -t -e -f -tag -o -time +mandatory parameters: +-p precursor.fa miRNA precursor sequences from miRBase # must be absolute path +-m mature.fa miRNA sequences from miRBase # must be absolute path +-r reads.fa your read sequences #must be absolute path + +-o output directory + +options: +-mis [int] number of allowed mismatches when mapping reads to precursors, default 0 +-t [int] threads number,default 1 +-e [int] number of nucleotides upstream of the mature sequence to consider, default 2 +-f [int] number of nucleotides downstream of the mature sequence to consider, default 5 +-tag [string] sample marks# eg. sampleA;sampleB;sampleC +-time sting #make directory time,default is the local time +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantify_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w +#Filename: +#Author: chentt +#Email: +#Date: 2012-4-6 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","d=s","h"); +if (!(defined $opts{i} and defined $opts{d} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $input=$opts{'i'}; +my $output=$opts{'o'}; +my $depth=$opts{'d'}; + +open (IN,"<$input")||die"$!"; +open OUT,">$output"; +#my @Total=qw(15797079 18042650 17455254 17295526 18791753 16719596 15150009 18451484 17402501 17729362 19347595 17518516 15699663 16589265 15442892 14012264 14190746 17280260 13213117 12390121 14874304 ); +my @Total=split/\,/,$depth; +#print OUT "#clusterID\tmajor_length\tpercent\n"; +while (my $aline=<IN>) { + chomp $aline; + if ($aline=~/^\"/){ + my @title=split/\t/,$aline; + for (my $i=0;$i<@title ;$i++) { + $title[$i]=~s/^\"(\S+)\"$/$1/; + } + my $title=join "\t",@title; + print OUT "\#$title\n"; + next; + } + my @temp=split/\t/,$aline; + print OUT "$temp[0]\t$temp[1]\t$temp[2]"; + my @id=split/:/,$temp[0]; + my @posi=split/-/,$id[1]; + for (my $i=3;$i<@temp;$i++) { + my $rpkm=sprintf("%.2f",$temp[$i]/($posi[1]-$posi[0]+1)/$Total[$i-3]*1000000000); + print OUT "\t$rpkm"; + } + print OUT "\n"; +} +close IN; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -h +options: +-i input cluster file +-o output file +-d depth +-h help +USAGE +exit(1); +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRNA_plot.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,411 @@ +#!/usr/bin/perl -w +#========================================================================================== +# Date: +# Title: +# Comment: Program to plot gene structure +# Input: 1. +# 2. +# 3. +# Output: output file of gene structure graph by html or svg formt +# Test Usage: +#======================================================================================== +#use strict; +my $version=1.00; +use SVG; +use Getopt::Long; +my %opt; +GetOptions(\%opt,"g=s","l=s","span=s","c=s","o=s","out=s","cen:s","mark=s","h"); +if (!( defined $opt{o}) || defined $opt{h}) { +&usage; +} +my $span=$opt{span}; +#my $sample_cloumn=$opt{n}; +my $mark=$opt{mark}; +my @mark=split/\#/,$mark; +my $genelist=$opt{g}; +#===============================Define Attribute========================================== +my %attribute=( + canvas=>{ + 'width'=>1500, + 'height'=>1800 + }, + text=>{ + 'stroke'=>"#000000", + 'fill'=>"none", + 'stroke-width'=>0.5 + }, + line=>{ + 'stroke'=>"black", + 'stroke-width'=>1 + }, + csv=>{ + 'stroke'=>"red", + 'stroke-width'=>0.5 + }, + exon=>{ + 'stroke'=>"black", + 'stroke-width'=>1 + }, + intron=>{ + 'stroke'=>"black", + 'stroke-width'=>1.5 + }, + font=>{ + 'fill'=>"#000000", + 'font-size'=>12, + 'font-size2'=>10, + #'font-weight'=>'bold', + 'font-family'=>"Arial" + #'font-family'=>"ArialNarrow-bold" + }, + rect=>{ + 'fill'=>"lightgreen", + 'stroke'=>"black", + 'stroke-width'=>0.5 + }, + readwidth=>0.5 +); +#############################s#define start coordinate and scale +open(TXT,">$opt{out}"); +open(LENGTH,"$opt{l}")||die"cannot open the file $opt{l}"; +my %length; +while (my $aline=<LENGTH>) { + chomp $aline; + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + $temp[0]=~s/^c/C/; + $length{$temp[0]}=$temp[1]; +} +close LENGTH; +#--------------------------------------------------------------- +open(GENE,"$opt{g}")||die"cannot open the file $opt{g}"; +my %genelist; +while (my $aline=<GENE>) { + chomp $aline;#LOC_Os01g01280 Chr1 133291 134685 + + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + if ($temp[1]=~/^Chr(\d)$/) { + $temp[1]="Chr0$1"; + } + push @{$genelist{$temp[1]}},[$temp[0],$temp[2],$temp[3]]; + +} +close GENE; +#my %have_gene; +#foreach my $chr (sort keys %genelist) { +# my @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}}; +# my $start=$genelist[0][1]; +# my $end=$genelist[0][2]; +# for (my $i=0;$i<@genelist ;$i++) { +# if ($gene) { +# } +# } +#} + +my %gene_desity; +foreach my $chr (sort keys %genelist) { + my @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}}; + for (my $i=0;$i<@genelist ;$i++) { + my $start=int($genelist[$i][1]/$span); + my $end=int($genelist[$i][2]/$span); + #my @t_rpkm=split/\t/,$target_rpkm{$genelist[$i][0]}; + if ($start==$end) { + $gene_desity{$chr}[$start]++; + } + else{ + for (my $k=$start;$k<=$end ;$k++) { + $gene_desity{$chr}[$k]++; + } + } + } +} +#------------------------------------------region_gene_number------------------------- +my $max_gene_number=0; +my $total=0; +foreach my $chr (sort keys %genelist) { + for (my $i=0;$i<@{$gene_desity{$chr}} ;$i++) { + if (!(defined($gene_desity{$chr}[$i]))) { + $gene_desity{$chr}[$i]=0; + } + if ($gene_desity{$chr}[$i]>$max_gene_number) { + $max_gene_number=$gene_desity{$chr}[$i]; + #print "$gene_desity{$chr}[$i]\n"; + } + #print TXT "$i\t$gene_desity[$i]\n"; + $total+=$gene_desity{$chr}[$i]; + #print "$chr\t$i\t$gene_desity{$chr}[$i]\n"; + } +} +#print "Gene max:$max_gene_number\ntotal:$total\n"; + +#--------------------------------------------------------------- +my %centromere; +if (defined($opt{cen})) { + open CEN,"$opt{cen}"; + while (my $aline=<CEN>) { + chomp $aline; + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + $temp[0]=~s/^c/C/; + $centromere{$temp[0]}[0]=$temp[1]; + $centromere{$temp[0]}[1]=$temp[2]; + } + close CEN; +} + +#--------------------------------------------------------------- +my $max_length=0; +foreach my $chr (keys %length) { + if ($max_length<$length{$chr}) { + $max_length=$length{$chr}; + } + print "$chr\n"; +} +#====================================cluster data======================================= +open(CLUSTER,"$opt{c}")||die"cannot open the file $opt{c}"; +my %cluster; +my %cluster_density; +#my @sample=qw(39B3 3PA3 3LC3); +my @cluster_non_add; +while (my $aline=<CLUSTER>) { + next if($aline=~/^\#/); + chomp $aline;##Chr MajorLength Percent end 19B1 + my @temp=split/\t/,$aline; + my @ID=split/\:/,$temp[0]; + my @posi=split/\-/,$ID[1]; + my @all_rpkm=@temp; + shift @all_rpkm; + shift @all_rpkm; + shift @all_rpkm; +# for (my $s=0;$s<@all_rpkm ;$s++) {#log transfer +# $all_rpkm[$s]=log2($all_rpkm[$s]); +# } + push @{$cluster{$ID[0]}},[$temp[0],$posi[0],$posi[1],@all_rpkm];#ID start end rpkm(19B1,1PA1,1LC1); +} +close CLUSTER; +my %max_cluster; +my $chr_number=0; +print "@mark\n$mark\n"; +foreach my $chr (sort keys %cluster) { + for (my $i=0;$i<@mark ;$i++) { + $max_cluster{$chr}[$i]=0; + } + $chr_number++; +} +foreach my $chr (sort keys %cluster) { + @{$cluster{$chr}}=sort{$a->[1] <=> $b->[1]}@{$cluster{$chr}}; + for (my $i=0;$i<$#{$cluster{$chr}} ;$i++) { + for (my $s=0;$s<@mark;$s++) { + if ($cluster{$chr}[$i][3+$s]>$max_cluster{$chr}) { + $max_cluster{$chr}[$s]=$cluster{$chr}[$i][3+$s]; + } + } + } + +} +foreach my $chr (sort keys %max_cluster) { + for (my $s=0; $s<@mark;$s++) { + # print "$max_cluster{$chr}[$s]\n"; + } +} +#--------------------------------------------------------------------------------------- +foreach my $chr(keys %cluster) { + for(my $i=0;$i<$#{$cluster{$chr}};$i++) { + my $start=int($cluster{$chr}[$i][1]/$span); + my $end=int($cluster{$chr}[$i][2]/$span); + if ($start==$end) { + for (my $s=0;$s<@mark ;$s++) { + $cluster_density{$chr}[$start][$s]+=$cluster{$chr}[$i][3+$s]; + } + + } + else{ + for (my $m=$start;$m<=$end ;$m++) { + for (my $s=0;$s<@mark ;$s++) { + $cluster_density{$chr}[$m][$s]+=$cluster{$chr}[$i][3+$s]; + } + } + } + } +} +my %max_cluster_density; +my $max_all_density=0; +foreach my $chr (sort keys %cluster) {# + for (my $s=0;$s<@mark ;$s++) { + for (my $i=0;$i<$#{$cluster{$chr}} ;$i++) { + $max_cluster_density{$chr}[$s]=0; + } + } + +} +foreach my $chr (sort keys %cluster_density) { + print "$#{$cluster_density{$chr}}\n"; + for (my $k=0;$k<$#{$cluster_density{$chr}} ;$k++) { + print TXT "$chr\t$k"; + for (my $s=0;$s<@mark;$s++) { + if (!(defined($cluster_density{$chr}[$k][$s]))) { + $cluster_density{$chr}[$k][$s]=0; + } + if ($cluster_density{$chr}[$k][$s]>$max_cluster_density{$chr}[$s]) { + $max_cluster_density{$chr}[$s]=$cluster_density{$chr}[$k][$s]; + } + if ($cluster_density{$chr}[$k][$s]>$max_all_density) { + $max_all_density=$cluster_density{$chr}[$k][$s]; + } + print TXT "\t$cluster_density{$chr}[$k][$s]"; + } + print TXT "\n"; + } +} +print "max density: $max_all_density\n"; +#-------------------------------------------------------------------- +my $top_margin=30; +my $tail_margin=30; +my $XOFFSET=50; +my $YOFFSET=60; +my $chr_length=600; +my $Xscale=$chr_length/$max_length;#¶¨ÒåXÖá±ÈÀý³ß 1:1000 xÖáµÄ×ø±ê³¤¶È¶¼Òª°´Õմ˱ÈÀý³ß»»Ëã +#my $high_cov=$high_cov9B1=0.5;#¶¨Òå·åͼ×î¸ß·å +#my $Yscale=1/$high_cov;#¶¨ÒåYÖá±ÈÀý³ß 1:60 yÖáµÄ×ø±ê³¤¶È¶¼Òª°´Õմ˱ÈÀý³ß»»Ëã +#========================================New canvas============================ +#### Starting #### +#н¨»²¼ +my $width=1000; +my $heigth=100+130*$chr_number; +my $svg=SVG->new(width=>$width, height=>$heigth); +#»Í¼Æðʼµã +my $canvas_start_x=$XOFFSET; +my $canvas_end_x=$XOFFSET+$max_length*$Xscale;#°´ÕÕ±ÈÀý³ß »Ïß +my $canvas_start_y=$YOFFSET; +my $canvas_end_y=$YOFFSET; +my $chr_heigth=$heigth-$YOFFSET-$tail_margin; +print "chr number:$chr_number\n"; +my $one_chr_heigth=$chr_heigth/$chr_number; +my $Yscale=($one_chr_heigth-15)/$max_all_density; +#my $chr_width=$YOFFSET; +#my $chr_start_y; +#my $chr_end_y; +#my $Yscale=0.01; +#=======================================title of the graph=============================== +#my $span_k=$span/1000; +#$svg->text('x',$width/2,'y',$YOFFSET-20,'style','fill:black;text-anchor:middle','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',15,'font-family',$attribute{font}{'font-family'},'-cdata',"Clusters rpkm/"."$span_k"."kb Distribution"); +#=======================================the top max chr line============================= +$svg->line(id=>'l1',x1=>$canvas_start_x,y1=>$canvas_start_y,x2=>$canvas_end_x,y2=>$canvas_end_y,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); +$long_scale=int ($max_length/10);#Ê®µÈ·Ö ´ó¿Ì¶È +#´ó×ø±ê¿Ì¶È +for ($i=0;$i<=10;$i++) { + my $long_x_start=$XOFFSET+$long_scale*$i*$Xscale; + my $long_x_end=$long_x_start; + my $long_y_start=$YOFFSET; + my $long_y_end=$YOFFSET-5; + $svg->line('x1',$long_x_start,'y1',$long_y_start,'x2',$long_x_end,'y2',$long_y_end,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + my $Bscale=$long_scale*$i; + my $cdata=int ($Bscale/1000000); + $svg->text('x',$long_x_start,'y',$long_y_start-10,'style','fill:black;text-anchor:middle','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$cdata."M"); +} +#========================================================================================= +my $cc=1; +foreach my $chr (sort keys %length) { + my $chr_end_x=$XOFFSET+$length{$chr}*$Xscale; + my $chr_start_x=$XOFFSET; + my $chr_start_y=$YOFFSET+$cc*$one_chr_heigth; + my $chr_end_y=$chr_start_y; + #$chr_start_y+=$chr_width; + #$chr_end_y+=$chr_width; +# for (my $i=0;$i<@{$gene_desity{$chr}};$i++) { +# print "$chr\t$i\t$gene_desity{$chr}[$i]\n"; +# my $red=$gene_desity{$chr}[$i]/$max_gene_number*255; +# my $green=$gene_desity{$chr}[$i]/$max_gene_number*255; +# print "$red\t$green\t0\n"; +# $svg->rect('x',$chr_start_x+$i*$span*$Xscale,'y',$chr_start_y,'width',$span*$Xscale,'height',8,'stroke',"rgb($red,$green,0)",'stroke-width',0.1,'fill',"rgb($red,$green,0)"); +# } + + $svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_end_x,y2=>$chr_end_y,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + $svg->text('x',$XOFFSET-40,'y',$chr_start_y,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$chr); + my $m_length=$length{$chr}%1000000; + $svg->text('x',$chr_end_x+20,'y',$chr_start_y,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$m_length."M"); + + + if (defined($centromere{$chr}[0])) { + $svg->rect('x',$XOFFSET+$centromere{$chr}[0]*$Xscale,'y',$chr_start_y-2,'width',($centromere{$chr}[1]-$centromere{$chr}[0]+1)*$Xscale,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue"); + } + for (my $s=0;$s<@mark ;$s++) { + for (my $i=0;$i<$#{$cluster_density{$chr}}-1 ;$i++) { + #if ($cluster_density{$chr}[$i]*$Yscale>40) { + #$cluster_density{$chr}[$i]=40/$Yscale; + #$svg->rect('x',$XOFFSET+$i*$span*$Xscale,'y',$chr_start_y-45,'width',$span*$Xscale,'height',5,'stroke',"green",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"green"); + #} + #print "$i\t$cluster_density{$chr}[$i][$s]\t$cluster_density{$chr}[$i+1][$s]\n"; + my $cluster_density_start_x=$XOFFSET+$i*$span*$Xscale; + my $cluster_density_end_x=$XOFFSET+($i+1)*$span*$Xscale; + my $cluster_density_start_y=$chr_start_y-$cluster_density{$chr}[$i][$s]*$Yscale; + my $cluster_density_end_y=$chr_start_y-$cluster_density{$chr}[$i+1][$s]*$Yscale; + my $c_red=($s+1)/@mark*255; + $svg->line('x1',$cluster_density_start_x,'y1',$cluster_density_start_y,'x2',$cluster_density_end_x,'y2',$cluster_density_end_y,'stroke',"rgb($c_red,125,0)",'stroke-width',0.3); + } + + } + #=======Y axis + $svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_start_x,y2=>$chr_start_y-$one_chr_heigth+15,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + #=======Y axis ===>3 xiaoge + my $s10=1; + my $e10=0; + my $chr_max=$max_all_density; + while ($chr_max>10) { + $chr_max=int($chr_max/10); + $s10=$s10*10; + $e10++; + } + $chr_max=$chr_max/2; + #print "*****$max_all_density\t$chr_max\t$s10\n"; + for (my $i=1;$i<3 ;$i++) { + my $y1=$chr_start_y-$chr_max*$s10*$Yscale*$i; + my $xiaoge_Y=$chr_max*$i; + $svg->line('x1',$chr_start_x,'y1',$y1,'x2',$chr_start_x+3,'y2',$y1,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + $svg->text('x',$chr_start_x-26,'y',$y1+4,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',8,'font-family',$attribute{font}{'font-family'},'-cdata',$xiaoge_Y."e".$e10); + } + $cc++; +} + +for (my $s=0;$s<@mark ;$s++) { + my $c_red=($s+1)/@mark*255; + print "**$c_red\n"; + $svg->line('x1',$canvas_end_x+100,'y1',$YOFFSET+$s*20+30,'x2',$canvas_end_x+130,'y2',$YOFFSET+$s*20+30,'stroke',"rgb($c_red,125,0)",'stroke-width',1); + $svg->text('x',$canvas_end_x+150,'y',$YOFFSET+$s*20+5+30,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',10,'font-family',$attribute{font}{'font-family'},'-cdata',$mark[$s]); +} +# +# +if (defined($opt{cen})) { + $svg->rect('x',$canvas_end_x+100,'y',$YOFFSET+@mark*20+30,'width',30,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue"); + $svg->text('x',$canvas_end_x+150,'y',$YOFFSET+@mark*20+30+5,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',10,'font-family',$attribute{font}{'font-family'},'-cdata',"centromere"); +} + +close TXT; + +open (OUT,">$opt{o}"); +print OUT $svg->xmlify(); + +sub log2 { + my $n = shift; + return log($n)/log(2); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 +options: +-g genelist +-span +-n sample cloumn +-mark sample name +-o output graph file name with html or svg extension +-c cluster file input +-out txt output +-l length of chr +-cen centromere +-h help +USAGE +exit(1); +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sam2Bed_bowtie.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,74 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2011/11/7 +#Modified: +#Description: sam2BED +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","mark=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +my $mark=$opts{'mark'}; +my @sample=split/\#/,$mark; +$mark=join"\t",@sample; +open OUT,">$fileout"; #output file +print OUT "#chr\tstrand\tstart\tend\t$mark\n"; + +open IN,"<$filein"; #input file +my $Tags_num=0; +my @read_num; +#print OUT "#chr\tstart\tend\tnum\t<=20\t21\t22\t23\t24\t>=25\n"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\@/); + my @tmp=split/\t/,$aline; + my $strand=$tmp[1]; + my $start=$tmp[3]+1; + my $length=length($tmp[4]); + my $end=$start+$length-1; + my $hit=$tmp[6]+1; + #======express caculate weighted=================================== + my $exp; + my @tempID=split/\:/,$tmp[0]; + my @exp=split/\_/,$tempID[1]; + pop @exp; + for (my $j=0;$j<@exp ;$j++) { + #my @tempID1=split/\=/,$tempID[$j]; + $exp[$j]=sprintf("%.2f",$exp[$j]/$hit); + $read_num[$j]+=$exp[$j]; + #print OUT "\t$exp"; + } + $exp=join "\t",@exp; + print OUT $tmp[2],"\t",$strand,"\t",$start,"\t",$end,"\t",$exp,"\n"; + $Tags_num++; + +} +print "Total Tags numer: $Tags_num\n"; +my $read_number=join "\t",@read_num; +print "Each sample numer: $read_number\n"; +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-mark sampleA sampleB sampleC..... +-o output file +-h help +USAGE +exit(1); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,403 @@ +#!/usr/bin/perl -w +my $version=1.00; +use strict; +use warnings; +use Getopt::Long; +use Getopt::Std; +use threads; +use threads::shared; +use Parallel::ForkManager; +use lib '/leofs/biotrans/chentt/perl_module/'; +#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq +print " +##################################### +# # +# sRNA cluster # +# # +##################################### +"; +########################################################################################### +my $usage="$0 +Options: +-i input file# fasta +-config input file +-g genome file +-f gff file + +-o workdir file +-path script path +-t int, number of threads [1] +-format fastq, fq, fasta or fa +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null +-mis int number of allowed mismatches when mapping reads to genome, default 0 + +-n int max hits number,default 25; used in genome alignment +-d int distance of tag to merged a cluster; default 100 +-p cluster method F :conventional default is F + T :NIBLES +-l int the length of the upstream and downstream,default 1000;used in position annotate + +-nat natural antisense transcripts file +-repeat repeat information file out of Repeatmasker +-deg file config of de sample +-cen centromere file input +-span plot span, default 50000 +"; + +my %options; +GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h"); +#print help if that option is used +if($options{h}){die $usage;} + +my $filein=$options{'i'}; + +#my $config=$options{'i'}; +my $genome_fa=$options{'g'}; +my $gff=$options{'f'}; + + +########################################################################################## +my $predir=`pwd`; +chomp $predir; +my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; + +my $path=$options{'path'}; + +my $t=defined($options{'t'})? $options{'t'}:1; #threads number + +my $mis=defined $options{'mis'} ? $options{'mis'}:0; + + +my $hit=defined $options{'n'}?$options{'n'}:25; + +my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; + +my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; + +my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; + +my $format=$options{'format'}; +#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { +# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +#} + + + +my $sample_number; +my ($dir,$dir_tmp); +################################ MAIN ################################################## +print "\ncluster program start:"; +my $time=Time(); +make_dir_tmp(); + +my $mark; +my $sample_mark; + +my $config=$opts{'config'}; +my (@filein,@mark); +&read_config(); +$sample_number=@mark; +$mark=join "\t",@mark; +$sample_mark=join "\#",@mark; + + + +my $data3=$filein; ### rfam not mapped reads +genome(); + +my $bed=$dir."cluster\/"."sample\.bed"; +my $read=$dir."cluster\/"."sample_reads\.cluster"; +my $read_txt=$dir."cluster\/"."cluster\.txt"; +my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; +my $preprocess; +my $cluster_file; +my $annotate_dir; +my $deg_dir; +my $plot_dir; +my %id; +for (my $i=0;$i<@mark ;$i++) { + $id{$mark[$i]}=$i+4; +} + + +my @map_read; +my $map_tag=0; + +bwt2bed(); + +cluster(); + +quantify(); + +phase(); + +if (defined($options{'nat'})&&defined($options{'repeat'})) { + class(); +} +else{ + get_genelist(); +} + +annotate(); + +genome_length(); + +plot(); + +my @pairdir; +if (defined($options{'deg'})) { + dec(); + infor_merge(); +} +else{infor_merge_no_dec()} +html(); +print "\ncluster program end:"; +Time(); +############################sub program################################################### +sub make_dir_tmp{ + + #make temporary directory + if(not -d "$workdir\/cluster_runs"){ + mkdir("$workdir\/cluster_runs"); + mkdir("$workdir\/cluster_runs\/ref\/"); + } + + $dir="$workdir\/cluster_runs\/"; + #print STDERR "mkdir $dir\n\n"; + return; +} + +sub genome{ + if(defined $options{'idx'}){ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; + }else{ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; + } + #=================== mapping sta =================================================== + my $map_file=$dir."genome_match\/genome_mapped\.fa"; + open (MAP,"<$map_file")||die"$!"; + print "\n#each sample mapping reads sta:\n\n"; + print "#$mark\ttotal\n"; + while (my $ID=<MAP>) { + chomp $ID; + my @tmp=split/\:/,$ID; + my @exp=split/\_/,$tmp[1]; + $exp[-1] =~ s/^x//; + for (my $i=0;$i<@exp ;$i++) { + $map_read[$i]+=$exp[$i]; + } + $map_tag++; + my $seq=<MAP>; + } + my $map_read=join"\t",@map_read; + print "$map_read\n\n"; + print "#total mapped tags:$map_read\n\n"; + close MAP; + return 0; +} + +sub bwt2bed{ + $cluster_file=$dir."cluster\/"; + mkdir ("$cluster_file"); + print "sam file changed to bed file\n"; + my ($file) = $dir."genome_match\/genome_mapped\.bwt"; + + my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; + print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; + return 0; +} + +sub cluster{ + print "tags is ready to merged clusters\n\n"; + my ($file) =$bed; + if ($cluster_mothod eq "F") { + my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; + print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; + } + elsif($cluster_mothod eq "T"){ + my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`; + print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n"; + } + else{print "\-p is wrong!\n\n";} + return 0; +} + + +sub quantify{ + print "clusters is ready to quantified\n\n"; + my @depth=@map_read; + pop @depth; + my $depth=join ",",@depth; + my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; + print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n"; + return 0; +} + +sub phase{ + $annotate_dir=$dir."annotate\/"; + mkdir ("$annotate_dir"); + print "clusters is to predict phase siRNA\n"; + my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; + print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; + return 0; +} + +sub class{ + print "clusters is ready to annotate by sources\n\n"; + my $nat=$options{'nat'}; + my $repeat=$options{'repeat'}; + my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; + print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; +} + +sub annotate{ + print "clusters is ready to annotate by gff file\n\n"; + my $file; + if (defined($options{'nat'})&&defined($options{'repeat'})) { + $file="$annotate_dir\/sample_class.anno"; + } + else{ + $file=$rpkm; + } + my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; + print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; + return 0; +} +sub get_genelist{ + + my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; + print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; +} + +sub dec{ + print "deg reading\n\n"; + my $deg_file=$options{'deg'}; + open IN,"<$deg_file"; + my @deg; + my $s=0; + while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + $deg[$s]=$aline; + my @ea=split/\s+/,$aline; + push @pairdir,"$ea[0]_VS_$ea[1]\/"; + #print "$deg[$s]\n"; + $s++; + } + close IN; + $deg_dir=$dir."deg\/"; + mkdir ("$deg_dir"); + my $max_process = 10; + my $pm = new Parallel::ForkManager( $max_process ); + my $number=@deg-1; + foreach(0..$number){ + $pm->start and next; + &dec_pel($deg[$_]); + $pm->finish; + } + $pm->wait_all_children; +} + +sub dec_pel{ + print "\n******************\nstart:\n"; + Time(); + my $sample=shift(@_); + my @each=split/\s+/,$sample; + print "$each[0]\t$each[1]\n"; + my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; + mkdir ("$deg_sample_dir"); + print "read: $read\n"; + print "deg_sample_dir: $deg_sample_dir\n"; + print "$id{$each[0]}\t$each[0]\n"; + print "$id{$each[1]}\t$each[1]\n"; + my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 + my $time2=time(); + print "end:\n*************************\n"; + Time(); + sleep 1; +} + +sub infor_merge{ + my ($input,$mark); + foreach (@pairdir) { + print "@pairdir\n"; + $mark.=" -mark $_ "; + $input.=" -i $dir/deg\/$_\/output_score\.txt "; + print "$input\n$mark\n"; + } + my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; + print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; +} + +sub infor_merge_no_dec{ + my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; +} + +sub genome_length{ + my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; + print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" + +} + +sub plot{ + $plot_dir="$dir\/plot\/"; + mkdir ("$plot_dir"); + my $span=defined($options{span})?$options{span}:50000; + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `; + "print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n"; + +} + +sub html{ + my $pathfile="$dir/path.txt"; + open PA,">$pathfile"; + print PA "$config\n"; + print PA "$preprocess\n"; + print PA "$dir"."rfam_match\n"; + print PA "$dir"."genome_match\n"; + print PA "$cluster_file\n"; + print PA "$annotate_dir\n"; + print PA "$plot_dir\n"; + if (defined($deg_dir)) { + print PA "$deg_dir\n"; + } + close PA; + my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} +################################################################################# +sub read_config{ + open CON,"<$config"; + while (my $aline=<CON>) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @filein,$tmp[0]; + push @mark,$tmp[1]; + #&check_rawdata($tmp[0]); + } + close CON; + if (@filein != @mark) { + #&printErr(); + die "Maybe config file have some wrong!!!\n"; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,122 @@ +<tool id="plant_sirna_v1" name="siRNA" veision="1.0.0"> + <description>tool for plant siRNA analisis</description> + + <requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="0.12.7">bowtie</requirement> + <requirement type="package" version="3.0.1">R</requirement> + <requirement type="package" version="0.0.13">fastx_toolkit </requirement> + <requirement type="package" version="1.96">threads</requirement> + <requirement type="package" version="1.06">Parallel-ForkManager</requirement> + <requirement type="package" version="2.59">SVG</requirement> + <requirement type="package" version="1.4_001">Boost-Graph</requirement> + </requirements> + + <command interpreter="perl">siRNA_pipeline.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + + -path \$SCRIPT_PATH + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## Do or not annotate siRNAs by function + #if $params.function_anno == "yes": + -nat $params.nat -repeat $params.repeat + #end if + + ## Do or not DEG + #if $degseq.degseq_analysis == "yes" : + -deg $degseq.deg + #end if + + -i $reads -config $config -n $hits -format $format -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -d $d -p $p -l $l -cen $cen -span $span > run.log + + </command> + + <inputs> + + <param name="config" type="data" label="Raw data configs file" /> + <param name="reads" type="data" label="Input Fasta. file of candidate microRNA sequence" /> + + <param name="format" type="select" lable=" data format" multiple="false"> + <option value="fastq">Raw data is fastq. format</option> + <option value="fasta">Raw data is fasta. format</option> + </param> + + + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/--> + + + <param name="gff" type="data" label="gff file" /> + <param name="mis" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" /> + <param name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" /> + <param name="d" type="integer" value="100" label="distance of tag to merged a cluster" /> + + <param name="p" type="select" lable="cluster method" multiple="false"> + <option value="F">conventional</option> + <option value="T">NIBLES</option> + </param> + <param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" /> + + + <conditional name="params"> + <param name="function_anno" type="select" label="Do or not annotate siRNAs by function"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="nat" type="data" label="atural antisense transcripts file" /> + <param name="repeat" type="data" label="repeat information file out of Repeatmasker" /> + </when> + </conditional> <!-- params --> + + <param name="cen" type="data" label="centromere file input" /> + <param name="span" type="integer" value="50000" label="plot span" /> + + <conditional name="degseq"> + <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="deg" type="data" label="file config of de sample" /> + </when> + </conditional> + + </inputs> + + <outputs> + <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/> + <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/> + + </outputs> + + <help> + + </help> + </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA_pipeline.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,524 @@ +#!/usr/bin/perl -w +my $version=1.00; +use strict; +use warnings; +use Getopt::Long; +use Getopt::Std; +use threads; +use threads::shared; +use Parallel::ForkManager; +use lib '/leofs/biotrans/chentt/perl_module/'; +#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq +print " +##################################### +# # +# sRNA cluster # +# # +##################################### +"; +########################################################################################### +my $usage="$0 +Options: +-i input file# raw data file +-tag string #raw data sample name +-g genome file +-f gff file + +-o workdir file +-path script path +-t int, number of threads [1] +-format fastq, fq, fasta or fa +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null +-mis int number of allowed mismatches when mapping reads to genome, default 0 +-rfam string, input file# rfam database file. +-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-v int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment + +-a string, ADAPTER string. default is ATCTCGTATG. +-n int max hits number,default 25; used in genome alignment +-d int distance of tag to merged a cluster; default 100 +-p cluster method F :conventional default is F + T :NIBLES +-l int the length of the upstream and downstream,default 1000;used in position annotate + +-nat natural antisense transcripts file +-repeat repeat information file out of Repeatmasker +-deg file config of de sample +-cen centromere file input +-span plot span, default 50000 +"; + +my %options; +GetOptions(\%options,"i:s@","tag:s@","g=s","phed:i","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h"); +#print help if that option is used +if($options{h}){die $usage;} + +my @filein=@{$options{'i'}}; +my @mark=@{$options{'tag'}}; + +#my $config=$options{'i'}; +my $genome_fa=$options{'g'}; +my $gff=$options{'f'}; + + +########################################################################################## +my $predir=`pwd`; +chomp $predir; +my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; + +my $path=$options{'path'}; + +my $t=defined($options{'t'})? $options{'t'}:1; #threads number + +my $mis=defined $options{'mis'} ? $options{'mis'}:0; + +my $mis_rfam=defined $options{'v'} ? $options{'v'}:0; + +my $hit=defined $options{'n'}?$options{'n'}:25; + +my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; + +my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; + +my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; + +my $format=$options{'format'}; +#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { +# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +#} + +my $adpter="ATCTCGTATG"; #adapter +if (defined $options{'a'}) {$adpter=$options{'a'};} + + +my $phred_qv=64; +if(defined $opts{'phred'}){$phred_qv=$opts{'phred'};} +my $sample_number; +my ($dir,$dir_tmp); +################################ MAIN ################################################## +print "\ncluster program start:"; +my $time=Time(); +make_dir_tmp(); + +my @clip; +my $mark; +my $sample_mark; + +my $config=$dir."/input_config"; +open CONFIG,">$config"; + for (my $i=0;$i<@filein;$i++) { + print CONFIG $filein[$i],"\t",$mark[$i],"\n"; + } +close CONFIG; +if (@filein != @mark) { + die "Maybe config file have some wrong!!!\n"; +} +$sample_number=@mark; +$mark=join "\t",@mark; +$sample_mark=join "\#",@mark; + + +#read_config(); + +trim_adapter_and_filter(); + +my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data +my $data2=$filter_out; ### mirbase not mapped reads +my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa"; ### rfam not mapped reads +my $bed=$dir."cluster\/"."sample\.bed"; +my $read=$dir."cluster\/"."sample_reads\.cluster"; +my $read_txt=$dir."cluster\/"."cluster\.txt"; +my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; +my $preprocess; +my $cluster_file; +my $annotate_dir; +my $deg_dir; +my $plot_dir; +my %id; +for (my $i=0;$i<@mark ;$i++) { + $id{$mark[$i]}=$i+4; +} + +print "\n######## tiandm test start ###########\n"; +print "\@mark: @mark\n\%id keys number:"; +print scalar keys %id; +print "\n"; +foreach my $kyess (keys %id){ + print $kyess," --> $id{$kyess}\n"; +} +print "\n######## tiandm test end ############\n"; +group_and_filter(); #collapse reads to tags + +rfam(); + +my @map_read; +my $map_tag=0; +genome(); + +bwt2bed(); + +cluster(); + +quantify(); + +phase(); + +if (defined($options{'nat'})&&defined($options{'repeat'})) { + class(); +} +else{ + get_genelist(); +} + +annotate(); + +genome_length(); + +plot(); + +my @pairdir; +if (defined($options{'deg'})) { + dec(); + infor_merge(); +} +else{infor_merge_no_dec()} +html(); +print "\ncluster program end:"; +Time(); +############################sub program################################################### +sub make_dir_tmp{ + + #make temporary directory + if(not -d "$workdir\/cluster_runs"){ + mkdir("$workdir\/cluster_runs"); + mkdir("$workdir\/cluster_runs\/ref\/"); + } + + $dir="$workdir\/cluster_runs\/"; + #print STDERR "mkdir $dir\n\n"; + return; +} + +#sub read_config{ +# open IN,"<$config"; +# while (my $aline=<IN>) { +# chomp $aline; +# my @tmp=split/\t/,$aline; +# push @filein,$tmp[0]; +# push @mark,$tmp[1]; +# } +# close IN; +# if (@filein != @mark) { +# die "Maybe config file have some wrong!!!\n"; +# } +# $sample_number=@mark; +# $mark=join "\t",@mark; +# $sample_mark=join "\#",@mark; +#} + + +sub trim_adapter_and_filter{ + my $time=time(); + $preprocess=$dir."preProcess/"; + mkdir $preprocess; + my $can_use_threads = eval 'use threads; 1'; + if ($can_use_threads) { + # Do processing using threads + my @filein1=@filein; my @mark1=@mark; + while (@filein1>0) { + my @thrs; my @res; + for (my $i=0;$i<$t ;$i++) { + last if(@filein1==0); + my $in=shift @filein1; + my $out=shift @mark1; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + $thrs[$i]=threads->create(\&clips,$in,$out); + } + for (my $i=0;$i<@thrs;$i++) { + $res[$i]=$thrs[$i]->join(); + } + } + } + else { +# Do not processing using threads + for (my $i=0;$i<@filein ;$i++) { + my $in=$filein[$i]; + my $out=$mark[$i]; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + &clips($in,$out); + } + } +} + +sub clips{ + my ($filein,$fileout)=@_; + my $adapter=$dir."preProcess\/$fileout\_clip\.fq"; + if($format eq "fq" || $format eq "fastq"){ + my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`; + } + if($format eq "fa" || $format eq "fasta"){ + my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`; + } + #my $clean=$dir."preProcess\/$fileout\_clean.fq"; + #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `; + return $fileout; +} + +sub group_and_filter{ + #my ($ins,$data)=@_; + my @ins=@clip; + my $str=""; + my $group_out_file=$dir."preProcess\/"."collapse_reads.fa"; + #print "$$ins[0]\t$$ins[0]\n"; + for (my $i=0;$i<@clip;$i++) { + $str .="-i $clip[$i] "; + #print "$$ins[$i]\n"; + } + my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`; + print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n"; + + my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa"; + my $tmpmark=join ",", @mark; + + my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $tmpmark`; + print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $tmpmark\n\n"; + my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`; + print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n"; + my $plot_l_D=`perl $path/Length_Distibution.pl -i $dir/preProcess/reads_length_distribution_after_count_filter.txt -o $dir/preProcess/length.html `; + print "perl $path\/Length_Distibution.pl -i $dir\/preProcess\/reads_length_distribution_after_count_filter.txt -o $dir\/preProcess\/length\.html\n\n"; + return 0; +} + +sub rfam{ + if (defined $options{'idx2'}) { + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}"); + }else{ + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir"); + } + my $tag=join "\\;" ,@mark; + my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`; + return 0; +} +sub genome{ + if(defined $options{'idx'}){ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; + }else{ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; + } + #=================== mapping sta =================================================== + my $map_file=$dir."genome_match\/genome_mapped\.fa"; + open (MAP,"<$map_file")||die"$!"; + print "\n#each sample mapping reads sta:\n\n"; + print "#$mark\ttotal\n"; + while (my $ID=<MAP>) { + chomp $ID; + my @tmp=split/\:/,$ID; + my @exp=split/\_/,$tmp[1]; + $exp[-1] =~ s/^x//; + for (my $i=0;$i<@exp ;$i++) { + $map_read[$i]+=$exp[$i]; + } + $map_tag++; + my $seq=<MAP>; + } + my $map_read=join"\t",@map_read; + print "$map_read\n\n"; + print "#total mapped tags:$map_read\n\n"; + close MAP; + return 0; +} + +sub bwt2bed{ + $cluster_file=$dir."cluster\/"; + mkdir ("$cluster_file"); + print "sam file changed to bed file\n"; + my ($file) = $dir."genome_match\/genome_mapped\.bwt"; + + my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; + print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; + return 0; +} + +sub cluster{ + print "tags is ready to merged clusters\n\n"; + my ($file) =$bed; + if ($cluster_mothod eq "F") { + my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; + print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; + } + elsif($cluster_mothod eq "T"){ + my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`; + print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n"; + } + else{print "\-p is wrong!\n\n";} + return 0; +} + + +sub quantify{ + print "clusters is ready to quantified\n\n"; + my @depth=@map_read; + pop @depth; + my $depth=join ",",@depth; + my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; + print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n"; + return 0; +} + +sub phase{ + $annotate_dir=$dir."annotate\/"; + mkdir ("$annotate_dir"); + print "clusters is to predict phase siRNA\n"; + my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; + print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; + return 0; +} + +sub class{ + print "clusters is ready to annotate by sources\n\n"; + my $nat=$options{'nat'}; + my $repeat=$options{'repeat'}; + my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; + print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; +} + +sub annotate{ + print "clusters is ready to annotate by gff file\n\n"; + my $file; + if (defined($options{'nat'})&&defined($options{'repeat'})) { + $file="$annotate_dir\/sample_class.anno"; + } + else{ + $file=$rpkm; + } + my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; + print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; + return 0; +} +sub get_genelist{ + + my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; + print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; +} + +sub dec{ + print "deg reading\n\n"; + my $deg_file=$options{'deg'}; + open IN,"<$deg_file"; + my @deg; + my $s=0; + while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + $deg[$s]=$aline; + my @ea=split/\s+/,$aline; + push @pairdir,"$ea[0]_VS_$ea[1]\/"; + #print "$deg[$s]\n"; + $s++; + } + close IN; + $deg_dir=$dir."deg\/"; + mkdir ("$deg_dir"); + my $max_process = 10; + my $pm = new Parallel::ForkManager( $max_process ); + my $number=@deg-1; + foreach(0..$number){ + $pm->start and next; + &dec_pel($deg[$_]); + $pm->finish; + } + $pm->wait_all_children; +} + +sub dec_pel{ + print "\n******************\nstart:\n"; + Time(); + my $sample=shift(@_); + my @each=split/\s+/,$sample; + print "$each[0]\t$each[1]\n"; + my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; + mkdir ("$deg_sample_dir"); + print "read: $read\n"; + print "deg_sample_dir: $deg_sample_dir\n"; + print "$id{$each[0]}\t$each[0]\n"; + print "$id{$each[1]}\t$each[1]\n"; + my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 + my $time2=time(); + print "end:\n*************************\n"; + Time(); + sleep 1; +} + +sub infor_merge{ + my ($input,$mark); + foreach (@pairdir) { + print "@pairdir\n"; + $mark.=" -mark $_ "; + $input.=" -i $dir/deg\/$_\/output_score\.txt "; + print "$input\n$mark\n"; + } + my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; + print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; +} + +sub infor_merge_no_dec{ + my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; +} + +sub genome_length{ + my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; + print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" + +} + +sub plot{ + $plot_dir="$dir\/plot\/"; + mkdir ("$plot_dir"); + my $span=defined($options{span})?$options{span}:50000; + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `; + "print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n"; + +} + +sub html{ + my $pathfile="$dir/path.txt"; + open PA,">$pathfile"; + print PA "$config\n"; + print PA "$preprocess\n"; + print PA "$dir"."rfam_match\n"; + print PA "$dir"."genome_match\n"; + print PA "$cluster_file\n"; + print PA "$annotate_dir\n"; + print PA "$plot_dir\n"; + if (defined($deg_dir)) { + print PA "$deg_dir\n"; + } + close PA; + my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} +#################################################################################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA_pipeline.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,164 @@ +<tool id="plant_sirna_v1" name="siRNA" veision="1.0.0"> + <description>tool for plant siRNA analisis</description> + + <requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="0.12.7">bowtie</requirement> + <requirement type="package" version="3.0.1">R</requirement> + <requirement type="package" version="0.0.13">fastx_toolkit </requirement> + <requirement type="package" version="1.96">threads</requirement> + <requirement type="package" version="1.06">Parallel-ForkManager</requirement> + <requirement type="package" version="2.59">SVG</requirement> + <requirement type="package" version="1.4_001">Boost-Graph</requirement> + </requirements> + + <command interpreter="perl">siRNA_pipeline.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + + -path \$SCRIPT_PATH + + #for $j, $s in enumerate( $series ) + ##rank_of_series=$j + -i ${s.input} + -tag ${s.tag} + #end for + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($reference_rfam.source) == "history": + bowtie-build "$reference_rfam.own_file" rfam; ln -s $reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $reference_rfam.index.fields.path + #end if + + + + ## Do or not annotate siRNAs by function + #if $params.function_anno == "yes": + -nat $params.nat -repeat $params.repeat + #end if + + ## Do or not DEG + #if $degseq.degseq_analysis == "yes" : + -deg $degseq.deg + #end if + + -format $format -phred $phred -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v -a $a -n $mapnt -d $d -p $p -l $l -cen $cen -span $span > run.log + + </command> + + <inputs> + + <repeat name="series" title="Series"> + <param name="input" type="data" label="Raw data file"/> + <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/> + </repeat> + + <param name="format" type="select" lable="raw data format" multiple="false"> + <option value="fastq">Raw data is fastq. format</option> + <option value="fasta">Raw data is fasta. format</option> + </param> + + <param name="phred" type="select" lable="input quals are Phred+64 or Phred+33" multiple="false"> + <option value="64">Phred+64</option> + <option value="33" selected="true">Phred+33</option> + </param> + + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> <!--param type="data" name="index" label="genome sequence bowtie index"/--> + + <conditional name="reference_rfam"> + <param name="source" type="select" label="Will you select a rfam reference from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference " help="If your rfam of interest is not listed, contact the Galaxy team"> + <options from_data_table="rfam_bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference" /> + </when> + </conditional> + + <param name="gff" type="data" label="gff file" /> + <param name="mis" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" /> + <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches"/> + <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" /> + <param name="mapnt" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" /> + <param name="d" type="integer" value="100" label="distance of tag to merged a cluster" /> + + <param name="p" type="select" lable="cluster method" multiple="false"> + <option value="F">conventional</option> + <option value="T">NIBLES</option> + </param> + <param name="l" type="integer" value="1000" label="the length of the upstream and downstream,used in position annotate" /> + + + <conditional name="params"> + <param name="function_anno" type="select" label="Do or not annotate siRNAs by function"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="nat" type="data" label="atural antisense transcripts file" /> + <param name="repeat" type="data" label="repeat information file out of Repeatmasker" /> + </when> + </conditional> <!-- params --> + + <param name="cen" type="data" label="centromere file input" /> + <param name="span" type="integer" value="50000" label="plot span" /> + + <conditional name="degseq"> + <param name="degseq_analysis" type="select" label="Do or not identify Difference Expression Clusters"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="deg" type="data" label="file config of de sample" /> + </when> + </conditional> + + </inputs> + + <outputs> + <data format="txt" name="siRNA cluster" from_work_dir="cluster_runs/total.result" label="${tool.name} on ${on_string}: siRNA cluster"/> + <data format="html" name="analysis result" from_work_dir="cluster_runs/result.html" label="${tool.name} on ${on_string}: analysis result"/> + + </outputs> + + <help> + + </help> + </tool>
--- a/tool_dependencies.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/tool_dependencies.xml Fri Dec 05 00:11:02 2014 -0500 @@ -26,6 +26,76 @@ </actions> </install> </package> + <package name="R" version="3.0.1"> + <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> + + <install version="1.0"> + <actions> + <action type="set_environment_for_install"> + <repository changeset_revision="c5ff6dd33c79" name="package_r_3_0_1" owner="iuc" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.0.1" /> + </repository> + </action> + <action type="shell_command">R CMD BATCH $REPOSITORY_INSTALL_DIR/install_DEG.R </action> + <action type="shell_command">echo "export PATH=$PATH" > $INSTALL_DIR/env.sh </action> + <action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action> + + </actions> + </install> + </package> + + <package name="threads" version="1.96"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-1.96.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + + <package name="Parallel-ForkManager" version="1.06"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/Parallel-ForkManager-1.06.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> + + <package name="Boost-Graph" version="1.4_001"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.cpan.org/authors/id/D/DU/DUFFEE/Boost-Graph-1.4_001.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/perl5</action> + <action type="shell_command"> + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + ls && + make && + make install + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> + </action> + </actions> + </install> + </package> <package name="SVG" version="2.59"> <install version="1.0">