# HG changeset patch # User big-tiandm # Date 1414118799 14400 # Node ID d317cee86d4b69fc8fdc514c8850fd6d43cc1152 # Parent 6aa0e8d63b17589a328d472c1b1190c8af1af686 Deleted selected files diff -r 6aa0e8d63b17 -r d317cee86d4b siRNA.pl --- a/siRNA.pl Thu Oct 23 22:46:32 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,522 +0,0 @@ -#!/usr/bin/perl -w -my $version=1.00; -use strict; -use warnings; -use Getopt::Long; -use Getopt::Std; -use threads; -use threads::shared; -use Parallel::ForkManager; -use lib '/leofs/biotrans/chentt/perl_module/'; -#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq -print " -##################################### -# # -# sRNA cluster # -# # -##################################### -"; -########################################################################################### -my $usage="$0 -Options: --i input file# raw data file --tag string #raw data sample name --g genome file --f gff file - --o workdir file --path script path --t int, number of threads [1] --format fastq, fq, fasta or fa --idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null --mis int number of allowed mismatches when mapping reads to genome, default 0 --rfam string, input file# rfam database file. --idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null - --v int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment - --a string, ADAPTER string. default is ATCTCGTATG. --n int max hits number,default 25; used in genome alignment --d int distance of tag to merged a cluster; default 100 --p cluster method F :conventional default is F - T :NIBLES --l int the length of the upstream and downstream,default 1000;used in position annotate - --nat natural antisense transcripts file --repeat repeat information file out of Repeatmasker --deg file config of de sample --cen centromere file input --span plot span, default 50000 -"; - -my %options; -GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h"); -#print help if that option is used -if($options{h}){die $usage;} - -my @filein=@{$options{'i'}}; -my @mark=@{$options{'tag'}}; - -#my $config=$options{'i'}; -my $genome_fa=$options{'g'}; -my $gff=$options{'f'}; - - -########################################################################################## -my $predir=`pwd`; -chomp $predir; -my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; - -my $path=$options{'path'}; - -my $t=defined($options{'t'})? $options{'t'}:1; #threads number - -my $mis=defined $options{'mis'} ? $options{'mis'}:0; - -my $mis_rfam=defined $options{'v'} ? $options{'v'}:0; - -my $hit=defined $options{'n'}?$options{'n'}:25; - -my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; - -my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; - -my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; - -my $format=$options{'format'}; -#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { -# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; -#} - -my $adpter="ATCTCGTATG"; #adapter -if (defined $options{'a'}) {$adpter=$options{'a'};} - - -my $phred_qv=64; -my $sample_number; -my ($dir,$dir_tmp); -################################ MAIN ################################################## -print "\ncluster program start:"; -my $time=Time(); -make_dir_tmp(); - -my @clip; -my $mark; -my $sample_mark; - -my $config=$dir."/input_config"; -open CONFIG,">$config"; - for (my $i=0;$i<@filein;$i++) { - print CONFIG $filein[$i],"\t",$mark[$i],"\n"; - } -close CONFIG; -if (@filein != @mark) { - die "Maybe config file have some wrong!!!\n"; -} -$sample_number=@mark; -$mark=join "\t",@mark; -$sample_mark=join "\#",@mark; - - -#read_config(); - -trim_adapter_and_filter(); - -my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data -my $data2=$filter_out; ### mirbase not mapped reads -my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa"; ### rfam not mapped reads -my $bed=$dir."cluster\/"."sample\.bed"; -my $read=$dir."cluster\/"."sample_reads\.cluster"; -my $read_txt=$dir."cluster\/"."cluster\.txt"; -my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; -my $preprocess; -my $cluster_file; -my $annotate_dir; -my $deg_dir; -my %id; -for (my $i=0;$i<@mark ;$i++) { - $id{$mark[$i]}=$i+4; -} -group_and_filter(); #collapse reads to tags - -rfam(); - -my @map_read; -my $map_tag=0; -genome(); - -bwt2bed(); - -cluster(); - -quantify(); - -phase(); - -if (defined($options{'nat'})&&defined($options{'repeat'})) { - class(); -} -else{ - get_genelist(); -} - -annotate(); - -genome_length(); - -plot(); - -my @pairdir; -if (defined($options{'deg'})) { - dec(); - infor_merge(); -} -html(); -print "\ncluster program end:"; -Time(); -############################sub program################################################### -sub make_dir_tmp{ - - #make temporary directory - if(not -d "$workdir\/cluster_runs_$time"){ - mkdir("$workdir\/cluster_runs_$time"); - mkdir("$workdir\/cluster_runs_$time\/ref\/"); - } - - $dir="$workdir\/cluster_runs_$time\/"; - print STDERR "mkdir $dir\n\n"; - return; -} - -#sub read_config{ -# open IN,"<$config"; -# while (my $aline=) { -# chomp $aline; -# my @tmp=split/\t/,$aline; -# push @filein,$tmp[0]; -# push @mark,$tmp[1]; -# } -# close IN; -# if (@filein != @mark) { -# die "Maybe config file have some wrong!!!\n"; -# } -# $sample_number=@mark; -# $mark=join "\t",@mark; -# $sample_mark=join "\#",@mark; -#} - - -sub trim_adapter_and_filter{ - my $time=time(); - $preprocess=$dir."preProcess/"; - mkdir $preprocess; - my $can_use_threads = eval 'use threads; 1'; - if ($can_use_threads) { - # Do processing using threads - my @filein1=@filein; my @mark1=@mark; - while (@filein1>0) { - my @thrs; my @res; - for (my $i=0;$i<$t ;$i++) { - last if(@filein1==0); - my $in=shift @filein1; - my $out=shift @mark1; - push @clip,$dir."preProcess\/$out\_clip\.fq"; - $thrs[$i]=threads->create(\&clips,$in,$out); - } - for (my $i=0;$i<@thrs;$i++) { - $res[$i]=$thrs[$i]->join(); - } - } - } - else { -# Do not processing using threads - for (my $i=0;$i<@filein ;$i++) { - my $in=$filein[$i]; - my $out=$mark[$i]; - push @clip,$dir."preProcess\/$out\_clip\.fq"; - &clips($in,$out); - } - } -} - -sub clips{ - my ($filein,$fileout)=@_; - my $adapter=$dir."preProcess\/$fileout\_clip\.fq"; - if($format eq "fq" || $format eq "fastq"){ - my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`; - } - if($format eq "fa" || $format eq "fasta"){ - my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`; - } - #my $clean=$dir."preProcess\/$fileout\_clean.fq"; - #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `; - return $fileout; -} - -sub group_and_filter{ - #my ($ins,$data)=@_; - my @ins=@clip; - my $str=""; - my $group_out_file=$dir."preProcess\/"."collapse_reads.fa"; - #print "$$ins[0]\t$$ins[0]\n"; - for (my $i=0;$i<@clip;$i++) { - $str .="-i $clip[$i] "; - #print "$$ins[$i]\n"; - } - my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`; - print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n"; - - my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa"; - my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark`; - print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark\n\n"; - my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`; - print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n"; - return 0; -} - -sub rfam{ - if (defined $options{'idx2'}) { - system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}"); - }else{ - system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir"); - } - my $tag=join "\\;" ,@mark; - my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`; - return 0; -} -sub genome{ - if(defined $options{'idx'}){ - system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; - }else{ - system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; - } - #=================== mapping sta =================================================== - my $map_file=$dir."genome_match\/genome_mapped\.fa"; - open (MAP,"<$map_file")||die"$!"; - print "\n#each sample mapping reads sta:\n\n"; - print "#$mark\ttotal\n"; - while (my $ID=) { - chomp $ID; - my @tmp=split/\:/,$ID; - my @exp=split/\_/,$tmp[1]; - $exp[-1] =~ s/^x//; - for (my $i=0;$i<@exp ;$i++) { - $map_read[$i]+=$exp[$i]; - } - $map_tag++; - my $seq=; - } - my $map_read=join"\t",@map_read; - print "$map_read\n\n"; - print "#total mapped tags:$map_read\n\n"; - close MAP; - return 0; -} - -sub bwt2bed{ - $cluster_file=$dir."cluster\/"; - mkdir ("$cluster_file"); - print "sam file changed to bed file\n"; - my ($file) = $dir."genome_match\/genome_mapped\.bwt"; - - my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; - print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; - return 0; -} - -sub cluster{ - print "tags is ready to merged clusters\n\n"; - my ($file) =$bed; - if ($cluster_mothod eq "F") { - my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; - print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; - } - elsif($cluster_mothod eq "T"){ - my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -mark $sample_mark`; - print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -mark $sample_mark\n\n"; - } - else{print "\-p is wrong!\n\n";} - return 0; -} - - -sub quantify{ - print "clusters is ready to quantified\n\n"; - my @depth=@map_read; - pop @depth; - my $depth=join ",",@depth; - my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; - print "perl $path\/quantify.pl -i $read -d $depth -o $rpkm\n\n\n"; - return 0; -} - -sub phase{ - $annotate_dir=$dir."annotate\/"; - mkdir ("$annotate_dir"); - print "clusters is to predict phase siRNA\n"; - my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; - print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; - return 0; -} - -sub class{ - print "clusters is ready to annotate by sources\n\n"; - my $nat=$options{'nat'}; - my $repeat=$options{'repeat'}; - my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; - print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; -} - -sub annotate{ - print "clusters is ready to annotate by gff file\n\n"; - my $file; - if (defined($options{'nat'})&&defined($options{'repeat'})) { - $file="$annotate_dir\/sample_class.anno"; - } - else{ - $file=$rpkm; - } - my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; - print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; - return 0; -} -sub get_genelist{ - - my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; - print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; -} - -sub dec{ - print "deg reading\n\n"; - my $deg_file=$options{'deg'}; - open IN,"<$deg_file"; - my @deg; - my $s=0; - while (my $aline=) { - chomp $aline; - next if($aline=~/^\#/); - $deg[$s]=$aline; - my @ea=split/\s+/,$aline; - push @pairdir,"$ea[0]_VS_$ea[1]\/"; - #print "$deg[$s]\n"; - $s++; - } - close IN; - $deg_dir=$dir."deg\/"; - mkdir ("$deg_dir"); - my $max_process = 10; - my $pm = new Parallel::ForkManager( $max_process ); - my $number=@deg-1; - foreach(0..$number){ - $pm->start and next; - &dec_pel($deg[$_]); - $pm->finish; - } - $pm->wait_all_children; -} - -sub dec_pel{ - print "start:\n"; - Time(); - my $sample=shift(@_); - my @each=split/\s+/,$sample; - print "$each[0]\t$each[1]\n"; - my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; - mkdir ("$deg_sample_dir"); - my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 - my $time2=time(); - print "end:\n"; - Time(); - sleep 1; -} - -sub infor_merge{ - my ($input,$mark); - foreach (@pairdir) { - print "@pairdir\n"; - $mark.=" -mark $_ "; - $input.=" -i $dir/deg\/$_\/output_score\.txt "; - print "$input\n$mark\n"; - } - my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; - print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; -} - -sub genome_length{ - my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; - print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" - -} - -sub plot{ - my $plot_file="$dir\/plot\/"; - mkdir ("$plot_file"); - my $genome_plot="$dir\/plot\/genome\/"; - mkdir ("$genome_plot"); - #genome cluster - my $span=defined($options{span})?$options{span}:50000; - foreach (1..$sample_number) { - my $mark=$mark[$_-1]; - my $cen=""; - if (defined $options{cen}) { - $cen="-cen $options{cen}"; - } - my $plot=`perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt`; - print "perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt\n\n"; - } - - my $chr_plot_dir="$dir\/plot\/chr\/"; - mkdir("$chr_plot_dir"); - my %chr; - open LEN,"<$dir\/ref\/genome\.length"; - while (my $aline=) { - next if($aline=~/^\#/); - chomp $aline; - my @temp=split/\t/,$aline; - $chr{$temp[0]}=$temp[1]; - } - close LEN; - foreach my $chr (sort keys %chr) { - my $cen=""; - if (defined $options{cen}) { - $cen="-cen $options{cen}"; - } - my $chr_plot=`perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html`; - print "perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html\n"; - } -} - -sub html{ - my $pathfile="$dir/path.txt"; - open PA,">$pathfile"; - print PA "$config\n"; - print PA "$preprocess\n"; - print PA "$dir"."rfam_match\n"; - print PA "$dir"."genome_match\n"; - print PA "$cluster_file\n"; - print PA "$annotate_dir\n"; - print PA "$deg_dir\n"; - close PA; - my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; -} - -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day-$hour-$min-$sec"); -} -#################################################################################