Mercurial > repos > big-tiandm > sirna_plant

--- a/siRNA.pl	Thu Oct 23 22:46:32 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,522 +0,0 @@
-#!/usr/bin/perl -w
-my $version=1.00;
-use strict;
-use warnings;
-use Getopt::Long;
-use Getopt::Std;
-use threads;
-use threads::shared;
-use Parallel::ForkManager;
-use lib '/leofs/biotrans/chentt/perl_module/';
-#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq
-print "
-#####################################
-#                                   #
-#          sRNA cluster             #
-#                                   #
-#####################################
-";
-###########################################################################################
-my $usage="$0
-Options:
--i input file# raw data file
--tag string #raw data sample name
--g  genome file
--f  gff file
-
--o  workdir file
--path  script path
--t  int,    number of threads [1]
--format  fastq, fq, fasta or fa
--idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
-		string must be the prefix of the bowtie index. For instance, if
-		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
-		the prefix is 'h_sapiens_37_asm'.##can be null
--mis  int     number of allowed mismatches when mapping reads to genome, default 0
--rfam  string,  input file# rfam database file.
--idx2  string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
-		string must be the prefix of the bowtie index. For instance, if
-		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
-		the prefix is 'h_sapiens_37_asm'.##can be null
-
--v  int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment
-
--a string,  ADAPTER string. default is ATCTCGTATG.
--n  int max hits number,default 25; used in genome alignment
--d  int distance of tag to merged a cluster; default 100
--p  cluster method F :conventional default is F
-				   T :NIBLES
--l  int the length of the upstream and downstream,default 1000;used in position annotate
-
--nat  natural antisense transcripts file
--repeat  repeat information file out of Repeatmasker
--deg  file config of de sample
--cen  centromere file input
--span  plot span, default 50000
-";
-
-my %options;
-GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");
-#print help if that option is used
-if($options{h}){die $usage;}
-
-my @filein=@{$options{'i'}};
-my @mark=@{$options{'tag'}};
-
-#my $config=$options{'i'};
-my $genome_fa=$options{'g'};
-my $gff=$options{'f'};
-
-
-##########################################################################################
-my $predir=`pwd`;
-chomp $predir;
-my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;
-
-my $path=$options{'path'};
-
-my $t=defined($options{'t'})? $options{'t'}:1; #threads number
-
-my $mis=defined $options{'mis'} ? $options{'mis'}:0;
-
-my $mis_rfam=defined $options{'v'} ? $options{'v'}:0;
-
-my $hit=defined $options{'n'}?$options{'n'}:25;
-
-my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;
-
-my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;
-
-my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";
-
-my $format=$options{'format'};
-#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
-#	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
-#}
-
-my $adpter="ATCTCGTATG";  #adapter
-if (defined $options{'a'}) {$adpter=$options{'a'};}
-
-
-my $phred_qv=64;
-my $sample_number;
-my ($dir,$dir_tmp);
-################################  MAIN  ##################################################
-print "\ncluster program start:";
-my $time=Time();
-make_dir_tmp();
-
-my @clip;
-my $mark;
-my $sample_mark;
-
-my $config=$dir."/input_config";
-open CONFIG,">$config";
-	for (my $i=0;$i<@filein;$i++) {
-		print CONFIG $filein[$i],"\t",$mark[$i],"\n";
-	}
-close CONFIG;
-if (@filein != @mark) {
-	die "Maybe config file have some wrong!!!\n";
-}
-$sample_number=@mark;
-$mark=join "\t",@mark;
-$sample_mark=join "\#",@mark;
-
-
-#read_config();
-
-trim_adapter_and_filter();
-
-my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data
-my $data2=$filter_out;   ### mirbase not mapped reads
-my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa";   ### rfam not mapped reads
-my $bed=$dir."cluster\/"."sample\.bed";
-my $read=$dir."cluster\/"."sample_reads\.cluster";
-my $read_txt=$dir."cluster\/"."cluster\.txt";
-my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster";
-my $preprocess;
-my $cluster_file;
-my $annotate_dir;
-my $deg_dir;
-my %id;
-for (my $i=0;$i<@mark ;$i++) {
-	$id{$mark[$i]}=$i+4;
-}
-group_and_filter();   #collapse reads to tags
-
-rfam();
-
-my @map_read;
-my $map_tag=0;
-genome();
-
-bwt2bed();
-
-cluster();
-
-quantify();
-
-phase();
-
-if (defined($options{'nat'})&&defined($options{'repeat'})) {
-	class();
-}
-else{
-	get_genelist();
-}
-
-annotate();
-
-genome_length();
-
-plot();
-
-my @pairdir;
-if (defined($options{'deg'})) {
-	dec();
-	infor_merge();
-}
-html();
-print "\ncluster program end:";
-Time();
-############################sub program###################################################
-sub make_dir_tmp{
-
-	#make temporary directory
-	if(not -d "$workdir\/cluster_runs_$time"){
-		mkdir("$workdir\/cluster_runs_$time");
-		mkdir("$workdir\/cluster_runs_$time\/ref\/");
-	}
-
-	$dir="$workdir\/cluster_runs_$time\/";
-	print STDERR "mkdir $dir\n\n";
-	return;
-}
-
-#sub read_config{
-#	open IN,"<$config";
-#	while (my $aline=<IN>) {
-#		chomp $aline;
-#		my @tmp=split/\t/,$aline;
-#		push @filein,$tmp[0];
-#		push @mark,$tmp[1];
-#	}
-#	close IN;
-#	if (@filein != @mark) {
-#		die "Maybe config file have some wrong!!!\n";
-#	}
-#	$sample_number=@mark;
-#	$mark=join "\t",@mark;
-#	$sample_mark=join "\#",@mark;
-#}
-
-
-sub trim_adapter_and_filter{
-	my $time=time();
-	$preprocess=$dir."preProcess/";
-	mkdir $preprocess;
-	my $can_use_threads = eval 'use threads; 1';
-	if ($can_use_threads) {
-	# Do processing using threads
-		my @filein1=@filein; my @mark1=@mark;
-		while (@filein1>0) {
-			my @thrs; my @res;
-			for (my $i=0;$i<$t ;$i++) {
-				last if(@filein1==0);
-				my $in=shift @filein1;
-				my $out=shift @mark1;
-				push @clip,$dir."preProcess\/$out\_clip\.fq";
-				$thrs[$i]=threads->create(\&clips,$in,$out);
-			}
-			for (my $i=0;$i<@thrs;$i++) {
-				$res[$i]=$thrs[$i]->join();
-			}
-		}
-	}
-	else {
-# Do not processing using threads
-		for (my $i=0;$i<@filein ;$i++) {
-			my $in=$filein[$i];
-			my $out=$mark[$i];
-			push @clip,$dir."preProcess\/$out\_clip\.fq";
-			&clips($in,$out);
-		}
-	}
-}
-
-sub clips{
-	my ($filein,$fileout)=@_;
-	my $adapter=$dir."preProcess\/$fileout\_clip\.fq";
-	if($format eq "fq" || $format eq "fastq"){
-		my $clip=`fastx_clipper -a $adpter -M 6  -Q $phred_qv -i $filein -o $adapter`;
-	}
-	if($format eq "fa" || $format eq "fasta"){
-		my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`;
-	}
-	#my $clean=$dir."preProcess\/$fileout\_clean.fq";
-	#my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `;
-	return $fileout;
-}
-
-sub group_and_filter{
-	#my ($ins,$data)=@_;
-	my @ins=@clip;
-	my $str="";
-	my $group_out_file=$dir."preProcess\/"."collapse_reads.fa";
-	#print "$$ins[0]\t$$ins[0]\n";
-	for (my $i=0;$i<@clip;$i++) {
-		$str .="-i $clip[$i] ";
-		#print "$$ins[$i]\n";
-	}
-	my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`;
-	print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n";
-
-	my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa";
-	my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark`;
-	print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark\n\n";
-	my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`;
-	print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n";
-	return 0;
-}
-
-sub rfam{
-	if (defined $options{'idx2'}) {
-		system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}");
-	}else{
-		system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir");
-	}
-	my $tag=join "\\;" ,@mark;
-	my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`;
-	return 0;
-}
-sub genome{
-	if(defined $options{'idx'}){
-		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ;
-	}else{
-		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ;
-	}
-	#=================== mapping sta ===================================================
-	my $map_file=$dir."genome_match\/genome_mapped\.fa";
-	open (MAP,"<$map_file")||die"$!";
-	print "\n#each sample mapping reads sta:\n\n";
-	print "#$mark\ttotal\n";
-	while (my $ID=<MAP>) {
-		chomp $ID;
-		my @tmp=split/\:/,$ID;
-		my @exp=split/\_/,$tmp[1];
-		$exp[-1] =~ s/^x//;
-		for (my $i=0;$i<@exp ;$i++) {
-			$map_read[$i]+=$exp[$i];
-		}
-		$map_tag++;
-		my $seq=<MAP>;
-	}
-	my $map_read=join"\t",@map_read;
-	print "$map_read\n\n";
-	print "#total mapped tags:$map_read\n\n";
-	close MAP;
-	return 0;
-}
-
-sub bwt2bed{
-	$cluster_file=$dir."cluster\/";
-	mkdir ("$cluster_file");
-	print "sam file changed to bed file\n";
-	my ($file) = $dir."genome_match\/genome_mapped\.bwt";
-
-	my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `;
-	print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n";
-	return 0;
-}
-
-sub cluster{
-	print "tags is ready to merged clusters\n\n";
-	my ($file) =$bed;
-	if ($cluster_mothod eq "F") {
-		my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`;
-		print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n";
-	}
-	elsif($cluster_mothod eq "T"){
-		my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -mark $sample_mark`;
-		print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -mark $sample_mark\n\n";
-	}
-	else{print "\-p is wrong!\n\n";}
-	return 0;
-}
-
-
-sub quantify{
-	print "clusters is ready to quantified\n\n";
-	my @depth=@map_read;
-	pop @depth;
-	my $depth=join ",",@depth;
-	my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`;
-	print "perl $path\/quantify.pl -i $read -d $depth -o $rpkm\n\n\n";
-	return 0;
-}
-
-sub phase{
-	$annotate_dir=$dir."annotate\/";
-	mkdir ("$annotate_dir");
-	print "clusters is to predict phase siRNA\n";
-	my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`;
-	print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n";
-	return 0;
-}
-
-sub class{
-	print "clusters is ready to annotate by sources\n\n";
-	my $nat=$options{'nat'};
-	my $repeat=$options{'repeat'};
-	my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`;
-	print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n";
-}
-
-sub annotate{
-	print "clusters is ready to annotate by gff file\n\n";
-	my $file;
-	if (defined($options{'nat'})&&defined($options{'repeat'})) {
-		$file="$annotate_dir\/sample_class.anno";
-	}
-	else{
-		$file=$rpkm;
-	}
-	my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`;
-	print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n";
-	return 0;
-}
-sub get_genelist{
-
-	my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`;
-	print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt";
-}
-
-sub dec{
-	print "deg reading\n\n";
-	my $deg_file=$options{'deg'};
-	open IN,"<$deg_file";
-	my @deg;
-	my $s=0;
-		while (my $aline=<IN>) {
-		chomp $aline;
-		next if($aline=~/^\#/);
-		$deg[$s]=$aline;
-		my @ea=split/\s+/,$aline;
-		push @pairdir,"$ea[0]_VS_$ea[1]\/";
-		#print "$deg[$s]\n";
-		$s++;
-	}
-	close IN;
-	$deg_dir=$dir."deg\/";
-	mkdir ("$deg_dir");
-	my $max_process = 10;
-	my $pm = new Parallel::ForkManager( $max_process );
-	my $number=@deg-1;
-	foreach(0..$number){
-		$pm->start and next;
-		&dec_pel($deg[$_]);
-		$pm->finish;
-	}
-	$pm->wait_all_children;
-}
-
-sub dec_pel{
-	print "start:\n";
-	Time();
-	my $sample=shift(@_);
-	my @each=split/\s+/,$sample;
-	print "$each[0]\t$each[1]\n";
-	my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/";
-	mkdir ("$deg_sample_dir");
-	my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2
-	my $time2=time();
-	print "end:\n";
-	Time();
-	sleep 1;
-}
-
-sub infor_merge{
-	my ($input,$mark);
-	foreach (@pairdir) {
-		print "@pairdir\n";
-		$mark.=" -mark $_ ";
-		$input.=" -i $dir/deg\/$_\/output_score\.txt ";
-		print "$input\n$mark\n";
-	}
-	my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `;
-	print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n";
-}
-
-sub genome_length{
-	my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`;
-	print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n"
-
-}
-
-sub plot{
-	my $plot_file="$dir\/plot\/";
-	mkdir ("$plot_file");
-	my $genome_plot="$dir\/plot\/genome\/";
-	mkdir ("$genome_plot");
-	#genome cluster
-	my $span=defined($options{span})?$options{span}:50000;
-	foreach  (1..$sample_number) {
-		my $mark=$mark[$_-1];
-		my $cen="";
-		if (defined $options{cen}) {
-			$cen="-cen $options{cen}";
-		}
-		my $plot=`perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt`;
-		print "perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt\n\n";
-	}
-
-	my $chr_plot_dir="$dir\/plot\/chr\/";
-	mkdir("$chr_plot_dir");
-	my %chr;
-	open LEN,"<$dir\/ref\/genome\.length";
-	while (my $aline=<LEN>) {
-		next if($aline=~/^\#/);
-		chomp $aline;
-		my @temp=split/\t/,$aline;
-		$chr{$temp[0]}=$temp[1];
-	}
-	close LEN;
-	foreach my $chr (sort keys %chr) {
-		my $cen="";
-		if (defined $options{cen}) {
-			$cen="-cen $options{cen}";
-		}
-		my $chr_plot=`perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span  -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html`;
-		print "perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span  -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html\n";
-	}
-}
-
-sub html{
-	my $pathfile="$dir/path.txt";
-	open PA,">$pathfile";
-	print PA "$config\n";
-	print PA "$preprocess\n";
-	print PA "$dir"."rfam_match\n";
-	print PA "$dir"."genome_match\n";
-	print PA "$cluster_file\n";
-	print PA "$annotate_dir\n";
-	print PA "$deg_dir\n";
-	close PA;
-	my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
-}
-
-sub Time{
-	my $time=time();
-	my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
-	$month++;
-	$year+=1900;
-	if (length($sec) == 1) {$sec = "0"."$sec";}
-	if (length($min) == 1) {$min = "0"."$min";}
-	if (length($hour) == 1) {$hour = "0"."$hour";}
-	if (length($day) == 1) {$day = "0"."$day";}
-	if (length($month) == 1) {$month = "0"."$month";}
-	print "$year-$month-$day $hour:$min:$sec\n";
-	return("$year-$month-$day-$hour-$min-$sec");
-}
-#################################################################################