diff siRNA.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.pl	Fri Dec 05 00:11:02 2014 -0500
@@ -0,0 +1,403 @@
+#!/usr/bin/perl -w
+my $version=1.00;
+use strict;
+use warnings;
+use Getopt::Long;
+use Getopt::Std;
+use threads;
+use threads::shared;
+use Parallel::ForkManager;
+use lib '/leofs/biotrans/chentt/perl_module/';
+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq 
+print "
+#####################################
+#                                   # 
+#          sRNA cluster             # 
+#                                   #
+#####################################
+";
+###########################################################################################
+my $usage="$0
+Options:
+-i input file# fasta
+-config input file 
+-g  genome file
+-f  gff file
+
+-o  workdir file
+-path  script path
+-t  int,    number of threads [1]
+-format  fastq, fq, fasta or fa
+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
+		string must be the prefix of the bowtie index. For instance, if
+		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+		the prefix is 'h_sapiens_37_asm'.##can be null
+-mis  int     number of allowed mismatches when mapping reads to genome, default 0
+
+-n  int max hits number,default 25; used in genome alignment
+-d  int distance of tag to merged a cluster; default 100
+-p  cluster method F :conventional default is F
+				   T :NIBLES 
+-l  int the length of the upstream and downstream,default 1000;used in position annotate
+
+-nat  natural antisense transcripts file
+-repeat  repeat information file out of Repeatmasker
+-deg  file config of de sample
+-cen  centromere file input
+-span  plot span, default 50000
+";
+
+my %options;
+GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h");
+#print help if that option is used
+if($options{h}){die $usage;}
+
+my $filein=$options{'i'};
+
+#my $config=$options{'i'};
+my $genome_fa=$options{'g'};
+my $gff=$options{'f'};
+
+
+##########################################################################################
+my $predir=`pwd`;
+chomp $predir;
+my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;
+
+my $path=$options{'path'};
+
+my $t=defined($options{'t'})? $options{'t'}:1; #threads number
+
+my $mis=defined $options{'mis'} ? $options{'mis'}:0;
+
+
+my $hit=defined $options{'n'}?$options{'n'}:25;
+
+my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;
+
+my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;
+
+my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";
+
+my $format=$options{'format'};
+#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { 
+#	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
+#}
+
+
+
+my $sample_number;
+my ($dir,$dir_tmp);
+################################  MAIN  ##################################################
+print "\ncluster program start:";
+my $time=Time();
+make_dir_tmp();
+
+my $mark;
+my $sample_mark;
+
+my $config=$opts{'config'};
+my (@filein,@mark);
+&read_config();
+$sample_number=@mark;
+$mark=join "\t",@mark;
+$sample_mark=join "\#",@mark;
+
+
+
+my $data3=$filein;   ### rfam not mapped reads
+genome();
+
+my $bed=$dir."cluster\/"."sample\.bed";
+my $read=$dir."cluster\/"."sample_reads\.cluster";
+my $read_txt=$dir."cluster\/"."cluster\.txt";
+my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster";
+my $preprocess;
+my $cluster_file;
+my $annotate_dir;
+my $deg_dir;
+my $plot_dir;
+my %id;
+for (my $i=0;$i<@mark ;$i++) {
+	$id{$mark[$i]}=$i+4;
+}
+
+
+my @map_read;
+my $map_tag=0;
+
+bwt2bed();
+
+cluster();
+
+quantify();
+
+phase();
+
+if (defined($options{'nat'})&&defined($options{'repeat'})) {
+	class();
+}
+else{
+	get_genelist();
+}
+
+annotate();
+
+genome_length();
+
+plot();
+
+my @pairdir;
+if (defined($options{'deg'})) {
+	dec();
+	infor_merge();
+}
+else{infor_merge_no_dec()}
+html();
+print "\ncluster program end:";
+Time();
+############################sub program###################################################
+sub make_dir_tmp{
+
+	#make temporary directory
+	if(not -d "$workdir\/cluster_runs"){
+		mkdir("$workdir\/cluster_runs");
+		mkdir("$workdir\/cluster_runs\/ref\/");
+	}
+
+	$dir="$workdir\/cluster_runs\/";
+	#print STDERR "mkdir $dir\n\n";
+	return;
+}
+
+sub genome{
+	if(defined $options{'idx'}){
+		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ;
+	}else{
+		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ;
+	}
+	#=================== mapping sta ===================================================
+	my $map_file=$dir."genome_match\/genome_mapped\.fa";
+	open (MAP,"<$map_file")||die"$!";
+	print "\n#each sample mapping reads sta:\n\n";
+	print "#$mark\ttotal\n";
+	while (my $ID=<MAP>) {
+		chomp $ID;
+		my @tmp=split/\:/,$ID;
+		my @exp=split/\_/,$tmp[1];
+		$exp[-1] =~ s/^x//;
+		for (my $i=0;$i<@exp ;$i++) {
+			$map_read[$i]+=$exp[$i];
+		}
+		$map_tag++;
+		my $seq=<MAP>;
+	}
+	my $map_read=join"\t",@map_read;
+	print "$map_read\n\n";
+	print "#total mapped tags:$map_read\n\n";
+	close MAP;
+	return 0;
+}
+
+sub bwt2bed{
+	$cluster_file=$dir."cluster\/";
+	mkdir ("$cluster_file");
+	print "sam file changed to bed file\n";
+	my ($file) = $dir."genome_match\/genome_mapped\.bwt";
+
+	my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `;
+	print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n";
+	return 0;
+}
+
+sub cluster{
+	print "tags is ready to merged clusters\n\n";
+	my ($file) =$bed;
+	if ($cluster_mothod eq "F") {
+		my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`;
+		print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n";
+	}
+	elsif($cluster_mothod eq "T"){
+		my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`;
+		print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n";
+	}
+	else{print "\-p is wrong!\n\n";}
+	return 0;
+}
+
+
+sub quantify{
+	print "clusters is ready to quantified\n\n";
+	my @depth=@map_read;
+	pop @depth;
+	my $depth=join ",",@depth;
+	my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`;
+	print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n";
+	return 0;
+}
+
+sub phase{
+	$annotate_dir=$dir."annotate\/";
+	mkdir ("$annotate_dir");
+	print "clusters is to predict phase siRNA\n";
+	my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`;
+	print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n";
+	return 0;
+}
+
+sub class{
+	print "clusters is ready to annotate by sources\n\n";
+	my $nat=$options{'nat'};
+	my $repeat=$options{'repeat'};
+	my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`;
+	print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n";
+}
+
+sub annotate{
+	print "clusters is ready to annotate by gff file\n\n";
+	my $file;
+	if (defined($options{'nat'})&&defined($options{'repeat'})) {
+		$file="$annotate_dir\/sample_class.anno";
+	}
+	else{
+		$file=$rpkm;
+	}
+	my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`;
+	print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n";
+	return 0;
+}
+sub get_genelist{
+
+	my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`;
+	print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt";
+}
+
+sub dec{
+	print "deg reading\n\n";
+	my $deg_file=$options{'deg'};
+	open IN,"<$deg_file";
+	my @deg;
+	my $s=0;
+		while (my $aline=<IN>) {
+		chomp $aline;
+		next if($aline=~/^\#/);
+		$deg[$s]=$aline;
+		my @ea=split/\s+/,$aline;
+		push @pairdir,"$ea[0]_VS_$ea[1]\/";
+		#print "$deg[$s]\n";
+		$s++;
+	}
+	close IN;
+	$deg_dir=$dir."deg\/";
+	mkdir ("$deg_dir");
+	my $max_process = 10;
+	my $pm = new Parallel::ForkManager( $max_process );
+	my $number=@deg-1;
+	foreach(0..$number){
+		$pm->start and next;
+		&dec_pel($deg[$_]);
+		$pm->finish;
+	}
+	$pm->wait_all_children;
+}
+
+sub dec_pel{
+	print "\n******************\nstart:\n";
+	Time();
+	my $sample=shift(@_);
+	my @each=split/\s+/,$sample;
+	print "$each[0]\t$each[1]\n";
+	my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/";
+	mkdir ("$deg_sample_dir");
+	print "read: $read\n";
+	print "deg_sample_dir: $deg_sample_dir\n";
+	print "$id{$each[0]}\t$each[0]\n";
+	print "$id{$each[1]}\t$each[1]\n";
+	my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2
+	my $time2=time();
+	print "end:\n*************************\n";
+	Time();
+	sleep 1;
+}
+
+sub infor_merge{
+	my ($input,$mark);
+	foreach (@pairdir) {
+		print "@pairdir\n";
+		$mark.=" -mark $_ ";
+		$input.=" -i $dir/deg\/$_\/output_score\.txt ";
+		print "$input\n$mark\n";
+	}
+	my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `;
+	print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n";
+}
+
+sub infor_merge_no_dec{
+	my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`;
+}
+
+sub genome_length{
+	my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`;
+	print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n"
+
+}
+
+sub plot{
+	$plot_dir="$dir\/plot\/";
+	mkdir ("$plot_dir");
+	my $span=defined($options{span})?$options{span}:50000;
+	my $cen="";
+	if (defined $options{cen}) {
+		$cen="-cen $options{cen}";
+	}
+	my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;
+	"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n";
+
+}
+
+sub html{
+	my $pathfile="$dir/path.txt";
+	open PA,">$pathfile";
+	print PA "$config\n";
+	print PA "$preprocess\n";
+	print PA "$dir"."rfam_match\n";
+	print PA "$dir"."genome_match\n";
+	print PA "$cluster_file\n";
+	print PA "$annotate_dir\n";
+	print PA "$plot_dir\n";
+	if (defined($deg_dir)) {
+			print PA "$deg_dir\n";
+	}
+	close PA;
+	my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
+}
+
+sub Time{
+	my $time=time();
+	my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+	$month++;
+	$year+=1900;
+	if (length($sec) == 1) {$sec = "0"."$sec";}
+	if (length($min) == 1) {$min = "0"."$min";}
+	if (length($hour) == 1) {$hour = "0"."$hour";}
+	if (length($day) == 1) {$day = "0"."$day";}
+	if (length($month) == 1) {$month = "0"."$month";}
+	print "$year-$month-$day $hour:$min:$sec\n";
+	return("$year-$month-$day-$hour-$min-$sec");
+}
+#################################################################################
+sub read_config{
+	open CON,"<$config";
+	while (my $aline=<CON>) {
+		chomp $aline;
+		my @tmp=split/\t/,$aline;
+		push @filein,$tmp[0];
+		push @mark,$tmp[1];
+		#&check_rawdata($tmp[0]);
+	}
+	close CON;
+	if (@filein != @mark) {
+		#&printErr();
+		die "Maybe config file have some wrong!!!\n";
+	}
+}