Mercurial > repos > big-tiandm > mirplant2

--- a/miRPlant.pl	Fri Jul 25 05:57:15 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,503 +0,0 @@
-#!/usr/bin/perl -w
-#Filename:
-#Author: Tian Dongmei
-#Email: tiandm@big.ac.cn
-#Date: 2014-4-22
-#Modified:
-#Description: plant microRNA prediction
-my $version=1.00;
-
-use strict;
-use Getopt::Long;
-use threads;
-use threads::shared;
-use File::Path;
-use File::Basename;
-#use RNA;
-use Term::ANSIColor;
-
-my %opts;
-GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");
-if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments
-&usage;
-}
-
-my $time=&Time();
-print "miPlant program start:\n   The time is $time!\n";
-print "Command line:\n   $0 @ARGV\n";
-
-my $format=$opts{'format'};
-if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
-	&printErr();
-	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
-}
-
-my $phred_qv=64;
-
-
-my @inputfiles=@{$opts{'i'}};
-my @inputtags=@{$opts{'tag'}};
-
-my  $mypath=`pwd`;
-chomp $mypath;
-
-my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/";
-
-
-unless ($dir=~/\/$/) {$dir.="/";}
-if (not -d $dir) {
-	mkdir $dir;
-}
-my $config=$dir."/input_config";
-open CONFIG,">$config";
-	for (my $i=0;$i<@inputfiles;$i++) {
-		print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n";
-	}
-close CONFIG;
-
-my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";
-
-my $a="ATCTCGTATG";  #adapter
-if (defined $opts{'a'}) {$a=$opts{'a'};}
-
-my $m=6;  #adapter minimum mapped nt
-if (defined $opts{'M'}) {$m=$opts{'M'};}
-
-my $t=1; #threads number
-if (defined $opts{'t'}) {$t=$opts{'t'};}
-
-my $min_nt=19; # minimum reads length
-if (defined $opts{'min'}) {$min_nt=$opts{'min'};}
-
-my $max_nt=28; #maximum reads length
-if (defined $opts{'max'}) {$max_nt=$opts{'max'};}
-
-my $mis=0; #mismatch number for microRNA
-if (defined $opts{'mis'}) {$mis=$opts{'mis'};}
-
-my $mis_rfam=0;# mismatch number for rfam
-if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};}
-
-my $hit=25; # maximum reads mapping hits in genome
-if (defined $opts{'r'}) {$hit=$opts{'r'};}
-
-my $upstream = 2; # microRNA 5' extension
-$upstream = $opts{'e'} if(defined $opts{'e'});
-
-my $downstream = 5;# microRNA 3' extension
-$downstream = $opts{'f'} if(defined $opts{'f'});
-
-my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200;
-my $flank=defined $opts{'flank'} ? $opts{'flank'} :10;
-my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20;
-
-$time=&Time();
-print "$time, Checking input file!\n";
-
-my (@filein,@mark,@clean);
-#&read_config();
-@filein=@inputfiles;
-@mark=@inputtags;
-
-&checkfa($opts{pre});
-&checkfa($opts{mat});
-&checkfa($opts{gfa});
-
-
-##### clip adpter --> clean data  start
-$time=&Time();
-print "$time, Preprocess:\n   trim adapter, reads collapse and filter reads by length.\n";
-
-$time=~s/:/-/g;
-$time=~s/ /-/g;
-my $preprocess=$dir."preProcess_${time}/";
-mkdir $preprocess;
-my $can_use_threads = eval 'use threads; 1';
-if ($can_use_threads) {
-# Do processing using threads
-	print "Do processing using threads\n";
-	my @filein1=@filein; my @mark1=@mark;
-	while (@filein1>0) {
-		my @thrs; my @res;
-		for (my $i=0;$i<$t ;$i++) {
-			last if(@filein1==0);
-			my $in=shift @filein1;
-			my $out=shift @mark1;
-			push @clean,$preprocess.$out."_clips_adapter.fq";
-			$thrs[$i]=threads->create(\&clips,$in,$out);
-		}
-		for (my $i=0;$i<@thrs;$i++) {
-			$res[$i]=$thrs[$i]->join();
-		}
-	}
-} else {
-# Do not processing using threads
-	print "Do not processing using threads\n";
-	for (my $i=0;$i<@filein ;$i++) {
-		my $in=$filein[$i];
-		my $out=$mark[$i];
-		push @clean,$preprocess.$out."_clips_adapter.fq";
-		&clips($in,$out);
-	}
-}
-
-##### clip adpter --> clean data  end
-
-my $collapsed=$preprocess."collapse_reads.fa";
-my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
-my $data2;   ### mirbase not mapped reads
-my $data3;   ### rfam not mapped reads
-&collapse(\@clean,$collapsed);   #collapse reads to tags
-
-&filterbylength();  # filter <$min_nt  && >$max_nt
-
-print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n";
-
-$time=Time();
-print "$time: known microRNA quantify!\n\n";
-
-chdir $dir;
-
-$time=~s/:/-/g;
-$time=~s/ /-/g;
-my $known_result=$dir."miRNA_Express_${time}/";
-&quantify(); ### known microRAN quantify
-
-
-#my $miR_exp_dir=&search($known_result,"miRNA_Express_");
-$data2=$known_result."/mirbase_not_mapped.fa";
-
-my $pathfile="$dir/path.txt";
-open PA,">$pathfile";
-print PA "$config\n";
-print PA "$preprocess\n";
-print PA "$known_result\n";
-
-if (defined $opts{'rfam'}) {              #rfam mapping and analysis
-	$time=Time();
-	print "$time: RNA annotate!\n\n";
-	$time=~s/:/-/g;
-	$time=~s/ /-/g;
-	my $rfam_exp_dir=$dir."rfam_match_${time}";
-	&rfam();
-	#my $rfam_exp_dir=&search($dir,"rfam_match_");
-	$data3=$rfam_exp_dir."/rfam_not_mapped.fa";
-print PA "$rfam_exp_dir\n";
-
-	my $tag=join "\\;" ,@mark;
-	system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt");
-}
-
-my $data4=$data;
-if (defined $opts{'D'}) {                 #genome mapping
-	$data4=$data3;
-}else{
-	$data4=$data2;
-}
-
-$time=Time();
-print "$time: Genome alignment!\n\n";
-$time=~s/:/-/g;
-$time=~s/ /-/g;
-my $genome_map=$dir."genome_match_${time}";
-&genome($data4);
-print PA "$genome_map\n";
-#my $genome_map=&search($dir,"genome_match_");
-my $mapfile=$genome_map."/genome_mapped.bwt";
-my $mapfa=$genome_map."/genome_mapped.fa";
-my $unmap=$genome_map."/genome_not_mapped.fa";
-
-#$time=Time();
-#print "$time: Novel microRNA prediction!\n\n";
-
-&predict($mapfa);
-
-close PA;
-system("perl $scipt_path/html.pl -i $pathfile -format $format -o $dir/result.html");
-
-$time=Time();
-print "$time: Program end!!\n";
-
-############################## sub programs ###################################
-sub predict{
-	my ($file)=@_;
-	$time=&Time();
-	print "$time: Novel microRNA prediction!\n\n";
-	$time=~s/:/-/g;
-	$time=~s/ /-/g;
-	my $predict=$dir."miRNA_predict_${time}";
-print PA "$predict\n";
-	mkdir $predict;
-	chdir $predict;
-	system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");
-#	print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n";
-
-	system("bowtie-build -f excised_precursor.fa excised_precursor");
-#	print "\nbowtie-build -f excised_precursor.fa excised_precursor\n";
-
-	system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt");
-#	print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n";
-
-	system("perl $scipt_path/convert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");
-#	print "\nconvert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n";
-
-	system("sort +3 -25  precursor_mapped.bst  > signatures.bst");
-#	print "\nsort +3 -25  precursor_mapped.bst  > ../signatures.bst\n";
-
-	chdir $dir;
-	system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");
-#	print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n";
-	system("rm novel_tmp_dir -rf");
-	my $tag=join "," ,@mark;
-	system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");
-}
-
-sub genome{
-	my ($file)=@_;
-	if(defined $opts{'idx'}){
-		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time") ;
-#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n";
-	}else{
-		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time") ;
-#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n";
-	}
-}
-sub rfam{
-	if (defined $opts{'idx2'}) {
-		system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time");
-#		print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n";
-	}else{
-		system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time");
-#		print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n";
-	}
-}
-sub quantify{
-	my $tag=join "\\;" ,@mark;
-	system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag");
-#	print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
-}
-sub filterbylength{
-	my $tmpmark=join ",", @mark;
-	system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark");
-#	print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n";
-
-}
-sub collapse{
-	my ($ins,$data)=@_;
-		my $str="";
-	for (my $i=0;$i<@{$ins};$i++) {
-		$str .="-i $$ins[$i] ";
-	}
-	system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format");
-#	print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n";
-}
-
-sub clips{
-	my ($in,$out)=@_;
-	my $adapter=$preprocess.$out."_clips_adapter.fq";
-	if($format eq "fq" || $format eq "fastq"){
-		system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ;
-		print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n";
-	}
-	if($format eq "fa" || $format eq "fasta"){
-		system("fastx_clipper -a $a -M $m -i $in -o $adapter") ;
-        #		print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n";
-	}
-	#my $clean=$preprocess.$out."_clean.fq";
-	#system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt ");
-
-	return;
-}
-
-sub read_config{
-	open CON,"<$config";
-	while (my $aline=<CON>) {
-		chomp $aline;
-		my @tmp=split/\t/,$aline;
-		push @filein,$tmp[0];
-		push @mark,$tmp[1];
-		&check_rawdata($tmp[0]);
-	}
-	close CON;
-	if (@filein != @mark) {
-		&printErr();
-		die "Maybe config file have some wrong!!!\n";
-	}
-}
-sub check_rawdata{
-	my ($fileforcheck)=@_;
-	if (!(-s $fileforcheck)) {
-		&printErr();
-		die "Can not find $fileforcheck, or file is empty!!!\n";
-	}
-	if ($format eq "fasta" || $format eq "fa") {
-		&checkfa($fileforcheck);
-	}
-	if ($format eq "fastq" || $format eq "fq") {
-		&checkfq($fileforcheck);
-	}
-}
-sub checkfa{
-	my ($file_reads)=@_;
-	open N,"<$file_reads";
-	my $line=<N>;
-	chomp $line;
-    if($line !~ /^>\S+/){
-        printErr();
-        die "The first line of file $file_reads does not start with '>identifier'
-Reads file $file_reads is not a valid fasta file\n\n";
-    }
-    if(<N> !~ /^[ACGTNacgtn]*$/){
-        printErr();
-        die "File $file_reads contains not allowed characters in sequences
-Allowed characters are ACGTN
-Reads file $file_reads is not a fasta file\n\n";
-    }
-	close N;
-}
-sub checkfq{
-	my ($file_reads)=@_;
-
-	open N,"<$file_reads";
-	for (my $i=0;$i<10;$i++) {
-		my $a=<N>;
-		my $b=<N>;
-		my $c=<N>;
-		my $d=<N>;
-		chomp $a;
-		chomp $b;
-		chomp $c;
-		chomp $d;
-		if($a!~/^\@/){
-			&printErr();
-			die "$file_reads is not a fastq file\n\n";
-		}
-		if($b!~ /^[ACGTNacgtn]*$/){
-			&printErr();
-			die "File $file_reads contains not allowed characters in sequences
-Allowed characters are ACGTN
-Reads file $file_reads is not a fasta file\n\n";
-		}
-		if ($c!~/^\@/ && $c!~/^\+/) {
-			&printErr();
-			die "$file_reads is not a fastq file\n\n";
-		}
-		if ((length $b) != (length $d)) {
-			&printErr();
-			die "$file_reads is not a fastq file\n\n";
-		}
-		my @qv=split //,$d;
-		for (my $j=0;$j<@qv ;$j++) {
-			my $q=ord($qv[$j])-64;
-			if($q<0){$phred_qv=33;}
-		}
-	}
-	close N;
-}
-
-sub search{
-	my ($dir,$str)=@_;
-	opendir I,$dir;
-	my @ret;
-	while (my $file=readdir I) {
-		if ($file=~/$str/) {
-			push @ret, $file;
-		}
-	}
-	closedir I;
-	if (@ret != 1) {
-		&printErr();
-
-		die "Can not find directory or file which name has string: $str !!!\n";
-	}
-	return $ret[0];
-}
-
-sub printErr{
-    print STDERR color 'bold red';
-    print STDERR "Error: ";
-    print STDERR color 'reset';
-}
-=cut
-sub Time{
-        my $time=time();
-        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
-        $month++;
-        $year+=1900;
-        if (length($sec) == 1) {$sec = "0"."$sec";}
-        if (length($min) == 1) {$min = "0"."$min";}
-        if (length($hour) == 1) {$hour = "0"."$hour";}
-        if (length($day) == 1) {$day = "0"."$day";}
-        if (length($month) == 1) {$month = "0"."$month";}
-        #print "$year-$month-$day $hour:$min:$sec\n";
-        return("$year-$month-$day-$hour-$min-$sec");
-}
-=cut
-sub Time{
-        my $time=time();
-        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
-        $month++;
-        $year+=1900;
-        if (length($sec) == 1) {$sec = "0"."$sec";}
-        if (length($min) == 1) {$min = "0"."$min";}
-        if (length($hour) == 1) {$hour = "0"."$hour";}
-        if (length($day) == 1) {$day = "0"."$day";}
-        if (length($month) == 1) {$month = "0"."$month";}
-        #print "$year-$month-$day $hour:$min:$sec\n";
-        return("$year-$month-$day $hour:$min:$sec");
-}
-
-
-sub usage{
-print <<"USAGE";
-Version $version
-Usage:
-$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t  -o  -path
-options:
--i string,  input file#input files information file
-		/path/filename	mark
-		/path/filename	mark
-		...
-
--format string,#specific input rawdata file format : fastq|fq|fasta|fa
-
--gfa string,  input file # genome fasta. sequence file
--idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
-                string must be the prefix of the bowtie index. For instance, if
-                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
-                the prefix is 'h_sapiens_37_asm'.##can be null
-
--pre string,  input file #species specific microRNA precursor sequences
--mat string,  input file #species specific microRNA mature sequences
-
--rfam string,  input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
--idx2 string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
-                string must be the prefix of the bowtie index. For instance, if
-                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
-                the prefix is 'h_sapiens_37_asm'.##can be null
-
--D      If [-D] is specified,will discard rfam mapped reads(nead -rfam).
-
--a string,  ADAPTER string. default is ATCTCGTATG.
--M int,    require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it.
--min int,  reads min length,default is 19.
--max int,  reads max length,default is 28.
-
--mis [int]     number of allowed mismatches when mapping reads to precursors, default 0
--e [int]     number of nucleotides upstream of the mature sequence to consider, default 2
--f [int]     number of nucleotides downstream of the mature sequence to consider, default 5
--v <int>     report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment
--r int       a read is allowed to map up to this number of positions in the genome,default is 25
-
--dis <int>   Maximal space between miRNA and miRNA* (200)
--flank <int>   Flank sequence length of miRNA precursor (10)
--mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)
-
--t int,    number of threads [1]
-
--o output directory# absolute path
--h help
-USAGE
-exit(1);
-}
-