view preProcess.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents c75593f79aa9
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-12-2
#Modified:
#Description: RNA-seq data pre-process
my $version=1.00;

use strict;
use Getopt::Long;
use threads;
#use threads::shared;
use File::Path;
use File::Basename;
#use RNA;
#use Term::ANSIColor;

my %opts;
GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments
&usage;
}

my $time=&Time();
print "miPlant program start:\n   The time is $time!\n";
print "Command line:\n   $0 @ARGV\n";

my $format=$opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { 
	#&printErr();
	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

my $phred_qv=64;
if (defined $opts{'phred'}) {$phred_qv=$opts{'phred'};}

my @inputfiles=@{$opts{'i'}};
my @inputtags=@{$opts{'tag'}};

my  $mypath=`pwd`;
chomp $mypath;

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/";


unless ($dir=~/\/$/) {$dir.="/";}
if (not -d $dir) {
	mkdir $dir;
}
my $config=$dir."/input_config";
open CONFIG,">$config";
	for (my $i=0;$i<@inputfiles;$i++) {
		print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n";
	}
close CONFIG;

my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $a="ATCTCGTATG";  #adapter
if (defined $opts{'a'}) {$a=$opts{'a'};}

my $m=6;  #adapter minimum mapped nt
if (defined $opts{'M'}) {$m=$opts{'M'};}

my $t=1; #threads number
if (defined $opts{'t'}) {$t=$opts{'t'};}

my $min_nt=19; # minimum reads length
if (defined $opts{'min'}) {$min_nt=$opts{'min'};}

my $max_nt=28; #maximum reads length
if (defined $opts{'max'}) {$max_nt=$opts{'max'};}

my $mis=0; #mismatch number for microRNA
if (defined $opts{'mis'}) {$mis=$opts{'mis'};}

my $mis_rfam=0;# mismatch number for rfam
if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};}

my (@filein,@mark,@clean);
#&read_config();
@filein=@inputfiles;
@mark=@inputtags;

&checkfa($opts{gfa});


##### clip adpter --> clean data  start
my $preprocess=$dir."preProcess_clean/";
mkdir $preprocess;
my $can_use_threads = eval 'use threads; 1';
if ($can_use_threads) {
# Do processing using threads
	print "Do processing using threads\n";
	my @filein1=@filein; my @mark1=@mark;
	while (@filein1>0) {
		my @thrs; my @res;
		for (my $i=0;$i<$t ;$i++) {
			last if(@filein1==0);
			my $in=shift @filein1;
			my $out=shift @mark1;
			push @clean,$preprocess.$out."_clips_adapter.fq";
			$thrs[$i]=threads->create(\&clips,$in,$out);
		}
		for (my $i=0;$i<@thrs;$i++) {
			$res[$i]=$thrs[$i]->join();
		}
	}
} else {
# Do not processing using threads
	print "Do not processing using threads\n";
	for (my $i=0;$i<@filein ;$i++) {
		my $in=$filein[$i];
		my $out=$mark[$i];
		push @clean,$preprocess.$out."_clips_adapter.fq";
		&clips($in,$out);
	}
}

##### clip adpter --> clean data  end

my $collapsed=$preprocess."collapse_reads.fa";
my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
&collapse(\@clean,$collapsed);   #collapse reads to tags

&filterbylength();  # filter <$min_nt  && >$max_nt

print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n";


$time=Time();
print "$time: Genome alignment!\n\n";
my $genome_map=$dir."genome_match";
&genome($data);
#my $genome_map=&search($dir,"genome_match_");
my $mapfile=$genome_map."/genome_mapped.bwt";
my $mapfa=$genome_map."/genome_mapped.fa";
my $unmap=$genome_map."/genome_not_mapped.fa";

chdir $dir;
my $pathfile="$dir/path.txt";
open PA,">$pathfile";
print PA "$config\n";
print PA "$preprocess\n";
print PA "$genome_map\n";

if (defined $opts{'rfam'}) {              #rfam mapping and analysis
	$time=Time();
	print "$time: RNA annotate!\n\n";
	$time=~s/:/-/g;
	$time=~s/ /-/g;
	my $rfam_exp_dir=$dir."rfam_match";
	&rfam();
	#my $rfam_exp_dir=&search($dir,"rfam_match_");
print PA "$rfam_exp_dir\n";

	my $tag=join "\\;" ,@mark;
	system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt");
}


close PA;
system("perl $scipt_path/html_preprocess.pl -i $pathfile -format $format -min $min_nt -max $max_nt -o $dir/preprocessResult.html");

$time=Time();
print "$time: Program end!!\n";

############################## sub programs ###################################
sub genome{
	my ($file)=@_;
	if(defined $opts{'idx'}){
		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir -index $opts{idx}") ;
#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n";
	}else{
		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir") ;
#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n";
	}
}
sub rfam{
	if (defined $opts{'idx2'}) {
		system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} ");
#		print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n";
	}else{
		system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir ");
#		print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n";
	}
}
sub filterbylength{
	my $tmpmark=join ",", @mark;
	system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark");
	system("perl $scipt_path/Length_Distibution.pl -i $preprocess/reads_length_distribution.txt  -o $preprocess/length.html");
#	print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n";

}
sub collapse{
	my ($ins,$data)=@_;
		my $str="";
	for (my $i=0;$i<@{$ins};$i++) {
		$str .="-i $$ins[$i] ";
	}
	system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format");
#	print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n";
}

sub clips{
	my ($in,$out)=@_;
	my $adapter=$preprocess.$out."_clips_adapter.fq";
	if($format eq "fq" || $format eq "fastq"){
		system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ;
#		print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n";
	}
	if($format eq "fa" || $format eq "fasta"){
		system("fastx_clipper -a $a -M $m -i $in -o $adapter") ;
        #		print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n";
	}
	#my $clean=$preprocess.$out."_clean.fq";
	#system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt ");

	return;
}

sub read_config{
	open CON,"<$config";
	while (my $aline=<CON>) {
		chomp $aline;
		my @tmp=split/\t/,$aline;
		push @filein,$tmp[0];
		push @mark,$tmp[1];
		&check_rawdata($tmp[0]);
	}
	close CON;
	if (@filein != @mark) {
		#&printErr();
		die "Maybe config file have some wrong!!!\n";
	}
}
sub check_rawdata{
	my ($fileforcheck)=@_;
	if (!(-s $fileforcheck)) {
		#&printErr();
		die "Can not find $fileforcheck, or file is empty!!!\n";
	}
	if ($format eq "fasta" || $format eq "fa") {
		&checkfa($fileforcheck);
	}
	if ($format eq "fastq" || $format eq "fq") {
		&checkfq($fileforcheck);
	}
}
sub checkfa{
	my ($file_reads)=@_;
	open N,"<$file_reads";
	my $line=<N>;
	chomp $line;
    if($line !~ /^>\S+/){
        #printErr();
        die "The first line of file $file_reads does not start with '>identifier'
Reads file $file_reads is not a valid fasta file\n\n";
    }
    if(<N> !~ /^[ACGTNacgtn]*$/){
        #printErr();
        die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
    }
	close N;
}
sub checkfq{
	my ($file_reads)=@_;

	open N,"<$file_reads";
	for (my $i=0;$i<10;$i++) {
		my $a=<N>;
		my $b=<N>;
		my $c=<N>;
		my $d=<N>;
		chomp $a;
		chomp $b;
		chomp $c;
		chomp $d;
		if($a!~/^\@/){
			#&printErr();
			die "$file_reads is not a fastq file\n\n";
		}
		if($b!~ /^[ACGTNacgtn]*$/){
			#&printErr();
			die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
		}
		if ($c!~/^\@/ && $c!~/^\+/) {
			#&printErr();
			die "$file_reads is not a fastq file\n\n";
		}
		if ((length $b) != (length $d)) {
			#&printErr();
			die "$file_reads is not a fastq file\n\n";
		}
		my @qv=split //,$d;
		for (my $j=0;$j<@qv ;$j++) {
			my $q=ord($qv[$j])-64;
			if($q<0){$phred_qv=33;}
		}
	}
	close N;
}

sub search{
	my ($dir,$str)=@_;
	opendir I,$dir;
	my @ret;
	while (my $file=readdir I) {
		if ($file=~/$str/) {
			push @ret, $file;
		}
	}
	closedir I;
	if (@ret != 1) {
		#&printErr();

		die "Can not find directory or file which name has string: $str !!!\n";
	}
	return $ret[0];
}

sub Time{
        my $time=time();
        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
        $month++;
        $year+=1900;
        if (length($sec) == 1) {$sec = "0"."$sec";}
        if (length($min) == 1) {$min = "0"."$min";}
        if (length($hour) == 1) {$hour = "0"."$hour";}
        if (length($day) == 1) {$day = "0"."$day";}
        if (length($month) == 1) {$month = "0"."$month";}
        #print "$year-$month-$day $hour:$min:$sec\n";
        return("$year-$month-$day $hour:$min:$sec");
}


sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t  -o  -path
options:
-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...
-tag string # raw data file names, -tag xxx -tag xxx

-format string,#specific input rawdata file format : fastq|fq|fasta|fa
-phred  int # phred quality number, default is 64

-path scirpt path

-gfa string,  input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
                string must be the prefix of the bowtie index. For instance, if
                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
                the prefix is 'h_sapiens_37_asm'.##can be null

-rfam string,  input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
-idx2 string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
                string must be the prefix of the bowtie index. For instance, if
                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
                the prefix is 'h_sapiens_37_asm'.##can be null

-a string,  ADAPTER string. default is ATCTCGTATG.
-M int,    require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. 
-min int,  reads min length,default is 19.
-max int,  reads max length,default is 28.

-mis [int]     number of allowed mismatches when mapping reads to genome, default 0
-v <int>     report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-t int,    number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}