view filterReadsByCount.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2010-01
#Modified:
#Description:  
my $version=1.00;

use strict;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","o=s","mark:s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
&usage;
}

my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
my @mark=split /\#/,$mark;

open OUT,">$opts{o}";
open IN,"<$opts{i}";
my %hash;my %reads;
while (my $aline=<IN>) {
	chomp $aline;
	my $seq=<IN>;
	chomp $seq;
	if($aline=~/:([\d|_]+)_x(\d+)$/){
		if ($2>3) {
			my @ss=split/_/,$1;
			for (my $i=0;$i<@ss;$i++) {
				$hash{length($seq)}[$i]++ if($ss[$i]>0);
				$hash{length($seq)}[$i] +=0 if($ss[$i]==0);
				$reads{length($seq)}[$i]+=$ss[$i];
			}
			print OUT "$aline\n$seq\n";
		}		
	}
}
close IN;
close OUT;

my $dir=dirname($opts{'o'});
chdir $dir;
my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt";
open OUT, ">$lengthfile";
open R,">$dir/length_distribution_after_count_filter.R";

print OUT "Tags length\t@mark\n";

my $samNo=@mark;
my $avalue="";
my @length=sort{$a<=>$b} keys %hash;
foreach  (@length) {
	print OUT $_,"\t@{$hash{$_}}\n";
	my $vv=join ", ",@{$hash{$_}};
	$avalue .="$vv,";
}
$avalue =~s/,$//;
my $lengths=join ",",@length;
my $marks=join "\",\"",@mark;

print R  "a<-c($avalue)
b<-matrix(a,ncol=$samNo,byrow=T)
cl<-colors()
names=c($lengths)
legends=c(\"$marks\")
png(\"Tags_length_after_count_filter.png\",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
abline(h=0)
dev.off()

";
$avalue="";
print OUT "\nReads length\t@mark\n";
foreach  (@length) {
	print OUT $_,"\t@{$reads{$_}}\n";
	my $vv=join ", ", @{$reads{$_}};
	$avalue .= "$vv,";
}
$avalue =~s/,$//;

print R  "a<-c($avalue)\n
b<-matrix(a,ncol=$samNo,byrow=T)

png(\"Reads_length_after_count_filter.png\",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
abline(h=0)
dev.off()

";
close OUT;
close R;

system ("R CMD BATCH $dir/length_distribution_after_count_filter.R");

#system ("rm $dir/length_distribution.R");
#system ("rm $dir/length_distribution.Rout");
#system ("rm $dir/.RData");
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o  -min -max -mark
options:

-i input file
-o output file
-mark string #sample name eg: samA#samB#samC
-h help
USAGE
exit(1);
}