diff filterReadsByCount.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByCount.pl	Fri Dec 05 00:11:02 2014 -0500
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /\#/,$mark;
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+	chomp $aline;
+	my $seq=<IN>;
+	chomp $seq;
+	if($aline=~/:([\d|_]+)_x(\d+)$/){
+		if ($2>3) {
+			my @ss=split/_/,$1;
+			for (my $i=0;$i<@ss;$i++) {
+				$hash{length($seq)}[$i]++ if($ss[$i]>0);
+				$hash{length($seq)}[$i] +=0 if($ss[$i]==0);
+				$reads{length($seq)}[$i]+=$ss[$i];
+			}
+			print OUT "$aline\n$seq\n";
+		}		
+	}
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution_after_count_filter.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+	print OUT $_,"\t@{$hash{$_}}\n";
+	my $vv=join ", ",@{$hash{$_}};
+	$avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+	print OUT $_,"\t@{$reads{$_}}\n";
+	my $vv=join ", ", @{$reads{$_}};
+	$avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length_after_count_filter.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+system ("R CMD BATCH $dir/length_distribution_after_count_filter.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-mark string #sample name eg: samA#samB#samC
+-h help
+USAGE
+exit(1);
+}
+