Repository 'mirplant2'
hg clone https://toolshed.g2.bx.psu.edu/repos/big-tiandm/mirplant2

Changeset 29:253b51f72993 (2014-07-31)
Previous changeset 28:5e05c9bb919c (2014-07-31) Next changeset 30:7cb8a65f8012 (2014-07-31)
Commit message:
Uploaded
added:
filterReadsByLength.pl
b
diff -r 5e05c9bb919c -r 253b51f72993 filterReadsByLength.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByLength.pl Thu Jul 31 03:06:43 2014 -0400
[
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /,/,$mark;
+
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+ chomp $aline;
+ my $seq=<IN>;
+ chomp $seq;
+
+ if($aline=~/:([\d|_]+)_x(\d+)$/){
+ my @ss=split/_/,$1;
+ for (my $i=0;$i<@ss;$i++) {
+ $hash{length($seq)}[$i]++ if($ss[$i]>0);
+ $hash{length($seq)}[$i] +=0 if($ss[$i]>0);
+ $reads{length($seq)}[$i]+=$ss[$i];
+ }
+ }
+ #else{$reads{length($seq)}+=1;}
+ if (length ($seq)>=$opts{'min'}  && length ($seq) <=$opts{'max'}) {
+ print OUT "$aline\n$seq\n";
+ }
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+ print OUT $_,"\t@{$hash{$_}}\n";
+ my $vv=join ", ",@{$hash{$_}};
+ $avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+ print OUT $_,"\t@{$reads{$_}}\n";
+ my $vv=join ", ", @{$reads{$_}};
+ $avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+system ("R CMD BATCH $dir/length_distribution.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-min reads min length.
+-max reads max length.
+-mark string #sample name eg: samA,samB,samC
+-h help
+USAGE
+exit(1);
+}
+