diff NEUMA-1.2.1/bowtieout2mappingstat.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NEUMA-1.2.1/bowtieout2mappingstat.3.pl	Thu Aug 08 00:46:13 2013 -0400
@@ -0,0 +1,111 @@
+#!/usr/bin/perl
+use Getopt::Long;
+# This version includes max_insert_length filtered mapping stat, optionally. (version 2)
+
+
+my $max_insert_length;
+my $readlength;
+my $datatype = 'E';  # E or R.
+&GetOptions ( 'm|max_insert_length=i' => \$max_insert_length,
+              'l|readlength=i' => \$readlength,
+              'd|datatype=s' => \$datatype
+);
+
+if(defined($max_insert_length) && !defined($readlength)) {  die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; }
+if(@ARGV<3){ &help; exit; }
+
+
+my $bowtie = shift @ARGV;
+my $sample_name = shift @ARGV;
+my $PE = shift @ARGV;
+
+my $prev_head='';
+my $NM=0; my $NR=0; my $all=0; my $NM_lf = 0; my $NR_lf = 0;  my $all_lf=0; # lf : insert-length-filtered version
+my $countNM=0; my $countNM_lf =0;
+my $countNR=0; my $countNR_lf = 0;
+my $countAll=0; my $countAll_lf=0;
+
+
+open IN, $bowtie or die "Can't open bowtiefile $bowtie\n";
+while(<IN>){
+  chomp;
+  my ($head,$tr,$pos1)= (split/\t/)[0,2,3];
+  if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) { $tr = $tmp_tr;  $tr=~s/\.\d+$//; }}
+
+  if($PE==1){
+    chomp(my $secondline = <IN>);
+    my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
+    $head = &min($head,$head2);
+
+  }
+
+  if($prev_head ne $head) { 
+
+     if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
+     if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
+     if($all_lf==1) { $countAll_lf++; $all_lf=0; }
+
+     if($NM==1) { $countNM++; $NM=0; }    ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
+     if($NR==1) { $countNR++; $NR=0; }
+     if($all==1) { $countAll++; $all=0; }
+  }
+  if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) { 
+           if($tr=~/^NM_/) { $NM_lf = 1; }
+           elsif($tr=~/^NR_/) { $NR_lf = 1; }
+           $all_lf = 1; 
+  }
+
+  if($tr=~/^NM_/) { $NM=1; }
+  elsif($tr=~/^NR_/) { $NR=1; }
+  $all=1;
+  $prev_head=$head;
+}
+
+# last line
+if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
+if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
+if($all_lf==1) { $countAll_lf++; $all_lf=0; }
+
+if($NM==1) { $countNM++; $NM=0; }    ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
+if($NR==1) { $countNR++; $NR=0; }
+if($all==1) { $countAll++; $all=0; }
+
+
+close IN;
+
+
+if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); }
+
+$"="\t";
+if(defined($max_insert_length)){
+  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n";
+  print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n";
+}
+else {
+  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n";
+  print "$sample_name\t$countNM\t$countNR\t$countAll\n";
+}
+
+
+
+sub min {
+   if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
+}
+
+
+sub help {
+  print STDERR<<EOF;
+
+usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat) 
+
+Options:
+  -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored)
+  -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used.
+  -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and want to have NM and NR counts. If option R is used, transcript name format is automatically detected between NM_xxx and gi|xxx|xxx|NM_xxx. (Default E)
+
+EOF
+}
+
+
+
+