view NEUMA-1.2.1/bowtieout2mappingstat.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use Getopt::Long;
# This version includes max_insert_length filtered mapping stat, optionally. (version 2)


my $max_insert_length;
my $readlength;
my $datatype = 'E';  # E or R.
&GetOptions ( 'm|max_insert_length=i' => \$max_insert_length,
              'l|readlength=i' => \$readlength,
              'd|datatype=s' => \$datatype
);

if(defined($max_insert_length) && !defined($readlength)) {  die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; }
if(@ARGV<3){ &help; exit; }


my $bowtie = shift @ARGV;
my $sample_name = shift @ARGV;
my $PE = shift @ARGV;

my $prev_head='';
my $NM=0; my $NR=0; my $all=0; my $NM_lf = 0; my $NR_lf = 0;  my $all_lf=0; # lf : insert-length-filtered version
my $countNM=0; my $countNM_lf =0;
my $countNR=0; my $countNR_lf = 0;
my $countAll=0; my $countAll_lf=0;


open IN, $bowtie or die "Can't open bowtiefile $bowtie\n";
while(<IN>){
  chomp;
  my ($head,$tr,$pos1)= (split/\t/)[0,2,3];
  if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) { $tr = $tmp_tr;  $tr=~s/\.\d+$//; }}

  if($PE==1){
    chomp(my $secondline = <IN>);
    my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
    $head = &min($head,$head2);

  }

  if($prev_head ne $head) { 

     if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
     if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
     if($all_lf==1) { $countAll_lf++; $all_lf=0; }

     if($NM==1) { $countNM++; $NM=0; }    ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
     if($NR==1) { $countNR++; $NR=0; }
     if($all==1) { $countAll++; $all=0; }
  }
  if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) { 
           if($tr=~/^NM_/) { $NM_lf = 1; }
           elsif($tr=~/^NR_/) { $NR_lf = 1; }
           $all_lf = 1; 
  }

  if($tr=~/^NM_/) { $NM=1; }
  elsif($tr=~/^NR_/) { $NR=1; }
  $all=1;
  $prev_head=$head;
}

# last line
if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
if($all_lf==1) { $countAll_lf++; $all_lf=0; }

if($NM==1) { $countNM++; $NM=0; }    ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
if($NR==1) { $countNR++; $NR=0; }
if($all==1) { $countAll++; $all=0; }


close IN;


if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); }

$"="\t";
if(defined($max_insert_length)){
  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n";
  print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n";
}
else {
  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n";
  print "$sample_name\t$countNM\t$countNR\t$countAll\n";
}



sub min {
   if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
}


sub help {
  print STDERR<<EOF;

usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat) 

Options:
  -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored)
  -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used.
  -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and want to have NM and NR counts. If option R is used, transcript name format is automatically detected between NM_xxx and gi|xxx|xxx|NM_xxx. (Default E)

EOF
}