annotate NEUMA-1.2.1/bowtieout2mappingstat.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
1 #!/usr/bin/perl
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
2 use Getopt::Long;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
3 # This version includes max_insert_length filtered mapping stat, optionally. (version 2)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
4
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
5
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
6 my $max_insert_length;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
7 my $readlength;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
8 my $datatype = 'E'; # E or R.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
9 &GetOptions ( 'm|max_insert_length=i' => \$max_insert_length,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
10 'l|readlength=i' => \$readlength,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
11 'd|datatype=s' => \$datatype
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
12 );
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
13
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
14 if(defined($max_insert_length) && !defined($readlength)) { die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
15 if(@ARGV<3){ &help; exit; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
16
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
17
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
18 my $bowtie = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
19 my $sample_name = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
20 my $PE = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
21
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
22 my $prev_head='';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
23 my $NM=0; my $NR=0; my $all=0; my $NM_lf = 0; my $NR_lf = 0; my $all_lf=0; # lf : insert-length-filtered version
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
24 my $countNM=0; my $countNM_lf =0;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
25 my $countNR=0; my $countNR_lf = 0;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
26 my $countAll=0; my $countAll_lf=0;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
27
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
28
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
29 open IN, $bowtie or die "Can't open bowtiefile $bowtie\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
30 while(<IN>){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
31 chomp;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
32 my ($head,$tr,$pos1)= (split/\t/)[0,2,3];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
33 if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) { $tr = $tmp_tr; $tr=~s/\.\d+$//; }}
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
34
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
35 if($PE==1){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
36 chomp(my $secondline = <IN>);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
37 my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
38 $head = &min($head,$head2);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
39
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
40 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
41
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
42 if($prev_head ne $head) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
43
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
44 if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
45 if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
46 if($all_lf==1) { $countAll_lf++; $all_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
47
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
48 if($NM==1) { $countNM++; $NM=0; } ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
49 if($NR==1) { $countNR++; $NR=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
50 if($all==1) { $countAll++; $all=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
51 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
52 if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
53 if($tr=~/^NM_/) { $NM_lf = 1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
54 elsif($tr=~/^NR_/) { $NR_lf = 1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
55 $all_lf = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
56 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
57
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
58 if($tr=~/^NM_/) { $NM=1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
59 elsif($tr=~/^NR_/) { $NR=1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
60 $all=1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
61 $prev_head=$head;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
62 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
63
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
64 # last line
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
65 if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
66 if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
67 if($all_lf==1) { $countAll_lf++; $all_lf=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
68
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
69 if($NM==1) { $countNM++; $NM=0; } ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
70 if($NR==1) { $countNR++; $NR=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
71 if($all==1) { $countAll++; $all=0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
72
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
73
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
74 close IN;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
75
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
76
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
77 if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
78
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
79 $"="\t";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
80 if(defined($max_insert_length)){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
81 print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
82 print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
83 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
84 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
85 print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
86 print "$sample_name\t$countNM\t$countNR\t$countAll\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
87 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
88
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
89
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
90
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
91 sub min {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
92 if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
93 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
94
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
95
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
96 sub help {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
97 print STDERR<<EOF;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
98
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
99 usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
100
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
101 Options:
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
102 -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
103 -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
104 -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and want to have NM and NR counts. If option R is used, transcript name format is automatically detected between NM_xxx and gi|xxx|xxx|NM_xxx. (Default E)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
105
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
106 EOF
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
107 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
108
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
109
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
110
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
111