annotate extractSClip.pl @ 1:4f6952e0af48 default tip

CREST - add crest.loc.sample
author Jim Johnson <jj@umn.edu>
date Wed, 08 Feb 2012 16:08:01 -0600
parents acc8d8bfeb9a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/perl -w
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
2 use strict;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
3 #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev';
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
4 use Carp;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
5 use Getopt::Long;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
6 use English;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
7 use Pod::Usage;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
8 use Data::Dumper;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
9 use Bio::DB::Sam;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
10 use File::Temp qw/ tempfile tempdir /;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
11 use File::Spec;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
12 use File::Basename;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
13 use Cwd;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
14 use List::MoreUtils qw/ uniq /;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
15 #custom packages
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
16 use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
17 use SVUtil qw($use_scratch $work_dir get_input_bam get_work_dir parse_range is_PCR_dup);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
18
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
19 use constant FQ_BASE_NUMBER => 33;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
20
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
21 my $rmdup = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
22 my $print_read = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
23 $use_scratch = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
24 $min_percent_id = 90;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
25 my ($work_dir, $out_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
26 my $ref_genome;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
27 # input/output
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
28 my ($out_prefix, $range, $input_bam );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
29 my $out_suffix = "sclip.txt";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
30 my $paired = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
31 my ( $help, $man, $version, $usage );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
32 my $optionOK = GetOptions(
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
33 'i|in|input=s' => \$input_bam,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
34 'o|out_dir=s' => \$out_dir,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
35 'ref_genome=s' => \$ref_genome,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
36 'p=s' => \$out_prefix,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
37 'scratch!' => \$use_scratch,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
38 'paired!' => \$paired,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
39 'rmdup!' => \$rmdup,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
40 'lq_cutoff=i' => \$lowqual_cutoff,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
41 'min_pct_id=i' => \$min_percent_id,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
42 'min_pct_hq=i' => \$min_percent_hq,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
43 'print_read!' => \$print_read,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
44 'r|range=s' => \$range,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
45 'h|help|?' => \$help,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
46 'man' => \$man,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
47 'usage' => \$usage,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
48 'v|version' => \$version,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
49 );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
50
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
51 pod2usage(-verbose=>2) if($man or $usage);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
52 pod2usage(1) if($help or $version );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
53
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
54 my $start_dir = getcwd;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
55 if($input_bam) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
56 croak "The bam file you specified does not exist!\n" unless(-e $input_bam);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
57 $input_bam = File::Spec->rel2abs($input_bam);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
58 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
59 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
60 croak "You must specify the input bam file or sample name";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
61 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
62 croak "You must provide the reference genome in fasta format!" if(!$ref_genome);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
63 croak "The reference genome file you speicified does not exist!\n"
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
64 unless(-e $ref_genome);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
65
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
66 my $input_base = fileparse($input_bam);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
67
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
68 #setup output dir and workind directory
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
69 $out_dir = getcwd if(!$out_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
70 mkdir $out_dir if(!-e $out_dir || ! -d $out_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
71 $work_dir = get_work_dir() if($use_scratch);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
72 $work_dir = $out_dir if(!$work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
73 $use_scratch = undef if($work_dir eq $out_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
74 chdir($work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
75
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
76 # figure out output prefix
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
77 $out_prefix = $input_base if(!$out_prefix);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
78
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
79 my $sam = Bio::DB::Sam->new( -bam => $input_bam, -fasta => $ref_genome);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
80
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
81 my $output_file;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
82 my $validator = SCValidator->new();
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
83 if(!$paired) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
84 $validator->remove_validator("strand_validator");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
85 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
86 if($range) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
87 my ($chr, $start, $end) = parse_range($range);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
88 my $tmp = $chr;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
89 $tmp = $tmp . ".$start" if($start);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
90 $tmp = $tmp . ".$end" if($end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
91 $output_file = join('.', $out_prefix, $tmp, $out_suffix);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
92 $output_file = File::Spec->catfile($out_dir, $output_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
93
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
94 my($pcover, $ncover) = extract_range_sclip(
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
95 -SAM => $sam,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
96 -RANGE => $range,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
97 -WORK_DIR => $work_dir,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
98 -OUTPUT => $output_file,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
99 -VALIDATOR => $validator);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
100 $output_file = join('.', $out_prefix, $tmp, "cover");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
101 $output_file = File::Spec->catfile($out_dir, $output_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
102 warn "$output_file file exists and it will be replaced!" if(-e $output_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
103 $rmdup = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
104 #$start = 0 unless $start;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
105 #my ($tid) = $sam->header->parse_region($chr);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
106 #my $chr_len = $sam->header->target_len->[$tid];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
107 #$end = $chr_len unless $end;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
108
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
109 #my ($coverage) = $sam->features(-type => 'coverage', -seq_id => $chr,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
110 # -start => $start, -end => $end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
111 #my @cc = $coverage->coverage;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
112
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
113 $rmdup = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
114 my($fh, $fname) = tempfile( DIR => $work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
115 foreach my $p (keys(%{$pcover})) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
116 my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
117 print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p)), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
118 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
119 foreach my $p (keys(%{$ncover})) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
120 my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
121 print $fh join("\t", $chr, $p, "-", $c , count_coverage($sam, $chr, $p)), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
122 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
123 system("sort -k 2 -n $fname -o $output_file");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
124 system("rm $fname");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
125 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
126 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
127 $output_file = join('.', $out_prefix, $out_suffix);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
128 $output_file = File::Spec->catfile($out_dir, $output_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
129 my $header = $sam->header;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
130 my $target_names = $header->target_name;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
131 $output_file = join('.', $out_prefix, "cover");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
132 my $read_file = join('.', $out_prefix, "sclip.txt");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
133 $output_file = File::Spec->catfile($out_dir, $output_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
134 $read_file = File::Spec->catfile($out_dir, $read_file);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
135
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
136 if(-e $output_file) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
137 warn "$output_file file exists and it will be replaced!";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
138 system("rm $output_file");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
139 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
140 my @t_names = uniq @{$target_names};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
141 foreach my $chr (@t_names){
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
142 my($fh, $fname) = tempfile( DIR => $work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
143 my($pcover, $ncover) = extract_range_sclip(
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
144 -SAM => $sam,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
145 -RANGE =>$chr,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
146 -WORK_DIR => $work_dir,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
147 -OUTPUT => $read_file,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
148 -VALIDATOR => $validator);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
149 foreach my $p (keys(%{$pcover})) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
150 my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
151 print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p) ), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
152 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
153 foreach my $p (keys(%{$ncover})) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
154 my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
155 print $fh join("\t", $chr, $p, "-", $c, count_coverage($sam, $chr, $p)), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
156 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
157 system("sort -k 2 -n $fname -o $fname.sorted");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
158 system("cat $fname.sorted >> $output_file");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
159 system("rm $fname");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
160 system("rm $fname.sorted");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
161 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
162 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
163 chdir $start_dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
164 exit(0);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
165
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
166 sub count_coverage {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
167 my ($sam, $chr, $pos, $clip) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
168 if($rmdup) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
169 my @pairs;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
170 my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
171 my $n = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
172 my $itr = $seg->features(-iterator => 1);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
173 while( my $a = $itr->next_seq) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
174 my $sclip_len = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
175 if($clip) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
176 my @cigar_array = @{$a->cigar_array};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
177 $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
178 $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
179 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
180 next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
181 $n++;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
182 #return $n if( $n > $max_repetitive_cover);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
183 if($a->mpos) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
184 push @pairs, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
185 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
186 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
187 push @pairs, [$a->start, $a->end, 0, 0, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
188 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
189
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
190 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
191 return $n;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
192 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
193 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
194 my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
195 -start => $pos, -end => $pos);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
196 my @c_d = $c->coverage;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
197 return $c_d[0];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
198 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
199 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
200
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
201 sub extract_range_sclip {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
202 my %arg = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
203 my $sam = $arg{-SAM} || croak "missing -SAM";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
204 my $range = $arg{-RANGE} || croak "missing -RANGE";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
205 my $output_file = $arg{-OUTPUT} || croak "missing -OUTPUT";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
206 my $validator = $arg{-VALIDATOR} || croak "missing -VALIDATOR";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
207
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
208 my($fh, $fname) = tempfile( DIR => $work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
209 my (%plus_cover, %neg_cover);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
210 $sam->fetch($range,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
211 sub {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
212 my $a = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
213 my $cigar_str = $a->cigar_str;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
214 # print STDERR $a->qname, "\t", $cigar_str, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
215 my @cigar_array = @{$a->cigar_array};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
216 return if($a->cigar_str !~ m/S/);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
217 if($paired && !$a->proper_pair) { #paired but mate is not mapped
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
218 $validator->remove_validator("strand_validator");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
219 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
220 #return if(!$a->proper_pair && $paired);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
221 #return if($paired && !$a->mpos);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
222 my ($sclip_len, $ort, $pos, $seq, $qual_str, $qual);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
223 $qual_str = join( "", (map { chr $_ + FQ_BASE_NUMBER } $a->qscore));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
224 if($cigar_array[0]->[0] eq 'S' && $validator->validate($a, LEFT_CLIP) ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
225 $sclip_len = $cigar_array[0]->[1]; $ort = "-"; $pos = $a->start;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
226 $seq = substr($a->query->dna, 0, $sclip_len );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
227 $qual = substr($qual_str, 0, $sclip_len);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
228
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
229 my $print = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
230 if($rmdup) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
231 if(exists $neg_cover{$pos}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
232 $print = 0 if(is_PCR_dup($a, $neg_cover{$pos}, $sclip_len));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
233 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
234 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
235 $neg_cover{$pos} = [];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
236 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
237 if($print == 1) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
238 if($a->mpos) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
239 push @{$neg_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
240 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
241 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
242 push @{$neg_cover{$pos}}, [$a->start, $a->end, 0, 0, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
243 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
244 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
245 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
246 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
247 if(exists $neg_cover{$pos}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
248 $neg_cover{$pos}++;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
249 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
250 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
251 $neg_cover{$pos} = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
252 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
253 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
254 print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
255 if($print_read && $print == 1);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
256 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
257
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
258 if($cigar_array[$#cigar_array]->[0] eq 'S' && $validator->validate($a, RIGHT_CLIP)) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
259 $sclip_len = $cigar_array[$#cigar_array]->[1]; $ort = '+'; $pos = $a->end;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
260 my $l = length($a->query->dna);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
261 $seq = substr($a->query->dna, $l - $sclip_len);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
262 $qual = substr($qual_str, $l - $sclip_len);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
263
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
264 my $print = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
265 if($rmdup) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
266 if(exists $plus_cover{$pos}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
267 $print = 0 if(is_PCR_dup($a, $plus_cover{$pos}, $sclip_len));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
268 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
269 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
270 $plus_cover{$pos} = [];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
271 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
272 if($print ==1 ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
273 if($a->mpos) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
274 push @{$plus_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
275 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
276 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
277 push @{$plus_cover{$pos}}, [$a->start, 0, 0, $a->mate_end, $sclip_len];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
278 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
279 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
280 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
281 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
282 if(exists $plus_cover{$pos}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
283 $plus_cover{$pos}++;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
284 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
285 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
286 $plus_cover{$pos} = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
287 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
288 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
289 $print = is_PCR_dup($a, $plus_cover{$pos}, $sclip_len) if($rmdup);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
290 print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
291 if($print_read && $print);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
292 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
293 if($paired && !$a->proper_pair) { #paired but mate is not mapped, add back
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
294 $validator->add_validator("strand_validator");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
295 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
296 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
297 );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
298 if($print_read) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
299 system("sort -k 2 -n $fname -o $fname.sorted");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
300 system("cat $fname.sorted >> $output_file");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
301 system("rm $fname");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
302 system("rm $fname.sorted");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
303 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
304 return(\%plus_cover, \%neg_cover);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
305 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
306
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
307
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
308 =head1 NAME
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
309
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
310 extractSClip.pl - extract positions with soft clipped read in bam file.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
311
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
312
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
313 =head1 VERSION
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
314
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
315 This documentation refers to extractSClip.pl version 0.0.1.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
316
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
317
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
318 =head1 USAGE
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
319
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
320 # extract all positions with soft clipped reads in whole genome:
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
321 ./extractSClip.pl -i sample.bam -g hg18.fa
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
322 # extract chr1 positions with soft clipped reads
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
323 ./extractSClip.pl -i sample.bam -g hg18.fa -r chr1
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
324
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
325
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
326 =head1 REQUIRED ARGUMENTS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
327
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
328 -i: Input bam file.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
329 --ref_genome: The genome file in fa file, must be the same used to map reads.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
330
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
331
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
332 =head1 OPTIONS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
333
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
334 -r: The range of positions need to be extracted. Format: chr1 or chr1:500-5000.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
335 -o: The output directory, default is current directory.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
336 --scratch: use scracth space, default off.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
337 --rmdup: remove PCR dumplicate reads, default on, use --normdup to turn it off.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
338 --lq_cutoff: low quality cutoff value, default 20.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
339 --min_pct_id: minimum percent identify for the aligned high qual part,default 90.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
340 --min_pct_hq: minimum percent high quality for soft clipped part, default 80.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
341 --print_read: individual soft-clipped read will be printed, default off.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
342 -h, --help: The help page.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
343 --man: Print the man page.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
344 --usage: Print usage information.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
345 -v, --version: print version information.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
346
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
347
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
348 =head1 DESCRIPTION
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
349
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
350 This is a program to extract all soft-clipped positions such that for each position
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
351 a list of requirements need to be satisfied. More specifically, the orientaion of
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
352 pair-end read should be satisfied, the minimum percent identify of aligned part
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
353 need to be satisfied, and the minimum percent of hiqh quality soft-clipped part
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
354 should be satisfied.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
355
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
356
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
357 =head1 DEPENDENCIES
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
358
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
359 The program depend on several packages:
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
360 1. Bioperl perl module.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
361 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
362
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
363 =head1 BUGS AND LIMITATIONS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
364
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
365 There are no known bugs in this module, but the method is limitted to bam file
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
366 that has soft-clipping cigar string generated.Please report problems to
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
367 Jianmin Wang (Jianmin.Wang@stjude.org)
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
368 Patches are welcome.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
369
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
370 =head1 AUTHOR
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
371
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
372 Jianmin Wang (Jianmin.Wang@stjude.org)
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
373
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
374
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
375 =head1 LICENCE AND COPYRIGHT
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
376
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
377 Copyright (c) 2010 by St. Jude Children's Research Hospital.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
378
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
379 This program is free software: you can redistribute it and/or modify
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
380 it under the terms of the GNU General Public License as published by
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
381 the Free Software Foundation, either version 2 of the License, or
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
382 (at your option) any later version.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
383
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
384 This program is distributed in the hope that it will be useful,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
385 but WITHOUT ANY WARRANTY; without even the implied warranty of
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
386 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
387 GNU General Public License for more details.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
388
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
389 You should have received a copy of the GNU General Public License
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
390 along with this program. If not, see <http://www.gnu.org/licenses/>.