comparison StructVar.pm @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 package StructVar;
2
3 # $Id: StructVar.pm, v 1.00 2010/09/24 jianmin.wang Exp $
4
5 use strict;
6 use Bio::DB::Sam;
7 use Carp;
8 use Data::Dumper;
9 use English;
10 use Bio::SearchIO;
11 use GeneModel;
12 use Cwd;
13 use SVExtTools;
14 use List::MoreUtils qw/ uniq /;
15 use SVUtil qw( prepare_reads read_fa_file get_sclip_reads rev_comp);
16 use SCValidator qw(LEFT_CLIP RIGHT_CLIP);
17
18 use constant UNK => 0;
19 use constant CTX => 1;
20 use constant INV => 2;
21 use constant INS => 3;
22 use constant DEL => 4;
23 use constant ITX => 5;
24
25 # type of SVs and thier string
26 my %type2str = (
27 0 => 'UNKNOWN', # unknow type, not single event
28 1 => 'CTX', # cross chromosome translocation
29 2 => 'INV', # inversion, requires both breakpoints available
30 3 => 'INS', # (tandem) insertion
31 4 => 'DEL', # deletion
32 5 => 'ITX', # within chromosome translocation
33 );
34 # class static variables
35 my $sam_d;
36 my $sam_g;
37 my $gm;
38 my $assembler;
39 my $aligner;
40 my $mapper;
41 my @filters = (\&_germline_sclip_filter, \&_type_distance_filter, \&_germline_indel_filter);
42 my $RNASeq;
43 my $read_len;
44 my $genome;
45 my $tr_max_indel_size; #tandem repeat mediated indel size
46 my $tr_min_size;
47 my $tr_max_size; #tandem repeat max_size of single repeat
48 my $tr_min_num; #tandem repeat minimum number of occurence
49
50 my $max_bp_dist = 15;
51 my $germline_seq_width = 100;
52 my $germline_search_width = 50;
53 my $min_percent_cons_of_read = 0.75;
54
55 my %default_filters = (
56 'germline_sclip' => \&_germline_sclip_filter,
57 'type_distance' => \&_type_distance_filter,
58 'germline_indel' => \&_germline_indel_filter,
59 # 'mapping_artifact' => \&_mapping_artifact_filter,
60 'mapping_quality' => \&_mapping_quality_filter,
61 'tandem_repeat' => \&_tandem_repeat_filter,
62 );
63
64 sub new {
65 my $class = shift;
66 my %param = @_;
67 my $self = {};
68 $self->{first_bp} = {};
69 $self->{second_bp} = {};
70 $self->{first_bp} = $param{-FIRST_BP} if($param{-FIRST_BP});
71 $self->{second_bp} = $param{-SECOND_BP} if($param{-SECOND_BP});
72 $self->{consensus} = $param{-CONSENSUS} ? $param{-CONSENSUS} : "";
73 $self->{type} = "";
74 bless $self, ref($class) || $class;
75 return $self;
76 }
77
78 sub add_filter {
79 my $self = shift;
80 my $name = shift;
81 my $f = shift;
82 croak "you must specify a filter" if(!$f);
83 croak "the filter must be a subroutine" if(ref($f) ne 'CODE');
84 $default_filters{$name} = $f;
85 }
86
87 sub add_RNASeq_filter {
88 $default_filters{'RNASeq_strand'} = \&_RNASeq_strand_filter;
89 $default_filters{'RNASeq_INS'} = \&_RNASeq_INS_filter;
90 }
91
92 sub remove_filter {
93 my $self = shift;
94 my $name = shift;
95 if($default_filters{$name}){
96 delete $default_filters{$name};
97 }
98 else {
99 print STDERR "Filter $name is not in filter list";
100 }
101 }
102
103 sub tr_max_indel_size { #tandem repeat mediated indel size
104 my $self = shift;
105 my $value = shift;
106 $tr_max_indel_size = $value if($value);
107 return $tr_max_indel_size;
108 }
109
110 sub min_percent_cons_of_read {
111 my $self= shift;
112 my $value = shift;
113 $min_percent_cons_of_read = $value if($value);
114 return $min_percent_cons_of_read;
115 }
116
117 sub tr_min_size {
118 my $self = shift;
119 my $value = shift;
120 $tr_min_size = $value if($value);
121 return $tr_min_size;
122 }
123 sub germline_seq_width {
124 my $self = shift;
125 my $value = shift;
126 $germline_seq_width = $value if($value);
127 return $germline_seq_width;
128 }
129
130 sub germline_search_width {
131 my $self = shift;
132 my $value = shift;
133 $germline_search_width = $value if($value);
134 return $germline_search_width;
135 }
136
137 sub max_bp_dist {
138 my $self = shift;
139 my $value = shift;
140 $max_bp_dist = $value if($value);
141 return $max_bp_dist;
142 }
143
144 sub tr_max_size { #tandem repeat max_size of single repeat
145 my $self = shift;
146 my $value = shift;
147 $tr_max_size = $value if($value);
148 return $tr_max_size;
149 }
150
151 sub tr_min_num { #tandem repeat minimum number of occurence
152 my $self = shift;
153 my $value = shift;
154 $tr_min_num = $value if($value);
155 return $tr_min_num;
156 }
157 sub genome {
158 my $self = shift;
159 my $value = shift;
160 $genome = $value if($value);
161 return $genome;
162 }
163 sub sam_d {
164 my $self = shift;
165 my $value = shift;
166 $sam_d = $value if($value);
167 return $sam_d;
168 }
169
170 sub sam_g {
171 my $self = shift;
172 my $value = shift;
173 $sam_g = $value if($value);
174 return $sam_g;
175 }
176
177 sub assembler {
178 my $self = shift;
179 my $value = shift;
180 $assembler = $value if($value);
181 return $assembler;
182 }
183
184 sub aligner {
185 my $self = shift;
186 my $value = shift;
187 $aligner = $value if($value);
188 return $aligner;
189 }
190
191 sub RNASeq {
192 my $self = shift;
193 my $value = shift;
194 $RNASeq = 1 if($value);
195 return $RNASeq;
196 }
197
198 sub mapper {
199 my $self = shift;
200 my $value = shift;
201 $mapper = $value if($value);
202 return $mapper;
203 }
204
205 sub gene_model {
206 my $self = shift;
207 my $value = shift;
208 $gm = $value if($value);
209 return $gm;
210 }
211
212 sub read_len {
213 my $self = shift;
214 my $value = shift;
215 $read_len = $value if($value);
216 return $read_len;
217
218 }
219 sub first_bp {
220 my $self = shift;
221 my %param = @_;
222 if(%param) {
223 $self->{first_bp}{left_chr} = $param{-LEFT_CHR};
224 $self->{first_bp}{left_pos} = $param{-LEFT_POS};
225 $self->{first_bp}{left_ort} = $param{-LEFT_ORT};
226 $self->{first_bp}{right_chr} = $param{-RIGHT_CHR};
227 $self->{first_bp}{right_pos} = $param{-RIGHT_POS};
228 $self->{first_bp}{right_ort} = $param{-RIGHT_ORT};
229 $self->{first_bp}{sc_reads} = $param{-SC_READS};
230 $self->{first_bp}{pos} = $param{-POS};
231 $self->{first_bp}{cover} = $param{-COVER};
232 $self->{first_bp}{sc_seq} = $param{-SC_SEQ};
233 $self->{first_bp}{chr} = $param{-CHR};
234 }
235 return $self->{first_bp};
236 }
237
238 sub second_bp {
239 my $self = shift;
240 my %param = @_;
241 if(%param) {
242 $self->{second_bp}{left_chr} = $param{-LEFT_CHR};
243 $self->{second_bp}{left_pos} = $param{-LEFT_POS};
244 $self->{second_bp}{left_ort} = $param{-LEFT_ORT};
245 $self->{second_bp}{right_chr} = $param{-RIGHT_CHR};
246 $self->{second_bp}{right_pos} = $param{-RIGHT_POS};
247 $self->{second_bp}{right_ort} = $param{-RIGHT_ORT};
248 $self->{second_bp}{sc_reads} = $param{-SC_READS};
249 $self->{second_bp}{pos} = $param{-POS};
250 $self->{second_bp}{cover} = $param{-COVER};
251 $self->{second_bp}{sc_seq} = $param{-SC_SEQ};
252 $self->{second_bp}{chr} = $param{-CHR};
253 }
254 return $self->{second_bp};
255 }
256
257 sub type {
258 my $self = shift;
259 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
260 if($bp1->{left_chr} ne $bp1->{right_chr}) {
261 return CTX if($bp1->{left_ort} eq $bp2->{left_ort} &&
262 $bp1->{right_ort} eq $bp2->{right_ort});
263
264 return UNK;
265 }
266 if( $bp1->{left_ort} eq $bp1->{right_ort} &&
267 $bp2->{left_ort} eq $bp2->{right_ort} ) { # Insertion or deletion
268 return DEL if( $bp1->{left_pos} <= $bp1->{right_pos} &&
269 $bp2->{left_pos} <= $bp2->{right_pos});
270 return INS if( $bp1->{left_pos} >= $bp1->{right_pos} &&
271 $bp2->{left_pos} >= $bp2->{right_pos});
272 return UNK;
273 }
274 if( $bp1->{left_ort} ne $bp1->{right_ort} &&
275 $bp2->{left_ort} ne $bp2->{right_ort} ) {
276 return INV if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '-')
277 || ($bp2->{left_ort} eq '+' && $bp1->{left_ort} eq '-'));
278 return ITX if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '+')
279 || ($bp1->{left_ort} eq '-' && $bp2->{left_ort} eq '-') );
280 return UNK;
281 }
282 return UNK;
283 }
284
285 sub to_string {
286 my $self = shift;
287 my $type = $self->type;
288 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
289 my ($left_len, $right_len) = (0, 0);
290 if(exists $bp1->{left_len}) {
291 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len});
292 }
293 else {
294 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq});
295 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq});
296 }
297 if($type == INV && $bp1->{pos} > $bp2->{pos}) {
298 ($bp1, $bp2) = ($bp2, $bp1);
299 ($left_len, $right_len) = ($right_len, $left_len);
300 }
301 my ($cover1, $cover2) = ($bp1->{cover}, $bp2->{cover});
302
303 my ($chrA, $chrB, $ortA, $ortB) = ($bp1->{left_chr}, $bp1->{right_chr},
304 $bp1->{left_ort}, $bp1->{right_ort});
305 my ($posA, $posB, $countA, $countB) = ($bp1->{pos}, $bp2->{pos}, $bp1->{sc_reads}, $bp2->{sc_reads});
306 if($bp1->{chr} eq $bp1->{right_chr} && $bp1->{pos} == $bp1->{right_pos}) {
307 $posA = $bp2->{pos};
308 $posB = $bp1->{pos};
309 $countA = $bp2->{sc_reads};
310 $countB = $bp1->{sc_reads};
311 ($cover1, $cover2) = ($cover2, $cover1);
312 ($left_len, $right_len) = ($right_len, $left_len);
313 }
314
315 my $rtn = join("\t", ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB,
316 $type2str{$type}, $cover1, $cover2, $left_len, $right_len, $self->get_statistics(100),
317 $self->{c_start}, $self->{start_chr}, $self->{t_start},
318 $self->{c_end}, $self->{end_chr}, $self->{t_end}, $self->{consensus}));
319 if($self->{type} == INV) {
320 $rtn = join("\t", ($rtn, $self->{c_start2}, $self->{start_chr2}, $self->{t_start2},
321 $self->{c_end2}, $self->{end_chr2}, $self->{t_end}, $self->{consensus2}));
322 }
323 return $rtn;
324 }
325
326 sub set_consensus {
327 my $self = shift;
328 my ($validator, $paired ) = @_;
329 if(!$assembler) {
330 print STDERR "No assembler set, no consensus generated\n";
331 return;
332 }
333 my %all_reads;
334 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
335 my $N = 0;
336
337 my @s_reads1 = $self->_get_reads($sam_d, $bp1);
338 my @s_reads2 = $self->_get_reads($sam_d, $bp2);
339
340 if($self->{type} == INV) {
341 $self->{consensus} = _generate_consensus(\@s_reads1);
342 $self->{consensus2} = _generate_consensus(\@s_reads2);
343 }
344 else {
345 push @s_reads1, @s_reads2;
346 $self->{consensus} = _generate_consensus(\@s_reads1);
347 }
348 }
349
350 sub _generate_consensus {
351 my $s_reads = shift;
352 my $N = scalar(@{$s_reads});
353 return $s_reads->[0]{seq} if($N == 1);
354 my $fa_name = prepare_reads($s_reads);
355 my ($contig_file, $sclip_count) = $assembler->run($fa_name);
356 if(!$contig_file || -s $contig_file == 0) {
357 system ("rm $fa_name"); system("rm $fa_name*");
358 return;
359 }
360 my $n = 0;
361 my $contig_name;
362 foreach my $c (keys %{$sclip_count}) {
363 if($sclip_count->{$c} > $n) {
364 $n = $sclip_count->{$c};
365 $contig_name = $c;
366 }
367 }
368 return if($N * 8 > $n * 10);
369 my $contig_seqs = read_fa_file($contig_file);
370 return $contig_seqs->{$contig_name};
371 }
372
373 sub _get_reads{
374 my $self = shift;
375 my ($sam, $bp) = @_;
376
377 my ($chr, $pos) = ($bp->{chr}, $bp->{pos});
378 my ($start, $end) = ($pos, $pos);
379 if($bp->{all_pos}) {
380 my @tmp = @{$bp->{all_pos}};
381 @tmp = sort {$a <=> $b} @tmp;
382 ($start, $end) = ($tmp[0], $tmp[$#tmp]);
383 }
384 my %reads;
385 return if(scalar @{$bp->{reads}} == 0);
386 foreach my $r (@{$bp->{reads}}){
387 $reads{$r} = 1;
388 }
389
390 #my ($sam, $chr, $start, $end, $reads) =
391 # ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-READS}) ;
392
393 my @rtn;
394 my $range = $chr;
395 $range = $chr . ":" . $start . "-" . $end;
396 my($s, $e) = ($start, $end);
397 $sam->fetch($range, sub {
398 my $a = shift;
399 return unless (exists $reads{$a->qname});
400 return unless ($a->start && $a->end);
401 return unless (($a->start >= $start && $a->start <= $end)
402 || ($a->end >= $start && $a->end <= $end));
403 $s = $a->start if($s > $a->start);
404 $e = $a->end if($e < $a->end);
405 my $qscore = $a->qscore;
406 delete $reads{$a->qname};
407 push @rtn, {
408 name => $a->qname,
409 seq => $a->query->dna,
410 qual => $qscore,
411 };
412 } );
413 # $bp->{range} = [$s, $e];
414 if($self->{type} == INV) {
415 if($start == $bp->{left_pos} || $end == $bp->{left_pos}) {
416 $bp->{left_range} = [$s, $e];
417 }
418 else {
419 $bp->{right_range} = [$s, $e];
420 }
421 return @rtn;
422 }
423 if($chr eq $self->{first_bp}{left_chr} && ($start == $self->{first_bp}{left_pos} || $end == $self->{first_bp}{left_pos})){
424 $self->{first_bp}{left_range} = [$s, $e];
425 }
426 if($chr eq $self->{first_bp}{right_chr} && ($start == $self->{first_bp}{right_pos} || $end == $self->{first_bp}{right_pos})){
427 $self->{first_bp}{right_range} = [$s, $e];
428 }
429
430 return @rtn;
431 }
432
433 my $start_dir;
434 BEGIN {
435 $start_dir = getcwd;
436 }
437
438 sub _cat_genes {
439 my ($ort, @genes) = @_;
440 my $rtn;
441 my @names;
442 foreach my $g (@genes) {
443 push @names, $g->val->name;
444 }
445 @names = uniq @names;
446 $rtn = join(";", @names);
447 $rtn = $rtn . "($ort)" if($rtn);
448 return $rtn;
449 }
450
451 sub get_genes {
452 my $self = shift;
453 return ['NA', 'NA', 'NA'] unless $gm;
454
455 my ($gene1, $gene2);
456 my $dist;
457 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
458 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort});
459 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr});
460 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
461 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1);
462 my $range1 = $ort1 eq "+" ? [$pos1 - 5, $pos1] : [$pos1, $pos1 + 5];
463 my $range2 = $ort2 eq "+" ? [$pos2, $pos2 + 5] : [$pos2 - 5, $pos2];
464
465 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1,
466 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2);
467 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+"));
468 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+"));
469
470 my ($g_ort1, $g_ort2);
471 my @genes;
472 @genes = $r_tree1->intersect($range1) if($r_tree1);
473 $gene1 = _cat_genes("-", @genes) if(scalar @genes > 0 );
474 undef @genes;
475 @genes = $f_tree1->intersect($range1) if($f_tree1);
476 if(scalar @genes > 0) {
477 my $tmp = _cat_genes("+", @genes);
478 if($gene1) {
479 $gene1 .= ";" . $tmp;
480 }
481 else {
482 $gene1 = $tmp;
483 }
484 }
485 undef @genes;
486 @genes = $r_tree2->intersect($range2) if($r_tree2);
487 $gene2 = _cat_genes("-", @genes) if(scalar @genes > 0 );
488 undef @genes;
489 @genes = $f_tree2->intersect($range2) if($f_tree2);;
490 if(scalar @genes > 0) {
491 my $tmp = _cat_genes("+", @genes);
492 if($gene2) {
493 $gene2 .= ";" . $tmp;
494 }
495 else {
496 $gene2 = $tmp;
497 }
498 }
499
500 $gene1 = 'NA' unless $gene1;
501 $gene2 = 'NA' unless $gene2;
502 my $type = $self->{type};
503 if( $type == INS or $type == DEL ) {
504 ($pos1, $pos2) = ($pos2, $pos1) if($pos1 > $pos2);
505 @genes = $r_tree1->intersect([$pos1 - 5, $pos2 + 5]);
506 if(scalar @genes >= 2) {
507 $dist = scalar(@genes) - 2;
508 $dist .= "(-)";
509 }
510 @genes = $f_tree1->intersect([$pos1 - 5, $pos2 + 5]);
511 if(scalar @genes >= 2) {
512 if($dist) {
513 $dist .= ":" . (scalar (@genes) - 2);
514 }
515 else {
516 $dist = scalar (@genes) - 2;
517 }
518 $dist .= "(+)";
519 }
520 }
521 $dist = 'NA' unless $dist;
522 return [$gene1, $gene2, $dist];
523 }
524
525 sub to_full_string {
526 my $self = shift;
527 my $type = $self->{type};
528 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
529 return join("\t", ( $bp1->{left_chr}, $bp1->{left_pos}, $bp1->{left_ort},
530 $bp1->{right_chr}, $bp1->{right_pos}, $bp1->{right_ort}, $bp1->{sc_reads},
531 $bp2->{left_chr}, $bp2->{left_pos}, $bp2->{left_ort},
532 $bp2->{right_chr}, $bp2->{right_pos}, $bp2->{right_ort}, $bp2->{sc_reads},
533 $type2str{$type}));
534 }
535
536 sub filter {
537 my $self = shift;
538 print STDERR "SV filter starting....\n";
539 $self->{type} = $self->type;
540 $self->set_consensus;
541
542 if(!$self->{consensus} || length $self->{consensus} < $read_len * $min_percent_cons_of_read) {
543 $self->{error} = "No Consensus or consensus too short";
544 print STDERR "FAILED\n";
545 return;
546 }
547 if($self->{type} == INV && (!$self->{consensus2} || length $self->{consensus2} < $read_len * $min_percent_cons_of_read)) {
548 $self->{error} = "No Consensus or consensus too short";
549 print STDERR "FAILED\n";
550 return;
551 }
552
553 foreach my $f (values(%default_filters) ){
554 if(! $f->($self)){
555 print STDERR "FAILED\n";
556 return;
557 }
558 }
559 print STDERR "PASSED\n";
560 return 1;
561 }
562
563 sub _mapping_quality_filter {
564 my $self = shift;
565 if($self->{type} == INV) {
566 my $tmp1 = $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1);
567 my $tmp2 = $self->_mapping_quality($self->{second_bp}, $self->{consensus2}, 2);
568 return $tmp1 && $tmp2;
569 }
570 return $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1);
571 }
572
573 sub _mapping_quality {
574 my $self = shift;
575 my $bp = shift;
576 my $con = shift;
577 my $which = shift;
578 $which = "" unless $which == 2;
579 my $l = length($con);
580 print STDERR "Mapping quality filter ... ";
581
582 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
583 print $QUERY ">query\n", $con, "\n";
584 close $QUERY;
585
586 my ($range1, $range2) = $self->_find_bp_ref_range($bp, $con);
587 my ($s1, $e1, $t_s1, $t_e1) = $self->_map_consensus($range1);
588 return unless $e1;
589 my ($chr, $s, $e) = split /[:|-]/, $range2;
590 my $offset = 0;
591 if($s < $t_e1 && $e > $t_s1) { #overlap
592 my $tmp = substr $con, 0, $s;
593 if($s1 < $l - $e1) {
594 $offset = $e1;
595 $tmp = substr $con, $e1 + 1;
596 }
597 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
598 print $QUERY ">query\n", $tmp, "\n";
599 close $QUERY;
600 }
601 my ($s2, $e2, $t_s2, $t_e2) = $self->_map_consensus($range2);
602 return unless $e2;
603 $s2 += $offset;
604 $e2 += $offset;
605 if($s1 < $s2) {
606 if($e1 < $s2) {
607 $self->{"insert" . $which} = substr($con, $e1, $s2 - $e1);
608 }
609 $self->{"c_start" . $which} = $s1;
610 $self->{"t_start" . $which} = $t_s1;
611 $self->{"start_chr" . $which} = $bp->{left_chr};
612 $self->{"c_end" . $which} = $e2;
613 $self->{"t_end" . $which} = $t_e2;
614 $self->{"end_chr". $which} = $bp->{right_chr};
615 $s1 = 1;
616 ($e1, $s2) = ($s2, $e1);
617 $e2 = $l;
618 }
619 else {
620 if($e2 < $s1) {
621 $self->{"insert" . $which} = substr($con, $e2, $s1 - $e2);
622 }
623 $self->{"c_start" . $which} = $s2;
624 $self->{"t_start" . $which} = $t_s2;
625 $self->{"start_chr" . $which} = $bp->{right_chr};
626 $self->{"c_end" . $which} = $e1;
627 $self->{"t_end" . $which} = $t_e1;
628 $self->{"end_chr" . $which } = $bp->{left_chr};
629 $s2 = 1;
630 ($e2, $s1) = ($s1, $e2);
631 $e1 = $l;
632 }
633
634 my $n = 1;
635 foreach my $r( ($range1, $range2) ) {
636 my $r_a = $r eq $range1 ? $range2 : $range1;
637 my($chr, $s, $e) = split /[:|-]/, $r;
638 my($chr_a, $s_a, $e_a) = split /[:|-]/, $r_a;
639 my($s_c, $e_c) = $n == 1 ? ($s1, $e1) : ($s2, $e2);
640 $n++;
641 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
642 print $QUERY ">query\n", substr($self->{consensus}, $s_c, $e_c - $s_c + 1), "\n";
643 close $QUERY;
644 my $hit = $aligner->run(-TARGET => $genome . ":" . $r, -QUERY => "query.fa");
645 my $h = $hit->{query};
646 return unless $h;
647 my $hsp = $h->next_hsp;
648 my @matches = $hsp->matches;
649 $hit = $mapper->run(-QUERY => "query.fa");
650 foreach $h (@{$hit->{query}}) {
651 next if($h->{tchr} eq $chr && $h->{tstart} >= $s - $l && $h->{tend} <= $e + $l); # the same one
652 return if($h->{matches} >= $matches[0]); #find a better or as good as match
653 return if($h->{tchr} eq $chr_a && $h->{matches} - $matches[0] >= -5 && $self->{type} == CTX);
654 }
655 }
656 print STDERR "PASSED\n";
657 return 1;
658 }
659
660 sub _map_consensus {
661 my $self = shift;
662 my $range = shift;
663 my ($s_c, $e_c, $s_t, $e_t);
664
665 my $hits = $aligner->run(-TARGET => $genome . ":" . $range, -QUERY => "query.fa");
666 my ($chr, $s, $e) = split /[:|-]/, $range;
667 my $h = $hits->{query};
668 return unless $h; # can't find a good enough match
669 my $hsp = $h->next_hsp;
670 my @matches = $hsp->matches;
671 ($s_c, $e_c, $s_t, $e_t) = ($hsp->start('query'), $hsp->end('query'),
672 $hsp->start('hit'), $hsp->end('hit'));
673 return if( ($e_c - $s_c + 1) * 0.97 > $matches[0]); # the alignment is not good
674 ($s_t, $e_t) = $hsp->strand == 1 ? ($s_t + $s, $e_t + $s) : ($e_t + $s, $s_t + $s);
675 return ($s_c, $e_c, $s_t, $e_t);
676 }
677
678 sub _find_bp_ref_range {
679 my $self = shift;
680 my $bp = shift;
681 my $con = shift;
682
683 my $l = int(length($con)/2);
684 my $ext = $l * 2;
685 $ext = 100000 if($RNASeq);
686 if($self->{type} == DEL && abs($bp->{left_pos} - $bp->{right_pos}) < $l ){
687 $l = abs($bp->{left_pos} - $bp->{right_pos}) - 1;
688 }
689
690 my($range1, $range2);
691 my ($chr, $p) = ($bp->{left_chr}, $bp->{left_pos});
692 my $ort = $bp->{left_ort};
693 my ($s, $e);
694 if($bp->{left_range} ) {
695 $s = $bp->{left_range}->[0] - $l;
696 $e = $bp->{left_range}->[1] + $l;
697 }
698 else {
699 $s = $e = $p;
700 $s = $ort eq "+" ? $p - $ext : $p - $l;
701 $e = $ort eq "+" ? $p + $l : $p + $ext;
702 }
703 $s = 1 if($s < 1);
704 $e = $bp->{left_pos} + $l if($self->{type} == DEL);
705 $range1 = $chr . ":" . $s . "-" . $e;
706
707 ($chr, $p) = ($bp->{right_chr}, $bp->{right_pos});
708 $ort = $bp->{left_ort};
709 if($bp->{right_range} ) {
710 $s = $bp->{right_range}->[0] - $l;
711 $e = $bp->{right_range}->[1] + $l;
712 }
713 else {
714 $s = $e = $p;
715 $s = $ort eq "-" ? $p - $ext : $p - $l;
716 $e = $ort eq "-" ? $p + $l : $p + $ext;
717 }
718 $s = $bp->{right_pos} - $l if($self->{type} == DEL);
719 $s = 1 if($s < 1);
720 $range2 = $chr . ":" . $s . "-" . $e;
721
722 return ($range1, $range2);
723 }
724
725 sub _find_ref_range {
726 my $self = shift;
727 my $l = int(length($self->{consensus})/2);
728 my ($range1, $range2);
729 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
730
731 my ($chr, $p) = ($bp1->{chr}, $bp1->{pos});
732 my ($s, $e);
733 my $ext = $l*2 ;
734 $ext = 100000 if($RNASeq);
735 if($self->{type} == DEL && abs($bp1->{pos} - $bp2->{pos}) < $l ){
736 $l = abs($bp1->{pos} - $bp2->{pos}) - 1;
737 }
738
739 my $ort = $bp1->{pos} == $bp1->{left_pos} ?
740 ($bp1->{left_ort} eq "+" ? "-" : "+") : $bp1->{right_ort};
741 if($bp1->{range}) {
742 $s = $bp1->{range}->[0] - $l;
743 $s = 1 if($s < 1);
744 $e = $bp1->{range}->[1] + $l;
745 }
746 else {
747 $s = $e = $p;
748 $s = $ort eq "+" ? $p - $l : $p - $ext;
749 $e = $ort eq "+" ? $p + $ext : $p + $l;
750 }
751 $s = 1 if($s < 1);
752 $e = $bp1->{pos} + $l if($self->{type} == DEL);
753
754 $range1 = $chr . ":" . $s . "-" . $e;
755
756 ($chr, $p) = ($bp2->{chr}, $bp2->{pos});
757 $ort = $bp2->{pos} == $bp2->{left_pos} ?
758 ($bp2->{left_ort} eq '+' ? '-' : '+') : $bp2->{right_ort};
759 if($self->{type} == INV) {
760 $ort = $bp1->{pos} == $bp1->{left_pos} ?
761 $bp1->{right_ort} : ($bp1->{left_ort} eq '+' ? '-' : '+');
762 }
763
764 if($bp2->{range} && $self->{type} != INV) {
765 $s = $bp2->{range}->[0] - $l;
766 $e = $bp2->{range}->[1] + $l;
767 }
768 else {
769 $s = $e = $p;
770 $s = $ort eq "+" ? $p - $l : $p - $ext;
771 $e = $ort eq "+" ? $p + $ext : $p + $l;
772 }
773 $s = $bp2->{pos} - $l if($self->{type} == DEL);
774 $s = 1 if($s < 1);
775
776 $range2 = $chr . ":" . $s . "-" . $e;
777 return($range1, $range2);
778 }
779
780 my $GAP = -5;
781 my $MATCH = 2;
782 my $MISMATCH = -5;
783 sub _refine_bp {
784 my $self = shift;
785 my $h = shift;
786 my $ref_seq = shift;
787 my $con = $self->{consensus};
788 my $hsp = $h->next_hsp;
789 my ($i, $j) = ($hsp->start("hit"), $hsp->start("query"));
790 if($hsp->strand == -1) {
791 $con = rev_comp($con);
792 $j = length($con) - $hsp->end("query");
793 }
794
795 my ($score, $g, $max_score, $s) = (0, 0, 0);
796 my ($current_s_r, $current_s_c) = ($i, $j);
797 my ($s_r, $s_c, $e_r, $e_c);
798 my $p;
799 my ($n_matches, $n_max_matches) = (0, 0);
800 my @bl_r = @{$hsp->gap_blocks('hit')};
801 my @bl_c = @{$hsp->gap_blocks('query')};
802 for( my $n = 0; $n < scalar @bl_r; $n++) {
803 my $m = $bl_r[$n]->[1];
804 if($p) { #gaps
805 if (!$RNASeq || $bl_r[$n]->[0] - $p < 25) {
806 $score += $GAP * ($bl_r[$n]->[0] - $p);
807 $score = 0 if($score < 0 )
808 }
809 $i += $m;
810 $j += $bl_c[$n]->[1];
811 }
812
813 while($m) { #matches / mismatches
814 ($current_s_r, $current_s_c, $n_matches) = ($i, $j, 0)
815 if($score == 0); #reset
816
817 if(substr($ref_seq, $i, 1) eq substr($con, $j, 1)) {
818 $score += $MATCH;
819 $n_matches++;
820 }
821 else {
822 $score += $MISMATCH;
823 }
824 if($score > $max_score) {
825 $n_max_matches = $n_matches;
826 $max_score = $score;
827 ($s_r, $s_c) = ($current_s_r, $current_s_c);
828 ($e_r, $e_c) = ($i, $j);
829 }
830 $n_matches = $score = 0 if($score < 0 );
831 $m--; $i++; $j++;
832 }
833 $p = $bl_r[$n]->[0] + $bl_r[$n]->[1];
834 }
835 return($n_max_matches, $s_r, $e_r, $s_c, $e_c);
836 }
837
838 sub _mapping_artifact_filter {
839 my $self = shift;
840 if($self->{type} == INV) {
841 return ($self->_mapping_artifact($self->{consensus})
842 || $self->_mapping_artifact($self->{consensus2}) );
843 }
844 return $self->_mapping_artifact($self->{consensus});
845 }
846
847 sub _mapping_artifact {
848 my $self = shift;
849 my $con = shift;
850 my $l = length $con;
851 print STDERR "Mapping artifact filter ... ";
852 $self->{error} = "Mapping Artifact";
853 $l = $l - 5;
854 my $options = " minIdentity=97 -out=psl -nohead";
855 $options = $options . " -maxIntron=5 " unless $RNASeq;
856 my $exp = $l;
857 if($self->{type} == DEL && $RNASeq) {
858 $exp = $self->{second_bp}{pos} - $self->{first_bp}{pos};
859 $options = $options . " -maxIntron=" . ($exp + 100) if($exp > 75000);
860 }
861
862 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
863 print $QUERY ">query\n", $con, "\n";
864 close $QUERY;
865 my $tmp_mapping = $mapper->run(-QUERY => "query.fa", -OPTIONS => $options);
866 system("rm query.fa"); system("rm query.fa*");
867
868 $l = $l + 5;
869 if($RNASeq) {
870 #can't find a decent mapping for DEL event
871 return if($self->{type} == DEL && !$tmp_mapping->{query});
872 foreach my $h(@{$tmp_mapping->{query}}) {
873 #next if($h->{tend} - $h->{tstart} < $exp - 10);
874 return if ($self->{type} != DEL && $h->{qend} - $h->{qstart} > $l - 10);
875 my $chr = $h->{tchr};
876 $chr = "chr" . $chr unless $chr =~ m/^chr/;
877 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+"));
878 return unless ($f_tree && $r_tree);
879 my @f_genes = $f_tree->intersect([$h->{tstart}, $h->{tend}]);
880 my @r_genes = $r_tree->intersect([$h->{tstart}, $h->{tend}]);
881 return if(scalar @f_genes <= 1 && scalar @r_genes <= 1);
882 my @f_tmp_genes;
883 my @r_tmp_genes;
884
885 foreach my $g (@f_genes) {
886 foreach my $block (@{$h->{blocks}}) {
887 if($g->val->overlap($block)){
888 push @f_tmp_genes, $g;
889 last;
890 }
891 }
892 }
893 foreach my $g (@r_genes) {
894 foreach my $block (@{$h->{blocks}}) {
895 if($g->val->overlap($block)){
896 push @r_tmp_genes, $g;
897 last;
898 }
899 }
900 }
901 return if(scalar @f_tmp_genes < 1 && scalar @r_tmp_genes < 1);
902 }
903 }
904 else {
905 foreach my $h (@{$tmp_mapping->{query}}){
906 return if($h->{tend} - $h->{tstart} < $l + 10 && $h->{qend} - $h->{qstart} > $l - 5);
907 }
908 }
909 $self->{error} = undef;
910 return 1;
911 }
912
913 sub _germline_sclip_filter {
914 my $self = shift;
915 print STDERR "Germline sclip filter\n";
916 return 1 if(!$sam_g);
917 $self->{error} = "Germline Event";
918 my $rtn = _compare_seq($self->{first_bp}) &&
919 _compare_seq($self->{second_bp});
920 $self->{error} = undef if($rtn);
921 return $rtn;
922
923 }
924
925 sub _compare_seq {
926 my $bp = shift;
927 my $seq;
928 if($bp->{pos} == $bp->{left_pos}) {
929 my $seg = $sam_d->segment($bp->{right_chr}, $bp->{right_pos} -
930 $germline_seq_width, $bp->{right_pos} + $germline_seq_width);
931 $seq = $seg->dna;
932 }
933 else {
934 my $seg = $sam_d->segment($bp->{left_chr}, $bp->{left_pos} - $germline_seq_width,
935 $bp->{left_pos} + $germline_seq_width);
936 $seq = $seg->dna;
937 }
938 my $sclip_seq = _get_sclip_seqs($sam_g, $bp->{chr}, $bp->{pos} - $germline_search_width,
939 $bp->{pos} + $germline_search_width);
940
941
942 open my $TARGET, ">target.fa" or croak "can't open target.fa : $OS_ERROR";
943 print $TARGET ">target\n", $seq, "\n";
944 close $TARGET;
945
946 if(keys(%{$sclip_seq})) {
947 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
948 foreach my $s (keys(%{$sclip_seq})) {
949 print $QUERY ">", $s, "\n", $sclip_seq->{$s}, "\n";
950 }
951 close($QUERY);
952 my $hits = $aligner->run(-TARGET => "target.fa", -QUERY => 'query.fa');
953 return if(keys(%{$hits}));
954 }
955 return 1;
956 }
957
958 sub _type_distance_filter {
959 my $self = shift;
960 my $type = $self->{type};
961 $self->{error} = "Type Distance";
962 print STDERR "Type distance filter\n";
963 if($type == UNK) { #those events are not sure, always ignore them
964 print STDERR $self->to_full_string, "\n";
965 return;
966 }
967 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
968 return 1 if($bp2->{sc_reads} == 0);
969 my ($d1, $d2);
970 if($type == INS || $type == DEL) {
971 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos},
972 $bp1->{right_pos} - $bp2->{right_pos});
973 }
974 if( $type == INV ){
975 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos},
976 $bp2->{right_pos} - $bp1->{right_pos});
977 }
978 if( $type == ITX ) {
979 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos},
980 $bp2->{left_pos} - $bp1->{right_pos});
981 }
982 if($type == CTX ) {
983 if($bp1->{left_ort} eq $bp1->{right_ort} ||
984 $bp1->{sc_reads} == 0 || $bp2->{sc_reads} == 0) {
985 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos}, $bp1->{right_pos} - $bp2->{right_pos}) ;
986 }
987 else {
988 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos}, $bp2->{left_pos} - $bp1->{right_pos});
989 }
990 }
991 print STDERR $self->to_full_string, "\t", $d1, "\t", $d2, "\n";
992 if($d1 <= 1 && $d2 <= 1 || abs($d1 - $d2) <= 5) {
993 $self->{error} = undef;
994 return 1;
995 }
996 if(abs($d1) > $max_bp_dist || abs($d2) > $max_bp_dist){
997 print STDERR "Distance fail\n";
998 return;
999 }
1000 $self->{error} = undef;
1001 return 1; # the distance is small enough
1002 }
1003
1004 sub _germline_indel_filter {
1005 my $self = shift;
1006 my $type = $self->{type};
1007 print STDERR "Germline INDEL FILTER test\n";
1008 return if(abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) < 40);
1009 return 1 if(!$sam_g);
1010 return 1 if( $type != INS && $type != DEL);
1011 return 1 if( abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) > 50 );
1012 $self->{error} = "Germline INDEL";
1013 my ($start, $end ) = ($self->{first_bp}{left_pos}, $self->{first_bp}{right_pos});
1014 my $indel_len = abs($end - $start);
1015 ($start, $end) = ($end, $start) if($start > $end);
1016
1017 my $itr = $sam_g->features(-iterator => 1, -seq_id => $self->{first_bp}{chr},
1018 -start => $start, -end => $end);
1019 while( my $a = $itr->next_seq ) {
1020 next if($type == DEL && $a->cigar_str !~ m/D/);
1021 next if($type == INS && $a->cigar_str !~ m/I/);
1022 my $cigar_array = $a->cigar_array;
1023 my $pos = $a->pos;
1024 foreach my $ca (@{$cigar_array}) {
1025 if($ca->[0] eq 'M') {
1026 $pos += $ca->[1];
1027 next;
1028 }
1029 if($ca->[0] eq 'I' && $type == INS ){
1030 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20);
1031 next;
1032 }
1033 if($ca->[0] eq 'D' && $type == DEL) {
1034 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20);
1035 $pos += $ca->[1];
1036 next;
1037 }
1038 }
1039 }
1040 $self->{error} = undef;
1041 return 1;
1042 }
1043
1044 sub _tandem_repeat_filter {
1045 my $self = shift;
1046 my $con = $self->{consensus};
1047 print STDERR "low compexity filter\n";
1048 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
1049 print $QUERY ">query\n", $self->{consensus}, "\n";
1050 close $QUERY;
1051
1052 system("ptrfinder -seq query.fa -repsize $tr_min_size,$tr_max_size -minrep $tr_min_num > query.fa.rep");
1053 if(-e "query.fa.rep") {
1054 open my $REP, "<query.fa.rep" or croak "can't open query.fa.rep:$OS_ERROR";
1055 while( my $line = <$REP>) {
1056 chomp $line;
1057 my($pattern, $len, $times, $s, $e) =
1058 $line =~ m/PATTERN\s(\w+)\sLENGTH\s(\d+)\sTIMES\s(\d+)\sSTART\s(\d+)\sSTOP\s(\d+)\sID/;
1059 if($self->{type} == DEL || $self->{type} == INS){
1060 if(abs($self->{first_bp}{left_pos} - $self->{first_bp}{right_pos}) < $tr_max_indel_size) {
1061 print STDERR "Tandem repeat mediated INDEL!\n";
1062 close $REP;
1063 return;
1064 }
1065 }
1066 else{
1067 if(($s < 5 || length($self->{consensus}) - $e < 5) && $len * $times > 30 ) {
1068 print STDERR "Tandem repeat mediated events!\n";
1069 close $REP;
1070 return;
1071 }
1072 }
1073 }
1074 }
1075 return 1;
1076 }
1077
1078 sub _get_sclip_seqs {
1079 my ($sam, $chr, $start, $end) = @_;
1080 my %rtn;
1081 my $range = $chr . ":" . $start . "-" . $end;
1082
1083 $sam->fetch($range, sub {
1084 my $a = shift;
1085 my $cigar_str = $a->cigar_str;
1086 return if($cigar_str !~ /S/);
1087 my ($sclip_len, $pos, $seq, $qual);
1088 my @cigar_array = @{$a->cigar_array};
1089 if($cigar_array[0]->[0] eq 'S' ) {
1090 $sclip_len = $cigar_array[0]->[1]; $pos = $a->start;
1091 return if($pos < $start or $pos > $end); # the softclipping position is not in range
1092 $seq = substr($a->query->dna, 0, $sclip_len );
1093 $rtn{$a->qname} = $seq if($sclip_len >= 10);
1094 }
1095 #if($cigar_str =~ m/S(\d+)$/) {
1096 if($cigar_array[$#cigar_array]->[0] eq 'S') {
1097 $sclip_len = $cigar_array[$#cigar_array]->[1];
1098 $pos = $a->end;
1099 return if($pos < $start or $pos > $end); # the softclipping position is not in range
1100 $seq = substr($a->qseq, $a->l_qseq - $sclip_len );
1101 $rtn{$a->qname} = $seq if($sclip_len >= 10);
1102 }
1103 }
1104 );
1105 return \%rtn;
1106 }
1107
1108 sub _RNASeq_strand_filter {
1109 my $self = shift;
1110 my $type = $self->type;
1111 print STDERR "RNASeq strand filter\n";
1112 return 1 unless $gm;
1113 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
1114 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort});
1115 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr});
1116 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
1117 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1);
1118 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1,
1119 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2);
1120 $tmp1 = "chrM" if($tmp1 eq "chrMT");
1121 $tmp2 = "chrM" if($tmp2 eq "chrMT");
1122 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+"));
1123 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+"));
1124 my ($g_ort1, $g_ort2);
1125 return 1 unless($r_tree1 && $r_tree2 && $f_tree1 && $f_tree2);
1126 $g_ort1 = "-" if(scalar($r_tree1->intersect([$pos1 - 5, $pos1])) > 0);
1127 $g_ort1 = "+" if(scalar($f_tree1->intersect([$pos1 - 5, $pos1])) > 0);
1128 $g_ort2 = "-" if(scalar($r_tree2->intersect([$pos2, $pos2 + 5])) > 0);
1129 $g_ort2 = "+" if(scalar($f_tree2->intersect([$pos2, $pos2 + 5])) > 0);
1130 return 1 unless($g_ort1 && $g_ort2);
1131 return 1 if($g_ort1 eq $ort1 && $g_ort2 eq $ort2);
1132 return 1 if($g_ort1 ne $ort1 && $g_ort2 ne $ort2);
1133 return;
1134 }
1135
1136 # INS filter check to make sure that the INS part overlap with any gene
1137 # if the INS part only in intron will return as a false positive
1138 sub _RNASeq_INS_filter {
1139 my $self = shift;
1140 my $type = $self->{type};
1141 return 1 if($type != INS);
1142 print STDERR "RNAseq INS filter\n";
1143 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
1144 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
1145 ($pos1, $pos2) = ($pos2, $pos1) if($pos2 < $pos1);
1146 my $chr = $bp1->{chr};
1147 $chr = "chr" . $chr unless $chr =~ m/chr/;
1148 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+"));
1149 foreach my $tree( ($r_tree, $f_tree) ) {
1150 my @tmp;
1151 @tmp = $f_tree->intersect([$pos1, $pos2]);
1152 foreach my $g (@tmp) {
1153 return 1 if($g->val->overlap([$pos1, $pos2]));
1154 }
1155 }
1156 return;
1157 }
1158
1159 sub _RNASeq_DEL_filter {
1160 my $self = shift;
1161 my $type = $self->type;
1162 return 1 if($type != DEL);
1163 print STDERR "RNAseq DEL filter\n";
1164
1165 my ($gene1, $gene2, $dist) = @{$self->get_genes};
1166 return if($gene1 eq $gene2);
1167 my ($d_plus, $d_minus) = (0, 0);
1168 $d_plus = $1 if($dist =~ m/(\d+)\(\+\)/);
1169 $d_minus = $1 if($dist =~ m/(\d+)\(\-\)/);
1170 return if($d_plus == 0 && $d_minus == 0);
1171 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
1172 return if($bp1->{cover} == 0 || $bp2->{cover} == 0);
1173 my($left_len, $right_len);
1174 if(exists $bp1->{left_len}) {
1175 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len});
1176 }
1177 else {
1178 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq});
1179 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq});
1180 }
1181 return if($left_len < 30 || $right_len < 30);
1182 return if($bp1->{sc_reads} < 10 || $bp2->{sc_reads} < 10);
1183 return unless $bp1->{left_gene}->overlap([$bp1->{pos} - $left_len, $bp1->{pos}]);
1184 return unless $bp1->{right_gene}->overlap([$bp2->{pos} + $right_len, $bp2->{pos}]);
1185 return 1;
1186 }
1187
1188 sub get_statistics {
1189 my $self = shift;
1190 my $half_width = shift;
1191 my $sam = $self->sam_d;
1192 my @rtn;
1193
1194 foreach my $bp ( ($self->{first_bp}, $self->{second_bp})) {
1195 my ($chr, $pos ) = ($bp->{chr}, $bp->{pos});
1196 my $range = $chr . ":" . ($pos - $half_width) . "-" . ($pos + $half_width);
1197 my ($n_seq, $n_rep, $total_pid) = (0, 0, 0);
1198
1199 $sam->fetch($range, sub {
1200 my $a = shift;
1201 return unless ($a->start && $a->end);
1202 return unless ($a->start >= $pos - $half_width && $a->end <= $pos + $half_width);
1203 $n_seq++;
1204 $total_pid += _cal_pid($a);
1205 if($a->has_tag("XT")) {
1206 $n_rep++ if($a->aux_get("XT") ne "U");
1207 }
1208 }
1209 );
1210 if($n_seq == 0) {
1211 push @rtn, (0, 0);
1212 }
1213 else {
1214 push @rtn, ($total_pid/$n_seq, $n_rep/$n_seq);
1215 }
1216 }
1217 return @rtn;
1218 }
1219
1220 sub _cal_pid {
1221 my $a = shift;
1222 my ($ref, $matches, $query) = $a->padded_alignment;
1223 my ($n_match, $n) = (0, 0);
1224 for( my $i = 0; $i < length($matches); $i++) {
1225 my $c = substr $matches, $i, 1;
1226 $n_match++ if($c eq "|");
1227 $n++;
1228 }
1229 return 0 if($n == 0);
1230 return $n_match/$n;
1231 }
1232
1233
1234 1;
1235 __END__;
1236
1237 =pod
1238
1239 =head1 NAME
1240