comparison CREST.pl @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 #!/usr/bin/perl -w
2 #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev';
3 use strict;
4 use Carp;
5 use Getopt::Long;
6 use English;
7 use Pod::Usage;
8 use Data::Dumper;
9 use Bio::DB::Sam;
10 use Bio::DB::Sam::Constants;
11 use Bio::SearchIO;
12 use Bio::SeqIO;
13 use File::Temp qw/ tempfile tempdir /;
14 use File::Spec;
15 use File::Path;
16 use File::Copy;
17 use File::Basename;
18 use List::MoreUtils qw/ uniq /;
19 use Cwd;
20 use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP);
21 use SVUtil qw($use_scratch $work_dir clip_fa_file prepare_reads parse_range get_input_bam
22 find_smallest_cover get_work_dir is_PCR_dup read_fa_file get_sclip_reads);
23 use StructVar;
24 require SVExtTools;
25
26 use Transcript;
27 use Gene;
28 use GeneModel;
29
30 my $debug = 1;
31 $min_percent_id = 90;
32 # input/output
33 my ( $out_dir, $out_prefix, $range, $bam_d, $bam_g, $sclip_file);
34 my $out_suffix = "predSV.txt";
35 my $read_len = 100;
36 my $ref_genome ;
37
38 my $hetero_factor = 0.4;
39 my $triger_p_value = 0.05;
40
41 #RNASeq support
42 my $RNASeq = 0;
43 my $gene_model_file;
44 my $gm_format = "REFFLAT";
45 my $gm;
46 my $max_sclip_reads = 50; # we will consider the position if we have enough sclipped reads
47 my $min_one_side_reads = 5;
48 my $min_pct_sclip_reads = 5; # we require at least 5% reads have softclipping
49
50 # external programs related variable
51 # 1. cap3 related variables
52 my $cap3 = "cap3";
53 my $cap3_options=" -h 70 -y 10 > /dev/null";
54
55 # 2 blat related variables, using blat server and blat standalone
56 my $blat_client_exe = "gfClient";
57 my $blat_client_options = ' -out=psl -nohead -minIdentity=95 -maxIntron=5';
58 my $target_genome = "hg18.2bit";
59 my $dir_2bit = "/";
60 my $blat_server = "sjblat";
61 my $blat_port = 50000;
62 my $blat_exe = "blat";
63 my $blat_options = " -tileSize=7 -stepSize=1 -out=psl -minScore=15 -noHead -maxIntron=1 ";
64
65 $use_scratch = 0;
66 my $paired = 1;
67
68 my $rescue = 1;
69 # other options
70 my ($min_sclip_reads, $max_repetitive_cover, $min_sclip_len );
71 my $min_hit_reads;
72 my $rmdup;
73 my $min_clip_len;
74 my ($max_score_diff, $max_num_hits);
75 my ($min_dist_diff, $min_hit_len) = (15, 15);
76
77 #SV filter related parameters
78 my $max_bp_dist = 15;
79 my $min_percent_cons_of_read = 0.75;
80 my $germline_seq_width = 100;
81 my $germline_search_width = 50;
82 my $rm_tandem_repeat = 1; #tandem repeat mediated events
83 my $tr_max_indel_size = 100; #tandem repeat mediated indel size
84 my $tr_min_size = 2;
85 my $tr_max_size = 8; #tandem repeat max_size of single repeat
86 my $tr_min_num = 4; #tandem repeat minimum number of occurence
87
88 # verification pupuse
89 my $verify_file;
90 my $sensitive;
91 my $cluster_size;
92
93 # common help options
94 my ( $help, $man, $version, $usage );
95 my $optionOK = GetOptions(
96 # SV validation related parameters
97 'max_bp_dist=i' => \$max_bp_dist,
98 'min_percent_cons_of_read=f' => \$min_percent_cons_of_read,
99 'germline_seq_width=i' => \$germline_seq_width,
100 'germline_search_width=i' => \$germline_search_width,
101 # tandem repeat related parameters
102 'rm_tandem_repeat!' => \$rm_tandem_repeat,
103 'tr_max_indel_size=i' => \$tr_max_indel_size,
104 'tr_min_size=i' => \$tr_min_size,
105 'tr_max_size=i' => \$tr_max_size,
106 'tr_min_num=i' => \$tr_min_num,
107 'hetero_factor=f' => \$hetero_factor,
108 'triger_p_value=f' => \$triger_p_value,
109
110 # input/output
111 #'s|sample=s' => \$sample,
112 'd|input_d=s' => \$bam_d,
113 'g|inpug_g=s' => \$bam_g,
114 'f|sclipfile=s' => \$sclip_file,
115 'p|prefix=s' => \$out_prefix,
116 'o|out_dir=s' => \$out_dir,
117 'l|read_len=i' => \$read_len,
118 'ref_genome=s' => \$ref_genome,
119 'v|verify_file=s' => \$verify_file,
120 'sensitive' => \$sensitive,
121 'paired!' => \$paired,
122 'rescue!' => \$rescue,
123 'cluster_size=i' => \$cluster_size,
124 # external programs location and options
125 'scratch!' => \$use_scratch,
126 'cap3=s' => \$cap3,
127 'cap3opt=s' => \$cap3_options,
128 'blatclient=s' => \$blat_client_exe,
129 'blatclientopt=s' => \$blat_client_options,
130 'blat=s' => \$blat_exe,
131 't|target_genome=s' => \$target_genome,
132 'blatopt=s' => \$blat_options,
133 'blatserver=s' => \$blat_server,
134 'blatport=i' => \$blat_port,
135 '2bitdir=s' => \$dir_2bit,
136 #RNAseq support
137 'RNASeq' => \$RNASeq,
138 'genemodel=s' => \$gene_model_file,
139 'gmformat=s' => \$gm_format,
140 #other related parameters
141 'r|range=s' => \$range,
142 'max_score_diff=i' => \$max_score_diff,
143 'm|min_sclip_reads=i' => \$min_sclip_reads,
144 'min_one_side_reads=i' => \$min_one_side_reads,
145 'max_rep_cover=i' => \$max_repetitive_cover,
146 'min_sclip_len=i' => \$min_sclip_len,
147 'min_hit_len=i' => \$min_hit_len,
148 'min_dist_diff=i' => \$min_dist_diff,
149 'min_hit_reads=i' => \$min_hit_reads,
150 'rmdup!' => \$rmdup,
151
152 # soft_clipping related parameters (from SVDector package)
153 'min_percent_id=i' => \$min_percent_id,
154 'min_percent_hq=i' => \$SCValidator::min_percent_hq,
155 'lowqual_cutoff=i' => \$lowqual_cutoff,
156
157 # common help parameters
158 'h|help|?' => \$help,
159 'man' => \$man,
160 'usage' => \$usage,
161 'version' => \$version,
162 );
163
164 pod2usage(-verbose=>2) if($man or $usage);
165 pod2usage(1) if($help or $version );
166
167 my $start_dir = getcwd;
168
169 # figure out input file
170 if(!$bam_d) {
171 pod2usage(1);
172 croak "You need specify input bam file(s)";
173 }
174
175 if(!$sclip_file) {
176 pod2usage(1);
177 croak "You need to specify the softclipping file";
178 }
179 $sclip_file = File::Spec->rel2abs($sclip_file);
180
181 if(!$ref_genome) {
182 pod2usage(1);
183 croak "You need to specify the reference genome used by bam file";
184 }
185
186 croak "The file you sepcified does not exist" unless (
187 -e $sclip_file && -e $bam_d && -e $ref_genome && -e $target_genome);
188 my $input_base;
189 $bam_d = File::Spec->rel2abs($bam_d);
190 $input_base = fileparse($bam_d);
191 $bam_g = File::Spec->rel2abs($bam_g) if($bam_g);
192
193 #RNASeq support
194 if($RNASeq) {
195 croak "You need specify the input gene model file" unless ($gene_model_file);
196 StructVar->add_RNASeq_filter();
197 $min_sclip_reads = 10 unless $min_sclip_reads;
198 $max_repetitive_cover = 5000 unless $max_repetitive_cover;
199 $min_sclip_len = 20 unless $min_clip_len;
200 $max_score_diff = 5 unless $max_score_diff;
201 $max_num_hits = 3 unless $max_num_hits;
202 $min_hit_reads = 5 unless $min_hit_reads;
203 $cluster_size = 5 unless $cluster_size;
204 }
205 else {
206 $min_sclip_reads = 3 unless $min_sclip_reads;
207 $max_repetitive_cover = 500 unless $max_repetitive_cover;
208 $min_sclip_len = 20 unless $min_clip_len;
209 $max_score_diff = 5 unless $max_score_diff;
210 $max_num_hits = 10 unless $max_num_hits;
211 $min_hit_reads = 1 unless $min_hit_reads;
212 $cluster_size = 1 unless $cluster_size;
213 }
214
215 if($gene_model_file) {
216 $gm = GeneModel->new if($gene_model_file);
217 $gm->from_file($gene_model_file, $gm_format);
218 StructVar->gene_model($gm);
219 }
220
221 # set up the external programs and validators
222 # Those variable will be global
223 my $assembler = Assembler->new(
224 -PRG => $cap3,
225 -OPTIONS => $cap3_options
226 );
227
228 my $mapper = Mapper->new(
229 -PRG => join(' ', ($blat_client_exe, $blat_server, $blat_port)),
230 -OPTIONS => $blat_client_options,
231 -BIT2_DIR => $dir_2bit,
232 -MAX_SCORE_DIFF => $max_score_diff,
233 -MAX_NUM_HITS => $max_num_hits,
234 );
235
236 my $aligner = Aligner->new(
237 -PRG => $blat_exe,
238 -OPTIONS => $blat_options,
239 );
240
241 StructVar->assembler($assembler);
242 StructVar->read_len($read_len);
243 StructVar->aligner($aligner);
244 StructVar->mapper($mapper);
245 StructVar->RNASeq(1) if($RNASeq);
246 StructVar->genome($target_genome);
247 StructVar->remove_filter("tandem_repeat") unless($rm_tandem_repeat);
248 StructVar->tr_max_indel_size($tr_max_indel_size);
249 StructVar->tr_min_size($tr_min_size);
250 StructVar->tr_max_size($tr_max_size);
251 StructVar->tr_min_num($tr_min_num);
252 StructVar->max_bp_dist($max_bp_dist) if($max_bp_dist);
253 StructVar->germline_seq_width($germline_seq_width) if($germline_seq_width);
254 StructVar->germline_search_width($germline_search_width) if($germline_search_width);
255
256
257 my $validator = SCValidator->new();
258 $validator->remove_validator('strand_validator') if(!$paired);
259
260 #setup output and working directory
261 $out_dir = getcwd if(!$out_dir);
262 mkdir $out_dir if(!-e $out_dir || ! -d $out_dir);
263 $work_dir = get_work_dir(-SCRATCH => $use_scratch);
264 $work_dir = $out_dir if(!$work_dir);
265 $use_scratch = undef if($work_dir eq $out_dir); # don't erase the out_dir
266 chdir($work_dir);
267
268 # figure out output prefix
269 if(!$out_prefix) {
270 $out_prefix = $input_base;
271 }
272
273 my $sam_d = Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome);
274 StructVar->sam_d($sam_d);
275 my $sam_g;
276 if($bam_g) {
277 $sam_g = Bio::DB::Sam->new( -bam => $bam_g);
278 StructVar->sam_g($sam_g);
279 }
280
281 #my $output_file = File::Spec->catfile($outdir, $out_prefix . $out_suffix);
282 my $output_file;
283 $output_file = join('.', $out_prefix, $out_suffix);
284 $output_file = File::Spec->catfile($out_dir, $output_file);
285
286 # the softclip file is sorted, so no need to re-sort it
287 open my $SCLIP, "<$sclip_file" or croak "can't open $sclip_file:$OS_ERROR";
288 my %sclip;
289 my $pre_p;
290 while( my $line = <$SCLIP> ) {
291 chomp $line;
292 my ($chr, $pos, $ort, $cover, $C) = split /\t/, $line;
293 if( ! exists($sclip{$chr})) {
294 $sclip{$chr} = [];
295 $pre_p = $pos;
296 }
297 $C = 30 unless $C;
298 if($pos < $pre_p) {
299 print STDERR "The input file is not sorted!";
300 exit(1);
301 }
302 push @{$sclip{$chr}}, [$pos, $ort, $cover, $C];
303 }
304 close $SCLIP;
305
306 my @final_SV;
307 my @s_cover = @{find_smallest_cover($min_sclip_reads, $hetero_factor, $triger_p_value)};
308 if($range) {
309 @final_SV = detect_range_SVs($sam_d, $range, \%sclip, \@s_cover);
310 }
311 else {
312 foreach my $chr (keys %sclip) {
313 my @tmpSV = detect_range_SVs($sam_d, $chr, \%sclip, \@s_cover);
314 @final_SV = @tmpSV unless (@final_SV);
315 foreach my $sv (@tmpSV) {
316 push @final_SV, $sv if($sv && ! is_dup_SV(\@final_SV, $sv));
317 }
318 }
319 }
320 undef %sclip;
321 open my $OUT, ">$output_file" or croak "can't open $output_file:$OS_ERROR";
322 foreach my $SV (@final_SV) {
323 if($SV->filter) {
324 print $OUT $SV->to_string ;
325 if($RNASeq) {
326 print $OUT "\t", join("\t", @{$SV->get_genes});
327 }
328 print $OUT "\n";
329 }
330 }
331 chdir $start_dir;
332 exit(0);
333
334 sub detect_range_SVs {
335 my ($sam_d, $range, $r_sclips, $r_scover) = @_;
336 my ($chr, $start, $end) = parse_range($range);
337 my ($tid) = $sam_d->header->parse_region($chr);
338 my $chr_len = $sam_d->header->target_len->[$tid];
339 my $r_range_sclips = search_sclip($r_sclips, $range);
340 return unless $r_range_sclips;
341 my @scover = @{$r_scover};
342 my @rtn;
343
344 my ($f_tree, $r_tree);
345 if($RNASeq) {
346 my $full_chr = $chr;
347 my ($gm_chr) = $gm->get_all_chr;
348 $full_chr = "chr" . $chr if($chr !~ m/chr/ && $gm_chr =~ /chr/);
349 ($f_tree, $r_tree) = ($gm->sub_model($full_chr, "+"), $gm->sub_model($full_chr, "-")) if($RNASeq);
350 }
351 my $n = scalar @{$r_range_sclips};
352 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '+', 1, 50];
353 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '-', 1, 50];
354 $n = $n + 2;
355
356 # try to save the 1 base off problem of soft-clipping
357 my (@ss1, $p1, $c1);
358 my (@ss2, $p2, $c2);
359 my ($n1, $n2, $cover1, $cover2) = (0, 0, 0, 0);
360 for( my $i = 0; $i < $n; $i++) {
361 my $s = $r_range_sclips->[$i];
362 # print join("\t", @{$s}), "\n" if($debug);
363 my $clip = $s->[1] eq '+' ? RIGHT_CLIP : LEFT_CLIP;
364 next if($s->[0] >= $chr_len );
365 if($clip == RIGHT_CLIP) {
366 $p1 = $c1 = $s->[0] unless $p1;
367 if($s->[0] < $c1) {
368 print STDERR "The soft-clipping file has problem! It's either not sorted or
369 the file has mutliple parts for a chromsome!";
370 last;
371 }
372 if($s->[0] - $c1 < $cluster_size) {
373 $n1 += $s->[2];
374 $cover1 = $s->[3] if($cover1 < $s->[3]);
375 push @ss1, $s;
376 if($s->[0] < $c1) {
377 $p1 += $s->[0];
378 $c1 = $p1 / scalar @ss1;
379 }
380 next;
381 }
382 }
383 else {
384 $p2 = $c2 = $s->[0] unless $p2;
385 if($s->[0] < $c2) {
386 print STDERR "The soft-clipping file has problem! It's either not sorted or
387 the file has mutliple parts for a chromsome!";
388 last;
389 }
390 if($s->[0] - $p2 < $cluster_size) {
391 $n2 += $s->[2];
392 $cover2 = $s->[3] if($cover2 < $s->[3]);
393 push @ss2, $s;
394 if($s->[0] < $c2) {
395 $p2 += $s->[0];
396 $c2 = $p2 / scalar @ss2;
397 }
398 next;
399 }
400 }
401
402 my @cc = $clip == LEFT_CLIP ? @ss2 : @ss1;
403 my $pos = $clip == LEFT_CLIP ? $ss2[$#ss2]->[0] : $ss1[0]->[0];
404 my ($n_s, $cover_s) = $clip == LEFT_CLIP ? ($n2, $cover2) : ($n1, $cover1);
405 my $tmp_range = $clip == LEFT_CLIP ? [$ss2[0]->[0], $ss2[$#ss2]->[0] + $cluster_size] :
406 [$ss1[0]->[0] - $cluster_size, $ss1[$#ss1]->[0]];
407
408 if($clip == LEFT_CLIP) {
409 @ss2 = (); $p2 = $c2 = $s->[0];
410 $n2 = $s->[2];
411 $cover2 = $s->[3];
412 push @ss2, $s;
413 }
414 else {
415 @ss1 = (); $p1 = $c1 = $s->[0];
416 $n1 = $s->[2];
417 $cover1 = $s->[3];
418 push @ss1, $s;
419 }
420
421 my @s_reads;
422 if($RNASeq) {
423 # print $n_s, "\n";
424 next if($n_s < $min_sclip_reads && $cover_s > $s_cover[$n_s] ) ;
425 next if($n_s < $max_sclip_reads && $n_s * 100 <= $cover_s * $min_pct_sclip_reads);
426 my @f_genes = $f_tree->intersect($tmp_range);
427 my @r_genes = $r_tree->intersect($tmp_range);
428 next if(scalar @f_genes == 0 && scalar @r_genes == 0);
429 }
430 else {
431 if($n_s < $min_sclip_reads) { #too few covered
432 if($cover_s > $s_cover[$n_s]) {
433 next if(!$sensitive);
434 foreach my $c (@cc) {
435 next if($c->[0] >= $chr_len);
436 push @s_reads, get_sclip_reads(-SAM => $sam_d,
437 -CHR =>$chr,
438 -START => $c->[0],
439 -END => $c->[0],
440 -CLIP => $clip,
441 -MINCLIPLEN => 0,
442 -VALIDATOR => $validator,
443 -PAIRED => $paired,
444 -RMDUP => $rmdup,
445 -EXTRA => abs($c->[0] - $pos) );
446 }
447 }
448 }
449 }
450 if(! @s_reads) {
451 foreach my $c (@cc) {
452 next if($c->[0] >= $chr_len);
453 push @s_reads, get_sclip_reads(-SAM => $sam_d,
454 -CHR =>$chr,
455 -START => $c->[0],
456 -END => $c->[0],
457 -CLIP => $clip,
458 -MINCLIPLEN => 0,
459 -VALIDATOR => $validator,
460 -PAIRED => $paired,
461 -RMDUP => $rmdup,
462 -EXTRA => abs($c->[0] - $pos) );
463 }
464 }
465 my $l = 0;
466 foreach my $r (@s_reads) {
467 my $len = length $r->{seq};
468 $l = $len if($l < $len);
469 }
470 next if ($l < $min_sclip_len);
471 print STDERR join("\t", ($chr, $pos, $clip == LEFT_CLIP ? "-" : "+", scalar @s_reads)), "\n";
472 my @SV = detect_SV(
473 -SAM => $sam_d,
474 -CHR => $chr,
475 -POS => $pos,
476 -ORT => $clip == LEFT_CLIP ? "-" : "+",
477 -SCLIP => \%sclip,
478 -COVER => $cover_s,
479 -SREADS => \@s_reads);
480 foreach my $sv (@SV) {
481 $sv->{type} = $sv->type;
482 push @rtn, $sv if($sv && ! is_dup_SV(\@rtn, $sv));
483 }
484 }
485
486 if($RNASeq) {
487 push @rtn, find_del_SVs($sam_d, $chr, $start, $end);
488 }
489 return @rtn;
490 }
491
492 sub is_dup_SV {
493 my($r_SVs, $sv) = @_;
494 foreach my $s (@{$r_SVs}) {
495 return 1
496 if( $s->first_bp->{pos} == $sv->second_bp->{pos} &&
497 $s->first_bp->{chr} eq $sv->second_bp->{chr} &&
498 $s->second_bp->{pos} == $sv->first_bp->{pos} &&
499 $s->second_bp->{chr} eq $sv->first_bp->{chr} &&
500 $s->{type} == $sv->{type} );
501 return 1
502 if( $s->first_bp->{pos} == $sv->first_bp->{pos} &&
503 $s->first_bp->{chr} eq $sv->first_bp->{chr} &&
504 $s->second_bp->{pos} == $sv->second_bp->{pos} &&
505 $s->second_bp->{chr} eq $sv->second_bp->{chr} &&
506 $s->{type} == $sv->{type} );
507
508 }
509 return;
510 }
511
512 sub search_sclip {
513 my ($r_sclip, $range) = @_;
514 my ($chr, $start, $end) = parse_range($range);
515 return $r_sclip->{$chr} if(!$start);
516 my $start_i = bin_search($r_sclip->{$chr}, $start);
517 my $end_i = bin_search($r_sclip->{$chr}, $end);
518 my @tmp = @{$r_sclip->{$chr}};
519 if($start_i <= $end_i) {
520 @tmp = @tmp[$start_i .. $end_i];
521 if($start_i == $end_i) {
522 return undef if($tmp[0]->[0] < $start || $tmp[0]->[0] > $end);
523 }
524 return \@tmp;
525 }
526 else {
527 return undef;
528 }
529 }
530
531 sub bin_search {
532 my ($a, $p) = @_;
533 my ($s, $e) = (0, scalar(@{$a})-1);
534 my $m = int( ($s + $e)/2);
535 while(1) {
536 return $s if($a->[$s][0] >= $p);
537 return $e if($a->[$e][0] <= $p);
538 return $m if($a->[$m][0] == $p || ($a->[$m-1][0] < $p && $a->[$m+1][0] > $p));
539 if($a->[$m][0] > $p) {
540 $e = $m;
541 }
542 else {
543 $s = $m;
544 }
545 $m = int( ($s+$e)/2 );
546 }
547 }
548
549 sub count_coverage {
550 my ($sam, $chr, $pos, $clip) = @_;
551 if($rmdup && !$RNASeq) {
552 my @pairs;
553 my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos);
554 my $n = 0;
555 return 0 unless $seg;
556 my $itr = $seg->features(-iterator => 1);
557 while( my $a = $itr->next_seq) {
558 next unless($a->start && $a->end); #why unmapped reads here?
559 my $sclip_len = 0;
560 if($clip) {
561 my @cigar_array = @{$a->cigar_array};
562 #$sclip_len = $1 if($a->cigar_str =~ m/S(\d+)$/ && $clip == RIGHT_CLIP);
563 $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP);
564 #$sclip_len = $1 if($a->cigar_str =~ m/^S(\d+)/ && $clip == LEFT_CLIP);
565 $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP);
566 }
567 next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len));
568 $n++;
569 return $n if( $n > $max_repetitive_cover);
570 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
571 $a->mate_end ? $a->mate_end : 0, $sclip_len];
572 }
573 return $n;
574 }
575 else{
576 my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr,
577 -start => $pos, -end => $pos);
578 return 0 unless $c;
579 my @c_d = $c->coverage;
580 return $c_d[0];
581 }
582 }
583
584 sub detect_SV {
585 my %param = @_;
586 my @s_reads = @{$param{-SREADS}};
587 my($sam, $chr, $ort, $r_sclip, $coverage) = ( $param{-SAM},
588 $param{-CHR}, $param{-ORT}, $param{-SCLIP}, $param{-COVER});
589 my $clip = $ort eq '+' ? RIGHT_CLIP : LEFT_CLIP;
590 return if($coverage > $max_repetitive_cover);
591 my $fa_name = prepare_reads(\@s_reads, $clip);
592 my($contig_file, $sclip_count, $contig_reads) = $assembler->run($fa_name);
593 return if(!$contig_file or !(-e $contig_file));
594
595 $contig_file = clip_fa_file($contig_file, $clip);
596 my $contig_seqs = read_fa_file($contig_file);
597 if(scalar @s_reads == 1) {
598 $contig_file = clip_fa_file($fa_name, $clip);
599 $contig_seqs = read_fa_file($contig_file);
600 my @seq_names = keys %{$contig_seqs};
601 $sclip_count->{$seq_names[0]} = 1;
602 }
603
604 my @SV;
605 my $where = ($clip == LEFT_CLIP ? "right" : "left");
606 my $mapping = $mapper->run( -QUERY => $contig_file );
607 my (%reads, %quals, %s_lens, %pos);
608 foreach my $r (@s_reads) {
609 $reads{$r->{name}} = $r->{full_seq};
610 $quals{$r->{name}} = $r->{full_qual};
611 $s_lens{$r->{name}} = length($r->{seq});
612 $pos{$r->{name}} = $r->{pos};
613 }
614
615 foreach my $s (keys(%{$mapping})) {
616 next if($sclip_count->{$s} < $min_sclip_reads);
617
618 # try to find a better mapping
619 my @tmp_reads;
620 my $selected_read;
621 my @tmp_pos;
622 foreach my $n (@{$contig_reads->{$s}}) {
623 push @tmp_pos, $pos{$n};
624 }
625 @tmp_pos = uniq @tmp_pos;
626 my $real_pos = $clip == LEFT_CLIP ? $tmp_pos[$#tmp_pos] : $tmp_pos[0];
627
628 my $n_new_SV = 0;
629 my $tmp_bp;
630 foreach my $t (@{$mapping->{$s}}) {
631 my $qort = $t->{qstrand};
632 my $t_pos = $t->{tstart};
633 $t_pos = $t->{tend} if( ($qort eq "+" && $clip == LEFT_CLIP) ||
634 ($qort eq '-' && $clip == RIGHT_CLIP));
635 if($t->{tchr} eq $chr && (abs($t_pos - $real_pos) < $min_dist_diff)) { # the soft clipping is incorrect
636 @SV = @SV[0 .. ($#SV - $n_new_SV + 1)] if($n_new_SV > 0);
637 last;
638 }
639 my $first_bp = {};
640 $first_bp->{sc_reads} = $sclip_count->{$s};
641 $first_bp->{$where . "_chr"} = $chr;
642 $first_bp->{$where . "_pos"} = $real_pos;
643 $first_bp->{all_pos} = \@tmp_pos;
644 $first_bp->{$where . "_ort"} = "+";
645 $first_bp->{pos} = $real_pos;
646 $first_bp->{cover} = $coverage;
647 $first_bp->{chr} = $chr;
648 my $new_where = ($where eq "right" ? "left" : "right");
649
650 $first_bp->{$new_where . "_chr"} = $t->{tchr};
651 $first_bp->{$new_where . "_pos"} = $t_pos;
652 $first_bp->{$new_where . "_ort"} = $qort;
653 $first_bp->{sc_seq} = $contig_seqs->{$s};
654 $first_bp->{reads} = $contig_reads->{$s};
655
656 $tmp_bp = $first_bp unless $tmp_bp;
657 my $second_bp = check_sclip(-SAM => $sam, -TARGET => $t, -CHR => $chr,
658 -POS => $real_pos, -SCLIP => $r_sclip, -CLIP => $clip);
659 if(@{$second_bp}) {
660 foreach my $tmp_bp (@{$second_bp}) {
661 push @SV, StructVar->new(-FIRST_BP => $first_bp, -SECOND_BP => $tmp_bp);
662 $n_new_SV++;
663 }
664 }
665 }
666 # save some one side good soft-clipping SV
667 if( $rescue == 1 &&
668 $n_new_SV == 0 &&
669 $sclip_count->{$s} >= $min_one_side_reads &&
670 (scalar(@{$mapping->{$s}}) == 1 || ($mapping->{$s}[0]{perfect} == 1 && $mapping->{$s}->[1]{perfect} == 0)) &&
671 $tmp_bp->{chr} &&
672 length($contig_seqs->{$s}) * 0.95 < $mapping->{$s}[0]{matches} ) {
673
674 my %second_bp = %{$tmp_bp};
675 $second_bp{sc_reads} = 0;
676 $second_bp{sc_seq} = '';
677 ($second_bp{chr}, $second_bp{pos}) = $second_bp{pos} == $second_bp{left_pos} ?
678 ($second_bp{right_chr}, $second_bp{right_pos}) : ($second_bp{left_chr}, $second_bp{left_pos});
679 $second_bp{cover} = count_coverage($sam, $second_bp{chr}, $second_bp{pos});
680 $second_bp{reads} = [];
681 push @SV, StructVar->new(-FIRST_BP => $tmp_bp, -SECOND_BP => \%second_bp);
682 }
683
684 }
685 system("rm $fa_name"); system("rm $fa_name*");
686 return @SV;
687 }
688
689 sub get_gene_range {
690 my ($chr, $pos, $clip) = @_;
691 my $r = $clip == LEFT_CLIP ? [$pos, $pos+5] : [$pos - 5, $pos];
692 my @genes;
693 my $tmp = $chr;
694 $tmp = "chr" . $tmp if($tmp !~ m/chr/);
695 my ($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-"));
696 push @genes, $f_tree->intersect($r);
697 push @genes, $r_tree->intersect($r);
698 my ($s, $e);
699 if(scalar @genes == 0) { #no gene here
700 return [$pos - $read_len, $pos + $read_len];
701 }
702 foreach my $g (@genes) {
703 my $start = $g->val->get_start($pos, $read_len);
704 my $end = $g->val->get_end($pos, $read_len);
705 $s = $start if(!$s || $s > $start);
706 $e = $end if(!$e || $e < $end);
707 }
708 return [$s, $e];
709 }
710
711 sub check_sclip {
712 my %arg = @_;
713 my ($sam, $chr, $pos, $target, $r_sclip, $clip) =
714 ( $arg{-SAM}, $arg{-CHR}, $arg{-POS}, $arg{-TARGET}, $arg{-SCLIP}, $arg{-CLIP} );
715
716 my @bp;
717
718 # identify the searching region for partner soft cliping
719 my $extension;
720 if($RNASeq) {
721 $extension = 50;
722 }
723 else {
724 $extension = $read_len - ($target->{qend} - $target->{qstart});
725 }
726 my ($tpos, $ort) = ($target->{tend}, $target->{qstrand});
727 $tpos = $target->{tstart} if( ($clip == LEFT_CLIP && $ort eq '-')
728 || ($clip == RIGHT_CLIP && $ort eq '+'));
729 $extension = abs($tpos - $pos) - 1
730 if($chr eq $target->{tchr} && abs($tpos - $pos) <= $extension); # don't consider itself
731 my $range =$target->{tchr} . ":";
732 if($tpos < $extension) {
733 $range .= '1';
734 }
735 else{
736 $range .= $tpos - $extension;
737 }
738
739 $range .= "-"; $range .= $tpos + $extension;
740
741 # the orginal genome sequence where we find the soft_clipping
742 my $orig;
743 my $tmp = $chr;
744 #$tmp = 'chr' . $tmp if($tmp !~ m/^chr/);
745 $orig = $target_genome . ":" . $tmp . ":";
746 my $base_pos;
747 if($RNASeq) { #let's do a wild guess!
748 my $r = get_gene_range($tmp, $pos, $clip);
749 return \@bp unless $r;
750 $base_pos = $r->[0];
751 $orig = join("", ($orig, $r->[0], "-", $r->[1]));
752 }
753 else {
754 $base_pos = $pos - $read_len;
755 $orig = join("", ($orig, $pos - $read_len, "-", $pos + $read_len));
756 }
757
758 return \@bp if(!exists( $r_sclip->{$target->{tchr}})); #mapped to chrY or other minor chrs
759 # the soft clipping happens in highly repetitive region
760 # return \@bp if($coverage > $max_repetitive_cover);
761
762 my $r_pp = search_sclip($r_sclip, $range);
763 return \@bp if(!$r_pp);
764
765 my $real_pos;
766 my $found = 0;
767 my @r_reads;
768 my @l_reads;
769 foreach my $s (@{$r_pp}) {
770 #next if($s->[2] < $min_hit_reads);
771 next if($chr ne $target->{tchr} && (
772 ($clip == LEFT_CLIP && $ort ne $s->[1] ) ||
773 ($clip == RIGHT_CLIP && $ort eq $s->[1] )) );
774
775 next if($chr eq $target->{tchr} && ($s->[0] < $tpos - $extension
776 || $s->[0] > $tpos + $extension));
777 my $tort = $s->[1];
778 next if($s->[3] > $max_repetitive_cover);
779 my $tclip = $tort eq '+' ? RIGHT_CLIP : LEFT_CLIP;
780 my @reads = get_sclip_reads(
781 -SAM => $sam,
782 -CHR => $target->{tchr},
783 -START => $s->[0],
784 -END => $s->[0],
785 -CLIP => $tclip,
786 -MINCLIPLEN => $min_hit_len,
787 -VALIDATOR => $validator,
788 -PAIRED => $paired,
789 -RMDUP => $rmdup);
790 next if(!@reads || scalar(@reads) == 0);
791 # push @r_reads, @reads if($tclip == RIGHT_CLIP);
792 # push @l_reads, @reads if($tclip == LEFT_CLIP);
793 # }
794
795 # foreach my $tclip ( (RIGHT_CLIP, LEFT_CLIP)) {
796 # my $s_reads = $tclip == RIGHT_CLIP ? \@r_reads : \@l_reads;
797 # next if(scalar @{$s_reads} == 0);
798 my %count;
799 # my $fa_name = prepare_reads($s_reads, $tclip);
800 my $fa_name = prepare_reads(\@reads, $tclip);
801
802 my($contig_file, $sclip_count, $contig_reads, $singlet_file) = $assembler->run($fa_name);
803 my (%reads, %quals, %s_lens, %pos);
804 # foreach my $r (@{$s_reads}) {
805 foreach my $r (@reads) {
806 $reads{$r->{name}} = $r->{full_seq};
807 $quals{$r->{name}} = $r->{full_qual};
808 $s_lens{$r->{name}} = length($r->{seq});
809 $pos{$r->{name}} = $r->{pos};
810 }
811
812 $contig_file = clip_fa_file($contig_file, $tclip);
813 if( -s $singlet_file ) {
814 $singlet_file = clip_fa_file($singlet_file, $tclip);
815 system("cat $singlet_file >> $contig_file");
816 system("rm $singlet_file");
817 }
818 my $seqs = read_fa_file($contig_file);
819 my $hits = $aligner->run( -TARGET => $orig, -QUERY => $contig_file);
820 foreach my $t (keys %{$hits}) {
821 my $h = $hits->{$t};
822 my $tmp_bp;
823 my @all_pos;
824 my $all_reads_name;
825 if(exists $contig_reads->{$t}) {
826 foreach my $n (@{$contig_reads->{$t}}) {
827 push @all_pos, $pos{$n};
828 }
829 $all_reads_name = $contig_reads->{$t};
830 @all_pos = uniq @all_pos;
831 @all_pos = sort {$a <=> $b} @all_pos;
832
833 }
834 else {
835 push @all_pos, $pos{$t};
836 $all_reads_name = [$t];
837 }
838
839 if(is_good_hit($h, $clip, $tclip)) {
840 my $hit_ort = ($h->strand('query') == 1 ? "+" : "-");
841 my $real_pos = $tclip == RIGHT_CLIP ? $all_pos[0] : $all_pos[$#all_pos];
842 if($tclip == RIGHT_CLIP ) {
843 $tmp_bp = {
844 left_ort => '+',
845 left_pos => $real_pos,
846 left_chr => $target->{tchr},
847 right_ort => $hit_ort,
848 right_pos => ($hit_ort eq "+" ? $h->start("hit") : $h->end("hit")) + $base_pos,
849 right_chr => $chr,
850 }
851 }
852 else {
853 $tmp_bp = {
854 left_ort => $hit_ort,
855 left_chr => $chr,
856 left_pos => ($hit_ort eq "+" ? $h->end("hit") : $h->start("hit")) + $base_pos,
857 right_ort => "+",
858 right_chr => $target->{tchr},
859 right_pos => $real_pos,
860 }
861 }
862 $tmp_bp->{chr} = $target->{tchr};
863 $tmp_bp->{pos} = $real_pos;
864 $tmp_bp->{all_pos} = \@all_pos;
865 $tmp_bp->{sc_seq} = $seqs->{$t};
866 $tmp_bp->{cover} = count_coverage($sam, $target->{tchr}, $real_pos);
867 $tmp_bp->{sc_reads} = exists $sclip_count->{$t} ? $sclip_count->{$t} : 1;
868 $tmp_bp->{reads} = exists $contig_reads->{$t} ? $contig_reads->{$t} : [$t];
869
870 push @bp, $tmp_bp;
871 }
872 }
873 system("rm $fa_name"); system("rm $fa_name*");
874 }
875 return \@bp;
876 }
877
878 # many more filter can be added here
879 sub is_good_hit {
880 my ($hit, $clip, $tclip) = @_;
881
882 return if($hit->length_aln('query') < $min_hit_len);
883 return if($hit->frac_identical * 100 < $min_percent_id );
884 return 1;
885
886 # do we want to do the check?
887 my $ort = ($hit->start('query') < $hit->end('query') ? "+" : "-");
888 my ($dist, $t_dist, $qstart, $qend);
889 $qstart = $ort eq '+' ? $hit->start('query') : $hit->end('query') ;
890 $qend = $ort eq '+' ? $hit->end('query') : $hit->start('query');
891 $t_dist = $qstart;
892 $t_dist = $hit->query_length - $qend if($tclip == LEFT_CLIP);
893
894 }
895
896 # RNASeq support to identify deletions
897
898 sub cigar2hit {
899 my ($s, @cigar_array) = @_;
900 my @pos;
901 my $p = $s;
902 foreach my $c (@cigar_array){
903 my($op, $len) = @{$c};
904 if($op eq 'N') {
905 push @pos, [$s, $p];
906 $s = $p + $len;
907 $p = $s;
908 next;
909 }
910 $p += $len if($op eq 'M' || $op eq 'D');
911 }
912 push @pos, [$s, $p];
913 return \@pos;
914 }
915
916 sub find_del_SVs {
917 my ($sam, $chr, $start, $end) = @_;
918 my $max_sclip_len = 0;
919 my $range = $chr;
920 if($start && $end) {
921 $range = $chr . ":" . $start . "-" . $end;
922 }
923 my %SV;
924 my $tmp = $chr;
925 $tmp = "chr" . $chr if($chr !~ m/chr/);
926
927 my($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-"));
928
929 $sam->fetch($range, sub {
930 my $a = shift;
931 my $cigar_str = $a->cigar_str;
932 # print $a->qname, "\t", $cigar_str, "\n";
933 return unless $cigar_str =~ m/N/; # the read cross two genes must have N
934 # return unless $a->score >= 97;
935 my $sv = identify_del_SV($f_tree, $a);
936 if($sv){
937 my $tmp1 = $sv->[0] . "+" . $sv->[1];
938 if(!exists($SV{$tmp1})) {
939 $SV{$tmp1} = {};
940 $SV{$tmp1}->{num} = 0;
941 $SV{$tmp1}->{left} = 0;
942 $SV{$tmp1}->{right} = 0;
943 }
944 $SV{$tmp1}->{num}++;
945 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]);
946 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]);
947 $SV{$tmp1}->{gene_ort} = "+";
948 $SV{$tmp1}->{left_gene} = $sv->[3];
949 $SV{$tmp1}->{right_gene} = $sv->[4];
950 push @{$SV{$tmp1}->{reads}}, $a->qname;
951
952 }
953 $sv = identify_del_SV($r_tree, $a);
954 if($sv){
955 my $tmp1 = $sv->[0] . "-" . $sv->[1];
956 if(!exists($SV{$tmp1})) {
957 $SV{$tmp1} = {};
958 $SV{$tmp1}->{num} = 0;
959 $SV{$tmp1}->{left} = 0;
960 $SV{$tmp1}->{right} = 0;
961 $SV{$tmp1}->{reads} = [];
962 }
963 $SV{$tmp1}->{num}++;
964 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]);
965 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]);
966 $SV{$tmp1}->{gene_ort} = "-";
967 $SV{$tmp1}->{left_gene} = $sv->[3];
968 $SV{$tmp1}->{right_gene} = $sv->[4];
969 push @{$SV{$tmp1}->{reads}}, $a->qname;
970 }
971 }
972 );
973
974 my @rtn;
975 foreach my $sv (keys %SV) {
976 my( $pos1, $pos2 ) = $sv =~ m/(\d+)[+|-](\d+)/;
977 print $pos1, "\t", $pos2, "\n";
978 my( $cover1, $cover2) = (count_coverage($sam, $chr, $pos1 - 1), count_coverage($sam, $chr, $pos2));
979 next if($SV{$sv}->{num} < $min_sclip_reads);
980 my $cover = $cover1 < $cover2 ? $cover1 : $cover2;
981 next if($SV{$sv}->{num} * 100 < $cover * $min_pct_sclip_reads && $SV{$sv} < 50);
982 push @rtn, StructVar->new (
983 -FIRST_BP => { left_ort => '+',
984 left_pos => $pos1 - 1,
985 left_chr => $chr,
986 right_ort => '+',
987 right_pos => $pos2,
988 right_chr => $chr,
989 chr => $chr,
990 cover => count_coverage($sam, $chr, $pos1 - 1),
991 pos => $pos1 - 1,
992 sc_reads => $SV{$sv}->{num},
993 left_len => $SV{$sv}->{left},
994 right_len => $SV{$sv}->{right},
995 gene_ort => $SV{$sv}->{gene_ort},
996 left_gene => $SV{$sv}->{left_gene},
997 right_gene => $SV{$sv}->{right_gene},
998 reads => $SV{$sv}->{reads},
999 },
1000 -SECOND_BP => { left_ort => '+',
1001 left_pos => $pos1 - 1,
1002 left_chr => $chr,
1003 right_ort => '+',
1004 right_pos => $pos2,
1005 right_chr => $chr,
1006 chr => $chr,
1007 pos => $pos2,
1008 cover => count_coverage($sam, $chr, $pos2),
1009 sc_reads => $SV{$sv}->{num},
1010 reads => [],
1011 }
1012 );
1013 }
1014 return @rtn;
1015 }
1016
1017 sub identify_del_SV {
1018 my ($tree, $a) = @_;
1019 my @cigar_array = @{$a->cigar_array};
1020 return unless($a->start && $a->end);
1021 return if(scalar $tree->intersect([$a->start, $a->end]) <= 1);
1022
1023 my @hits = @{ cigar2hit($a->start, @cigar_array) };
1024 my @genes;
1025 my $last_gene;
1026 my @change;
1027 my ($left, $right) = (0, 0);
1028 for( my $i = 0; $i < scalar @hits; $i++) {
1029 my $h = $hits[$i];
1030 my @g = $tree->intersect($h);
1031 my $g_size = scalar @g;
1032 if($g_size == 0) {
1033 # print STDERR $a->qname, " is mapped outside of gene\n";
1034 next;
1035 }
1036 if( scalar @g == 1) {
1037 push @genes, $g[0] unless ($last_gene && $last_gene->val->name eq $g[0]->val->name);
1038 if($last_gene && $last_gene->val->name ne $g[0]->val->name) {
1039 $right += ($h->[1] - $h->[0] );
1040 push @change, $i;
1041 }
1042 $left += ($h->[1] - $h->[0] ) if($right == 0);
1043 $last_gene = $g[0];
1044 next;
1045 }
1046 # print STDERR $a->qname, " connect two genes into one?\n";
1047 return;
1048 }
1049 return if(scalar @genes <= 1);
1050 if(scalar @genes > 2) {
1051 # print STDERR $a->qname, "crossed more than 2 genes!\n";
1052 return;
1053 }
1054 # now we down to 2 genes
1055 my ($left_gene, $right_gene) = @genes;
1056 return unless $left_gene->val->overlap([$hits[$change[0]-1]->[0], $hits[$change[0]-1]->[1]]);
1057 return unless $right_gene->val->overlap([$hits[$change[0]]->[0], $hits[$change[0]]->[1]]);
1058 my $p = $left_gene->val->end;
1059 print join("\t", ( $a->score,
1060 $a->cigar_str, $left, $right, $hits[$change[0]-1]->[1], $hits[$change[0]]->[0],
1061 $left_gene->val->name, $right_gene->val->name)), "\n";
1062 return [$hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $a, $left_gene->val, $right_gene->val, $left, $right];
1063 }
1064
1065 =head1 NAME
1066
1067 CREST.pl - a Structure Variation detection tools using softclipping for
1068 whole genome sequencing.
1069
1070
1071 =head1 VERSION
1072
1073 This documentation refers to CREST.pl version 0.0.1.
1074
1075
1076 =head1 USAGE
1077
1078 This program depends on several things that need to be installed and/or
1079 specified. The program uses BioPerl and Bio::DB::Sam module to parse
1080 the files and bam files. Also it uses Blat software suits to do genome
1081 mapping and alignment. To make the program efficient, it also requires
1082 a blat server setup. And the program uses CAP3 assembler.
1083
1084 Identify somatic SVs from input:
1085 CREST.pl -f somatic.cover -d tumor.bam -g germline.bam
1086 Identify SVs from a single bam compared wtih reference genome:
1087 CREST.pl -f sclip.cover -d sample.bam
1088 Identify SVs from a single bam on chr1 only
1089 CREST.pl -f sclip.cover -d sample.bam -r chr1
1090
1091 =head1 REQUIRED ARGUMENTS
1092
1093 To run the program, several parameter must specified.
1094 -d, --input_d: The input (diagnositic) bam file
1095 -f, --sclipfile: The soft clipping information generated by extractSClip.pl
1096 --ref_genome: The reference genome file in fa format
1097 -t, --target_genome: The 2bit genome file used by blat server and blat
1098 --blatserver: The blat server name or ip address
1099 --blatport: The blat server port
1100
1101 =head1 OPTIONS
1102
1103 The options that can be used for the program.
1104 -g, --input_g The germline (paired) bam file
1105 -p, --prefix The output prefix, default is the input bam file name
1106 -o, --out_dir Output directory, default is the working directory
1107 -l, --read_len The read length of the sequencing data, defaut 100
1108 --sensitive The program will generate more SVs with higher false positive rate.
1109
1110 --(no)scratch Use the scratch space, default is off.
1111 --(no)paired Use paired reads or not, defafult is on, so change to --nopaired for unpaired reads.
1112 --cap3 CAP3 executable position, default cap3
1113 --cap3opt CAP3 options, single quoted, Default ' > /dev/null'
1114 --blatclient gfClient excutable position, default gfClient
1115 --blatclientopt gfClient options, single quoted, default '-out=psl -nohead'
1116 --blat blat executable potion, default balt
1117 --blatopt blat options, single quoted, default '-tileSize=7 -stepSize=1 -out=psl -minScore=15'
1118
1119 -r, --range The range where SV will be detected, using chr1:100-200 format.
1120 --max_score_diff The maximum score difference when stopping select hit, default 10.
1121 --min_sclip_reads Minimum number of soft clipping read to triger the procedure, default 3.
1122 --max_rep_cover The min number of coverage to be called as repetitive and don't triger
1123 the procedure, default 500.
1124 --min_sclip_len The min length of soft clipping part at a position to triger the detection,
1125 default 20.
1126 --min_hit_len Min length of a hit for genome mapping
1127 --min_dist_diff Min distance between the mapped position and the soft clipping position, default 20.
1128 --(no)rmdup Remove PCR dumplicate. Default remove.
1129
1130 --min_percent_id Min percentage of identity of soft clipping read mapping, default 90
1131 --min_percent_hq Min percentage of high quality base in soft clipping reads, default 80
1132 --lowqual_cutoff Low quality cutoff value, default 20.
1133
1134 -h, --help, -? Help information
1135 --man Man page.
1136 --usage Usage information.
1137 --version Software version.
1138
1139 --(no)rm_tandem_repeat Remove tandem repeat caused SV events, default is ON. When it's on ptrfinder program
1140 need to be on the path.
1141 --tr_max_indel_size Maximum tandem repeat mediated INDEL events, default 100
1142 --tr_min_size Minimum tandem reapet size, default 2
1143 --tr_max_size Maximum tandem repeat size, default 8
1144 --tr_min_num Minimum tandem repeat number, defaut 4
1145 --min_percent_cons_of_read Minimum percent of consensus length of read length, default 0.75
1146 --max_bp_dist Maximum distance between two idenfitifed break points, default 15
1147 --germline_seq_width Half window width of genomic sequence around break point for germline SV filtering,
1148 default 100
1149 --germline_search_width Half window width for seaching soft-clipped reads around breakpoint for germline SV
1150 filtering, default 50.
1151
1152 --hetero_factor The factor about the SV's heterogenirity and heterozygosity, default 0.4.
1153 --triger_p_value The p-value that will triger the SV detection when number of soft-clipped reads is small,
1154 defaut 0.05.
1155
1156 --(no)rescue Use rescue mode, when it's on, the a SV with only 1 side with enough soft-clipped reads
1157 is considered as a valid one instead of rejecting it. Default on.
1158 --min_one_side_reads When rescure mode is on, the minimum number of soft-clipped reads on one side, default 5.
1159
1160 --RNASeq RNAseq mode, default off
1161 --genemodel A gene model file, currently only refFlat format (BED) is supported. Requried for RNASeq.
1162 --cluster_size The soft-clipped reads within cluster_size will be considered together, default is 3, RNAseq mode
1163 only.
1164
1165
1166 =head1 DESCRIPTION
1167
1168 This is a program designed to identify Structure Variations (SVs) using soft
1169 clipping reads. With the improvement of next-gen sequencing techniques,
1170 both the coverage and length of pair-end reads have been increased steady.
1171 Many SV detection software availalbe uses pair-end information due to the
1172 limitted read length. Now 100bp is pretty common and many reads will cross
1173 the break points. Some mapping programs ( bwa, etc ) have the ability to
1174 identify so called soft-clipping reads. A soft-clipping read is a read that
1175 different part can be mapped to different genomic posiiton, but the read can
1176 be uniquely positioned using the mate-pair position and the insert length.
1177 So this program use those reads to do an assembly-mapping-assembly-alignment
1178 procedure to identify potential structure variiations.
1179
1180
1181 =head1 DIAGNOSTICS
1182
1183 A list of every error and warning message that the application can generate
1184 (even the ones that will "never happen"), with a full explanation of each
1185 problem, one or more likely causes, and any suggested remedies. If the
1186 application generates exit status codes (e.g., under Unix), then list the exit
1187 status associated with each error.
1188
1189 =head1 CONFIGURATION AND ENVIRONMENT
1190
1191 The program is designed to use under a cluster or high performance computing
1192 environment since it's dealing with over 100G input data. The program can be
1193 used as highly parallellized. And the program is developped under linux/unix.
1194
1195
1196 =head1 DEPENDENCIES
1197
1198 The program depend on several packages:
1199 1. Bioperl perl module.
1200 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
1201 3. blat suits, include blat, gfClient, gfServer.
1202 4. CAP3 assembly program.
1203
1204
1205 =head1 BUGS AND LIMITATIONS
1206
1207 There are no known bugs in this module, but the method is limitted to bam file
1208 that has soft-clipping cigar string generated.Please report problems to
1209 Jianmin Wang (Jianmin.Wang@stjude.org)
1210 Patches are welcome.
1211
1212 =head1 AUTHOR
1213
1214 Jianmin Wang (Jianmin.Wang@stjude.org)
1215
1216
1217 =head1 LICENCE AND COPYRIGHT
1218
1219 Copyright (c) 2010 by St. Jude Children's Research Hospital.
1220
1221 This program is free software: you can redistribute it and/or modify
1222 it under the terms of the GNU General Public License as published by
1223 the Free Software Foundation, either version 2 of the License, or
1224 (at your option) any later version.
1225
1226 This program is distributed in the hope that it will be useful,
1227 but WITHOUT ANY WARRANTY; without even the implied warranty of
1228 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1229 GNU General Public License for more details.
1230
1231 You should have received a copy of the GNU General Public License
1232 along with this program. If not, see <http://www.gnu.org/licenses/>.