comparison SoftSearch.pl @ 10:e741e2f9ad03 draft

Uploaded
author priti
date Fri, 06 Jun 2014 01:53:32 -0400
parents
children
comparison
equal deleted inserted replaced
9:6285f58ad4e9 10:e741e2f9ad03
1 #!/usr/bin/perl
2
3 ####
4 #### Usage: SoftSearch.pl [-lqrmsd] -b <BAM> -f <Genome.fa> -sam <samtools path> -bed <bedtools path>
5 #### Created 1-30-2012 by Steven Hart, PhD
6 #### hart.steven@mayo.edu
7 #### Required bedtools & samtools to be in path
8
9
10 use lib "/home/plus91/2.4/lib" ;
11
12 use Getopt::Long;
13 use strict;
14 use warnings;
15 #use Data::Dumper;
16 use LevD;
17 use File::Basename;
18
19 my ($INPUT_BAM,$INPUT_FASTA,$OUTPUT_FILE,$minSoft,$minSoftReads,$dist_To_Soft,$bedtools,$samtools);
20 my ($minRP, $temp_output, $num_sd, $MapQ, $chrom, $unmated_pairs, $minBQ, $pair_only, $disable_RP_only);
21 my ($levD_local_threshold, $levD_distl_threshold,$pe_upper_limit,$high_qual,$sv_only,$blacklist,$genome_file,$verbose);
22
23 my $cmd = "";
24
25 #Declare variables
26 GetOptions(
27 'b=s' => \$INPUT_BAM,
28 'f=s' => \$INPUT_FASTA,
29 'o:s' => \$OUTPUT_FILE,
30 'm:i' => \$minRP,
31 'l:i' => \$minSoft,
32 'r:i' => \$minSoftReads,
33 't:i' => \$temp_output,
34 's:s' => \$num_sd,
35 'd:i' => \$dist_To_Soft,
36 'q:i' => \$MapQ,
37 'c:s' => \$chrom,
38 'u:s' => \$unmated_pairs,
39 'x:s' => \$minBQ,
40 'p' => \$pair_only,
41 'g' => \$disable_RP_only,
42 'j:s' => \$levD_local_threshold,
43 'k:s' => \$levD_distl_threshold,
44 'a:s' => \$pe_upper_limit,
45 'e:s' => \$high_qual,
46 'L' => \$sv_only,
47 'v' => \$verbose,
48 'blacklist:s' => \$blacklist,
49 'genome_file:s' => \$genome_file,
50 "help|h|?" => \&usage);
51
52 unless($sv_only){$sv_only=""};
53 if(defined($INPUT_BAM)){$INPUT_BAM=$INPUT_BAM} else {print usage();die "Where is the BAM file?\n\n"}
54 if(defined($INPUT_FASTA)){$INPUT_FASTA=$INPUT_FASTA} else {print usage();die "Where is the fasta file?\n\n"}
55 my ($fn,$pathname) = fileparse($INPUT_BAM,".bam");
56 my $index=`ls $pathname/$fn*bai|head -1`;
57 #my $index =`ls \${INPUT_BAM%.bam}*bai`;
58 #print "INDEX=$index\n";
59 if(!$index){die "\n\nERROR: you need index your BAM file\n\n"}
60
61 ### get current time
62 print "Start Time : " . &spGetCurDateTime() . "\n";
63 my $now = time;
64
65 #if(defined($OUTPUT_FILE)){$OUTPUT_FILE=$OUTPUT_FILE} else {$OUTPUT_FILE="output.vcf"; print "\nNo outfile specified. Using output.vcf as default\n\n"}
66 if(defined($minSoft)){$minSoft=$minSoft} else {$minSoft=5}
67 if(defined($minRP)){$minRP=$minRP} else {$minRP=5}
68 if(defined($minSoftReads)){$minSoftReads=$minSoftReads} else {$minSoftReads=5}
69 if(defined($dist_To_Soft)){$dist_To_Soft=$dist_To_Soft} else {$dist_To_Soft=300}
70 if(defined($num_sd)){$num_sd=$num_sd} else {$num_sd=6}
71 if(defined($MapQ)){$MapQ=$MapQ} else {$MapQ=20}
72
73 unless (defined $pe_upper_limit) { $pe_upper_limit = 10000; }
74 unless (defined $levD_local_threshold) { $levD_local_threshold = 0.05; }
75 unless (defined $levD_distl_threshold) { $levD_distl_threshold = 0.05; }
76 #Get sample name if available
77 my $SAMPLE_NAME="";
78 my $OUTNAME ="";
79 $SAMPLE_NAME=`samtools view -f2 -H $INPUT_BAM|awk '{if(\$1~/^\@RG/){sub("ID:","",\$2);name=\$2;print name}}'|head -1`;
80 $SAMPLE_NAME=~s/\n//g;
81 if (!$OUTPUT_FILE){
82 if($SAMPLE_NAME ne ""){$OUTNAME=$SAMPLE_NAME.".vcf"}
83 else {$OUTNAME="output.vcf"}
84 }
85 else{$OUTNAME=$OUTPUT_FILE}
86
87 print "Writing results to $OUTNAME\n";
88
89
90
91 ##Make sure if submitting on SGE, to prepned the "chr". Not all referecne FAST files require "chr", so we shouldn't force the issue.
92 if(!defined($chrom)){$chrom=""}
93 if(!defined($unmated_pairs)){$unmated_pairs=0}
94
95 my $badQualValue=chr($MapQ);
96 if(defined($minBQ)){ $badQualValue=chr($minBQ); }
97
98 if($badQualValue eq "#"){$badQualValue="\#"}
99
100 # adding and cheking for samtools and bedtools in the PATh
101 ## check for bedtools and samtools in the path
102 $bedtools=`which intersectBed` ;
103 if(!defined($bedtools)){die "\nError:\n\tno bedtools. Please install bedtools and add to the path\n";}
104 #$samtools=`samtools 2>&1`;
105 $samtools=`which samtools`;
106 if($samtools !~ /(samtools)/i){die "\nError:\n\tno samtools. Please install samtools and add to the path\n";}
107
108 print "Usage = SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -s $num_sd -c $chrom -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME \n\n";
109 sub usage {
110 print "\nusage: SoftSearch.pl [-cqlrmsd] -b <BAM> -f <Genome.fa> \n";
111 print "\t-q\t\tMinimum mapping quality [20]\n";
112 print "\t-l\t\tMinimum length of soft-clipped segment [5]\n";
113 print "\t-r\t\tMinimum depth of soft-clipped reads at position [5]\n";
114 print "\t-m\t\tMinimum number of discordant read pairs [5]\n";
115 print "\t-s\t\tNumber of sd away from mean to be considered discordant [6]\n";
116 print "\t-u\t\tNumber of unmated pairs [0]\n";
117 print "\t-d\t\tMax distance between soft-clipped segments and discordant read pairs [300]\n";
118 print "\t-o\t\tOutput file name [output.vcf]\n";
119 print "\t-t\t\tPrint temp files for debugging [no|yes]\n";
120 print "\t-c\t\tuse only this chrom or chr:pos1-pos2\n";
121 print "\t-p\t\tuse paired-end mode only. In other words, don't try to find soft-clipping events!\n";
122 print "\t-g\t\tEnable paired-only seach. This will look for discordant read pairs even without soft clips.\n";
123 print "\t-a\t\tset the minimum distance for a discordant read pair without soft-clipping info [10000]\n";
124 print "\t-L\t\tFlag to print out even small deletions (low quality)\n";
125 print "\t-e\t\tdisable strict quality filtering of base qualities in soft-clipped reads [no]\n";
126 print "\t-blacklist\tareas of the genome to skip calling. Requires -genome_file\n";
127 print "\t-genome_file\ttab seperated value of chromosome name and length. Only used with -blacklist option\n\n";
128
129 exit 1;
130 }
131
132
133 #############################################################
134 # create temporary variable name
135 #############################################################
136 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`);
137 our $random_name=join "", map { ("a".."z")[rand 26] } 1..8;
138
139 #############################################################
140 ## create green list
141 ##############################################################
142 #
143 my $new_blacklist="";
144 if($blacklist){
145 if(!$genome_file){die "if using a blacklist, you must also specify the location of a genome_file
146 The format of the genome_file should be
147 chrom size
148 chr1 249250621
149 chr2 243199373
150 ...
151
152 If using hg19, you can ge the genome file by
153 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from hg19.chromInfo\" > hg19.genome";}
154
155 $cmd=join("","complementBed -i $blacklist -g $genome_file >",$random_name,".bed") ;
156 system ($cmd);
157 $new_blacklist=join(""," -L ",$random_name,".bed ");
158 }
159
160 if($verbose){print "CMD=$cmd\nBlacklist is $new_blacklist\n";}
161
162
163
164
165
166 #############################################################
167 # Calcualte insert size distribution of properly mated reads
168 #############################################################
169
170 #Change for compatability with other operating systems
171 #my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)**2)}'`;
172
173 my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|head -10000|cut -f9|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)^2)}'`;
174 #my ($mean,$stdev)=split(/ /,$metrics);
175 my ($mean,$stdev)=split(/\s/,$metrics);
176 $stdev=~s/\n//;
177 my $upper_limit=int($mean+($num_sd*$stdev));
178 my $lower_limit=int($mean-($num_sd*$stdev));
179 die if (!$mean);
180 print qq{The mean insert size is $mean +/- $stdev (sd)
181 The upper limit = $upper_limit
182 The lower limit = $lower_limit\n
183 };
184 if($lower_limit<0){
185 print "Warning!! Given this insert size distribution, we can not call small indels. No other data will be affected\n";
186 $lower_limit=1;
187 }
188 my $tmp_name=join ("",$random_name,".tmp.bam");
189 my $random_file_sc = "";
190 my $command = "";
191
192 #############################################################
193 # Make sam file that has soft clipped reads
194 #############################################################
195 #give file a name
196 if(!defined($pair_only)){
197 $random_file_sc=join ("",$random_name,".sc.sam");
198 $command=join ("","samtools view -q $MapQ -F 1024 $INPUT_BAM $chrom $new_blacklist| awk '{OFS=\"\\t\"}{c=0;if(\$6~/S/){++c};if(c == 1){print}}' | perl -ane '\$TR=(\@F[10]=~tr/\#//);if(\$TR<2){print}' > ", $random_file_sc);
199
200 print "Making SAM file of soft-clipped reads\n";
201 if($verbose){ print "$command\n";}
202 system("$command");
203
204 #############################################################
205 # Find areas that have deep enough soft-clip coverage
206 print "Identifying soft-clipped regions that are at least $minSoft bp long \n";
207 open (FILE,"$random_file_sc")||die "Can't open soft-clipped sam file $random_file_sc\n";
208
209 my $tmpfile=join("",$random_file_sc,".sc.passfilter");
210 open (OUT,">$tmpfile")||die "Can't write files here!\n";
211
212 while(<FILE>){
213 @_ = split(/\t/, $_);
214 #### parse CIGAR string and create a hash of array of each operation
215 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]);
216 my $hash;
217 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR;
218
219 #for ($i=0; $i<=$#softclip_pos; $i++) {
220 foreach my $softclip (@{$hash->{S}}) {
221 #if ($CIGAR[$softclip_pos[$i]] > $minSoft){
222 if ($softclip > $minSoft){
223 ###############Make sure base qualities don't have more than 2 bad marks
224 my $qual=$_[10];
225 my $TR=($qual=~tr/$badQualValue//);
226 if($badQualValue eq "#"){ $TR=($qual=~tr/\#//); }
227 #Skip the soft clip if there is more than 2 bad qual values
228 #next if($TR > 2);
229 # if (!$high_qual){next if($TR > 2);}
230 print OUT;
231 last;
232 }
233 }
234 }
235 close FILE;
236 close OUT;
237
238 $command=join(" ","mv",$tmpfile,$random_file_sc);
239 if($verbose){ print "$command\n";}
240 system("$command");
241 }
242
243 #########################################################
244 #Stack up SoftClips
245 #########################################################
246 my $random_file=join("",$random_name,".sc.direction.bed");
247 if(!defined($pair_only)){
248 open (FILE,"$random_file_sc")|| die "Can't open sam file\n";
249 #$random_file=join("",$random_name,".sc.direction");
250
251 print "Calling sides of soft-clips\n";
252 #\nTMPOUT=$random_file\tINPUT=$random_file_sc\n\n";
253 open (TMPOUT,">$random_file")|| die "Can't create tmp file\n";
254
255 while (<FILE>){
256 @_ = split(/\t/, $_);
257 #### parse CIGAR string and create a hash of array of each operation
258 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]);
259 my $hash;
260 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR;
261
262 #### next if softclips on each end
263 next if ($_[5] =~ /^[0-9]+S.*S$/);
264
265 #### next softclip occurs in the middle
266 next if ($_[5] =~ /^[0-9]+[^S][0-9].*S.+$/);
267
268 my $softclip = $hash->{S}[0];
269
270 my $end1 = 0;
271 my $end2 = 0;
272 my $softBases = "";
273 my $right_corrected="";my $left_corrected="";
274
275 if ($softclip > $minSoft) {
276
277 ####If the soft clip occurs at end of read and its on the minus strand, then it's a right clip
278 if ($_[5] =~ /^.*S$/) {
279 $end1=$_[3]+length($_[9])-$softclip-1;
280 $end2=$end1+1;
281 next if ($end1<0);
282 #RIGHT clip on Minus
283 $softBases=substr($_[9], length($_[9])-$softclip, length($_[9]));
284 #Right clips don't always get clipped correctly, so fix that
285 # Check to see if sc base matches ref
286 $right_corrected=baseCheck($_[2],$end2,"right",$softBases);
287 print TMPOUT "$right_corrected\n"
288
289 } else {
290 #### Begins with S (left clip)
291 $end1=$_[3]-$softclip;
292 next if ($end1<0);
293
294 $softBases=substr($_[9], 0,$softclip);#print "TMP=$softBases\n";
295 $left_corrected=baseCheck($_[2],$end1,"left",$softBases);
296 if(!$left_corrected){print "baseCheck($_[2],$end1,left,$softBases)\n";next}
297 print TMPOUT "$left_corrected\n";
298 #print "\nSEQ=$_[9]\t\n";
299
300 }
301 }
302 }
303 close FILE;
304 close TMPOUT;
305 }
306 sub baseCheck{
307 my ($chrom,$pos,$direction,$softBases)=@_;
308 #skip if position is less than 0, which is caused by MT DNA
309 return if ($pos<0);
310 my $exit="";
311
312 while(!$exit){
313 if($direction=~/right/){
314 my $refBase=getSeq($chrom,$pos,$INPUT_FASTA);
315 my $softBase=substr($softBases,0,1);
316 if ($softBase !~ /$refBase/){
317 my $value=join("\t",$chrom,$pos,$pos+1,join("|",$softBases,$direction));
318 $exit="STOP";
319 return $value;
320 }
321 else{
322 $pos=$pos+1;
323 $softBases=substr($softBases, 1,length($softBases));
324 }
325 }
326 else{
327 my $refBase=getSeq($chrom,$pos+1,$INPUT_FASTA);
328 my $softBase=substr($softBases,-1,1);
329 if ($softBase !~ /$refBase/){
330 $pos=$pos-1+length($softBases);
331 my $value=join("\t",$chrom,$pos-1,$pos,join("|",$softBases,$direction));
332 $exit="STOP";
333 return $value;
334 }
335 else{
336 $pos=$pos-1;
337 $softBases=substr($softBases, 0, -1);
338 #print "Trying again $softBases\n";
339 }
340
341 }
342
343 }
344 }
345 #Remove SAM files to conserve space
346 unlink($random_file_sc);
347
348
349 my $random_file_disc="$INPUT_BAM";
350 ###
351 #
352 ######################################################
353 # Transform Read pair groups into softclip equivalents
354 ######################################################
355 #
356 #
357 #
358 my $v="";
359 #if($disable_RP_only){
360 print "Running Bam2pair.pl\n";
361 print "Looking for discordant read pairs without requiring soft-clipping information\n";
362 use FindBin qw($Bin);
363 my $path=$Bin;
364 # print"\n\nPATH=$path\n\n";
365 if($verbose){$v="-v"}
366 my $tmp_out=join("",$random_name,"RP.out");
367 $command=join("","perl ",$path,"/Bam2pair.pl -b $random_file_disc -o $tmp_out -isize $pe_upper_limit -winsize $dist_To_Soft -min $minRP -chrom $chrom -prefix $random_name -q $MapQ -blacklist $random_name.bed $v");
368 if($verbose){ print "$command\n"};
369 system("$command");
370 $command=join("","perl -ane '\$end1=\@F[1];\$end2=\@F[3];print join(\"\\t\",\@F[0..1],\$end1,\"unknown|left\");print \"\\n\";print join(\"\\t\",\@F[2..3],\$end2,\"unknown|left\");print \"\\n\"' ", $tmp_out," >> ",$random_file);
371 if($verbose){print "$command\n"};
372 system($command);
373 unlink($tmp_out);
374 #}
375 #
376
377
378 ######################################################
379 unlink("$random_file","$tmp_name","$random_file","$index","$random_name","$new_blacklist") if (-z $random_file || ! -e $random_file ) ;
380 if (-z $random_file || ! -e $random_file){
381 print "Softclipped file is empty($random_file).\nNo soft clipping found using desired paramters\n\n";
382 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
383 &print_header();
384 close OUT;
385 exit;
386 }
387
388
389 #############################################################
390 # Make sure there are enough soft-clippped supporting reads
391 #############################################################
392 my $outfile=join("",$random_file,".sc.merge.bed");
393 #sortbed -i .sc.direction | mergeBed -nms -d 25 -i stdin > .sc.merge.bed
394 $command=join(" ","sortBed -i",$random_file," | mergeBed -nms -i stdin","|egrep \";|,\"","|awk '{OFS=\"\t\"}(NF==4)'",">",$outfile);
395
396 print "$command\n" if ($verbose);
397 system("$command");
398
399 if (-z $outfile || ! -e $outfile){
400 unlink("$tmp_name","$random_file","$outfile","$index","$random_name","$new_blacklist");
401 print "mergeBed file is empty.\nNo strucutral variants found\n\n" ;
402 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
403 &print_header();
404 close OUT;
405 exit;
406 }
407
408 print "completed mergeBed\n";
409
410 ###############################################################
411 # If left and right are on the same line, make into 2 lines
412 ###############################################################
413 open (INFILE,$outfile)||die "couldn't open temp file : $. \n\n";
414 my $tmp2=join("",$random_name,".sc.fixed.merge.bed");
415 #print "INFILE=$outfile\tOUTFILE=$tmp2\n\n";
416 #INPUT FORMAT=chr9\t131467\t131473\tATGCTTATTAAAA|left;TTATTAAAAGCATA|left
417 open (OUTFILE,">$tmp2")||die "couldn't create temp file : $. \n\n";
418 while(<INFILE>){
419 chomp $_;
420 my $l = $_;
421
422 my @a = split(/\t/, $l);
423 my $info = $a[3];
424 my @info_arr = split(/\;/, $info);
425 my @left_arr=();
426 my @right_arr=();
427 @left_arr = grep(/left/, @info_arr);
428 @right_arr = grep(/right/, @info_arr);
429
430 #New
431 my $left = join(";", @left_arr);
432 my $right = join(";", @right_arr);
433 $info = join(";", @info_arr);
434
435 if((@left_arr) && (@right_arr)){
436 print OUTFILE "$a[0]\t$a[1]\t$a[2]\t$left\n$a[0]\t$a[1]\t$a[2]\t$right\n";
437 } else{
438 my $all=join("\t",@a[0..2],$info);
439 print OUTFILE "$all\n";
440 }
441 }
442
443 # make sure output file name is $outfile
444 $command=join(" ","sed -e '/ /s//\t/g'", $tmp2,"|awk 'BEGIN{OFS=\"\\t\"}(NF==4)'", "|perl -pne 's/ /\t/g'>",$outfile);
445 system("$command");
446 if($verbose){print "$command\n"};
447 unlink("$tmp_name","$random_file","$tmp2","$outfile","$index","random_name","$new_blacklist") if (-z $outfile || ! -e $outfile) ;
448 if (-z $outfile || ! -e $outfile){
449 print "Fixed mergeBed file is empty($outfile).\nNo strucutral variants found\n\n";
450 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
451 &print_header();
452 close OUT;
453 exit;
454 }
455
456 print "completed fixing mergeBed\n\n";
457
458 ###############################################################
459 # Seperate directions of soft clips
460 ###############################################################
461 my $left_sc = join("", "left", $tmp2);
462 my $right_sc = join("", "right", $tmp2);
463 use FindBin qw($Bin);
464 #my $path=$Bin;
465
466 $command=join("","grep left ", $tmp2, " |sed -e '/left /s//left\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$left_sc);
467 system("$command");
468 #print "$command\n";
469 $command=join("","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$right_sc);
470 #$command=join(" ","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g' >",$right_sc);
471 system("$command");
472 #print "$command\n";
473 #die "CHECK $right_sc\n";
474
475 ###############################################################
476 # Count the number and identify directions of soft clips
477 ###############################################################
478 print "Count the number and identify directions of soft clips\n";
479 #print "looking in $outfile\n";
480 $outfile=join("",$random_name,".sc.fixed.merge.bed");
481
482 open (INFILE,$outfile)||die "couldn't open temp file\n\n";
483 my $tmp3 = join("", $random_file, "predSV");
484 open (OUTFILE, ">$tmp3")||die "couldn't create temp file\n\n";
485 while(<INFILE>){
486 chomp;
487 @_=split(/\t/,$_);
488 my $count=tr/\;//;$count+=tr/\,//;
489 $count=$count+1;
490 my $left=0;
491 my $right=0;
492
493 while ($_ =~ /left/g) { $left++ } # count number of right clips
494 while ($_ =~ /right/g) { $right++ } # count number of left clips
495
496 ###############################################################
497 if ($count >= $minSoftReads){
498 ####get longets soft-clipped read
499 my @clips=split(/\;|,|\|/,$_[3]);
500
501 my ($max, $temp, $temp2, $temp3, $dir, $maxSclip) = (0) x 6;
502 for (my $i=0; $i<$count; $i++) {
503 my $plus1=$i+1;
504 $temp=length($clips[$i]);
505 $temp2=$clips[$plus1];
506 $temp3=$clips[$i];
507
508 if ($temp > $max){
509 $maxSclip=$temp3;
510 $max =$temp;
511 $dir=$temp2;
512 } else {
513 $max=$max;
514 $dir=$dir;
515 $maxSclip=$maxSclip;
516 }
517 $i++;
518 }
519 my $order2 = join("|", $left, $right);
520 #print join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n";
521 print OUTFILE join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n";
522 } elsif($_=~/unknown/){
523 print OUTFILE join ("\t",@_[0..2],"NA","NA","left","NA","NA|NA") . "\n";
524 print OUTFILE join ("\t",@_[0..2],"NA","NA","right","NA","NA|NA") . "\n";
525 }
526 ####Format is Chrom,start, end,longest Soft-clip,length of longest Soft-clip, direction of longest soft-clip,#supporting softclips,#right Sclips|#left Sclips
527 }
528 close INFILE;
529 close OUTFILE;
530
531 unlink("$tmp2","$tmp_name","$random_file","$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$new_blacklist") if (-z $tmp3 || !-e $tmp3) ;
532
533 if (-z $tmp3 || !-e $tmp3){
534 print "No structural variants found while Counting the number and identify directions of soft clips.\n" ;
535
536 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
537 &print_header();
538 close OUT;
539 exit;
540
541 }
542
543 print "Done counting Softclipped reads\n";
544 ###############################################################
545 #### Print header information
546 ###############################################################
547 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
548 &print_header();
549 close OUT;
550
551 ###############################################################
552 ###############################################################
553 #### DO the bulk of the work
554 ###############################################################
555 use List::Util qw(min max);
556 open (FILE,"$tmp3")|| die "Can't open file\n";
557 open (OUT,">>$OUTNAME")|| die "Can't open file\n";
558
559 #print "\nusing $tmp3 and writing to $OUTPUT_FILE \n";
560 while (<FILE>){
561 #If left clip {+- or -- or -+ }{+- are uninformative b/c they go upstream}
562 #If right clip {++ or -- or +-}
563 chomp $_;
564 my $line = $_;
565 my @info = split(/\t/, $_);
566
567 if($info[5] eq "left") {
568 bulk_work("left", $line, $random_file_disc);
569
570 } elsif ($info[5] eq "right") {
571 bulk_work("right", $line, $random_file_disc);
572 }
573 #if($. ==6){print "THIS IS LINE 6\n$_\n";die}
574 print "Completed line $.\n" if ($verbose);
575 }
576 close FILE;
577 close OUT;
578
579 ###############################################################################
580 ###############################################################################
581 #### Delete temp files
582 my $meregedBed=join("",$random_name,".sc.direction.bed.sc.merge.bed");
583
584 if(defined($temp_output)){$temp_output=$temp_output} else {$temp_output="no"}
585
586 if ($temp_output eq "no"){
587 unlink("$tmp_name","$random_file","$tmp2",,"$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$meregedBed","$random_name.bed");
588 }
589 ####Sort VCF
590 my $tmp=join(".",$random_name,"tmp");
591 #Get header
592 $cmd="grep \"#\" $OUTNAME > $tmp";
593 system($cmd);
594 #sort results
595 $cmd="grep -v \"#\" $OUTNAME|perl -pne 's/chr//'|sort -k1,1n -k2,2n|perl -ne 'print \"chr\".\$_' >>$tmp";
596 system($cmd);
597 $cmd="mv $tmp $OUTNAME";
598 system($cmd);
599 #remove entries next to each other
600
601
602
603
604 #############################################################
605 ##May not need this anymore since filtering on left and right
606 #############################################################
607 #my $tmpout=$OUTNAME;
608 #$tmpout.=".tmp";
609 #use FindBin qw($Bin);
610 ##my $path=$Bin;
611 #$command="perl ".$path."/Extract_nSC.pl $OUTNAME -q nSC > $tmpout";
612 ##print "Command=$command\n";
613 #system($command);
614 #$command="perl ".$path."/reduce_redundancy.pl $tmpout $upper_limit |cut -f1-10 > $OUTNAME";
615 ##print "$command\n";
616 #system($command);
617 #system("rm $tmpout");
618 ########################################################
619
620
621
622
623 print "Analysis Completed\n\nYou did it!!!\n";
624 print "Finish Time : " . &spGetCurDateTime() . "\n";
625 $now = time - $now;
626 printf("\n\nTotal running time: %02d:%02d:%02d\n\n", int($now / 3600), int(($now % 3600) / 60),
627 int($now % 60));
628
629 exit;
630
631 ###############################################################################
632 sub rev_comp {
633 my $dna = shift;
634 my $revcomp = reverse($dna);
635 $revcomp =~ tr/ACGTacgt/TGCAtgca/;
636
637 return $revcomp;
638 }
639
640
641 ###############################################################################
642 #### to get reference base
643 sub getSeq{
644 my ($chr,$pos,$fasta)=@_;
645 #don't require chr
646 #if($chr !~ /^chr/){die "$chr is not correct\n";}
647 # die "$pos is not a number\n" if ($pos <0);
648 my @result=();
649 if ($pos <0){print "$pos is not a valid position (likely caused by circular MT chromosome)\n";return;}
650
651 @result = `samtools faidx $fasta $chr:$pos-$pos`;
652 if($result[1]){chomp($result[1]);
653 return uc($result[1]);
654 }
655 return("NA");
656 #### after return will not be printed
657 ####print "RESULTS=@result\n";
658 }
659
660 sub getBases{
661 my ($chr,$pos1,$pos2,$fasta)=@_;
662 #don't require chr
663 #if($chr !~ /^chr/){die "$chr is not correct\n";}
664 my @result=();
665 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";return;};
666
667 @result = `samtools faidx $fasta $chr:$pos1-$pos2`;
668 if(!$result[1]){$result[1]="NA"};
669 chomp($result[1]);
670 return uc($result[1]);
671
672 #### after return will not be printed
673 ####print "RESULTS=@result\n";
674 }
675 ###############################################################################
676 #### to get time
677 sub spGetCurDateTime {
678 my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
679 my $curDateTime = sprintf "%4d-%02d-%02d %02d:%02d:%02d",
680 $year+1900, $mon+1, $mday, $hour, $min, $sec;
681 return ($curDateTime);
682 }
683
684
685 ###############################################################################
686 #### print header
687 sub print_header {
688 my $date=&spGetCurDateTime();
689 my $header = qq{##fileformat=VCFv4.1
690 ##fileDate=$date
691 ##source=SoftSearch.pl
692 ##reference=$INPUT_FASTA
693 ##Usage= SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -u $unmated_pairs -s $num_sd -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME
694 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
695 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
696 ##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
697 ##INFO=<ID=ISIZE,Number=.,Type=String,Description="Size of the SV">
698 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
699 ##FORMAT=<ID=lSC,Number=1,Type=Integer,Description="Length of the longest soft clips supporting the BND">
700 ##FORMAT=<ID=nSC,Number=1,Type=Integer,Description="Number of supporting soft-clips\">
701 ##FORMAT=<ID=uRP,Number=1,Type=Integer,Description="Number of unmated read pairs nearby Soft-Clips">
702 ##FORMAT=<ID=levD_local,Number=1,Type=Float,Description="Levenstein distance between soft-clipped bases and the area around the original soft-clipped site">
703 ##FORMAT=<ID=levD_distl,Number=1,Type=Float,Description="Levenstein distance between the soft-clipped bases and mate location">
704 ##FORMAT=<ID=CTX,Number=1,Type=Integer,Description="Number of chromosomal translocations">
705 ##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="Number of reads supporting Large Deletions">
706 ##FORMAT=<ID=INS,Number=1,Type=Integer,Description="Number of reads supporting Large insertions">
707 ##FORMAT=<ID=NOV_INS,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion">
708 ##FORMAT=<ID=TDUP,Number=1,Type=Integer,Description="Number of reads supporting a tandem duplication">
709 ##FORMAT=<ID=INV,Number=1,Type=Integer,Description="Number of reads supporting inversions">
710 ##FORMAT=<ID=sDEL,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion">
711 ##INFO=<ID=NO_MATE_SC,Number=1,Type=Flag,Description="When there is no softclipping of the mate read location, an appromiate position is used">
712 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Dummy value for maintaining VCF-Spec">
713 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$SAMPLE_NAME\n};
714
715 print OUT $header;
716 }
717
718
719 ###############################################################################
720 sub bulk_work {
721 print "#####################################@_\n" if ($verbose);
722 my ($side, $line, $file) = @_;
723 my $local_levD = 0;
724 my $distl_levD = 0;
725
726 #my @info = split(/\t/, $line);
727 my @plus_Reads = split(/\t/, $line);
728 $plus_Reads[7] =~ s/\n//g;
729
730 #### softclip length and softclip size.
731 my $lSC = $plus_Reads[4];
732 my $nSC = $plus_Reads[6];
733
734
735 #Get all types of compatible reads
736 #Get improperly paired reads (@ max distance)
737
738 #### default value for left SIDE.
739 #If left-clip, then look downstream for match of softclipped reads to define a deletion, but look for DRPs upstream
740 my $sv_type = "SVTYPE=BND";
741 my $start_local=0; my $end_local=0;my $target_local="";my $target_drp="";my $start_drp="";my $end_drp="";
742 if ($side =~ /left/) {
743 $start_local = $plus_Reads[1]-$dist_To_Soft;
744 $end_local = $plus_Reads[2];
745 $start_drp = $plus_Reads[1];
746 $end_drp = $plus_Reads[1]+$dist_To_Soft;
747
748 }
749 else{
750 $start_local = $plus_Reads[1];
751 $end_local = $plus_Reads[1]+$dist_To_Soft;
752 $start_drp = $plus_Reads[1]-$dist_To_Soft;
753 $end_drp = $plus_Reads[1];
754 }
755
756 $target_local=join("", $plus_Reads[0], ":", $start_local, "-", $end_local);
757 $target_drp=join("", $plus_Reads[0], ":", $start_drp, "-", $end_drp);
758 my $num_unmapped_pairs="";
759 if ($side =~ /right/) {
760 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f8 -F 1536 -c $INPUT_BAM $target_drp`;
761 } else {
762 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp`;
763 }
764 if($verbose){print "samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp\n";}
765
766 $num_unmapped_pairs=~s/\n//;
767 if($verbose){print "NUM UNMAPPED PAIRS= $num_unmapped_pairs\n";}
768 my $REF1_base = "";
769 my $REF2_base = "";
770 my $INFO_1 = "";
771 my $INFO_2 = "";
772 my $ALT_1 = "";
773 my $ALT_2 = "";
774 my $isize = 0;
775 my $QUAL = "";
776 my $FORMAT = "GT:";
777
778 #### get 8 bit rand id
779 my $BND1_name = join "", map { ("a".."z")[rand 26] } 1..8;
780 my $BND2_name = join "", map { ("a".."z")[rand 26] } 1..8;
781 $BND1_name=join "_","BND",$BND1_name;
782 $BND2_name=join "_","BND",$BND2_name;
783
784 my $counts = {CTX => 0, DEL => 0, INS => 0, INV => 0, TDUP => 0, NOV_INS => 0 };
785 my $event_mate_info = {CTX => "", DEL => "", INS => "", INV => "", TDUP => "", NOV_INS => "" };
786
787 #### get mate pair info and counts per event
788 foreach my $e (sort keys %{$counts}) {
789 my $h = get_counts_n_info($e, $side, $MapQ, $file, $dist_To_Soft, $target_drp, $upper_limit, $lower_limit);
790
791 $counts->{$e} = $h->{count};
792 $event_mate_info->{$e} = $h->{info};
793 }
794 #print Dumper($counts);
795
796 my $max = 0;
797 my $type = "UNKNOWN";
798 my $nRP = 0;
799 my $mate_info = "NA\tNA\tNA\tNA";
800 my $summary = "GT:";
801
802 #### find max count of events and set type, nRP and info to corresponding
803 #### max count event.
804 #### also create a summary string of all counts to be added to VCF file.
805 foreach my $e (sort keys %{$counts}){
806 # if ($counts->{$e} >=i $max){
807 if ($counts->{$e} > $max){
808 $type = $e .",". $counts->{$e};
809 $nRP = $counts->{$e};
810
811 $max = $counts->{$e};
812
813 if (length($event_mate_info->{$e})) {
814 $mate_info = $event_mate_info->{$e};
815 }
816 }
817
818 $summary .= $e .",". $counts->{$e} .":";
819 }
820 # print "done with Summary\n";
821 #### remove last colon ":" from
822 $summary =~ s/:$//;
823 if (($minRP > $max)&&(!$disable_RP_only )){if ($verbose){print "FAILED BECAUSE ($minRP > $max)&&(!$disable_RP_only )"};return};
824
825 #### Run Levenstein distance on softclip in target region to find out if its a small deletion/insetion
826 #### passing 1: clip_seq, 2: chr, 3: start, 4: end, 5: ref file.
827 my $levD = new LevD;
828 ########################################################
829 ########################################################
830 ########################################################
831
832 #### redefine start and end location for LevD calc.
833 # $start = $plus_Reads[1]-$dist_To_Soft;
834 # $end = $plus_Reads[2];
835 my $num_bases_to_loc=0;
836 my $new_start=0;
837 my $new_end=0;
838 my $del_seq="";
839 my $start = $start_local;
840 my $end = $end_local;
841 if ($lSC=~/NA/){$lSC=0}
842
843 if ($side =~ /right/) {
844 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA);
845 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist});
846 $num_bases_to_loc=$levD->{index};
847 $new_start = $plus_Reads[2];
848 if ($plus_Reads[2]=~/^[0-9]/){$new_end=$plus_Reads[2]+$lSC};
849 }
850 else{
851 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA);
852 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist});
853 $num_bases_to_loc=$levD->{index};
854 if ($plus_Reads[2]=~/^[0-9]/){$new_start=$plus_Reads[2]-$lSC};
855 $new_end = $plus_Reads[2];
856 }
857 if((!$new_start)||(!$new_end)||($new_start<0)){print "FAILED AT ((!$new_start)||(!$new_end)||($new_start<0))\n";return};
858
859 $del_seq=getBases($plus_Reads[0], $new_start,$new_end,$INPUT_FASTA);
860 ##############################################################################
861 # #If there is a match, where is the start position of the match?
862 #
863 ##############################################################################
864
865
866 #if $plus_Reads[3] eq "NA", then it was found without soft-clipped reads
867 if($plus_Reads[3] !~ /NA/){
868 if (($local_levD < $levD_local_threshold)) {
869 return if (!$sv_only);
870 #### add value to summary to be written to vcf file.
871 $summary = "GT:sDel," . $plus_Reads[6];
872 $type = "sDEL";
873 ###########################################################################
874 ##### Printing output
875
876 #########################################
877 ##### Get DNA info
878 #########################################
879 #$REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
880 $REF1_base = substr($del_seq, 0, 1);
881
882 #### this is alt ref. for softclip its $plus_Reads[3]
883 $REF2_base = $del_seq;
884 $QUAL = 1/($local_levD + 0.001);
885 $QUAL = sprintf("%.2f",$QUAL);
886 $isize = length($del_seq);
887
888 #### svtype = none for sDEL
889 #### isize = length($info[3]);
890 #### nRP = NA
891 #### mate_id = NA
892 #### CTX,:DEL,:....sDEL,##
893 $INFO_1=join(";", "SVTYPE=NA", "EVENT=$type", "ISIZE=$isize");
894
895 #Add Sample infomration
896 my $FORMAT="GT:sDEL";
897 $FORMAT .= ":lSC:nSC:uRP:levD_local";
898 my $SAMPLE= "0/1:";
899 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD";
900
901 #### remove any white spaces.
902 $INFO_1=~s/\s//g;
903 $INFO_2=~s/\s//g;
904
905 $BND1_name =~ s/^BND/LEVD/;
906 # If left, then the start position is plus_Reads[1]-isize
907 my $start_pos=0;
908 #Make sure Ref1 and Ref2 bases are different
909 if($REF2_base eq $REF1_base){$REF1_base="NA"}
910 if($side=~/left/){$start_pos=$plus_Reads[1]-$isize}else{$start_pos=$plus_Reads[1]};
911 print OUT join ("\t", $plus_Reads[0], $start_pos, $BND1_name, $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
912 if ($verbose){print "No Softclipped reads here!\n"}
913 return;
914 }
915 }
916
917 #### Otherwise, look for DRP mate info
918 #if($nRP=~/NA/){print "MATE_INFO=$mate_info\tSide=$side\tline=$line\n";}
919 my @mate_info_arr = split(/\t/, $mate_info);
920 $nRP = $mate_info_arr[3];
921 my $mate_chr=$mate_info_arr[0];
922
923 if((! defined $nRP) || ($nRP =~ /na/i) || ($mate_chr =~ /NA/) ){
924 #PRINT UNKNOWN
925 if ($nRP =~ /na/i){print "Can't find SC reads\n" if ($verbose);return};
926 if ($verbose){print "There is an unknown\nNRP=$nRP Mate_CHR=$mate_chr minRP=$minRP\n"}
927 $summary .= ":unknown," . $plus_Reads[6];
928 $type = "unknown";
929 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
930 $REF2_base = $plus_Reads[3];
931 $BND1_name =~ s/^BND/UNKNOWN/;
932 $QUAL = 1/($local_levD + 0.001);
933 $QUAL = sprintf("%.2f",$QUAL);
934 $INFO_1=join(";", "SVTYPE=unknown", "EVENT=unknown", "ISIZE=unknown");
935 #Add Sample infomration
936 my $FORMAT="GT:sDEL";
937 $FORMAT .= ":lSC:nSC:uRP:levD_local";
938 my $SAMPLE = "0/1:";
939 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD";
940 $SAMPLE=~s/NA/0/g;
941 #### remove any white spaces.
942 $INFO_1=~s/\s//g;
943 #print join ("\t", $plus_Reads[0], $plus_Reads[1], $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
944
945 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $REF2_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
946 return;
947
948 }
949 #### end if there is no mate info or nRP+uRP<minRP
950 if (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP))){
951 print "Something failed here\nif (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP)))\n";
952 return}
953
954 ##################################################################################
955 # Find out if mates have nearby soft-clips (to refine the breakpoints)
956 ##################################################################################
957 #Look for evidence of soft-clipping near mate
958 my @mate_soft_arr = ();
959 my $mate_start = 0;
960 my $mate_soft = "";
961
962 @mate_info_arr = split(/\t/, $mate_info);
963
964 #### mate start and end locations.
965 my $filename = $right_sc;
966
967 $start = $mate_info_arr[1] - $dist_To_Soft;
968 $end = $mate_info_arr[1];
969
970 if ($side =~ /right/) {
971 $start = $mate_info_arr[2];
972 $end = $mate_info_arr[2] + $dist_To_Soft;
973
974 $filename = $left_sc;
975 }
976
977 #### add levenstein distance to Summary
978 #print "Calc distal Levd\n";
979 $levD->search(rev_comp($plus_Reads[3]), $mate_info_arr[0], $start, $end, $INPUT_FASTA);
980 $distl_levD = sprintf("%.2f", $levD->{relative_edit_dist});
981 $distl_levD = "NA" if($plus_Reads[3] =~ /NA/);
982 #If there is no softclips to string match, then give 0 as quality value
983 if ($plus_Reads[3] !~ /NA/){
984 $QUAL=1/($distl_levD + 0.001);
985 }
986 else {
987 $QUAL=0;
988 };
989 $QUAL=sprintf("%.2f",$QUAL);
990 #### looking for softclips to refine break point
991 #### if left look in right and vice-versa.
992 $cmd = qq{echo -e "$mate_info_arr[0]\t$start\t$end"};
993 $cmd .= qq{ | awk -F'\t' 'NF==3' | intersectBed -a stdin -b $filename | head -1};
994 print "$cmd\n" if $verbose;
995 $mate_soft = `$cmd`;
996
997 $mate_soft =~ s/\n//g;
998 @mate_soft_arr = split(/\s/, $mate_soft);
999 my $NO_MATE_SC="";
1000 if(@mate_soft_arr){
1001 $mate_chr = $mate_soft_arr[0];
1002 $mate_start = $mate_soft_arr[1];
1003 $NO_MATE_SC="APPROXIMATE";
1004
1005 } else{
1006 @mate_info_arr = split(/\s/,$mate_info);
1007 $mate_chr = $mate_info_arr[0];
1008 $mate_start = $mate_info_arr[1];
1009 }
1010
1011 #end if there is no mate info
1012 return if ($mate_chr eq "");
1013 #end if there is no mate info and !disable_RP_only
1014 return if (($lSC =~/NA/)&&(!$disable_RP_only));
1015
1016
1017 ###########################################################################
1018 ##### Printing output
1019
1020 #########################################
1021 # Get DNA info
1022 #########################################
1023 #print "PLUS_READS=$plus_Reads[0],$plus_Reads[1]\nMATE=$mate_chr,$mate_start,$INPUT_FASTA\n";
1024 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
1025
1026 ### this is alt ref. for softclip its $plus_Reads[3]
1027 $REF2_base = getSeq($mate_chr, $mate_start, $INPUT_FASTA);
1028
1029 #########################################
1030 # print in VCF format
1031 #########################################
1032
1033 #### abs value to account for left and right reads.
1034 $isize = abs($plus_Reads[1]-$mate_start);
1035
1036 my $event_type=$type;
1037 $event_type=~ s/,|[0-9]//g;
1038 $INFO_1=join(";", "$sv_type", "EVENT=$event_type","END=$mate_start", "ISIZE=$isize","MATEID=$BND2_name");
1039 $INFO_2=join(";", "$sv_type", "EVENT=$event_type","END=$plus_Reads[1]", "ISIZE=$isize","MATEID=$BND1_name");
1040
1041 #### remove any white spaces.
1042 #### ask: did you mean to remove space from ends? eg. trim()
1043 $INFO_1=~s/\s//g;
1044 $INFO_2=~s/\s//g;
1045
1046 $FORMAT=$summary;
1047 $FORMAT=~ s/,|[0-9]//g;
1048 $FORMAT .= ":lSC:nSC:uRP:distl_levD";
1049 if($NO_MATE_SC){$INFO_2 .= ":NO_MATE_SC"}
1050 my $SAMPLE="0/1:";
1051 $SAMPLE .=$summary;
1052 # if($NO_MATE_SC){$SAMPLE.= ":$NO_MATE_SC"}
1053
1054 $SAMPLE=~s/[A-Z|,|_]//g;
1055 my $MATE_SAMPLE=$SAMPLE;
1056 $SAMPLE .= ":$lSC:$nSC:$num_unmapped_pairs:$distl_levD";
1057 $MATE_SAMPLE .=":NA:NA:NA:NA";
1058 $SAMPLE=~s/::/:/g;
1059 $MATE_SAMPLE=~s/::/:/g;
1060 $MATE_SAMPLE=~s/NA/0/g;
1061 $SAMPLE=~s/NA/0/g;
1062
1063 if($type !~ /INV/){
1064 $ALT_1 = join("","]",$mate_chr,":",$mate_start,"]",$REF1_base);
1065 $ALT_2 = join("",$REF2_base,"[",$plus_Reads[0],":",$plus_Reads[1],"[");
1066 # 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND
1067 # 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND
1068 } else {
1069 $ALT_1 = join("", "]", $plus_Reads[0], ":", $plus_Reads[1], "]", $REF2_base);
1070 $ALT_2 = join("", $REF1_base, "[", $mate_chr, ":", $mate_start, "[");
1071 }
1072
1073 if(($mate_chr) && ($plus_Reads[0])){
1074 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $ALT_1, $QUAL,"PASS", $INFO_1, $FORMAT,$SAMPLE,"\n");
1075 print OUT join ("\t", $mate_chr, $mate_start, $BND2_name, $REF2_base, $ALT_2, $QUAL, "PASS", $INFO_2, $FORMAT,$MATE_SAMPLE,"\n");
1076 }
1077 }
1078
1079 ###############################################################################
1080 ###############################################################################
1081 sub get_counts_n_info {
1082 my ($event, $side, $mapQ, $file, $dist, $target, $upL, $lwL) = @_;
1083
1084 my $mate_info = "";
1085 my $cmd = "";
1086
1087 if ($event =~ /^CTX$/i) {
1088 #print "CTX side $side\n";
1089 if ($side =~ /right/i) {
1090 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1536 $file $target};
1091 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'};
1092 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1093 #if($verbose){print "$cmd\n"}
1094 $mate_info=`$cmd`;
1095 } else {
1096 $cmd = qq{ samtools view $new_blacklist -q $mapQ -f 16 -F 1536 $file $target};
1097 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'};
1098 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1099 #if($verbose){print "$cmd\n"}
1100 $mate_info=`$cmd`;
1101 }
1102 } elsif ($event =~ /^DEL$/i) {
1103 #print "DEL side $side\n";
1104 if ($side =~ /right/i) {
1105 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
1106 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
1107 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1108 #if($verbose){print "$cmd\n"}
1109 $mate_info=`$cmd`;
1110 } else {
1111 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1568 -f 16 $file $target};
1112 $cmd .= qq{ | awk '{OFS="\\t"} {if((\$7 ~ /=/)&&(\$9<-$upL)){end=\$8+1;print \$3,\$8,end}}'};
1113 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1};
1114 #if($verbose){print "$cmd\n"}
1115
1116 $mate_info=`$cmd`;
1117 }
1118 } elsif ($event =~ /^INS$/i) {
1119 #print "INS side $side\n";
1120 if ($side =~ /right/i) {
1121 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
1122 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<$lwL && \$9 > 0 )){end=\$8+1;print \$3,\$8,end}}'};
1123 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1124 #if($verbose){print "$cmd\n"}
1125 $mate_info = `$cmd`;
1126 } else {
1127 $cmd = qq {samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target};
1128 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>-$lwL && \$9 < 0 )){end=\$8+1;print \$3,\$8,end}}'};
1129 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1130 #if($verbose){print "$cmd\n"}
1131 $mate_info = `$cmd`;
1132 }
1133 } elsif ($event =~ /^INV$/i) {
1134 #print "INV side $side\n";
1135 if ($side =~ /right/i) {
1136 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1596 $file $target};
1137 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'};
1138 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1};
1139 #if($verbose){print "$cmd\n"}
1140 $mate_info = `$cmd`;
1141 } else {
1142 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 48 -F 1548 $file $target};
1143 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'};
1144 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1145 #if($verbose){print "$cmd\n"}
1146 $mate_info = `$cmd`;
1147 }
1148 } elsif ($event =~ /^TDUP$/i) {
1149 #print "TDUP side $side\n";
1150 if ($side =~ /right/i) {
1151 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
1152 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
1153 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4>\$8)&&(\$9<0)&& (\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
1154 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1155 #if($verbose){print "$cmd\n"}
1156 $mate_info = `$cmd`;
1157 } else {
1158 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target};
1159 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<-$upL )){end=\$8+1;print \$3,\$8,end}}'};
1160 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4<\$8)&&(\$9>0)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
1161 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1162 #if($verbose){print "$cmd\n"}
1163 $mate_info = `$cmd`;
1164 }
1165 } elsif ($event =~ /^NOV_INS$/i) {
1166 #print "NOV_INS side $side\n";
1167 if ($side =~ /right/i) {
1168 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 8 -F 1552 $file $target};
1169 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'};
1170 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1171 #if($verbose){print "$cmd\n"}
1172 $mate_info = `$cmd`;
1173 } else {
1174 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 24 -F 1536 $file $target};
1175 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'};
1176 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
1177 #if($verbose){print "$cmd\n"}
1178 $mate_info = `$cmd`;
1179 }
1180 }
1181
1182 $mate_info=~s/\n//g;
1183 my @tmp=split(/\t/, $mate_info);
1184
1185 my $counts = 0;
1186
1187 if (defined $tmp[3]) {
1188 $tmp[3] =~ s/\n//g;
1189
1190 $counts = $tmp[3] if (length($tmp[3]));
1191 }
1192 return ({count=>$counts, info=>$mate_info});
1193 }