Mercurial > repos > scisjnu123 > ngsap_vc
comparison ngsap-vc/svdetect/SVDetect_compare.pl @ 3:0d10255b5434 draft default tip
Uploaded
| author | scisjnu123 |
|---|---|
| date | Thu, 03 Oct 2019 10:42:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:2c7824a8d764 | 3:0d10255b5434 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 =pod | |
| 4 | |
| 5 =head1 NAME | |
| 6 | |
| 7 SVDetect Compare for Galaxy | |
| 8 | |
| 9 Version: 0.8b for Galaxy | |
| 10 | |
| 11 =head1 SYNOPSIS | |
| 12 | |
| 13 SVDetect_compare.pl links2compare -conf <configuration_file> [-help] [-man] | |
| 14 | |
| 15 =cut | |
| 16 | |
| 17 # ------------------------------------------------------------------- | |
| 18 | |
| 19 use strict; | |
| 20 use warnings; | |
| 21 | |
| 22 use Pod::Usage; | |
| 23 use Getopt::Long; | |
| 24 | |
| 25 use Config::General; | |
| 26 use Tie::IxHash; | |
| 27 | |
| 28 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 29 #PARSE THE COMMAND LINE | |
| 30 my %OPT; | |
| 31 GetOptions(\%OPT, | |
| 32 'conf=s', | |
| 33 'out1=s', #GALAXY | |
| 34 'out2=s', #GALAXY | |
| 35 'out3=s', #GALAXY | |
| 36 'out4=s', #GALAXY | |
| 37 'out5=s', #GALAXY | |
| 38 'out6=s', #GALAXY | |
| 39 'out7=s', #GALAXY | |
| 40 'out8=s', #GALAXY | |
| 41 'out9=s', #GALAXY | |
| 42 'l=s', #GALAXY | |
| 43 'N=s', #GALAXY | |
| 44 'help', | |
| 45 'man' | |
| 46 ); | |
| 47 | |
| 48 pod2usage() if $OPT{help}; | |
| 49 pod2usage(-verbose=>2) if $OPT{man}; | |
| 50 pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); | |
| 51 | |
| 52 | |
| 53 pod2usage() if(@ARGV<1); | |
| 54 | |
| 55 tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare); | |
| 56 | |
| 57 foreach my $command (@ARGV){ | |
| 58 pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); | |
| 59 } | |
| 60 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 61 | |
| 62 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 63 #READ THE CONFIGURATION FILE | |
| 64 my $conf=Config::General->new( -ConfigFile => $OPT{conf}, | |
| 65 -Tie => "Tie::IxHash", | |
| 66 -AllowMultiOptions => 1, | |
| 67 -LowerCaseNames => 1, | |
| 68 -AutoTrue => 1); | |
| 69 my %CONF= $conf->getall; | |
| 70 validateconfiguration(\%CONF); #validation of the configuration parameters | |
| 71 | |
| 72 | |
| 73 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY | |
| 74 my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY | |
| 75 | |
| 76 my $pt_log_file=$OPT{l}; #GALAXY | |
| 77 my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY | |
| 78 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY | |
| 79 | |
| 80 my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference | |
| 81 my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference | |
| 82 my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference | |
| 83 | |
| 84 $CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY | |
| 85 $CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY | |
| 86 | |
| 87 $CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY | |
| 88 $CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY | |
| 89 | |
| 90 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 91 | |
| 92 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 93 #COMMAND EXECUTION | |
| 94 foreach my $command (@ARGV){ | |
| 95 &{$func{$command}}(); | |
| 96 } | |
| 97 print LOG "-- end\n"; | |
| 98 | |
| 99 close LOG;#GALAXY | |
| 100 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | |
| 101 | |
| 102 exit(0); | |
| 103 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 104 | |
| 105 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 106 #FUNCTIONS | |
| 107 | |
| 108 # -----------------------------------------------------------------------------# | |
| 109 #MAIN FUNCTION number 5:Comparison between samples, common or specific links | |
| 110 sub links2compare{ | |
| 111 | |
| 112 my @compare_files; | |
| 113 | |
| 114 compareSamples($CONF{general}{output_dir}, | |
| 115 $CONF{compare}{list_samples}, | |
| 116 $CONF{compare}{sample_link_file}, | |
| 117 $CONF{compare}{reference_link_file}, | |
| 118 $CONF{compare}{min_overlap}, | |
| 119 $CONF{compare}{same_sv_type}, | |
| 120 \@compare_files); | |
| 121 | |
| 122 my $pt_ind=0; | |
| 123 | |
| 124 for my $input_file (@compare_files){ | |
| 125 | |
| 126 $input_file=$CONF{general}{output_dir}.$input_file; | |
| 127 | |
| 128 my $output_file=$input_file; | |
| 129 $output_file=~s/unique$/compared/; | |
| 130 | |
| 131 sortLinks($input_file, $output_file,1); | |
| 132 | |
| 133 if($CONF{compare}{circos_output}){ | |
| 134 links2segdup($CONF{circos}{organism_id}, | |
| 135 $CONF{circos}{colorcode}, | |
| 136 $output_file, | |
| 137 $output_file.".segdup.txt"); | |
| 138 system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY | |
| 139 } | |
| 140 | |
| 141 if($CONF{compare}{bed_output}){ | |
| 142 links2bedfile($CONF{compare}{read_lengths}, | |
| 143 $CONF{bed}{colorcode}, | |
| 144 $output_file, | |
| 145 $output_file.".bed"); | |
| 146 system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY | |
| 147 } | |
| 148 | |
| 149 if($CONF{compare}{sv_output}){ | |
| 150 | |
| 151 links2SVfile ($output_file, $output_file.".sv.txt"); | |
| 152 system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY | |
| 153 } | |
| 154 $pt_ind++; | |
| 155 | |
| 156 } | |
| 157 unlink(@compare_files); | |
| 158 | |
| 159 } | |
| 160 #------------------------------------------------------------------------------# | |
| 161 #------------------------------------------------------------------------------# | |
| 162 sub compareSamples{ | |
| 163 | |
| 164 my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_; | |
| 165 | |
| 166 my @bedpefiles; | |
| 167 my @list=split(",",$list_samples); | |
| 168 my @list_files=($sample_file,$reference_file); | |
| 169 | |
| 170 print LOG "\# Comparison procedure...\n"; | |
| 171 print LOG "-- samples=$list_samples\n". | |
| 172 "-- minimum overlap=$min_overlap\n". | |
| 173 "-- same SV type=$same_sv_type\n"; | |
| 174 | |
| 175 #conversion of links to bedPE format file | |
| 176 print LOG "-- Conversion of links.filtered files to bedPE format\n"; | |
| 177 for my $s (0..$#list) { | |
| 178 | |
| 179 links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt"); | |
| 180 push(@bedpefiles,$list_files[$s].".bedpe.txt"); | |
| 181 | |
| 182 } | |
| 183 | |
| 184 #get common links between all samples compared | |
| 185 print LOG "-- Getting common links between all samples with BEDTools\n"; | |
| 186 my $common_name=join(".",@list); | |
| 187 | |
| 188 my $nb=scalar @list; | |
| 189 my $command=""; | |
| 190 my $prompt=">"; | |
| 191 | |
| 192 while ($nb>0){ | |
| 193 | |
| 194 for my $i (0..$#list_files){ | |
| 195 | |
| 196 $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt"; | |
| 197 my $pipe=0; | |
| 198 | |
| 199 for my $j ($i+1..$#list_files){ | |
| 200 | |
| 201 $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe); | |
| 202 $command.=" -b ".$list_files[$j].".bedpe.txt"; | |
| 203 $pipe=1; | |
| 204 | |
| 205 } | |
| 206 | |
| 207 $command.=$prompt.$dir.$common_name.".bedpe.tmp;"; | |
| 208 $prompt=">>"; | |
| 209 | |
| 210 my $first=shift(@list_files); push(@list_files,$first); | |
| 211 last; | |
| 212 } | |
| 213 $nb--; | |
| 214 } | |
| 215 | |
| 216 system ($command); | |
| 217 | |
| 218 push(@bedpefiles,$dir.$common_name.".bedpe.tmp"); | |
| 219 | |
| 220 #Post comparison to get common links if same type only (as an option) | |
| 221 open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!"; | |
| 222 open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!"; | |
| 223 | |
| 224 while(<FILE>){ | |
| 225 my @t=split("\t",$_); | |
| 226 my $s=(split("_",$t[6]))[0]; | |
| 227 my ($sv1,$sv2)=($t[7],$t[18]); | |
| 228 splice(@t,11,$#t); | |
| 229 | |
| 230 if($same_sv_type){ | |
| 231 print OUT join("\t",@t)."\n" if($sv1 eq $sv2); | |
| 232 }else{ | |
| 233 print OUT join("\t",@t)."\n"; | |
| 234 } | |
| 235 } | |
| 236 close FILE; | |
| 237 close OUT; | |
| 238 | |
| 239 bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique"); | |
| 240 push(@bedpefiles,$dir.$common_name.".bedpe.unique"); | |
| 241 push(@$file_names,$common_name.".unique"); | |
| 242 print LOG "-- output created: ".$dir.$common_name.".compared\n"; | |
| 243 | |
| 244 #get specific links for each sample | |
| 245 print LOG "-- Getting specific links for each sample\n"; | |
| 246 for my $s (0..$#list) { | |
| 247 system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique"); | |
| 248 bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique"); | |
| 249 push(@bedpefiles,$dir.$list[$s].".bedpe.unique"); | |
| 250 push(@$file_names,$list[$s].".unique"); | |
| 251 print LOG "-- output created: ".$dir.$list[$s].".compared\n"; | |
| 252 } | |
| 253 | |
| 254 unlink(@bedpefiles); | |
| 255 | |
| 256 } | |
| 257 #------------------------------------------------------------------------------# | |
| 258 #------------------------------------------------------------------------------# | |
| 259 #convert the links file to the circos format | |
| 260 sub links2segdup{ | |
| 261 | |
| 262 my($id,$color_code,$links_file,$segdup_file)=@_; | |
| 263 | |
| 264 print LOG "\# Converting to the circos format...\n"; | |
| 265 | |
| 266 tie (my %hcolor,'Tie::IxHash'); #color-code hash table | |
| 267 foreach my $col (keys %{$color_code}){ | |
| 268 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
| 269 $hcolor{$col}=[$min_links,$max_links]; | |
| 270 } | |
| 271 | |
| 272 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
| 273 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; | |
| 274 | |
| 275 my $index=1; | |
| 276 while(<LINKS>){ | |
| 277 | |
| 278 my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; | |
| 279 | |
| 280 my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links | |
| 281 | |
| 282 print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output | |
| 283 "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; | |
| 284 $index++; | |
| 285 } | |
| 286 | |
| 287 close LINKS; | |
| 288 close SEGDUP; | |
| 289 print LOG "-- output created: $segdup_file\n"; | |
| 290 } | |
| 291 #------------------------------------------------------------------------------# | |
| 292 #------------------------------------------------------------------------------# | |
| 293 #convert the links file to the bedPE format for BEDTools usage | |
| 294 sub links2bedPElinksfile{ | |
| 295 | |
| 296 my ($sample,$links_file,$bedpe_file)=@_; | |
| 297 | |
| 298 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
| 299 open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; | |
| 300 | |
| 301 my $nb_links=1; | |
| 302 | |
| 303 while(<LINKS>){ | |
| 304 | |
| 305 chomp; | |
| 306 my @t=split("\t",$_); | |
| 307 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); | |
| 308 my $type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
| 309 $type.="_".$t[10]; | |
| 310 | |
| 311 $start1--; $start2--; | |
| 312 | |
| 313 print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". | |
| 314 "\t$sample"."_link$nb_links\t$type\t.\t.". | |
| 315 "\t".join("|",@t)."\n"; | |
| 316 | |
| 317 $nb_links++; | |
| 318 } | |
| 319 | |
| 320 close LINKS; | |
| 321 close BEDPE; | |
| 322 | |
| 323 } | |
| 324 #------------------------------------------------------------------------------# | |
| 325 #------------------------------------------------------------------------------# | |
| 326 sub bedPElinks2linksfile{ | |
| 327 | |
| 328 my ($bedpe_file,$links_file)=@_; | |
| 329 | |
| 330 open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; | |
| 331 open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; | |
| 332 | |
| 333 while(<BEDPE>){ | |
| 334 | |
| 335 chomp; | |
| 336 my $sample=(split("_",(split("\t",$_))[6]))[0]; | |
| 337 my @t1=(split("\t",$_))[0,1,2,3,4,5]; | |
| 338 my @t2=split(/\|/,(split("\t",$_))[10]); | |
| 339 push(@t2,$sample); | |
| 340 | |
| 341 print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; | |
| 342 | |
| 343 } | |
| 344 close BEDPE; | |
| 345 close LINKS; | |
| 346 | |
| 347 } | |
| 348 #------------------------------------------------------------------------------# | |
| 349 #------------------------------------------------------------------------------# | |
| 350 #convert the links file to the bed format | |
| 351 sub links2bedfile{ | |
| 352 | |
| 353 my ($tag_length,$color_code,$links_file,$bed_file)=@_; | |
| 354 | |
| 355 print LOG "\# Converting to the bed format...\n"; | |
| 356 | |
| 357 my $compare=1; | |
| 358 if($links_file!~/compared$/){ | |
| 359 $compare=0; | |
| 360 $tag_length->{none}->{1}=$tag_length->{1}; | |
| 361 $tag_length->{none}->{2}=$tag_length->{2}; | |
| 362 } | |
| 363 | |
| 364 #color-code hash table | |
| 365 tie (my %hcolor,'Tie::IxHash'); | |
| 366 my %color_order; | |
| 367 $color_order{"255,255,255"}=0; | |
| 368 my $n=1; | |
| 369 foreach my $col (keys %{$color_code}){ | |
| 370 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
| 371 $hcolor{$col}=[$min_links,$max_links]; | |
| 372 $color_order{$col}=$n; | |
| 373 $n++; | |
| 374 } | |
| 375 | |
| 376 my %pair; | |
| 377 my %pt; | |
| 378 $n=1; | |
| 379 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
| 380 | |
| 381 my %str=( "F"=>"+", "R"=>"-" ); | |
| 382 | |
| 383 while(<LINKS>){ | |
| 384 | |
| 385 my @t=split; | |
| 386 my $sample=($compare)? pop(@t):"none"; | |
| 387 | |
| 388 my $chr1=$t[0]; | |
| 389 my $chr2=$t[3]; | |
| 390 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
| 391 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
| 392 my $same_chr=($chr1 eq $chr2)? 1:0; | |
| 393 | |
| 394 my $count=$t[6]; | |
| 395 my $color=getColor($count,\%hcolor,"bed"); | |
| 396 | |
| 397 my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); | |
| 398 my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); | |
| 399 my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); | |
| 400 my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); | |
| 401 my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); | |
| 402 my @position1=deleteBadOrderSensePairs(split(",",$t[14])); | |
| 403 my @position2=deleteBadOrderSensePairs(split(",",$t[15])); | |
| 404 my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); | |
| 405 my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); | |
| 406 | |
| 407 | |
| 408 for my $p (0..$#pairs){ | |
| 409 | |
| 410 if (!exists $pair{$pairs[$p]}){ | |
| 411 | |
| 412 if($same_chr){ | |
| 413 | |
| 414 $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, | |
| 415 $start1[$p]-1, $end2[$p], $color, | |
| 416 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; | |
| 417 $pt{$n}=$pair{$pairs[$p]}->{0}; | |
| 418 $n++; | |
| 419 | |
| 420 }else{ | |
| 421 | |
| 422 $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, | |
| 423 $start1[$p]-1, $end1[$p], $color, | |
| 424 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; | |
| 425 $pt{$n}=$pair{$pairs[$p]}->{1}; | |
| 426 $n++; | |
| 427 | |
| 428 | |
| 429 $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, | |
| 430 $start2[$p]-1, $end2[$p], $color, | |
| 431 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; | |
| 432 $pt{$n}=$pair{$pairs[$p]}->{2}; | |
| 433 $n++; | |
| 434 } | |
| 435 }else{ | |
| 436 | |
| 437 if($same_chr){ | |
| 438 ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); | |
| 439 }else{ | |
| 440 ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); | |
| 441 ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); | |
| 442 } | |
| 443 } | |
| 444 } | |
| 445 } | |
| 446 close LINKS; | |
| 447 | |
| 448 my $nb_pairs=$n-1; | |
| 449 | |
| 450 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; | |
| 451 print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". | |
| 452 "visibility=2 itemRgb=\"On\"\n"; | |
| 453 | |
| 454 for my $i (1..$nb_pairs){ | |
| 455 print BED join("\t",@{$pt{$i}})."\n"; | |
| 456 } | |
| 457 | |
| 458 close BED; | |
| 459 | |
| 460 print LOG "-- output created: $bed_file\n"; | |
| 461 | |
| 462 undef %pair; | |
| 463 undef %pt; | |
| 464 | |
| 465 } | |
| 466 #------------------------------------------------------------------------------# | |
| 467 #------------------------------------------------------------------------------# | |
| 468 sub links2SVfile{ | |
| 469 | |
| 470 my($links_file,$sv_file)=@_; | |
| 471 | |
| 472 print LOG "\# Converting to the sv output table...\n"; | |
| 473 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
| 474 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
| 475 | |
| 476 my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist | |
| 477 chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering | |
| 478 final_score breakpoint1_start1-end1 breakpoint2_start2-end2); | |
| 479 | |
| 480 my $nb_links=0; | |
| 481 | |
| 482 while (<LINKS>){ | |
| 483 | |
| 484 my @t=split; | |
| 485 my @sv=(); | |
| 486 my $sv_type="-"; | |
| 487 my $strand_ratio="-"; | |
| 488 my $eq_ratio="-"; | |
| 489 my $eq_type="-"; | |
| 490 my $insert_ratio="-"; | |
| 491 my $link="-"; | |
| 492 my ($bk1, $bk2)=("-","-"); | |
| 493 my $score="-"; | |
| 494 | |
| 495 my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); | |
| 496 my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); | |
| 497 my $nb_pairs=$t[6]; | |
| 498 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
| 499 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
| 500 my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
| 501 | |
| 502 #if strand filtering | |
| 503 if (defined $t[16]){ | |
| 504 #if inter-chr link | |
| 505 $sv_type=$t[16]; | |
| 506 if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ | |
| 507 $strand_ratio=floor($1/$2*100)."%"; | |
| 508 $score=$t[18]; | |
| 509 } | |
| 510 if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ | |
| 511 #if intra-chr link with insert size filtering | |
| 512 $strand_ratio=floor($1/$2*100)."%"; | |
| 513 $link=floor($t[17]); | |
| 514 if($sv_type!~/^INV/){ | |
| 515 $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
| 516 $score=$t[20]; | |
| 517 }else{ | |
| 518 $score=$t[19]; | |
| 519 } | |
| 520 } | |
| 521 } | |
| 522 | |
| 523 if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ | |
| 524 | |
| 525 #if strand and order filtering only and/or interchr link | |
| 526 $eq_type=$t[18]; | |
| 527 $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
| 528 ($bk1, $bk2)=($t[20],$t[21]); | |
| 529 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
| 530 $score=$t[22]; | |
| 531 | |
| 532 }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ | |
| 533 | |
| 534 #if all three filtering | |
| 535 $link=floor($t[17]); | |
| 536 $eq_type=$t[19]; | |
| 537 $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); | |
| 538 | |
| 539 if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ | |
| 540 $insert_ratio=floor($1/$2*100)."%"; | |
| 541 ($bk1, $bk2)=($t[22],$t[23]); | |
| 542 $score=$t[24]; | |
| 543 | |
| 544 }else{ | |
| 545 ($bk1, $bk2)=($t[21],$t[22]); | |
| 546 $score=$t[23]; | |
| 547 } | |
| 548 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
| 549 | |
| 550 } | |
| 551 | |
| 552 | |
| 553 push(@sv, $chr_type, $sv_type,$eq_type); | |
| 554 push(@sv,"$chr1\t$start1-$end1"); | |
| 555 push(@sv, $link); | |
| 556 push(@sv,"$chr2\t$start2-$end2", | |
| 557 $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); | |
| 558 | |
| 559 | |
| 560 print SV join("\t",@sv)."\n"; | |
| 561 } | |
| 562 | |
| 563 close LINKS; | |
| 564 close SV; | |
| 565 | |
| 566 system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; | |
| 567 | |
| 568 open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; | |
| 569 my @links=<SV>; | |
| 570 close SV; | |
| 571 | |
| 572 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
| 573 | |
| 574 print SV join("\t",@header)."\n"; | |
| 575 print SV @links; | |
| 576 close SV; | |
| 577 | |
| 578 unlink($sv_file.".sorted"); | |
| 579 | |
| 580 print LOG "-- output created: $sv_file\n"; | |
| 581 | |
| 582 } | |
| 583 #------------------------------------------------------------------------------# | |
| 584 #------------------------------------------------------------------------------# | |
| 585 sub deleteBadOrderSensePairs{ | |
| 586 | |
| 587 my (@tab)=@_; | |
| 588 my @tab2; | |
| 589 | |
| 590 foreach my $v (@tab){ | |
| 591 | |
| 592 $v=~s/[\(\)]//g; | |
| 593 push(@tab2,$v) if($v!~/[\$\*\@]$/); | |
| 594 } | |
| 595 return @tab2; | |
| 596 } | |
| 597 #------------------------------------------------------------------------------# | |
| 598 #------------------------------------------------------------------------------# | |
| 599 #gets starts and ends Coords when start=leftmost given positions, directions and orders | |
| 600 sub getCoordswithLeftMost { | |
| 601 | |
| 602 my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; | |
| 603 | |
| 604 for my $i (0..scalar(@{$positions})-1) { | |
| 605 | |
| 606 if($strand->[$i] eq 'F'){ | |
| 607 $starts->[$i]=$positions->[$i]; | |
| 608 $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; | |
| 609 }else{ | |
| 610 $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; | |
| 611 $ends->[$i]=$positions->[$i]; | |
| 612 } | |
| 613 } | |
| 614 } | |
| 615 #------------------------------------------------------------------------------# | |
| 616 #------------------------------------------------------------------------------# | |
| 617 sub floor { | |
| 618 my $nb = $_[0]; | |
| 619 $nb=~ s/\..*//; | |
| 620 return $nb; | |
| 621 } | |
| 622 #------------------------------------------------------------------------------# | |
| 623 #------------------------------------------------------------------------------# | |
| 624 sub decimal{ | |
| 625 | |
| 626 my $num=shift; | |
| 627 my $digs_to_cut=shift; | |
| 628 | |
| 629 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | |
| 630 | |
| 631 return $num; | |
| 632 } | |
| 633 | |
| 634 #------------------------------------------------------------------------------# | |
| 635 #------------------------------------------------------------------------------# | |
| 636 #Sort links according the concerned chromosomes and their coordinates | |
| 637 sub sortLinks{ | |
| 638 | |
| 639 my ($links_file,$sortedlinks_file,$unique)=@_; | |
| 640 | |
| 641 print LOG "# Sorting links...\n"; | |
| 642 | |
| 643 my $pipe=($unique)? "| sort -u":""; | |
| 644 system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; | |
| 645 | |
| 646 } | |
| 647 #------------------------------------------------------------------------------# | |
| 648 #------------------------------------------------------------------------------# | |
| 649 sub getColor{ | |
| 650 | |
| 651 my($count,$hcolor,$format)=@_; | |
| 652 for my $col ( keys % { $hcolor} ) { | |
| 653 return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); | |
| 654 } | |
| 655 return "white" if($format eq "circos"); | |
| 656 return "255,255,255" if($format eq "bed"); | |
| 657 } | |
| 658 #------------------------------------------------------------------------------# | |
| 659 #------------------------------------------------------------------------------# | |
| 660 #check if the configuration file is correct | |
| 661 sub validateconfiguration{ | |
| 662 | |
| 663 my %conf=%{$_[0]}; | |
| 664 my $list_prgs="@ARGV"; | |
| 665 | |
| 666 my @circos_params=qw(organism_id colorcode); | |
| 667 my @bed_params=qw(colorcode); | |
| 668 my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file); | |
| 669 | |
| 670 unless (defined($conf{general}{output_dir})) { | |
| 671 $conf{general}{output_dir} = "."; | |
| 672 } | |
| 673 unless (-d $conf{general}{output_dir}){ | |
| 674 mkdir $conf{general}{output_dir} or die; | |
| 675 } | |
| 676 $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/); | |
| 677 | |
| 678 | |
| 679 if($list_prgs=~/links2compare/){ | |
| 680 foreach my $p (@compare_params) { | |
| 681 die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); | |
| 682 } | |
| 683 | |
| 684 unless (defined($conf{compare}{same_sv_type})) { | |
| 685 $conf{compare}{same_sv_type} = 0; | |
| 686 } | |
| 687 | |
| 688 unless (defined($conf{compare}{min_overlap})) { | |
| 689 $conf{compare}{min_overlap} = 1E-9; | |
| 690 } | |
| 691 | |
| 692 if($conf{compare}{circos_output}){ | |
| 693 foreach my $p (@circos_params) { | |
| 694 next if($list_prgs=~/^ratio/ && $p eq "colorcode"); | |
| 695 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); | |
| 696 } | |
| 697 } | |
| 698 if($conf{compare}{bed_output}){ | |
| 699 foreach my $p (@bed_params) { | |
| 700 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); | |
| 701 } | |
| 702 die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); | |
| 703 | |
| 704 my @samples=split(",",$conf{compare}{list_samples}); | |
| 705 my @read_lengths=split(",",$conf{compare}{list_read_lengths}); | |
| 706 for my $i (0..$#samples){ | |
| 707 my @l=split("-",$read_lengths[$i]); | |
| 708 $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; | |
| 709 } | |
| 710 } | |
| 711 } | |
| 712 | |
| 713 | |
| 714 } | |
| 715 #------------------------------------------------------------------------------# | |
| 716 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# |
