Mercurial > repos > scisjnu123 > ngsap_vc
comparison ngsap-vc/svdetect/SVDetect_run_parallel.pl @ 3:0d10255b5434 draft default tip
Uploaded
| author | scisjnu123 |
|---|---|
| date | Thu, 03 Oct 2019 10:42:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:2c7824a8d764 | 3:0d10255b5434 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 =pod | |
| 4 | |
| 5 =head1 NAME | |
| 6 | |
| 7 SVDetect - Program designed to the detection of structural variations | |
| 8 from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads | |
| 9 | |
| 10 Version: 0.8b for Galaxy | |
| 11 | |
| 12 =head1 SYNOPSIS | |
| 13 | |
| 14 SVDetect <command> -conf <configuration_file> [-help] [-man] | |
| 15 | |
| 16 Command: | |
| 17 | |
| 18 linking detection and isolation of links | |
| 19 filtering filtering of links according different parameters | |
| 20 links2circos links conversion to circos format | |
| 21 links2bed paired-ends of links converted to bed format (UCSC) | |
| 22 links2SV formatted output to show most significant SVs | |
| 23 cnv calculate copy-number profiles | |
| 24 ratio2circos ratio conversion to circos density format | |
| 25 ratio2bedgraph ratio conversion to bedGraph density format (UCSC) | |
| 26 | |
| 27 =head1 DESCRIPTION | |
| 28 | |
| 29 This is a command-line interface to SVDetect. | |
| 30 | |
| 31 | |
| 32 =head1 AUTHORS | |
| 33 | |
| 34 Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>, | |
| 35 Valentina Boeva E<lt>valentina.boeva@curie.frE<gt> | |
| 36 | |
| 37 =cut | |
| 38 | |
| 39 # ------------------------------------------------------------------- | |
| 40 | |
| 41 use strict; | |
| 42 use warnings; | |
| 43 | |
| 44 use Pod::Usage; | |
| 45 use Getopt::Long; | |
| 46 | |
| 47 use Config::General; | |
| 48 use Tie::IxHash; | |
| 49 use FileHandle; | |
| 50 use Parallel::ForkManager; | |
| 51 | |
| 52 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 53 #PARSE THE COMMAND LINE | |
| 54 my %OPT; | |
| 55 GetOptions(\%OPT, | |
| 56 'conf=s', | |
| 57 'out1=s', #GALAXY | |
| 58 'out2=s', #GALAXY | |
| 59 'out3=s', #GALAXY | |
| 60 'out4=s', #GALAXY | |
| 61 'out5=s', #GALAXY | |
| 62 'l=s', #GALAXY | |
| 63 'N=s',#GALAXY | |
| 64 'help',#GALAXY | |
| 65 'man' | |
| 66 ); | |
| 67 | |
| 68 pod2usage() if $OPT{help}; | |
| 69 pod2usage(-verbose=>2) if $OPT{man}; | |
| 70 pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); | |
| 71 | |
| 72 pod2usage() if(@ARGV<1); | |
| 73 | |
| 74 tie (my %func, 'Tie::IxHash',linking=>\&createlinks, | |
| 75 filtering=>\&filterlinks, | |
| 76 links2circos=>\&links2circos, | |
| 77 links2bed=>\&links2bed, | |
| 78 links2compare=>\&links2compare, | |
| 79 links2SV=>\&links2SV, | |
| 80 cnv=>\&cnv, | |
| 81 ratio2circos=>\&ratio2circos, | |
| 82 ratio2bedgraph=>\&ratio2bedgraph); | |
| 83 | |
| 84 foreach my $command (@ARGV){ | |
| 85 pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); | |
| 86 } | |
| 87 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 88 | |
| 89 | |
| 90 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 91 #READ THE CONFIGURATION FILE | |
| 92 my $conf=Config::General->new( -ConfigFile => $OPT{conf}, | |
| 93 -Tie => "Tie::IxHash", | |
| 94 -AllowMultiOptions => 1, | |
| 95 -LowerCaseNames => 1, | |
| 96 -AutoTrue => 1); | |
| 97 my %CONF= $conf->getall; | |
| 98 validateconfiguration(\%CONF); #validation of the configuration parameters | |
| 99 | |
| 100 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY | |
| 101 | |
| 102 my $pt_log_file=$OPT{l}; #GALAXY | |
| 103 my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY | |
| 104 my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY | |
| 105 my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY | |
| 106 my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY | |
| 107 my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY | |
| 108 | |
| 109 $CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY | |
| 110 $CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY | |
| 111 | |
| 112 my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY | |
| 113 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY | |
| 114 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 115 | |
| 116 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 117 #COMMAND EXECUTION | |
| 118 foreach my $command (@ARGV){ | |
| 119 &{$func{$command}}(); | |
| 120 } | |
| 121 print LOG "-- end\n";#GALAXY | |
| 122 | |
| 123 close LOG;#GALAXY | |
| 124 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | |
| 125 exit(0); | |
| 126 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 127 | |
| 128 | |
| 129 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
| 130 #FUNCTIONS | |
| 131 #------------------------------------------------------------------------------# | |
| 132 #MAIN FUNCTION number 1: Detection of links from mate-pairs data | |
| 133 sub createlinks{ | |
| 134 | |
| 135 my %CHR; #main hash table 1: fragments, links | |
| 136 my %CHRID; | |
| 137 my @MATEFILES; | |
| 138 | |
| 139 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; | |
| 140 my @path=split(/\//,$output_prefix); | |
| 141 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
| 142 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; | |
| 143 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; | |
| 144 | |
| 145 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters | |
| 146 $CONF{detection}{window_size}, | |
| 147 $CONF{detection}{step_length}, | |
| 148 $CONF{general}{cmap_file}); | |
| 149 | |
| 150 if($CONF{detection}{split_mate_file}){ | |
| 151 | |
| 152 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, | |
| 153 $CONF{general}{sv_type}, | |
| 154 $CONF{general}{mates_file}, | |
| 155 $CONF{general}{input_format}, | |
| 156 $CONF{general}{read_lengths} | |
| 157 ); | |
| 158 }else{ | |
| 159 | |
| 160 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; | |
| 161 chomp(@MATEFILES); | |
| 162 print LOG "# Splitted mate files already created.\n"; | |
| 163 } | |
| 164 | |
| 165 | |
| 166 #Parallelization of the linking per chromosome for intra + interchrs | |
| 167 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
| 168 | |
| 169 foreach my $matefile (@MATEFILES){ | |
| 170 | |
| 171 my $pid = $pm->start and next; | |
| 172 getlinks(\%CHR, \%CHRID, $matefile); | |
| 173 $pm->finish; | |
| 174 | |
| 175 } | |
| 176 $pm->wait_all_children; | |
| 177 | |
| 178 #Merge the chromosome links file into only one | |
| 179 my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; | |
| 180 chomp(@LINKFILES); | |
| 181 catFiles( \@LINKFILES => "$output_prefix.links" ); | |
| 182 | |
| 183 system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY | |
| 184 print LOG "# Linking end procedure : output created: $output_prefix.links\n"; | |
| 185 #unlink(@LINKFILES); | |
| 186 #unlink(@MATEFILES); | |
| 187 | |
| 188 undef %CHR; | |
| 189 undef %CHRID; | |
| 190 | |
| 191 } | |
| 192 #------------------------------------------------------------------------------# | |
| 193 #------------------------------------------------------------------------------# | |
| 194 sub getlinks { | |
| 195 | |
| 196 my ($chr,$chrID,$tmp_mates_prefix)=@_; | |
| 197 | |
| 198 my $tmp_links_prefix=$tmp_mates_prefix; | |
| 199 $tmp_links_prefix=~s/\/mates\//\/links\//; | |
| 200 | |
| 201 my %PAIR; #main hash table 2: pairs | |
| 202 | |
| 203 linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs | |
| 204 $CONF{general}{read_lengths}, | |
| 205 $CONF{detection}{window_size}, | |
| 206 $CONF{detection}{step_length}, | |
| 207 $tmp_mates_prefix, | |
| 208 $CONF{general}{input_format}, | |
| 209 $CONF{general}{sv_type}, | |
| 210 "$tmp_links_prefix.links.mapped" | |
| 211 ); | |
| 212 | |
| 213 getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons | |
| 214 "$tmp_links_prefix.links.unique"); | |
| 215 | |
| 216 defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links | |
| 217 $CONF{general}{input_format}, | |
| 218 $CONF{general}{sv_type}, | |
| 219 $CONF{general}{read_lengths}, | |
| 220 "$tmp_links_prefix.links.unique", | |
| 221 "$tmp_links_prefix.links.unique_defined"); | |
| 222 | |
| 223 sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates | |
| 224 "$tmp_links_prefix.links.sorted"); | |
| 225 | |
| 226 removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links | |
| 227 "$tmp_links_prefix.links",1); #file output | |
| 228 | |
| 229 | |
| 230 undef %PAIR; | |
| 231 | |
| 232 unlink("$tmp_links_prefix.links.mapped", | |
| 233 "$tmp_links_prefix.links.unique", | |
| 234 "$tmp_links_prefix.links.unique_defined", | |
| 235 "$tmp_links_prefix.links.sorted"); | |
| 236 } | |
| 237 #------------------------------------------------------------------------------# | |
| 238 #------------------------------------------------------------------------------# | |
| 239 sub splitMateFile{ | |
| 240 | |
| 241 my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; | |
| 242 | |
| 243 print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; | |
| 244 | |
| 245 my %filesHandle; | |
| 246 | |
| 247 #fichier matefile inter | |
| 248 if($sv_type=~/^(all|inter)$/){ | |
| 249 my $newFileName="$output_prefix.interchrs"; | |
| 250 push(@{$files_list},$newFileName); | |
| 251 my $fh = new FileHandle; | |
| 252 $fh->open(">$newFileName"); | |
| 253 $filesHandle{inter}=$fh; | |
| 254 } | |
| 255 | |
| 256 #fichiers matefiles intra | |
| 257 if($sv_type=~/^(all|intra)$/){ | |
| 258 foreach my $k (1..$chr->{nb_chrs}){ | |
| 259 my $newFileName=$output_prefix.".".$chr->{$k}->{name}; | |
| 260 push(@{$files_list},$newFileName); | |
| 261 my $fh = new FileHandle; | |
| 262 $fh->open(">$newFileName"); | |
| 263 $filesHandle{$k}=$fh; | |
| 264 } | |
| 265 } | |
| 266 | |
| 267 if ($mates_file =~ /.gz$/) { | |
| 268 open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat | |
| 269 }elsif($mates_file =~ /.bam$/){ | |
| 270 open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
| 271 }else{ | |
| 272 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
| 273 } | |
| 274 | |
| 275 | |
| 276 while(<MATES>){ | |
| 277 | |
| 278 my @t=split; | |
| 279 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); | |
| 280 | |
| 281 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
| 282 | |
| 283 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
| 284 | |
| 285 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
| 286 | |
| 287 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ | |
| 288 my $fh2print=$filesHandle{inter}; | |
| 289 print $fh2print join("\t",@t)."\n"; | |
| 290 } | |
| 291 | |
| 292 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ | |
| 293 my $fh2print=$filesHandle{$chr_read1}; | |
| 294 print $fh2print join("\t",@t)."\n"; | |
| 295 | |
| 296 } | |
| 297 } | |
| 298 | |
| 299 foreach my $name (keys %filesHandle){ | |
| 300 my $fh=$filesHandle{$name}; | |
| 301 $fh->close; | |
| 302 } | |
| 303 | |
| 304 print LOG "# Splitted mate files of \"$mates_file\" created.\n"; | |
| 305 } | |
| 306 | |
| 307 | |
| 308 #------------------------------------------------------------------------------# | |
| 309 #------------------------------------------------------------------------------# | |
| 310 sub filterlinks{ | |
| 311 | |
| 312 my %CHR; | |
| 313 my %CHRID; | |
| 314 my @LINKFILES; | |
| 315 my @FLINKFILES; | |
| 316 | |
| 317 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; | |
| 318 my @path=split(/\//,$output_prefix); | |
| 319 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
| 320 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; | |
| 321 | |
| 322 createChrHashTables(\%CHR,\%CHRID, | |
| 323 $CONF{general}{cmap_file}); | |
| 324 | |
| 325 if($CONF{filtering}{split_link_file}){ | |
| 326 | |
| 327 splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, | |
| 328 $tmp_links_prefix, | |
| 329 $CONF{general}{sv_type}, | |
| 330 "$output_prefix.links", | |
| 331 ); | |
| 332 }else{ | |
| 333 | |
| 334 @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; | |
| 335 chomp(@LINKFILES); | |
| 336 print LOG "# Splitted link files already created.\n"; | |
| 337 } | |
| 338 | |
| 339 #Parallelization of the filtering per chromosome for intra + interchrs | |
| 340 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
| 341 | |
| 342 foreach my $linkfile (@LINKFILES){ | |
| 343 | |
| 344 my $pid = $pm->start and next; | |
| 345 getFilteredlinks(\%CHR, \%CHRID, $linkfile); | |
| 346 $pm->finish; | |
| 347 | |
| 348 } | |
| 349 $pm->wait_all_children; | |
| 350 | |
| 351 #Merge the chromosome links file into only one | |
| 352 @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; | |
| 353 chomp(@FLINKFILES); | |
| 354 catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); | |
| 355 | |
| 356 system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY | |
| 357 print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; | |
| 358 | |
| 359 undef %CHR; | |
| 360 undef %CHRID; | |
| 361 | |
| 362 } | |
| 363 #------------------------------------------------------------------------------# | |
| 364 #------------------------------------------------------------------------------# | |
| 365 sub splitLinkFile{ | |
| 366 | |
| 367 my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; | |
| 368 | |
| 369 print LOG "# Splitting the link file for parallel processing...\n"; | |
| 370 | |
| 371 my %filesHandle; | |
| 372 | |
| 373 #fichier matefile inter | |
| 374 if($sv_type=~/^(all|inter)$/){ | |
| 375 my $newFileName="$input_prefix.interchrs.links"; | |
| 376 push(@{$files_list},$newFileName); | |
| 377 my $fh = new FileHandle; | |
| 378 $fh->open(">$newFileName"); | |
| 379 $filesHandle{inter}=$fh; | |
| 380 } | |
| 381 | |
| 382 #fichiers matefiles intra | |
| 383 if($sv_type=~/^(all|intra)$/){ | |
| 384 foreach my $k (1..$chr->{nb_chrs}){ | |
| 385 my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; | |
| 386 push(@{$files_list},$newFileName); | |
| 387 my $fh = new FileHandle; | |
| 388 $fh->open(">$newFileName"); | |
| 389 $filesHandle{$k}=$fh; | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; | |
| 394 while(<LINKS>){ | |
| 395 | |
| 396 my @t=split; | |
| 397 my ($chr_read1,$chr_read2)=($t[0],$t[3]); | |
| 398 | |
| 399 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
| 400 | |
| 401 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
| 402 | |
| 403 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ | |
| 404 my $fh2print=$filesHandle{inter}; | |
| 405 print $fh2print join("\t",@t)."\n"; | |
| 406 } | |
| 407 | |
| 408 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ | |
| 409 my $fh2print=$filesHandle{$chr_read1}; | |
| 410 print $fh2print join("\t",@t)."\n"; | |
| 411 | |
| 412 } | |
| 413 } | |
| 414 | |
| 415 foreach my $name (keys %filesHandle){ | |
| 416 my $fh=$filesHandle{$name}; | |
| 417 $fh->close; | |
| 418 } | |
| 419 | |
| 420 print LOG "# Splitted link files created.\n"; | |
| 421 } | |
| 422 | |
| 423 | |
| 424 #------------------------------------------------------------------------------# | |
| 425 #------------------------------------------------------------------------------# | |
| 426 #MAIN FUNCTION number 2: Filtering processing | |
| 427 sub getFilteredlinks { | |
| 428 | |
| 429 my ($chr,$chrID,$tmp_links_prefix)=@_; | |
| 430 my %PAIR; | |
| 431 | |
| 432 strandFiltering($chr,$chrID, | |
| 433 $CONF{filtering}{nb_pairs_threshold}, #filtering of links | |
| 434 $CONF{filtering}{strand_filtering}, | |
| 435 $CONF{filtering}{chromosomes}, | |
| 436 $CONF{general}{input_format}, | |
| 437 $CONF{general}{cmap_file}, | |
| 438 $CONF{general}{mates_orientation}, | |
| 439 $CONF{general}{read_lengths}, | |
| 440 $tmp_links_prefix, | |
| 441 "$tmp_links_prefix.filtered", | |
| 442 ); | |
| 443 | |
| 444 if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering | |
| 445 | |
| 446 my @tmpfiles; | |
| 447 | |
| 448 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); | |
| 449 | |
| 450 getUniqueLinks("$tmp_links_prefix.filtered_unique", | |
| 451 "$tmp_links_prefix.filtered"); | |
| 452 | |
| 453 push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); | |
| 454 | |
| 455 if($CONF{filtering}{order_filtering}){ #filtering using the order | |
| 456 | |
| 457 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); | |
| 458 | |
| 459 orderFiltering($chr,$chrID, | |
| 460 $CONF{filtering}{nb_pairs_threshold}, | |
| 461 $CONF{filtering}{nb_pairs_order_threshold}, | |
| 462 $CONF{filtering}{mu_length}, | |
| 463 $CONF{filtering}{sigma_length}, | |
| 464 $CONF{general}{mates_orientation}, | |
| 465 $CONF{general}{read_lengths}, | |
| 466 "$tmp_links_prefix.filtered_ordered", | |
| 467 "$tmp_links_prefix.filtered", | |
| 468 ); | |
| 469 | |
| 470 push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); | |
| 471 } | |
| 472 | |
| 473 if (($CONF{filtering}{insert_size_filtering})&& | |
| 474 ($CONF{general}{sv_type} ne 'inter')){ | |
| 475 | |
| 476 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); | |
| 477 | |
| 478 addInsertionInfo($chr,$chrID, | |
| 479 $CONF{filtering}{nb_pairs_threshold}, | |
| 480 $CONF{filtering}{order_filtering}, | |
| 481 $CONF{filtering}{indel_sigma_threshold}, | |
| 482 $CONF{filtering}{dup_sigma_threshold}, | |
| 483 $CONF{filtering}{singleton_sigma_threshold}, | |
| 484 $CONF{filtering}{mu_length}, | |
| 485 $CONF{filtering}{sigma_length}, | |
| 486 $CONF{general}{mates_orientation}, | |
| 487 $CONF{general}{read_lengths}, | |
| 488 "$tmp_links_prefix.filtered_withoutIndelSize", | |
| 489 "$tmp_links_prefix.filtered" | |
| 490 ); | |
| 491 | |
| 492 push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); | |
| 493 } | |
| 494 | |
| 495 sortLinks("$tmp_links_prefix.filtered", | |
| 496 "$tmp_links_prefix.filtered_sorted"); | |
| 497 | |
| 498 removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", | |
| 499 "$tmp_links_prefix.filtered_nodup", | |
| 500 ); | |
| 501 | |
| 502 postFiltering("$tmp_links_prefix.filtered_nodup", | |
| 503 "$tmp_links_prefix.filtered", | |
| 504 $CONF{filtering}{final_score_threshold}); | |
| 505 | |
| 506 push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); | |
| 507 | |
| 508 unlink(@tmpfiles); | |
| 509 | |
| 510 | |
| 511 } | |
| 512 undef %PAIR; | |
| 513 | |
| 514 } | |
| 515 #------------------------------------------------------------------------------# | |
| 516 #------------------------------------------------------------------------------# | |
| 517 #MAIN FUNCTION number 3: Circos format conversion for links | |
| 518 sub links2circos{ | |
| 519 | |
| 520 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
| 521 my @path=split(/\//,$input_file); | |
| 522 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
| 523 | |
| 524 my $output_file.=$input_file.".segdup.txt"; | |
| 525 | |
| 526 links2segdup($CONF{circos}{organism_id}, | |
| 527 $CONF{circos}{colorcode}, | |
| 528 $input_file, | |
| 529 $output_file); #circos file output | |
| 530 | |
| 531 system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY | |
| 532 } | |
| 533 #------------------------------------------------------------------------------# | |
| 534 #------------------------------------------------------------------------------# | |
| 535 #MAIN FUNCTION number 4: Bed format conversion for links | |
| 536 sub links2bed{ | |
| 537 | |
| 538 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
| 539 my @path=split(/\//,$input_file); | |
| 540 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
| 541 | |
| 542 my $output_file.=$input_file.".bed.txt"; | |
| 543 | |
| 544 links2bedfile($CONF{general}{read_lengths}, | |
| 545 $CONF{bed}{colorcode}, | |
| 546 $input_file, | |
| 547 $output_file); #bed file output | |
| 548 | |
| 549 system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY | |
| 550 | |
| 551 } | |
| 552 #------------------------------------------------------------------------------# | |
| 553 #------------------------------------------------------------------------------# | |
| 554 #MAIN FUNCTION number 6: Bed format conversion for links | |
| 555 sub links2SV{ | |
| 556 | |
| 557 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
| 558 | |
| 559 my @path=split(/\//,$input_file); | |
| 560 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
| 561 | |
| 562 my $output_file.=$input_file.".sv.txt"; | |
| 563 | |
| 564 | |
| 565 links2SVfile( $input_file, | |
| 566 $output_file); | |
| 567 | |
| 568 system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY | |
| 569 } | |
| 570 #------------------------------------------------------------------------------# | |
| 571 #------------------------------------------------------------------------------# | |
| 572 #MAIN FUNCTION number 7: copy number variations, coverage ratio calculation | |
| 573 sub cnv{ | |
| 574 | |
| 575 my %CHR; | |
| 576 my %CHRID; | |
| 577 my @MATEFILES; | |
| 578 my @MATEFILES_REF; | |
| 579 | |
| 580 my $output_prefix=$CONF{general}{mates_file}; | |
| 581 my $output_prefix_ref=$CONF{detection}{mates_file_ref}; | |
| 582 my @path=split(/\//,$output_prefix); | |
| 583 my @path_ref=split(/\//,$output_prefix_ref); | |
| 584 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
| 585 $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; | |
| 586 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; | |
| 587 my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; | |
| 588 my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; | |
| 589 | |
| 590 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters | |
| 591 $CONF{detection}{window_size}, | |
| 592 $CONF{detection}{step_length}, | |
| 593 $CONF{general}{cmap_file}); | |
| 594 | |
| 595 if($CONF{detection}{split_mate_file}){ | |
| 596 | |
| 597 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, | |
| 598 "intra", | |
| 599 $CONF{general}{mates_file}, | |
| 600 $CONF{general}{input_format}, | |
| 601 $CONF{general}{read_lengths} | |
| 602 ); | |
| 603 | |
| 604 splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, | |
| 605 "intra", | |
| 606 $CONF{detection}{mates_file_ref}, | |
| 607 $CONF{general}{input_format}, | |
| 608 $CONF{general}{read_lengths} | |
| 609 ); | |
| 610 | |
| 611 | |
| 612 }else{ | |
| 613 | |
| 614 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; | |
| 615 chomp(@MATEFILES); | |
| 616 @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; | |
| 617 chomp(@MATEFILES_REF); | |
| 618 print LOG "# Splitted sample and reference mate files already created.\n"; | |
| 619 } | |
| 620 | |
| 621 #Parallelization of the cnv per chromosome | |
| 622 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
| 623 | |
| 624 foreach my $file (0..$#MATEFILES){ | |
| 625 | |
| 626 my $pid = $pm->start and next; | |
| 627 | |
| 628 densityCalculation(\%CHR, \%CHRID, $file, | |
| 629 $CONF{general}{read_lengths}, | |
| 630 $CONF{detection}{window_size}, | |
| 631 $CONF{detection}{step_length}, | |
| 632 \@MATEFILES, | |
| 633 \@MATEFILES_REF, | |
| 634 $MATEFILES[$file].".density", | |
| 635 $CONF{general}{input_format}); | |
| 636 | |
| 637 $pm->finish; | |
| 638 | |
| 639 } | |
| 640 $pm->wait_all_children; | |
| 641 | |
| 642 #Merge the chromosome links file into only one | |
| 643 my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; | |
| 644 chomp(@DENSITYFILES); | |
| 645 catFiles( \@DENSITYFILES => "$output_prefix.density" ); | |
| 646 | |
| 647 print LOG "# cnv end procedure : output created: $output_prefix.density\n"; | |
| 648 | |
| 649 | |
| 650 undef %CHR; | |
| 651 undef %CHRID; | |
| 652 | |
| 653 } | |
| 654 #------------------------------------------------------------------------------# | |
| 655 #------------------------------------------------------------------------------# | |
| 656 #MAIN FUNCTION number 8: Circos format conversion for cnv ratios | |
| 657 sub ratio2circos{ | |
| 658 | |
| 659 my $input_file=$CONF{general}{mates_file}.".density"; | |
| 660 | |
| 661 my @path=split(/\//,$input_file); | |
| 662 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
| 663 | |
| 664 my $output_file.=$input_file.".segdup.txt"; | |
| 665 | |
| 666 ratio2segdup($CONF{circos}{organism_id}, | |
| 667 $input_file, | |
| 668 $output_file); | |
| 669 } | |
| 670 #------------------------------------------------------------------------------# | |
| 671 #MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios | |
| 672 sub ratio2bedgraph{ | |
| 673 | |
| 674 my $input_file=$CONF{general}{mates_file}.".density"; | |
| 675 | |
| 676 my @path=split(/\//,$input_file); | |
| 677 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
| 678 | |
| 679 my $output_file.=$input_file.".bedgraph.txt"; | |
| 680 | |
| 681 ratio2bedfile($input_file, | |
| 682 $output_file); #bed file output | |
| 683 } | |
| 684 #------------------------------------------------------------------------------# | |
| 685 #------------------------------------------------------------------------------# | |
| 686 #Creation of the fragment library | |
| 687 sub shearingChromosome{ | |
| 688 | |
| 689 print LOG "# Making the fragments library...\n"; | |
| 690 | |
| 691 my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters | |
| 692 | |
| 693 createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name | |
| 694 | |
| 695 foreach my $k (1..$chr->{nb_chrs}){ | |
| 696 | |
| 697 print LOG"-- $chr->{$k}->{name}\n"; | |
| 698 | |
| 699 my $frag=1; | |
| 700 for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ | |
| 701 | |
| 702 my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; | |
| 703 $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage | |
| 704 | |
| 705 if($end==($chr->{$k}->{length})-1){ | |
| 706 $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome | |
| 707 last; | |
| 708 } | |
| 709 $frag++; | |
| 710 } | |
| 711 } | |
| 712 } | |
| 713 #------------------------------------------------------------------------------# | |
| 714 #------------------------------------------------------------------------------# | |
| 715 #Creation of chromosome hash tables from the cmap file | |
| 716 sub createChrHashTables{ | |
| 717 | |
| 718 my ($chr,$chrID,$cmap_file)=@_; | |
| 719 $chr->{nb_chrs}=0; | |
| 720 | |
| 721 open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; | |
| 722 while(<CMAP>){ | |
| 723 | |
| 724 if(/^\s+$/){ next;} | |
| 725 my ($k,$name,$length) = split; | |
| 726 $chr->{$k}->{name}=$name; | |
| 727 $chr->{$k}->{length}=$length; | |
| 728 $chrID->{$name}=$k; | |
| 729 $chr->{nb_chrs}++; | |
| 730 | |
| 731 } | |
| 732 close CMAP; | |
| 733 } | |
| 734 #------------------------------------------------------------------------------# | |
| 735 #------------------------------------------------------------------------------# | |
| 736 #Read the mate file according the input format file (solid, eland or sam) | |
| 737 sub readMateFile{ | |
| 738 | |
| 739 my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; | |
| 740 my ($strand1,$strand2); | |
| 741 | |
| 742 if($file_type eq "solid"){ | |
| 743 | |
| 744 ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based | |
| 745 | |
| 746 }else{ | |
| 747 my ($tag_length1,$tag_length2); | |
| 748 ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based | |
| 749 if($file_type eq "eland"); | |
| 750 | |
| 751 if($file_type eq "sam"){ | |
| 752 | |
| 753 return 0 if ($$t[0]=~/^@/); #header sam filtered out | |
| 754 | |
| 755 ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); | |
| 756 | |
| 757 return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008)); | |
| 758 | |
| 759 $$chr2=$$chr1 if($$chr2 eq "="); | |
| 760 | |
| 761 $strand1 = (($$t[1]&0x0010))? 'R':'F'; | |
| 762 $strand2 = (($$t[1]&0x0020))? 'R':'F'; | |
| 763 | |
| 764 $$order1= (($$t[1]&0x0040))? '1':'2'; | |
| 765 $$order2= (($$t[1]&0x0080))? '1':'2'; | |
| 766 $tag_length1 = $tag_length->{$$order1}; | |
| 767 $tag_length2 = $tag_length->{$$order2}; | |
| 768 } | |
| 769 | |
| 770 $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts | |
| 771 $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); | |
| 772 } | |
| 773 return 1; | |
| 774 } | |
| 775 #------------------------------------------------------------------------------# | |
| 776 #------------------------------------------------------------------------------# | |
| 777 #Parsing of the mates files and creation of links between 2 chromosomal fragments | |
| 778 sub linking{ | |
| 779 | |
| 780 my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; | |
| 781 my %link; | |
| 782 | |
| 783 my $record=0; | |
| 784 my $nb_links=0; | |
| 785 my $warn=10000; | |
| 786 | |
| 787 my @sfile=split(/\./,$mates_file); | |
| 788 my $fchr=$sfile[$#sfile]; | |
| 789 | |
| 790 my $fh = new FileHandle; | |
| 791 | |
| 792 print LOG "# $fchr : Linking procedure...\n"; | |
| 793 print LOG "-- file=$mates_file\n". | |
| 794 "-- chromosome=$fchr\n". | |
| 795 "-- input format=$input_format\n". | |
| 796 "-- type=$sv_type\n". | |
| 797 "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". | |
| 798 "-- window size=$window_dist, step length=$step\n"; | |
| 799 | |
| 800 if ($mates_file =~ /.gz$/) { | |
| 801 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat | |
| 802 }elsif($mates_file =~ /.bam$/){ | |
| 803 $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
| 804 }else{ | |
| 805 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; | |
| 806 } | |
| 807 | |
| 808 | |
| 809 while(<$fh>){ | |
| 810 | |
| 811 my @t=split; #for each mate-pair | |
| 812 my $mate=$t[0]; | |
| 813 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); | |
| 814 | |
| 815 next if(exists $$pair{$mate}); | |
| 816 | |
| 817 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
| 818 | |
| 819 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
| 820 | |
| 821 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
| 822 | |
| 823 if($sv_type ne "all"){ | |
| 824 if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || | |
| 825 ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ | |
| 826 }else{ | |
| 827 next; | |
| 828 } | |
| 829 } | |
| 830 | |
| 831 $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) | |
| 832 | |
| 833 $record++; | |
| 834 | |
| 835 my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read | |
| 836 | |
| 837 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); | |
| 838 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); | |
| 839 | |
| 840 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation | |
| 841 | |
| 842 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ | |
| 843 | |
| 844 if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ | |
| 845 | |
| 846 for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ | |
| 847 | |
| 848 if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { | |
| 849 | |
| 850 if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ | |
| 851 | |
| 852 makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link | |
| 853 } | |
| 854 | |
| 855 }else{ | |
| 856 | |
| 857 $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); | |
| 858 } | |
| 859 } | |
| 860 } | |
| 861 | |
| 862 }else{ | |
| 863 | |
| 864 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); | |
| 865 } | |
| 866 } | |
| 867 | |
| 868 if($record>=$warn){ | |
| 869 print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; | |
| 870 $warn+=10000; | |
| 871 } | |
| 872 } | |
| 873 $fh->close; | |
| 874 | |
| 875 if(!$nb_links){ | |
| 876 print LOG "-- $fchr : No mate-pairs !\n". | |
| 877 "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; | |
| 878 } | |
| 879 | |
| 880 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; | |
| 881 | |
| 882 print LOG "-- $fchr : writing...\n"; | |
| 883 | |
| 884 $fh = new FileHandle; | |
| 885 | |
| 886 $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; | |
| 887 | |
| 888 foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output | |
| 889 | |
| 890 foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ | |
| 891 | |
| 892 foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ | |
| 893 | |
| 894 foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ | |
| 895 | |
| 896 my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); | |
| 897 print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". | |
| 898 "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". | |
| 899 scalar @count."\t". #nb of read | |
| 900 $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list | |
| 901 } | |
| 902 } | |
| 903 } | |
| 904 } | |
| 905 | |
| 906 $fh->close; | |
| 907 | |
| 908 undef %link; | |
| 909 | |
| 910 } | |
| 911 #------------------------------------------------------------------------------# | |
| 912 #------------------------------------------------------------------------------# | |
| 913 #remove exact links doublons according to the mate list | |
| 914 sub getUniqueLinks{ | |
| 915 | |
| 916 my ($links_file,$nrlinks_file)=@_; | |
| 917 my %links; | |
| 918 my %pt; | |
| 919 my $nb_links; | |
| 920 my $n=1; | |
| 921 | |
| 922 my $record=0; | |
| 923 my $warn=300000; | |
| 924 | |
| 925 my @sfile=split(/\./,$links_file); | |
| 926 my $fchr=$sfile[$#sfile-2]; | |
| 927 | |
| 928 my $fh = new FileHandle; | |
| 929 | |
| 930 print LOG "# $fchr : Getting unique links...\n"; | |
| 931 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 932 | |
| 933 while(<$fh>){ | |
| 934 | |
| 935 my @t=split; | |
| 936 my $mates=$t[7]; | |
| 937 $record++; | |
| 938 | |
| 939 if(!exists $links{$mates}){ #Unique links selection | |
| 940 | |
| 941 $links{$mates}=[@t]; | |
| 942 $pt{$n}=$links{$mates}; | |
| 943 $n++; | |
| 944 | |
| 945 | |
| 946 }else{ #get the link coordinates from the mate-pairs list | |
| 947 | |
| 948 for my $i (1,2,4,5){ #get the shortest regions | |
| 949 | |
| 950 $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start | |
| 951 if($i==1 || $i==4); | |
| 952 $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end | |
| 953 if($i==2 || $i==5); | |
| 954 } | |
| 955 } | |
| 956 if($record>=$warn){ | |
| 957 print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; | |
| 958 $warn+=300000; | |
| 959 } | |
| 960 } | |
| 961 $fh->close; | |
| 962 | |
| 963 $nb_links=$n-1; | |
| 964 print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; | |
| 965 | |
| 966 $fh = new FileHandle; | |
| 967 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; | |
| 968 print LOG "-- $fchr : writing...\n"; | |
| 969 for my $i (1..$nb_links){ | |
| 970 | |
| 971 print $fh join("\t",@{$pt{$i}})."\n"; #all links output | |
| 972 } | |
| 973 | |
| 974 $fh->close; | |
| 975 | |
| 976 undef %links; | |
| 977 undef %pt; | |
| 978 | |
| 979 } | |
| 980 #------------------------------------------------------------------------------# | |
| 981 #------------------------------------------------------------------------------# | |
| 982 #get the new coordinates of each link from the mate list | |
| 983 sub defineCoordsLinks{ | |
| 984 | |
| 985 my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; | |
| 986 | |
| 987 my @sfile=split(/\./,$links_file); | |
| 988 my $fchr=$sfile[$#sfile-2]; | |
| 989 | |
| 990 my $fh = new FileHandle; | |
| 991 my $fh2 = new FileHandle; | |
| 992 | |
| 993 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 994 $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; | |
| 995 | |
| 996 print LOG "# $fchr : Defining precise link coordinates...\n"; | |
| 997 | |
| 998 my $record=0; | |
| 999 my $warn=100000; | |
| 1000 | |
| 1001 my %coords; | |
| 1002 my %strands; | |
| 1003 my %order; | |
| 1004 my %ends_order; | |
| 1005 | |
| 1006 while(<$fh>){ | |
| 1007 | |
| 1008 | |
| 1009 my ($col1,$col2)=(1,2); #for an intrachromosomal link | |
| 1010 my $diffchr=0; #difference between chr1 and chr2 | |
| 1011 my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; | |
| 1012 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); | |
| 1013 if ($chr1 != $chr2){ #for an interchromosomal link | |
| 1014 $col1=$col2=0; #no distinction | |
| 1015 $diffchr=1; | |
| 1016 } | |
| 1017 | |
| 1018 my @pairs=split(",",$mates_list); | |
| 1019 | |
| 1020 $coords{$col1}{$chr1}->{start}=undef; | |
| 1021 $coords{$col1}{$chr1}->{end}=undef; | |
| 1022 $coords{$col2}{$chr2}->{start}=undef; | |
| 1023 $coords{$col2}{$chr2}->{end}=undef; | |
| 1024 $strands{$col1}{$chr1}=undef; | |
| 1025 $strands{$col2}{$chr2}=undef; | |
| 1026 $ends_order{$col1}{$chr1}=undef; | |
| 1027 $ends_order{$col2}{$chr2}=undef; | |
| 1028 | |
| 1029 | |
| 1030 $order{$col1}{$chr1}->{index}->{1}=undef; | |
| 1031 $order{$col1}{$chr1}->{index}->{2}=undef; | |
| 1032 $order{$col2}{$chr2}->{index}->{1}=undef; | |
| 1033 $order{$col2}{$chr2}->{index}->{2}=undef; | |
| 1034 $order{$col1}{$chr1}->{order}=undef; | |
| 1035 $order{$col2}{$chr2}->{order}=undef; | |
| 1036 | |
| 1037 $record++; | |
| 1038 | |
| 1039 for my $p (0..$#pairs){ #for each pair | |
| 1040 | |
| 1041 my ($coord_start_read1,$coord_end_read1); | |
| 1042 my ($coord_start_read2,$coord_end_read2); | |
| 1043 my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); | |
| 1044 my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); | |
| 1045 | |
| 1046 if(!$diffchr){ #for a intrachromosomal link | |
| 1047 if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column | |
| 1048 ($col1,$col2)=(2,1); | |
| 1049 }else{ | |
| 1050 ($col1,$col2)=(1,2); | |
| 1051 } | |
| 1052 } | |
| 1053 | |
| 1054 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads | |
| 1055 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); | |
| 1056 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); | |
| 1057 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); | |
| 1058 push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); | |
| 1059 push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); | |
| 1060 push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); | |
| 1061 push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); | |
| 1062 } | |
| 1063 | |
| 1064 ($col1,$col2)=(1,2) if(!$diffchr); | |
| 1065 | |
| 1066 my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region | |
| 1067 my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); | |
| 1068 my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); | |
| 1069 my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); | |
| 1070 | |
| 1071 @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; | |
| 1072 @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; | |
| 1073 | |
| 1074 foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) | |
| 1075 foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ | |
| 1076 | |
| 1077 if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ | |
| 1078 ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; | |
| 1079 ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; | |
| 1080 last; | |
| 1081 } | |
| 1082 } | |
| 1083 } | |
| 1084 | |
| 1085 foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID | |
| 1086 foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ | |
| 1087 | |
| 1088 if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ | |
| 1089 ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; | |
| 1090 ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; | |
| 1091 last; | |
| 1092 } | |
| 1093 } | |
| 1094 } | |
| 1095 | |
| 1096 @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads | |
| 1097 @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); | |
| 1098 @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); | |
| 1099 @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); | |
| 1100 @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); | |
| 1101 @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); | |
| 1102 @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); | |
| 1103 | |
| 1104 | |
| 1105 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output | |
| 1106 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, | |
| 1107 scalar @pairs, | |
| 1108 join(",",@pairs), | |
| 1109 join(",",@{$strands{$col1}{$chr1}}), | |
| 1110 join(",",@{$strands{$col2}{$chr2}}), | |
| 1111 join(",",@{$ends_order{$col1}{$chr1}}), | |
| 1112 join(",",@{$ends_order{$col2}{$chr2}}), | |
| 1113 join(",",@{$order{$col1}{$chr1}->{order}}), | |
| 1114 join(",",@{$order{$col2}{$chr2}->{order}}), | |
| 1115 join(",",@{$coords{$col1}{$chr1}->{start}}), | |
| 1116 join(",",@{$coords{$col2}{$chr2}->{start}})); | |
| 1117 | |
| 1118 print $fh2 join("\t",@link)."\n"; | |
| 1119 | |
| 1120 if($record>=$warn){ | |
| 1121 print LOG "-- $fchr : $warn links processed\n"; | |
| 1122 $warn+=100000; | |
| 1123 } | |
| 1124 } | |
| 1125 $fh->close; | |
| 1126 $fh2->close; | |
| 1127 | |
| 1128 print LOG "-- $fchr : Total : $record links processed\n"; | |
| 1129 | |
| 1130 } | |
| 1131 #------------------------------------------------------------------------------# | |
| 1132 #------------------------------------------------------------------------------# | |
| 1133 #Sort links according the concerned chromosomes and their coordinates | |
| 1134 sub sortLinks{ | |
| 1135 | |
| 1136 my ($links_file,$sortedlinks_file,$unique)=@_; | |
| 1137 | |
| 1138 my @sfile=split(/\./,$links_file); | |
| 1139 my $fchr=$sfile[$#sfile-2]; | |
| 1140 | |
| 1141 | |
| 1142 print LOG "# $fchr : Sorting links...\n"; | |
| 1143 | |
| 1144 my $pipe=($unique)? "| sort -u":""; | |
| 1145 system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; | |
| 1146 | |
| 1147 } | |
| 1148 #------------------------------------------------------------------------------# | |
| 1149 #------------------------------------------------------------------------------# | |
| 1150 #removal of fully overlapped links | |
| 1151 sub removeFullyOverlappedLinks{ | |
| 1152 | |
| 1153 my ($links_file,$nrlinks_file,$warn_out)=@_; | |
| 1154 | |
| 1155 my %pt; | |
| 1156 my $n=1; | |
| 1157 | |
| 1158 my @sfile=split(/\./,$links_file); | |
| 1159 my $fchr=$sfile[$#sfile-2]; | |
| 1160 | |
| 1161 my $fh = new FileHandle; | |
| 1162 | |
| 1163 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 1164 while(<$fh>){ | |
| 1165 | |
| 1166 my @t=split("\t",$_); | |
| 1167 $pt{$n}=[@t]; | |
| 1168 $n++; | |
| 1169 } | |
| 1170 $fh->close; | |
| 1171 | |
| 1172 my $nb_links=$n-1; | |
| 1173 my $nb=$nb_links; | |
| 1174 | |
| 1175 my %pt2; | |
| 1176 my $nb2=1; | |
| 1177 my $record=0; | |
| 1178 my $warn=10000; | |
| 1179 | |
| 1180 print LOG "# $fchr : Removing fully overlapped links...\n"; | |
| 1181 | |
| 1182 LINK: | |
| 1183 | |
| 1184 for my $i (1..$nb){ | |
| 1185 | |
| 1186 my @link=(); | |
| 1187 my @next_link=(); | |
| 1188 my $ind1=$i; | |
| 1189 | |
| 1190 $record++; | |
| 1191 if($record>=$warn){ | |
| 1192 print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; | |
| 1193 $warn+=10000; | |
| 1194 } | |
| 1195 | |
| 1196 if(exists $pt{$ind1}){ | |
| 1197 @link=@{$pt{$ind1}}; #link1 | |
| 1198 }else{ | |
| 1199 next LINK; | |
| 1200 } | |
| 1201 | |
| 1202 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 | |
| 1203 my @mates=deleteBadOrderSensePairs(split(",",$link[7])); | |
| 1204 | |
| 1205 my $ind2=$ind1+1; | |
| 1206 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link | |
| 1207 | |
| 1208 if($ind2<=$nb){ | |
| 1209 | |
| 1210 @next_link=@{$pt{$ind2}}; #link2 | |
| 1211 my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 | |
| 1212 my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
| 1213 | |
| 1214 while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter | |
| 1215 | |
| 1216 if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 | |
| 1217 | |
| 1218 $ind2++; | |
| 1219 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); | |
| 1220 | |
| 1221 if($ind2>$nb){ #if no more link in the file -> save link1 | |
| 1222 | |
| 1223 $pt2{$nb2}=\@link; | |
| 1224 $nb2++; | |
| 1225 next LINK; | |
| 1226 } | |
| 1227 | |
| 1228 @next_link=@{$pt{$ind2}}; | |
| 1229 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); | |
| 1230 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
| 1231 next; | |
| 1232 } | |
| 1233 | |
| 1234 my %mates=map{$_ =>1} @mates; #get the equal number of mates | |
| 1235 my @same_mates = grep( $mates{$_}, @next_mates ); | |
| 1236 my $nb_mates= scalar @same_mates; | |
| 1237 | |
| 1238 if($nb_mates == scalar @mates){ | |
| 1239 | |
| 1240 delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 | |
| 1241 next LINK; #go to link2, link2 becomes link1 | |
| 1242 | |
| 1243 }else{ | |
| 1244 delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 | |
| 1245 $ind2++; #we continue by checking the next link2 | |
| 1246 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); | |
| 1247 | |
| 1248 if($ind2>$nb){ #if no more link in the file -> save link1 | |
| 1249 | |
| 1250 $pt2{$nb2}=\@link; | |
| 1251 $nb2++; | |
| 1252 next LINK; | |
| 1253 } | |
| 1254 | |
| 1255 @next_link=@{$pt{$ind2}}; #get info of link2 | |
| 1256 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); | |
| 1257 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
| 1258 | |
| 1259 } | |
| 1260 } | |
| 1261 } | |
| 1262 $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 | |
| 1263 $nb2++; | |
| 1264 } | |
| 1265 | |
| 1266 print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; | |
| 1267 | |
| 1268 #OUTPUT | |
| 1269 | |
| 1270 $fh = new FileHandle; | |
| 1271 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; | |
| 1272 print LOG "-- $fchr : writing...\n"; | |
| 1273 for my $i (1..$nb2-1){ | |
| 1274 | |
| 1275 print $fh join("\t",@{$pt2{$i}}); #all links output | |
| 1276 } | |
| 1277 | |
| 1278 close $fh; | |
| 1279 | |
| 1280 print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); | |
| 1281 | |
| 1282 undef %pt; | |
| 1283 undef %pt2; | |
| 1284 } | |
| 1285 #------------------------------------------------------------------------------# | |
| 1286 sub postFiltering { | |
| 1287 | |
| 1288 my ($links_file,$pflinks_file, $finalScore_thres)=@_; | |
| 1289 | |
| 1290 my @sfile=split(/\./,$links_file); | |
| 1291 my $fchr=$sfile[$#sfile-2]; | |
| 1292 | |
| 1293 | |
| 1294 my ($nb,$nb2)=(0,0); | |
| 1295 | |
| 1296 print LOG "# $fchr : Post-filtering links...\n"; | |
| 1297 print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; | |
| 1298 | |
| 1299 my $fh = new FileHandle; | |
| 1300 my $fh2 = new FileHandle; | |
| 1301 | |
| 1302 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 1303 $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; | |
| 1304 | |
| 1305 | |
| 1306 while(<$fh>){ | |
| 1307 | |
| 1308 my @t=split("\t",$_); | |
| 1309 my $score=$t[$#t-1]; | |
| 1310 | |
| 1311 if($score >= $finalScore_thres){ | |
| 1312 print $fh2 join("\t", @t); | |
| 1313 $nb2++; | |
| 1314 } | |
| 1315 $nb++; | |
| 1316 } | |
| 1317 $fh->close; | |
| 1318 $fh2->close; | |
| 1319 | |
| 1320 print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; | |
| 1321 print LOG "-- $fchr : output created: $pflinks_file\n"; | |
| 1322 } | |
| 1323 | |
| 1324 | |
| 1325 | |
| 1326 #------------------------------------------------------------------------------# | |
| 1327 #------------------------------------------------------------------------------# | |
| 1328 #Filtering of the links | |
| 1329 sub strandFiltering{ | |
| 1330 | |
| 1331 my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, | |
| 1332 $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; | |
| 1333 | |
| 1334 my @sfile=split(/\./,$links_file); | |
| 1335 my $fchr=$sfile[$#sfile-1]; | |
| 1336 | |
| 1337 | |
| 1338 my %chrs; | |
| 1339 my %chrs1; | |
| 1340 my %chrs2; | |
| 1341 my $nb_chrs; | |
| 1342 my $exclude; | |
| 1343 | |
| 1344 if($chromosomes){ | |
| 1345 my @chrs=split(",",$chromosomes); | |
| 1346 $nb_chrs=scalar @chrs; | |
| 1347 $exclude=($chrs[0]=~/^\-/)? 1:0; | |
| 1348 for my $chrName (@chrs){ | |
| 1349 $chrName=~s/^(\-)//; | |
| 1350 my $col=($chrName=~s/_(1|2)$//); | |
| 1351 | |
| 1352 if(!$col){ | |
| 1353 $chrs{$chrID->{$chrName}}=undef | |
| 1354 }else{ | |
| 1355 $chrs1{$chrID->{$chrName}}=undef if($1==1); | |
| 1356 $chrs2{$chrID->{$chrName}}=undef if($1==2); | |
| 1357 } | |
| 1358 } | |
| 1359 } | |
| 1360 | |
| 1361 my $record=0; | |
| 1362 my $nb_links=0; | |
| 1363 my $warn=10000; | |
| 1364 | |
| 1365 my $sens_ratio_threshold=0.6; | |
| 1366 | |
| 1367 print LOG "\# Filtering procedure...\n"; | |
| 1368 print LOG "\# Number of pairs and strand filtering...\n"; | |
| 1369 print LOG "-- file=$links_file\n"; | |
| 1370 print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). | |
| 1371 ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; | |
| 1372 | |
| 1373 | |
| 1374 | |
| 1375 my $fh = new FileHandle; | |
| 1376 my $fh2 = new FileHandle; | |
| 1377 | |
| 1378 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 1379 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
| 1380 | |
| 1381 while(<$fh>){ | |
| 1382 | |
| 1383 my @t=split; #for each link | |
| 1384 my $is_good=1; | |
| 1385 $record++; | |
| 1386 | |
| 1387 | |
| 1388 if($chromosomes){ | |
| 1389 | |
| 1390 my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); | |
| 1391 | |
| 1392 if(!$exclude){ | |
| 1393 $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; | |
| 1394 $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); | |
| 1395 $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); | |
| 1396 }else{ | |
| 1397 $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; | |
| 1398 $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); | |
| 1399 } | |
| 1400 } | |
| 1401 | |
| 1402 $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs | |
| 1403 if($is_good && $strand_filtering){ #if filtering according the strand sense | |
| 1404 | |
| 1405 my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense | |
| 1406 my @strands1=split(/,/,$t[8]); | |
| 1407 my @strands2=split(/,/,$t[9]); | |
| 1408 | |
| 1409 my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); | |
| 1410 | |
| 1411 my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR | |
| 1412 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF | |
| 1413 | |
| 1414 my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", | |
| 1415 substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", | |
| 1416 inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); | |
| 1417 | |
| 1418 if($t[6] == 1){ | |
| 1419 | |
| 1420 push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); | |
| 1421 | |
| 1422 }else{ | |
| 1423 | |
| 1424 tie (my %class,'Tie::IxHash'); | |
| 1425 my $split; | |
| 1426 | |
| 1427 foreach my $i (0..$#mates){ | |
| 1428 $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group | |
| 1429 } | |
| 1430 | |
| 1431 my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; | |
| 1432 my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; | |
| 1433 my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); | |
| 1434 | |
| 1435 if($sens_ratio < $sens_ratio_threshold){ | |
| 1436 %class=(1=>'FF', 2=>'FR'); | |
| 1437 $split=1; | |
| 1438 }else{ | |
| 1439 $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class | |
| 1440 $split=0; | |
| 1441 } | |
| 1442 | |
| 1443 $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); | |
| 1444 } | |
| 1445 } | |
| 1446 | |
| 1447 if($is_good){ #PRINT | |
| 1448 | |
| 1449 my $nb=scalar @t; | |
| 1450 if($nb > 20){ | |
| 1451 my @t2=splice(@t,0,20); | |
| 1452 print $fh2 join("\t",@t2)."\n"; | |
| 1453 $nb_links++; | |
| 1454 } | |
| 1455 $nb_links++; | |
| 1456 print $fh2 join("\t",@t)."\n"; | |
| 1457 } | |
| 1458 | |
| 1459 if($record>=$warn){ | |
| 1460 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; | |
| 1461 $warn+=10000; | |
| 1462 } | |
| 1463 } | |
| 1464 $fh->close; | |
| 1465 $fh2->close; | |
| 1466 | |
| 1467 print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); | |
| 1468 | |
| 1469 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
| 1470 | |
| 1471 | |
| 1472 } | |
| 1473 #------------------------------------------------------------------------------# | |
| 1474 #------------------------------------------------------------------------------# | |
| 1475 sub getConsistentSenseLinks{ | |
| 1476 | |
| 1477 my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; | |
| 1478 | |
| 1479 my $npairs=scalar @$mates; | |
| 1480 | |
| 1481 my @ends_order1 = split (/,/,$$t[10]); | |
| 1482 my @ends_order2 = split (/,/,$$t[11]); | |
| 1483 my @order1 = split (/,/,$$t[12]); | |
| 1484 my @order2 = split (/,/,$$t[13]); | |
| 1485 my @positions1 = split (/,/,$$t[14]); | |
| 1486 my @positions2 = split (/,/,$$t[15]); | |
| 1487 | |
| 1488 my @newlink; | |
| 1489 | |
| 1490 foreach my $ind (keys %{$class} ){ | |
| 1491 | |
| 1492 tie (my %flink,'Tie::IxHash'); | |
| 1493 my @orders2remove=(); | |
| 1494 | |
| 1495 foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group | |
| 1496 | |
| 1497 if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ | |
| 1498 push(@{$flink{mates}},$$mates[$i]); | |
| 1499 push(@{$flink{strands1}},$$strands1[$i]); | |
| 1500 push(@{$flink{strands2}},$$strands2[$i]); | |
| 1501 push(@{$flink{ends_order1}},$ends_order1[$i]); | |
| 1502 push(@{$flink{ends_order2}},$ends_order2[$i]); | |
| 1503 push(@{$flink{positions1}},$positions1[$i]); | |
| 1504 push(@{$flink{positions2}},$positions2[$i]); | |
| 1505 | |
| 1506 }else{ | |
| 1507 push(@orders2remove,$order1[$i]); | |
| 1508 } | |
| 1509 } | |
| 1510 | |
| 1511 @{$flink{order1}}=(); | |
| 1512 @{$flink{order2}}=(); | |
| 1513 if(scalar @orders2remove > 0){ | |
| 1514 getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) | |
| 1515 }else{ | |
| 1516 @{$flink{order1}}=@order1; | |
| 1517 @{$flink{order2}}=@order2; | |
| 1518 } | |
| 1519 | |
| 1520 my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); | |
| 1521 my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); | |
| 1522 | |
| 1523 my $fnpairs=scalar @{$flink{mates}}; | |
| 1524 my $strand_filtering_ratio=$fnpairs."/".$npairs; | |
| 1525 my $real_ratio=$fnpairs/$npairs; | |
| 1526 | |
| 1527 if($fnpairs>=$thres){ #filtering according the number of pairs | |
| 1528 | |
| 1529 push(@newlink, | |
| 1530 $$t[0], | |
| 1531 min(min(@{$flink{positions1}}),min(@ends1)), | |
| 1532 max(max(@{$flink{positions1}}),max(@ends1)), | |
| 1533 $$t[3], | |
| 1534 min(min(@{$flink{positions2}}),min(@ends2)), | |
| 1535 max(max(@{$flink{positions2}}),max(@ends2)), | |
| 1536 $fnpairs, | |
| 1537 join(",",@{$flink{mates}}), | |
| 1538 join(",",@{$flink{strands1}}), | |
| 1539 join(",",@{$flink{strands2}}), | |
| 1540 join(",",@{$flink{ends_order1}}), | |
| 1541 join(",",@{$flink{ends_order2}}), | |
| 1542 join(",",@{$flink{order1}}), | |
| 1543 join(",",@{$flink{order2}}), | |
| 1544 join(",",@{$flink{positions1}}), | |
| 1545 join(",",@{$flink{positions2}}), | |
| 1546 $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, | |
| 1547 $strand_filtering_ratio, | |
| 1548 $real_ratio, | |
| 1549 $npairs | |
| 1550 ); | |
| 1551 } | |
| 1552 } | |
| 1553 | |
| 1554 if (grep {defined($_)} @newlink) { | |
| 1555 @$t=@newlink; | |
| 1556 return 1 | |
| 1557 } | |
| 1558 return 0; | |
| 1559 | |
| 1560 } | |
| 1561 #------------------------------------------------------------------------------# | |
| 1562 #------------------------------------------------------------------------------# | |
| 1563 sub getNewOrders{ | |
| 1564 | |
| 1565 my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; | |
| 1566 my $j=1; | |
| 1567 my $k=1; | |
| 1568 for my $i (0..$#{$tab2}){ | |
| 1569 my $c=0; | |
| 1570 for my $j (0..$#{$list}){ | |
| 1571 $c++ if(${$list}[$j] < ${$tab2}[$i]); | |
| 1572 if(${$list}[$j] == ${$tab2}[$i]){ | |
| 1573 $c=-1; last; | |
| 1574 } | |
| 1575 } | |
| 1576 if($c!=-1){ | |
| 1577 push(@{$newtab2}, ${$tab2}[$i]-$c); | |
| 1578 push(@{$newtab1}, $k); | |
| 1579 $k++; | |
| 1580 } | |
| 1581 } | |
| 1582 } | |
| 1583 | |
| 1584 #------------------------------------------------------------------------------# | |
| 1585 #------------------------------------------------------------------------------# | |
| 1586 #Filtering of the links using their order | |
| 1587 sub orderFiltering { | |
| 1588 | |
| 1589 my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; | |
| 1590 | |
| 1591 my @sfile=split(/\./,$links_file); | |
| 1592 my $fchr=$sfile[$#sfile-2]; | |
| 1593 | |
| 1594 | |
| 1595 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; | |
| 1596 | |
| 1597 my $record=0; | |
| 1598 my $warn=10000; | |
| 1599 my $nb_links=0; | |
| 1600 | |
| 1601 my $quant05 = 1.644854; | |
| 1602 my $quant001 = 3.090232; | |
| 1603 my $alphaDist = $quant05 * 2 * $sigma; | |
| 1604 my $maxFragmentLength = &floor($quant001 * $sigma + $mu); | |
| 1605 | |
| 1606 print LOG "\# Filtering by order...\n"; | |
| 1607 print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; | |
| 1608 print LOG "-- distance between comparable pairs was set to $alphaDist\n"; | |
| 1609 print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; | |
| 1610 | |
| 1611 | |
| 1612 my $fh = new FileHandle; | |
| 1613 my $fh2 = new FileHandle; | |
| 1614 | |
| 1615 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 1616 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
| 1617 | |
| 1618 while(<$fh>){ | |
| 1619 | |
| 1620 $record++; | |
| 1621 my @t = split; | |
| 1622 my ($chr1,$chr2,$mates_list)=@t[0,3,7]; | |
| 1623 my @pairs=split(",",$mates_list); | |
| 1624 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); | |
| 1625 my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; | |
| 1626 my $numberOfPairs = $t[6]; | |
| 1627 my @strand1 = split (/,/,$t[8]); | |
| 1628 my @strand2 = split (/,/,$t[9]); | |
| 1629 my @ends_order1 = split (/,/,$t[10]); | |
| 1630 my @ends_order2 = split (/,/,$t[11]); | |
| 1631 my @order1 = split (/,/,$t[12]); | |
| 1632 my @order2 = split (/,/,$t[13]); | |
| 1633 my @positions1 = split (/,/,$t[14]); | |
| 1634 my @positions2 = split (/,/,$t[15]); | |
| 1635 my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); | |
| 1636 my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); | |
| 1637 my $clusterCoordinates_chr1; | |
| 1638 my $clusterCoordinates_chr2; | |
| 1639 my $reads_left = 0; | |
| 1640 | |
| 1641 my $ifRenv = $t[16]; | |
| 1642 my $strand_ratio_filtering=$t[17]; | |
| 1643 | |
| 1644 #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct | |
| 1645 my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); | |
| 1646 #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct | |
| 1647 #These pairs are annotated as BED pairs forever! They won't be recycled! | |
| 1648 my $table; | |
| 1649 for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs | |
| 1650 my $nonAdeq = 0; | |
| 1651 for my $j (0..$i-1) { | |
| 1652 if (exists($table->{$j}->{$i})) { | |
| 1653 $nonAdeq++; | |
| 1654 $table->{$i}->{$j} = 1; | |
| 1655 } | |
| 1656 } | |
| 1657 for my $j ($i+1..$numberOfPairs-1) { | |
| 1658 if ($positions1[$j]-$positions1[$i]>$alphaDist) { | |
| 1659 if (&reversed ($i,$j,$ifRenv,\@positions2)) { | |
| 1660 $nonAdeq++; | |
| 1661 $table->{$i}->{$j} = 1; | |
| 1662 } | |
| 1663 } | |
| 1664 } | |
| 1665 $table->{$i}->{nonAdeq} = $nonAdeq; | |
| 1666 } | |
| 1667 | |
| 1668 for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation | |
| 1669 &remove($bad,$table); | |
| 1670 } | |
| 1671 | |
| 1672 my @falseReads; | |
| 1673 #RRRR-F -> RRRR or R-FFFF -> FFFF | |
| 1674 @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); | |
| 1675 #these pairs will be recycled later as $secondTable | |
| 1676 for my $bad (@falseReads) { | |
| 1677 &remove($bad,$table); | |
| 1678 } | |
| 1679 | |
| 1680 my $bad = &check($table); | |
| 1681 while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER | |
| 1682 # push (@falseReads, $bad); remove completely!!! | |
| 1683 &remove($bad,$table); | |
| 1684 $bad = &check($table); | |
| 1685 } | |
| 1686 | |
| 1687 $reads_left = scalar keys %{$table}; | |
| 1688 my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); | |
| 1689 my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); | |
| 1690 my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); | |
| 1691 my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); | |
| 1692 | |
| 1693 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
| 1694 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
| 1695 | |
| 1696 my $ifBalanced = 'UNBAL'; | |
| 1697 my $secondTable; | |
| 1698 my $clusterCoordinates; | |
| 1699 my ($break_pont_chr1,$break_pont_chr2); | |
| 1700 | |
| 1701 my $signatureType=""; | |
| 1702 | |
| 1703 my $maxCoord1 =$chr->{$chr1}->{length}; | |
| 1704 my $maxCoord2 =$chr->{$chr2}->{length}; | |
| 1705 | |
| 1706 if (scalar @falseReads) { | |
| 1707 @falseReads = sort @falseReads; | |
| 1708 #now delete FRFR choosing the majority | |
| 1709 my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF | |
| 1710 @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled | |
| 1711 my %hashTmp; | |
| 1712 for my $count1 (0..scalar(@falseReads)-1) { | |
| 1713 my $i = $falseReads[$count1]; | |
| 1714 $hashTmp{$i} = 1; | |
| 1715 for my $bad (@newfalseReads) { | |
| 1716 if ($bad == $i) { | |
| 1717 delete $hashTmp{$i}; | |
| 1718 next; | |
| 1719 } | |
| 1720 } | |
| 1721 } | |
| 1722 @falseReads = sort keys %hashTmp; #what is left | |
| 1723 for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected | |
| 1724 my $nonAdeq = 0; | |
| 1725 my $i = $falseReads[$count1]; | |
| 1726 | |
| 1727 for my $count2 (0..$count1-1) { | |
| 1728 my $j = $falseReads[$count2]; | |
| 1729 if (exists($secondTable->{$j}->{$i})) { | |
| 1730 $nonAdeq++; | |
| 1731 $secondTable->{$i}->{$j} = 1; | |
| 1732 } | |
| 1733 } | |
| 1734 for my $count2 ($count1+1..scalar(@falseReads)-1) { | |
| 1735 my $j = $falseReads[$count2]; | |
| 1736 if ($positions1[$j]-$positions1[$i]>$alphaDist) { | |
| 1737 if (&reversed ($i,$j,$ifRenv,\@positions2)) { | |
| 1738 $nonAdeq++; | |
| 1739 $secondTable->{$i}->{$j} = 1; | |
| 1740 } | |
| 1741 } | |
| 1742 } | |
| 1743 $secondTable->{$i}->{nonAdeq} = $nonAdeq; | |
| 1744 } | |
| 1745 | |
| 1746 my @falseReads2; | |
| 1747 my $bad = &check($secondTable); | |
| 1748 while ($bad ne "OK") { #clear the table to reject non adequate pairs | |
| 1749 push (@falseReads2, $bad); | |
| 1750 &remove($bad,$secondTable); | |
| 1751 $bad = &check($secondTable); | |
| 1752 } | |
| 1753 if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { | |
| 1754 my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); | |
| 1755 my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); | |
| 1756 my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); | |
| 1757 my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); | |
| 1758 | |
| 1759 $ifBalanced = 'BAL'; | |
| 1760 | |
| 1761 if ($ifBalanced eq 'BAL') { | |
| 1762 | |
| 1763 if (scalar keys %{$table} < $nb_pairs_order_threshold) { | |
| 1764 $ifBalanced = 'UNBAL'; #kill cluster 1! | |
| 1765 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
| 1766 $reads_left = scalar keys %{$table}; | |
| 1767 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; | |
| 1768 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; | |
| 1769 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; | |
| 1770 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; | |
| 1771 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
| 1772 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
| 1773 | |
| 1774 } else { | |
| 1775 | |
| 1776 $reads_left += scalar keys %{$secondTable}; | |
| 1777 next if ($reads_left < $nb_pairs_threshold); | |
| 1778 | |
| 1779 if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { | |
| 1780 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
| 1781 | |
| 1782 ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); | |
| 1783 ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); | |
| 1784 ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); | |
| 1785 ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); | |
| 1786 | |
| 1787 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; | |
| 1788 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; | |
| 1789 }else { | |
| 1790 $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; | |
| 1791 $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; | |
| 1792 } | |
| 1793 $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); | |
| 1794 $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); | |
| 1795 $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); | |
| 1796 $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); | |
| 1797 #to calculate breakpoints one need to take into account read orientation in claster.. | |
| 1798 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
| 1799 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
| 1800 | |
| 1801 | |
| 1802 my @index1 = keys %{$table}; | |
| 1803 my @index2 = keys %{$secondTable}; | |
| 1804 | |
| 1805 my (@generalStrand1,@generalStrand2) = 0; | |
| 1806 | |
| 1807 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
| 1808 $leftLetterOk = 'R'; | |
| 1809 $rightLetterOk = 'F'; | |
| 1810 @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); | |
| 1811 @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); | |
| 1812 } else { | |
| 1813 @generalStrand1 = @strand1; | |
| 1814 @generalStrand2 = @strand2; # TODO check if it is correct | |
| 1815 } | |
| 1816 if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) | |
| 1817 $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; | |
| 1818 | |
| 1819 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { | |
| 1820 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
| 1821 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; | |
| 1822 $signatureType = "TRANSLOC"; | |
| 1823 } else { | |
| 1824 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; | |
| 1825 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; | |
| 1826 $signatureType = "INS_FRAGMT"; | |
| 1827 } | |
| 1828 | |
| 1829 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { | |
| 1830 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
| 1831 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; | |
| 1832 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; | |
| 1833 $signatureType = "INV_INS_FRAGMT"; | |
| 1834 } else { | |
| 1835 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; | |
| 1836 $signatureType = "INV_TRANSLOC"; | |
| 1837 } | |
| 1838 } else { | |
| 1839 #should not occur | |
| 1840 print STDERR "\nError in orderFiltering\n\n"; | |
| 1841 } | |
| 1842 } | |
| 1843 | |
| 1844 elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) | |
| 1845 $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; | |
| 1846 $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; | |
| 1847 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { | |
| 1848 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
| 1849 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; | |
| 1850 $signatureType = "INV_INS_FRAGMT"; | |
| 1851 } else { | |
| 1852 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; | |
| 1853 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; | |
| 1854 $signatureType = "INV_COAMPLICON"; | |
| 1855 } | |
| 1856 | |
| 1857 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { | |
| 1858 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
| 1859 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; | |
| 1860 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; | |
| 1861 $signatureType = "COAMPLICON"; | |
| 1862 } else { | |
| 1863 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; | |
| 1864 $signatureType = "INS_FRAGMT"; | |
| 1865 } | |
| 1866 } else { | |
| 1867 #should not occur | |
| 1868 $signatureType = "UNDEFINED"; | |
| 1869 } | |
| 1870 } | |
| 1871 else { # (F,F) or (R,R) something strange. We will discard the smallest cluster | |
| 1872 $ifBalanced = 'UNBAL'; | |
| 1873 if (scalar keys %{$secondTable} > scalar keys %{$table}) { | |
| 1874 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
| 1875 | |
| 1876 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; | |
| 1877 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; | |
| 1878 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; | |
| 1879 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; | |
| 1880 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
| 1881 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
| 1882 } | |
| 1883 $reads_left = scalar keys %{$table}; | |
| 1884 } | |
| 1885 if ($ifBalanced eq 'BAL') { | |
| 1886 $ifRenv = $signatureType; | |
| 1887 } | |
| 1888 } | |
| 1889 } | |
| 1890 } | |
| 1891 } | |
| 1892 if ($ifBalanced ne 'BAL') { | |
| 1893 #define possible break point | |
| 1894 $coord_start_chr1 = $coord_start_chr1_cluster1; | |
| 1895 $coord_end_chr1 = $coord_end_chr1_cluster1; | |
| 1896 $coord_start_chr2 = $coord_start_chr2_cluster1; | |
| 1897 $coord_end_chr2 = $coord_end_chr2_cluster1; | |
| 1898 | |
| 1899 my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; | |
| 1900 my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; | |
| 1901 | |
| 1902 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
| 1903 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
| 1904 | |
| 1905 my @index = keys %{$table}; | |
| 1906 unless ($diff_sense_ends) { | |
| 1907 my $firstEndOrder1 = $ends_order1[$index[0]]; | |
| 1908 my $firstEndOrder2 = $ends_order2[$index[0]]; | |
| 1909 $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; | |
| 1910 $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; | |
| 1911 } else { | |
| 1912 $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; | |
| 1913 $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; | |
| 1914 } | |
| 1915 | |
| 1916 if ($chr1 ne $chr2){ | |
| 1917 $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); | |
| 1918 $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); | |
| 1919 } | |
| 1920 } | |
| 1921 | |
| 1922 if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { | |
| 1923 next; #discard the link | |
| 1924 } | |
| 1925 if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { | |
| 1926 next; #discard the link | |
| 1927 } | |
| 1928 my $ratioTxt = "$reads_left/".(scalar @pairs); | |
| 1929 my ($n1,$nTot) = split ("/",$strand_ratio_filtering); | |
| 1930 my $ratioReal = $reads_left/$nTot; | |
| 1931 | |
| 1932 if ($coord_start_chr1<=0) { | |
| 1933 $coord_start_chr1=1; | |
| 1934 } | |
| 1935 if ($coord_start_chr2<=0) { | |
| 1936 $coord_start_chr2=1; | |
| 1937 } | |
| 1938 #create output | |
| 1939 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output | |
| 1940 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, | |
| 1941 $reads_left, | |
| 1942 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), | |
| 1943 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), | |
| 1944 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), | |
| 1945 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), | |
| 1946 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), | |
| 1947 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), | |
| 1948 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), | |
| 1949 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), | |
| 1950 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), | |
| 1951 $ifRenv, | |
| 1952 $strand_ratio_filtering, | |
| 1953 $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, | |
| 1954 $ratioReal, $nTot); | |
| 1955 | |
| 1956 $nb_links++; | |
| 1957 print $fh2 join("\t",@link)."\n"; | |
| 1958 | |
| 1959 if($record>=$warn){ | |
| 1960 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; | |
| 1961 $warn+=10000; | |
| 1962 } | |
| 1963 | |
| 1964 } | |
| 1965 $fh->close; | |
| 1966 $fh2->close; | |
| 1967 | |
| 1968 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
| 1969 | |
| 1970 } | |
| 1971 #------------------------------------------------------------------------------# | |
| 1972 #------------------------------------------------------------------------------# | |
| 1973 #gets information about ends positions given start, direction and order | |
| 1974 sub getEnds { | |
| 1975 my ($ends,$starts,$strand,$end_order,$tag_length) = @_; | |
| 1976 for my $i (0..scalar(@{$starts})-1) { | |
| 1977 $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); | |
| 1978 } | |
| 1979 } | |
| 1980 sub getEnd { | |
| 1981 my ($start,$strand, $end_order,$tag_length) = @_; | |
| 1982 return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; | |
| 1983 } | |
| 1984 #------------------------------------------------------------------------------# | |
| 1985 #------------------------------------------------------------------------------# | |
| 1986 #gets starts and ends Coords when start=leftmost given positions, directions and orders | |
| 1987 sub getCoordswithLeftMost { | |
| 1988 | |
| 1989 my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; | |
| 1990 | |
| 1991 for my $i (0..scalar(@{$positions})-1) { | |
| 1992 | |
| 1993 if($strand->[$i] eq 'F'){ | |
| 1994 $starts->[$i]=$positions->[$i]; | |
| 1995 $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; | |
| 1996 }else{ | |
| 1997 $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; | |
| 1998 $ends->[$i]=$positions->[$i]; | |
| 1999 } | |
| 2000 } | |
| 2001 } | |
| 2002 #------------------------------------------------------------------------------# | |
| 2003 #------------------------------------------------------------------------------# | |
| 2004 sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering | |
| 2005 | |
| 2006 my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; | |
| 2007 | |
| 2008 my @sfile=split(/\./,$links_file); | |
| 2009 my $fchr=$sfile[$#sfile-2]; | |
| 2010 | |
| 2011 | |
| 2012 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; | |
| 2013 | |
| 2014 my $record=0; | |
| 2015 my $nb_links=0; | |
| 2016 my $warn=10000; | |
| 2017 | |
| 2018 print LOG "\# Filtering out normal pairs using insert size...\n"; | |
| 2019 print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; | |
| 2020 print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". | |
| 2021 ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; | |
| 2022 print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". | |
| 2023 ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; | |
| 2024 print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); | |
| 2025 | |
| 2026 my $fh = new FileHandle; | |
| 2027 my $fh2 = new FileHandle; | |
| 2028 | |
| 2029 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
| 2030 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
| 2031 | |
| 2032 while(<$fh>){ | |
| 2033 | |
| 2034 $record++; | |
| 2035 my @t = split; | |
| 2036 my ($chr1,$chr2,$mates_list)=@t[0,3,7]; | |
| 2037 | |
| 2038 if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), | |
| 2039 $nb_links++; | |
| 2040 | |
| 2041 $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); | |
| 2042 $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); | |
| 2043 | |
| 2044 $t[16].= "\t"; | |
| 2045 $t[19].= "\t"; | |
| 2046 | |
| 2047 print $fh2 join("\t",@t)."\n"; | |
| 2048 | |
| 2049 if($record>=$warn){ | |
| 2050 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; | |
| 2051 $warn+=10000; | |
| 2052 } | |
| 2053 next; | |
| 2054 } | |
| 2055 | |
| 2056 my $ifRenv = $t[16]; | |
| 2057 my $ifBalanced = "UNBAL"; | |
| 2058 $ifBalanced = $t[18] if ($order_filtering); | |
| 2059 | |
| 2060 my $numberOfPairs = $t[6]; | |
| 2061 my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); | |
| 2062 my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); | |
| 2063 | |
| 2064 if ($ifBalanced eq "BAL") { | |
| 2065 | |
| 2066 if ($ifRenv eq "INV_TRANSLOC") { | |
| 2067 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment | |
| 2068 } | |
| 2069 if ($ifRenv eq "NORMAL_SENSE") { | |
| 2070 $ifRenv = "TRANSLOC"; | |
| 2071 } | |
| 2072 if ($ifRenv eq "REVERSE_SENSE") { | |
| 2073 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment | |
| 2074 } | |
| 2075 $t[19].= "\t"; | |
| 2076 | |
| 2077 my $meanDistance = 0; | |
| 2078 | |
| 2079 for my $i (0..$numberOfPairs-1) { | |
| 2080 $meanDistance += $positions2[$i]-$positions1[$i]; | |
| 2081 } | |
| 2082 $meanDistance /= $numberOfPairs; | |
| 2083 | |
| 2084 $t[16] = $ifRenv."\t".$meanDistance; | |
| 2085 #dont touch the annotation. It should be already OK. | |
| 2086 | |
| 2087 } else { | |
| 2088 #only for unbalanced | |
| 2089 | |
| 2090 my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); | |
| 2091 | |
| 2092 my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. | |
| 2093 (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; | |
| 2094 my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. | |
| 2095 (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; | |
| 2096 | |
| 2097 my $indel_type = $ifRenv; | |
| 2098 | |
| 2099 my $meanDistance = "N/A"; | |
| 2100 | |
| 2101 ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster | |
| 2102 \@positions1, #assign N/A to $indel_type if unknown | |
| 2103 \@positions2, | |
| 2104 $ifRenv, | |
| 2105 $ifoverlap, | |
| 2106 $indel_sigma_threshold, | |
| 2107 $dup_sigma_threshold, | |
| 2108 $singleton_sigma_threshold, | |
| 2109 $mu, | |
| 2110 $sigma, | |
| 2111 $ifBalanced, | |
| 2112 $ends_sense_class, | |
| 2113 $ends_order_class, | |
| 2114 $mate_sense, | |
| 2115 $diff_sense_ends, | |
| 2116 ); | |
| 2117 | |
| 2118 #filtering of pairs with distance inconsistant with the SV | |
| 2119 if ($ifRenv ne "REVERSE_SENSE") { | |
| 2120 my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; | |
| 2121 my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; | |
| 2122 $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, | |
| 2123 $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); | |
| 2124 next if ($t[6] < $nb_pairs_threshold); | |
| 2125 }else{ | |
| 2126 $t[19].= "\t"; | |
| 2127 } | |
| 2128 $t[16] = $indel_type."\t".$meanDistance; | |
| 2129 } | |
| 2130 | |
| 2131 $nb_links++; | |
| 2132 | |
| 2133 print $fh2 join("\t",@t)."\n"; | |
| 2134 if($record>=$warn){ | |
| 2135 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; | |
| 2136 $warn+=10000; | |
| 2137 } | |
| 2138 } | |
| 2139 $fh->close; | |
| 2140 $fh2->close; | |
| 2141 | |
| 2142 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
| 2143 | |
| 2144 } | |
| 2145 #------------------------------------------------------------------------------# | |
| 2146 #------------------------------------------------------------------------------# | |
| 2147 sub checkIndel { | |
| 2148 | |
| 2149 my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, | |
| 2150 $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; | |
| 2151 | |
| 2152 my $meanDistance = 0; | |
| 2153 | |
| 2154 for my $i (0..$numberOfPairs-1) { | |
| 2155 $meanDistance += $positions2->[$i]-$positions1->[$i]; | |
| 2156 } | |
| 2157 $meanDistance /= $numberOfPairs; | |
| 2158 | |
| 2159 return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); | |
| 2160 | |
| 2161 return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); | |
| 2162 | |
| 2163 if($diff_sense_ends){ | |
| 2164 return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; | |
| 2165 return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); | |
| 2166 }else{ | |
| 2167 return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && | |
| 2168 ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; | |
| 2169 } | |
| 2170 | |
| 2171 return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); | |
| 2172 | |
| 2173 return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; | |
| 2174 | |
| 2175 return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); | |
| 2176 return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); | |
| 2177 | |
| 2178 return ($meanDistance, "UNDEFINED"); | |
| 2179 } | |
| 2180 #------------------------------------------------------------------------------# | |
| 2181 #------------------------------------------------------------------------------# | |
| 2182 #sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) | |
| 2183 sub recalc_t_usingInsertSizeInfo { | |
| 2184 my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; | |
| 2185 | |
| 2186 my @badPairs; | |
| 2187 | |
| 2188 my @positions1 = getAllEntries($t->[14]); | |
| 2189 my @positions2 = getAllEntries($t->[15]); | |
| 2190 | |
| 2191 if ($meanDistance < $mu) { | |
| 2192 for my $i (0..scalar(@positions1)-1) { | |
| 2193 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { | |
| 2194 push(@badPairs,$i); | |
| 2195 } | |
| 2196 } | |
| 2197 } else { | |
| 2198 for my $i (0..scalar(@positions1)-1) { | |
| 2199 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { | |
| 2200 push(@badPairs,$i); | |
| 2201 } | |
| 2202 } | |
| 2203 } | |
| 2204 | |
| 2205 if (scalar (@badPairs)>0) { | |
| 2206 #print join("\t",@badPairs).": ".join("\t",@t)."\n"; | |
| 2207 #remove these inconsistant links | |
| 2208 $t->[6] -= scalar(@badPairs); #numberOfPairs | |
| 2209 return if ($t->[6] < $nb_pairs_threshold); | |
| 2210 | |
| 2211 $t->[7] = mark_values(\@badPairs, $t->[7]); | |
| 2212 $t->[8] = mark_values(\@badPairs, $t->[8]); | |
| 2213 $t->[9] = mark_values(\@badPairs, $t->[9]); | |
| 2214 $t->[10] = mark_values(\@badPairs, $t->[10]); | |
| 2215 $t->[11] = mark_values(\@badPairs, $t->[11]); | |
| 2216 | |
| 2217 $t->[12] = mark_indexes(\@badPairs, $t->[12]); | |
| 2218 $t->[13] = mark_indexes(\@badPairs, $t->[13]); | |
| 2219 | |
| 2220 $t->[14] = mark_values(\@badPairs, $t->[14]); | |
| 2221 $t->[15] = mark_values(\@badPairs, $t->[15]); | |
| 2222 $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio | |
| 2223 $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); | |
| 2224 ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); | |
| 2225 ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); | |
| 2226 | |
| 2227 #recalc breakpoints: | |
| 2228 my $quant001 = 3.090232; | |
| 2229 my $maxFragmentLength = &floor($quant001 * $sigma + $mu); | |
| 2230 $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); | |
| 2231 $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); | |
| 2232 #recalc total ratio | |
| 2233 $t->[22] = $t->[6] / $t->[23] if ($order_filtering); | |
| 2234 $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); | |
| 2235 | |
| 2236 @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); | |
| 2237 @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); | |
| 2238 | |
| 2239 $meanDistance = 0; | |
| 2240 | |
| 2241 for my $i (0..scalar(@positions1)-1) { | |
| 2242 $meanDistance += $positions2[$i]-$positions1[$i]; | |
| 2243 } | |
| 2244 $meanDistance /= scalar(@positions1); | |
| 2245 | |
| 2246 } else { | |
| 2247 $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); | |
| 2248 $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); | |
| 2249 | |
| 2250 } #nothing has been filtered | |
| 2251 return $meanDistance; | |
| 2252 } | |
| 2253 | |
| 2254 sub recalculate_ratio { | |
| 2255 my ($left, $ratio) = @_; | |
| 2256 my @elements = split (/\//,$ratio); | |
| 2257 $elements[1]= $elements[0]; | |
| 2258 $elements[0]=$left; | |
| 2259 return $ratio."\t".join("/",@elements); | |
| 2260 } | |
| 2261 | |
| 2262 sub recalc_breakpoints { | |
| 2263 my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; | |
| 2264 my $break_pont_chr; | |
| 2265 | |
| 2266 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
| 2267 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
| 2268 | |
| 2269 | |
| 2270 my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); | |
| 2271 | |
| 2272 unless ($diff_sense_ends) { | |
| 2273 $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; | |
| 2274 } else { | |
| 2275 $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; | |
| 2276 } | |
| 2277 return $break_pont_chr; | |
| 2278 } | |
| 2279 sub recalculate_boundaries { | |
| 2280 my ($startString,$senseString,$endsOrderString,$tag_length) = @_; | |
| 2281 my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); | |
| 2282 my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); | |
| 2283 my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); | |
| 2284 my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); | |
| 2285 my $coord_start_cluster = min(min(@positions),min(@ends)); | |
| 2286 my $coord_end_cluster = max(max(@positions),max(@ends)); | |
| 2287 return ($coord_start_cluster,$coord_end_cluster); | |
| 2288 } | |
| 2289 | |
| 2290 sub remove_indexes { | |
| 2291 my ($bads, $string) = @_; | |
| 2292 my @elements = deleteBadOrderSensePairs(split (/,/,$string)); | |
| 2293 for my $i (reverse sort %{$bads}) { | |
| 2294 delete $elements[$i]; | |
| 2295 } | |
| 2296 return "(".join(",",@elements).")"; | |
| 2297 } | |
| 2298 ##add @ to to elements | |
| 2299 sub mark_values { | |
| 2300 my ($bads, $string) = @_; | |
| 2301 my @elements = getAllEntries($string); | |
| 2302 for my $i (@{$bads}) { | |
| 2303 $elements[$i] .= "@"; | |
| 2304 } | |
| 2305 return "(".join(",",@elements).")"; | |
| 2306 } | |
| 2307 ##add @ to to indexes | |
| 2308 sub mark_indexes { | |
| 2309 my ($bads, $string) = @_; | |
| 2310 my @elements = getAllEntries($string); | |
| 2311 for my $i ((0..scalar(@elements)-1)) { | |
| 2312 for my $j (@{$bads}) { | |
| 2313 $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); | |
| 2314 } | |
| 2315 } | |
| 2316 | |
| 2317 return "(".join(",",@elements).")"; | |
| 2318 } | |
| 2319 | |
| 2320 #------------------------------------------------------------------------------# | |
| 2321 #------------------------------------------------------------------------------# | |
| 2322 sub redraw { | |
| 2323 | |
| 2324 my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; | |
| 2325 | |
| 2326 my $out; | |
| 2327 my @first_arr; | |
| 2328 if ($ifBalanced eq 'BAL') { | |
| 2329 my @second_arr; | |
| 2330 my $lastPushed = 1; | |
| 2331 if ($type == 1) { | |
| 2332 for my $i (0 .. scalar(@{$arr})-1) { | |
| 2333 if (exists ($table->{$i})) { | |
| 2334 push(@first_arr,$arr->[$i]); | |
| 2335 $lastPushed = 1; | |
| 2336 }elsif (exists ($secondTable->{$i})) { | |
| 2337 push(@second_arr,$arr->[$i]); | |
| 2338 $lastPushed = 2; | |
| 2339 } elsif ($lastPushed == 1) { | |
| 2340 if (exists ($badInFRSense->{$i})) { | |
| 2341 push(@first_arr,$arr->[$i]."\$"); | |
| 2342 }else { | |
| 2343 push(@first_arr,$arr->[$i]."*"); | |
| 2344 } | |
| 2345 } elsif ($lastPushed == 2) { | |
| 2346 if (exists ($badInFRSense->{$i})) { | |
| 2347 push(@second_arr,$arr->[$i]."\$"); | |
| 2348 }else { | |
| 2349 push(@second_arr,$arr->[$i]."*"); | |
| 2350 } | |
| 2351 } else {print "Error!";exit;} | |
| 2352 } | |
| 2353 } else { | |
| 2354 for my $i (@{$arr}) { | |
| 2355 if (exists ($table->{$i-1})) { | |
| 2356 push(@first_arr,$i); | |
| 2357 $lastPushed = 1; | |
| 2358 }elsif (exists ($secondTable->{$i-1})) { | |
| 2359 push(@second_arr,$i); | |
| 2360 $lastPushed = 2; | |
| 2361 } elsif ($lastPushed == 1) { | |
| 2362 if (exists ($badInFRSense->{$i-1})) { | |
| 2363 push(@first_arr,$i."\$"); | |
| 2364 }else { | |
| 2365 push(@first_arr,$i."*"); | |
| 2366 } | |
| 2367 } elsif ($lastPushed == 2) { | |
| 2368 if (exists ($badInFRSense->{$i-1})) { | |
| 2369 push(@second_arr,$i."\$"); | |
| 2370 }else { | |
| 2371 push(@second_arr,$i."*"); | |
| 2372 } | |
| 2373 } else {print "Error!";exit;} | |
| 2374 } | |
| 2375 } | |
| 2376 $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; | |
| 2377 } | |
| 2378 else { | |
| 2379 if ($type == 1) { | |
| 2380 for my $i (0 .. scalar(@{$arr})-1) { | |
| 2381 if (exists ($table->{$i})) { | |
| 2382 push(@first_arr,$arr->[$i]); | |
| 2383 } else { | |
| 2384 if (exists ($badInFRSense->{$i})) { | |
| 2385 push(@first_arr,$arr->[$i]."\$"); | |
| 2386 }else { | |
| 2387 push(@first_arr,$arr->[$i]."*"); | |
| 2388 } | |
| 2389 } | |
| 2390 } | |
| 2391 } else { | |
| 2392 for my $i (@{$arr}) { | |
| 2393 if (exists ($table->{$i-1})) { | |
| 2394 push(@first_arr,$i); | |
| 2395 } else { | |
| 2396 if (exists ($badInFRSense->{$i-1})) { | |
| 2397 push(@first_arr,$i."\$"); | |
| 2398 }else { | |
| 2399 push(@first_arr,$i."*"); | |
| 2400 } | |
| 2401 } | |
| 2402 } | |
| 2403 } | |
| 2404 $out = '('.join(",",@first_arr).')'; | |
| 2405 } | |
| 2406 return $out; | |
| 2407 } | |
| 2408 #------------------------------------------------------------------------------# | |
| 2409 #------------------------------------------------------------------------------# | |
| 2410 sub check { | |
| 2411 | |
| 2412 my $table = $_[0]; | |
| 2413 my $bad = 'OK'; | |
| 2414 my $max = 0; | |
| 2415 for my $i (sort {$a<=>$b} keys %{$table}) { | |
| 2416 unless ($table->{$i}->{nonAdeq} == 0) { | |
| 2417 if ($max<$table->{$i}->{nonAdeq}) { | |
| 2418 $max=$table->{$i}->{nonAdeq}; | |
| 2419 $bad = $i; | |
| 2420 } | |
| 2421 } | |
| 2422 } | |
| 2423 return $bad; | |
| 2424 } | |
| 2425 #------------------------------------------------------------------------------# | |
| 2426 #------------------------------------------------------------------------------# | |
| 2427 sub reversed { | |
| 2428 | |
| 2429 my ($i,$j,$ifRenv,$positions) = @_; | |
| 2430 if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ | |
| 2431 return 1; | |
| 2432 } | |
| 2433 return 0; | |
| 2434 } | |
| 2435 #------------------------------------------------------------------------------# | |
| 2436 #------------------------------------------------------------------------------# | |
| 2437 sub remove { | |
| 2438 | |
| 2439 my ($bad,$table) = @_; | |
| 2440 for my $i (sort {$a<=>$b} keys %{$table}) { | |
| 2441 if ($bad == $i) { | |
| 2442 delete($table->{$i});; | |
| 2443 } else { | |
| 2444 if (exists($table->{$i}->{$bad})) { | |
| 2445 delete($table->{$i}->{$bad}); | |
| 2446 $table->{$i}->{nonAdeq}--; | |
| 2447 } | |
| 2448 } | |
| 2449 } | |
| 2450 } | |
| 2451 #------------------------------------------------------------------------------# | |
| 2452 #------------------------------------------------------------------------------# | |
| 2453 sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs | |
| 2454 | |
| 2455 my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; | |
| 2456 | |
| 2457 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
| 2458 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
| 2459 | |
| 2460 my (@standardArray); | |
| 2461 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
| 2462 $leftLetterOk = 'R'; | |
| 2463 $rightLetterOk = 'F'; | |
| 2464 @standardArray = translateSolidToRF($strand,$ends_order); | |
| 2465 } else { | |
| 2466 @standardArray = @{$strand}; | |
| 2467 } | |
| 2468 | |
| 2469 my $ifR = 0; | |
| 2470 my @Rs; | |
| 2471 | |
| 2472 for my $i (@keysLeft) { | |
| 2473 if ($standardArray[$i] eq $leftLetterOk) { | |
| 2474 $ifR++; | |
| 2475 push(@Rs,$i); | |
| 2476 } | |
| 2477 } | |
| 2478 | |
| 2479 | |
| 2480 my $ifF = 0; | |
| 2481 my @Fs; | |
| 2482 | |
| 2483 for my $i (@keysLeft) { | |
| 2484 if ($standardArray[$i] eq $rightLetterOk) { | |
| 2485 $ifF++; | |
| 2486 push(@Fs,$i); | |
| 2487 } | |
| 2488 } | |
| 2489 | |
| 2490 if($ifR>=$ifF) { | |
| 2491 return @Fs; | |
| 2492 } | |
| 2493 return @Rs; | |
| 2494 } | |
| 2495 | |
| 2496 #------------------------------------------------------------------------------# | |
| 2497 #------------------------------------------------------------------------------# | |
| 2498 sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa | |
| 2499 | |
| 2500 my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); | |
| 2501 my $mate_sense = $_[6]; | |
| 2502 | |
| 2503 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
| 2504 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
| 2505 | |
| 2506 my (@standardArray1,@standardArray2); | |
| 2507 | |
| 2508 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
| 2509 $leftLetterOk = 'R'; | |
| 2510 $rightLetterOk = 'F'; | |
| 2511 @standardArray1 = translateSolidToRF($strand1,$ends_order1); | |
| 2512 my @arr = getOrderedStrands($strand2,$order2); | |
| 2513 my @ends2 = getOrderedStrands($ends_order2,$order2); | |
| 2514 @standardArray2 = translateSolidToRF(\@arr,\@ends2); | |
| 2515 | |
| 2516 } else { | |
| 2517 @standardArray1 = @{$strand1}; | |
| 2518 @standardArray2 = getOrderedStrands($strand2,$order2); | |
| 2519 } | |
| 2520 | |
| 2521 #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF | |
| 2522 | |
| 2523 #for the first end: | |
| 2524 | |
| 2525 my @array = @standardArray1; | |
| 2526 my %badInFRSense1; | |
| 2527 for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF | |
| 2528 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
| 2529 $badInFRSense1{$i}=1; | |
| 2530 $array[$i] = $rightLetterOk; | |
| 2531 } | |
| 2532 } | |
| 2533 my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); | |
| 2534 @array = @standardArray1; | |
| 2535 my %badInFRSense0; | |
| 2536 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR | |
| 2537 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
| 2538 $badInFRSense0{$i-1}=1; | |
| 2539 $array[$i-1] = $leftLetterOk; | |
| 2540 | |
| 2541 } | |
| 2542 } | |
| 2543 my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); | |
| 2544 | |
| 2545 #for the second end: | |
| 2546 @array = @standardArray2; | |
| 2547 | |
| 2548 my %badInFRSense3; | |
| 2549 for my $i (1..scalar(@array)-1){ | |
| 2550 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
| 2551 $badInFRSense3{$order2->[$i]}=1; | |
| 2552 $array[$i] = $rightLetterOk; | |
| 2553 } | |
| 2554 } | |
| 2555 my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); | |
| 2556 | |
| 2557 @array = @standardArray2; | |
| 2558 my %badInFRSense5; | |
| 2559 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF | |
| 2560 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
| 2561 $badInFRSense5{$i-1}=1; | |
| 2562 $array[$i-1] = $leftLetterOk; | |
| 2563 } | |
| 2564 } | |
| 2565 my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); | |
| 2566 | |
| 2567 if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { | |
| 2568 return (1,%badInFRSense0); | |
| 2569 } | |
| 2570 | |
| 2571 if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { | |
| 2572 return (1,%badInFRSense1); | |
| 2573 } | |
| 2574 | |
| 2575 if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { | |
| 2576 return (2,%badInFRSense3); | |
| 2577 } | |
| 2578 | |
| 2579 if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { | |
| 2580 return (2,%badInFRSense5); | |
| 2581 } | |
| 2582 | |
| 2583 #should not get here: | |
| 2584 print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; | |
| 2585 return (1,%badInFRSense1); | |
| 2586 } | |
| 2587 | |
| 2588 sub getOrderedStrands { | |
| 2589 my ($strand,$order) = ($_[0],$_[1]); | |
| 2590 my @arr; | |
| 2591 for my $i (0..scalar(@{$strand})-1) { | |
| 2592 push(@arr,$strand->[$order->[$i]-1]); | |
| 2593 } | |
| 2594 return @arr; | |
| 2595 } | |
| 2596 #------------------------------------------------------------------------------# | |
| 2597 #------------------------------------------------------------------------------# | |
| 2598 sub checkClusters { | |
| 2599 | |
| 2600 my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; | |
| 2601 if ($ifRenv eq 'REVERSE_SENSE') { | |
| 2602 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { | |
| 2603 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; | |
| 2604 } | |
| 2605 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; | |
| 2606 } | |
| 2607 #if NORM | |
| 2608 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { | |
| 2609 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; | |
| 2610 } | |
| 2611 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; | |
| 2612 } | |
| 2613 | |
| 2614 #------------------------------------------------------------------------------# | |
| 2615 #------------------------------------------------------------------------------# | |
| 2616 sub translateSolidToRF { | |
| 2617 my ($strandArr,$ends_orderArr)=@_; | |
| 2618 my @array; | |
| 2619 for my $i (0..scalar(@{$strandArr})-1) { | |
| 2620 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { | |
| 2621 push(@array,'F'); | |
| 2622 } | |
| 2623 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { | |
| 2624 push(@array,'R'); | |
| 2625 } | |
| 2626 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { | |
| 2627 push(@array,'R'); | |
| 2628 } | |
| 2629 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { | |
| 2630 push(@array,'F'); | |
| 2631 } | |
| 2632 } | |
| 2633 return @array; | |
| 2634 } | |
| 2635 | |
| 2636 #------------------------------------------------------------------------------# | |
| 2637 #------------------------------------------------------------------------------# | |
| 2638 #convert the links file to the circos format | |
| 2639 sub links2segdup{ | |
| 2640 | |
| 2641 my($id,$color_code,$links_file,$segdup_file)=@_; | |
| 2642 | |
| 2643 print LOG "# Converting to the circos format...\n"; | |
| 2644 | |
| 2645 tie (my %hcolor,'Tie::IxHash'); #color-code hash table | |
| 2646 foreach my $col (keys %{$color_code}){ | |
| 2647 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
| 2648 $hcolor{$col}=[$min_links,$max_links]; | |
| 2649 } | |
| 2650 | |
| 2651 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
| 2652 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; | |
| 2653 | |
| 2654 my $index=1; | |
| 2655 while(<LINKS>){ | |
| 2656 | |
| 2657 my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; | |
| 2658 | |
| 2659 my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links | |
| 2660 | |
| 2661 print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output | |
| 2662 "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; | |
| 2663 $index++; | |
| 2664 } | |
| 2665 | |
| 2666 close LINKS; | |
| 2667 close SEGDUP; | |
| 2668 print LOG "-- output created: $segdup_file\n"; | |
| 2669 } | |
| 2670 #------------------------------------------------------------------------------# | |
| 2671 #------------------------------------------------------------------------------# | |
| 2672 #convert the links file to the bedPE format for BEDTools usage | |
| 2673 sub links2bedPElinksfile{ | |
| 2674 | |
| 2675 my ($sample,$links_file,$bedpe_file)=@_; | |
| 2676 | |
| 2677 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
| 2678 open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; | |
| 2679 | |
| 2680 my $nb_links=1; | |
| 2681 | |
| 2682 while(<LINKS>){ | |
| 2683 | |
| 2684 chomp; | |
| 2685 my @t=split("\t",$_); | |
| 2686 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); | |
| 2687 my $type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
| 2688 $type.="_".$t[10]; | |
| 2689 | |
| 2690 $start1--; $start2--; | |
| 2691 | |
| 2692 print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". | |
| 2693 "\t$sample"."_link$nb_links\t$type\t.\t.". | |
| 2694 "\t".join("|",@t)."\n"; | |
| 2695 | |
| 2696 $nb_links++; | |
| 2697 } | |
| 2698 | |
| 2699 close LINKS; | |
| 2700 close BEDPE; | |
| 2701 | |
| 2702 } | |
| 2703 #------------------------------------------------------------------------------# | |
| 2704 #------------------------------------------------------------------------------# | |
| 2705 sub bedPElinks2linksfile{ | |
| 2706 | |
| 2707 my ($bedpe_file,$links_file)=@_; | |
| 2708 | |
| 2709 open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; | |
| 2710 open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; | |
| 2711 | |
| 2712 while(<BEDPE>){ | |
| 2713 | |
| 2714 chomp; | |
| 2715 my $sample=(split("_",(split("\t",$_))[6]))[0]; | |
| 2716 my @t1=(split("\t",$_))[0,1,2,3,4,5]; | |
| 2717 my @t2=split(/\|/,(split("\t",$_))[10]); | |
| 2718 push(@t2,$sample); | |
| 2719 | |
| 2720 print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; | |
| 2721 | |
| 2722 } | |
| 2723 close BEDPE; | |
| 2724 close LINKS; | |
| 2725 | |
| 2726 } | |
| 2727 #------------------------------------------------------------------------------# | |
| 2728 #------------------------------------------------------------------------------# | |
| 2729 #convert the links file to the bed format | |
| 2730 sub links2bedfile{ | |
| 2731 | |
| 2732 my ($tag_length,$color_code,$links_file,$bed_file)=@_; | |
| 2733 | |
| 2734 print LOG "# Converting to the bed format...\n"; | |
| 2735 | |
| 2736 my $compare=1; | |
| 2737 if($links_file!~/compared$/){ | |
| 2738 $compare=0; | |
| 2739 $tag_length->{none}->{1}=$tag_length->{1}; | |
| 2740 $tag_length->{none}->{2}=$tag_length->{2}; | |
| 2741 } | |
| 2742 | |
| 2743 #color-code hash table | |
| 2744 tie (my %hcolor,'Tie::IxHash'); | |
| 2745 my %color_order; | |
| 2746 my $n=1; | |
| 2747 foreach my $col (keys %{$color_code}){ | |
| 2748 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
| 2749 $hcolor{$col}=[$min_links,$max_links]; | |
| 2750 $color_order{$col}=$n; | |
| 2751 $n++; | |
| 2752 } | |
| 2753 | |
| 2754 my %pair; | |
| 2755 my %pt; | |
| 2756 $n=1; | |
| 2757 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
| 2758 | |
| 2759 my %str=( "F"=>"+", "R"=>"-" ); | |
| 2760 | |
| 2761 while(<LINKS>){ | |
| 2762 | |
| 2763 my @t=split; | |
| 2764 my $sample=($compare)? pop(@t):"none"; | |
| 2765 | |
| 2766 my $chr1=$t[0]; | |
| 2767 my $chr2=$t[3]; | |
| 2768 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
| 2769 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
| 2770 my $same_chr=($chr1 eq $chr2)? 1:0; | |
| 2771 | |
| 2772 my $count=$t[6]; | |
| 2773 my $color=getColor($count,\%hcolor,"bed"); | |
| 2774 | |
| 2775 my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); | |
| 2776 my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); | |
| 2777 my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); | |
| 2778 my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); | |
| 2779 my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); | |
| 2780 my @position1=deleteBadOrderSensePairs(split(",",$t[14])); | |
| 2781 my @position2=deleteBadOrderSensePairs(split(",",$t[15])); | |
| 2782 my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); | |
| 2783 my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); | |
| 2784 | |
| 2785 | |
| 2786 for my $p (0..$#pairs){ | |
| 2787 | |
| 2788 if (!exists $pair{$pairs[$p]}){ | |
| 2789 | |
| 2790 if($same_chr){ | |
| 2791 | |
| 2792 $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, | |
| 2793 $start1[$p]-1, $end2[$p], $color, | |
| 2794 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; | |
| 2795 $pt{$n}=$pair{$pairs[$p]}->{0}; | |
| 2796 $n++; | |
| 2797 | |
| 2798 }else{ | |
| 2799 | |
| 2800 $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, | |
| 2801 $start1[$p]-1, $end1[$p], $color, | |
| 2802 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; | |
| 2803 $pt{$n}=$pair{$pairs[$p]}->{1}; | |
| 2804 $n++; | |
| 2805 | |
| 2806 | |
| 2807 $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, | |
| 2808 $start2[$p]-1, $end2[$p], $color, | |
| 2809 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; | |
| 2810 $pt{$n}=$pair{$pairs[$p]}->{2}; | |
| 2811 $n++; | |
| 2812 } | |
| 2813 }else{ | |
| 2814 | |
| 2815 if($same_chr){ | |
| 2816 ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); | |
| 2817 }else{ | |
| 2818 ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); | |
| 2819 ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); | |
| 2820 } | |
| 2821 } | |
| 2822 } | |
| 2823 } | |
| 2824 close LINKS; | |
| 2825 | |
| 2826 my $nb_pairs=$n-1; | |
| 2827 | |
| 2828 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; | |
| 2829 print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". | |
| 2830 "visibility=2 itemRgb=\"On\"\n"; | |
| 2831 | |
| 2832 for my $i (1..$nb_pairs){ | |
| 2833 print BED join("\t",@{$pt{$i}})."\n"; | |
| 2834 } | |
| 2835 | |
| 2836 close BED; | |
| 2837 | |
| 2838 print LOG "-- output created: $bed_file\n"; | |
| 2839 | |
| 2840 undef %pair; | |
| 2841 undef %pt; | |
| 2842 | |
| 2843 } | |
| 2844 #------------------------------------------------------------------------------# | |
| 2845 #------------------------------------------------------------------------------# | |
| 2846 sub deleteBadOrderSensePairs{ | |
| 2847 | |
| 2848 my (@tab)=@_; | |
| 2849 my @tab2; | |
| 2850 | |
| 2851 foreach my $v (@tab){ | |
| 2852 | |
| 2853 $v=~s/[\(\)]//g; | |
| 2854 push(@tab2,$v) if($v!~/[\$\*\@]$/); | |
| 2855 } | |
| 2856 return @tab2; | |
| 2857 } | |
| 2858 #------------------------------------------------------------------------------# | |
| 2859 #------------------------------------------------------------------------------# | |
| 2860 sub getAllEntries{ | |
| 2861 my (@tab)=split (/,/,$_[0]); | |
| 2862 my @tab2; | |
| 2863 | |
| 2864 foreach my $v (@tab){ | |
| 2865 | |
| 2866 $v=~s/[\(\)]//g; | |
| 2867 push(@tab2,$v); | |
| 2868 } | |
| 2869 return @tab2; | |
| 2870 }#------------------------------------------------------------------------------# | |
| 2871 #------------------------------------------------------------------------------# | |
| 2872 sub getAllEntriesWOspecialChar{ | |
| 2873 my (@tab)=split (/,/,$_[0]); | |
| 2874 my @tab2; | |
| 2875 | |
| 2876 foreach my $v (@tab){ | |
| 2877 | |
| 2878 $v=~s/[\(\)\$\*\@]//g; | |
| 2879 push(@tab2,$v); | |
| 2880 } | |
| 2881 return @tab2; | |
| 2882 } | |
| 2883 #------------------------------------------------------------------------------# | |
| 2884 #------------------------------------------------------------------------------# | |
| 2885 sub links2SVfile{ | |
| 2886 | |
| 2887 my($links_file,$sv_file)=@_; | |
| 2888 | |
| 2889 print LOG "# Converting to the sv output table...\n"; | |
| 2890 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
| 2891 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
| 2892 | |
| 2893 my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist | |
| 2894 chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering | |
| 2895 final_score breakpoint1_start1-end1 breakpoint2_start2-end2); | |
| 2896 | |
| 2897 my $nb_links=0; | |
| 2898 | |
| 2899 while (<LINKS>){ | |
| 2900 | |
| 2901 my @t=split; | |
| 2902 my @sv=(); | |
| 2903 my $sv_type="-"; | |
| 2904 my $strand_ratio="-"; | |
| 2905 my $eq_ratio="-"; | |
| 2906 my $eq_type="-"; | |
| 2907 my $insert_ratio="-"; | |
| 2908 my $link="-"; | |
| 2909 my ($bk1, $bk2)=("-","-"); | |
| 2910 my $score="-"; | |
| 2911 | |
| 2912 my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); | |
| 2913 my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); | |
| 2914 my $nb_pairs=$t[6]; | |
| 2915 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
| 2916 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
| 2917 my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
| 2918 | |
| 2919 #if strand filtering | |
| 2920 if (defined $t[16]){ | |
| 2921 #if inter-chr link | |
| 2922 $sv_type=$t[16]; | |
| 2923 if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ | |
| 2924 $strand_ratio=floor($1/$2*100)."%"; | |
| 2925 $score=$t[18]; | |
| 2926 } | |
| 2927 if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ | |
| 2928 #if intra-chr link with insert size filtering | |
| 2929 $strand_ratio=floor($1/$2*100)."%"; | |
| 2930 $link=floor($t[17]); | |
| 2931 if($sv_type!~/^INV/){ | |
| 2932 $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
| 2933 $score=$t[20]; | |
| 2934 }else{ | |
| 2935 $score=$t[19]; | |
| 2936 } | |
| 2937 } | |
| 2938 } | |
| 2939 | |
| 2940 if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ | |
| 2941 | |
| 2942 #if strand and order filtering only and/or interchr link | |
| 2943 $eq_type=$t[18]; | |
| 2944 $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
| 2945 ($bk1, $bk2)=($t[20],$t[21]); | |
| 2946 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
| 2947 $score=$t[22]; | |
| 2948 | |
| 2949 }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ | |
| 2950 | |
| 2951 #if all three filtering | |
| 2952 $link=floor($t[17]); | |
| 2953 $eq_type=$t[19]; | |
| 2954 $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); | |
| 2955 | |
| 2956 if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ | |
| 2957 $insert_ratio=floor($1/$2*100)."%"; | |
| 2958 ($bk1, $bk2)=($t[22],$t[23]); | |
| 2959 $score=$t[24]; | |
| 2960 | |
| 2961 }else{ | |
| 2962 ($bk1, $bk2)=($t[21],$t[22]); | |
| 2963 $score=$t[23]; | |
| 2964 } | |
| 2965 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
| 2966 | |
| 2967 } | |
| 2968 | |
| 2969 | |
| 2970 push(@sv, $chr_type, $sv_type,$eq_type); | |
| 2971 push(@sv,"$chr1\t$start1-$end1"); | |
| 2972 push(@sv, $link); | |
| 2973 push(@sv,"$chr2\t$start2-$end2", | |
| 2974 $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); | |
| 2975 | |
| 2976 | |
| 2977 print SV join("\t",@sv)."\n"; | |
| 2978 } | |
| 2979 | |
| 2980 close LINKS; | |
| 2981 close SV; | |
| 2982 | |
| 2983 system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; | |
| 2984 | |
| 2985 open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; | |
| 2986 my @links=<SV>; | |
| 2987 close SV; | |
| 2988 | |
| 2989 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
| 2990 | |
| 2991 print SV join("\t",@header)."\n"; | |
| 2992 print SV @links; | |
| 2993 close SV; | |
| 2994 | |
| 2995 unlink($sv_file.".sorted"); | |
| 2996 | |
| 2997 print LOG "-- output created: $sv_file\n"; | |
| 2998 | |
| 2999 } | |
| 3000 #------------------------------------------------------------------------------# | |
| 3001 sub densityCalculation{ | |
| 3002 | |
| 3003 my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; | |
| 3004 | |
| 3005 my @sfile=split(/\./,$$mates_file[$file]); | |
| 3006 my $fchr=$sfile[$#sfile]; | |
| 3007 | |
| 3008 my $fh = new FileHandle; | |
| 3009 | |
| 3010 my %density; | |
| 3011 my %density_ref; | |
| 3012 my @ratio; | |
| 3013 my ($cov,$cov_ref); | |
| 3014 | |
| 3015 #FREQUENCY CALCULATION PROCEDURE | |
| 3016 print LOG "# $fchr : Frequency calculation procedure...\n"; | |
| 3017 &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); | |
| 3018 &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); | |
| 3019 | |
| 3020 #MAKING RATIO AND OUTPUT | |
| 3021 print LOG "\# Ratio calculation procedure...\n"; | |
| 3022 $density_file=~s/\/mates\//\/density\//; | |
| 3023 $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; | |
| 3024 | |
| 3025 foreach my $k (1..$chr->{nb_chrs}){ | |
| 3026 foreach my $frag (1..$chr->{$k}->{nb_frag}){ | |
| 3027 | |
| 3028 @ratio= ($chr->{$k}->{name}, | |
| 3029 (${$chr->{$k}->{$frag}}[0]+1), | |
| 3030 (${$chr->{$k}->{$frag}}[1]+1)); | |
| 3031 | |
| 3032 $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; | |
| 3033 $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; | |
| 3034 | |
| 3035 push(@ratio,$cov,$cov_ref); | |
| 3036 push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); | |
| 3037 push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); | |
| 3038 push(@ratio,log($cov+1)) if($cov && !$cov_ref); | |
| 3039 next if(!$cov && !$cov_ref); | |
| 3040 | |
| 3041 print $fh join("\t",@ratio)."\n"; | |
| 3042 } | |
| 3043 } | |
| 3044 | |
| 3045 $fh->close; | |
| 3046 print LOG "-- output created: $density_file\n"; | |
| 3047 | |
| 3048 undef %density; | |
| 3049 undef %density_ref; | |
| 3050 } | |
| 3051 #------------------------------------------------------------------------------# | |
| 3052 #------------------------------------------------------------------------------# | |
| 3053 sub FreqCalculation{ | |
| 3054 | |
| 3055 my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; | |
| 3056 | |
| 3057 my @sfile=split(/\./,$mates_file); | |
| 3058 my $fchr=$sfile[$#sfile]; | |
| 3059 my $fh = new FileHandle; | |
| 3060 | |
| 3061 my $nb_windows=0; | |
| 3062 my $warn=100000; | |
| 3063 my $record=0; | |
| 3064 my %pair; | |
| 3065 | |
| 3066 my ($sumX,$sumX2) = (0,0); | |
| 3067 | |
| 3068 print LOG "\# Frequency calculation for $mates_file...\n"; | |
| 3069 | |
| 3070 if ($mates_file =~ /.gz$/) { | |
| 3071 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
| 3072 }elsif($mates_file =~ /.bam$/){ | |
| 3073 o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
| 3074 }else{ | |
| 3075 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; | |
| 3076 } | |
| 3077 | |
| 3078 while(<$fh>){ | |
| 3079 | |
| 3080 my @t=split; | |
| 3081 my $mate=$t[0]; | |
| 3082 | |
| 3083 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); | |
| 3084 | |
| 3085 next if(exists $pair{$mate}); | |
| 3086 | |
| 3087 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
| 3088 | |
| 3089 next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); | |
| 3090 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
| 3091 | |
| 3092 $pair{$mate}=undef; | |
| 3093 $record++; | |
| 3094 | |
| 3095 my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); | |
| 3096 | |
| 3097 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); | |
| 3098 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); | |
| 3099 | |
| 3100 my $length = abs($coord_start_read1-$coord_start_read2); | |
| 3101 $sumX += $length; #add to sum and sum^2 for mean and variance calculation | |
| 3102 $sumX2 += $length*$length; | |
| 3103 | |
| 3104 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ | |
| 3105 | |
| 3106 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ | |
| 3107 | |
| 3108 &addToDensity($density,$chr_read1,$i,\$nb_windows) | |
| 3109 if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); | |
| 3110 | |
| 3111 }else{ | |
| 3112 | |
| 3113 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); | |
| 3114 } | |
| 3115 } | |
| 3116 | |
| 3117 if($record>=$warn){ | |
| 3118 print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; | |
| 3119 $warn+=100000; | |
| 3120 } | |
| 3121 } | |
| 3122 $fh->close; | |
| 3123 | |
| 3124 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; | |
| 3125 | |
| 3126 if($record>0){ | |
| 3127 | |
| 3128 my $mu = $sumX/$record; | |
| 3129 my $sigma = sqrt($sumX2/$record - $mu*$mu); | |
| 3130 print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; | |
| 3131 } | |
| 3132 | |
| 3133 } | |
| 3134 #------------------------------------------------------------------------------# | |
| 3135 #------------------------------------------------------------------------------# | |
| 3136 sub ratio2segdup{ | |
| 3137 | |
| 3138 my($id,$density_file,$segdup_file)=@_; | |
| 3139 | |
| 3140 print LOG "# Converting to circos format...\n"; | |
| 3141 | |
| 3142 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; | |
| 3143 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; | |
| 3144 | |
| 3145 while(<RATIO>){ | |
| 3146 chomp; | |
| 3147 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; | |
| 3148 print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; | |
| 3149 } | |
| 3150 | |
| 3151 close RATIO; | |
| 3152 close SEGDUP; | |
| 3153 print LOG "-- output created: $segdup_file\n"; | |
| 3154 } | |
| 3155 #------------------------------------------------------------------------------# | |
| 3156 #------------------------------------------------------------------------------# | |
| 3157 sub ratio2bedfile{ | |
| 3158 | |
| 3159 my($density_file,$bed_file)=@_; | |
| 3160 | |
| 3161 print LOG "# Converting to bedGraph format...\n"; | |
| 3162 | |
| 3163 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; | |
| 3164 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; | |
| 3165 print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". | |
| 3166 "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; | |
| 3167 | |
| 3168 while(<RATIO>){ | |
| 3169 chomp; | |
| 3170 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; | |
| 3171 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); | |
| 3172 print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; | |
| 3173 } | |
| 3174 | |
| 3175 close RATIO; | |
| 3176 close BED; | |
| 3177 print LOG "-- output created: $bed_file\n"; | |
| 3178 } | |
| 3179 #------------------------------------------------------------------------------# | |
| 3180 #------------------------------------------------------------------------------# | |
| 3181 sub inverseSense{ | |
| 3182 | |
| 3183 my $mate_sense=$_[0]; | |
| 3184 my %reverse=( 'F' => 'R' , 'R' => 'F' , | |
| 3185 'FF' => 'RR', 'RR' => 'FF', | |
| 3186 'FR' => 'RF', 'RF' => 'FR'); | |
| 3187 return $reverse{$mate_sense}; | |
| 3188 } | |
| 3189 | |
| 3190 #------------------------------------------------------------------------------# | |
| 3191 #------------------------------------------------------------------------------# | |
| 3192 sub getNextFrag{ | |
| 3193 | |
| 3194 my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; | |
| 3195 | |
| 3196 my $how_far = $read_start-$frag_start; | |
| 3197 my $nb_windows_toskip; | |
| 3198 | |
| 3199 if($how_far>0){ | |
| 3200 | |
| 3201 $nb_windows_toskip=($how_far/$step)-($window_dist/$step); | |
| 3202 $nb_windows_toskip=~ s/\..*//; | |
| 3203 $nb_windows_toskip=0 if($nb_windows_toskip<0); | |
| 3204 return ($frag_num + $nb_windows_toskip); | |
| 3205 } | |
| 3206 return $frag_last; | |
| 3207 } | |
| 3208 #------------------------------------------------------------------------------# | |
| 3209 #------------------------------------------------------------------------------# | |
| 3210 sub getColor{ | |
| 3211 | |
| 3212 my($count,$hcolor,$format)=@_; | |
| 3213 for my $col ( keys % { $hcolor} ) { | |
| 3214 return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); | |
| 3215 } | |
| 3216 return "white" if($format eq "circos"); | |
| 3217 return "255,255,255" if($format eq "bed"); | |
| 3218 } | |
| 3219 #------------------------------------------------------------------------------# | |
| 3220 #------------------------------------------------------------------------------# | |
| 3221 sub recupCoords{ | |
| 3222 | |
| 3223 my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; | |
| 3224 my $strand = 'F'; | |
| 3225 | |
| 3226 if ($c_hit=~s/^\-//) { | |
| 3227 $strand='R'; | |
| 3228 $$cs_hit=$c_hit; | |
| 3229 $$ce_hit=$c_hit-($tag_length-1); | |
| 3230 }else{ | |
| 3231 $$cs_hit=$c_hit; | |
| 3232 $$ce_hit=$c_hit+($tag_length-1); | |
| 3233 } | |
| 3234 return $strand; | |
| 3235 | |
| 3236 } | |
| 3237 #------------------------------------------------------------------------------# | |
| 3238 #------------------------------------------------------------------------------# | |
| 3239 sub overlap { | |
| 3240 my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; | |
| 3241 if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { | |
| 3242 return 0; | |
| 3243 } | |
| 3244 return 1; | |
| 3245 } | |
| 3246 #------------------------------------------------------------------------------# | |
| 3247 #------------------------------------------------------------------------------# | |
| 3248 sub makeLink { | |
| 3249 | |
| 3250 my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; | |
| 3251 | |
| 3252 if($chr1>$chr2){ | |
| 3253 ($chr1,$chr2)= ($chr2,$chr1); | |
| 3254 ($frag1,$frag2)= ($frag2,$frag1); | |
| 3255 } | |
| 3256 | |
| 3257 if($chr1 == $chr2){ | |
| 3258 if($frag1>$frag2){ | |
| 3259 ($frag1,$frag2)= ($frag2,$frag1); | |
| 3260 } | |
| 3261 } | |
| 3262 | |
| 3263 if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ | |
| 3264 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; | |
| 3265 $$nb++; | |
| 3266 }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ | |
| 3267 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; | |
| 3268 } | |
| 3269 } | |
| 3270 #------------------------------------------------------------------------------# | |
| 3271 #------------------------------------------------------------------------------# | |
| 3272 #fonction of adding the read to the density profile | |
| 3273 sub addToDensity { | |
| 3274 | |
| 3275 my ($density,$chr1,$frag1,$nb)=@_; | |
| 3276 | |
| 3277 if(!exists $density->{$chr1}->{$frag1}->{count}){ | |
| 3278 $density->{$chr1}->{$frag1}->{count}=1; | |
| 3279 $$nb++; | |
| 3280 }else{ | |
| 3281 $density->{$chr1}->{$frag1}->{count}++; | |
| 3282 } | |
| 3283 } | |
| 3284 #------------------------------------------------------------------------------# | |
| 3285 #------------------------------------------------------------------------------# | |
| 3286 sub floor { | |
| 3287 my $nb = $_[0]; | |
| 3288 $nb=~ s/\..*//; | |
| 3289 return $nb; | |
| 3290 } | |
| 3291 #------------------------------------------------------------------------------# | |
| 3292 sub decimal{ | |
| 3293 | |
| 3294 my $num=shift; | |
| 3295 my $digs_to_cut=shift; | |
| 3296 | |
| 3297 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | |
| 3298 | |
| 3299 return $num; | |
| 3300 } | |
| 3301 | |
| 3302 #------------------------------------------------------------------------------# | |
| 3303 sub max { | |
| 3304 | |
| 3305 my($max) = shift(@_); | |
| 3306 foreach my $temp (@_) { | |
| 3307 $max = $temp if $temp > $max; | |
| 3308 } | |
| 3309 return($max); | |
| 3310 } | |
| 3311 #------------------------------------------------------------------------------# | |
| 3312 #------------------------------------------------------------------------------# | |
| 3313 sub min { | |
| 3314 | |
| 3315 my($min) = shift(@_); | |
| 3316 foreach my $temp (@_) { | |
| 3317 $min = $temp if $temp < $min; | |
| 3318 } | |
| 3319 return($min); | |
| 3320 } | |
| 3321 #------------------------------------------------------------------------------# | |
| 3322 #------------------------------------------------------------------------------# | |
| 3323 sub sortTablebyIndex{ | |
| 3324 my ($tab1,$tab2)=@_; | |
| 3325 my @tab3; | |
| 3326 | |
| 3327 foreach my $i (@$tab1){ | |
| 3328 $tab3[$i]=$$tab2[$$tab1[$i]]; | |
| 3329 } | |
| 3330 return @tab3; | |
| 3331 } | |
| 3332 #------------------------------------------------------------------------------# | |
| 3333 #------------------------------------------------------------------------------# | |
| 3334 sub round { | |
| 3335 my $number = shift || 0; | |
| 3336 my $dec = 10 ** (shift || 0); | |
| 3337 return int( $dec * $number + .5 * ($number <=> 0)) / $dec; | |
| 3338 } | |
| 3339 #------------------------------------------------------------------------------# | |
| 3340 #------------------------------------------------------------------------------# | |
| 3341 sub getUniqueTable{ | |
| 3342 | |
| 3343 my (@tab)=@_; | |
| 3344 my (%saw,@out)=(); | |
| 3345 undef %saw; | |
| 3346 return sort(grep(!$saw{$_}++, @tab)); | |
| 3347 } | |
| 3348 #------------------------------------------------------------------------------# | |
| 3349 #------------------------------------------------------------------------------# | |
| 3350 sub catFiles { | |
| 3351 | |
| 3352 unlink("$_[1]") if(exists $_[1]); | |
| 3353 system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; | |
| 3354 } | |
| 3355 #------------------------------------------------------------------------------# | |
| 3356 #------------------------------------------------------------------------------# | |
| 3357 #check if the configuration file is correct | |
| 3358 sub validateconfiguration{ | |
| 3359 | |
| 3360 my %conf=%{$_[0]}; | |
| 3361 my $list_prgs="@ARGV"; | |
| 3362 | |
| 3363 my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); | |
| 3364 my @detection_params=qw(split_mate_file window_size step_length split_mate_file); | |
| 3365 my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); | |
| 3366 my @circos_params=qw(organism_id colorcode); | |
| 3367 my @bed_params=qw(colorcode); | |
| 3368 my @compare_params=qw(list_samples file_suffix); | |
| 3369 | |
| 3370 foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ | |
| 3371 | |
| 3372 unless (defined($dir)) { | |
| 3373 $dir = "."; | |
| 3374 } | |
| 3375 unless (-d $dir){ | |
| 3376 mkdir $dir or die; | |
| 3377 } | |
| 3378 $dir.="/" if($dir!~/\/$/); | |
| 3379 } | |
| 3380 | |
| 3381 unless (defined($conf{general}{num_threads})) { | |
| 3382 $conf{general}{num_threads} = 1; | |
| 3383 } | |
| 3384 $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); | |
| 3385 | |
| 3386 if($list_prgs!~/links2compare/){ | |
| 3387 | |
| 3388 foreach my $p (@general_params){ | |
| 3389 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); | |
| 3390 } | |
| 3391 | |
| 3392 $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); | |
| 3393 | |
| 3394 unless (defined($conf{general}{sv_type})) { | |
| 3395 $conf{general}{sv_type} = "all"; | |
| 3396 } | |
| 3397 | |
| 3398 $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; | |
| 3399 } | |
| 3400 | |
| 3401 if($list_prgs=~/(linking|cnv)/){ | |
| 3402 | |
| 3403 foreach my $p (@detection_params){ | |
| 3404 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); | |
| 3405 } | |
| 3406 | |
| 3407 die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); | |
| 3408 | |
| 3409 if($conf{detection}{step_length}>$conf{detection}{window_size}){ | |
| 3410 die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); | |
| 3411 } | |
| 3412 | |
| 3413 unless (-d $conf{general}{tmp_dir}."/mates"){ | |
| 3414 mkdir $conf{general}{tmp_dir}."/mates" or die; | |
| 3415 } | |
| 3416 | |
| 3417 if($list_prgs=~/linking/){ | |
| 3418 unless (-d $conf{general}{tmp_dir}."/links"){ | |
| 3419 mkdir $conf{general}{tmp_dir}."/links" or die; | |
| 3420 } | |
| 3421 } | |
| 3422 if($list_prgs=~/cnv/){ | |
| 3423 unless (-d $conf{general}{tmp_dir}."/density"){ | |
| 3424 mkdir $conf{general}{tmp_dir}."/density" or die; | |
| 3425 } | |
| 3426 } | |
| 3427 | |
| 3428 } | |
| 3429 | |
| 3430 if($list_prgs=~/filtering/){ | |
| 3431 | |
| 3432 foreach my $p (@filtering_params) { | |
| 3433 die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); | |
| 3434 | |
| 3435 } | |
| 3436 | |
| 3437 if(defined($conf{filtering}{chromosomes})) { | |
| 3438 my @chrs=split(",",$conf{filtering}{chromosomes}); | |
| 3439 my $exclude=($chrs[0]=~/^\-/)? 1:0; | |
| 3440 for my $chrName (@chrs){ | |
| 3441 | |
| 3442 die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") | |
| 3443 if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); | |
| 3444 | |
| 3445 } | |
| 3446 } | |
| 3447 | |
| 3448 if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { | |
| 3449 die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". | |
| 3450 "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); | |
| 3451 } | |
| 3452 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { | |
| 3453 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); | |
| 3454 } | |
| 3455 if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { | |
| 3456 die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". | |
| 3457 "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); | |
| 3458 } | |
| 3459 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { | |
| 3460 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); | |
| 3461 } | |
| 3462 | |
| 3463 if (!defined($conf{filtering}{indel_sigma_threshold})) { | |
| 3464 $conf{filtering}{indel_sigma_threshold} = 2; | |
| 3465 } | |
| 3466 if (!defined($conf{filtering}{dup_sigma_threshold})) { | |
| 3467 $conf{filtering}{dup_sigma_threshold} = 2; | |
| 3468 } | |
| 3469 if (!defined($conf{filtering}{singleton_sigma_threshold})) { | |
| 3470 $conf{filtering}{singleton_sigma_threshold} = 4; | |
| 3471 } | |
| 3472 | |
| 3473 if (!defined($conf{filtering}{nb_pairs_order_threshold})) { | |
| 3474 $conf{filtering}{nb_pairs_order_threshold} = 1; | |
| 3475 } | |
| 3476 | |
| 3477 if (!defined($conf{filtering}{final_score_threshold})) { | |
| 3478 $conf{filtering}{final_score_threshold} = 0.8; | |
| 3479 } | |
| 3480 | |
| 3481 if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { | |
| 3482 die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); | |
| 3483 } | |
| 3484 | |
| 3485 } | |
| 3486 | |
| 3487 if($list_prgs=~/2circos$/){ | |
| 3488 foreach my $p (@circos_params) { | |
| 3489 next if($list_prgs=~/^ratio/ && $p eq "colorcode"); | |
| 3490 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); | |
| 3491 } | |
| 3492 } | |
| 3493 | |
| 3494 if($list_prgs=~/2bed$/){ | |
| 3495 foreach my $p (@bed_params) { | |
| 3496 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); | |
| 3497 } | |
| 3498 } | |
| 3499 | |
| 3500 if($list_prgs=~/links2compare/){ | |
| 3501 foreach my $p (@compare_params) { | |
| 3502 die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); | |
| 3503 } | |
| 3504 | |
| 3505 unless (defined($conf{compare}{same_sv_type})) { | |
| 3506 $conf{compare}{same_sv_type} = 0; | |
| 3507 } | |
| 3508 | |
| 3509 unless (defined($conf{compare}{min_overlap})) { | |
| 3510 $conf{compare}{min_overlap} = 1E-9; | |
| 3511 } | |
| 3512 | |
| 3513 if($conf{compare}{circos_output}){ | |
| 3514 foreach my $p (@circos_params) { | |
| 3515 next if($list_prgs=~/^ratio/ && $p eq "colorcode"); | |
| 3516 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); | |
| 3517 } | |
| 3518 } | |
| 3519 if($conf{compare}{bed_output}){ | |
| 3520 foreach my $p (@bed_params) { | |
| 3521 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); | |
| 3522 } | |
| 3523 die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); | |
| 3524 | |
| 3525 my @samples=split(",",$conf{compare}{list_samples}); | |
| 3526 my @read_lengths=split(",",$conf{compare}{list_read_lengths}); | |
| 3527 for my $i (0..$#samples){ | |
| 3528 my @l=split("-",$read_lengths[$i]); | |
| 3529 $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; | |
| 3530 } | |
| 3531 } | |
| 3532 } | |
| 3533 | |
| 3534 | |
| 3535 } | |
| 3536 #------------------------------------------------------------------------------# | |
| 3537 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# |
