comparison SVDetect_r0.8b_galaxy/svdetect/SVDetect_run_parallel.pl @ 28:091714bd75a0 draft default tip

new release r0.8b
author bzeitouni
date Tue, 22 Jan 2013 06:20:22 -0500
parents
children
comparison
equal deleted inserted replaced
27:c284618dd8da 28:091714bd75a0
1 #!/usr/bin/perl -w
2
3 =pod
4
5 =head1 NAME
6
7 SVDetect - Program designed to the detection of structural variations
8 from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads
9
10 Version: 0.8b for Galaxy
11
12 =head1 SYNOPSIS
13
14 SVDetect <command> -conf <configuration_file> [-help] [-man]
15
16 Command:
17
18 linking detection and isolation of links
19 filtering filtering of links according different parameters
20 links2circos links conversion to circos format
21 links2bed paired-ends of links converted to bed format (UCSC)
22 links2SV formatted output to show most significant SVs
23 cnv calculate copy-number profiles
24 ratio2circos ratio conversion to circos density format
25 ratio2bedgraph ratio conversion to bedGraph density format (UCSC)
26
27 =head1 DESCRIPTION
28
29 This is a command-line interface to SVDetect.
30
31
32 =head1 AUTHORS
33
34 Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>,
35 Valentina Boeva E<lt>valentina.boeva@curie.frE<gt>
36
37 =cut
38
39 # -------------------------------------------------------------------
40
41 use strict;
42 use warnings;
43
44 use Pod::Usage;
45 use Getopt::Long;
46
47 use Config::General;
48 use Tie::IxHash;
49 use FileHandle;
50 use Parallel::ForkManager;
51
52 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
53 #PARSE THE COMMAND LINE
54 my %OPT;
55 GetOptions(\%OPT,
56 'conf=s',
57 'out1=s', #GALAXY
58 'out2=s', #GALAXY
59 'out3=s', #GALAXY
60 'out4=s', #GALAXY
61 'out5=s', #GALAXY
62 'l=s', #GALAXY
63 'N=s',#GALAXY
64 'help',#GALAXY
65 'man'
66 );
67
68 pod2usage() if $OPT{help};
69 pod2usage(-verbose=>2) if $OPT{man};
70 pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf});
71
72 pod2usage() if(@ARGV<1);
73
74 tie (my %func, 'Tie::IxHash',linking=>\&createlinks,
75 filtering=>\&filterlinks,
76 links2circos=>\&links2circos,
77 links2bed=>\&links2bed,
78 links2compare=>\&links2compare,
79 links2SV=>\&links2SV,
80 cnv=>\&cnv,
81 ratio2circos=>\&ratio2circos,
82 ratio2bedgraph=>\&ratio2bedgraph);
83
84 foreach my $command (@ARGV){
85 pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command}));
86 }
87 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
88
89
90 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
91 #READ THE CONFIGURATION FILE
92 my $conf=Config::General->new( -ConfigFile => $OPT{conf},
93 -Tie => "Tie::IxHash",
94 -AllowMultiOptions => 1,
95 -LowerCaseNames => 1,
96 -AutoTrue => 1);
97 my %CONF= $conf->getall;
98 validateconfiguration(\%CONF); #validation of the configuration parameters
99
100 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY
101
102 my $pt_log_file=$OPT{l}; #GALAXY
103 my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY
104 my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY
105 my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY
106 my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY
107 my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY
108
109 $CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY
110 $CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY
111
112 my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY
113 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY
114 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
115
116 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
117 #COMMAND EXECUTION
118 foreach my $command (@ARGV){
119 &{$func{$command}}();
120 }
121 print LOG "-- end\n";#GALAXY
122
123 close LOG;#GALAXY
124 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
125 exit(0);
126 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
127
128
129 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
130 #FUNCTIONS
131 #------------------------------------------------------------------------------#
132 #MAIN FUNCTION number 1: Detection of links from mate-pairs data
133 sub createlinks{
134
135 my %CHR; #main hash table 1: fragments, links
136 my %CHRID;
137 my @MATEFILES;
138
139 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
140 my @path=split(/\//,$output_prefix);
141 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
142 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
143 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
144
145 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
146 $CONF{detection}{window_size},
147 $CONF{detection}{step_length},
148 $CONF{general}{cmap_file});
149
150 if($CONF{detection}{split_mate_file}){
151
152 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
153 $CONF{general}{sv_type},
154 $CONF{general}{mates_file},
155 $CONF{general}{input_format},
156 $CONF{general}{read_lengths}
157 );
158 }else{
159
160 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!";
161 chomp(@MATEFILES);
162 print LOG "# Splitted mate files already created.\n";
163 }
164
165
166 #Parallelization of the linking per chromosome for intra + interchrs
167 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
168
169 foreach my $matefile (@MATEFILES){
170
171 my $pid = $pm->start and next;
172 getlinks(\%CHR, \%CHRID, $matefile);
173 $pm->finish;
174
175 }
176 $pm->wait_all_children;
177
178 #Merge the chromosome links file into only one
179 my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!";
180 chomp(@LINKFILES);
181 catFiles( \@LINKFILES => "$output_prefix.links" );
182
183 system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY
184 print LOG "# Linking end procedure : output created: $output_prefix.links\n";
185 #unlink(@LINKFILES);
186 #unlink(@MATEFILES);
187
188 undef %CHR;
189 undef %CHRID;
190
191 }
192 #------------------------------------------------------------------------------#
193 #------------------------------------------------------------------------------#
194 sub getlinks {
195
196 my ($chr,$chrID,$tmp_mates_prefix)=@_;
197
198 my $tmp_links_prefix=$tmp_mates_prefix;
199 $tmp_links_prefix=~s/\/mates\//\/links\//;
200
201 my %PAIR; #main hash table 2: pairs
202
203 linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs
204 $CONF{general}{read_lengths},
205 $CONF{detection}{window_size},
206 $CONF{detection}{step_length},
207 $tmp_mates_prefix,
208 $CONF{general}{input_format},
209 $CONF{general}{sv_type},
210 "$tmp_links_prefix.links.mapped"
211 );
212
213 getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons
214 "$tmp_links_prefix.links.unique");
215
216 defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links
217 $CONF{general}{input_format},
218 $CONF{general}{sv_type},
219 $CONF{general}{read_lengths},
220 "$tmp_links_prefix.links.unique",
221 "$tmp_links_prefix.links.unique_defined");
222
223 sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates
224 "$tmp_links_prefix.links.sorted");
225
226 removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links
227 "$tmp_links_prefix.links",1); #file output
228
229
230 undef %PAIR;
231
232 unlink("$tmp_links_prefix.links.mapped",
233 "$tmp_links_prefix.links.unique",
234 "$tmp_links_prefix.links.unique_defined",
235 "$tmp_links_prefix.links.sorted");
236 }
237 #------------------------------------------------------------------------------#
238 #------------------------------------------------------------------------------#
239 sub splitMateFile{
240
241 my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_;
242
243 print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n";
244
245 my %filesHandle;
246
247 #fichier matefile inter
248 if($sv_type=~/^(all|inter)$/){
249 my $newFileName="$output_prefix.interchrs";
250 push(@{$files_list},$newFileName);
251 my $fh = new FileHandle;
252 $fh->open(">$newFileName");
253 $filesHandle{inter}=$fh;
254 }
255
256 #fichiers matefiles intra
257 if($sv_type=~/^(all|intra)$/){
258 foreach my $k (1..$chr->{nb_chrs}){
259 my $newFileName=$output_prefix.".".$chr->{$k}->{name};
260 push(@{$files_list},$newFileName);
261 my $fh = new FileHandle;
262 $fh->open(">$newFileName");
263 $filesHandle{$k}=$fh;
264 }
265 }
266
267 if ($mates_file =~ /.gz$/) {
268 open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
269 }elsif($mates_file =~ /.bam$/){
270 open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
271 }else{
272 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
273 }
274
275
276 while(<MATES>){
277
278 my @t=split;
279 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
280
281 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
282
283 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
284
285 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
286
287 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
288 my $fh2print=$filesHandle{inter};
289 print $fh2print join("\t",@t)."\n";
290 }
291
292 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
293 my $fh2print=$filesHandle{$chr_read1};
294 print $fh2print join("\t",@t)."\n";
295
296 }
297 }
298
299 foreach my $name (keys %filesHandle){
300 my $fh=$filesHandle{$name};
301 $fh->close;
302 }
303
304 print LOG "# Splitted mate files of \"$mates_file\" created.\n";
305 }
306
307
308 #------------------------------------------------------------------------------#
309 #------------------------------------------------------------------------------#
310 sub filterlinks{
311
312 my %CHR;
313 my %CHRID;
314 my @LINKFILES;
315 my @FLINKFILES;
316
317 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
318 my @path=split(/\//,$output_prefix);
319 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
320 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
321
322 createChrHashTables(\%CHR,\%CHRID,
323 $CONF{general}{cmap_file});
324
325 if($CONF{filtering}{split_link_file}){
326
327 splitLinkFile(\%CHR, \%CHRID, \@LINKFILES,
328 $tmp_links_prefix,
329 $CONF{general}{sv_type},
330 "$output_prefix.links",
331 );
332 }else{
333
334 @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n";
335 chomp(@LINKFILES);
336 print LOG "# Splitted link files already created.\n";
337 }
338
339 #Parallelization of the filtering per chromosome for intra + interchrs
340 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
341
342 foreach my $linkfile (@LINKFILES){
343
344 my $pid = $pm->start and next;
345 getFilteredlinks(\%CHR, \%CHRID, $linkfile);
346 $pm->finish;
347
348 }
349 $pm->wait_all_children;
350
351 #Merge the chromosome links file into only one
352 @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n";
353 chomp(@FLINKFILES);
354 catFiles( \@FLINKFILES => "$output_prefix.links.filtered" );
355
356 system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY
357 print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n";
358
359 undef %CHR;
360 undef %CHRID;
361
362 }
363 #------------------------------------------------------------------------------#
364 #------------------------------------------------------------------------------#
365 sub splitLinkFile{
366
367 my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_;
368
369 print LOG "# Splitting the link file for parallel processing...\n";
370
371 my %filesHandle;
372
373 #fichier matefile inter
374 if($sv_type=~/^(all|inter)$/){
375 my $newFileName="$input_prefix.interchrs.links";
376 push(@{$files_list},$newFileName);
377 my $fh = new FileHandle;
378 $fh->open(">$newFileName");
379 $filesHandle{inter}=$fh;
380 }
381
382 #fichiers matefiles intra
383 if($sv_type=~/^(all|intra)$/){
384 foreach my $k (1..$chr->{nb_chrs}){
385 my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links";
386 push(@{$files_list},$newFileName);
387 my $fh = new FileHandle;
388 $fh->open(">$newFileName");
389 $filesHandle{$k}=$fh;
390 }
391 }
392
393 open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n";
394 while(<LINKS>){
395
396 my @t=split;
397 my ($chr_read1,$chr_read2)=($t[0],$t[3]);
398
399 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
400
401 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
402
403 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
404 my $fh2print=$filesHandle{inter};
405 print $fh2print join("\t",@t)."\n";
406 }
407
408 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
409 my $fh2print=$filesHandle{$chr_read1};
410 print $fh2print join("\t",@t)."\n";
411
412 }
413 }
414
415 foreach my $name (keys %filesHandle){
416 my $fh=$filesHandle{$name};
417 $fh->close;
418 }
419
420 print LOG "# Splitted link files created.\n";
421 }
422
423
424 #------------------------------------------------------------------------------#
425 #------------------------------------------------------------------------------#
426 #MAIN FUNCTION number 2: Filtering processing
427 sub getFilteredlinks {
428
429 my ($chr,$chrID,$tmp_links_prefix)=@_;
430 my %PAIR;
431
432 strandFiltering($chr,$chrID,
433 $CONF{filtering}{nb_pairs_threshold}, #filtering of links
434 $CONF{filtering}{strand_filtering},
435 $CONF{filtering}{chromosomes},
436 $CONF{general}{input_format},
437 $CONF{general}{cmap_file},
438 $CONF{general}{mates_orientation},
439 $CONF{general}{read_lengths},
440 $tmp_links_prefix,
441 "$tmp_links_prefix.filtered",
442 );
443
444 if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering
445
446 my @tmpfiles;
447
448 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique");
449
450 getUniqueLinks("$tmp_links_prefix.filtered_unique",
451 "$tmp_links_prefix.filtered");
452
453 push(@tmpfiles,"$tmp_links_prefix.filtered_unique");
454
455 if($CONF{filtering}{order_filtering}){ #filtering using the order
456
457 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered");
458
459 orderFiltering($chr,$chrID,
460 $CONF{filtering}{nb_pairs_threshold},
461 $CONF{filtering}{nb_pairs_order_threshold},
462 $CONF{filtering}{mu_length},
463 $CONF{filtering}{sigma_length},
464 $CONF{general}{mates_orientation},
465 $CONF{general}{read_lengths},
466 "$tmp_links_prefix.filtered_ordered",
467 "$tmp_links_prefix.filtered",
468 );
469
470 push(@tmpfiles,"$tmp_links_prefix.filtered_ordered");
471 }
472
473 if (($CONF{filtering}{insert_size_filtering})&&
474 ($CONF{general}{sv_type} ne 'inter')){
475
476 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize");
477
478 addInsertionInfo($chr,$chrID,
479 $CONF{filtering}{nb_pairs_threshold},
480 $CONF{filtering}{order_filtering},
481 $CONF{filtering}{indel_sigma_threshold},
482 $CONF{filtering}{dup_sigma_threshold},
483 $CONF{filtering}{singleton_sigma_threshold},
484 $CONF{filtering}{mu_length},
485 $CONF{filtering}{sigma_length},
486 $CONF{general}{mates_orientation},
487 $CONF{general}{read_lengths},
488 "$tmp_links_prefix.filtered_withoutIndelSize",
489 "$tmp_links_prefix.filtered"
490 );
491
492 push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize");
493 }
494
495 sortLinks("$tmp_links_prefix.filtered",
496 "$tmp_links_prefix.filtered_sorted");
497
498 removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted",
499 "$tmp_links_prefix.filtered_nodup",
500 );
501
502 postFiltering("$tmp_links_prefix.filtered_nodup",
503 "$tmp_links_prefix.filtered",
504 $CONF{filtering}{final_score_threshold});
505
506 push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup");
507
508 unlink(@tmpfiles);
509
510
511 }
512 undef %PAIR;
513
514 }
515 #------------------------------------------------------------------------------#
516 #------------------------------------------------------------------------------#
517 #MAIN FUNCTION number 3: Circos format conversion for links
518 sub links2circos{
519
520 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
521 my @path=split(/\//,$input_file);
522 $input_file=$CONF{general}{output_dir}.$path[$#path];
523
524 my $output_file.=$input_file.".segdup.txt";
525
526 links2segdup($CONF{circos}{organism_id},
527 $CONF{circos}{colorcode},
528 $input_file,
529 $output_file); #circos file output
530
531 system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY
532 }
533 #------------------------------------------------------------------------------#
534 #------------------------------------------------------------------------------#
535 #MAIN FUNCTION number 4: Bed format conversion for links
536 sub links2bed{
537
538 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
539 my @path=split(/\//,$input_file);
540 $input_file=$CONF{general}{output_dir}.$path[$#path];
541
542 my $output_file.=$input_file.".bed.txt";
543
544 links2bedfile($CONF{general}{read_lengths},
545 $CONF{bed}{colorcode},
546 $input_file,
547 $output_file); #bed file output
548
549 system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY
550
551 }
552 #------------------------------------------------------------------------------#
553 #------------------------------------------------------------------------------#
554 #MAIN FUNCTION number 6: Bed format conversion for links
555 sub links2SV{
556
557 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
558
559 my @path=split(/\//,$input_file);
560 $input_file=$CONF{general}{output_dir}.$path[$#path];
561
562 my $output_file.=$input_file.".sv.txt";
563
564
565 links2SVfile( $input_file,
566 $output_file);
567
568 system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY
569 }
570 #------------------------------------------------------------------------------#
571 #------------------------------------------------------------------------------#
572 #MAIN FUNCTION number 7: copy number variations, coverage ratio calculation
573 sub cnv{
574
575 my %CHR;
576 my %CHRID;
577 my @MATEFILES;
578 my @MATEFILES_REF;
579
580 my $output_prefix=$CONF{general}{mates_file};
581 my $output_prefix_ref=$CONF{detection}{mates_file_ref};
582 my @path=split(/\//,$output_prefix);
583 my @path_ref=split(/\//,$output_prefix_ref);
584 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
585 $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref];
586 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
587 my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref];
588 my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path];
589
590 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
591 $CONF{detection}{window_size},
592 $CONF{detection}{step_length},
593 $CONF{general}{cmap_file});
594
595 if($CONF{detection}{split_mate_file}){
596
597 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
598 "intra",
599 $CONF{general}{mates_file},
600 $CONF{general}{input_format},
601 $CONF{general}{read_lengths}
602 );
603
604 splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref,
605 "intra",
606 $CONF{detection}{mates_file_ref},
607 $CONF{general}{input_format},
608 $CONF{general}{read_lengths}
609 );
610
611
612 }else{
613
614 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!";
615 chomp(@MATEFILES);
616 @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!";
617 chomp(@MATEFILES_REF);
618 print LOG "# Splitted sample and reference mate files already created.\n";
619 }
620
621 #Parallelization of the cnv per chromosome
622 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
623
624 foreach my $file (0..$#MATEFILES){
625
626 my $pid = $pm->start and next;
627
628 densityCalculation(\%CHR, \%CHRID, $file,
629 $CONF{general}{read_lengths},
630 $CONF{detection}{window_size},
631 $CONF{detection}{step_length},
632 \@MATEFILES,
633 \@MATEFILES_REF,
634 $MATEFILES[$file].".density",
635 $CONF{general}{input_format});
636
637 $pm->finish;
638
639 }
640 $pm->wait_all_children;
641
642 #Merge the chromosome links file into only one
643 my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!";
644 chomp(@DENSITYFILES);
645 catFiles( \@DENSITYFILES => "$output_prefix.density" );
646
647 print LOG "# cnv end procedure : output created: $output_prefix.density\n";
648
649
650 undef %CHR;
651 undef %CHRID;
652
653 }
654 #------------------------------------------------------------------------------#
655 #------------------------------------------------------------------------------#
656 #MAIN FUNCTION number 8: Circos format conversion for cnv ratios
657 sub ratio2circos{
658
659 my $input_file=$CONF{general}{mates_file}.".density";
660
661 my @path=split(/\//,$input_file);
662 $input_file=$CONF{general}{output_dir}.$path[$#path];
663
664 my $output_file.=$input_file.".segdup.txt";
665
666 ratio2segdup($CONF{circos}{organism_id},
667 $input_file,
668 $output_file);
669 }
670 #------------------------------------------------------------------------------#
671 #MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios
672 sub ratio2bedgraph{
673
674 my $input_file=$CONF{general}{mates_file}.".density";
675
676 my @path=split(/\//,$input_file);
677 $input_file=$CONF{general}{output_dir}.$path[$#path];
678
679 my $output_file.=$input_file.".bedgraph.txt";
680
681 ratio2bedfile($input_file,
682 $output_file); #bed file output
683 }
684 #------------------------------------------------------------------------------#
685 #------------------------------------------------------------------------------#
686 #Creation of the fragment library
687 sub shearingChromosome{
688
689 print LOG "# Making the fragments library...\n";
690
691 my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters
692
693 createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name
694
695 foreach my $k (1..$chr->{nb_chrs}){
696
697 print LOG"-- $chr->{$k}->{name}\n";
698
699 my $frag=1;
700 for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){
701
702 my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1;
703 $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage
704
705 if($end==($chr->{$k}->{length})-1){
706 $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome
707 last;
708 }
709 $frag++;
710 }
711 }
712 }
713 #------------------------------------------------------------------------------#
714 #------------------------------------------------------------------------------#
715 #Creation of chromosome hash tables from the cmap file
716 sub createChrHashTables{
717
718 my ($chr,$chrID,$cmap_file)=@_;
719 $chr->{nb_chrs}=0;
720
721 open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n";
722 while(<CMAP>){
723
724 if(/^\s+$/){ next;}
725 my ($k,$name,$length) = split;
726 $chr->{$k}->{name}=$name;
727 $chr->{$k}->{length}=$length;
728 $chrID->{$name}=$k;
729 $chr->{nb_chrs}++;
730
731 }
732 close CMAP;
733 }
734 #------------------------------------------------------------------------------#
735 #------------------------------------------------------------------------------#
736 #Read the mate file according the input format file (solid, eland or sam)
737 sub readMateFile{
738
739 my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_;
740 my ($strand1,$strand2);
741
742 if($file_type eq "solid"){
743
744 ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based
745
746 }else{
747 my ($tag_length1,$tag_length2);
748 ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based
749 if($file_type eq "eland");
750
751 if($file_type eq "sam"){
752
753 return 0 if ($$t[0]=~/^@/); #header sam filtered out
754
755 ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]);
756
757 return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008));
758
759 $$chr2=$$chr1 if($$chr2 eq "=");
760
761 $strand1 = (($$t[1]&0x0010))? 'R':'F';
762 $strand2 = (($$t[1]&0x0020))? 'R':'F';
763
764 $$order1= (($$t[1]&0x0040))? '1':'2';
765 $$order2= (($$t[1]&0x0080))? '1':'2';
766 $tag_length1 = $tag_length->{$$order1};
767 $tag_length2 = $tag_length->{$$order2};
768 }
769
770 $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts
771 $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R");
772 }
773 return 1;
774 }
775 #------------------------------------------------------------------------------#
776 #------------------------------------------------------------------------------#
777 #Parsing of the mates files and creation of links between 2 chromosomal fragments
778 sub linking{
779
780 my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_;
781 my %link;
782
783 my $record=0;
784 my $nb_links=0;
785 my $warn=10000;
786
787 my @sfile=split(/\./,$mates_file);
788 my $fchr=$sfile[$#sfile];
789
790 my $fh = new FileHandle;
791
792 print LOG "# $fchr : Linking procedure...\n";
793 print LOG "-- file=$mates_file\n".
794 "-- chromosome=$fchr\n".
795 "-- input format=$input_format\n".
796 "-- type=$sv_type\n".
797 "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n".
798 "-- window size=$window_dist, step length=$step\n";
799
800 if ($mates_file =~ /.gz$/) {
801 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
802 }elsif($mates_file =~ /.bam$/){
803 $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
804 }else{
805 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
806 }
807
808
809 while(<$fh>){
810
811 my @t=split; #for each mate-pair
812 my $mate=$t[0];
813 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
814
815 next if(exists $$pair{$mate});
816
817 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
818
819 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
820
821 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
822
823 if($sv_type ne "all"){
824 if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) ||
825 ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){
826 }else{
827 next;
828 }
829 }
830
831 $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function)
832
833 $record++;
834
835 my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read
836
837 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
838 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
839
840 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation
841
842 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
843
844 if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){
845
846 for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){
847
848 if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) {
849
850 if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){
851
852 makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link
853 }
854
855 }else{
856
857 $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step);
858 }
859 }
860 }
861
862 }else{
863
864 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
865 }
866 }
867
868 if($record>=$warn){
869 print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n";
870 $warn+=10000;
871 }
872 }
873 $fh->close;
874
875 if(!$nb_links){
876 print LOG "-- $fchr : No mate-pairs !\n".
877 "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n";
878 }
879
880 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n";
881
882 print LOG "-- $fchr : writing...\n";
883
884 $fh = new FileHandle;
885
886 $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n";
887
888 foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output
889
890 foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){
891
892 foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){
893
894 foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){
895
896 my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2});
897 print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t".
898 "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t".
899 scalar @count."\t". #nb of read
900 $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list
901 }
902 }
903 }
904 }
905
906 $fh->close;
907
908 undef %link;
909
910 }
911 #------------------------------------------------------------------------------#
912 #------------------------------------------------------------------------------#
913 #remove exact links doublons according to the mate list
914 sub getUniqueLinks{
915
916 my ($links_file,$nrlinks_file)=@_;
917 my %links;
918 my %pt;
919 my $nb_links;
920 my $n=1;
921
922 my $record=0;
923 my $warn=300000;
924
925 my @sfile=split(/\./,$links_file);
926 my $fchr=$sfile[$#sfile-2];
927
928 my $fh = new FileHandle;
929
930 print LOG "# $fchr : Getting unique links...\n";
931 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
932
933 while(<$fh>){
934
935 my @t=split;
936 my $mates=$t[7];
937 $record++;
938
939 if(!exists $links{$mates}){ #Unique links selection
940
941 $links{$mates}=[@t];
942 $pt{$n}=$links{$mates};
943 $n++;
944
945
946 }else{ #get the link coordinates from the mate-pairs list
947
948 for my $i (1,2,4,5){ #get the shortest regions
949
950 $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start
951 if($i==1 || $i==4);
952 $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end
953 if($i==2 || $i==5);
954 }
955 }
956 if($record>=$warn){
957 print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n";
958 $warn+=300000;
959 }
960 }
961 $fh->close;
962
963 $nb_links=$n-1;
964 print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n";
965
966 $fh = new FileHandle;
967 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
968 print LOG "-- $fchr : writing...\n";
969 for my $i (1..$nb_links){
970
971 print $fh join("\t",@{$pt{$i}})."\n"; #all links output
972 }
973
974 $fh->close;
975
976 undef %links;
977 undef %pt;
978
979 }
980 #------------------------------------------------------------------------------#
981 #------------------------------------------------------------------------------#
982 #get the new coordinates of each link from the mate list
983 sub defineCoordsLinks{
984
985 my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_;
986
987 my @sfile=split(/\./,$links_file);
988 my $fchr=$sfile[$#sfile-2];
989
990 my $fh = new FileHandle;
991 my $fh2 = new FileHandle;
992
993 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
994 $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n";
995
996 print LOG "# $fchr : Defining precise link coordinates...\n";
997
998 my $record=0;
999 my $warn=100000;
1000
1001 my %coords;
1002 my %strands;
1003 my %order;
1004 my %ends_order;
1005
1006 while(<$fh>){
1007
1008
1009 my ($col1,$col2)=(1,2); #for an intrachromosomal link
1010 my $diffchr=0; #difference between chr1 and chr2
1011 my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8];
1012 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
1013 if ($chr1 != $chr2){ #for an interchromosomal link
1014 $col1=$col2=0; #no distinction
1015 $diffchr=1;
1016 }
1017
1018 my @pairs=split(",",$mates_list);
1019
1020 $coords{$col1}{$chr1}->{start}=undef;
1021 $coords{$col1}{$chr1}->{end}=undef;
1022 $coords{$col2}{$chr2}->{start}=undef;
1023 $coords{$col2}{$chr2}->{end}=undef;
1024 $strands{$col1}{$chr1}=undef;
1025 $strands{$col2}{$chr2}=undef;
1026 $ends_order{$col1}{$chr1}=undef;
1027 $ends_order{$col2}{$chr2}=undef;
1028
1029
1030 $order{$col1}{$chr1}->{index}->{1}=undef;
1031 $order{$col1}{$chr1}->{index}->{2}=undef;
1032 $order{$col2}{$chr2}->{index}->{1}=undef;
1033 $order{$col2}{$chr2}->{index}->{2}=undef;
1034 $order{$col1}{$chr1}->{order}=undef;
1035 $order{$col2}{$chr2}->{order}=undef;
1036
1037 $record++;
1038
1039 for my $p (0..$#pairs){ #for each pair
1040
1041 my ($coord_start_read1,$coord_end_read1);
1042 my ($coord_start_read2,$coord_end_read2);
1043 my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format);
1044 my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format);
1045
1046 if(!$diffchr){ #for a intrachromosomal link
1047 if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column
1048 ($col1,$col2)=(2,1);
1049 }else{
1050 ($col1,$col2)=(1,2);
1051 }
1052 }
1053
1054 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads
1055 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1);
1056 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2);
1057 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2);
1058 push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1);
1059 push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2);
1060 push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]);
1061 push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]);
1062 }
1063
1064 ($col1,$col2)=(1,2) if(!$diffchr);
1065
1066 my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region
1067 my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}}));
1068 my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}}));
1069 my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}}));
1070
1071 @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}};
1072 @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}};
1073
1074 foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting)
1075 foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){
1076
1077 if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){
1078 ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i;
1079 ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j;
1080 last;
1081 }
1082 }
1083 }
1084
1085 foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID
1086 foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){
1087
1088 if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){
1089 ${$order{$col1}{$chr1}->{order}}[$i]=$i+1;
1090 ${$order{$col2}{$chr2}->{order}}[$i]=$j+1;
1091 last;
1092 }
1093 }
1094 }
1095
1096 @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads
1097 @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1});
1098 @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2});
1099 @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1});
1100 @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2});
1101 @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start});
1102 @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start});
1103
1104
1105 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
1106 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
1107 scalar @pairs,
1108 join(",",@pairs),
1109 join(",",@{$strands{$col1}{$chr1}}),
1110 join(",",@{$strands{$col2}{$chr2}}),
1111 join(",",@{$ends_order{$col1}{$chr1}}),
1112 join(",",@{$ends_order{$col2}{$chr2}}),
1113 join(",",@{$order{$col1}{$chr1}->{order}}),
1114 join(",",@{$order{$col2}{$chr2}->{order}}),
1115 join(",",@{$coords{$col1}{$chr1}->{start}}),
1116 join(",",@{$coords{$col2}{$chr2}->{start}}));
1117
1118 print $fh2 join("\t",@link)."\n";
1119
1120 if($record>=$warn){
1121 print LOG "-- $fchr : $warn links processed\n";
1122 $warn+=100000;
1123 }
1124 }
1125 $fh->close;
1126 $fh2->close;
1127
1128 print LOG "-- $fchr : Total : $record links processed\n";
1129
1130 }
1131 #------------------------------------------------------------------------------#
1132 #------------------------------------------------------------------------------#
1133 #Sort links according the concerned chromosomes and their coordinates
1134 sub sortLinks{
1135
1136 my ($links_file,$sortedlinks_file,$unique)=@_;
1137
1138 my @sfile=split(/\./,$links_file);
1139 my $fchr=$sfile[$#sfile-2];
1140
1141
1142 print LOG "# $fchr : Sorting links...\n";
1143
1144 my $pipe=($unique)? "| sort -u":"";
1145 system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file";
1146
1147 }
1148 #------------------------------------------------------------------------------#
1149 #------------------------------------------------------------------------------#
1150 #removal of fully overlapped links
1151 sub removeFullyOverlappedLinks{
1152
1153 my ($links_file,$nrlinks_file,$warn_out)=@_;
1154
1155 my %pt;
1156 my $n=1;
1157
1158 my @sfile=split(/\./,$links_file);
1159 my $fchr=$sfile[$#sfile-2];
1160
1161 my $fh = new FileHandle;
1162
1163 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
1164 while(<$fh>){
1165
1166 my @t=split("\t",$_);
1167 $pt{$n}=[@t];
1168 $n++;
1169 }
1170 $fh->close;
1171
1172 my $nb_links=$n-1;
1173 my $nb=$nb_links;
1174
1175 my %pt2;
1176 my $nb2=1;
1177 my $record=0;
1178 my $warn=10000;
1179
1180 print LOG "# $fchr : Removing fully overlapped links...\n";
1181
1182 LINK:
1183
1184 for my $i (1..$nb){
1185
1186 my @link=();
1187 my @next_link=();
1188 my $ind1=$i;
1189
1190 $record++;
1191 if($record>=$warn){
1192 print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n";
1193 $warn+=10000;
1194 }
1195
1196 if(exists $pt{$ind1}){
1197 @link=@{$pt{$ind1}}; #link1
1198 }else{
1199 next LINK;
1200 }
1201
1202 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1
1203 my @mates=deleteBadOrderSensePairs(split(",",$link[7]));
1204
1205 my $ind2=$ind1+1;
1206 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link
1207
1208 if($ind2<=$nb){
1209
1210 @next_link=@{$pt{$ind2}}; #link2
1211 my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2
1212 my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
1213
1214 while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter
1215
1216 if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2
1217
1218 $ind2++;
1219 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
1220
1221 if($ind2>$nb){ #if no more link in the file -> save link1
1222
1223 $pt2{$nb2}=\@link;
1224 $nb2++;
1225 next LINK;
1226 }
1227
1228 @next_link=@{$pt{$ind2}};
1229 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
1230 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
1231 next;
1232 }
1233
1234 my %mates=map{$_ =>1} @mates; #get the equal number of mates
1235 my @same_mates = grep( $mates{$_}, @next_mates );
1236 my $nb_mates= scalar @same_mates;
1237
1238 if($nb_mates == scalar @mates){
1239
1240 delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1
1241 next LINK; #go to link2, link2 becomes link1
1242
1243 }else{
1244 delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2
1245 $ind2++; #we continue by checking the next link2
1246 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
1247
1248 if($ind2>$nb){ #if no more link in the file -> save link1
1249
1250 $pt2{$nb2}=\@link;
1251 $nb2++;
1252 next LINK;
1253 }
1254
1255 @next_link=@{$pt{$ind2}}; #get info of link2
1256 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
1257 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
1258
1259 }
1260 }
1261 }
1262 $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1
1263 $nb2++;
1264 }
1265
1266 print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n";
1267
1268 #OUTPUT
1269
1270 $fh = new FileHandle;
1271 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
1272 print LOG "-- $fchr : writing...\n";
1273 for my $i (1..$nb2-1){
1274
1275 print $fh join("\t",@{$pt2{$i}}); #all links output
1276 }
1277
1278 close $fh;
1279
1280 print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out);
1281
1282 undef %pt;
1283 undef %pt2;
1284 }
1285 #------------------------------------------------------------------------------#
1286 sub postFiltering {
1287
1288 my ($links_file,$pflinks_file, $finalScore_thres)=@_;
1289
1290 my @sfile=split(/\./,$links_file);
1291 my $fchr=$sfile[$#sfile-2];
1292
1293
1294 my ($nb,$nb2)=(0,0);
1295
1296 print LOG "# $fchr : Post-filtering links...\n";
1297 print LOG "-- $fchr : final score threshold = $finalScore_thres\n";
1298
1299 my $fh = new FileHandle;
1300 my $fh2 = new FileHandle;
1301
1302 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
1303 $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n";
1304
1305
1306 while(<$fh>){
1307
1308 my @t=split("\t",$_);
1309 my $score=$t[$#t-1];
1310
1311 if($score >= $finalScore_thres){
1312 print $fh2 join("\t", @t);
1313 $nb2++;
1314 }
1315 $nb++;
1316 }
1317 $fh->close;
1318 $fh2->close;
1319
1320 print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n";
1321 print LOG "-- $fchr : output created: $pflinks_file\n";
1322 }
1323
1324
1325
1326 #------------------------------------------------------------------------------#
1327 #------------------------------------------------------------------------------#
1328 #Filtering of the links
1329 sub strandFiltering{
1330
1331 my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes,
1332 $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_;
1333
1334 my @sfile=split(/\./,$links_file);
1335 my $fchr=$sfile[$#sfile-1];
1336
1337
1338 my %chrs;
1339 my %chrs1;
1340 my %chrs2;
1341 my $nb_chrs;
1342 my $exclude;
1343
1344 if($chromosomes){
1345 my @chrs=split(",",$chromosomes);
1346 $nb_chrs=scalar @chrs;
1347 $exclude=($chrs[0]=~/^\-/)? 1:0;
1348 for my $chrName (@chrs){
1349 $chrName=~s/^(\-)//;
1350 my $col=($chrName=~s/_(1|2)$//);
1351
1352 if(!$col){
1353 $chrs{$chrID->{$chrName}}=undef
1354 }else{
1355 $chrs1{$chrID->{$chrName}}=undef if($1==1);
1356 $chrs2{$chrID->{$chrName}}=undef if($1==2);
1357 }
1358 }
1359 }
1360
1361 my $record=0;
1362 my $nb_links=0;
1363 my $warn=10000;
1364
1365 my $sens_ratio_threshold=0.6;
1366
1367 print LOG "\# Filtering procedure...\n";
1368 print LOG "\# Number of pairs and strand filtering...\n";
1369 print LOG "-- file=$links_file\n";
1370 print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no").
1371 ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n";
1372
1373
1374
1375 my $fh = new FileHandle;
1376 my $fh2 = new FileHandle;
1377
1378 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
1379 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
1380
1381 while(<$fh>){
1382
1383 my @t=split; #for each link
1384 my $is_good=1;
1385 $record++;
1386
1387
1388 if($chromosomes){
1389
1390 my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]});
1391
1392 if(!$exclude){
1393 $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0;
1394 $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good);
1395 $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good);
1396 }else{
1397 $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1;
1398 $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good);
1399 }
1400 }
1401
1402 $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs
1403 if($is_good && $strand_filtering){ #if filtering according the strand sense
1404
1405 my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense
1406 my @strands1=split(/,/,$t[8]);
1407 my @strands2=split(/,/,$t[9]);
1408
1409 my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0);
1410
1411 my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR
1412 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF
1413
1414 my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE",
1415 substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE",
1416 inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE");
1417
1418 if($t[6] == 1){
1419
1420 push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1);
1421
1422 }else{
1423
1424 tie (my %class,'Tie::IxHash');
1425 my $split;
1426
1427 foreach my $i (0..$#mates){
1428 $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group
1429 }
1430
1431 my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR};
1432 my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF};
1433 my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class);
1434
1435 if($sens_ratio < $sens_ratio_threshold){
1436 %class=(1=>'FF', 2=>'FR');
1437 $split=1;
1438 }else{
1439 $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class
1440 $split=0;
1441 }
1442
1443 $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold);
1444 }
1445 }
1446
1447 if($is_good){ #PRINT
1448
1449 my $nb=scalar @t;
1450 if($nb > 20){
1451 my @t2=splice(@t,0,20);
1452 print $fh2 join("\t",@t2)."\n";
1453 $nb_links++;
1454 }
1455 $nb_links++;
1456 print $fh2 join("\t",@t)."\n";
1457 }
1458
1459 if($record>=$warn){
1460 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
1461 $warn+=10000;
1462 }
1463 }
1464 $fh->close;
1465 $fh2->close;
1466
1467 print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links);
1468
1469 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
1470
1471
1472 }
1473 #------------------------------------------------------------------------------#
1474 #------------------------------------------------------------------------------#
1475 sub getConsistentSenseLinks{
1476
1477 my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_;
1478
1479 my $npairs=scalar @$mates;
1480
1481 my @ends_order1 = split (/,/,$$t[10]);
1482 my @ends_order2 = split (/,/,$$t[11]);
1483 my @order1 = split (/,/,$$t[12]);
1484 my @order2 = split (/,/,$$t[13]);
1485 my @positions1 = split (/,/,$$t[14]);
1486 my @positions2 = split (/,/,$$t[15]);
1487
1488 my @newlink;
1489
1490 foreach my $ind (keys %{$class} ){
1491
1492 tie (my %flink,'Tie::IxHash');
1493 my @orders2remove=();
1494
1495 foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group
1496
1497 if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){
1498 push(@{$flink{mates}},$$mates[$i]);
1499 push(@{$flink{strands1}},$$strands1[$i]);
1500 push(@{$flink{strands2}},$$strands2[$i]);
1501 push(@{$flink{ends_order1}},$ends_order1[$i]);
1502 push(@{$flink{ends_order2}},$ends_order2[$i]);
1503 push(@{$flink{positions1}},$positions1[$i]);
1504 push(@{$flink{positions2}},$positions2[$i]);
1505
1506 }else{
1507 push(@orders2remove,$order1[$i]);
1508 }
1509 }
1510
1511 @{$flink{order1}}=();
1512 @{$flink{order2}}=();
1513 if(scalar @orders2remove > 0){
1514 getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2})
1515 }else{
1516 @{$flink{order1}}=@order1;
1517 @{$flink{order2}}=@order2;
1518 }
1519
1520 my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length);
1521 my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length);
1522
1523 my $fnpairs=scalar @{$flink{mates}};
1524 my $strand_filtering_ratio=$fnpairs."/".$npairs;
1525 my $real_ratio=$fnpairs/$npairs;
1526
1527 if($fnpairs>=$thres){ #filtering according the number of pairs
1528
1529 push(@newlink,
1530 $$t[0],
1531 min(min(@{$flink{positions1}}),min(@ends1)),
1532 max(max(@{$flink{positions1}}),max(@ends1)),
1533 $$t[3],
1534 min(min(@{$flink{positions2}}),min(@ends2)),
1535 max(max(@{$flink{positions2}}),max(@ends2)),
1536 $fnpairs,
1537 join(",",@{$flink{mates}}),
1538 join(",",@{$flink{strands1}}),
1539 join(",",@{$flink{strands2}}),
1540 join(",",@{$flink{ends_order1}}),
1541 join(",",@{$flink{ends_order2}}),
1542 join(",",@{$flink{order1}}),
1543 join(",",@{$flink{order2}}),
1544 join(",",@{$flink{positions1}}),
1545 join(",",@{$flink{positions2}}),
1546 $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]},
1547 $strand_filtering_ratio,
1548 $real_ratio,
1549 $npairs
1550 );
1551 }
1552 }
1553
1554 if (grep {defined($_)} @newlink) {
1555 @$t=@newlink;
1556 return 1
1557 }
1558 return 0;
1559
1560 }
1561 #------------------------------------------------------------------------------#
1562 #------------------------------------------------------------------------------#
1563 sub getNewOrders{
1564
1565 my($tab1,$tab2,$list,$newtab1,$newtab2)=@_;
1566 my $j=1;
1567 my $k=1;
1568 for my $i (0..$#{$tab2}){
1569 my $c=0;
1570 for my $j (0..$#{$list}){
1571 $c++ if(${$list}[$j] < ${$tab2}[$i]);
1572 if(${$list}[$j] == ${$tab2}[$i]){
1573 $c=-1; last;
1574 }
1575 }
1576 if($c!=-1){
1577 push(@{$newtab2}, ${$tab2}[$i]-$c);
1578 push(@{$newtab1}, $k);
1579 $k++;
1580 }
1581 }
1582 }
1583
1584 #------------------------------------------------------------------------------#
1585 #------------------------------------------------------------------------------#
1586 #Filtering of the links using their order
1587 sub orderFiltering {
1588
1589 my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
1590
1591 my @sfile=split(/\./,$links_file);
1592 my $fchr=$sfile[$#sfile-2];
1593
1594
1595 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
1596
1597 my $record=0;
1598 my $warn=10000;
1599 my $nb_links=0;
1600
1601 my $quant05 = 1.644854;
1602 my $quant001 = 3.090232;
1603 my $alphaDist = $quant05 * 2 * $sigma;
1604 my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
1605
1606 print LOG "\# Filtering by order...\n";
1607 print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n";
1608 print LOG "-- distance between comparable pairs was set to $alphaDist\n";
1609 print LOG "-- maximal fragment length was set to $maxFragmentLength\n";
1610
1611
1612 my $fh = new FileHandle;
1613 my $fh2 = new FileHandle;
1614
1615 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
1616 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
1617
1618 while(<$fh>){
1619
1620 $record++;
1621 my @t = split;
1622 my ($chr1,$chr2,$mates_list)=@t[0,3,7];
1623 my @pairs=split(",",$mates_list);
1624 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
1625 my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5];
1626 my $numberOfPairs = $t[6];
1627 my @strand1 = split (/,/,$t[8]);
1628 my @strand2 = split (/,/,$t[9]);
1629 my @ends_order1 = split (/,/,$t[10]);
1630 my @ends_order2 = split (/,/,$t[11]);
1631 my @order1 = split (/,/,$t[12]);
1632 my @order2 = split (/,/,$t[13]);
1633 my @positions1 = split (/,/,$t[14]);
1634 my @positions2 = split (/,/,$t[15]);
1635 my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length);
1636 my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length);
1637 my $clusterCoordinates_chr1;
1638 my $clusterCoordinates_chr2;
1639 my $reads_left = 0;
1640
1641 my $ifRenv = $t[16];
1642 my $strand_ratio_filtering=$t[17];
1643
1644 #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct
1645 my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense);
1646 #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct
1647 #These pairs are annotated as BED pairs forever! They won't be recycled!
1648 my $table;
1649 for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs
1650 my $nonAdeq = 0;
1651 for my $j (0..$i-1) {
1652 if (exists($table->{$j}->{$i})) {
1653 $nonAdeq++;
1654 $table->{$i}->{$j} = 1;
1655 }
1656 }
1657 for my $j ($i+1..$numberOfPairs-1) {
1658 if ($positions1[$j]-$positions1[$i]>$alphaDist) {
1659 if (&reversed ($i,$j,$ifRenv,\@positions2)) {
1660 $nonAdeq++;
1661 $table->{$i}->{$j} = 1;
1662 }
1663 }
1664 }
1665 $table->{$i}->{nonAdeq} = $nonAdeq;
1666 }
1667
1668 for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation
1669 &remove($bad,$table);
1670 }
1671
1672 my @falseReads;
1673 #RRRR-F -> RRRR or R-FFFF -> FFFF
1674 @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table});
1675 #these pairs will be recycled later as $secondTable
1676 for my $bad (@falseReads) {
1677 &remove($bad,$table);
1678 }
1679
1680 my $bad = &check($table);
1681 while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER
1682 # push (@falseReads, $bad); remove completely!!!
1683 &remove($bad,$table);
1684 $bad = &check($table);
1685 }
1686
1687 $reads_left = scalar keys %{$table};
1688 my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}]));
1689 my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}]));
1690 my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}]));
1691 my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}]));
1692
1693 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
1694 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
1695
1696 my $ifBalanced = 'UNBAL';
1697 my $secondTable;
1698 my $clusterCoordinates;
1699 my ($break_pont_chr1,$break_pont_chr2);
1700
1701 my $signatureType="";
1702
1703 my $maxCoord1 =$chr->{$chr1}->{length};
1704 my $maxCoord2 =$chr->{$chr2}->{length};
1705
1706 if (scalar @falseReads) {
1707 @falseReads = sort @falseReads;
1708 #now delete FRFR choosing the majority
1709 my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF
1710 @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled
1711 my %hashTmp;
1712 for my $count1 (0..scalar(@falseReads)-1) {
1713 my $i = $falseReads[$count1];
1714 $hashTmp{$i} = 1;
1715 for my $bad (@newfalseReads) {
1716 if ($bad == $i) {
1717 delete $hashTmp{$i};
1718 next;
1719 }
1720 }
1721 }
1722 @falseReads = sort keys %hashTmp; #what is left
1723 for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected
1724 my $nonAdeq = 0;
1725 my $i = $falseReads[$count1];
1726
1727 for my $count2 (0..$count1-1) {
1728 my $j = $falseReads[$count2];
1729 if (exists($secondTable->{$j}->{$i})) {
1730 $nonAdeq++;
1731 $secondTable->{$i}->{$j} = 1;
1732 }
1733 }
1734 for my $count2 ($count1+1..scalar(@falseReads)-1) {
1735 my $j = $falseReads[$count2];
1736 if ($positions1[$j]-$positions1[$i]>$alphaDist) {
1737 if (&reversed ($i,$j,$ifRenv,\@positions2)) {
1738 $nonAdeq++;
1739 $secondTable->{$i}->{$j} = 1;
1740 }
1741 }
1742 }
1743 $secondTable->{$i}->{nonAdeq} = $nonAdeq;
1744 }
1745
1746 my @falseReads2;
1747 my $bad = &check($secondTable);
1748 while ($bad ne "OK") { #clear the table to reject non adequate pairs
1749 push (@falseReads2, $bad);
1750 &remove($bad,$secondTable);
1751 $bad = &check($secondTable);
1752 }
1753 if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) {
1754 my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
1755 my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
1756 my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
1757 my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
1758
1759 $ifBalanced = 'BAL';
1760
1761 if ($ifBalanced eq 'BAL') {
1762
1763 if (scalar keys %{$table} < $nb_pairs_order_threshold) {
1764 $ifBalanced = 'UNBAL'; #kill cluster 1!
1765 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
1766 $reads_left = scalar keys %{$table};
1767 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
1768 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
1769 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
1770 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
1771 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
1772 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
1773
1774 } else {
1775
1776 $reads_left += scalar keys %{$secondTable};
1777 next if ($reads_left < $nb_pairs_threshold);
1778
1779 if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) {
1780 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
1781
1782 ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1);
1783 ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1);
1784 ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1);
1785 ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1);
1786
1787 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1;
1788 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2;
1789 }else {
1790 $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')';
1791 $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')';
1792 }
1793 $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2);
1794 $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2);
1795 $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2);
1796 $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2);
1797 #to calculate breakpoints one need to take into account read orientation in claster..
1798 my $leftLetterOk = substr($mate_sense, 0, 1); #R
1799 my $rightLetterOk = substr($mate_sense, 1, 1); #F
1800
1801
1802 my @index1 = keys %{$table};
1803 my @index2 = keys %{$secondTable};
1804
1805 my (@generalStrand1,@generalStrand2) = 0;
1806
1807 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
1808 $leftLetterOk = 'R';
1809 $rightLetterOk = 'F';
1810 @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1);
1811 @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2);
1812 } else {
1813 @generalStrand1 = @strand1;
1814 @generalStrand2 = @strand2; # TODO check if it is correct
1815 }
1816 if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F)
1817 $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')';
1818
1819 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
1820 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
1821 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
1822 $signatureType = "TRANSLOC";
1823 } else {
1824 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
1825 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
1826 $signatureType = "INS_FRAGMT";
1827 }
1828
1829 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
1830 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
1831 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
1832 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
1833 $signatureType = "INV_INS_FRAGMT";
1834 } else {
1835 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
1836 $signatureType = "INV_TRANSLOC";
1837 }
1838 } else {
1839 #should not occur
1840 print STDERR "\nError in orderFiltering\n\n";
1841 }
1842 }
1843
1844 elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R)
1845 $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')';
1846 $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')';
1847 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
1848 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
1849 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
1850 $signatureType = "INV_INS_FRAGMT";
1851 } else {
1852 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
1853 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
1854 $signatureType = "INV_COAMPLICON";
1855 }
1856
1857 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
1858 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
1859 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
1860 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
1861 $signatureType = "COAMPLICON";
1862 } else {
1863 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
1864 $signatureType = "INS_FRAGMT";
1865 }
1866 } else {
1867 #should not occur
1868 $signatureType = "UNDEFINED";
1869 }
1870 }
1871 else { # (F,F) or (R,R) something strange. We will discard the smallest cluster
1872 $ifBalanced = 'UNBAL';
1873 if (scalar keys %{$secondTable} > scalar keys %{$table}) {
1874 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
1875
1876 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
1877 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
1878 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
1879 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
1880 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
1881 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
1882 }
1883 $reads_left = scalar keys %{$table};
1884 }
1885 if ($ifBalanced eq 'BAL') {
1886 $ifRenv = $signatureType;
1887 }
1888 }
1889 }
1890 }
1891 }
1892 if ($ifBalanced ne 'BAL') {
1893 #define possible break point
1894 $coord_start_chr1 = $coord_start_chr1_cluster1;
1895 $coord_end_chr1 = $coord_end_chr1_cluster1;
1896 $coord_start_chr2 = $coord_start_chr2_cluster1;
1897 $coord_end_chr2 = $coord_end_chr2_cluster1;
1898
1899 my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1;
1900 my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2;
1901
1902 my $leftLetterOk = substr($mate_sense, 0, 1); #R
1903 my $rightLetterOk = substr($mate_sense, 1, 1); #F
1904
1905 my @index = keys %{$table};
1906 unless ($diff_sense_ends) {
1907 my $firstEndOrder1 = $ends_order1[$index[0]];
1908 my $firstEndOrder2 = $ends_order2[$index[0]];
1909 $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
1910 $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
1911 } else {
1912 $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
1913 $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
1914 }
1915
1916 if ($chr1 ne $chr2){
1917 $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE");
1918 $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE");
1919 }
1920 }
1921
1922 if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) {
1923 next; #discard the link
1924 }
1925 if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) {
1926 next; #discard the link
1927 }
1928 my $ratioTxt = "$reads_left/".(scalar @pairs);
1929 my ($n1,$nTot) = split ("/",$strand_ratio_filtering);
1930 my $ratioReal = $reads_left/$nTot;
1931
1932 if ($coord_start_chr1<=0) {
1933 $coord_start_chr1=1;
1934 }
1935 if ($coord_start_chr2<=0) {
1936 $coord_start_chr2=1;
1937 }
1938 #create output
1939 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
1940 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
1941 $reads_left,
1942 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs),
1943 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1),
1944 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2),
1945 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1),
1946 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2),
1947 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1),
1948 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2),
1949 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1),
1950 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2),
1951 $ifRenv,
1952 $strand_ratio_filtering,
1953 $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2,
1954 $ratioReal, $nTot);
1955
1956 $nb_links++;
1957 print $fh2 join("\t",@link)."\n";
1958
1959 if($record>=$warn){
1960 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
1961 $warn+=10000;
1962 }
1963
1964 }
1965 $fh->close;
1966 $fh2->close;
1967
1968 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
1969
1970 }
1971 #------------------------------------------------------------------------------#
1972 #------------------------------------------------------------------------------#
1973 #gets information about ends positions given start, direction and order
1974 sub getEnds {
1975 my ($ends,$starts,$strand,$end_order,$tag_length) = @_;
1976 for my $i (0..scalar(@{$starts})-1) {
1977 $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length);
1978 }
1979 }
1980 sub getEnd {
1981 my ($start,$strand, $end_order,$tag_length) = @_;
1982 return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1;
1983 }
1984 #------------------------------------------------------------------------------#
1985 #------------------------------------------------------------------------------#
1986 #gets starts and ends Coords when start=leftmost given positions, directions and orders
1987 sub getCoordswithLeftMost {
1988
1989 my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_;
1990
1991 for my $i (0..scalar(@{$positions})-1) {
1992
1993 if($strand->[$i] eq 'F'){
1994 $starts->[$i]=$positions->[$i];
1995 $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1;
1996 }else{
1997 $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1;
1998 $ends->[$i]=$positions->[$i];
1999 }
2000 }
2001 }
2002 #------------------------------------------------------------------------------#
2003 #------------------------------------------------------------------------------#
2004 sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering
2005
2006 my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
2007
2008 my @sfile=split(/\./,$links_file);
2009 my $fchr=$sfile[$#sfile-2];
2010
2011
2012 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
2013
2014 my $record=0;
2015 my $nb_links=0;
2016 my $warn=10000;
2017
2018 print LOG "\# Filtering out normal pairs using insert size...\n";
2019 print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n";
2020 print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-".
2021 ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n";
2022 print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-".
2023 ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n";
2024 print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF");
2025
2026 my $fh = new FileHandle;
2027 my $fh2 = new FileHandle;
2028
2029 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
2030 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
2031
2032 while(<$fh>){
2033
2034 $record++;
2035 my @t = split;
2036 my ($chr1,$chr2,$mates_list)=@t[0,3,7];
2037
2038 if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all),
2039 $nb_links++;
2040
2041 $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE");
2042 $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE");
2043
2044 $t[16].= "\t";
2045 $t[19].= "\t";
2046
2047 print $fh2 join("\t",@t)."\n";
2048
2049 if($record>=$warn){
2050 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
2051 $warn+=10000;
2052 }
2053 next;
2054 }
2055
2056 my $ifRenv = $t[16];
2057 my $ifBalanced = "UNBAL";
2058 $ifBalanced = $t[18] if ($order_filtering);
2059
2060 my $numberOfPairs = $t[6];
2061 my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14]));
2062 my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15]));
2063
2064 if ($ifBalanced eq "BAL") {
2065
2066 if ($ifRenv eq "INV_TRANSLOC") {
2067 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
2068 }
2069 if ($ifRenv eq "NORMAL_SENSE") {
2070 $ifRenv = "TRANSLOC";
2071 }
2072 if ($ifRenv eq "REVERSE_SENSE") {
2073 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
2074 }
2075 $t[19].= "\t";
2076
2077 my $meanDistance = 0;
2078
2079 for my $i (0..$numberOfPairs-1) {
2080 $meanDistance += $positions2[$i]-$positions1[$i];
2081 }
2082 $meanDistance /= $numberOfPairs;
2083
2084 $t[16] = $ifRenv."\t".$meanDistance;
2085 #dont touch the annotation. It should be already OK.
2086
2087 } else {
2088 #only for unbalanced
2089
2090 my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]);
2091
2092 my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0].
2093 (deleteBadOrderSensePairs(split (/,/,$t[9])))[0];
2094 my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0].
2095 (deleteBadOrderSensePairs(split (/,/,$t[11])))[0];
2096
2097 my $indel_type = $ifRenv;
2098
2099 my $meanDistance = "N/A";
2100
2101 ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster
2102 \@positions1, #assign N/A to $indel_type if unknown
2103 \@positions2,
2104 $ifRenv,
2105 $ifoverlap,
2106 $indel_sigma_threshold,
2107 $dup_sigma_threshold,
2108 $singleton_sigma_threshold,
2109 $mu,
2110 $sigma,
2111 $ifBalanced,
2112 $ends_sense_class,
2113 $ends_order_class,
2114 $mate_sense,
2115 $diff_sense_ends,
2116 );
2117
2118 #filtering of pairs with distance inconsistant with the SV
2119 if ($ifRenv ne "REVERSE_SENSE") {
2120 my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length};
2121 my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length};
2122 $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,
2123 $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering);
2124 next if ($t[6] < $nb_pairs_threshold);
2125 }else{
2126 $t[19].= "\t";
2127 }
2128 $t[16] = $indel_type."\t".$meanDistance;
2129 }
2130
2131 $nb_links++;
2132
2133 print $fh2 join("\t",@t)."\n";
2134 if($record>=$warn){
2135 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
2136 $warn+=10000;
2137 }
2138 }
2139 $fh->close;
2140 $fh2->close;
2141
2142 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
2143
2144 }
2145 #------------------------------------------------------------------------------#
2146 #------------------------------------------------------------------------------#
2147 sub checkIndel {
2148
2149 my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold,
2150 $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_;
2151
2152 my $meanDistance = 0;
2153
2154 for my $i (0..$numberOfPairs-1) {
2155 $meanDistance += $positions2->[$i]-$positions1->[$i];
2156 }
2157 $meanDistance /= $numberOfPairs;
2158
2159 return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) );
2160
2161 return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE");
2162
2163 if($diff_sense_ends){
2164 return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
2165 return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense)));
2166 }else{
2167 return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) &&
2168 ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
2169 }
2170
2171 return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap);
2172
2173 return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ;
2174
2175 return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma);
2176 return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma);
2177
2178 return ($meanDistance, "UNDEFINED");
2179 }
2180 #------------------------------------------------------------------------------#
2181 #------------------------------------------------------------------------------#
2182 #sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size )
2183 sub recalc_t_usingInsertSizeInfo {
2184 my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_;
2185
2186 my @badPairs;
2187
2188 my @positions1 = getAllEntries($t->[14]);
2189 my @positions2 = getAllEntries($t->[15]);
2190
2191 if ($meanDistance < $mu) {
2192 for my $i (0..scalar(@positions1)-1) {
2193 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) {
2194 push(@badPairs,$i);
2195 }
2196 }
2197 } else {
2198 for my $i (0..scalar(@positions1)-1) {
2199 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) {
2200 push(@badPairs,$i);
2201 }
2202 }
2203 }
2204
2205 if (scalar (@badPairs)>0) {
2206 #print join("\t",@badPairs).": ".join("\t",@t)."\n";
2207 #remove these inconsistant links
2208 $t->[6] -= scalar(@badPairs); #numberOfPairs
2209 return if ($t->[6] < $nb_pairs_threshold);
2210
2211 $t->[7] = mark_values(\@badPairs, $t->[7]);
2212 $t->[8] = mark_values(\@badPairs, $t->[8]);
2213 $t->[9] = mark_values(\@badPairs, $t->[9]);
2214 $t->[10] = mark_values(\@badPairs, $t->[10]);
2215 $t->[11] = mark_values(\@badPairs, $t->[11]);
2216
2217 $t->[12] = mark_indexes(\@badPairs, $t->[12]);
2218 $t->[13] = mark_indexes(\@badPairs, $t->[13]);
2219
2220 $t->[14] = mark_values(\@badPairs, $t->[14]);
2221 $t->[15] = mark_values(\@badPairs, $t->[15]);
2222 $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio
2223 $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering);
2224 ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length);
2225 ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length);
2226
2227 #recalc breakpoints:
2228 my $quant001 = 3.090232;
2229 my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
2230 $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
2231 $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
2232 #recalc total ratio
2233 $t->[22] = $t->[6] / $t->[23] if ($order_filtering);
2234 $t->[18] = $t->[6] / $t->[19] unless ($order_filtering);
2235
2236 @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14]));
2237 @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15]));
2238
2239 $meanDistance = 0;
2240
2241 for my $i (0..scalar(@positions1)-1) {
2242 $meanDistance += $positions2[$i]-$positions1[$i];
2243 }
2244 $meanDistance /= scalar(@positions1);
2245
2246 } else {
2247 $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering);
2248 $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering);
2249
2250 } #nothing has been filtered
2251 return $meanDistance;
2252 }
2253
2254 sub recalculate_ratio {
2255 my ($left, $ratio) = @_;
2256 my @elements = split (/\//,$ratio);
2257 $elements[1]= $elements[0];
2258 $elements[0]=$left;
2259 return $ratio."\t".join("/",@elements);
2260 }
2261
2262 sub recalc_breakpoints {
2263 my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_;
2264 my $break_pont_chr;
2265
2266 my $leftLetterOk = substr($mate_sense, 0, 1); #R
2267 my $rightLetterOk = substr($mate_sense, 1, 1); #F
2268
2269
2270 my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
2271
2272 unless ($diff_sense_ends) {
2273 $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
2274 } else {
2275 $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
2276 }
2277 return $break_pont_chr;
2278 }
2279 sub recalculate_boundaries {
2280 my ($startString,$senseString,$endsOrderString,$tag_length) = @_;
2281 my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
2282 my @strands = deleteBadOrderSensePairs(split (/,/,$senseString));
2283 my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString));
2284 my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length);
2285 my $coord_start_cluster = min(min(@positions),min(@ends));
2286 my $coord_end_cluster = max(max(@positions),max(@ends));
2287 return ($coord_start_cluster,$coord_end_cluster);
2288 }
2289
2290 sub remove_indexes {
2291 my ($bads, $string) = @_;
2292 my @elements = deleteBadOrderSensePairs(split (/,/,$string));
2293 for my $i (reverse sort %{$bads}) {
2294 delete $elements[$i];
2295 }
2296 return "(".join(",",@elements).")";
2297 }
2298 ##add @ to to elements
2299 sub mark_values {
2300 my ($bads, $string) = @_;
2301 my @elements = getAllEntries($string);
2302 for my $i (@{$bads}) {
2303 $elements[$i] .= "@";
2304 }
2305 return "(".join(",",@elements).")";
2306 }
2307 ##add @ to to indexes
2308 sub mark_indexes {
2309 my ($bads, $string) = @_;
2310 my @elements = getAllEntries($string);
2311 for my $i ((0..scalar(@elements)-1)) {
2312 for my $j (@{$bads}) {
2313 $elements[$i] .= "@" if ($elements[$i] eq ($j+1));
2314 }
2315 }
2316
2317 return "(".join(",",@elements).")";
2318 }
2319
2320 #------------------------------------------------------------------------------#
2321 #------------------------------------------------------------------------------#
2322 sub redraw {
2323
2324 my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_;
2325
2326 my $out;
2327 my @first_arr;
2328 if ($ifBalanced eq 'BAL') {
2329 my @second_arr;
2330 my $lastPushed = 1;
2331 if ($type == 1) {
2332 for my $i (0 .. scalar(@{$arr})-1) {
2333 if (exists ($table->{$i})) {
2334 push(@first_arr,$arr->[$i]);
2335 $lastPushed = 1;
2336 }elsif (exists ($secondTable->{$i})) {
2337 push(@second_arr,$arr->[$i]);
2338 $lastPushed = 2;
2339 } elsif ($lastPushed == 1) {
2340 if (exists ($badInFRSense->{$i})) {
2341 push(@first_arr,$arr->[$i]."\$");
2342 }else {
2343 push(@first_arr,$arr->[$i]."*");
2344 }
2345 } elsif ($lastPushed == 2) {
2346 if (exists ($badInFRSense->{$i})) {
2347 push(@second_arr,$arr->[$i]."\$");
2348 }else {
2349 push(@second_arr,$arr->[$i]."*");
2350 }
2351 } else {print "Error!";exit;}
2352 }
2353 } else {
2354 for my $i (@{$arr}) {
2355 if (exists ($table->{$i-1})) {
2356 push(@first_arr,$i);
2357 $lastPushed = 1;
2358 }elsif (exists ($secondTable->{$i-1})) {
2359 push(@second_arr,$i);
2360 $lastPushed = 2;
2361 } elsif ($lastPushed == 1) {
2362 if (exists ($badInFRSense->{$i-1})) {
2363 push(@first_arr,$i."\$");
2364 }else {
2365 push(@first_arr,$i."*");
2366 }
2367 } elsif ($lastPushed == 2) {
2368 if (exists ($badInFRSense->{$i-1})) {
2369 push(@second_arr,$i."\$");
2370 }else {
2371 push(@second_arr,$i."*");
2372 }
2373 } else {print "Error!";exit;}
2374 }
2375 }
2376 $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')';
2377 }
2378 else {
2379 if ($type == 1) {
2380 for my $i (0 .. scalar(@{$arr})-1) {
2381 if (exists ($table->{$i})) {
2382 push(@first_arr,$arr->[$i]);
2383 } else {
2384 if (exists ($badInFRSense->{$i})) {
2385 push(@first_arr,$arr->[$i]."\$");
2386 }else {
2387 push(@first_arr,$arr->[$i]."*");
2388 }
2389 }
2390 }
2391 } else {
2392 for my $i (@{$arr}) {
2393 if (exists ($table->{$i-1})) {
2394 push(@first_arr,$i);
2395 } else {
2396 if (exists ($badInFRSense->{$i-1})) {
2397 push(@first_arr,$i."\$");
2398 }else {
2399 push(@first_arr,$i."*");
2400 }
2401 }
2402 }
2403 }
2404 $out = '('.join(",",@first_arr).')';
2405 }
2406 return $out;
2407 }
2408 #------------------------------------------------------------------------------#
2409 #------------------------------------------------------------------------------#
2410 sub check {
2411
2412 my $table = $_[0];
2413 my $bad = 'OK';
2414 my $max = 0;
2415 for my $i (sort {$a<=>$b} keys %{$table}) {
2416 unless ($table->{$i}->{nonAdeq} == 0) {
2417 if ($max<$table->{$i}->{nonAdeq}) {
2418 $max=$table->{$i}->{nonAdeq};
2419 $bad = $i;
2420 }
2421 }
2422 }
2423 return $bad;
2424 }
2425 #------------------------------------------------------------------------------#
2426 #------------------------------------------------------------------------------#
2427 sub reversed {
2428
2429 my ($i,$j,$ifRenv,$positions) = @_;
2430 if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){
2431 return 1;
2432 }
2433 return 0;
2434 }
2435 #------------------------------------------------------------------------------#
2436 #------------------------------------------------------------------------------#
2437 sub remove {
2438
2439 my ($bad,$table) = @_;
2440 for my $i (sort {$a<=>$b} keys %{$table}) {
2441 if ($bad == $i) {
2442 delete($table->{$i});;
2443 } else {
2444 if (exists($table->{$i}->{$bad})) {
2445 delete($table->{$i}->{$bad});
2446 $table->{$i}->{nonAdeq}--;
2447 }
2448 }
2449 }
2450 }
2451 #------------------------------------------------------------------------------#
2452 #------------------------------------------------------------------------------#
2453 sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs
2454
2455 my ($strand,$ends_order,$mate_sense,@keysLeft) = @_;
2456
2457 my $leftLetterOk = substr($mate_sense, 0, 1); #R
2458 my $rightLetterOk = substr($mate_sense, 1, 1); #F
2459
2460 my (@standardArray);
2461 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
2462 $leftLetterOk = 'R';
2463 $rightLetterOk = 'F';
2464 @standardArray = translateSolidToRF($strand,$ends_order);
2465 } else {
2466 @standardArray = @{$strand};
2467 }
2468
2469 my $ifR = 0;
2470 my @Rs;
2471
2472 for my $i (@keysLeft) {
2473 if ($standardArray[$i] eq $leftLetterOk) {
2474 $ifR++;
2475 push(@Rs,$i);
2476 }
2477 }
2478
2479
2480 my $ifF = 0;
2481 my @Fs;
2482
2483 for my $i (@keysLeft) {
2484 if ($standardArray[$i] eq $rightLetterOk) {
2485 $ifF++;
2486 push(@Fs,$i);
2487 }
2488 }
2489
2490 if($ifR>=$ifF) {
2491 return @Fs;
2492 }
2493 return @Rs;
2494 }
2495
2496 #------------------------------------------------------------------------------#
2497 #------------------------------------------------------------------------------#
2498 sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa
2499
2500 my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]);
2501 my $mate_sense = $_[6];
2502
2503 my $leftLetterOk = substr($mate_sense, 0, 1); #R
2504 my $rightLetterOk = substr($mate_sense, 1, 1); #F
2505
2506 my (@standardArray1,@standardArray2);
2507
2508 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
2509 $leftLetterOk = 'R';
2510 $rightLetterOk = 'F';
2511 @standardArray1 = translateSolidToRF($strand1,$ends_order1);
2512 my @arr = getOrderedStrands($strand2,$order2);
2513 my @ends2 = getOrderedStrands($ends_order2,$order2);
2514 @standardArray2 = translateSolidToRF(\@arr,\@ends2);
2515
2516 } else {
2517 @standardArray1 = @{$strand1};
2518 @standardArray2 = getOrderedStrands($strand2,$order2);
2519 }
2520
2521 #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF
2522
2523 #for the first end:
2524
2525 my @array = @standardArray1;
2526 my %badInFRSense1;
2527 for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF
2528 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
2529 $badInFRSense1{$i}=1;
2530 $array[$i] = $rightLetterOk;
2531 }
2532 }
2533 my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1);
2534 @array = @standardArray1;
2535 my %badInFRSense0;
2536 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR
2537 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
2538 $badInFRSense0{$i-1}=1;
2539 $array[$i-1] = $leftLetterOk;
2540
2541 }
2542 }
2543 my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0);
2544
2545 #for the second end:
2546 @array = @standardArray2;
2547
2548 my %badInFRSense3;
2549 for my $i (1..scalar(@array)-1){
2550 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
2551 $badInFRSense3{$order2->[$i]}=1;
2552 $array[$i] = $rightLetterOk;
2553 }
2554 }
2555 my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3);
2556
2557 @array = @standardArray2;
2558 my %badInFRSense5;
2559 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF
2560 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
2561 $badInFRSense5{$i-1}=1;
2562 $array[$i-1] = $leftLetterOk;
2563 }
2564 }
2565 my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5);
2566
2567 if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) {
2568 return (1,%badInFRSense0);
2569 }
2570
2571 if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) {
2572 return (1,%badInFRSense1);
2573 }
2574
2575 if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) {
2576 return (2,%badInFRSense3);
2577 }
2578
2579 if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) {
2580 return (2,%badInFRSense5);
2581 }
2582
2583 #should not get here:
2584 print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n";
2585 return (1,%badInFRSense1);
2586 }
2587
2588 sub getOrderedStrands {
2589 my ($strand,$order) = ($_[0],$_[1]);
2590 my @arr;
2591 for my $i (0..scalar(@{$strand})-1) {
2592 push(@arr,$strand->[$order->[$i]-1]);
2593 }
2594 return @arr;
2595 }
2596 #------------------------------------------------------------------------------#
2597 #------------------------------------------------------------------------------#
2598 sub checkClusters {
2599
2600 my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_;
2601 if ($ifRenv eq 'REVERSE_SENSE') {
2602 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
2603 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
2604 }
2605 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
2606 }
2607 #if NORM
2608 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
2609 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
2610 }
2611 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
2612 }
2613
2614 #------------------------------------------------------------------------------#
2615 #------------------------------------------------------------------------------#
2616 sub translateSolidToRF {
2617 my ($strandArr,$ends_orderArr)=@_;
2618 my @array;
2619 for my $i (0..scalar(@{$strandArr})-1) {
2620 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') {
2621 push(@array,'F');
2622 }
2623 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') {
2624 push(@array,'R');
2625 }
2626 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') {
2627 push(@array,'R');
2628 }
2629 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') {
2630 push(@array,'F');
2631 }
2632 }
2633 return @array;
2634 }
2635
2636 #------------------------------------------------------------------------------#
2637 #------------------------------------------------------------------------------#
2638 #convert the links file to the circos format
2639 sub links2segdup{
2640
2641 my($id,$color_code,$links_file,$segdup_file)=@_;
2642
2643 print LOG "# Converting to the circos format...\n";
2644
2645 tie (my %hcolor,'Tie::IxHash'); #color-code hash table
2646 foreach my $col (keys %{$color_code}){
2647 my ($min_links,$max_links)=split(",",$color_code->{$col});
2648 $hcolor{$col}=[$min_links,$max_links];
2649 }
2650
2651 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
2652 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
2653
2654 my $index=1;
2655 while(<LINKS>){
2656
2657 my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6];
2658
2659 my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links
2660
2661 print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output
2662 "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n";
2663 $index++;
2664 }
2665
2666 close LINKS;
2667 close SEGDUP;
2668 print LOG "-- output created: $segdup_file\n";
2669 }
2670 #------------------------------------------------------------------------------#
2671 #------------------------------------------------------------------------------#
2672 #convert the links file to the bedPE format for BEDTools usage
2673 sub links2bedPElinksfile{
2674
2675 my ($sample,$links_file,$bedpe_file)=@_;
2676
2677 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
2678 open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n";
2679
2680 my $nb_links=1;
2681
2682 while(<LINKS>){
2683
2684 chomp;
2685 my @t=split("\t",$_);
2686 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6);
2687 my $type=($chr1 eq $chr2)? "INTRA":"INTER";
2688 $type.="_".$t[10];
2689
2690 $start1--; $start2--;
2691
2692 print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2".
2693 "\t$sample"."_link$nb_links\t$type\t.\t.".
2694 "\t".join("|",@t)."\n";
2695
2696 $nb_links++;
2697 }
2698
2699 close LINKS;
2700 close BEDPE;
2701
2702 }
2703 #------------------------------------------------------------------------------#
2704 #------------------------------------------------------------------------------#
2705 sub bedPElinks2linksfile{
2706
2707 my ($bedpe_file,$links_file)=@_;
2708
2709 open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n";
2710 open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n";
2711
2712 while(<BEDPE>){
2713
2714 chomp;
2715 my $sample=(split("_",(split("\t",$_))[6]))[0];
2716 my @t1=(split("\t",$_))[0,1,2,3,4,5];
2717 my @t2=split(/\|/,(split("\t",$_))[10]);
2718 push(@t2,$sample);
2719
2720 print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n";
2721
2722 }
2723 close BEDPE;
2724 close LINKS;
2725
2726 }
2727 #------------------------------------------------------------------------------#
2728 #------------------------------------------------------------------------------#
2729 #convert the links file to the bed format
2730 sub links2bedfile{
2731
2732 my ($tag_length,$color_code,$links_file,$bed_file)=@_;
2733
2734 print LOG "# Converting to the bed format...\n";
2735
2736 my $compare=1;
2737 if($links_file!~/compared$/){
2738 $compare=0;
2739 $tag_length->{none}->{1}=$tag_length->{1};
2740 $tag_length->{none}->{2}=$tag_length->{2};
2741 }
2742
2743 #color-code hash table
2744 tie (my %hcolor,'Tie::IxHash');
2745 my %color_order;
2746 my $n=1;
2747 foreach my $col (keys %{$color_code}){
2748 my ($min_links,$max_links)=split(",",$color_code->{$col});
2749 $hcolor{$col}=[$min_links,$max_links];
2750 $color_order{$col}=$n;
2751 $n++;
2752 }
2753
2754 my %pair;
2755 my %pt;
2756 $n=1;
2757 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
2758
2759 my %str=( "F"=>"+", "R"=>"-" );
2760
2761 while(<LINKS>){
2762
2763 my @t=split;
2764 my $sample=($compare)? pop(@t):"none";
2765
2766 my $chr1=$t[0];
2767 my $chr2=$t[3];
2768 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
2769 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
2770 my $same_chr=($chr1 eq $chr2)? 1:0;
2771
2772 my $count=$t[6];
2773 my $color=getColor($count,\%hcolor,"bed");
2774
2775 my @pairs=deleteBadOrderSensePairs(split(",",$t[7]));
2776 my @strand1=deleteBadOrderSensePairs(split(",",$t[8]));
2777 my @strand2=deleteBadOrderSensePairs(split(",",$t[9]));
2778 my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10]));
2779 my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11]));
2780 my @position1=deleteBadOrderSensePairs(split(",",$t[14]));
2781 my @position2=deleteBadOrderSensePairs(split(",",$t[15]));
2782 my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample});
2783 my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample});
2784
2785
2786 for my $p (0..$#pairs){
2787
2788 if (!exists $pair{$pairs[$p]}){
2789
2790 if($same_chr){
2791
2792 $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]},
2793 $start1[$p]-1, $end2[$p], $color,
2794 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ];
2795 $pt{$n}=$pair{$pairs[$p]}->{0};
2796 $n++;
2797
2798 }else{
2799
2800 $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]},
2801 $start1[$p]-1, $end1[$p], $color,
2802 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0];
2803 $pt{$n}=$pair{$pairs[$p]}->{1};
2804 $n++;
2805
2806
2807 $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]},
2808 $start2[$p]-1, $end2[$p], $color,
2809 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0];
2810 $pt{$n}=$pair{$pairs[$p]}->{2};
2811 $n++;
2812 }
2813 }else{
2814
2815 if($same_chr){
2816 ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]});
2817 }else{
2818 ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]});
2819 ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]});
2820 }
2821 }
2822 }
2823 }
2824 close LINKS;
2825
2826 my $nb_pairs=$n-1;
2827
2828 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
2829 print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ".
2830 "visibility=2 itemRgb=\"On\"\n";
2831
2832 for my $i (1..$nb_pairs){
2833 print BED join("\t",@{$pt{$i}})."\n";
2834 }
2835
2836 close BED;
2837
2838 print LOG "-- output created: $bed_file\n";
2839
2840 undef %pair;
2841 undef %pt;
2842
2843 }
2844 #------------------------------------------------------------------------------#
2845 #------------------------------------------------------------------------------#
2846 sub deleteBadOrderSensePairs{
2847
2848 my (@tab)=@_;
2849 my @tab2;
2850
2851 foreach my $v (@tab){
2852
2853 $v=~s/[\(\)]//g;
2854 push(@tab2,$v) if($v!~/[\$\*\@]$/);
2855 }
2856 return @tab2;
2857 }
2858 #------------------------------------------------------------------------------#
2859 #------------------------------------------------------------------------------#
2860 sub getAllEntries{
2861 my (@tab)=split (/,/,$_[0]);
2862 my @tab2;
2863
2864 foreach my $v (@tab){
2865
2866 $v=~s/[\(\)]//g;
2867 push(@tab2,$v);
2868 }
2869 return @tab2;
2870 }#------------------------------------------------------------------------------#
2871 #------------------------------------------------------------------------------#
2872 sub getAllEntriesWOspecialChar{
2873 my (@tab)=split (/,/,$_[0]);
2874 my @tab2;
2875
2876 foreach my $v (@tab){
2877
2878 $v=~s/[\(\)\$\*\@]//g;
2879 push(@tab2,$v);
2880 }
2881 return @tab2;
2882 }
2883 #------------------------------------------------------------------------------#
2884 #------------------------------------------------------------------------------#
2885 sub links2SVfile{
2886
2887 my($links_file,$sv_file)=@_;
2888
2889 print LOG "# Converting to the sv output table...\n";
2890 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
2891 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
2892
2893 my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist
2894 chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering
2895 final_score breakpoint1_start1-end1 breakpoint2_start2-end2);
2896
2897 my $nb_links=0;
2898
2899 while (<LINKS>){
2900
2901 my @t=split;
2902 my @sv=();
2903 my $sv_type="-";
2904 my $strand_ratio="-";
2905 my $eq_ratio="-";
2906 my $eq_type="-";
2907 my $insert_ratio="-";
2908 my $link="-";
2909 my ($bk1, $bk2)=("-","-");
2910 my $score="-";
2911
2912 my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]);
2913 my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]);
2914 my $nb_pairs=$t[6];
2915 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
2916 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
2917 my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER";
2918
2919 #if strand filtering
2920 if (defined $t[16]){
2921 #if inter-chr link
2922 $sv_type=$t[16];
2923 if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){
2924 $strand_ratio=floor($1/$2*100)."%";
2925 $score=$t[18];
2926 }
2927 if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){
2928 #if intra-chr link with insert size filtering
2929 $strand_ratio=floor($1/$2*100)."%";
2930 $link=floor($t[17]);
2931 if($sv_type!~/^INV/){
2932 $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
2933 $score=$t[20];
2934 }else{
2935 $score=$t[19];
2936 }
2937 }
2938 }
2939
2940 if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){
2941
2942 #if strand and order filtering only and/or interchr link
2943 $eq_type=$t[18];
2944 $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
2945 ($bk1, $bk2)=($t[20],$t[21]);
2946 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
2947 $score=$t[22];
2948
2949 }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){
2950
2951 #if all three filtering
2952 $link=floor($t[17]);
2953 $eq_type=$t[19];
2954 $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/);
2955
2956 if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){
2957 $insert_ratio=floor($1/$2*100)."%";
2958 ($bk1, $bk2)=($t[22],$t[23]);
2959 $score=$t[24];
2960
2961 }else{
2962 ($bk1, $bk2)=($t[21],$t[22]);
2963 $score=$t[23];
2964 }
2965 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
2966
2967 }
2968
2969
2970 push(@sv, $chr_type, $sv_type,$eq_type);
2971 push(@sv,"$chr1\t$start1-$end1");
2972 push(@sv, $link);
2973 push(@sv,"$chr2\t$start2-$end2",
2974 $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2);
2975
2976
2977 print SV join("\t",@sv)."\n";
2978 }
2979
2980 close LINKS;
2981 close SV;
2982
2983 system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted";
2984
2985 open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n";
2986 my @links=<SV>;
2987 close SV;
2988
2989 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
2990
2991 print SV join("\t",@header)."\n";
2992 print SV @links;
2993 close SV;
2994
2995 unlink($sv_file.".sorted");
2996
2997 print LOG "-- output created: $sv_file\n";
2998
2999 }
3000 #------------------------------------------------------------------------------#
3001 sub densityCalculation{
3002
3003 my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_;
3004
3005 my @sfile=split(/\./,$$mates_file[$file]);
3006 my $fchr=$sfile[$#sfile];
3007
3008 my $fh = new FileHandle;
3009
3010 my %density;
3011 my %density_ref;
3012 my @ratio;
3013 my ($cov,$cov_ref);
3014
3015 #FREQUENCY CALCULATION PROCEDURE
3016 print LOG "# $fchr : Frequency calculation procedure...\n";
3017 &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format);
3018 &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format);
3019
3020 #MAKING RATIO AND OUTPUT
3021 print LOG "\# Ratio calculation procedure...\n";
3022 $density_file=~s/\/mates\//\/density\//;
3023 $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n";
3024
3025 foreach my $k (1..$chr->{nb_chrs}){
3026 foreach my $frag (1..$chr->{$k}->{nb_frag}){
3027
3028 @ratio= ($chr->{$k}->{name},
3029 (${$chr->{$k}->{$frag}}[0]+1),
3030 (${$chr->{$k}->{$frag}}[1]+1));
3031
3032 $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0;
3033 $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0;
3034
3035 push(@ratio,$cov,$cov_ref);
3036 push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref);
3037 push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref);
3038 push(@ratio,log($cov+1)) if($cov && !$cov_ref);
3039 next if(!$cov && !$cov_ref);
3040
3041 print $fh join("\t",@ratio)."\n";
3042 }
3043 }
3044
3045 $fh->close;
3046 print LOG "-- output created: $density_file\n";
3047
3048 undef %density;
3049 undef %density_ref;
3050 }
3051 #------------------------------------------------------------------------------#
3052 #------------------------------------------------------------------------------#
3053 sub FreqCalculation{
3054
3055 my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_;
3056
3057 my @sfile=split(/\./,$mates_file);
3058 my $fchr=$sfile[$#sfile];
3059 my $fh = new FileHandle;
3060
3061 my $nb_windows=0;
3062 my $warn=100000;
3063 my $record=0;
3064 my %pair;
3065
3066 my ($sumX,$sumX2) = (0,0);
3067
3068 print LOG "\# Frequency calculation for $mates_file...\n";
3069
3070 if ($mates_file =~ /.gz$/) {
3071 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
3072 }elsif($mates_file =~ /.bam$/){
3073 o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
3074 }else{
3075 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
3076 }
3077
3078 while(<$fh>){
3079
3080 my @t=split;
3081 my $mate=$t[0];
3082
3083 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,);
3084
3085 next if(exists $pair{$mate});
3086
3087 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
3088
3089 next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2});
3090 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
3091
3092 $pair{$mate}=undef;
3093 $record++;
3094
3095 my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2);
3096
3097 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
3098 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
3099
3100 my $length = abs($coord_start_read1-$coord_start_read2);
3101 $sumX += $length; #add to sum and sum^2 for mean and variance calculation
3102 $sumX2 += $length*$length;
3103
3104 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){
3105
3106 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
3107
3108 &addToDensity($density,$chr_read1,$i,\$nb_windows)
3109 if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1]));
3110
3111 }else{
3112
3113 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
3114 }
3115 }
3116
3117 if($record>=$warn){
3118 print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n";
3119 $warn+=100000;
3120 }
3121 }
3122 $fh->close;
3123
3124 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n";
3125
3126 if($record>0){
3127
3128 my $mu = $sumX/$record;
3129 my $sigma = sqrt($sumX2/$record - $mu*$mu);
3130 print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n";
3131 }
3132
3133 }
3134 #------------------------------------------------------------------------------#
3135 #------------------------------------------------------------------------------#
3136 sub ratio2segdup{
3137
3138 my($id,$density_file,$segdup_file)=@_;
3139
3140 print LOG "# Converting to circos format...\n";
3141
3142 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
3143 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
3144
3145 while(<RATIO>){
3146 chomp;
3147 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
3148 print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n";
3149 }
3150
3151 close RATIO;
3152 close SEGDUP;
3153 print LOG "-- output created: $segdup_file\n";
3154 }
3155 #------------------------------------------------------------------------------#
3156 #------------------------------------------------------------------------------#
3157 sub ratio2bedfile{
3158
3159 my($density_file,$bed_file)=@_;
3160
3161 print LOG "# Converting to bedGraph format...\n";
3162
3163 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
3164 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
3165 print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ".
3166 "visibility=2 color=255,0,0 alwaysZero=\"On\"\n";
3167
3168 while(<RATIO>){
3169 chomp;
3170 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
3171 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/);
3172 print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n";
3173 }
3174
3175 close RATIO;
3176 close BED;
3177 print LOG "-- output created: $bed_file\n";
3178 }
3179 #------------------------------------------------------------------------------#
3180 #------------------------------------------------------------------------------#
3181 sub inverseSense{
3182
3183 my $mate_sense=$_[0];
3184 my %reverse=( 'F' => 'R' , 'R' => 'F' ,
3185 'FF' => 'RR', 'RR' => 'FF',
3186 'FR' => 'RF', 'RF' => 'FR');
3187 return $reverse{$mate_sense};
3188 }
3189
3190 #------------------------------------------------------------------------------#
3191 #------------------------------------------------------------------------------#
3192 sub getNextFrag{
3193
3194 my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_;
3195
3196 my $how_far = $read_start-$frag_start;
3197 my $nb_windows_toskip;
3198
3199 if($how_far>0){
3200
3201 $nb_windows_toskip=($how_far/$step)-($window_dist/$step);
3202 $nb_windows_toskip=~ s/\..*//;
3203 $nb_windows_toskip=0 if($nb_windows_toskip<0);
3204 return ($frag_num + $nb_windows_toskip);
3205 }
3206 return $frag_last;
3207 }
3208 #------------------------------------------------------------------------------#
3209 #------------------------------------------------------------------------------#
3210 sub getColor{
3211
3212 my($count,$hcolor,$format)=@_;
3213 for my $col ( keys % { $hcolor} ) {
3214 return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]);
3215 }
3216 return "white" if($format eq "circos");
3217 return "255,255,255" if($format eq "bed");
3218 }
3219 #------------------------------------------------------------------------------#
3220 #------------------------------------------------------------------------------#
3221 sub recupCoords{
3222
3223 my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_;
3224 my $strand = 'F';
3225
3226 if ($c_hit=~s/^\-//) {
3227 $strand='R';
3228 $$cs_hit=$c_hit;
3229 $$ce_hit=$c_hit-($tag_length-1);
3230 }else{
3231 $$cs_hit=$c_hit;
3232 $$ce_hit=$c_hit+($tag_length-1);
3233 }
3234 return $strand;
3235
3236 }
3237 #------------------------------------------------------------------------------#
3238 #------------------------------------------------------------------------------#
3239 sub overlap {
3240 my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_;
3241 if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) {
3242 return 0;
3243 }
3244 return 1;
3245 }
3246 #------------------------------------------------------------------------------#
3247 #------------------------------------------------------------------------------#
3248 sub makeLink {
3249
3250 my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_;
3251
3252 if($chr1>$chr2){
3253 ($chr1,$chr2)= ($chr2,$chr1);
3254 ($frag1,$frag2)= ($frag2,$frag1);
3255 }
3256
3257 if($chr1 == $chr2){
3258 if($frag1>$frag2){
3259 ($frag1,$frag2)= ($frag2,$frag1);
3260 }
3261 }
3262
3263 if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){
3264 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt;
3265 $$nb++;
3266 }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){
3267 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt";
3268 }
3269 }
3270 #------------------------------------------------------------------------------#
3271 #------------------------------------------------------------------------------#
3272 #fonction of adding the read to the density profile
3273 sub addToDensity {
3274
3275 my ($density,$chr1,$frag1,$nb)=@_;
3276
3277 if(!exists $density->{$chr1}->{$frag1}->{count}){
3278 $density->{$chr1}->{$frag1}->{count}=1;
3279 $$nb++;
3280 }else{
3281 $density->{$chr1}->{$frag1}->{count}++;
3282 }
3283 }
3284 #------------------------------------------------------------------------------#
3285 #------------------------------------------------------------------------------#
3286 sub floor {
3287 my $nb = $_[0];
3288 $nb=~ s/\..*//;
3289 return $nb;
3290 }
3291 #------------------------------------------------------------------------------#
3292 sub decimal{
3293
3294 my $num=shift;
3295 my $digs_to_cut=shift;
3296
3297 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
3298
3299 return $num;
3300 }
3301
3302 #------------------------------------------------------------------------------#
3303 sub max {
3304
3305 my($max) = shift(@_);
3306 foreach my $temp (@_) {
3307 $max = $temp if $temp > $max;
3308 }
3309 return($max);
3310 }
3311 #------------------------------------------------------------------------------#
3312 #------------------------------------------------------------------------------#
3313 sub min {
3314
3315 my($min) = shift(@_);
3316 foreach my $temp (@_) {
3317 $min = $temp if $temp < $min;
3318 }
3319 return($min);
3320 }
3321 #------------------------------------------------------------------------------#
3322 #------------------------------------------------------------------------------#
3323 sub sortTablebyIndex{
3324 my ($tab1,$tab2)=@_;
3325 my @tab3;
3326
3327 foreach my $i (@$tab1){
3328 $tab3[$i]=$$tab2[$$tab1[$i]];
3329 }
3330 return @tab3;
3331 }
3332 #------------------------------------------------------------------------------#
3333 #------------------------------------------------------------------------------#
3334 sub round {
3335 my $number = shift || 0;
3336 my $dec = 10 ** (shift || 0);
3337 return int( $dec * $number + .5 * ($number <=> 0)) / $dec;
3338 }
3339 #------------------------------------------------------------------------------#
3340 #------------------------------------------------------------------------------#
3341 sub getUniqueTable{
3342
3343 my (@tab)=@_;
3344 my (%saw,@out)=();
3345 undef %saw;
3346 return sort(grep(!$saw{$_}++, @tab));
3347 }
3348 #------------------------------------------------------------------------------#
3349 #------------------------------------------------------------------------------#
3350 sub catFiles {
3351
3352 unlink("$_[1]") if(exists $_[1]);
3353 system qq( cat "$_" >> "$_[1]" ) for @{$_[0]};
3354 }
3355 #------------------------------------------------------------------------------#
3356 #------------------------------------------------------------------------------#
3357 #check if the configuration file is correct
3358 sub validateconfiguration{
3359
3360 my %conf=%{$_[0]};
3361 my $list_prgs="@ARGV";
3362
3363 my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file);
3364 my @detection_params=qw(split_mate_file window_size step_length split_mate_file);
3365 my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file);
3366 my @circos_params=qw(organism_id colorcode);
3367 my @bed_params=qw(colorcode);
3368 my @compare_params=qw(list_samples file_suffix);
3369
3370 foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){
3371
3372 unless (defined($dir)) {
3373 $dir = ".";
3374 }
3375 unless (-d $dir){
3376 mkdir $dir or die;
3377 }
3378 $dir.="/" if($dir!~/\/$/);
3379 }
3380
3381 unless (defined($conf{general}{num_threads})) {
3382 $conf{general}{num_threads} = 1;
3383 }
3384 $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24);
3385
3386 if($list_prgs!~/links2compare/){
3387
3388 foreach my $p (@general_params){
3389 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p});
3390 }
3391
3392 $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam");
3393
3394 unless (defined($conf{general}{sv_type})) {
3395 $conf{general}{sv_type} = "all";
3396 }
3397
3398 $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}};
3399 }
3400
3401 if($list_prgs=~/(linking|cnv)/){
3402
3403 foreach my $p (@detection_params){
3404 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p});
3405 }
3406
3407 die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref});
3408
3409 if($conf{detection}{step_length}>$conf{detection}{window_size}){
3410 die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n");
3411 }
3412
3413 unless (-d $conf{general}{tmp_dir}."/mates"){
3414 mkdir $conf{general}{tmp_dir}."/mates" or die;
3415 }
3416
3417 if($list_prgs=~/linking/){
3418 unless (-d $conf{general}{tmp_dir}."/links"){
3419 mkdir $conf{general}{tmp_dir}."/links" or die;
3420 }
3421 }
3422 if($list_prgs=~/cnv/){
3423 unless (-d $conf{general}{tmp_dir}."/density"){
3424 mkdir $conf{general}{tmp_dir}."/density" or die;
3425 }
3426 }
3427
3428 }
3429
3430 if($list_prgs=~/filtering/){
3431
3432 foreach my $p (@filtering_params) {
3433 die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p});
3434
3435 }
3436
3437 if(defined($conf{filtering}{chromosomes})) {
3438 my @chrs=split(",",$conf{filtering}{chromosomes});
3439 my $exclude=($chrs[0]=~/^\-/)? 1:0;
3440 for my $chrName (@chrs){
3441
3442 die("Error Config : The filtering parameter \"chromosomes\" is not valid\n")
3443 if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude));
3444
3445 }
3446 }
3447
3448 if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) {
3449 die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected".
3450 "\nChange strand_filtering to \"1\" if you want to use the order filtering\n");
3451 }
3452 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) {
3453 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n");
3454 }
3455 if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) {
3456 die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected".
3457 "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n");
3458 }
3459 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) {
3460 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n");
3461 }
3462
3463 if (!defined($conf{filtering}{indel_sigma_threshold})) {
3464 $conf{filtering}{indel_sigma_threshold} = 2;
3465 }
3466 if (!defined($conf{filtering}{dup_sigma_threshold})) {
3467 $conf{filtering}{dup_sigma_threshold} = 2;
3468 }
3469 if (!defined($conf{filtering}{singleton_sigma_threshold})) {
3470 $conf{filtering}{singleton_sigma_threshold} = 4;
3471 }
3472
3473 if (!defined($conf{filtering}{nb_pairs_order_threshold})) {
3474 $conf{filtering}{nb_pairs_order_threshold} = 1;
3475 }
3476
3477 if (!defined($conf{filtering}{final_score_threshold})) {
3478 $conf{filtering}{final_score_threshold} = 0.8;
3479 }
3480
3481 if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) {
3482 die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n");
3483 }
3484
3485 }
3486
3487 if($list_prgs=~/2circos$/){
3488 foreach my $p (@circos_params) {
3489 next if($list_prgs=~/^ratio/ && $p eq "colorcode");
3490 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
3491 }
3492 }
3493
3494 if($list_prgs=~/2bed$/){
3495 foreach my $p (@bed_params) {
3496 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
3497 }
3498 }
3499
3500 if($list_prgs=~/links2compare/){
3501 foreach my $p (@compare_params) {
3502 die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p});
3503 }
3504
3505 unless (defined($conf{compare}{same_sv_type})) {
3506 $conf{compare}{same_sv_type} = 0;
3507 }
3508
3509 unless (defined($conf{compare}{min_overlap})) {
3510 $conf{compare}{min_overlap} = 1E-9;
3511 }
3512
3513 if($conf{compare}{circos_output}){
3514 foreach my $p (@circos_params) {
3515 next if($list_prgs=~/^ratio/ && $p eq "colorcode");
3516 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
3517 }
3518 }
3519 if($conf{compare}{bed_output}){
3520 foreach my $p (@bed_params) {
3521 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
3522 }
3523 die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths});
3524
3525 my @samples=split(",",$conf{compare}{list_samples});
3526 my @read_lengths=split(",",$conf{compare}{list_read_lengths});
3527 for my $i (0..$#samples){
3528 my @l=split("-",$read_lengths[$i]);
3529 $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]};
3530 }
3531 }
3532 }
3533
3534
3535 }
3536 #------------------------------------------------------------------------------#
3537 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#