5
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 =pod
|
|
4
|
|
5 =head1 NAME
|
|
6
|
|
7 SVDetect - Program designed to the detection of structural variations
|
|
8 from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads
|
|
9
|
|
10 Version: 0.8 for Galaxy
|
|
11
|
|
12 =head1 SYNOPSIS
|
|
13
|
|
14 SVDetect <command> -conf <configuration_file> [-help] [-man]
|
|
15
|
|
16 Command:
|
|
17
|
|
18 linking detection and isolation of links
|
|
19 filtering filtering of links according different parameters
|
|
20 links2circos links conversion to circos format
|
|
21 links2bed paired-ends of links converted to bed format (UCSC)
|
|
22 links2SV formatted output to show most significant SVs
|
|
23 cnv calculate copy-number profiles
|
|
24 ratio2circos ratio conversion to circos density format
|
|
25 ratio2bedgraph ratio conversion to bedGraph density format (UCSC)
|
|
26
|
|
27 =head1 DESCRIPTION
|
|
28
|
|
29 This is a command-line interface to SVDetect.
|
|
30
|
|
31
|
|
32 =head1 AUTHORS
|
|
33
|
|
34 Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>,
|
|
35 Valentina Boeva E<lt>valentina.boeva@curie.frE<gt>
|
|
36
|
|
37 =cut
|
|
38
|
|
39 # -------------------------------------------------------------------
|
|
40
|
|
41 use strict;
|
|
42 use warnings;
|
|
43
|
|
44 use Pod::Usage;
|
|
45 use Getopt::Long;
|
|
46
|
|
47 use Config::General;
|
|
48 use Tie::IxHash;
|
|
49 use FileHandle;
|
|
50 use Parallel::ForkManager;
|
|
51
|
|
52 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
53 #PARSE THE COMMAND LINE
|
|
54 my %OPT;
|
|
55 GetOptions(\%OPT,
|
|
56 'conf=s',
|
|
57 'out1=s', #GALAXY
|
|
58 'out2=s', #GALAXY
|
|
59 'out3=s', #GALAXY
|
|
60 'out4=s', #GALAXY
|
|
61 'out5=s', #GALAXY
|
|
62 'l=s', #GALAXY
|
|
63 'N=s',#GALAXY
|
|
64 'help',#GALAXY
|
|
65 'man'
|
|
66 );
|
|
67
|
|
68 pod2usage() if $OPT{help};
|
|
69 pod2usage(-verbose=>2) if $OPT{man};
|
|
70 pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf});
|
|
71
|
|
72 pod2usage() if(@ARGV<1);
|
|
73
|
|
74 tie (my %func, 'Tie::IxHash',linking=>\&createlinks,
|
|
75 filtering=>\&filterlinks,
|
|
76 links2circos=>\&links2circos,
|
|
77 links2bed=>\&links2bed,
|
|
78 links2compare=>\&links2compare,
|
|
79 links2SV=>\&links2SV,
|
|
80 cnv=>\&cnv,
|
|
81 ratio2circos=>\&ratio2circos,
|
|
82 ratio2bedgraph=>\&ratio2bedgraph);
|
|
83
|
|
84 foreach my $command (@ARGV){
|
|
85 pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command}));
|
|
86 }
|
|
87 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
88
|
|
89
|
|
90 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
91 #READ THE CONFIGURATION FILE
|
|
92 my $conf=Config::General->new( -ConfigFile => $OPT{conf},
|
|
93 -Tie => "Tie::IxHash",
|
|
94 -AllowMultiOptions => 1,
|
|
95 -LowerCaseNames => 1,
|
|
96 -AutoTrue => 1);
|
|
97 my %CONF= $conf->getall;
|
|
98 validateconfiguration(\%CONF); #validation of the configuration parameters
|
|
99
|
|
100 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY
|
|
101
|
|
102 my $pt_log_file=$OPT{l}; #GALAXY
|
|
103 my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY
|
|
104 my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY
|
|
105 my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY
|
|
106 my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY
|
|
107 my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY
|
|
108
|
|
109 $CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY
|
|
110 $CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY
|
|
111
|
|
112 my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY
|
|
113 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY
|
|
114 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
115
|
|
116 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
117 #COMMAND EXECUTION
|
|
118 foreach my $command (@ARGV){
|
|
119 &{$func{$command}}();
|
|
120 }
|
|
121 print LOG "-- end\n";#GALAXY
|
|
122
|
|
123 close LOG;#GALAXY
|
|
124 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
|
|
125 exit(0);
|
|
126 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
127
|
|
128
|
|
129 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|
|
130 #FUNCTIONS
|
|
131 #------------------------------------------------------------------------------#
|
|
132 #MAIN FUNCTION number 1: Detection of links from mate-pairs data
|
|
133 sub createlinks{
|
|
134
|
|
135 my %CHR; #main hash table 1: fragments, links
|
|
136 my %CHRID;
|
|
137 my @MATEFILES;
|
|
138
|
|
139 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
|
|
140 my @path=split(/\//,$output_prefix);
|
|
141 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
|
|
142 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
|
|
143 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
|
|
144
|
|
145 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
|
|
146 $CONF{detection}{window_size},
|
|
147 $CONF{detection}{step_length},
|
|
148 $CONF{general}{cmap_file});
|
|
149
|
|
150 if($CONF{detection}{split_mate_file}){
|
|
151
|
|
152 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
|
|
153 $CONF{general}{sv_type},
|
|
154 $CONF{general}{mates_file},
|
|
155 $CONF{general}{input_format},
|
|
156 $CONF{general}{read_lengths}
|
|
157 );
|
|
158 }else{
|
|
159
|
|
160 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!";
|
|
161 chomp(@MATEFILES);
|
|
162 print LOG "# Splitted mate files already created.\n";
|
|
163 }
|
|
164
|
|
165
|
|
166 #Parallelization of the linking per chromosome for intra + interchrs
|
|
167 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
|
|
168
|
|
169 foreach my $matefile (@MATEFILES){
|
|
170
|
|
171 my $pid = $pm->start and next;
|
|
172 getlinks(\%CHR, \%CHRID, $matefile);
|
|
173 $pm->finish;
|
|
174
|
|
175 }
|
|
176 $pm->wait_all_children;
|
|
177
|
|
178 #Merge the chromosome links file into only one
|
|
179 my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!";
|
|
180 chomp(@LINKFILES);
|
|
181 catFiles( \@LINKFILES => "$output_prefix.links" );
|
|
182
|
|
183 system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY
|
|
184 print LOG "# Linking end procedure : output created: $output_prefix.links\n";
|
|
185 #unlink(@LINKFILES);
|
|
186 #unlink(@MATEFILES);
|
|
187
|
|
188 undef %CHR;
|
|
189 undef %CHRID;
|
|
190
|
|
191 }
|
|
192 #------------------------------------------------------------------------------#
|
|
193 #------------------------------------------------------------------------------#
|
|
194 sub getlinks {
|
|
195
|
|
196 my ($chr,$chrID,$tmp_mates_prefix)=@_;
|
|
197
|
|
198 my $tmp_links_prefix=$tmp_mates_prefix;
|
|
199 $tmp_links_prefix=~s/\/mates\//\/links\//;
|
|
200
|
|
201 my %PAIR; #main hash table 2: pairs
|
|
202
|
|
203 linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs
|
|
204 $CONF{general}{read_lengths},
|
|
205 $CONF{detection}{window_size},
|
|
206 $CONF{detection}{step_length},
|
|
207 $tmp_mates_prefix,
|
|
208 $CONF{general}{input_format},
|
|
209 $CONF{general}{sv_type},
|
|
210 "$tmp_links_prefix.links.mapped"
|
|
211 );
|
|
212
|
|
213 getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons
|
|
214 "$tmp_links_prefix.links.unique");
|
|
215
|
|
216 defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links
|
|
217 $CONF{general}{input_format},
|
|
218 $CONF{general}{sv_type},
|
|
219 $CONF{general}{read_lengths},
|
|
220 "$tmp_links_prefix.links.unique",
|
|
221 "$tmp_links_prefix.links.unique_defined");
|
|
222
|
|
223 sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates
|
|
224 "$tmp_links_prefix.links.sorted");
|
|
225
|
|
226 removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links
|
|
227 "$tmp_links_prefix.links",1); #file output
|
|
228
|
|
229
|
|
230 undef %PAIR;
|
|
231
|
|
232 unlink("$tmp_links_prefix.links.mapped",
|
|
233 "$tmp_links_prefix.links.unique",
|
|
234 "$tmp_links_prefix.links.unique_defined",
|
|
235 "$tmp_links_prefix.links.sorted");
|
|
236 }
|
|
237 #------------------------------------------------------------------------------#
|
|
238 #------------------------------------------------------------------------------#
|
|
239 sub splitMateFile{
|
|
240
|
|
241 my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_;
|
|
242
|
|
243 print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n";
|
|
244
|
|
245 my %filesHandle;
|
|
246
|
|
247 #fichier matefile inter
|
|
248 if($sv_type=~/^(all|inter)$/){
|
|
249 my $newFileName="$output_prefix.interchrs";
|
|
250 push(@{$files_list},$newFileName);
|
|
251 my $fh = new FileHandle;
|
|
252 $fh->open(">$newFileName");
|
|
253 $filesHandle{inter}=$fh;
|
|
254 }
|
|
255
|
|
256 #fichiers matefiles intra
|
|
257 if($sv_type=~/^(all|intra)$/){
|
|
258 foreach my $k (1..$chr->{nb_chrs}){
|
|
259 my $newFileName=$output_prefix.".".$chr->{$k}->{name};
|
|
260 push(@{$files_list},$newFileName);
|
|
261 my $fh = new FileHandle;
|
|
262 $fh->open(">$newFileName");
|
|
263 $filesHandle{$k}=$fh;
|
|
264 }
|
|
265 }
|
|
266
|
|
267 if ($mates_file =~ /.gz$/) {
|
|
268 open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
|
|
269 }elsif($mates_file =~ /.bam$/){
|
|
270 open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
|
|
271 }else{
|
|
272 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
|
|
273 }
|
|
274
|
|
275
|
|
276 while(<MATES>){
|
|
277
|
|
278 my @t=split;
|
|
279 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
|
|
280
|
|
281 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
|
|
282
|
|
283 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
|
|
284
|
|
285 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
|
|
286
|
|
287 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
|
|
288 my $fh2print=$filesHandle{inter};
|
|
289 print $fh2print join("\t",@t)."\n";
|
|
290 }
|
|
291
|
|
292 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
|
|
293 my $fh2print=$filesHandle{$chr_read1};
|
|
294 print $fh2print join("\t",@t)."\n";
|
|
295
|
|
296 }
|
|
297 }
|
|
298
|
|
299 foreach my $name (keys %filesHandle){
|
|
300 my $fh=$filesHandle{$name};
|
|
301 $fh->close;
|
|
302 }
|
|
303
|
|
304 print LOG "# Splitted mate files of \"$mates_file\" created.\n";
|
|
305 }
|
|
306
|
|
307
|
|
308 #------------------------------------------------------------------------------#
|
|
309 #------------------------------------------------------------------------------#
|
|
310 sub filterlinks{
|
|
311
|
|
312 my %CHR;
|
|
313 my %CHRID;
|
|
314 my @LINKFILES;
|
|
315 my @FLINKFILES;
|
|
316
|
|
317 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
|
|
318 my @path=split(/\//,$output_prefix);
|
|
319 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
|
|
320 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
|
|
321
|
|
322 createChrHashTables(\%CHR,\%CHRID,
|
|
323 $CONF{general}{cmap_file});
|
|
324
|
|
325 if($CONF{filtering}{split_link_file}){
|
|
326
|
|
327 splitLinkFile(\%CHR, \%CHRID, \@LINKFILES,
|
|
328 $tmp_links_prefix,
|
|
329 $CONF{general}{sv_type},
|
|
330 "$output_prefix.links",
|
|
331 );
|
|
332 }else{
|
|
333
|
|
334 @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n";
|
|
335 chomp(@LINKFILES);
|
|
336 print LOG "# Splitted link files already created.\n";
|
|
337 }
|
|
338
|
|
339 #Parallelization of the filtering per chromosome for intra + interchrs
|
|
340 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
|
|
341
|
|
342 foreach my $linkfile (@LINKFILES){
|
|
343
|
|
344 my $pid = $pm->start and next;
|
|
345 getFilteredlinks(\%CHR, \%CHRID, $linkfile);
|
|
346 $pm->finish;
|
|
347
|
|
348 }
|
|
349 $pm->wait_all_children;
|
|
350
|
|
351 #Merge the chromosome links file into only one
|
|
352 @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n";
|
|
353 chomp(@FLINKFILES);
|
|
354 catFiles( \@FLINKFILES => "$output_prefix.links.filtered" );
|
|
355
|
|
356 system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY
|
|
357 print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n";
|
|
358
|
|
359 undef %CHR;
|
|
360 undef %CHRID;
|
|
361
|
|
362 }
|
|
363 #------------------------------------------------------------------------------#
|
|
364 #------------------------------------------------------------------------------#
|
|
365 sub splitLinkFile{
|
|
366
|
|
367 my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_;
|
|
368
|
|
369 print LOG "# Splitting the link file for parallel processing...\n";
|
|
370
|
|
371 my %filesHandle;
|
|
372
|
|
373 #fichier matefile inter
|
|
374 if($sv_type=~/^(all|inter)$/){
|
|
375 my $newFileName="$input_prefix.interchrs.links";
|
|
376 push(@{$files_list},$newFileName);
|
|
377 my $fh = new FileHandle;
|
|
378 $fh->open(">$newFileName");
|
|
379 $filesHandle{inter}=$fh;
|
|
380 }
|
|
381
|
|
382 #fichiers matefiles intra
|
|
383 if($sv_type=~/^(all|intra)$/){
|
|
384 foreach my $k (1..$chr->{nb_chrs}){
|
|
385 my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links";
|
|
386 push(@{$files_list},$newFileName);
|
|
387 my $fh = new FileHandle;
|
|
388 $fh->open(">$newFileName");
|
|
389 $filesHandle{$k}=$fh;
|
|
390 }
|
|
391 }
|
|
392
|
|
393 open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n";
|
|
394 while(<LINKS>){
|
|
395
|
|
396 my @t=split;
|
|
397 my ($chr_read1,$chr_read2)=($t[0],$t[3]);
|
|
398
|
|
399 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
|
|
400
|
|
401 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
|
|
402
|
|
403 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
|
|
404 my $fh2print=$filesHandle{inter};
|
|
405 print $fh2print join("\t",@t)."\n";
|
|
406 }
|
|
407
|
|
408 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
|
|
409 my $fh2print=$filesHandle{$chr_read1};
|
|
410 print $fh2print join("\t",@t)."\n";
|
|
411
|
|
412 }
|
|
413 }
|
|
414
|
|
415 foreach my $name (keys %filesHandle){
|
|
416 my $fh=$filesHandle{$name};
|
|
417 $fh->close;
|
|
418 }
|
|
419
|
|
420 print LOG "# Splitted link files created.\n";
|
|
421 }
|
|
422
|
|
423
|
|
424 #------------------------------------------------------------------------------#
|
|
425 #------------------------------------------------------------------------------#
|
|
426 #MAIN FUNCTION number 2: Filtering processing
|
|
427 sub getFilteredlinks {
|
|
428
|
|
429 my ($chr,$chrID,$tmp_links_prefix)=@_;
|
|
430 my %PAIR;
|
|
431
|
|
432 strandFiltering($chr,$chrID,
|
|
433 $CONF{filtering}{nb_pairs_threshold}, #filtering of links
|
|
434 $CONF{filtering}{strand_filtering},
|
|
435 $CONF{filtering}{chromosomes},
|
|
436 $CONF{general}{input_format},
|
|
437 $CONF{general}{cmap_file},
|
|
438 $CONF{general}{mates_orientation},
|
|
439 $CONF{general}{read_lengths},
|
|
440 $tmp_links_prefix,
|
|
441 "$tmp_links_prefix.filtered",
|
|
442 );
|
|
443
|
|
444 if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering
|
|
445
|
|
446 my @tmpfiles;
|
|
447
|
|
448 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique");
|
|
449
|
|
450 getUniqueLinks("$tmp_links_prefix.filtered_unique",
|
|
451 "$tmp_links_prefix.filtered");
|
|
452
|
|
453 push(@tmpfiles,"$tmp_links_prefix.filtered_unique");
|
|
454
|
|
455 if($CONF{filtering}{order_filtering}){ #filtering using the order
|
|
456
|
|
457 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered");
|
|
458
|
|
459 orderFiltering($chr,$chrID,
|
|
460 $CONF{filtering}{nb_pairs_threshold},
|
|
461 $CONF{filtering}{nb_pairs_order_threshold},
|
|
462 $CONF{filtering}{mu_length},
|
|
463 $CONF{filtering}{sigma_length},
|
|
464 $CONF{general}{mates_orientation},
|
|
465 $CONF{general}{read_lengths},
|
|
466 "$tmp_links_prefix.filtered_ordered",
|
|
467 "$tmp_links_prefix.filtered",
|
|
468 );
|
|
469
|
|
470 push(@tmpfiles,"$tmp_links_prefix.filtered_ordered");
|
|
471 }
|
|
472
|
|
473 if (($CONF{filtering}{insert_size_filtering})&&
|
|
474 ($CONF{general}{sv_type} ne 'inter')){
|
|
475
|
|
476 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize");
|
|
477
|
|
478 addInsertionInfo($chr,$chrID,
|
|
479 $CONF{filtering}{nb_pairs_threshold},
|
|
480 $CONF{filtering}{order_filtering},
|
|
481 $CONF{filtering}{indel_sigma_threshold},
|
|
482 $CONF{filtering}{dup_sigma_threshold},
|
|
483 $CONF{filtering}{singleton_sigma_threshold},
|
|
484 $CONF{filtering}{mu_length},
|
|
485 $CONF{filtering}{sigma_length},
|
|
486 $CONF{general}{mates_orientation},
|
|
487 $CONF{general}{read_lengths},
|
|
488 "$tmp_links_prefix.filtered_withoutIndelSize",
|
|
489 "$tmp_links_prefix.filtered"
|
|
490 );
|
|
491
|
|
492 push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize");
|
|
493 }
|
|
494
|
|
495 sortLinks("$tmp_links_prefix.filtered",
|
|
496 "$tmp_links_prefix.filtered_sorted");
|
|
497
|
|
498 removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted",
|
|
499 "$tmp_links_prefix.filtered_nodup",
|
|
500 );
|
|
501
|
|
502 postFiltering("$tmp_links_prefix.filtered_nodup",
|
|
503 "$tmp_links_prefix.filtered",
|
|
504 $CONF{filtering}{final_score_threshold});
|
|
505
|
|
506 push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup");
|
|
507
|
|
508 unlink(@tmpfiles);
|
|
509
|
|
510
|
|
511 }
|
|
512 undef %PAIR;
|
|
513
|
|
514 }
|
|
515 #------------------------------------------------------------------------------#
|
|
516 #------------------------------------------------------------------------------#
|
|
517 #MAIN FUNCTION number 3: Circos format conversion for links
|
|
518 sub links2circos{
|
|
519
|
|
520 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
|
|
521 my @path=split(/\//,$input_file);
|
|
522 $input_file=$CONF{general}{output_dir}.$path[$#path];
|
|
523
|
|
524 my $output_file.=$input_file.".segdup.txt";
|
|
525
|
|
526 links2segdup($CONF{circos}{organism_id},
|
|
527 $CONF{circos}{colorcode},
|
|
528 $input_file,
|
|
529 $output_file); #circos file output
|
|
530
|
|
531 system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY
|
|
532 }
|
|
533 #------------------------------------------------------------------------------#
|
|
534 #------------------------------------------------------------------------------#
|
|
535 #MAIN FUNCTION number 4: Bed format conversion for links
|
|
536 sub links2bed{
|
|
537
|
|
538 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
|
|
539 my @path=split(/\//,$input_file);
|
|
540 $input_file=$CONF{general}{output_dir}.$path[$#path];
|
|
541
|
|
542 my $output_file.=$input_file.".bed.txt";
|
|
543
|
|
544 links2bedfile($CONF{general}{read_lengths},
|
|
545 $CONF{bed}{colorcode},
|
|
546 $input_file,
|
|
547 $output_file); #bed file output
|
|
548
|
|
549 system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY
|
|
550
|
|
551 }
|
|
552 #------------------------------------------------------------------------------#
|
|
553 #------------------------------------------------------------------------------#
|
|
554 #MAIN FUNCTION number 6: Bed format conversion for links
|
|
555 sub links2SV{
|
|
556
|
|
557 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
|
|
558
|
|
559 my @path=split(/\//,$input_file);
|
|
560 $input_file=$CONF{general}{output_dir}.$path[$#path];
|
|
561
|
|
562 my $output_file.=$input_file.".sv.txt";
|
|
563
|
|
564
|
|
565 links2SVfile( $input_file,
|
|
566 $output_file);
|
|
567
|
|
568 system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY
|
|
569 }
|
|
570 #------------------------------------------------------------------------------#
|
|
571 #------------------------------------------------------------------------------#
|
|
572 #MAIN FUNCTION number 7: copy number variations, coverage ratio calculation
|
|
573 sub cnv{
|
|
574
|
|
575 my %CHR;
|
|
576 my %CHRID;
|
|
577 my @MATEFILES;
|
|
578 my @MATEFILES_REF;
|
|
579
|
|
580 my $output_prefix=$CONF{general}{mates_file};
|
|
581 my $output_prefix_ref=$CONF{detection}{mates_file_ref};
|
|
582 my @path=split(/\//,$output_prefix);
|
|
583 my @path_ref=split(/\//,$output_prefix_ref);
|
|
584 $output_prefix=$CONF{general}{output_dir}.$path[$#path];
|
|
585 $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref];
|
|
586 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
|
|
587 my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref];
|
|
588 my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path];
|
|
589
|
|
590 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
|
|
591 $CONF{detection}{window_size},
|
|
592 $CONF{detection}{step_length},
|
|
593 $CONF{general}{cmap_file});
|
|
594
|
|
595 if($CONF{detection}{split_mate_file}){
|
|
596
|
|
597 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
|
|
598 "intra",
|
|
599 $CONF{general}{mates_file},
|
|
600 $CONF{general}{input_format},
|
|
601 $CONF{general}{read_lengths}
|
|
602 );
|
|
603
|
|
604 splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref,
|
|
605 "intra",
|
|
606 $CONF{detection}{mates_file_ref},
|
|
607 $CONF{general}{input_format},
|
|
608 $CONF{general}{read_lengths}
|
|
609 );
|
|
610
|
|
611
|
|
612 }else{
|
|
613
|
|
614 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!";
|
|
615 chomp(@MATEFILES);
|
|
616 @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!";
|
|
617 chomp(@MATEFILES_REF);
|
|
618 print LOG "# Splitted sample and reference mate files already created.\n";
|
|
619 }
|
|
620
|
|
621 #Parallelization of the cnv per chromosome
|
|
622 my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
|
|
623
|
|
624 foreach my $file (0..$#MATEFILES){
|
|
625
|
|
626 my $pid = $pm->start and next;
|
|
627
|
|
628 densityCalculation(\%CHR, \%CHRID, $file,
|
|
629 $CONF{general}{read_lengths},
|
|
630 $CONF{detection}{window_size},
|
|
631 $CONF{detection}{step_length},
|
|
632 \@MATEFILES,
|
|
633 \@MATEFILES_REF,
|
|
634 $MATEFILES[$file].".density",
|
|
635 $CONF{general}{input_format});
|
|
636
|
|
637 $pm->finish;
|
|
638
|
|
639 }
|
|
640 $pm->wait_all_children;
|
|
641
|
|
642 #Merge the chromosome links file into only one
|
|
643 my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!";
|
|
644 chomp(@DENSITYFILES);
|
|
645 catFiles( \@DENSITYFILES => "$output_prefix.density" );
|
|
646
|
|
647 print LOG "# cnv end procedure : output created: $output_prefix.density\n";
|
|
648
|
|
649
|
|
650 undef %CHR;
|
|
651 undef %CHRID;
|
|
652
|
|
653 }
|
|
654 #------------------------------------------------------------------------------#
|
|
655 #------------------------------------------------------------------------------#
|
|
656 #MAIN FUNCTION number 8: Circos format conversion for cnv ratios
|
|
657 sub ratio2circos{
|
|
658
|
|
659 my $input_file=$CONF{general}{mates_file}.".density";
|
|
660
|
|
661 my @path=split(/\//,$input_file);
|
|
662 $input_file=$CONF{general}{output_dir}.$path[$#path];
|
|
663
|
|
664 my $output_file.=$input_file.".segdup.txt";
|
|
665
|
|
666 ratio2segdup($CONF{circos}{organism_id},
|
|
667 $input_file,
|
|
668 $output_file);
|
|
669 }
|
|
670 #------------------------------------------------------------------------------#
|
|
671 #MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios
|
|
672 sub ratio2bedgraph{
|
|
673
|
|
674 my $input_file=$CONF{general}{mates_file}.".density";
|
|
675
|
|
676 my @path=split(/\//,$input_file);
|
|
677 $input_file=$CONF{general}{output_dir}.$path[$#path];
|
|
678
|
|
679 my $output_file.=$input_file.".bedgraph.txt";
|
|
680
|
|
681 ratio2bedfile($input_file,
|
|
682 $output_file); #bed file output
|
|
683 }
|
|
684 #------------------------------------------------------------------------------#
|
|
685 #------------------------------------------------------------------------------#
|
|
686 #Creation of the fragment library
|
|
687 sub shearingChromosome{
|
|
688
|
|
689 print LOG "# Making the fragments library...\n";
|
|
690
|
|
691 my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters
|
|
692
|
|
693 createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name
|
|
694
|
|
695 foreach my $k (1..$chr->{nb_chrs}){
|
|
696
|
|
697 print LOG"-- $chr->{$k}->{name}\n";
|
|
698
|
|
699 my $frag=1;
|
|
700 for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){
|
|
701
|
|
702 my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1;
|
|
703 $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage
|
|
704
|
|
705 if($end==($chr->{$k}->{length})-1){
|
|
706 $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome
|
|
707 last;
|
|
708 }
|
|
709 $frag++;
|
|
710 }
|
|
711 }
|
|
712 }
|
|
713 #------------------------------------------------------------------------------#
|
|
714 #------------------------------------------------------------------------------#
|
|
715 #Creation of chromosome hash tables from the cmap file
|
|
716 sub createChrHashTables{
|
|
717
|
|
718 my ($chr,$chrID,$cmap_file)=@_;
|
|
719 $chr->{nb_chrs}=0;
|
|
720
|
|
721 open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n";
|
|
722 while(<CMAP>){
|
|
723
|
|
724 if(/^\s+$/){ next;}
|
|
725 my ($k,$name,$length) = split;
|
|
726 $chr->{$k}->{name}=$name;
|
|
727 $chr->{$k}->{length}=$length;
|
|
728 $chrID->{$name}=$k;
|
|
729 $chr->{nb_chrs}++;
|
|
730
|
|
731 }
|
|
732 close CMAP;
|
|
733 }
|
|
734 #------------------------------------------------------------------------------#
|
|
735 #------------------------------------------------------------------------------#
|
|
736 #Read the mate file according the input format file (solid, eland or sam)
|
|
737 sub readMateFile{
|
|
738
|
|
739 my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_;
|
|
740 my ($strand1,$strand2);
|
|
741
|
|
742 if($file_type eq "solid"){
|
|
743
|
|
744 ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based
|
|
745
|
|
746 }else{
|
|
747 my ($tag_length1,$tag_length2);
|
|
748 ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based
|
|
749 if($file_type eq "eland");
|
|
750
|
|
751 if($file_type eq "sam"){
|
|
752
|
|
753 return 0 if ($$t[0]=~/^@/); #header sam filtered out
|
|
754
|
|
755 ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]);
|
|
756
|
|
757 return 0 if ($$chr1 eq "*" || $$chr2 eq "*");
|
|
758
|
|
759 $$chr2=$$chr1 if($$chr2 eq "=");
|
|
760
|
|
761 $strand1 = (($$t[1]&0x0010))? 'R':'F';
|
|
762 $strand2 = (($$t[1]&0x0020))? 'R':'F';
|
|
763
|
|
764 $$order1= (($$t[1]&0x0040))? '1':'2';
|
|
765 $$order2= (($$t[1]&0x0080))? '1':'2';
|
|
766 $tag_length1 = $tag_length->{$$order1};
|
|
767 $tag_length2 = $tag_length->{$$order2};
|
|
768 }
|
|
769
|
|
770 $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts
|
|
771 $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R");
|
|
772 }
|
|
773 return 1;
|
|
774 }
|
|
775 #------------------------------------------------------------------------------#
|
|
776 #------------------------------------------------------------------------------#
|
|
777 #Parsing of the mates files and creation of links between 2 chromosomal fragments
|
|
778 sub linking{
|
|
779
|
|
780 my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_;
|
|
781 my %link;
|
|
782
|
|
783 my $record=0;
|
|
784 my $nb_links=0;
|
|
785 my $warn=10000;
|
|
786
|
|
787 my @sfile=split(/\./,$mates_file);
|
|
788 my $fchr=$sfile[$#sfile];
|
|
789
|
|
790 my $fh = new FileHandle;
|
|
791
|
|
792 print LOG "# $fchr : Linking procedure...\n";
|
|
793 print LOG "-- file=$mates_file\n".
|
|
794 "-- chromosome=$fchr\n".
|
|
795 "-- input format=$input_format\n".
|
|
796 "-- type=$sv_type\n".
|
|
797 "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n".
|
|
798 "-- window size=$window_dist, step length=$step\n";
|
|
799
|
|
800 if ($mates_file =~ /.gz$/) {
|
|
801 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
|
|
802 }elsif($mates_file =~ /.bam$/){
|
|
803 $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
|
|
804 }else{
|
|
805 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
|
|
806 }
|
|
807
|
|
808
|
|
809 while(<$fh>){
|
|
810
|
|
811 my @t=split; #for each mate-pair
|
|
812 my $mate=$t[0];
|
|
813 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
|
|
814
|
|
815 next if(exists $$pair{$mate});
|
|
816
|
|
817 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
|
|
818
|
|
819 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
|
|
820
|
|
821 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
|
|
822
|
|
823 if($sv_type ne "all"){
|
|
824 if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) ||
|
|
825 ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){
|
|
826 }else{
|
|
827 next;
|
|
828 }
|
|
829 }
|
|
830
|
|
831 $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function)
|
|
832
|
|
833 $record++;
|
|
834
|
|
835 my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read
|
|
836
|
|
837 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
|
|
838 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
|
|
839
|
|
840 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation
|
|
841
|
|
842 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
|
|
843
|
|
844 if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){
|
|
845
|
|
846 for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){
|
|
847
|
|
848 if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) {
|
|
849
|
|
850 if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){
|
|
851
|
|
852 makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link
|
|
853 }
|
|
854
|
|
855 }else{
|
|
856
|
|
857 $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step);
|
|
858 }
|
|
859 }
|
|
860 }
|
|
861
|
|
862 }else{
|
|
863
|
|
864 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
|
|
865 }
|
|
866 }
|
|
867
|
|
868 if($record>=$warn){
|
|
869 print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n";
|
|
870 $warn+=10000;
|
|
871 }
|
|
872 }
|
|
873 $fh->close;
|
|
874
|
|
875 if(!$nb_links){
|
|
876 print LOG "-- $fchr : No mate-pairs !\n".
|
|
877 "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n";
|
|
878 }
|
|
879
|
|
880 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n";
|
|
881
|
|
882 print LOG "-- $fchr : writing...\n";
|
|
883
|
|
884 $fh = new FileHandle;
|
|
885
|
|
886 $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n";
|
|
887
|
|
888 foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output
|
|
889
|
|
890 foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){
|
|
891
|
|
892 foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){
|
|
893
|
|
894 foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){
|
|
895
|
|
896 my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2});
|
|
897 print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t".
|
|
898 "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t".
|
|
899 scalar @count."\t". #nb of read
|
|
900 $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list
|
|
901 }
|
|
902 }
|
|
903 }
|
|
904 }
|
|
905
|
|
906 $fh->close;
|
|
907
|
|
908 undef %link;
|
|
909
|
|
910 }
|
|
911 #------------------------------------------------------------------------------#
|
|
912 #------------------------------------------------------------------------------#
|
|
913 #remove exact links doublons according to the mate list
|
|
914 sub getUniqueLinks{
|
|
915
|
|
916 my ($links_file,$nrlinks_file)=@_;
|
|
917 my %links;
|
|
918 my %pt;
|
|
919 my $nb_links;
|
|
920 my $n=1;
|
|
921
|
|
922 my $record=0;
|
|
923 my $warn=300000;
|
|
924
|
|
925 my @sfile=split(/\./,$links_file);
|
|
926 my $fchr=$sfile[$#sfile-2];
|
|
927
|
|
928 my $fh = new FileHandle;
|
|
929
|
|
930 print LOG "# $fchr : Getting unique links...\n";
|
|
931 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
932
|
|
933 while(<$fh>){
|
|
934
|
|
935 my @t=split;
|
|
936 my $mates=$t[7];
|
|
937 $record++;
|
|
938
|
|
939 if(!exists $links{$mates}){ #Unique links selection
|
|
940
|
|
941 $links{$mates}=[@t];
|
|
942 $pt{$n}=$links{$mates};
|
|
943 $n++;
|
|
944
|
|
945
|
|
946 }else{ #get the link coordinates from the mate-pairs list
|
|
947
|
|
948 for my $i (1,2,4,5){ #get the shortest regions
|
|
949
|
|
950 $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start
|
|
951 if($i==1 || $i==4);
|
|
952 $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end
|
|
953 if($i==2 || $i==5);
|
|
954 }
|
|
955 }
|
|
956 if($record>=$warn){
|
|
957 print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n";
|
|
958 $warn+=300000;
|
|
959 }
|
|
960 }
|
|
961 $fh->close;
|
|
962
|
|
963 $nb_links=$n-1;
|
|
964 print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n";
|
|
965
|
|
966 $fh = new FileHandle;
|
|
967 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
|
|
968 print LOG "-- $fchr : writing...\n";
|
|
969 for my $i (1..$nb_links){
|
|
970
|
|
971 print $fh join("\t",@{$pt{$i}})."\n"; #all links output
|
|
972 }
|
|
973
|
|
974 $fh->close;
|
|
975
|
|
976 undef %links;
|
|
977 undef %pt;
|
|
978
|
|
979 }
|
|
980 #------------------------------------------------------------------------------#
|
|
981 #------------------------------------------------------------------------------#
|
|
982 #get the new coordinates of each link from the mate list
|
|
983 sub defineCoordsLinks{
|
|
984
|
|
985 my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_;
|
|
986
|
|
987 my @sfile=split(/\./,$links_file);
|
|
988 my $fchr=$sfile[$#sfile-2];
|
|
989
|
|
990 my $fh = new FileHandle;
|
|
991 my $fh2 = new FileHandle;
|
|
992
|
|
993 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
994 $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n";
|
|
995
|
|
996 print LOG "# $fchr : Defining precise link coordinates...\n";
|
|
997
|
|
998 my $record=0;
|
|
999 my $warn=100000;
|
|
1000
|
|
1001 my %coords;
|
|
1002 my %strands;
|
|
1003 my %order;
|
|
1004 my %ends_order;
|
|
1005
|
|
1006 while(<$fh>){
|
|
1007
|
|
1008
|
|
1009 my ($col1,$col2)=(1,2); #for an intrachromosomal link
|
|
1010 my $diffchr=0; #difference between chr1 and chr2
|
|
1011 my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8];
|
|
1012 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
|
|
1013 if ($chr1 != $chr2){ #for an interchromosomal link
|
|
1014 $col1=$col2=0; #no distinction
|
|
1015 $diffchr=1;
|
|
1016 }
|
|
1017
|
|
1018 my @pairs=split(",",$mates_list);
|
|
1019
|
|
1020 $coords{$col1}{$chr1}->{start}=undef;
|
|
1021 $coords{$col1}{$chr1}->{end}=undef;
|
|
1022 $coords{$col2}{$chr2}->{start}=undef;
|
|
1023 $coords{$col2}{$chr2}->{end}=undef;
|
|
1024 $strands{$col1}{$chr1}=undef;
|
|
1025 $strands{$col2}{$chr2}=undef;
|
|
1026 $ends_order{$col1}{$chr1}=undef;
|
|
1027 $ends_order{$col2}{$chr2}=undef;
|
|
1028
|
|
1029
|
|
1030 $order{$col1}{$chr1}->{index}->{1}=undef;
|
|
1031 $order{$col1}{$chr1}->{index}->{2}=undef;
|
|
1032 $order{$col2}{$chr2}->{index}->{1}=undef;
|
|
1033 $order{$col2}{$chr2}->{index}->{2}=undef;
|
|
1034 $order{$col1}{$chr1}->{order}=undef;
|
|
1035 $order{$col2}{$chr2}->{order}=undef;
|
|
1036
|
|
1037 $record++;
|
|
1038
|
|
1039 for my $p (0..$#pairs){ #for each pair
|
|
1040
|
|
1041 my ($coord_start_read1,$coord_end_read1);
|
|
1042 my ($coord_start_read2,$coord_end_read2);
|
|
1043 my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format);
|
|
1044 my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format);
|
|
1045
|
|
1046 if(!$diffchr){ #for a intrachromosomal link
|
|
1047 if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column
|
|
1048 ($col1,$col2)=(2,1);
|
|
1049 }else{
|
|
1050 ($col1,$col2)=(1,2);
|
|
1051 }
|
|
1052 }
|
|
1053
|
|
1054 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads
|
|
1055 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1);
|
|
1056 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2);
|
|
1057 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2);
|
|
1058 push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1);
|
|
1059 push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2);
|
|
1060 push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]);
|
|
1061 push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]);
|
|
1062 }
|
|
1063
|
|
1064 ($col1,$col2)=(1,2) if(!$diffchr);
|
|
1065
|
|
1066 my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region
|
|
1067 my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}}));
|
|
1068 my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}}));
|
|
1069 my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}}));
|
|
1070
|
|
1071 @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}};
|
|
1072 @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}};
|
|
1073
|
|
1074 foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting)
|
|
1075 foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){
|
|
1076
|
|
1077 if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){
|
|
1078 ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i;
|
|
1079 ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j;
|
|
1080 last;
|
|
1081 }
|
|
1082 }
|
|
1083 }
|
|
1084
|
|
1085 foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID
|
|
1086 foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){
|
|
1087
|
|
1088 if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){
|
|
1089 ${$order{$col1}{$chr1}->{order}}[$i]=$i+1;
|
|
1090 ${$order{$col2}{$chr2}->{order}}[$i]=$j+1;
|
|
1091 last;
|
|
1092 }
|
|
1093 }
|
|
1094 }
|
|
1095
|
|
1096 @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads
|
|
1097 @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1});
|
|
1098 @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2});
|
|
1099 @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1});
|
|
1100 @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2});
|
|
1101 @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start});
|
|
1102 @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start});
|
|
1103
|
|
1104
|
|
1105 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
|
|
1106 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
|
|
1107 scalar @pairs,
|
|
1108 join(",",@pairs),
|
|
1109 join(",",@{$strands{$col1}{$chr1}}),
|
|
1110 join(",",@{$strands{$col2}{$chr2}}),
|
|
1111 join(",",@{$ends_order{$col1}{$chr1}}),
|
|
1112 join(",",@{$ends_order{$col2}{$chr2}}),
|
|
1113 join(",",@{$order{$col1}{$chr1}->{order}}),
|
|
1114 join(",",@{$order{$col2}{$chr2}->{order}}),
|
|
1115 join(",",@{$coords{$col1}{$chr1}->{start}}),
|
|
1116 join(",",@{$coords{$col2}{$chr2}->{start}}));
|
|
1117
|
|
1118 print $fh2 join("\t",@link)."\n";
|
|
1119
|
|
1120 if($record>=$warn){
|
|
1121 print LOG "-- $fchr : $warn links processed\n";
|
|
1122 $warn+=100000;
|
|
1123 }
|
|
1124 }
|
|
1125 $fh->close;
|
|
1126 $fh2->close;
|
|
1127
|
|
1128 print LOG "-- $fchr : Total : $record links processed\n";
|
|
1129
|
|
1130 }
|
|
1131 #------------------------------------------------------------------------------#
|
|
1132 #------------------------------------------------------------------------------#
|
|
1133 #Sort links according the concerned chromosomes and their coordinates
|
|
1134 sub sortLinks{
|
|
1135
|
|
1136 my ($links_file,$sortedlinks_file,$unique)=@_;
|
|
1137
|
|
1138 my @sfile=split(/\./,$links_file);
|
|
1139 my $fchr=$sfile[$#sfile-2];
|
|
1140
|
|
1141
|
|
1142 print LOG "# $fchr : Sorting links...\n";
|
|
1143
|
|
1144 my $pipe=($unique)? "| sort -u":"";
|
|
1145 system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file";
|
|
1146
|
|
1147 }
|
|
1148 #------------------------------------------------------------------------------#
|
|
1149 #------------------------------------------------------------------------------#
|
|
1150 #removal of fully overlapped links
|
|
1151 sub removeFullyOverlappedLinks{
|
|
1152
|
|
1153 my ($links_file,$nrlinks_file,$warn_out)=@_;
|
|
1154
|
|
1155 my %pt;
|
|
1156 my $n=1;
|
|
1157
|
|
1158 my @sfile=split(/\./,$links_file);
|
|
1159 my $fchr=$sfile[$#sfile-2];
|
|
1160
|
|
1161 my $fh = new FileHandle;
|
|
1162
|
|
1163 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
1164 while(<$fh>){
|
|
1165
|
|
1166 my @t=split("\t",$_);
|
|
1167 $pt{$n}=[@t];
|
|
1168 $n++;
|
|
1169 }
|
|
1170 $fh->close;
|
|
1171
|
|
1172 my $nb_links=$n-1;
|
|
1173 my $nb=$nb_links;
|
|
1174
|
|
1175 my %pt2;
|
|
1176 my $nb2=1;
|
|
1177 my $record=0;
|
|
1178 my $warn=10000;
|
|
1179
|
|
1180 print LOG "# $fchr : Removing fully overlapped links...\n";
|
|
1181
|
|
1182 LINK:
|
|
1183
|
|
1184 for my $i (1..$nb){
|
|
1185
|
|
1186 my @link=();
|
|
1187 my @next_link=();
|
|
1188 my $ind1=$i;
|
|
1189
|
|
1190 $record++;
|
|
1191 if($record>=$warn){
|
|
1192 print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n";
|
|
1193 $warn+=10000;
|
|
1194 }
|
|
1195
|
|
1196 if(exists $pt{$ind1}){
|
|
1197 @link=@{$pt{$ind1}}; #link1
|
|
1198 }else{
|
|
1199 next LINK;
|
|
1200 }
|
|
1201
|
|
1202 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1
|
|
1203 my @mates=deleteBadOrderSensePairs(split(",",$link[7]));
|
|
1204
|
|
1205 my $ind2=$ind1+1;
|
|
1206 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link
|
|
1207
|
|
1208 if($ind2<=$nb){
|
|
1209
|
|
1210 @next_link=@{$pt{$ind2}}; #link2
|
|
1211 my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2
|
|
1212 my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
|
|
1213
|
|
1214 while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter
|
|
1215
|
|
1216 if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2
|
|
1217
|
|
1218 $ind2++;
|
|
1219 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
|
|
1220
|
|
1221 if($ind2>$nb){ #if no more link in the file -> save link1
|
|
1222
|
|
1223 $pt2{$nb2}=\@link;
|
|
1224 $nb2++;
|
|
1225 next LINK;
|
|
1226 }
|
|
1227
|
|
1228 @next_link=@{$pt{$ind2}};
|
|
1229 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
|
|
1230 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
|
|
1231 next;
|
|
1232 }
|
|
1233
|
|
1234 my %mates=map{$_ =>1} @mates; #get the equal number of mates
|
|
1235 my @same_mates = grep( $mates{$_}, @next_mates );
|
|
1236 my $nb_mates= scalar @same_mates;
|
|
1237
|
|
1238 if($nb_mates == scalar @mates){
|
|
1239
|
|
1240 delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1
|
|
1241 next LINK; #go to link2, link2 becomes link1
|
|
1242
|
|
1243 }else{
|
|
1244 delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2
|
|
1245 $ind2++; #we continue by checking the next link2
|
|
1246 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
|
|
1247
|
|
1248 if($ind2>$nb){ #if no more link in the file -> save link1
|
|
1249
|
|
1250 $pt2{$nb2}=\@link;
|
|
1251 $nb2++;
|
|
1252 next LINK;
|
|
1253 }
|
|
1254
|
|
1255 @next_link=@{$pt{$ind2}}; #get info of link2
|
|
1256 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
|
|
1257 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
|
|
1258
|
|
1259 }
|
|
1260 }
|
|
1261 }
|
|
1262 $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1
|
|
1263 $nb2++;
|
|
1264 }
|
|
1265
|
|
1266 print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n";
|
|
1267
|
|
1268 #OUTPUT
|
|
1269
|
|
1270 $fh = new FileHandle;
|
|
1271 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
|
|
1272 print LOG "-- $fchr : writing...\n";
|
|
1273 for my $i (1..$nb2-1){
|
|
1274
|
|
1275 print $fh join("\t",@{$pt2{$i}}); #all links output
|
|
1276 }
|
|
1277
|
|
1278 close $fh;
|
|
1279
|
|
1280 print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out);
|
|
1281
|
|
1282 undef %pt;
|
|
1283 undef %pt2;
|
|
1284 }
|
|
1285 #------------------------------------------------------------------------------#
|
|
1286 sub postFiltering {
|
|
1287
|
|
1288 my ($links_file,$pflinks_file, $finalScore_thres)=@_;
|
|
1289
|
|
1290 my @sfile=split(/\./,$links_file);
|
|
1291 my $fchr=$sfile[$#sfile-2];
|
|
1292
|
|
1293
|
|
1294 my ($nb,$nb2)=(0,0);
|
|
1295
|
|
1296 print LOG "# $fchr : Post-filtering links...\n";
|
|
1297 print LOG "-- $fchr : final score threshold = $finalScore_thres\n";
|
|
1298
|
|
1299 my $fh = new FileHandle;
|
|
1300 my $fh2 = new FileHandle;
|
|
1301
|
|
1302 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
1303 $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n";
|
|
1304
|
|
1305
|
|
1306 while(<$fh>){
|
|
1307
|
|
1308 my @t=split("\t",$_);
|
|
1309 my $score=$t[$#t-1];
|
|
1310
|
|
1311 if($score >= $finalScore_thres){
|
|
1312 print $fh2 join("\t", @t);
|
|
1313 $nb2++;
|
|
1314 }
|
|
1315 $nb++;
|
|
1316 }
|
|
1317 $fh->close;
|
|
1318 $fh2->close;
|
|
1319
|
|
1320 print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n";
|
|
1321 print LOG "-- $fchr : output created: $pflinks_file\n";
|
|
1322 }
|
|
1323
|
|
1324
|
|
1325
|
|
1326 #------------------------------------------------------------------------------#
|
|
1327 #------------------------------------------------------------------------------#
|
|
1328 #Filtering of the links
|
|
1329 sub strandFiltering{
|
|
1330
|
|
1331 my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes,
|
|
1332 $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_;
|
|
1333
|
|
1334 my @sfile=split(/\./,$links_file);
|
|
1335 my $fchr=$sfile[$#sfile-1];
|
|
1336
|
|
1337
|
|
1338 my %chrs;
|
|
1339 my %chrs1;
|
|
1340 my %chrs2;
|
|
1341 my $nb_chrs;
|
|
1342 my $exclude;
|
|
1343
|
|
1344 if($chromosomes){
|
|
1345 my @chrs=split(",",$chromosomes);
|
|
1346 $nb_chrs=scalar @chrs;
|
|
1347 $exclude=($chrs[0]=~/^\-/)? 1:0;
|
|
1348 for my $chrName (@chrs){
|
|
1349 $chrName=~s/^(\-)//;
|
|
1350 my $col=($chrName=~s/_(1|2)$//);
|
|
1351
|
|
1352 if(!$col){
|
|
1353 $chrs{$chrID->{$chrName}}=undef
|
|
1354 }else{
|
|
1355 $chrs1{$chrID->{$chrName}}=undef if($1==1);
|
|
1356 $chrs2{$chrID->{$chrName}}=undef if($1==2);
|
|
1357 }
|
|
1358 }
|
|
1359 }
|
|
1360
|
|
1361 my $record=0;
|
|
1362 my $nb_links=0;
|
|
1363 my $warn=10000;
|
|
1364
|
|
1365 my $sens_ratio_threshold=0.6;
|
|
1366
|
|
1367 print LOG "\# Filtering procedure...\n";
|
|
1368 print LOG "\# Number of pairs and strand filtering...\n";
|
|
1369 print LOG "-- file=$links_file\n";
|
|
1370 print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no").
|
|
1371 ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n";
|
|
1372
|
|
1373
|
|
1374
|
|
1375 my $fh = new FileHandle;
|
|
1376 my $fh2 = new FileHandle;
|
|
1377
|
|
1378 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
1379 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
|
|
1380
|
|
1381 while(<$fh>){
|
|
1382
|
|
1383 my @t=split; #for each link
|
|
1384 my $is_good=1;
|
|
1385 $record++;
|
|
1386
|
|
1387
|
|
1388 if($chromosomes){
|
|
1389
|
|
1390 my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]});
|
|
1391
|
|
1392 if(!$exclude){
|
|
1393 $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0;
|
|
1394 $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good);
|
|
1395 $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good);
|
|
1396 }else{
|
|
1397 $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1;
|
|
1398 $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good);
|
|
1399 }
|
|
1400 }
|
|
1401
|
|
1402 $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs
|
|
1403 if($is_good && $strand_filtering){ #if filtering according the strand sense
|
|
1404
|
|
1405 my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense
|
|
1406 my @strands1=split(/,/,$t[8]);
|
|
1407 my @strands2=split(/,/,$t[9]);
|
|
1408
|
|
1409 my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0);
|
|
1410
|
|
1411 my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR
|
|
1412 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF
|
|
1413
|
|
1414 my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE",
|
|
1415 substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE",
|
|
1416 inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE");
|
|
1417
|
|
1418 if($t[6] == 1){
|
|
1419
|
|
1420 push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1);
|
|
1421
|
|
1422 }else{
|
|
1423
|
|
1424 tie (my %class,'Tie::IxHash');
|
|
1425 my $split;
|
|
1426
|
|
1427 foreach my $i (0..$#mates){
|
|
1428 $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group
|
|
1429 }
|
|
1430
|
|
1431 my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR};
|
|
1432 my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF};
|
|
1433 my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class);
|
|
1434
|
|
1435 if($sens_ratio < $sens_ratio_threshold){
|
|
1436 %class=(1=>'FF', 2=>'FR');
|
|
1437 $split=1;
|
|
1438 }else{
|
|
1439 $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class
|
|
1440 $split=0;
|
|
1441 }
|
|
1442
|
|
1443 $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold);
|
|
1444 }
|
|
1445 }
|
|
1446
|
|
1447 if($is_good){ #PRINT
|
|
1448
|
|
1449 my $nb=scalar @t;
|
|
1450 if($nb > 20){
|
|
1451 my @t2=splice(@t,0,20);
|
|
1452 print $fh2 join("\t",@t2)."\n";
|
|
1453 $nb_links++;
|
|
1454 }
|
|
1455 $nb_links++;
|
|
1456 print $fh2 join("\t",@t)."\n";
|
|
1457 }
|
|
1458
|
|
1459 if($record>=$warn){
|
|
1460 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
|
|
1461 $warn+=10000;
|
|
1462 }
|
|
1463 }
|
|
1464 $fh->close;
|
|
1465 $fh2->close;
|
|
1466
|
|
1467 print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links);
|
|
1468
|
|
1469 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
|
|
1470
|
|
1471
|
|
1472 }
|
|
1473 #------------------------------------------------------------------------------#
|
|
1474 #------------------------------------------------------------------------------#
|
|
1475 sub getConsistentSenseLinks{
|
|
1476
|
|
1477 my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_;
|
|
1478
|
|
1479 my $npairs=scalar @$mates;
|
|
1480
|
|
1481 my @ends_order1 = split (/,/,$$t[10]);
|
|
1482 my @ends_order2 = split (/,/,$$t[11]);
|
|
1483 my @order1 = split (/,/,$$t[12]);
|
|
1484 my @order2 = split (/,/,$$t[13]);
|
|
1485 my @positions1 = split (/,/,$$t[14]);
|
|
1486 my @positions2 = split (/,/,$$t[15]);
|
|
1487
|
|
1488 my @newlink;
|
|
1489
|
|
1490 foreach my $ind (keys %{$class} ){
|
|
1491
|
|
1492 tie (my %flink,'Tie::IxHash');
|
|
1493 my @orders2remove=();
|
|
1494
|
|
1495 foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group
|
|
1496
|
|
1497 if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){
|
|
1498 push(@{$flink{mates}},$$mates[$i]);
|
|
1499 push(@{$flink{strands1}},$$strands1[$i]);
|
|
1500 push(@{$flink{strands2}},$$strands2[$i]);
|
|
1501 push(@{$flink{ends_order1}},$ends_order1[$i]);
|
|
1502 push(@{$flink{ends_order2}},$ends_order2[$i]);
|
|
1503 push(@{$flink{positions1}},$positions1[$i]);
|
|
1504 push(@{$flink{positions2}},$positions2[$i]);
|
|
1505
|
|
1506 }else{
|
|
1507 push(@orders2remove,$order1[$i]);
|
|
1508 }
|
|
1509 }
|
|
1510
|
|
1511 @{$flink{order1}}=();
|
|
1512 @{$flink{order2}}=();
|
|
1513 if(scalar @orders2remove > 0){
|
|
1514 getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2})
|
|
1515 }else{
|
|
1516 @{$flink{order1}}=@order1;
|
|
1517 @{$flink{order2}}=@order2;
|
|
1518 }
|
|
1519
|
|
1520 my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length);
|
|
1521 my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length);
|
|
1522
|
|
1523 my $fnpairs=scalar @{$flink{mates}};
|
|
1524 my $strand_filtering_ratio=$fnpairs."/".$npairs;
|
|
1525 my $real_ratio=$fnpairs/$npairs;
|
|
1526
|
|
1527 if($fnpairs>=$thres){ #filtering according the number of pairs
|
|
1528
|
|
1529 push(@newlink,
|
|
1530 $$t[0],
|
|
1531 min(min(@{$flink{positions1}}),min(@ends1)),
|
|
1532 max(max(@{$flink{positions1}}),max(@ends1)),
|
|
1533 $$t[3],
|
|
1534 min(min(@{$flink{positions2}}),min(@ends2)),
|
|
1535 max(max(@{$flink{positions2}}),max(@ends2)),
|
|
1536 $fnpairs,
|
|
1537 join(",",@{$flink{mates}}),
|
|
1538 join(",",@{$flink{strands1}}),
|
|
1539 join(",",@{$flink{strands2}}),
|
|
1540 join(",",@{$flink{ends_order1}}),
|
|
1541 join(",",@{$flink{ends_order2}}),
|
|
1542 join(",",@{$flink{order1}}),
|
|
1543 join(",",@{$flink{order2}}),
|
|
1544 join(",",@{$flink{positions1}}),
|
|
1545 join(",",@{$flink{positions2}}),
|
|
1546 $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]},
|
|
1547 $strand_filtering_ratio,
|
|
1548 $real_ratio,
|
|
1549 $npairs
|
|
1550 );
|
|
1551 }
|
|
1552 }
|
|
1553
|
|
1554 if (grep {defined($_)} @newlink) {
|
|
1555 @$t=@newlink;
|
|
1556 return 1
|
|
1557 }
|
|
1558 return 0;
|
|
1559
|
|
1560 }
|
|
1561 #------------------------------------------------------------------------------#
|
|
1562 #------------------------------------------------------------------------------#
|
|
1563 sub getNewOrders{
|
|
1564
|
|
1565 my($tab1,$tab2,$list,$newtab1,$newtab2)=@_;
|
|
1566 my $j=1;
|
|
1567 my $k=1;
|
|
1568 for my $i (0..$#{$tab2}){
|
|
1569 my $c=0;
|
|
1570 for my $j (0..$#{$list}){
|
|
1571 $c++ if(${$list}[$j] < ${$tab2}[$i]);
|
|
1572 if(${$list}[$j] == ${$tab2}[$i]){
|
|
1573 $c=-1; last;
|
|
1574 }
|
|
1575 }
|
|
1576 if($c!=-1){
|
|
1577 push(@{$newtab2}, ${$tab2}[$i]-$c);
|
|
1578 push(@{$newtab1}, $k);
|
|
1579 $k++;
|
|
1580 }
|
|
1581 }
|
|
1582 }
|
|
1583
|
|
1584 #------------------------------------------------------------------------------#
|
|
1585 #------------------------------------------------------------------------------#
|
|
1586 #Filtering of the links using their order
|
|
1587 sub orderFiltering {
|
|
1588
|
|
1589 my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
|
|
1590
|
|
1591 my @sfile=split(/\./,$links_file);
|
|
1592 my $fchr=$sfile[$#sfile-2];
|
|
1593
|
|
1594
|
|
1595 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
|
|
1596
|
|
1597 my $record=0;
|
|
1598 my $warn=10000;
|
|
1599 my $nb_links=0;
|
|
1600
|
|
1601 my $quant05 = 1.644854;
|
|
1602 my $quant001 = 3.090232;
|
|
1603 my $alphaDist = $quant05 * 2 * $sigma;
|
|
1604 my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
|
|
1605
|
|
1606 print LOG "\# Filtering by order...\n";
|
|
1607 print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n";
|
|
1608 print LOG "-- distance between comparable pairs was set to $alphaDist\n";
|
|
1609 print LOG "-- maximal fragment length was set to $maxFragmentLength\n";
|
|
1610
|
|
1611
|
|
1612 my $fh = new FileHandle;
|
|
1613 my $fh2 = new FileHandle;
|
|
1614
|
|
1615 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
1616 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
|
|
1617
|
|
1618 while(<$fh>){
|
|
1619
|
|
1620 $record++;
|
|
1621 my @t = split;
|
|
1622 my ($chr1,$chr2,$mates_list)=@t[0,3,7];
|
|
1623 my @pairs=split(",",$mates_list);
|
|
1624 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
|
|
1625 my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5];
|
|
1626 my $numberOfPairs = $t[6];
|
|
1627 my @strand1 = split (/,/,$t[8]);
|
|
1628 my @strand2 = split (/,/,$t[9]);
|
|
1629 my @ends_order1 = split (/,/,$t[10]);
|
|
1630 my @ends_order2 = split (/,/,$t[11]);
|
|
1631 my @order1 = split (/,/,$t[12]);
|
|
1632 my @order2 = split (/,/,$t[13]);
|
|
1633 my @positions1 = split (/,/,$t[14]);
|
|
1634 my @positions2 = split (/,/,$t[15]);
|
|
1635 my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length);
|
|
1636 my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length);
|
|
1637 my $clusterCoordinates_chr1;
|
|
1638 my $clusterCoordinates_chr2;
|
|
1639 my $reads_left = 0;
|
|
1640
|
|
1641 my $ifRenv = $t[16];
|
|
1642 my $strand_ratio_filtering=$t[17];
|
|
1643
|
|
1644 #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct
|
|
1645 my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense);
|
|
1646 #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct
|
|
1647 #These pairs are annotated as BED pairs forever! They won't be recycled!
|
|
1648 my $table;
|
|
1649 for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs
|
|
1650 my $nonAdeq = 0;
|
|
1651 for my $j (0..$i-1) {
|
|
1652 if (exists($table->{$j}->{$i})) {
|
|
1653 $nonAdeq++;
|
|
1654 $table->{$i}->{$j} = 1;
|
|
1655 }
|
|
1656 }
|
|
1657 for my $j ($i+1..$numberOfPairs-1) {
|
|
1658 if ($positions1[$j]-$positions1[$i]>$alphaDist) {
|
|
1659 if (&reversed ($i,$j,$ifRenv,\@positions2)) {
|
|
1660 $nonAdeq++;
|
|
1661 $table->{$i}->{$j} = 1;
|
|
1662 }
|
|
1663 }
|
|
1664 }
|
|
1665 $table->{$i}->{nonAdeq} = $nonAdeq;
|
|
1666 }
|
|
1667
|
|
1668 for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation
|
|
1669 &remove($bad,$table);
|
|
1670 }
|
|
1671
|
|
1672 my @falseReads;
|
|
1673 #RRRR-F -> RRRR or R-FFFF -> FFFF
|
|
1674 @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table});
|
|
1675 #these pairs will be recycled later as $secondTable
|
|
1676 for my $bad (@falseReads) {
|
|
1677 &remove($bad,$table);
|
|
1678 }
|
|
1679
|
|
1680 my $bad = &check($table);
|
|
1681 while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER
|
|
1682 # push (@falseReads, $bad); remove completely!!!
|
|
1683 &remove($bad,$table);
|
|
1684 $bad = &check($table);
|
|
1685 }
|
|
1686
|
|
1687 $reads_left = scalar keys %{$table};
|
|
1688 my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}]));
|
|
1689 my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}]));
|
|
1690 my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}]));
|
|
1691 my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}]));
|
|
1692
|
|
1693 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
|
|
1694 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
|
|
1695
|
|
1696 my $ifBalanced = 'UNBAL';
|
|
1697 my $secondTable;
|
|
1698 my $clusterCoordinates;
|
|
1699 my ($break_pont_chr1,$break_pont_chr2);
|
|
1700
|
|
1701 my $signatureType="";
|
|
1702
|
|
1703 my $maxCoord1 =$chr->{$chr1}->{length};
|
|
1704 my $maxCoord2 =$chr->{$chr2}->{length};
|
|
1705
|
|
1706 if (scalar @falseReads) {
|
|
1707 @falseReads = sort @falseReads;
|
|
1708 #now delete FRFR choosing the majority
|
|
1709 my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF
|
|
1710 @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled
|
|
1711 my %hashTmp;
|
|
1712 for my $count1 (0..scalar(@falseReads)-1) {
|
|
1713 my $i = $falseReads[$count1];
|
|
1714 $hashTmp{$i} = 1;
|
|
1715 for my $bad (@newfalseReads) {
|
|
1716 if ($bad == $i) {
|
|
1717 delete $hashTmp{$i};
|
|
1718 next;
|
|
1719 }
|
|
1720 }
|
|
1721 }
|
|
1722 @falseReads = sort keys %hashTmp; #what is left
|
|
1723 for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected
|
|
1724 my $nonAdeq = 0;
|
|
1725 my $i = $falseReads[$count1];
|
|
1726
|
|
1727 for my $count2 (0..$count1-1) {
|
|
1728 my $j = $falseReads[$count2];
|
|
1729 if (exists($secondTable->{$j}->{$i})) {
|
|
1730 $nonAdeq++;
|
|
1731 $secondTable->{$i}->{$j} = 1;
|
|
1732 }
|
|
1733 }
|
|
1734 for my $count2 ($count1+1..scalar(@falseReads)-1) {
|
|
1735 my $j = $falseReads[$count2];
|
|
1736 if ($positions1[$j]-$positions1[$i]>$alphaDist) {
|
|
1737 if (&reversed ($i,$j,$ifRenv,\@positions2)) {
|
|
1738 $nonAdeq++;
|
|
1739 $secondTable->{$i}->{$j} = 1;
|
|
1740 }
|
|
1741 }
|
|
1742 }
|
|
1743 $secondTable->{$i}->{nonAdeq} = $nonAdeq;
|
|
1744 }
|
|
1745
|
|
1746 my @falseReads2;
|
|
1747 my $bad = &check($secondTable);
|
|
1748 while ($bad ne "OK") { #clear the table to reject non adequate pairs
|
|
1749 push (@falseReads2, $bad);
|
|
1750 &remove($bad,$secondTable);
|
|
1751 $bad = &check($secondTable);
|
|
1752 }
|
|
1753 if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) {
|
|
1754 my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
|
|
1755 my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
|
|
1756 my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
|
|
1757 my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
|
|
1758
|
|
1759 $ifBalanced = 'BAL';
|
|
1760
|
|
1761 if ($ifBalanced eq 'BAL') {
|
|
1762
|
|
1763 if (scalar keys %{$table} < $nb_pairs_order_threshold) {
|
|
1764 $ifBalanced = 'UNBAL'; #kill cluster 1!
|
|
1765 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
|
|
1766 $reads_left = scalar keys %{$table};
|
|
1767 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
|
|
1768 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
|
|
1769 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
|
|
1770 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
|
|
1771 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
|
|
1772 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
|
|
1773
|
|
1774 } else {
|
|
1775
|
|
1776 $reads_left += scalar keys %{$secondTable};
|
|
1777 next if ($reads_left < $nb_pairs_threshold);
|
|
1778
|
|
1779 if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) {
|
|
1780 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
|
|
1781
|
|
1782 ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1);
|
|
1783 ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1);
|
|
1784 ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1);
|
|
1785 ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1);
|
|
1786
|
|
1787 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1;
|
|
1788 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2;
|
|
1789 }else {
|
|
1790 $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')';
|
|
1791 $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')';
|
|
1792 }
|
|
1793 $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2);
|
|
1794 $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2);
|
|
1795 $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2);
|
|
1796 $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2);
|
|
1797 #to calculate breakpoints one need to take into account read orientation in claster..
|
|
1798 my $leftLetterOk = substr($mate_sense, 0, 1); #R
|
|
1799 my $rightLetterOk = substr($mate_sense, 1, 1); #F
|
|
1800
|
|
1801
|
|
1802 my @index1 = keys %{$table};
|
|
1803 my @index2 = keys %{$secondTable};
|
|
1804
|
|
1805 my (@generalStrand1,@generalStrand2) = 0;
|
|
1806
|
|
1807 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
|
|
1808 $leftLetterOk = 'R';
|
|
1809 $rightLetterOk = 'F';
|
|
1810 @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1);
|
|
1811 @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2);
|
|
1812 } else {
|
|
1813 @generalStrand1 = @strand1;
|
|
1814 @generalStrand2 = @strand2; # TODO check if it is correct
|
|
1815 }
|
|
1816 if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F)
|
|
1817 $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')';
|
|
1818
|
|
1819 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
|
|
1820 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
|
|
1821 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
|
|
1822 $signatureType = "TRANSLOC";
|
|
1823 } else {
|
|
1824 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
|
|
1825 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
|
|
1826 $signatureType = "INS_FRAGMT";
|
|
1827 }
|
|
1828
|
|
1829 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
|
|
1830 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
|
|
1831 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
|
|
1832 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
|
|
1833 $signatureType = "INV_INS_FRAGMT";
|
|
1834 } else {
|
|
1835 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
|
|
1836 $signatureType = "INV_TRANSLOC";
|
|
1837 }
|
|
1838 } else {
|
|
1839 #should not occur
|
|
1840 print STDERR "\nError in orderFiltering\n\n";
|
|
1841 }
|
|
1842 }
|
|
1843
|
|
1844 elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R)
|
|
1845 $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')';
|
|
1846 $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')';
|
|
1847 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
|
|
1848 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
|
|
1849 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
|
|
1850 $signatureType = "INV_INS_FRAGMT";
|
|
1851 } else {
|
|
1852 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
|
|
1853 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
|
|
1854 $signatureType = "INV_COAMPLICON";
|
|
1855 }
|
|
1856
|
|
1857 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
|
|
1858 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
|
|
1859 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
|
|
1860 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
|
|
1861 $signatureType = "COAMPLICON";
|
|
1862 } else {
|
|
1863 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
|
|
1864 $signatureType = "INS_FRAGMT";
|
|
1865 }
|
|
1866 } else {
|
|
1867 #should not occur
|
|
1868 $signatureType = "UNDEFINED";
|
|
1869 }
|
|
1870 }
|
|
1871 else { # (F,F) or (R,R) something strange. We will discard the smallest cluster
|
|
1872 $ifBalanced = 'UNBAL';
|
|
1873 if (scalar keys %{$secondTable} > scalar keys %{$table}) {
|
|
1874 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
|
|
1875
|
|
1876 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
|
|
1877 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
|
|
1878 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
|
|
1879 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
|
|
1880 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
|
|
1881 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
|
|
1882 }
|
|
1883 $reads_left = scalar keys %{$table};
|
|
1884 }
|
|
1885 if ($ifBalanced eq 'BAL') {
|
|
1886 $ifRenv = $signatureType;
|
|
1887 }
|
|
1888 }
|
|
1889 }
|
|
1890 }
|
|
1891 }
|
|
1892 if ($ifBalanced ne 'BAL') {
|
|
1893 #define possible break point
|
|
1894 $coord_start_chr1 = $coord_start_chr1_cluster1;
|
|
1895 $coord_end_chr1 = $coord_end_chr1_cluster1;
|
|
1896 $coord_start_chr2 = $coord_start_chr2_cluster1;
|
|
1897 $coord_end_chr2 = $coord_end_chr2_cluster1;
|
|
1898
|
|
1899 my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1;
|
|
1900 my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2;
|
|
1901
|
|
1902 my $leftLetterOk = substr($mate_sense, 0, 1); #R
|
|
1903 my $rightLetterOk = substr($mate_sense, 1, 1); #F
|
|
1904
|
|
1905 my @index = keys %{$table};
|
|
1906 unless ($diff_sense_ends) {
|
|
1907 my $firstEndOrder1 = $ends_order1[$index[0]];
|
|
1908 my $firstEndOrder2 = $ends_order2[$index[0]];
|
|
1909 $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
|
|
1910 $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
|
|
1911 } else {
|
|
1912 $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
|
|
1913 $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
|
|
1914 }
|
|
1915
|
|
1916 if ($chr1 ne $chr2){
|
|
1917 $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE");
|
|
1918 $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE");
|
|
1919 }
|
|
1920 }
|
|
1921
|
|
1922 if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) {
|
|
1923 next; #discard the link
|
|
1924 }
|
|
1925 if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) {
|
|
1926 next; #discard the link
|
|
1927 }
|
|
1928 my $ratioTxt = "$reads_left/".(scalar @pairs);
|
|
1929 my ($n1,$nTot) = split ("/",$strand_ratio_filtering);
|
|
1930 my $ratioReal = $reads_left/$nTot;
|
|
1931
|
|
1932 if ($coord_start_chr1<=0) {
|
|
1933 $coord_start_chr1=1;
|
|
1934 }
|
|
1935 if ($coord_start_chr2<=0) {
|
|
1936 $coord_start_chr2=1;
|
|
1937 }
|
|
1938 #create output
|
|
1939 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
|
|
1940 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
|
|
1941 $reads_left,
|
|
1942 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs),
|
|
1943 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1),
|
|
1944 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2),
|
|
1945 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1),
|
|
1946 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2),
|
|
1947 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1),
|
|
1948 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2),
|
|
1949 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1),
|
|
1950 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2),
|
|
1951 $ifRenv,
|
|
1952 $strand_ratio_filtering,
|
|
1953 $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2,
|
|
1954 $ratioReal, $nTot);
|
|
1955
|
|
1956 $nb_links++;
|
|
1957 print $fh2 join("\t",@link)."\n";
|
|
1958
|
|
1959 if($record>=$warn){
|
|
1960 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
|
|
1961 $warn+=10000;
|
|
1962 }
|
|
1963
|
|
1964 }
|
|
1965 $fh->close;
|
|
1966 $fh2->close;
|
|
1967
|
|
1968 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
|
|
1969
|
|
1970 }
|
|
1971 #------------------------------------------------------------------------------#
|
|
1972 #------------------------------------------------------------------------------#
|
|
1973 #gets information about ends positions given start, direction and order
|
|
1974 sub getEnds {
|
|
1975 my ($ends,$starts,$strand,$end_order,$tag_length) = @_;
|
|
1976 for my $i (0..scalar(@{$starts})-1) {
|
|
1977 $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length);
|
|
1978 }
|
|
1979 }
|
|
1980 sub getEnd {
|
|
1981 my ($start,$strand, $end_order,$tag_length) = @_;
|
|
1982 return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1;
|
|
1983 }
|
|
1984 #------------------------------------------------------------------------------#
|
|
1985 #------------------------------------------------------------------------------#
|
|
1986 #gets starts and ends Coords when start=leftmost given positions, directions and orders
|
|
1987 sub getCoordswithLeftMost {
|
|
1988
|
|
1989 my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_;
|
|
1990
|
|
1991 for my $i (0..scalar(@{$positions})-1) {
|
|
1992
|
|
1993 if($strand->[$i] eq 'F'){
|
|
1994 $starts->[$i]=$positions->[$i];
|
|
1995 $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1;
|
|
1996 }else{
|
|
1997 $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1;
|
|
1998 $ends->[$i]=$positions->[$i];
|
|
1999 }
|
|
2000 }
|
|
2001 }
|
|
2002 #------------------------------------------------------------------------------#
|
|
2003 #------------------------------------------------------------------------------#
|
|
2004 sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering
|
|
2005
|
|
2006 my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
|
|
2007
|
|
2008 my @sfile=split(/\./,$links_file);
|
|
2009 my $fchr=$sfile[$#sfile-2];
|
|
2010
|
|
2011
|
|
2012 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
|
|
2013
|
|
2014 my $record=0;
|
|
2015 my $nb_links=0;
|
|
2016 my $warn=10000;
|
|
2017
|
|
2018 print LOG "\# Filtering out normal pairs using insert size...\n";
|
|
2019 print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n";
|
|
2020 print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-".
|
|
2021 ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n";
|
|
2022 print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-".
|
|
2023 ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n";
|
|
2024 print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF");
|
|
2025
|
|
2026 my $fh = new FileHandle;
|
|
2027 my $fh2 = new FileHandle;
|
|
2028
|
|
2029 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
|
|
2030 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
|
|
2031
|
|
2032 while(<$fh>){
|
|
2033
|
|
2034 $record++;
|
|
2035 my @t = split;
|
|
2036 my ($chr1,$chr2,$mates_list)=@t[0,3,7];
|
|
2037
|
|
2038 if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all),
|
|
2039 $nb_links++;
|
|
2040
|
|
2041 $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE");
|
|
2042 $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE");
|
|
2043
|
|
2044 $t[16].= "\t";
|
|
2045 $t[19].= "\t";
|
|
2046
|
|
2047 print $fh2 join("\t",@t)."\n";
|
|
2048
|
|
2049 if($record>=$warn){
|
|
2050 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
|
|
2051 $warn+=10000;
|
|
2052 }
|
|
2053 next;
|
|
2054 }
|
|
2055
|
|
2056 my $ifRenv = $t[16];
|
|
2057 my $ifBalanced = "UNBAL";
|
|
2058 $ifBalanced = $t[18] if ($order_filtering);
|
|
2059
|
|
2060 my $numberOfPairs = $t[6];
|
|
2061 my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14]));
|
|
2062 my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15]));
|
|
2063
|
|
2064 if ($ifBalanced eq "BAL") {
|
|
2065
|
|
2066 if ($ifRenv eq "INV_TRANSLOC") {
|
|
2067 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
|
|
2068 }
|
|
2069 if ($ifRenv eq "NORMAL_SENSE") {
|
|
2070 $ifRenv = "TRANSLOC";
|
|
2071 }
|
|
2072 if ($ifRenv eq "REVERSE_SENSE") {
|
|
2073 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
|
|
2074 }
|
|
2075 $t[19].= "\t";
|
|
2076
|
|
2077 my $meanDistance = 0;
|
|
2078
|
|
2079 for my $i (0..$numberOfPairs-1) {
|
|
2080 $meanDistance += $positions2[$i]-$positions1[$i];
|
|
2081 }
|
|
2082 $meanDistance /= $numberOfPairs;
|
|
2083
|
|
2084 $t[16] = $ifRenv."\t".$meanDistance;
|
|
2085 #dont touch the annotation. It should be already OK.
|
|
2086
|
|
2087 } else {
|
|
2088 #only for unbalanced
|
|
2089
|
|
2090 my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]);
|
|
2091
|
|
2092 my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0].
|
|
2093 (deleteBadOrderSensePairs(split (/,/,$t[9])))[0];
|
|
2094 my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0].
|
|
2095 (deleteBadOrderSensePairs(split (/,/,$t[11])))[0];
|
|
2096
|
|
2097 my $indel_type = $ifRenv;
|
|
2098
|
|
2099 my $meanDistance = "N/A";
|
|
2100
|
|
2101 ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster
|
|
2102 \@positions1, #assign N/A to $indel_type if unknown
|
|
2103 \@positions2,
|
|
2104 $ifRenv,
|
|
2105 $ifoverlap,
|
|
2106 $indel_sigma_threshold,
|
|
2107 $dup_sigma_threshold,
|
|
2108 $singleton_sigma_threshold,
|
|
2109 $mu,
|
|
2110 $sigma,
|
|
2111 $ifBalanced,
|
|
2112 $ends_sense_class,
|
|
2113 $ends_order_class,
|
|
2114 $mate_sense,
|
|
2115 $diff_sense_ends,
|
|
2116 );
|
|
2117
|
|
2118 #filtering of pairs with distance inconsistant with the SV
|
|
2119 if ($ifRenv ne "REVERSE_SENSE") {
|
|
2120 my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length};
|
|
2121 my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length};
|
|
2122 $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,
|
|
2123 $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering);
|
|
2124 next if ($t[6] < $nb_pairs_threshold);
|
|
2125 }else{
|
|
2126 $t[19].= "\t";
|
|
2127 }
|
|
2128 $t[16] = $indel_type."\t".$meanDistance;
|
|
2129 }
|
|
2130
|
|
2131 $nb_links++;
|
|
2132
|
|
2133 print $fh2 join("\t",@t)."\n";
|
|
2134 if($record>=$warn){
|
|
2135 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
|
|
2136 $warn+=10000;
|
|
2137 }
|
|
2138 }
|
|
2139 $fh->close;
|
|
2140 $fh2->close;
|
|
2141
|
|
2142 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
|
|
2143
|
|
2144 }
|
|
2145 #------------------------------------------------------------------------------#
|
|
2146 #------------------------------------------------------------------------------#
|
|
2147 sub checkIndel {
|
|
2148
|
|
2149 my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold,
|
|
2150 $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_;
|
|
2151
|
|
2152 my $meanDistance = 0;
|
|
2153
|
|
2154 for my $i (0..$numberOfPairs-1) {
|
|
2155 $meanDistance += $positions2->[$i]-$positions1->[$i];
|
|
2156 }
|
|
2157 $meanDistance /= $numberOfPairs;
|
|
2158
|
|
2159 return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) );
|
|
2160
|
|
2161 return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE");
|
|
2162
|
|
2163 if($diff_sense_ends){
|
|
2164 return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
|
|
2165 return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense)));
|
|
2166 }else{
|
|
2167 return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) &&
|
|
2168 ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
|
|
2169 }
|
|
2170
|
|
2171 return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap);
|
|
2172
|
|
2173 return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ;
|
|
2174
|
|
2175 return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma);
|
|
2176 return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma);
|
|
2177
|
|
2178 return ($meanDistance, "UNDEFINED");
|
|
2179 }
|
|
2180 #------------------------------------------------------------------------------#
|
|
2181 #------------------------------------------------------------------------------#
|
|
2182 #sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size )
|
|
2183 sub recalc_t_usingInsertSizeInfo {
|
|
2184 my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_;
|
|
2185
|
|
2186 my @badPairs;
|
|
2187
|
|
2188 my @positions1 = getAllEntries($t->[14]);
|
|
2189 my @positions2 = getAllEntries($t->[15]);
|
|
2190
|
|
2191 if ($meanDistance < $mu) {
|
|
2192 for my $i (0..scalar(@positions1)-1) {
|
|
2193 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) {
|
|
2194 push(@badPairs,$i);
|
|
2195 }
|
|
2196 }
|
|
2197 } else {
|
|
2198 for my $i (0..scalar(@positions1)-1) {
|
|
2199 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) {
|
|
2200 push(@badPairs,$i);
|
|
2201 }
|
|
2202 }
|
|
2203 }
|
|
2204
|
|
2205 if (scalar (@badPairs)>0) {
|
|
2206 #print join("\t",@badPairs).": ".join("\t",@t)."\n";
|
|
2207 #remove these inconsistant links
|
|
2208 $t->[6] -= scalar(@badPairs); #numberOfPairs
|
|
2209 return if ($t->[6] < $nb_pairs_threshold);
|
|
2210
|
|
2211 $t->[7] = mark_values(\@badPairs, $t->[7]);
|
|
2212 $t->[8] = mark_values(\@badPairs, $t->[8]);
|
|
2213 $t->[9] = mark_values(\@badPairs, $t->[9]);
|
|
2214 $t->[10] = mark_values(\@badPairs, $t->[10]);
|
|
2215 $t->[11] = mark_values(\@badPairs, $t->[11]);
|
|
2216
|
|
2217 $t->[12] = mark_indexes(\@badPairs, $t->[12]);
|
|
2218 $t->[13] = mark_indexes(\@badPairs, $t->[13]);
|
|
2219
|
|
2220 $t->[14] = mark_values(\@badPairs, $t->[14]);
|
|
2221 $t->[15] = mark_values(\@badPairs, $t->[15]);
|
|
2222 $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio
|
|
2223 $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering);
|
|
2224 ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length);
|
|
2225 ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length);
|
|
2226
|
|
2227 #recalc breakpoints:
|
|
2228 my $quant001 = 3.090232;
|
|
2229 my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
|
|
2230 $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
|
|
2231 $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
|
|
2232 #recalc total ratio
|
|
2233 $t->[22] = $t->[6] / $t->[23] if ($order_filtering);
|
|
2234 $t->[18] = $t->[6] / $t->[19] unless ($order_filtering);
|
|
2235
|
|
2236 @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14]));
|
|
2237 @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15]));
|
|
2238
|
|
2239 $meanDistance = 0;
|
|
2240
|
|
2241 for my $i (0..scalar(@positions1)-1) {
|
|
2242 $meanDistance += $positions2[$i]-$positions1[$i];
|
|
2243 }
|
|
2244 $meanDistance /= scalar(@positions1);
|
|
2245
|
|
2246 } else {
|
|
2247 $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering);
|
|
2248 $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering);
|
|
2249
|
|
2250 } #nothing has been filtered
|
|
2251 return $meanDistance;
|
|
2252 }
|
|
2253
|
|
2254 sub recalculate_ratio {
|
|
2255 my ($left, $ratio) = @_;
|
|
2256 my @elements = split (/\//,$ratio);
|
|
2257 $elements[1]= $elements[0];
|
|
2258 $elements[0]=$left;
|
|
2259 return $ratio."\t".join("/",@elements);
|
|
2260 }
|
|
2261
|
|
2262 sub recalc_breakpoints {
|
|
2263 my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_;
|
|
2264 my $break_pont_chr;
|
|
2265
|
|
2266 my $leftLetterOk = substr($mate_sense, 0, 1); #R
|
|
2267 my $rightLetterOk = substr($mate_sense, 1, 1); #F
|
|
2268
|
|
2269
|
|
2270 my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
|
|
2271
|
|
2272 unless ($diff_sense_ends) {
|
|
2273 $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
|
|
2274 } else {
|
|
2275 $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
|
|
2276 }
|
|
2277 return $break_pont_chr;
|
|
2278 }
|
|
2279 sub recalculate_boundaries {
|
|
2280 my ($startString,$senseString,$endsOrderString,$tag_length) = @_;
|
|
2281 my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
|
|
2282 my @strands = deleteBadOrderSensePairs(split (/,/,$senseString));
|
|
2283 my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString));
|
|
2284 my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length);
|
|
2285 my $coord_start_cluster = min(min(@positions),min(@ends));
|
|
2286 my $coord_end_cluster = max(max(@positions),max(@ends));
|
|
2287 return ($coord_start_cluster,$coord_end_cluster);
|
|
2288 }
|
|
2289
|
|
2290 sub remove_indexes {
|
|
2291 my ($bads, $string) = @_;
|
|
2292 my @elements = deleteBadOrderSensePairs(split (/,/,$string));
|
|
2293 for my $i (reverse sort %{$bads}) {
|
|
2294 delete $elements[$i];
|
|
2295 }
|
|
2296 return "(".join(",",@elements).")";
|
|
2297 }
|
|
2298 ##add @ to to elements
|
|
2299 sub mark_values {
|
|
2300 my ($bads, $string) = @_;
|
|
2301 my @elements = getAllEntries($string);
|
|
2302 for my $i (@{$bads}) {
|
|
2303 $elements[$i] .= "@";
|
|
2304 }
|
|
2305 return "(".join(",",@elements).")";
|
|
2306 }
|
|
2307 ##add @ to to indexes
|
|
2308 sub mark_indexes {
|
|
2309 my ($bads, $string) = @_;
|
|
2310 my @elements = getAllEntries($string);
|
|
2311 for my $i ((0..scalar(@elements)-1)) {
|
|
2312 for my $j (@{$bads}) {
|
|
2313 $elements[$i] .= "@" if ($elements[$i] eq ($j+1));
|
|
2314 }
|
|
2315 }
|
|
2316
|
|
2317 return "(".join(",",@elements).")";
|
|
2318 }
|
|
2319
|
|
2320 #------------------------------------------------------------------------------#
|
|
2321 #------------------------------------------------------------------------------#
|
|
2322 sub redraw {
|
|
2323
|
|
2324 my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_;
|
|
2325
|
|
2326 my $out;
|
|
2327 my @first_arr;
|
|
2328 if ($ifBalanced eq 'BAL') {
|
|
2329 my @second_arr;
|
|
2330 my $lastPushed = 1;
|
|
2331 if ($type == 1) {
|
|
2332 for my $i (0 .. scalar(@{$arr})-1) {
|
|
2333 if (exists ($table->{$i})) {
|
|
2334 push(@first_arr,$arr->[$i]);
|
|
2335 $lastPushed = 1;
|
|
2336 }elsif (exists ($secondTable->{$i})) {
|
|
2337 push(@second_arr,$arr->[$i]);
|
|
2338 $lastPushed = 2;
|
|
2339 } elsif ($lastPushed == 1) {
|
|
2340 if (exists ($badInFRSense->{$i})) {
|
|
2341 push(@first_arr,$arr->[$i]."\$");
|
|
2342 }else {
|
|
2343 push(@first_arr,$arr->[$i]."*");
|
|
2344 }
|
|
2345 } elsif ($lastPushed == 2) {
|
|
2346 if (exists ($badInFRSense->{$i})) {
|
|
2347 push(@second_arr,$arr->[$i]."\$");
|
|
2348 }else {
|
|
2349 push(@second_arr,$arr->[$i]."*");
|
|
2350 }
|
|
2351 } else {print "Error!";exit;}
|
|
2352 }
|
|
2353 } else {
|
|
2354 for my $i (@{$arr}) {
|
|
2355 if (exists ($table->{$i-1})) {
|
|
2356 push(@first_arr,$i);
|
|
2357 $lastPushed = 1;
|
|
2358 }elsif (exists ($secondTable->{$i-1})) {
|
|
2359 push(@second_arr,$i);
|
|
2360 $lastPushed = 2;
|
|
2361 } elsif ($lastPushed == 1) {
|
|
2362 if (exists ($badInFRSense->{$i-1})) {
|
|
2363 push(@first_arr,$i."\$");
|
|
2364 }else {
|
|
2365 push(@first_arr,$i."*");
|
|
2366 }
|
|
2367 } elsif ($lastPushed == 2) {
|
|
2368 if (exists ($badInFRSense->{$i-1})) {
|
|
2369 push(@second_arr,$i."\$");
|
|
2370 }else {
|
|
2371 push(@second_arr,$i."*");
|
|
2372 }
|
|
2373 } else {print "Error!";exit;}
|
|
2374 }
|
|
2375 }
|
|
2376 $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')';
|
|
2377 }
|
|
2378 else {
|
|
2379 if ($type == 1) {
|
|
2380 for my $i (0 .. scalar(@{$arr})-1) {
|
|
2381 if (exists ($table->{$i})) {
|
|
2382 push(@first_arr,$arr->[$i]);
|
|
2383 } else {
|
|
2384 if (exists ($badInFRSense->{$i})) {
|
|
2385 push(@first_arr,$arr->[$i]."\$");
|
|
2386 }else {
|
|
2387 push(@first_arr,$arr->[$i]."*");
|
|
2388 }
|
|
2389 }
|
|
2390 }
|
|
2391 } else {
|
|
2392 for my $i (@{$arr}) {
|
|
2393 if (exists ($table->{$i-1})) {
|
|
2394 push(@first_arr,$i);
|
|
2395 } else {
|
|
2396 if (exists ($badInFRSense->{$i-1})) {
|
|
2397 push(@first_arr,$i."\$");
|
|
2398 }else {
|
|
2399 push(@first_arr,$i."*");
|
|
2400 }
|
|
2401 }
|
|
2402 }
|
|
2403 }
|
|
2404 $out = '('.join(",",@first_arr).')';
|
|
2405 }
|
|
2406 return $out;
|
|
2407 }
|
|
2408 #------------------------------------------------------------------------------#
|
|
2409 #------------------------------------------------------------------------------#
|
|
2410 sub check {
|
|
2411
|
|
2412 my $table = $_[0];
|
|
2413 my $bad = 'OK';
|
|
2414 my $max = 0;
|
|
2415 for my $i (sort {$a<=>$b} keys %{$table}) {
|
|
2416 unless ($table->{$i}->{nonAdeq} == 0) {
|
|
2417 if ($max<$table->{$i}->{nonAdeq}) {
|
|
2418 $max=$table->{$i}->{nonAdeq};
|
|
2419 $bad = $i;
|
|
2420 }
|
|
2421 }
|
|
2422 }
|
|
2423 return $bad;
|
|
2424 }
|
|
2425 #------------------------------------------------------------------------------#
|
|
2426 #------------------------------------------------------------------------------#
|
|
2427 sub reversed {
|
|
2428
|
|
2429 my ($i,$j,$ifRenv,$positions) = @_;
|
|
2430 if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){
|
|
2431 return 1;
|
|
2432 }
|
|
2433 return 0;
|
|
2434 }
|
|
2435 #------------------------------------------------------------------------------#
|
|
2436 #------------------------------------------------------------------------------#
|
|
2437 sub remove {
|
|
2438
|
|
2439 my ($bad,$table) = @_;
|
|
2440 for my $i (sort {$a<=>$b} keys %{$table}) {
|
|
2441 if ($bad == $i) {
|
|
2442 delete($table->{$i});;
|
|
2443 } else {
|
|
2444 if (exists($table->{$i}->{$bad})) {
|
|
2445 delete($table->{$i}->{$bad});
|
|
2446 $table->{$i}->{nonAdeq}--;
|
|
2447 }
|
|
2448 }
|
|
2449 }
|
|
2450 }
|
|
2451 #------------------------------------------------------------------------------#
|
|
2452 #------------------------------------------------------------------------------#
|
|
2453 sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs
|
|
2454
|
|
2455 my ($strand,$ends_order,$mate_sense,@keysLeft) = @_;
|
|
2456
|
|
2457 my $leftLetterOk = substr($mate_sense, 0, 1); #R
|
|
2458 my $rightLetterOk = substr($mate_sense, 1, 1); #F
|
|
2459
|
|
2460 my (@standardArray);
|
|
2461 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
|
|
2462 $leftLetterOk = 'R';
|
|
2463 $rightLetterOk = 'F';
|
|
2464 @standardArray = translateSolidToRF($strand,$ends_order);
|
|
2465 } else {
|
|
2466 @standardArray = @{$strand};
|
|
2467 }
|
|
2468
|
|
2469 my $ifR = 0;
|
|
2470 my @Rs;
|
|
2471
|
|
2472 for my $i (@keysLeft) {
|
|
2473 if ($standardArray[$i] eq $leftLetterOk) {
|
|
2474 $ifR++;
|
|
2475 push(@Rs,$i);
|
|
2476 }
|
|
2477 }
|
|
2478
|
|
2479
|
|
2480 my $ifF = 0;
|
|
2481 my @Fs;
|
|
2482
|
|
2483 for my $i (@keysLeft) {
|
|
2484 if ($standardArray[$i] eq $rightLetterOk) {
|
|
2485 $ifF++;
|
|
2486 push(@Fs,$i);
|
|
2487 }
|
|
2488 }
|
|
2489
|
|
2490 if($ifR>=$ifF) {
|
|
2491 return @Fs;
|
|
2492 }
|
|
2493 return @Rs;
|
|
2494 }
|
|
2495
|
|
2496 #------------------------------------------------------------------------------#
|
|
2497 #------------------------------------------------------------------------------#
|
|
2498 sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa
|
|
2499
|
|
2500 my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]);
|
|
2501 my $mate_sense = $_[6];
|
|
2502
|
|
2503 my $leftLetterOk = substr($mate_sense, 0, 1); #R
|
|
2504 my $rightLetterOk = substr($mate_sense, 1, 1); #F
|
|
2505
|
|
2506 my (@standardArray1,@standardArray2);
|
|
2507
|
|
2508 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
|
|
2509 $leftLetterOk = 'R';
|
|
2510 $rightLetterOk = 'F';
|
|
2511 @standardArray1 = translateSolidToRF($strand1,$ends_order1);
|
|
2512 my @arr = getOrderedStrands($strand2,$order2);
|
|
2513 my @ends2 = getOrderedStrands($ends_order2,$order2);
|
|
2514 @standardArray2 = translateSolidToRF(\@arr,\@ends2);
|
|
2515
|
|
2516 } else {
|
|
2517 @standardArray1 = @{$strand1};
|
|
2518 @standardArray2 = getOrderedStrands($strand2,$order2);
|
|
2519 }
|
|
2520
|
|
2521 #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF
|
|
2522
|
|
2523 #for the first end:
|
|
2524
|
|
2525 my @array = @standardArray1;
|
|
2526 my %badInFRSense1;
|
|
2527 for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF
|
|
2528 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
|
|
2529 $badInFRSense1{$i}=1;
|
|
2530 $array[$i] = $rightLetterOk;
|
|
2531 }
|
|
2532 }
|
|
2533 my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1);
|
|
2534 @array = @standardArray1;
|
|
2535 my %badInFRSense0;
|
|
2536 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR
|
|
2537 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
|
|
2538 $badInFRSense0{$i-1}=1;
|
|
2539 $array[$i-1] = $leftLetterOk;
|
|
2540
|
|
2541 }
|
|
2542 }
|
|
2543 my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0);
|
|
2544
|
|
2545 #for the second end:
|
|
2546 @array = @standardArray2;
|
|
2547
|
|
2548 my %badInFRSense3;
|
|
2549 for my $i (1..scalar(@array)-1){
|
|
2550 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
|
|
2551 $badInFRSense3{$order2->[$i]}=1;
|
|
2552 $array[$i] = $rightLetterOk;
|
|
2553 }
|
|
2554 }
|
|
2555 my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3);
|
|
2556
|
|
2557 @array = @standardArray2;
|
|
2558 my %badInFRSense5;
|
|
2559 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF
|
|
2560 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
|
|
2561 $badInFRSense5{$i-1}=1;
|
|
2562 $array[$i-1] = $leftLetterOk;
|
|
2563 }
|
|
2564 }
|
|
2565 my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5);
|
|
2566
|
|
2567 if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) {
|
|
2568 return (1,%badInFRSense0);
|
|
2569 }
|
|
2570
|
|
2571 if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) {
|
|
2572 return (1,%badInFRSense1);
|
|
2573 }
|
|
2574
|
|
2575 if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) {
|
|
2576 return (2,%badInFRSense3);
|
|
2577 }
|
|
2578
|
|
2579 if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) {
|
|
2580 return (2,%badInFRSense5);
|
|
2581 }
|
|
2582
|
|
2583 #should not get here:
|
|
2584 print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n";
|
|
2585 return (1,%badInFRSense1);
|
|
2586 }
|
|
2587
|
|
2588 sub getOrderedStrands {
|
|
2589 my ($strand,$order) = ($_[0],$_[1]);
|
|
2590 my @arr;
|
|
2591 for my $i (0..scalar(@{$strand})-1) {
|
|
2592 push(@arr,$strand->[$order->[$i]-1]);
|
|
2593 }
|
|
2594 return @arr;
|
|
2595 }
|
|
2596 #------------------------------------------------------------------------------#
|
|
2597 #------------------------------------------------------------------------------#
|
|
2598 sub checkClusters {
|
|
2599
|
|
2600 my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_;
|
|
2601 if ($ifRenv eq 'REVERSE_SENSE') {
|
|
2602 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
|
|
2603 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
|
|
2604 }
|
|
2605 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
|
|
2606 }
|
|
2607 #if NORM
|
|
2608 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
|
|
2609 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
|
|
2610 }
|
|
2611 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
|
|
2612 }
|
|
2613
|
|
2614 #------------------------------------------------------------------------------#
|
|
2615 #------------------------------------------------------------------------------#
|
|
2616 sub translateSolidToRF {
|
|
2617 my ($strandArr,$ends_orderArr)=@_;
|
|
2618 my @array;
|
|
2619 for my $i (0..scalar(@{$strandArr})-1) {
|
|
2620 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') {
|
|
2621 push(@array,'F');
|
|
2622 }
|
|
2623 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') {
|
|
2624 push(@array,'R');
|
|
2625 }
|
|
2626 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') {
|
|
2627 push(@array,'R');
|
|
2628 }
|
|
2629 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') {
|
|
2630 push(@array,'F');
|
|
2631 }
|
|
2632 }
|
|
2633 return @array;
|
|
2634 }
|
|
2635
|
|
2636 #------------------------------------------------------------------------------#
|
|
2637 #------------------------------------------------------------------------------#
|
|
2638 #convert the links file to the circos format
|
|
2639 sub links2segdup{
|
|
2640
|
|
2641 my($id,$color_code,$links_file,$segdup_file)=@_;
|
|
2642
|
|
2643 print LOG "# Converting to the circos format...\n";
|
|
2644
|
|
2645 tie (my %hcolor,'Tie::IxHash'); #color-code hash table
|
|
2646 foreach my $col (keys %{$color_code}){
|
|
2647 my ($min_links,$max_links)=split(",",$color_code->{$col});
|
|
2648 $hcolor{$col}=[$min_links,$max_links];
|
|
2649 }
|
|
2650
|
|
2651 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
|
|
2652 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
|
|
2653
|
|
2654 my $index=1;
|
|
2655 while(<LINKS>){
|
|
2656
|
|
2657 my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6];
|
|
2658
|
|
2659 my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links
|
|
2660
|
|
2661 print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output
|
|
2662 "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n";
|
|
2663 $index++;
|
|
2664 }
|
|
2665
|
|
2666 close LINKS;
|
|
2667 close SEGDUP;
|
|
2668 print LOG "-- output created: $segdup_file\n";
|
|
2669 }
|
|
2670 #------------------------------------------------------------------------------#
|
|
2671 #------------------------------------------------------------------------------#
|
|
2672 #convert the links file to the bedPE format for BEDTools usage
|
|
2673 sub links2bedPElinksfile{
|
|
2674
|
|
2675 my ($sample,$links_file,$bedpe_file)=@_;
|
|
2676
|
|
2677 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
|
|
2678 open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n";
|
|
2679
|
|
2680 my $nb_links=1;
|
|
2681
|
|
2682 while(<LINKS>){
|
|
2683
|
|
2684 chomp;
|
|
2685 my @t=split("\t",$_);
|
|
2686 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6);
|
|
2687 my $type=($chr1 eq $chr2)? "INTRA":"INTER";
|
|
2688 $type.="_".$t[10];
|
|
2689
|
|
2690 $start1--; $start2--;
|
|
2691
|
|
2692 print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2".
|
|
2693 "\t$sample"."_link$nb_links\t$type\t.\t.".
|
|
2694 "\t".join("|",@t)."\n";
|
|
2695
|
|
2696 $nb_links++;
|
|
2697 }
|
|
2698
|
|
2699 close LINKS;
|
|
2700 close BEDPE;
|
|
2701
|
|
2702 }
|
|
2703 #------------------------------------------------------------------------------#
|
|
2704 #------------------------------------------------------------------------------#
|
|
2705 sub bedPElinks2linksfile{
|
|
2706
|
|
2707 my ($bedpe_file,$links_file)=@_;
|
|
2708
|
|
2709 open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n";
|
|
2710 open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n";
|
|
2711
|
|
2712 while(<BEDPE>){
|
|
2713
|
|
2714 chomp;
|
|
2715 my $sample=(split("_",(split("\t",$_))[6]))[0];
|
|
2716 my @t1=(split("\t",$_))[0,1,2,3,4,5];
|
|
2717 my @t2=split(/\|/,(split("\t",$_))[10]);
|
|
2718 push(@t2,$sample);
|
|
2719
|
|
2720 print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n";
|
|
2721
|
|
2722 }
|
|
2723 close BEDPE;
|
|
2724 close LINKS;
|
|
2725
|
|
2726 }
|
|
2727 #------------------------------------------------------------------------------#
|
|
2728 #------------------------------------------------------------------------------#
|
|
2729 #convert the links file to the bed format
|
|
2730 sub links2bedfile{
|
|
2731
|
|
2732 my ($tag_length,$color_code,$links_file,$bed_file)=@_;
|
|
2733
|
|
2734 print LOG "# Converting to the bed format...\n";
|
|
2735
|
|
2736 my $compare=1;
|
|
2737 if($links_file!~/compared$/){
|
|
2738 $compare=0;
|
|
2739 $tag_length->{none}->{1}=$tag_length->{1};
|
|
2740 $tag_length->{none}->{2}=$tag_length->{2};
|
|
2741 }
|
|
2742
|
|
2743 #color-code hash table
|
|
2744 tie (my %hcolor,'Tie::IxHash');
|
|
2745 my %color_order;
|
|
2746 my $n=1;
|
|
2747 foreach my $col (keys %{$color_code}){
|
|
2748 my ($min_links,$max_links)=split(",",$color_code->{$col});
|
|
2749 $hcolor{$col}=[$min_links,$max_links];
|
|
2750 $color_order{$col}=$n;
|
|
2751 $n++;
|
|
2752 }
|
|
2753
|
|
2754 my %pair;
|
|
2755 my %pt;
|
|
2756 $n=1;
|
|
2757 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
|
|
2758
|
|
2759 my %str=( "F"=>"+", "R"=>"-" );
|
|
2760
|
|
2761 while(<LINKS>){
|
|
2762
|
|
2763 my @t=split;
|
|
2764 my $sample=($compare)? pop(@t):"none";
|
|
2765
|
|
2766 my $chr1=$t[0];
|
|
2767 my $chr2=$t[3];
|
|
2768 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
|
|
2769 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
|
|
2770 my $same_chr=($chr1 eq $chr2)? 1:0;
|
|
2771
|
|
2772 my $count=$t[6];
|
|
2773 my $color=getColor($count,\%hcolor,"bed");
|
|
2774
|
|
2775 my @pairs=deleteBadOrderSensePairs(split(",",$t[7]));
|
|
2776 my @strand1=deleteBadOrderSensePairs(split(",",$t[8]));
|
|
2777 my @strand2=deleteBadOrderSensePairs(split(",",$t[9]));
|
|
2778 my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10]));
|
|
2779 my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11]));
|
|
2780 my @position1=deleteBadOrderSensePairs(split(",",$t[14]));
|
|
2781 my @position2=deleteBadOrderSensePairs(split(",",$t[15]));
|
|
2782 my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample});
|
|
2783 my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample});
|
|
2784
|
|
2785
|
|
2786 for my $p (0..$#pairs){
|
|
2787
|
|
2788 if (!exists $pair{$pairs[$p]}){
|
|
2789
|
|
2790 if($same_chr){
|
|
2791
|
|
2792 $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]},
|
|
2793 $start1[$p]-1, $end2[$p], $color,
|
|
2794 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ];
|
|
2795 $pt{$n}=$pair{$pairs[$p]}->{0};
|
|
2796 $n++;
|
|
2797
|
|
2798 }else{
|
|
2799
|
|
2800 $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]},
|
|
2801 $start1[$p]-1, $end1[$p], $color,
|
|
2802 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0];
|
|
2803 $pt{$n}=$pair{$pairs[$p]}->{1};
|
|
2804 $n++;
|
|
2805
|
|
2806
|
|
2807 $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]},
|
|
2808 $start2[$p]-1, $end2[$p], $color,
|
|
2809 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0];
|
|
2810 $pt{$n}=$pair{$pairs[$p]}->{2};
|
|
2811 $n++;
|
|
2812 }
|
|
2813 }else{
|
|
2814
|
|
2815 if($same_chr){
|
|
2816 ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]});
|
|
2817 }else{
|
|
2818 ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]});
|
|
2819 ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]});
|
|
2820 }
|
|
2821 }
|
|
2822 }
|
|
2823 }
|
|
2824 close LINKS;
|
|
2825
|
|
2826 my $nb_pairs=$n-1;
|
|
2827
|
|
2828 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
|
|
2829 print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ".
|
|
2830 "visibility=2 itemRgb=\"On\"\n";
|
|
2831
|
|
2832 for my $i (1..$nb_pairs){
|
|
2833 print BED join("\t",@{$pt{$i}})."\n";
|
|
2834 }
|
|
2835
|
|
2836 close BED;
|
|
2837
|
|
2838 print LOG "-- output created: $bed_file\n";
|
|
2839
|
|
2840 undef %pair;
|
|
2841 undef %pt;
|
|
2842
|
|
2843 }
|
|
2844 #------------------------------------------------------------------------------#
|
|
2845 #------------------------------------------------------------------------------#
|
|
2846 sub deleteBadOrderSensePairs{
|
|
2847
|
|
2848 my (@tab)=@_;
|
|
2849 my @tab2;
|
|
2850
|
|
2851 foreach my $v (@tab){
|
|
2852
|
|
2853 $v=~s/[\(\)]//g;
|
|
2854 push(@tab2,$v) if($v!~/[\$\*\@]$/);
|
|
2855 }
|
|
2856 return @tab2;
|
|
2857 }
|
|
2858 #------------------------------------------------------------------------------#
|
|
2859 #------------------------------------------------------------------------------#
|
|
2860 sub getAllEntries{
|
|
2861 my (@tab)=split (/,/,$_[0]);
|
|
2862 my @tab2;
|
|
2863
|
|
2864 foreach my $v (@tab){
|
|
2865
|
|
2866 $v=~s/[\(\)]//g;
|
|
2867 push(@tab2,$v);
|
|
2868 }
|
|
2869 return @tab2;
|
|
2870 }#------------------------------------------------------------------------------#
|
|
2871 #------------------------------------------------------------------------------#
|
|
2872 sub getAllEntriesWOspecialChar{
|
|
2873 my (@tab)=split (/,/,$_[0]);
|
|
2874 my @tab2;
|
|
2875
|
|
2876 foreach my $v (@tab){
|
|
2877
|
|
2878 $v=~s/[\(\)\$\*\@]//g;
|
|
2879 push(@tab2,$v);
|
|
2880 }
|
|
2881 return @tab2;
|
|
2882 }
|
|
2883 #------------------------------------------------------------------------------#
|
|
2884 #------------------------------------------------------------------------------#
|
|
2885 sub links2SVfile{
|
|
2886
|
|
2887 my($links_file,$sv_file)=@_;
|
|
2888
|
|
2889 print LOG "# Converting to the sv output table...\n";
|
|
2890 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
|
|
2891 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
|
|
2892
|
|
2893 my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist
|
|
2894 chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering
|
|
2895 final_score breakpoint1_start1-end1 breakpoint2_start2-end2);
|
|
2896
|
|
2897 my $nb_links=0;
|
|
2898
|
|
2899 while (<LINKS>){
|
|
2900
|
|
2901 my @t=split;
|
|
2902 my @sv=();
|
|
2903 my $sv_type="-";
|
|
2904 my $strand_ratio="-";
|
|
2905 my $eq_ratio="-";
|
|
2906 my $eq_type="-";
|
|
2907 my $insert_ratio="-";
|
|
2908 my $link="-";
|
|
2909 my ($bk1, $bk2)=("-","-");
|
|
2910 my $score="-";
|
|
2911
|
|
2912 my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]);
|
|
2913 my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]);
|
|
2914 my $nb_pairs=$t[6];
|
|
2915 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
|
|
2916 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
|
|
2917 my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER";
|
|
2918
|
|
2919 #if strand filtering
|
|
2920 if (defined $t[16]){
|
|
2921 #if inter-chr link
|
|
2922 $sv_type=$t[16];
|
|
2923 if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){
|
|
2924 $strand_ratio=floor($1/$2*100)."%";
|
|
2925 $score=$t[18];
|
|
2926 }
|
|
2927 if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){
|
|
2928 #if intra-chr link with insert size filtering
|
|
2929 $strand_ratio=floor($1/$2*100)."%";
|
|
2930 $link=floor($t[17]);
|
|
2931 if($sv_type!~/^INV/){
|
|
2932 $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
|
|
2933 $score=$t[20];
|
|
2934 }else{
|
|
2935 $score=$t[19];
|
|
2936 }
|
|
2937 }
|
|
2938 }
|
|
2939
|
|
2940 if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){
|
|
2941
|
|
2942 #if strand and order filtering only and/or interchr link
|
|
2943 $eq_type=$t[18];
|
|
2944 $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
|
|
2945 ($bk1, $bk2)=($t[20],$t[21]);
|
|
2946 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
|
|
2947 $score=$t[22];
|
|
2948
|
|
2949 }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){
|
|
2950
|
|
2951 #if all three filtering
|
|
2952 $link=floor($t[17]);
|
|
2953 $eq_type=$t[19];
|
|
2954 $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/);
|
|
2955
|
|
2956 if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){
|
|
2957 $insert_ratio=floor($1/$2*100)."%";
|
|
2958 ($bk1, $bk2)=($t[22],$t[23]);
|
|
2959 $score=$t[24];
|
|
2960
|
|
2961 }else{
|
|
2962 ($bk1, $bk2)=($t[21],$t[22]);
|
|
2963 $score=$t[23];
|
|
2964 }
|
|
2965 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
|
|
2966
|
|
2967 }
|
|
2968
|
|
2969
|
|
2970 push(@sv, $chr_type, $sv_type,$eq_type);
|
|
2971 push(@sv,"$chr1\t$start1-$end1");
|
|
2972 push(@sv, $link);
|
|
2973 push(@sv,"$chr2\t$start2-$end2",
|
|
2974 $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2);
|
|
2975
|
|
2976
|
|
2977 print SV join("\t",@sv)."\n";
|
|
2978 }
|
|
2979
|
|
2980 close LINKS;
|
|
2981 close SV;
|
|
2982
|
|
2983 system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted";
|
|
2984
|
|
2985 open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n";
|
|
2986 my @links=<SV>;
|
|
2987 close SV;
|
|
2988
|
|
2989 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
|
|
2990
|
|
2991 print SV join("\t",@header)."\n";
|
|
2992 print SV @links;
|
|
2993 close SV;
|
|
2994
|
|
2995 unlink($sv_file.".sorted");
|
|
2996
|
|
2997 print LOG "-- output created: $sv_file\n";
|
|
2998
|
|
2999 }
|
|
3000 #------------------------------------------------------------------------------#
|
|
3001 sub densityCalculation{
|
|
3002
|
|
3003 my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_;
|
|
3004
|
|
3005 my @sfile=split(/\./,$$mates_file[$file]);
|
|
3006 my $fchr=$sfile[$#sfile];
|
|
3007
|
|
3008 my $fh = new FileHandle;
|
|
3009
|
|
3010 my %density;
|
|
3011 my %density_ref;
|
|
3012 my @ratio;
|
|
3013 my ($cov,$cov_ref);
|
|
3014
|
|
3015 #FREQUENCY CALCULATION PROCEDURE
|
|
3016 print LOG "# $fchr : Frequency calculation procedure...\n";
|
|
3017 &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format);
|
|
3018 &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format);
|
|
3019
|
|
3020 #MAKING RATIO AND OUTPUT
|
|
3021 print LOG "\# Ratio calculation procedure...\n";
|
|
3022 $density_file=~s/\/mates\//\/density\//;
|
|
3023 $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n";
|
|
3024
|
|
3025 foreach my $k (1..$chr->{nb_chrs}){
|
|
3026 foreach my $frag (1..$chr->{$k}->{nb_frag}){
|
|
3027
|
|
3028 @ratio= ($chr->{$k}->{name},
|
|
3029 (${$chr->{$k}->{$frag}}[0]+1),
|
|
3030 (${$chr->{$k}->{$frag}}[1]+1));
|
|
3031
|
|
3032 $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0;
|
|
3033 $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0;
|
|
3034
|
|
3035 push(@ratio,$cov,$cov_ref);
|
|
3036 push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref);
|
|
3037 push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref);
|
|
3038 push(@ratio,log($cov+1)) if($cov && !$cov_ref);
|
|
3039 next if(!$cov && !$cov_ref);
|
|
3040
|
|
3041 print $fh join("\t",@ratio)."\n";
|
|
3042 }
|
|
3043 }
|
|
3044
|
|
3045 $fh->close;
|
|
3046 print LOG "-- output created: $density_file\n";
|
|
3047
|
|
3048 undef %density;
|
|
3049 undef %density_ref;
|
|
3050 }
|
|
3051 #------------------------------------------------------------------------------#
|
|
3052 #------------------------------------------------------------------------------#
|
|
3053 sub FreqCalculation{
|
|
3054
|
|
3055 my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_;
|
|
3056
|
|
3057 my @sfile=split(/\./,$mates_file);
|
|
3058 my $fchr=$sfile[$#sfile];
|
|
3059 my $fh = new FileHandle;
|
|
3060
|
|
3061 my $nb_windows=0;
|
|
3062 my $warn=100000;
|
|
3063 my $record=0;
|
|
3064 my %pair;
|
|
3065
|
|
3066 my ($sumX,$sumX2) = (0,0);
|
|
3067
|
|
3068 print LOG "\# Frequency calculation for $mates_file...\n";
|
|
3069
|
|
3070 if ($mates_file =~ /.gz$/) {
|
|
3071 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
|
|
3072 }elsif($mates_file =~ /.bam$/){
|
|
3073 o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
|
|
3074 }else{
|
|
3075 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
|
|
3076 }
|
|
3077
|
|
3078 while(<$fh>){
|
|
3079
|
|
3080 my @t=split;
|
|
3081 my $mate=$t[0];
|
|
3082
|
|
3083 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,);
|
|
3084
|
|
3085 next if(exists $pair{$mate});
|
|
3086
|
|
3087 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
|
|
3088
|
|
3089 next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2});
|
|
3090 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
|
|
3091
|
|
3092 $pair{$mate}=undef;
|
|
3093 $record++;
|
|
3094
|
|
3095 my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2);
|
|
3096
|
|
3097 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
|
|
3098 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
|
|
3099
|
|
3100 my $length = abs($coord_start_read1-$coord_start_read2);
|
|
3101 $sumX += $length; #add to sum and sum^2 for mean and variance calculation
|
|
3102 $sumX2 += $length*$length;
|
|
3103
|
|
3104 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){
|
|
3105
|
|
3106 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
|
|
3107
|
|
3108 &addToDensity($density,$chr_read1,$i,\$nb_windows)
|
|
3109 if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1]));
|
|
3110
|
|
3111 }else{
|
|
3112
|
|
3113 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
|
|
3114 }
|
|
3115 }
|
|
3116
|
|
3117 if($record>=$warn){
|
|
3118 print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n";
|
|
3119 $warn+=100000;
|
|
3120 }
|
|
3121 }
|
|
3122 $fh->close;
|
|
3123
|
|
3124 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n";
|
|
3125
|
|
3126 if($record>0){
|
|
3127
|
|
3128 my $mu = $sumX/$record;
|
|
3129 my $sigma = sqrt($sumX2/$record - $mu*$mu);
|
|
3130 print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n";
|
|
3131 }
|
|
3132
|
|
3133 }
|
|
3134 #------------------------------------------------------------------------------#
|
|
3135 #------------------------------------------------------------------------------#
|
|
3136 sub ratio2segdup{
|
|
3137
|
|
3138 my($id,$density_file,$segdup_file)=@_;
|
|
3139
|
|
3140 print LOG "# Converting to circos format...\n";
|
|
3141
|
|
3142 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
|
|
3143 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
|
|
3144
|
|
3145 while(<RATIO>){
|
|
3146 chomp;
|
|
3147 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
|
|
3148 print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n";
|
|
3149 }
|
|
3150
|
|
3151 close RATIO;
|
|
3152 close SEGDUP;
|
|
3153 print LOG "-- output created: $segdup_file\n";
|
|
3154 }
|
|
3155 #------------------------------------------------------------------------------#
|
|
3156 #------------------------------------------------------------------------------#
|
|
3157 sub ratio2bedfile{
|
|
3158
|
|
3159 my($density_file,$bed_file)=@_;
|
|
3160
|
|
3161 print LOG "# Converting to bedGraph format...\n";
|
|
3162
|
|
3163 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
|
|
3164 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
|
|
3165 print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ".
|
|
3166 "visibility=2 color=255,0,0 alwaysZero=\"On\"\n";
|
|
3167
|
|
3168 while(<RATIO>){
|
|
3169 chomp;
|
|
3170 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
|
|
3171 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/);
|
|
3172 print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n";
|
|
3173 }
|
|
3174
|
|
3175 close RATIO;
|
|
3176 close BED;
|
|
3177 print LOG "-- output created: $bed_file\n";
|
|
3178 }
|
|
3179 #------------------------------------------------------------------------------#
|
|
3180 #------------------------------------------------------------------------------#
|
|
3181 sub inverseSense{
|
|
3182
|
|
3183 my $mate_sense=$_[0];
|
|
3184 my %reverse=( 'F' => 'R' , 'R' => 'F' ,
|
|
3185 'FF' => 'RR', 'RR' => 'FF',
|
|
3186 'FR' => 'RF', 'RF' => 'FR');
|
|
3187 return $reverse{$mate_sense};
|
|
3188 }
|
|
3189
|
|
3190 #------------------------------------------------------------------------------#
|
|
3191 #------------------------------------------------------------------------------#
|
|
3192 sub getNextFrag{
|
|
3193
|
|
3194 my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_;
|
|
3195
|
|
3196 my $how_far = $read_start-$frag_start;
|
|
3197 my $nb_windows_toskip;
|
|
3198
|
|
3199 if($how_far>0){
|
|
3200
|
|
3201 $nb_windows_toskip=($how_far/$step)-($window_dist/$step);
|
|
3202 $nb_windows_toskip=~ s/\..*//;
|
|
3203 $nb_windows_toskip=0 if($nb_windows_toskip<0);
|
|
3204 return ($frag_num + $nb_windows_toskip);
|
|
3205 }
|
|
3206 return $frag_last;
|
|
3207 }
|
|
3208 #------------------------------------------------------------------------------#
|
|
3209 #------------------------------------------------------------------------------#
|
|
3210 sub getColor{
|
|
3211
|
|
3212 my($count,$hcolor,$format)=@_;
|
|
3213 for my $col ( keys % { $hcolor} ) {
|
|
3214 return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]);
|
|
3215 }
|
|
3216 return "white" if($format eq "circos");
|
|
3217 return "255,255,255" if($format eq "bed");
|
|
3218 }
|
|
3219 #------------------------------------------------------------------------------#
|
|
3220 #------------------------------------------------------------------------------#
|
|
3221 sub recupCoords{
|
|
3222
|
|
3223 my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_;
|
|
3224 my $strand = 'F';
|
|
3225
|
|
3226 if ($c_hit=~s/^\-//) {
|
|
3227 $strand='R';
|
|
3228 $$cs_hit=$c_hit;
|
|
3229 $$ce_hit=$c_hit-($tag_length-1);
|
|
3230 }else{
|
|
3231 $$cs_hit=$c_hit;
|
|
3232 $$ce_hit=$c_hit+($tag_length-1);
|
|
3233 }
|
|
3234 return $strand;
|
|
3235
|
|
3236 }
|
|
3237 #------------------------------------------------------------------------------#
|
|
3238 #------------------------------------------------------------------------------#
|
|
3239 sub overlap {
|
|
3240 my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_;
|
|
3241 if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) {
|
|
3242 return 0;
|
|
3243 }
|
|
3244 return 1;
|
|
3245 }
|
|
3246 #------------------------------------------------------------------------------#
|
|
3247 #------------------------------------------------------------------------------#
|
|
3248 sub makeLink {
|
|
3249
|
|
3250 my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_;
|
|
3251
|
|
3252 if($chr1>$chr2){
|
|
3253 ($chr1,$chr2)= ($chr2,$chr1);
|
|
3254 ($frag1,$frag2)= ($frag2,$frag1);
|
|
3255 }
|
|
3256
|
|
3257 if($chr1 == $chr2){
|
|
3258 if($frag1>$frag2){
|
|
3259 ($frag1,$frag2)= ($frag2,$frag1);
|
|
3260 }
|
|
3261 }
|
|
3262
|
|
3263 if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){
|
|
3264 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt;
|
|
3265 $$nb++;
|
|
3266 }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){
|
|
3267 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt";
|
|
3268 }
|
|
3269 }
|
|
3270 #------------------------------------------------------------------------------#
|
|
3271 #------------------------------------------------------------------------------#
|
|
3272 #fonction of adding the read to the density profile
|
|
3273 sub addToDensity {
|
|
3274
|
|
3275 my ($density,$chr1,$frag1,$nb)=@_;
|
|
3276
|
|
3277 if(!exists $density->{$chr1}->{$frag1}->{count}){
|
|
3278 $density->{$chr1}->{$frag1}->{count}=1;
|
|
3279 $$nb++;
|
|
3280 }else{
|
|
3281 $density->{$chr1}->{$frag1}->{count}++;
|
|
3282 }
|
|
3283 }
|
|
3284 #------------------------------------------------------------------------------#
|
|
3285 #------------------------------------------------------------------------------#
|
|
3286 sub floor {
|
|
3287 my $nb = $_[0];
|
|
3288 $nb=~ s/\..*//;
|
|
3289 return $nb;
|
|
3290 }
|
|
3291 #------------------------------------------------------------------------------#
|
|
3292 sub decimal{
|
|
3293
|
|
3294 my $num=shift;
|
|
3295 my $digs_to_cut=shift;
|
|
3296
|
|
3297 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
|
|
3298
|
|
3299 return $num;
|
|
3300 }
|
|
3301
|
|
3302 #------------------------------------------------------------------------------#
|
|
3303 sub max {
|
|
3304
|
|
3305 my($max) = shift(@_);
|
|
3306 foreach my $temp (@_) {
|
|
3307 $max = $temp if $temp > $max;
|
|
3308 }
|
|
3309 return($max);
|
|
3310 }
|
|
3311 #------------------------------------------------------------------------------#
|
|
3312 #------------------------------------------------------------------------------#
|
|
3313 sub min {
|
|
3314
|
|
3315 my($min) = shift(@_);
|
|
3316 foreach my $temp (@_) {
|
|
3317 $min = $temp if $temp < $min;
|
|
3318 }
|
|
3319 return($min);
|
|
3320 }
|
|
3321 #------------------------------------------------------------------------------#
|
|
3322 #------------------------------------------------------------------------------#
|
|
3323 sub sortTablebyIndex{
|
|
3324 my ($tab1,$tab2)=@_;
|
|
3325 my @tab3;
|
|
3326
|
|
3327 foreach my $i (@$tab1){
|
|
3328 $tab3[$i]=$$tab2[$$tab1[$i]];
|
|
3329 }
|
|
3330 return @tab3;
|
|
3331 }
|
|
3332 #------------------------------------------------------------------------------#
|
|
3333 #------------------------------------------------------------------------------#
|
|
3334 sub round {
|
|
3335 my $number = shift || 0;
|
|
3336 my $dec = 10 ** (shift || 0);
|
|
3337 return int( $dec * $number + .5 * ($number <=> 0)) / $dec;
|
|
3338 }
|
|
3339 #------------------------------------------------------------------------------#
|
|
3340 #------------------------------------------------------------------------------#
|
|
3341 sub getUniqueTable{
|
|
3342
|
|
3343 my (@tab)=@_;
|
|
3344 my (%saw,@out)=();
|
|
3345 undef %saw;
|
|
3346 return sort(grep(!$saw{$_}++, @tab));
|
|
3347 }
|
|
3348 #------------------------------------------------------------------------------#
|
|
3349 #------------------------------------------------------------------------------#
|
|
3350 sub catFiles {
|
|
3351
|
|
3352 unlink("$_[1]") if(exists $_[1]);
|
|
3353 system qq( cat "$_" >> "$_[1]" ) for @{$_[0]};
|
|
3354 }
|
|
3355 #------------------------------------------------------------------------------#
|
|
3356 #------------------------------------------------------------------------------#
|
|
3357 #check if the configuration file is correct
|
|
3358 sub validateconfiguration{
|
|
3359
|
|
3360 my %conf=%{$_[0]};
|
|
3361 my $list_prgs="@ARGV";
|
|
3362
|
|
3363 my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file);
|
|
3364 my @detection_params=qw(split_mate_file window_size step_length split_mate_file);
|
|
3365 my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file);
|
|
3366 my @circos_params=qw(organism_id colorcode);
|
|
3367 my @bed_params=qw(colorcode);
|
|
3368 my @compare_params=qw(list_samples file_suffix);
|
|
3369
|
|
3370 foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){
|
|
3371
|
|
3372 unless (defined($dir)) {
|
|
3373 $dir = ".";
|
|
3374 }
|
|
3375 unless (-d $dir){
|
|
3376 mkdir $dir or die;
|
|
3377 }
|
|
3378 $dir.="/" if($dir!~/\/$/);
|
|
3379 }
|
|
3380
|
|
3381 unless (defined($conf{general}{num_threads})) {
|
|
3382 $conf{general}{num_threads} = 1;
|
|
3383 }
|
|
3384 $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24);
|
|
3385
|
|
3386 if($list_prgs!~/links2compare/){
|
|
3387
|
|
3388 foreach my $p (@general_params){
|
|
3389 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p});
|
|
3390 }
|
|
3391
|
|
3392 $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam");
|
|
3393
|
|
3394 unless (defined($conf{general}{sv_type})) {
|
|
3395 $conf{general}{sv_type} = "all";
|
|
3396 }
|
|
3397
|
|
3398 $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}};
|
|
3399 }
|
|
3400
|
|
3401 if($list_prgs=~/(linking|cnv)/){
|
|
3402
|
|
3403 foreach my $p (@detection_params){
|
|
3404 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p});
|
|
3405 }
|
|
3406
|
|
3407 die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref});
|
|
3408
|
|
3409 if($conf{detection}{step_length}>$conf{detection}{window_size}){
|
|
3410 die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n");
|
|
3411 }
|
|
3412
|
|
3413 unless (-d $conf{general}{tmp_dir}."/mates"){
|
|
3414 mkdir $conf{general}{tmp_dir}."/mates" or die;
|
|
3415 }
|
|
3416
|
|
3417 if($list_prgs=~/linking/){
|
|
3418 unless (-d $conf{general}{tmp_dir}."/links"){
|
|
3419 mkdir $conf{general}{tmp_dir}."/links" or die;
|
|
3420 }
|
|
3421 }
|
|
3422 if($list_prgs=~/cnv/){
|
|
3423 unless (-d $conf{general}{tmp_dir}."/density"){
|
|
3424 mkdir $conf{general}{tmp_dir}."/density" or die;
|
|
3425 }
|
|
3426 }
|
|
3427
|
|
3428 }
|
|
3429
|
|
3430 if($list_prgs=~/filtering/){
|
|
3431
|
|
3432 foreach my $p (@filtering_params) {
|
|
3433 die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p});
|
|
3434
|
|
3435 }
|
|
3436
|
|
3437 if(defined($conf{filtering}{chromosomes})) {
|
|
3438 my @chrs=split(",",$conf{filtering}{chromosomes});
|
|
3439 my $exclude=($chrs[0]=~/^\-/)? 1:0;
|
|
3440 for my $chrName (@chrs){
|
|
3441
|
|
3442 die("Error Config : The filtering parameter \"chromosomes\" is not valid\n")
|
|
3443 if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude));
|
|
3444
|
|
3445 }
|
|
3446 }
|
|
3447
|
|
3448 if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) {
|
|
3449 die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected".
|
|
3450 "\nChange strand_filtering to \"1\" if you want to use the order filtering\n");
|
|
3451 }
|
|
3452 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) {
|
|
3453 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n");
|
|
3454 }
|
|
3455 if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) {
|
|
3456 die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected".
|
|
3457 "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n");
|
|
3458 }
|
|
3459 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) {
|
|
3460 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n");
|
|
3461 }
|
|
3462
|
|
3463 if (!defined($conf{filtering}{indel_sigma_threshold})) {
|
|
3464 $conf{filtering}{indel_sigma_threshold} = 2;
|
|
3465 }
|
|
3466 if (!defined($conf{filtering}{dup_sigma_threshold})) {
|
|
3467 $conf{filtering}{dup_sigma_threshold} = 2;
|
|
3468 }
|
|
3469 if (!defined($conf{filtering}{singleton_sigma_threshold})) {
|
|
3470 $conf{filtering}{singleton_sigma_threshold} = 4;
|
|
3471 }
|
|
3472
|
|
3473 if (!defined($conf{filtering}{nb_pairs_order_threshold})) {
|
|
3474 $conf{filtering}{nb_pairs_order_threshold} = 1;
|
|
3475 }
|
|
3476
|
|
3477 if (!defined($conf{filtering}{final_score_threshold})) {
|
|
3478 $conf{filtering}{final_score_threshold} = 0.8;
|
|
3479 }
|
|
3480
|
|
3481 if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) {
|
|
3482 die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n");
|
|
3483 }
|
|
3484
|
|
3485 }
|
|
3486
|
|
3487 if($list_prgs=~/2circos$/){
|
|
3488 foreach my $p (@circos_params) {
|
|
3489 next if($list_prgs=~/^ratio/ && $p eq "colorcode");
|
|
3490 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
|
|
3491 }
|
|
3492 }
|
|
3493
|
|
3494 if($list_prgs=~/2bed$/){
|
|
3495 foreach my $p (@bed_params) {
|
|
3496 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
|
|
3497 }
|
|
3498 }
|
|
3499
|
|
3500 if($list_prgs=~/links2compare/){
|
|
3501 foreach my $p (@compare_params) {
|
|
3502 die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p});
|
|
3503 }
|
|
3504
|
|
3505 unless (defined($conf{compare}{same_sv_type})) {
|
|
3506 $conf{compare}{same_sv_type} = 0;
|
|
3507 }
|
|
3508
|
|
3509 unless (defined($conf{compare}{min_overlap})) {
|
|
3510 $conf{compare}{min_overlap} = 1E-9;
|
|
3511 }
|
|
3512
|
|
3513 if($conf{compare}{circos_output}){
|
|
3514 foreach my $p (@circos_params) {
|
|
3515 next if($list_prgs=~/^ratio/ && $p eq "colorcode");
|
|
3516 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
|
|
3517 }
|
|
3518 }
|
|
3519 if($conf{compare}{bed_output}){
|
|
3520 foreach my $p (@bed_params) {
|
|
3521 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
|
|
3522 }
|
|
3523 die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths});
|
|
3524
|
|
3525 my @samples=split(",",$conf{compare}{list_samples});
|
|
3526 my @read_lengths=split(",",$conf{compare}{list_read_lengths});
|
|
3527 for my $i (0..$#samples){
|
|
3528 my @l=split("-",$read_lengths[$i]);
|
|
3529 $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]};
|
|
3530 }
|
|
3531 }
|
|
3532 }
|
|
3533
|
|
3534
|
|
3535 }
|
|
3536 #------------------------------------------------------------------------------#
|
|
3537 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
|