comparison bismark_deduplicate/deduplicate_bismark @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6
7 ### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
8 ### Paired-end alignments are considered a duplicate if both partner reasd start and end at the exact same position
9
10 ### May 13, 2013
11 ### Changed the single-end trimming behavior so that only the start coordinate will be used. This avoids duplicate reads that have been trimmed to a varying extent
12 ### Changed the way of determining the end of reads in SAM format to using the CIGAR string if the read contains InDels
13
14 ### 16 July 2013
15 ### Adding a new deduplication mode for barcoded RRBS-Seq
16
17 ### 27 Sept 2013
18 ### Added close statement for all output filehandles (which should probably have been there from the start...)
19
20 ### 8 Jan 2015
21 ### to detect paired-end command from the @PG line we are no requiring spaces before and after the -1 or -2
22
23 ### 09 Mar 2015
24 ### Removing newline characters also from the read conversion flag in case the tags had been reordered and are now present in the very last column
25
26 ### 19 08 2015
27 ### Hiding the option --representative from view to discourage people from using it (it was nearly always not what they wanted to do anyway). It should still work
28 ### for alignments that do not contain any indels
29 ### Just for reference, here is the the text:
30 ### print "--representative\twill browse through all sequences and print out the sequence with the most representative (as in most frequent) methylation call for any given position. Note that this is very likely the most highly amplified PCR product for a given sequence\n\n";
31
32
33 my $dedup_version = 'v0.16.3';
34
35 my $help;
36 my $representative;
37 my $single;
38 my $paired;
39 my $global_single;
40 my $global_paired;
41 my $vanilla;
42 my $samtools_path;
43 my $bam;
44 my $rrbs;
45 my $version;
46
47 my $command_line = GetOptions ('help' => \$help,
48 'representative' => \$representative,
49 's|single' => \$global_single,
50 'p|paired' => \$global_paired,
51 'vanilla' => \$vanilla,
52 'samtools_path=s' => \$samtools_path,
53 'bam' => \$bam,
54 'barcode' => \$rrbs,
55 'version' => \$version,
56 );
57
58 die "Please respecify command line options\n\n" unless ($command_line);
59
60 if ($help){
61 print_helpfile();
62 exit;
63 }
64
65 if ($version){
66 print << "VERSION";
67
68 Bismark Deduplication Module
69
70 Deduplicator Version: $dedup_version
71 Copyright 2010-16 Felix Krueger, Babraham Bioinformatics
72 www.bioinformatics.babraham.ac.uk/projects/bismark/
73
74
75 VERSION
76 exit;
77 }
78
79
80
81 my @filenames = @ARGV;
82
83 unless (@filenames){
84 print "Please provide one or more Bismark output files for deduplication\n\n";
85 sleep (2);
86 print_helpfile();
87 exit;
88 }
89
90
91 ### OPTIONS
92 unless ($global_single or $global_paired){
93 if ($vanilla){
94 die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n";
95 }
96 warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n";
97 }
98
99 if ($global_paired){
100 if ($global_single){
101 die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n";
102 }
103 if ($vanilla){
104
105 if ($rrbs){
106 die "Barcode deduplication only works with Bismark SAM (or BAM) output (in attempt to phase out the vanilla format)\n";
107 }
108
109 warn "Processing paired-end custom Bismark output file(s):\n";
110 warn join ("\t",@filenames),"\n\n";
111 }
112 else{
113 warn "Processing paired-end Bismark output file(s) (SAM format):\n";
114 warn join ("\t",@filenames),"\n\n";
115 }
116 }
117 else{
118 if ($vanilla){
119 warn "Processing single-end custom Bismark output file(s):\n";
120 warn join ("\t",@filenames),"\n\n";
121 }
122 else{
123 warn "Processing single-end Bismark output file(s) (SAM format):\n";
124 warn join ("\t",@filenames),"\n\n";
125 }
126 }
127
128 ### PATH TO SAMTOOLS
129 if (defined $samtools_path){
130 # if Samtools was specified as full command
131 if ($samtools_path =~ /samtools$/){
132 if (-e $samtools_path){
133 # Samtools executable found
134 }
135 else{
136 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
137 }
138 }
139 else{
140 unless ($samtools_path =~ /\/$/){
141 $samtools_path =~ s/$/\//;
142 }
143 $samtools_path .= 'samtools';
144 if (-e $samtools_path){
145 # Samtools executable found
146 }
147 else{
148 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
149 }
150 }
151 }
152 # Check whether Samtools is in the PATH if no path was supplied by the user
153 else{
154 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
155 $samtools_path = `which samtools`;
156 chomp $samtools_path;
157 }
158 }
159
160 if ($bam){
161 if (defined $samtools_path){
162 $bam = 1;
163 }
164 else{
165 warn "No Samtools found on your system, writing out a gzipped SAM file instead\n";
166 $bam = 2;
167 }
168 }
169 else{
170 $bam = 0;
171 }
172
173
174 if ($representative){
175 warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n";
176 sleep (1);
177 }
178 elsif($rrbs){
179 warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n";
180 sleep (1);
181 }
182 else{ # default; random (=first) alignment
183 warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n";
184 sleep (1);
185 }
186
187 foreach my $file (@filenames){
188
189 if ($global_single){
190 $paired = 0;
191 $single = 1;
192 }
193 elsif($global_paired){
194 $paired = 1;
195 $single = 0;
196 }
197
198 unless($global_single or $global_paired){
199
200 warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1);
201
202 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
203 if ($file =~ /\.gz$/){
204 open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
205 }
206 elsif ($file =~ /\.bam$/){
207 open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
208 }
209 else{
210 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
211 }
212 while (<DETERMINE>){
213 last unless (/^\@/);
214 if ($_ =~ /^\@PG/){
215 # warn "found the \@PG line:\n";
216 # warn "$_";
217
218 if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
219 warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1);
220 $paired = 1;
221 $single = 0;
222 }
223 else{
224 warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1);
225 $paired = 0;
226 $single = 1;
227 }
228 }
229 }
230 close DETERMINE or warn "$!\n";
231 }
232
233 if ($file =~ /(\.bam$|\.sam$)/){
234 bam_isEmpty($file);
235 }
236
237
238 ### OPTIONS
239 unless ($single or $paired){
240 die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n";
241 }
242
243 ###
244 unless ($vanilla){
245 if ($paired){
246 test_positional_sorting($file);
247 }
248 }
249
250
251 ### writing to a report file
252 my $report = $file;
253
254 $report =~ s/\.gz$//;
255 $report =~ s/\.sam$//;
256 $report =~ s/\.bam$//;
257 $report =~ s/\.txt$//;
258 $report =~ s/$/.deduplication_report.txt/;
259
260 open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n";
261
262
263 ### for representative methylation calls we need to discriminate between single-end and paired-end files as the latter have 2 methylation call strings
264 if($representative){
265 deduplicate_representative($file);
266 }
267
268 elsif($rrbs){
269 deduplicate_barcoded_rrbs($file);
270 }
271
272 ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option
273 else{
274
275 my %unique_seqs;
276 my %positions;
277
278 if ($file =~ /\.gz$/){
279 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
280 }
281 elsif ($file =~ /\.bam$/){
282 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
283 }
284 else{
285 open (IN,$file) or die "Unable to read from $file: $!\n";
286 }
287
288 my $outfile = $file;
289 $outfile =~ s/\.gz$//;
290 $outfile =~ s/\.sam$//;
291 $outfile =~ s/\.bam$//;
292 $outfile =~ s/\.txt$//;
293
294 if ($vanilla){
295 $outfile =~ s/$/_deduplicated.txt/;
296 }
297 else{
298 if ($bam == 1){
299 $outfile =~ s/$/.deduplicated.bam/;
300 }
301 elsif ($bam == 2){
302 $outfile =~ s/$/.deduplicated.sam.gz/;
303 }
304 else{
305 $outfile =~ s/$/.deduplicated.sam/;
306 }
307 }
308 if ($bam == 1){
309 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
310 }
311 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
312 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
313 }
314
315 else{
316 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
317 }
318
319 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
320 if ($vanilla){
321 $_ = <IN>; # Bismark version header
322 print OUT; # Printing the Bismark version to the de-duplicated file again
323 }
324 my $count = 0;
325 my $unique_seqs = 0;
326 my $removed = 0;
327
328 while (<IN>){
329
330 if ($count == 0){
331 if ($_ =~ /^Bismark version:/){
332 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
333 sleep (2);
334 print_helpfile();
335 exit;
336 }
337 }
338
339 ### if this was a SAM file we ignore header lines
340 unless ($vanilla){
341 if (/^\@\w{2}\t/){
342 print "skipping header line:\t$_";
343 print OUT "$_"; # Printing the header lines again into the de-duplicated file
344 next;
345 }
346 }
347
348 ++$count;
349 my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
350
351 my ($strand,$chr,$start,$end,$cigar);
352 my $line1;
353
354 if ($vanilla){
355 ($strand,$chr,$start,$end) = (split (/\t/))[1,2,3,4];
356 }
357 else{ # SAM format
358 ($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand
359
360 ### SAM single-end
361 if ($single){
362
363 if ($strand == 0 ){
364 ### read aligned to the forward strand. No action needed
365 }
366 elsif ($strand == 16){
367 ### read is on reverse strand
368
369 $start -= 1; # only need to adjust this once
370
371 # for InDel free matches we can simply use the M number in the CIGAR string
372 if ($cigar =~ /^(\d+)M$/){ # linear match
373 $start += $1;
374 }
375
376 else{
377 # parsing CIGAR string
378 my @len = split (/\D+/,$cigar); # storing the length per operation
379 my @ops = split (/\d+/,$cigar); # storing the operation
380 shift @ops; # remove the empty first element
381 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
382
383 # warn "CIGAR string; $cigar\n";
384 ### determining end position of a read
385 foreach my $index(0..$#len){
386 if ($ops[$index] eq 'M'){ # standard matching bases
387 $start += $len[$index];
388 # warn "Operation is 'M', adding $len[$index] bp\n";
389 }
390 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
391 # warn "Operation is 'I', next\n";
392 }
393 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
394 # warn "Operation is 'D',adding $len[$index] bp\n";
395 $start += $len[$index];
396 }
397 else{
398 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
399 }
400 }
401 }
402 }
403 $composite = join (":",$strand,$chr,$start);
404 }
405 elsif($paired){
406
407 ### storing the current line
408 $line1 = $_;
409
410 my $read_conversion;
411 my $genome_conversion;
412
413 while ( /(XR|XG):Z:([^\t]+)/g ) {
414 my $tag = $1;
415 my $value = $2;
416
417 if ($tag eq "XR") {
418 $read_conversion = $value;
419 $read_conversion =~ s/\r//;
420 chomp $read_conversion;
421 } elsif ($tag eq "XG") {
422 $genome_conversion = $value;
423 $genome_conversion =~ s/\r//;
424 chomp $genome_conversion;
425 }
426 }
427 die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);
428
429
430 my $index;
431 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
432 $index = 0;
433 $strand = '+';
434 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
435 $index = 1;
436 $strand = '-';
437 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
438 $index = 2;
439 $strand = '+';
440 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
441 $index = 3;
442 $strand = '-';
443 } else {
444 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
445 }
446
447 # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
448 if ($index == 0 or $index == 2){
449
450 ### reading in the next line
451 $_ = <IN>;
452 # the only thing we need is the end position
453 ($end,my $cigar_2) = (split (/\t/))[3,5];
454
455 $end -= 1; # only need to adjust this once
456
457 # for InDel free matches we can simply use the M number in the CIGAR string
458 if ($cigar_2 =~ /^(\d+)M$/){ # linear match
459 $end += $1;
460 }
461 else{
462 # parsing CIGAR string
463 my @len = split (/\D+/,$cigar_2); # storing the length per operation
464 my @ops = split (/\d+/,$cigar_2); # storing the operation
465 shift @ops; # remove the empty first element
466 die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
467
468 # warn "CIGAR string; $cigar_2\n";
469 ### determining end position of the read
470 foreach my $index(0..$#len){
471 if ($ops[$index] eq 'M'){ # standard matching bases
472 $end += $len[$index];
473 # warn "Operation is 'M', adding $len[$index] bp\n";
474 }
475 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
476 # warn "Operation is 'I', next\n";
477 }
478 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
479 # warn "Operation is 'D',adding $len[$index] bp\n";
480 $end += $len[$index];
481 }
482 else{
483 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
484 }
485 }
486 }
487 }
488 else{
489 # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line
490
491 $end = $start - 1; # need to adjust this only once
492
493 # for InDel free matches we can simply use the M number in the CIGAR string
494 if ($cigar =~ /^(\d+)M$/){ # linear match
495 $end += $1;
496 }
497 else{
498 # parsing CIGAR string
499 my @len = split (/\D+/,$cigar); # storing the length per operation
500 my @ops = split (/\d+/,$cigar); # storing the operation
501 shift @ops; # remove the empty first element
502 die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
503
504 # warn "CIGAR string; $cigar\n";
505 ### determining end position of the read
506 foreach my $index(0..$#len){
507 if ($ops[$index] eq 'M'){ # standard matching bases
508 $end += $len[$index];
509 # warn "Operation is 'M', adding $len[$index] bp\n";
510 }
511 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
512 # warn "Operation is 'I', next\n";
513 }
514 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
515 # warn "Operation is 'D',adding $len[$index] bp\n";
516 $end += $len[$index];
517 }
518 else{
519 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
520 }
521 }
522 }
523
524 ### reading in the next line
525 $_ = <IN>;
526 # the only thing we need is the start position
527 ($start) = (split (/\t/))[3];
528 }
529 $composite = join (":",$strand,$chr,$start,$end);
530 }
531
532 else{
533 die "Input must be single or paired-end\n";
534 }
535 }
536
537 if (exists $unique_seqs{$composite}){
538 ++$removed;
539 unless (exists $positions{$composite}){
540 $positions{$composite}++;
541 }
542 }
543 else{
544 if ($paired){
545 unless ($vanilla){
546 print OUT "$line1"; # printing first paired-end line for SAM output
547 }
548 }
549 print OUT "$_"; # printing single-end SAM alignment or second paired-end line
550 $unique_seqs{$composite}++;
551 }
552 }
553
554 my $percentage;
555 my $percentage_leftover;
556 my $leftover = $count - $removed;
557
558 unless ($count == 0){
559 $percentage = sprintf("%.2f",$removed/$count*100);
560 $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
561 }
562 else{
563 $percentage = 'N/A';
564 $percentage_leftover = 'N/A';
565 }
566
567 warn "\nTotal number of alignments analysed in $file:\t$count\n";
568 warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
569 warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
570 warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
571
572 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
573 print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
574 print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
575 print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
576 }
577
578 close OUT or warn "Failed to close output filehandle: $!\n";
579 close REPORT or warn "Failed to close report filehandle: $!\n";
580
581 }
582
583
584 sub deduplicate_barcoded_rrbs{
585
586 my $file = shift;
587
588 my %unique_seqs;
589 my %positions;
590
591 if ($file =~ /\.gz$/){
592 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
593 }
594 elsif ($file =~ /\.bam$/){
595 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
596 }
597 else{
598 open (IN,$file) or die "Unable to read from $file: $!\n";
599 }
600
601 my $outfile = $file;
602 $outfile =~ s/\.gz$//;
603 $outfile =~ s/\.sam$//;
604 $outfile =~ s/\.bam$//;
605 $outfile =~ s/\.txt$//;
606
607 if ($vanilla){
608 $outfile =~ s/$/_dedup_RRBS.txt/;
609 }
610 else{
611 if ($bam == 1){
612 $outfile =~ s/$/.dedup_RRBS.bam/;
613 }
614 elsif ($bam == 2){
615 $outfile =~ s/$/.dedupRRBS.sam.gz/;
616 }
617 else{
618 $outfile =~ s/$/.dedup_RRBS.sam/;
619 }
620 }
621 if ($bam == 1){
622 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
623 }
624 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
625 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
626 }
627 else{
628 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
629 }
630
631 ### This mode only supports Bismark SAM output
632 my $count = 0;
633 my $unique_seqs = 0;
634 my $removed = 0;
635
636 while (<IN>){
637
638 if ($count == 0){
639 if ($_ =~ /^Bismark version:/){
640 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
641 sleep (2);
642 print_helpfile();
643 exit;
644 }
645 }
646
647 ### if this was a SAM file we ignore header lines
648 if (/^\@\w{2}\t/){
649 warn "skipping SAM header line:\t$_";
650 print OUT; # Printing the header lines again into the de-duplicated file
651 next;
652 }
653
654 ++$count;
655 my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
656 ### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication
657 ### in effect the $composite string looks like this (separated by ':'):
658
659 ### FLAG:chromosome:start:barcode
660
661 my $end;
662 my $line1;
663
664 # SAM format
665 my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand
666
667 $id =~ /:(\w+)$/;
668 my $barcode = $1;
669 unless ($barcode){
670 die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n";
671 }
672
673 ### SAM single-end
674 if ($single){
675
676 if ($strand == 0 ){
677 ### read aligned to the forward strand. No action needed
678 }
679 elsif ($strand == 16){
680 ### read is on reverse strand
681
682 $start -= 1; # only need to adjust this once
683
684 # for InDel free matches we can simply use the M number in the CIGAR string
685 if ($cigar =~ /^(\d+)M$/){ # linear match
686 $start += $1;
687 }
688 else{
689 # parsing CIGAR string
690 my @len = split (/\D+/,$cigar); # storing the length per operation
691 my @ops = split (/\d+/,$cigar); # storing the operation
692 shift @ops; # remove the empty first element
693 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
694
695 # warn "CIGAR string; $cigar\n";
696 ### determining end position of a read
697 foreach my $index(0..$#len){
698 if ($ops[$index] eq 'M'){ # standard matching bases
699 $start += $len[$index];
700 # warn "Operation is 'M', adding $len[$index] bp\n";
701 }
702 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
703 # warn "Operation is 'I', next\n";
704 }
705 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
706 # warn "Operation is 'D',adding $len[$index] bp\n";
707 $start += $len[$index];
708 }
709 else{
710 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
711 }
712 }
713 }
714 }
715
716 ### Here we take the barcode sequence into consideration
717 $composite = join (":",$strand,$chr,$start,$barcode);
718 # warn "$composite\n\n";
719 # sleep(1);
720 }
721 elsif($paired){
722
723 ### storing the current line
724 $line1 = $_;
725
726 my $read_conversion;
727 my $genome_conversion;
728
729 while ( /(XR|XG):Z:([^\t]+)/g ) {
730 my $tag = $1;
731 my $value = $2;
732
733 if ($tag eq "XR") {
734 $read_conversion = $value;
735 $read_conversion =~ s/\r//;
736 chomp $read_conversion;
737 }
738 elsif ($tag eq "XG") {
739 $genome_conversion = $value;
740 $genome_conversion =~ s/\r//;
741 chomp $genome_conversion;
742 }
743 }
744 die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);
745
746
747 my $index;
748 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
749 $index = 0;
750 $strand = '+';
751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
752 $index = 1;
753 $strand = '-';
754 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
755 $index = 2;
756 $strand = '+';
757 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
758 $index = 3;
759 $strand = '-';
760 } else {
761 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
762 }
763
764 # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
765 if ($index == 0 or $index == 2){
766
767 ### reading in the next line
768 $_ = <IN>;
769 # the only thing we need is the end position
770 ($end,my $cigar_2) = (split (/\t/))[3,5];
771
772 $end -= 1; # only need to adjust this once
773
774 # for InDel free matches we can simply use the M number in the CIGAR string
775 if ($cigar_2 =~ /^(\d+)M$/){ # linear match
776 $end += $1;
777 }
778 else{
779 # parsing CIGAR string
780 my @len = split (/\D+/,$cigar_2); # storing the length per operation
781 my @ops = split (/\d+/,$cigar_2); # storing the operation
782 shift @ops; # remove the empty first element
783 die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
784
785 # warn "CIGAR string; $cigar_2\n";
786 ### determining end position of the read
787 foreach my $index(0..$#len){
788 if ($ops[$index] eq 'M'){ # standard matching bases
789 $end += $len[$index];
790 # warn "Operation is 'M', adding $len[$index] bp\n";
791 }
792 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
793 # warn "Operation is 'I', next\n";
794 }
795 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
796 # warn "Operation is 'D',adding $len[$index] bp\n";
797 $end += $len[$index];
798 }
799 else{
800 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
801 }
802 }
803 }
804 }
805 else{
806 # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line
807
808 $end = $start - 1; # need to adjust this only once
809
810 # for InDel free matches we can simply use the M number in the CIGAR string
811 if ($cigar =~ /^(\d+)M$/){ # linear match
812 $end += $1;
813 }
814 else{
815 # parsing CIGAR string
816 my @len = split (/\D+/,$cigar); # storing the length per operation
817 my @ops = split (/\d+/,$cigar); # storing the operation
818 shift @ops; # remove the empty first element
819 die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
820
821 # warn "CIGAR string; $cigar\n";
822 ### determining end position of the read
823 foreach my $index(0..$#len){
824 if ($ops[$index] eq 'M'){ # standard matching bases
825 $end += $len[$index];
826 # warn "Operation is 'M', adding $len[$index] bp\n";
827 }
828 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
829 # warn "Operation is 'I', next\n";
830 }
831 elsif($ops[$index] eq 'D'){ # deletions do affect the end position
832 # warn "Operation is 'D',adding $len[$index] bp\n";
833 $end += $len[$index];
834 }
835 else{
836 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
837 }
838 }
839 }
840
841 ### reading in the next line
842 $_ = <IN>;
843 # the only thing we need is the start position
844 ($start) = (split (/\t/))[3];
845 }
846
847 ### Here we take the barcode sequence into consideration
848 $composite = join (":",$strand,$chr,$start,$end,$barcode);
849 }
850 else{
851 die "Input must be single or paired-end\n";
852 }
853
854 if (exists $unique_seqs{$composite}){
855 ++$removed;
856 unless (exists $positions{$composite}){
857 $positions{$composite}++;
858 }
859 }
860 else{
861 if ($paired){
862 print OUT $line1; # printing first paired-end line for SAM output
863 }
864 print OUT; # printing single-end SAM alignment or second paired-end line
865 $unique_seqs{$composite}++;
866 }
867 }
868
869 my $percentage;
870 my $percentage_leftover;
871 my $leftover = $count - $removed;
872
873 unless ($count == 0){
874 $percentage = sprintf("%.2f",$removed/$count*100);
875 $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
876 }
877 else{
878 $percentage = 'N/A';
879 $percentage_leftover = 'N/A';
880 }
881
882
883 warn "\nTotal number of alignments analysed in $file:\t$count\n";
884 warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
885 warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
886 warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
887
888 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
889 print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
890 print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
891 print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
892
893 }
894
895 sub bam_isEmpty{
896
897 my $file = shift;
898
899 if ($file =~ /\.bam$/){
900 open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n";
901 }
902 else{
903 open (EMPTY,$file) or die "Unable to read from $file: $!\n";
904 }
905 my $count = 0;
906 while (<EMPTY>){
907 if ($_){
908 $count++; # file contains data, fine.
909 }
910 last; # one line is enough
911 }
912
913 if ($count == 0){
914 die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n";
915 }
916 close EMPTY or warn "$!\n";
917 }
918
919
920 sub print_helpfile{
921 print "\n",'='x111,"\n";
922 print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(this is the fastest option, and as the alignments are not ordered in any way this is also near enough random).\n\n";
923 print "For single-end alignments only use the start coordinate of a read will be used for deduplication.\n\n";
924 print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication. ";
925 print "This script expects the Bismark output to be in SAM format\n(Bismark v0.6.x or higher). To deduplicate the old custom Bismark output please specify '--vanilla'.\n\n";
926 print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n) ***\n\n";
927
928 print '='x111,"\n\n";
929 print ">>> USAGE: ./deduplicate_bismark_alignment_output.pl [options] filename(s) <<<\n\n";
930
931 print "-s/--single\t\tdeduplicate single-end Bismark files (default format: SAM)\n";
932 print "-p/--paired\t\tdeduplicate paired-end Bismark files (default format: SAM)\n\n";
933 print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n";
934 print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n";
935 print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n";
936 print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n";
937 print "--version\t\tPrint version information and exit\n";
938
939 print '='x111,"\n\n";
940
941 print "This script was last modified on November 04, 2015\n\n";
942 }
943
944
945
946
947 sub test_positional_sorting{
948
949 my $filename = shift;
950
951 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
952 sleep(1);
953
954 if ($filename =~ /\.gz$/) {
955 open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
956 }
957 elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
958 if ($samtools_path){
959 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
960 }
961 else{
962 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
963 }
964 }
965 else {
966 open (TEST,$filename) or die "Can't open file $filename: $!\n";
967 }
968
969 my $count = 0;
970
971 while (<TEST>) {
972 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
973 if (/^\@SO/) {
974 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
975 }
976 next;
977 }
978 $count++;
979
980 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
981
982 my ($id_1) = (split (/\t/));
983
984 ### reading the next line which should be read 2
985 $_ = <TEST>;
986 my ($id_2) = (split (/\t/));
987 last unless ($id_2);
988 ++$count;
989
990 if ($id_1 eq $id_2){
991 ### ids are the same
992 next;
993 }
994 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
995 my $id_1_trunc = $id_1;
996 $id_1_trunc =~ s/\/1$//;
997 my $id_2_trunc = $id_2;
998 $id_2_trunc =~ s/\/2$//;
999
1000 unless ($id_1_trunc eq $id_2_trunc){
1001 die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
1002 }
1003 }
1004 }
1005 # close TEST or die $!; somehow fails on our cluster...
1006 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
1007 warn "...passed!\n";
1008 sleep(1);
1009
1010 }
1011
1012 sub isBam{
1013
1014 my $filename = shift;
1015
1016 # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
1017 open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";
1018
1019 ### when BAM files read through a gunzip -c stream they start with BAM...
1020 my $bam_in_disguise = <DISGUISE>;
1021 # warn "BAM in disguise: $bam_in_disguise\n\n";
1022
1023 if ($bam_in_disguise){
1024 if ($bam_in_disguise =~ /^BAM/){
1025 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
1026 return 1;
1027 }
1028 else{
1029 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
1030 return 0;
1031 }
1032 }
1033 else{
1034 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
1035 return 0;
1036 }
1037 }
1038
1039
1040
1041
1042
1043 #####################################
1044
1045 ### The following subroutine "deduplicate_representative" only works correctly for reads that do not contain any indels (as in only Bowtie1-based alignments).
1046 ### Please refrain from using it unless you want to test something out.
1047
1048 #####################################
1049
1050 sub deduplicate_representative {
1051 my $file = shift;
1052
1053 my %positions;
1054 my %unique_seqs;
1055
1056 my $count = 0;
1057 my $unique_seqs = 0;
1058 my $removed = 0;
1059
1060 ### going through the file first and storing all positions as well as their methylation call strings in a hash
1061 if ($file =~ /\.gz$/){
1062 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
1063 }
1064 elsif ($file =~ /\.bam$/){
1065 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
1066 }
1067 else{
1068 open (IN,$file) or die "Unable to read from $file: $!\n";
1069 }
1070
1071 if ($single){
1072
1073 my $outfile = $file;
1074 $outfile =~ s/\.gz$//;
1075 $outfile =~ s/\.sam$//;
1076 $outfile =~ s/\.bam$//;
1077 $outfile =~ s/\.txt$//;
1078
1079 if ($vanilla){
1080 $outfile =~ s/$/.deduplicated_to_representative_sequences.txt/;
1081 }
1082 else{
1083 if ($bam == 1){
1084 $outfile =~ s/$/.deduplicated_to_representative_sequences.bam/;
1085 }
1086 elsif ($bam == 2){
1087 $outfile =~ s/$/.deduplicated_to_representative_sequences.sam.gz/;
1088 }
1089 else{
1090 $outfile =~ s/$/.deduplicated_to_representative_sequences.sam/;
1091 }
1092 }
1093
1094 if ($bam == 1){
1095 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
1096 }
1097 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
1098 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
1099 }
1100 else{
1101 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
1102 }
1103
1104 warn "Reading and storing all alignment positions\n";
1105
1106 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
1107 if ($vanilla){
1108 $_ = <IN>; # Bismark version header
1109 print OUT; # Printing the Bismark version to the de-duplicated file again
1110 }
1111
1112 while (<IN>){
1113
1114 if ($count == 0){
1115 if ($_ =~ /^Bismark version:/){
1116 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
1117 sleep (2);
1118 print_helpfile();
1119 exit;
1120 }
1121 }
1122
1123 ### if this was a SAM file we ignore header lines
1124 unless ($vanilla){
1125 if (/^\@\w{2}\t/){
1126 warn "skipping SAM header line:\t$_";
1127 print OUT; # Printing the header lines again into the de-duplicated file
1128 next;
1129 }
1130 }
1131
1132 my ($strand,$chr,$start,$end,$meth_call);
1133
1134 if ($vanilla){
1135 ($strand,$chr,$start,$end,$meth_call) = (split (/\t/))[1,2,3,4,7];
1136 }
1137 else{ # SAM format
1138
1139 ($strand,$chr,$start,my $seq,$meth_call) = (split (/\t/))[1,2,3,9,13]; # we are assigning the FLAG value to $strand
1140 ### SAM single-end
1141 $end = $start + length($seq) - 1;
1142 }
1143
1144 my $composite = join (":",$strand,$chr,$start,$end);
1145
1146 $count++;
1147 $positions{$composite}->{$meth_call}->{count}++;
1148 $positions{$composite}->{$meth_call}->{alignment} = $_;
1149 }
1150 warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
1151 close IN or warn $!;
1152 }
1153
1154 elsif ($paired){
1155
1156 ### we are going to concatenate both methylation call strings from the paired end file to form a joint methylation call string
1157
1158 my $outfile = $file;
1159 if ($vanilla){
1160 $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.txt/;
1161 }
1162 else{
1163 $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.sam/;
1164 }
1165
1166 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
1167 warn "Reading and storing all alignment positions\n";
1168
1169 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
1170 if ($vanilla){
1171 $_ = <IN>; # Bismark version header
1172 print OUT; # Printing the Bismark version to the de-duplicated file again
1173 }
1174
1175 while (<IN>){
1176
1177 if ($count == 0){
1178 if ($_ =~ /^Bismark version:/){
1179 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
1180 sleep (2);
1181 print_helpfile();
1182 exit;
1183 }
1184 }
1185
1186 ### if this was a SAM file we ignore header lines
1187 unless ($vanilla){
1188 if (/^\@\w{2}\t/){
1189 warn "skipping SAM header line:\t$_";
1190 print OUT; # Printing the header lines again into the de-duplicated file
1191 next;
1192 }
1193 }
1194
1195 my ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2);
1196 my $line1;
1197
1198 if ($vanilla){
1199 ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2) = (split (/\t/))[1,2,3,4,7,10];
1200 }
1201 else{ # SAM paired-end format
1202
1203 ($strand,$chr,$start,$meth_call_1) = (split (/\t/))[1,2,3,13]; # we are assigning the FLAG value to $strand
1204
1205 ### storing the first line (= read 1 alignment)
1206 $line1 = $_;
1207
1208 ### reading in the next line
1209 $_ = <IN>;
1210 # we only need the end position and the methylation call
1211 (my $pos,my $seq_2,$meth_call_2) = (split (/\t/))[3,9,13];
1212 $end = $pos + length($seq_2) - 1;
1213 }
1214
1215 my $composite = join (":",$strand,$chr,$start,$end);
1216 $count++;
1217 my $meth_call = $meth_call_1.$meth_call_2;
1218
1219 $positions{$composite}->{$meth_call}->{count}++;
1220 if ($vanilla){
1221 $positions{$composite}->{$meth_call}->{alignment} = $_;
1222 }
1223 else{ # SAM PAIRED-END
1224 $positions{$composite}->{$meth_call}->{alignment_1} = $line1;
1225 $positions{$composite}->{$meth_call}->{alignment_2} = $_;
1226 }
1227 }
1228 warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
1229 close IN or warn $!;
1230 }
1231
1232 ### PRINTING RESULTS
1233
1234 ### Now going through all stored positions and printing out the methylation call which is most representative, i.e. the one which occurred most often
1235 warn "Now printing out alignments with the most representative methylation call(s)\n";
1236
1237 foreach my $pos (keys %positions){
1238 foreach my $meth_call (sort { $positions{$pos}->{$b}->{count} <=> $positions{$pos}->{$a}->{count} }keys %{$positions{$pos}}){
1239 if ($paired){
1240 if ($vanilla){
1241 print OUT $positions{$pos}->{$meth_call}->{alignment};
1242 }
1243 else{
1244 print OUT $positions{$pos}->{$meth_call}->{alignment_1}; # SAM read 1
1245 print OUT $positions{$pos}->{$meth_call}->{alignment_2}; # SAM read 2
1246 }
1247 }
1248 else{ # single-end
1249 print OUT $positions{$pos}->{$meth_call}->{alignment};
1250 }
1251 $unique_seqs++;
1252 last; ### exiting once we printed a sequence with the most frequent methylation call for a position
1253 }
1254 }
1255
1256 my $percentage;
1257 unless ($count == 0){
1258 $percentage = sprintf ("%.2f",$unique_seqs*100/$count);
1259 }
1260 else{
1261 $percentage = 'N/A';
1262 }
1263
1264 warn "\nTotal number of alignments analysed in $file:\t$count\n";
1265 warn "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";
1266 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
1267 print REPORT "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";
1268
1269 }
1270
1271
1272