4
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region
|
|
4
|
|
5 # Created by Jason Ellul, Oct 2012
|
|
6
|
|
7
|
|
8 use strict;
|
|
9 use warnings;
|
|
10 use Getopt::Std;
|
|
11 use File::Basename;
|
|
12 use Data::Dumper;
|
|
13 $| = 1;
|
|
14
|
|
15 # Grab and set all options
|
|
16 my %OPTIONS = (s => "MethylationData");
|
|
17 getopts('l:L:o:s:', \%OPTIONS);
|
|
18
|
|
19 die qq(
|
|
20 Usage: methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...]
|
|
21
|
|
22 OPTIONS: -o STR the name of the output file
|
|
23 -l STR filename of the log file [null]
|
|
24 -L STR append to an existing log file [null]
|
|
25 -s STR Sample Name [$OPTIONS{s}]
|
|
26 ) if(@ARGV < 2);
|
|
27
|
|
28 my $version = 0.1;
|
|
29 my $bed = shift @ARGV;
|
|
30 my @graphs = @ARGV;
|
|
31 my $Script = "methylation_by_region_converter.pl";
|
|
32 my $now;
|
|
33
|
|
34 # if log file specified
|
|
35 if(defined $OPTIONS{l}) {
|
|
36 open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n";
|
|
37 close (FH);
|
|
38 # Open the log file and redirect output to it
|
|
39 open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n";
|
|
40 my $now = localtime time;
|
|
41 print "Log File Created $now\n";
|
|
42 } elsif(defined $OPTIONS{L}) {
|
|
43 #append to a log file
|
|
44 # Open the log file and redirect output to it
|
|
45 open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n";
|
|
46 my $now = localtime time;
|
|
47 print "Appending To Log File $now\n\n";
|
|
48 }
|
|
49
|
|
50 # print version of this script.
|
|
51 print STDERR "Using $Script version $version\n\n";
|
|
52 print STDERR "Using region file: $bed\n";
|
|
53 print STDERR "And bedgraphs: @graphs\n\n";
|
|
54 my %Regions;
|
|
55 my @chr_order;
|
|
56
|
|
57 # read in regions file
|
|
58 print STDERR "Reading Regions Bed File $bed ... ";
|
|
59 open(BED, "$bed") || die "$bed: $!";
|
|
60 while(my $line = <BED>) {
|
|
61 chomp $line;
|
|
62 my @line_sp = split("\t", $line);
|
|
63 $line_sp[2]--;
|
|
64
|
|
65 push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]};
|
|
66 push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1];
|
|
67 push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2];
|
|
68 push @{ $Regions{$line_sp[0]}{Methylated} }, 0;
|
|
69 push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0;
|
|
70 }
|
|
71 close(BED);
|
|
72 print STDERR "Done.\n";
|
|
73
|
|
74 # read in bedgraphs
|
|
75 foreach my $bedGraph (@graphs) {
|
|
76 print STDERR "Loading bedGraph File $bedGraph ... ";
|
|
77 open(GRAPH, $bedGraph) || die "$bedGraph: $!";
|
|
78 my @lines = <GRAPH>;
|
|
79 close(GRAPH);
|
|
80 print STDERR "Done.\n\n";
|
|
81
|
|
82 print STDERR "Processing bedGraph File ... ";
|
|
83 foreach my $line (@lines) {
|
|
84 chomp $line;
|
|
85 my @line_sp = split("\t", $line);
|
|
86 $line_sp[1]++;
|
|
87
|
|
88 # if the chromosome is in the regions file
|
|
89 if(defined $Regions{$line_sp[0]}) {
|
|
90 my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} });
|
|
91
|
|
92 # if the position is within a region
|
|
93 if($pos >= 0) {
|
|
94 ${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4];
|
|
95 ${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5];
|
|
96 }
|
|
97 }
|
|
98 }
|
|
99 print STDERR "Done.\n\n";
|
|
100 }
|
|
101
|
|
102 if(defined $OPTIONS{o}) {
|
|
103 open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!";
|
|
104 }
|
|
105
|
|
106 # calculate percent methylated and print
|
|
107 print STDERR "Printing output ... ";
|
|
108 print "#type=DNA_METHYLATION\n";
|
|
109 print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n";
|
|
110 foreach my $chr (@chr_order) {
|
|
111 for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) {
|
|
112 my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region];
|
|
113 my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total;
|
|
114 print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total;
|
|
115 print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total;
|
|
116 }
|
|
117 }
|
|
118 close(STDERR) if defined $OPTIONS{o};
|
|
119 print STDERR "Done.\n";
|
|
120
|
|
121 sub binsearchregion {
|
|
122 my ($target, $cmp, $start, $end) = @_;
|
|
123
|
|
124 my $posmin = 0;
|
|
125 my $posmax = $#{ $start };
|
|
126
|
|
127 return -1 if &$cmp (0, $start, $target) > 0;
|
|
128 return -1 if &$cmp ($#{ $end }, $end, $target) < 0;
|
|
129
|
|
130 while (1) {
|
|
131 my $mid = int (($posmin + $posmax) / 2);
|
|
132 my $result = &$cmp ($mid, $start, $target);
|
|
133
|
|
134 if ($result < 0) {
|
|
135 $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin;
|
|
136 if($mid == $posmin) {
|
|
137 return -1 if &$cmp ($mid, $end, $target) < 0;
|
|
138 return $mid;
|
|
139 }
|
|
140 $posmin = $mid;
|
|
141 } elsif ($result > 0) {
|
|
142 $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin;
|
|
143 if($mid == $posmax) {
|
|
144 $mid--;
|
|
145 return -1 if &$cmp ($mid, $end, $target) < 0;
|
|
146 return $mid;
|
|
147 }
|
|
148 $posmax = $mid;
|
|
149 } else {
|
|
150 return $mid;
|
|
151 }
|
|
152 }
|
|
153 }
|
|
154
|
|
155 sub cmpFunc {
|
|
156 my ($index, $arrayRef, $target) = @_;
|
|
157 my $item = $$arrayRef[$index];
|
|
158
|
|
159 return $item <=> $target;
|
|
160 }
|
|
161
|