annotate methylation_analysis_bismark/methylation_analysis/methylation_by_region_converter.pl @ 15:d2ba233899b0 draft default tip

Uploaded
author fcaramia
date Mon, 17 Jun 2013 21:50:56 -0400
parents d15b4a2e3bdc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
1 #!/usr/bin/perl
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
2
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
3 # script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgraph with methylation percentages aggregated by region
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
4
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
5 # Created by Jason Ellul, Oct 2012
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
6
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
7
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
8 use strict;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
9 use warnings;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
10 use Getopt::Std;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
11 use File::Basename;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
12 use Data::Dumper;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
13 $| = 1;
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
14
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# Grab and set all options.
# %OPTIONS is pre-seeded with the default sample name; getopts() overwrites
# any key supplied on the command line (-l, -L, -o and -s each take a value).
my %OPTIONS = (s => "MethylationData");

# Build the usage text before parsing, so the bracketed value shown for -s
# is the built-in default rather than whatever the user passed.
my $usage = qq(
Usage: methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...]

OPTIONS: -o STR the name of the output file
-l STR filename of the log file [null]
-L STR append to an existing log file [null]
-s STR Sample Name [$OPTIONS{s}]
);

# getopts() returns false on an unknown switch or a switch that is missing
# its argument; stop with the usage text instead of continuing with
# half-parsed options.
getopts('l:L:o:s:', \%OPTIONS) or die $usage;

# At least the regions bed file and one bedGraph file are required.
die $usage if @ARGV < 2;

my $version = 0.1;
my $bed     = shift @ARGV;    # first positional argument: regions bed file
my @graphs  = @ARGV;          # every remaining argument: a bedGraph file
my $Script  = "methylation_by_region_converter.pl";
my $now;                      # declared here; not assigned in this section
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
33
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# Optional log file: -l starts a fresh log, -L appends to an existing one.
# In both cases STDERR is re-pointed at the log file, and the banner line is
# written to STDERR so that it ends up in the log rather than in the data
# stream on STDOUT.
if (defined $OPTIONS{l}) {
    # Create (or truncate) the log through a scratch handle first, so that a
    # failure is reported before STDERR is touched.  Three-argument open
    # keeps characters in the file name from being read as an open mode.
    open(my $log_fh, '>', $OPTIONS{l}) or die "Couldn't create log file $OPTIONS{l}!\n";
    close($log_fh);

    # Open the log file and redirect STDERR to it
    open(STDERR, '>>', $OPTIONS{l}) or die "Couldn't write to log file $OPTIONS{l}!\n";
    my $now = localtime time;
    print STDERR "Log File Created $now\n";
}
elsif (defined $OPTIONS{L}) {
    # append to a log file
    # Open the log file and redirect STDERR to it
    open(STDERR, '>>', $OPTIONS{L}) or die "Couldn't append to log file $OPTIONS{L}!\n";
    my $now = localtime time;
    print STDERR "Appending To Log File $now\n\n";
}
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
49
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# print version of this script.
print STDERR "Using $Script version $version\n\n";
print STDERR "Using region file: $bed\n";
print STDERR "And bedgraphs: @graphs\n\n";

# %Regions maps a chromosome name to four parallel arrays (Start, End,
# Methylated, Unmethylated); index i in each array describes the i-th region
# read for that chromosome.  @chr_order records chromosomes in the order in
# which they are first seen.
my %Regions;
my @chr_order;

# read in regions file
print STDERR "Reading Regions Bed File $bed ... ";
open(my $bed_fh, '<', $bed) or die "$bed: $!";
while (my $line = <$bed_fh>) {
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings
    my ($chr, $start, $end) = split("\t", $line);

    # Skip blank lines, header/comment lines and anything else that does not
    # carry a chromosome plus two integer coordinates.
    next unless defined $end;
    next unless $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/;

    # The end coordinate is stored decremented by one -- presumably turning
    # the bed file's exclusive end into an inclusive one (TODO confirm).
    $end--;

    push @chr_order, $chr unless exists $Regions{$chr};
    push @{ $Regions{$chr}{Start} },        $start;
    push @{ $Regions{$chr}{End} },          $end;
    push @{ $Regions{$chr}{Methylated} },   0;
    push @{ $Regions{$chr}{Unmethylated} }, 0;
}
close($bed_fh);
print STDERR "Done.\n";
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
73
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# read in bedgraphs
# Each file is streamed line by line; for every line whose position falls
# inside a known region, columns 5 and 6 (indices 4 and 5) are added to that
# region's Methylated and Unmethylated totals.
foreach my $bedGraph (@graphs) {
    print STDERR "Loading bedGraph File $bedGraph ... ";
    open(my $graph_fh, '<', $bedGraph) or die "$bedGraph: $!";
    print STDERR "Done.\n\n";

    print STDERR "Processing bedGraph File ... ";
    while (my $line = <$graph_fh>) {
        chomp $line;
        my @line_sp = split("\t", $line);

        # Skip headers, blank lines and rows without both count columns or
        # without an integer position.
        next if @line_sp < 6;
        next unless $line_sp[1] =~ /\A\d+\z/;

        # The position is shifted up by one before the lookup -- presumably a
        # coordinate-convention adjustment (TODO confirm against the inputs).
        $line_sp[1]++;

        # only chromosomes present in the regions file are of interest
        my $regions = $Regions{$line_sp[0]};
        next unless defined $regions;

        my $pos = binsearchregion($line_sp[1], \&cmpFunc, $regions->{Start}, $regions->{End});

        # a negative index means the position is not within any region
        next if $pos < 0;

        $regions->{Methylated}[$pos]   += $line_sp[4];
        $regions->{Unmethylated}[$pos] += $line_sp[5];
    }
    close($graph_fh);
    print STDERR "Done.\n\n";
}
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
101
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# Send the table to the file named by -o when given, otherwise leave it on
# the existing STDOUT.
if (defined $OPTIONS{o}) {
    open(STDOUT, '>', $OPTIONS{o}) or die "$OPTIONS{o}: $!";
}

# calculate percent methylated and print
print STDERR "Printing output ... ";
print "#type=DNA_METHYLATION\n";
print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n";
foreach my $chr (@chr_order) {
    my $regions = $Regions{$chr};
    foreach my $region (0 .. $#{ $regions->{Start} }) {
        my $methylated   = $regions->{Methylated}[$region];
        my $unmethylated = $regions->{Unmethylated}[$region];
        my $total        = $methylated + $unmethylated;

        # Regions with no counts are left out of the table; skipping here
        # also avoids dividing by zero below.
        next unless $total;

        my $fract = sprintf("%.3f", $methylated / $total);
        print "$OPTIONS{s}\t$chr\t$regions->{Start}[$region]\t$regions->{End}[$region]\t";
        print "$methylated\t$unmethylated\t$total\t$fract\n";
    }
}

# Close the output file (not STDERR, which is still needed for the final
# status message) and check the close so buffered write errors surface.
if (defined $OPTIONS{o}) {
    close(STDOUT) or die "$OPTIONS{o}: $!";
}
print STDERR "Done.\n";
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
120
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# binsearchregion($target, $cmp, $start, $end)
#
# Binary-search a set of regions for the one containing $target.
#
#   $target - the position to look up
#   $cmp    - code ref called as $cmp->($index, $arrayRef, $target); must
#             return a negative, zero or positive number when the element at
#             $index is below, equal to or above $target
#   $start  - array ref of region start coordinates
#   $end    - array ref of region end coordinates, parallel to $start
#
# Returns the index of the region whose start is <= $target and whose end is
# >= $target, or -1 when there is no such region (including when the arrays
# are empty).  The search assumes $start is in ascending order and that
# regions do not overlap.
sub binsearchregion {
    my ($target, $cmp, $start, $end) = @_;

    # nothing to search
    return -1 unless @{ $start } && @{ $end };

    # target lies before the first region or after the last one
    return -1 if $cmp->(0, $start, $target) > 0;
    return -1 if $cmp->($#{ $end }, $end, $target) < 0;

    # Find the last index whose start is <= target.  Invariant: the start at
    # $posmin is <= target (established by the first guard above).
    my $posmin = 0;
    my $posmax = $#{ $start };
    while ($posmin < $posmax) {
        # round up so that "$posmin = $mid" always makes progress
        my $mid    = int(($posmin + $posmax + 1) / 2);
        my $result = $cmp->($mid, $start, $target);

        return $mid if $result == 0;    # target is exactly a region start

        if ($result < 0) {
            $posmin = $mid;
        }
        else {
            $posmax = $mid - 1;
        }
    }

    # $posmin is the only candidate left; the target is inside it only if the
    # region's end has not already been passed.
    return -1 if $cmp->($posmin, $end, $target) < 0;
    return $posmin;
}
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
154
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
# cmpFunc($index, $arrayRef, $target)
#
# Numeric three-way comparison of one array element against a target value.
# Returns -1, 0 or 1 when the element at $index of @$arrayRef is less than,
# equal to or greater than $target.
sub cmpFunc {
    my ($index, $arrayRef, $target) = @_;

    return $arrayRef->[$index] <=> $target;
}
d15b4a2e3bdc Uploaded
fcaramia
parents:
diff changeset
161