Mercurial > repos > bgruening > bismark
comparison bismark_methyl_extractor/bismark2bedGraph @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 use Carp; | |
8 | |
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) | |
10 | |
11 ## This program is free software: you can redistribute it and/or modify | |
12 ## it under the terms of the GNU General Public License as published by | |
13 ## the Free Software Foundation, either version 3 of the License, or | |
14 ## (at your option) any later version. | |
15 | |
16 ## This program is distributed in the hope that it will be useful, | |
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ## GNU General Public License for more details. | |
20 | |
21 ## You should have received a copy of the GNU General Public License | |
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
23 | |
24 my $bismark2bedGraph_version = 'v0.16.3'; | |
25 | |
26 my @bedfiles; | |
27 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | |
28 my @sorting_files; | |
29 | |
30 my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir) = process_commandline(); | |
31 | |
32 warn "Using these input files: @sorting_files\n"; | |
33 warn "\nSummary of parameters for bismark2bedGraph conversion:\n"; | |
34 warn '='x54,"\n"; | |
35 warn "bedGraph output:\t\t$bedGraph_output\n"; | |
36 warn "output directory:\t\t>$output_dir<\n"; | |
37 if ($remove){ | |
38 warn "remove whitespaces:\t\tyes\n"; | |
39 } | |
40 else{ | |
41 warn "remove whitespaces:\t\tno\n"; | |
42 } | |
43 if ($CX_context){ | |
44 warn "CX context:\t\t\tyes\n"; | |
45 } | |
46 else{ | |
47 warn "CX context:\t\t\tno (CpG context only, default)\n"; | |
48 } | |
49 if ($no_header){ | |
50 warn "No-header selected:\t\tyes\n"; | |
51 } | |
52 else{ | |
53 warn "No-header selected:\t\tno\n"; | |
54 } | |
55 | |
56 if ($ample_mem){ | |
57 warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n"; | |
58 } | |
59 else{ | |
60 warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n"; | |
61 } | |
62 unless($ample_mem){ | |
63 warn "Sort buffer size:\t\t$sort_size\n"; | |
64 } | |
65 warn "Coverage threshold:\t\t$coverage_threshold\n"; | |
66 | |
67 | |
68 warn "="x77,"\n"; | |
69 warn "Methylation information will now be written into a bedGraph and coverage file\n"; | |
70 warn "="x77,"\n\n"; | |
71 sleep (2); | |
72 | |
73 ### deciding which files to use for bedGraph conversion | |
74 foreach my $filename (@sorting_files){ | |
75 | |
76 ### DETERMINING THE FULL PATH OF INPUT FILES | |
77 if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again | |
78 # warn "folder name: $1\nfilename: $2\n\n"; | |
79 chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path | |
80 $input_dir = getcwd(); # this will always be the full path | |
81 $filename = $2; | |
82 # warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1); | |
83 chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back | |
84 } | |
85 else{ | |
86 $input_dir = $parent_dir; | |
87 } | |
88 $input_dir .= '/'; | |
89 | |
90 if ($CX_context){ | |
91 # push @bedfiles,$output_dir.$filename; | |
92 push @bedfiles,$input_dir.$filename; | |
93 } | |
94 else{ ## CpG context only (default) | |
95 if ($filename =~ /^CpG/){ # only testing the actual filename without the path information | |
96 push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename | |
97 } | |
98 else{ | |
99 # skipping CHH or CHG files | |
100 } | |
101 } | |
102 } | |
103 | |
104 if (@bedfiles){ | |
105 warn "Using the following files as Input:\n"; | |
106 print join ("\t",@bedfiles),"\n\n"; | |
107 sleep (2); | |
108 } | |
109 else{ | |
110 die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n"; | |
111 } | |
112 | |
113 open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n"; | |
114 warn "Writing bedGraph to file: $bedGraph_output\n"; | |
115 print OUT "track type=bedGraph\n"; | |
116 | |
117 my $coverage_output = $bedGraph_output; | |
118 unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){ | |
119 $coverage_output =~ s/$/.bismark.cov.gz/; | |
120 } | |
121 | |
122 open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n"; | |
123 warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n"; | |
124 | |
125 if ($zero){ | |
126 my $zero_coverage_output = $bedGraph_output; | |
127 unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){ | |
128 $zero_coverage_output =~ s/$/.bismark.zero.cov/; | |
129 } | |
130 | |
131 open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n"; | |
132 warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n"; | |
133 } | |
134 warn "\n"; | |
135 | |
136 my %temp_fhs; | |
137 my @temp_files; # writing all context files (default CpG only) to these files prior to sorting | |
138 my %chr_lengths; # storing chromosome lenghts in '--ample_memory' mode | |
139 | |
140 ### changing to the output directory | |
141 unless ($output_dir eq ''){ # default | |
142 chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
143 warn "Changed directory to $output_dir\n"; | |
144 } | |
145 | |
146 if ($gazillion){ | |
147 if (scalar @bedfiles == 1){ | |
148 warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n"; | |
149 } | |
150 else{ | |
151 warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n"; | |
152 my $merge = "$bedGraph_output.methylation_calls.merged"; | |
153 open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n"; | |
154 warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2); | |
155 push @temp_files, $merge; | |
156 } | |
157 } | |
158 | |
159 foreach my $infile (@bedfiles) { | |
160 | |
161 # warn "Processing $infile\n"; | |
162 | |
163 if ($remove) { | |
164 warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; | |
165 | |
166 if ($infile =~ /gz$/){ | |
167 open (READ,"gunzip -c $infile |") or die $!; | |
168 } | |
169 else{ | |
170 open (READ,$infile) or die $!; | |
171 } | |
172 | |
173 my $removed_spaces_outfile = $infile; | |
174 $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; | |
175 $removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename | |
176 | |
177 warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n"; | |
178 open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; | |
179 | |
180 unless ($no_header){ | |
181 $_ = <READ>; ### Bismark version header | |
182 print REM $_; ### Bismark version header | |
183 } | |
184 | |
185 while (<READ>) { | |
186 chomp; | |
187 my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); | |
188 $id =~ s/\s+/_/g; | |
189 print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; | |
190 } | |
191 | |
192 close READ or die $!; | |
193 close REM or die $!; | |
194 | |
195 ### changing the infile name to the new file without spaces | |
196 $infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly | |
197 } | |
198 | |
199 # opening infile | |
200 if ($infile =~ /gz$/){ | |
201 open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n"; | |
202 } | |
203 else{ | |
204 open (IN,$infile) or die "Couldn't find file '$infile': $!\n"; | |
205 } | |
206 | |
207 if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again | |
208 $infile =~ s/.*\///;# replacing everything up to the last slash in the filename | |
209 # warn "Renamed Infile: $infile\n"; | |
210 } | |
211 | |
212 ### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes conist of up to half a million scaffolds instead of ~23 chromosomes. This | |
213 ### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this | |
214 ### limit (some even say there is a reason for the OS doing so...). | |
215 ### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the | |
216 ### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster). | |
217 | |
218 if ($gazillion){ | |
219 # using all infiles instead of sorting | |
220 if (scalar @bedfiles == 1){ | |
221 push @temp_files, $infile; | |
222 } | |
223 else{ | |
224 ## always ignoring the version header | |
225 unless ($no_header){ | |
226 $_ = <IN>; ### Bismark version header | |
227 } | |
228 | |
229 while (<IN>) { | |
230 if ($_ =~ /^Bismark /){ | |
231 warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n"; | |
232 next; | |
233 } | |
234 print MERGE; | |
235 } | |
236 warn "Finished writing methylation calls from $infile to merged temp file\n"; | |
237 } | |
238 } | |
239 else{ | |
240 warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n"; | |
241 | |
242 ## always ignoring the version header | |
243 unless ($no_header){ | |
244 $_ = <IN>; ### Bismark version header | |
245 } | |
246 | |
247 while (<IN>) { | |
248 if ($_ =~ /^Bismark /){ | |
249 warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n"; | |
250 next; | |
251 } | |
252 | |
253 chomp; | |
254 | |
255 my ($chr,$pos) = (split (/\t/))[2,3]; | |
256 | |
257 ### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step | |
258 if ($ample_mem){ | |
259 ### setting the first position for this chromosome | |
260 unless (defined $chr_lengths{$chr} ){ | |
261 $chr_lengths{$chr} = $pos; | |
262 } | |
263 # for all subsequent postions for this chromosome | |
264 if ($pos > $chr_lengths{$chr} ){ | |
265 $chr_lengths{$chr} = $pos; # set the current position as the new highest position | |
266 } | |
267 } | |
268 | |
269 # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n"; | |
270 $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names | |
271 # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n"; | |
272 unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time | |
273 | |
274 my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp'; | |
275 # warn "Using temp file name: $temp_file_name\n"; sleep(1); | |
276 | |
277 open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!"; | |
278 push @temp_files, $temp_file_name; # storing temp files as we open them instead | |
279 } | |
280 | |
281 print {$temp_fhs{$chr}} "$_\n"; | |
282 } | |
283 | |
284 warn "Finished writing out individual chromosome files for $infile\n"; | |
285 } | |
286 } | |
287 | |
288 # closing temporary filehandles to force writing out buffered content | |
289 foreach my $temp_fh(keys %temp_fhs){ | |
290 close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n"; | |
291 } | |
292 | |
293 ### printing out the determined maximum position for each chromosome | |
294 if ($ample_mem){ | |
295 foreach my $chr (sort keys %chr_lengths){ | |
296 warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n"; | |
297 } | |
298 warn "\n"; | |
299 } | |
300 | |
301 unless ($gazillion){ | |
302 warn "\n"; | |
303 warn "Collecting temporary chromosome file information... Processing the following input file(s):\n"; | |
304 warn join ("\n",@temp_files),"\n\n"; | |
305 sleep (1); | |
306 } | |
307 | |
308 if ($gazillion){ | |
309 if (scalar @bedfiles > 1){ | |
310 close (MERGE) or die "Failed to close filehandle MERGE: $!\n"; | |
311 } | |
312 } | |
313 | |
314 foreach my $in (@temp_files) { | |
315 | |
316 if ($sort_size){ | |
317 warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem); | |
318 } | |
319 | |
320 my $ifh; | |
321 | |
322 my $name; | |
323 my $meth_state; | |
324 my $chr = ""; | |
325 my $pos = 0; | |
326 my $meth_state2; | |
327 | |
328 my $last_pos; | |
329 my $last_chr; | |
330 | |
331 ### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command | |
332 if ($ample_mem){ | |
333 # warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n"; | |
334 | |
335 my @meth_count; | |
336 my @unmeth_count; | |
337 | |
338 open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n"; | |
339 | |
340 while (my $line = <$ifh>){ | |
341 next if ($line =~ /^Bismark/); | |
342 chomp $line; | |
343 | |
344 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; | |
345 | |
346 unless ($last_pos and $last_chr){ | |
347 $last_chr = $chr; | |
348 $last_pos = $pos; | |
349 } | |
350 unless (@meth_count and @unmeth_count){ | |
351 warn "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n"; | |
352 @meth_count = (0) x $chr_lengths{$chr}; | |
353 @unmeth_count = (0) x $chr_lengths{$chr}; | |
354 # warn "length of array meth count: ",scalar @meth_count,"\n"; | |
355 warn "Finished generating arrays\n"; | |
356 # sleep(1); | |
357 } | |
358 # warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1); | |
359 # print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n"; | |
360 # sleep(1); | |
361 | |
362 # if ($last_chr ne $chr) { | |
363 # die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n"; | |
364 # } | |
365 | |
366 my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway | |
367 unless($validated){ | |
368 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; | |
369 next; | |
370 } | |
371 | |
372 if ($meth_state eq '+'){ | |
373 # warn "increasing meth $pos by 1\n"; sleep(1); | |
374 $meth_count[$pos-1]++; | |
375 } | |
376 else{ | |
377 $unmeth_count[$pos-1]++; | |
378 # warn "increasing unmeth $pos by 1\n"; sleep(1); | |
379 } | |
380 } | |
381 | |
382 close $ifh or die $!; | |
383 | |
384 warn "Now printing methylation information for this chromosome\n"; | |
385 # warn "length of array meth count: ",scalar @meth_count,"\n"; | |
386 # warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n"; | |
387 foreach my $index (0..$#meth_count){ | |
388 my $totalcount = $meth_count[$index] + $unmeth_count[$index]; | |
389 if ($totalcount > 0){ | |
390 # warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n"; | |
391 # sleep(1); | |
392 | |
393 my $bed_pos = $index; ### bedGraph coordinates are 0 based | |
394 my $one_based_pos = $bed_pos + 1; | |
395 | |
396 my $meth_percentage; | |
397 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef); | |
398 | |
399 if (defined $meth_percentage){ | |
400 | |
401 # as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file | |
402 | |
403 # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?) | |
404 print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n"; | |
405 | |
406 # this is the coverage file. Coordinates are 1-based | |
407 print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n"; | |
408 | |
409 # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end | |
410 if ($zero){ | |
411 print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n"; | |
412 } | |
413 | |
414 } | |
415 } | |
416 } | |
417 | |
418 @meth_count = (); | |
419 @unmeth_count = (); | |
420 | |
421 } | |
422 ### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint | |
423 else{ | |
424 my $sort_dir = './'; # there has been a cd into the output_directory already | |
425 # my $sort_dir = $output_dir; | |
426 # if ($sort_dir eq ''){ | |
427 # $sort_dir = './'; | |
428 # } | |
429 | |
430 if ($gazillion){ | |
431 if ($in =~ /gz$/){ | |
432 open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n"; | |
433 } | |
434 else{ | |
435 open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n"; | |
436 } | |
437 ### Comment by Volker Brendel, Indiana University | |
438 ### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros). Omit the V, and things go very wrong in the tallying of reads." | |
439 } | |
440 else{ | |
441 ### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway, | |
442 ### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase | |
443 if ($in =~ /gz$/){ | |
444 open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n"; | |
445 } | |
446 else{ | |
447 open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n"; | |
448 } | |
449 } | |
450 | |
451 while (my $line = <$ifh>) { | |
452 next if ($line =~ /^Bismark/); | |
453 chomp $line; | |
454 #warn "full line:\n$line\n"; | |
455 $last_chr = $chr; | |
456 $last_pos = $pos; | |
457 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; | |
458 #warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1); | |
459 if (($last_pos ne $pos) || ($last_chr ne $chr)) { | |
460 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
461 @methylcalls = qw (0 0 0); | |
462 } | |
463 | |
464 my $validated = validate_methylation_call($meth_state, $meth_state2); | |
465 unless($validated){ | |
466 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; | |
467 next; | |
468 } | |
469 if ($meth_state eq "+") { | |
470 $methylcalls[0]++; | |
471 $methylcalls[2]++; | |
472 } else { | |
473 $methylcalls[1]++; | |
474 $methylcalls[2]++; | |
475 } | |
476 } | |
477 | |
478 | |
479 $last_chr = $chr; | |
480 $last_pos = $pos; | |
481 if ($methylcalls[2] > 0) { | |
482 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
483 } | |
484 | |
485 close $ifh or die $!; | |
486 | |
487 @methylcalls = qw (0 0 0); # resetting @methylcalls | |
488 | |
489 } | |
490 | |
491 ### deleting temporary files (only needed if --gazillion hasn't been specified | |
492 if ($gazillion and scalar @bedfiles == 1){ | |
493 # if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed | |
494 } | |
495 else{ | |
496 my $delete = unlink $in; | |
497 if ($delete) { | |
498 warn "Successfully deleted the temporary input file $in\n\n"; | |
499 } | |
500 else { | |
501 warn "The temporary inputfile $in could not be deleted $!\n\n"; | |
502 } | |
503 } | |
504 } | |
505 | |
506 close OUT or die $!; | |
507 close COVERAGE or die $!; | |
508 if ($zero){ | |
509 close ZEROCOVERAGE or die $!; | |
510 } | |
511 | |
512 exit 0; | |
513 | |
514 | |
515 | |
516 sub validate_methylation_call{ | |
517 my $meth_state = shift; | |
518 croak "Missing (+/-) methylation call" unless defined $meth_state; | |
519 my $meth_state2 = shift; | |
520 croak "Missing alphabetical methylation call" unless defined $meth_state2; | |
521 my $is_consistent; | |
522 ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) | |
523 : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); | |
524 return 1 if $is_consistent; | |
525 return 0; | |
526 } | |
527 | |
528 sub check_CpG_methylation_call{ | |
529 my $meth1 = shift; | |
530 my $meth2 = shift; | |
531 return 1 if($meth1 eq "+" && $meth2 eq "Z"); | |
532 return 1 if($meth1 eq "-" && $meth2 eq "z"); | |
533 return 0; | |
534 } | |
535 | |
536 sub check_nonCpG_methylation_call{ | |
537 my $meth1 = shift; | |
538 my $meth2 = shift; | |
539 return 1 if($meth1 eq "+" && $meth2 eq "C"); | |
540 return 1 if($meth1 eq "+" && $meth2 eq "X"); | |
541 return 1 if($meth1 eq "+" && $meth2 eq "H"); | |
542 return 1 if($meth1 eq "-" && $meth2 eq "c"); | |
543 return 1 if($meth1 eq "-" && $meth2 eq "x"); | |
544 return 1 if($meth1 eq "-" && $meth2 eq "h"); | |
545 return 0; | |
546 } | |
547 | |
548 sub generate_output{ | |
549 my $methcount = $methylcalls[0]; | |
550 my $nonmethcount = $methylcalls[1]; | |
551 my $totalcount = $methylcalls[2]; | |
552 my $last_chr = shift; | |
553 my $last_pos = shift; | |
554 croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0); | |
555 croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) ); | |
556 | |
557 my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. | |
558 my $meth_percentage; | |
559 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); | |
560 # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; | |
561 if (defined $meth_percentage){ | |
562 | |
563 # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?) | |
564 my $one_based_pos = $bed_pos + 1; | |
565 print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n"; | |
566 | |
567 # this is the coverage file. Coordinates are 1-based | |
568 print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; | |
569 | |
570 # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end | |
571 if ($zero){ | |
572 print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; | |
573 } | |
574 } | |
575 | |
576 } | |
577 | |
578 sub process_commandline{ | |
579 my $help; | |
580 my $output_dir; | |
581 my $bedGraph_output; | |
582 my $no_header; | |
583 my $coverage_threshold; # Minimum number of reads covering before calling methylation status | |
584 my $remove; | |
585 my $counts; | |
586 my $CX_context; | |
587 my $sort_size; | |
588 my $version; | |
589 my $gazillion; | |
590 my $ample_mem; | |
591 my $zero; | |
592 my $input_dir; | |
593 | |
594 my $command_line = GetOptions ('help|man' => \$help, | |
595 'dir=s' => \$output_dir, | |
596 'o|output=s' => \$bedGraph_output, | |
597 'no_header' => \$no_header, | |
598 "cutoff=i" => \$coverage_threshold, | |
599 "remove_spaces" => \$remove, | |
600 "counts" => \$counts, | |
601 "CX|CX_context" => \$CX_context, | |
602 "buffer_size=s" => \$sort_size, | |
603 'version' => \$version, | |
604 'gazillion|scaffolds' => \$gazillion, | |
605 'ample_memory' => \$ample_mem, | |
606 "zero_based" => \$zero, | |
607 ); | |
608 | |
609 ### EXIT ON ERROR if there were errors with any of the supplied options | |
610 unless ($command_line){ | |
611 die "Please respecify command line options\n"; | |
612 } | |
613 | |
614 ### HELPFILE | |
615 if ($help){ | |
616 print_helpfile(); | |
617 exit; | |
618 } | |
619 | |
620 if ($version){ | |
621 print << "VERSION"; | |
622 | |
623 | |
624 Bismark Methylation Extractor Module - | |
625 bismark2bedGraph | |
626 | |
627 Bismark Extractor Version: $bismark2bedGraph_version | |
628 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics | |
629 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
630 | |
631 | |
632 VERSION | |
633 exit; | |
634 } | |
635 | |
636 @sorting_files = @ARGV; | |
637 | |
638 ### no files provided | |
639 unless (@sorting_files){ | |
640 warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n"; | |
641 sleep(2); | |
642 | |
643 print_helpfile(); | |
644 exit; | |
645 } | |
646 | |
647 ### PARENT DIRECTORY | |
648 my $parent_dir = getcwd(); | |
649 # warn "parent directory is: $parent_dir\n"; | |
650 | |
651 ### OUTPUT DIR PATH | |
652 if (defined $output_dir){ | |
653 | |
654 unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it | |
655 unless ($output_dir =~ /\/$/){ | |
656 $output_dir =~ s/$/\//; | |
657 } | |
658 unless (-d $output_dir){ | |
659 mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n"; | |
660 warn "Created output directory $output_dir\n"; | |
661 } | |
662 | |
663 ### want to get an absolute path for the output directory instead of a relative one | |
664 chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n"; | |
665 $output_dir = getcwd(); | |
666 unless ($output_dir =~ /\/$/){ | |
667 $output_dir =~ s/$/\//; | |
668 } | |
669 # warn "output directory is: $output_dir\n"; | |
670 | |
671 # changing back to the parent directory | |
672 chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n"; | |
673 | |
674 } | |
675 | |
676 } | |
677 else{ | |
678 $output_dir = ''; | |
679 } | |
680 | |
681 unless (defined $bedGraph_output){ | |
682 die "Please provide the name of the output file using the option -o/--output filename\n"; | |
683 } | |
684 | |
685 if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name | |
686 die "Please specify a file name without any path information (or use --dir if necessary)\n\n"; | |
687 } | |
688 | |
689 unless ($bedGraph_output =~ /\.gz$/){ | |
690 $bedGraph_output = "${bedGraph_output}.gz"; ### 22 07 2015: Output will be gzip compressed | |
691 } | |
692 | |
693 ### NO HEADER | |
694 unless ($no_header){ | |
695 $no_header = 0; | |
696 } | |
697 | |
698 ### remove white spaces in read ID (needed for sorting using the sort command | |
699 unless ($remove){ | |
700 $remove = 0; | |
701 } | |
702 | |
703 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT | |
704 if (defined $coverage_threshold){ | |
705 unless ($coverage_threshold > 0){ | |
706 die "Please select a coverage greater than 0 (positive integers only)\n"; | |
707 } | |
708 } | |
709 else{ | |
710 $coverage_threshold = 1; | |
711 } | |
712 | |
713 ### SORT buffer size | |
714 if (defined $sort_size){ | |
715 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ | |
716 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; | |
717 } | |
718 } | |
719 else{ | |
720 $sort_size = '2G'; | |
721 } | |
722 | |
723 unless ($CX_context){ | |
724 $CX_context = 0; | |
725 } | |
726 | |
727 unless ($counts){ | |
728 $counts = 1; | |
729 } | |
730 | |
731 if ($gazillion){ | |
732 if ($ample_mem){ | |
733 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; | |
734 } | |
735 } | |
736 | |
737 return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir); | |
738 } | |
739 | |
740 | |
741 sub print_helpfile{ | |
742 print <<EOF | |
743 | |
744 SYNOPSIS: | |
745 | |
746 This script uses positional methylation data generated by the Bismark methylation extractor to generate | |
747 a bedGraph file as well as a coverage file which are both sorted by chromosomal position. The bedGraph | |
748 file uses 0-based genomic start and 1-based genomic end coordinates and should be UCSC compatible (if | |
749 UCSC genomes were used for the alignment step). In addition this module will write out a coverage file | |
750 which is similar to the bedGraph file, but uses 1-based genomic coordinates and also reports the count | |
751 of methylated and unmethylated cytosines for any covered position; this coverage file is required if you | |
752 wish to generate a genome-wide cytosine report with the module coverage2cytosine. | |
753 | |
754 USAGE: bismark2bedGraph [options] -o <output> [methylation extractor input files] | |
755 | |
756 Methylation extractor input files: These files are required to start with CpG... in order for the | |
757 script to correctly work out the sequence context when using CpG context only (default). If all cytosine | |
758 contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s). | |
759 | |
760 | |
761 -o/--output <filename> Name of the output file, mandatory. | |
762 | |
763 --dir Output directory. Output is written to the current directory if not specified explicitly. | |
764 | |
765 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide | |
766 before its methylation percentage is reported. Default: 1. | |
767 | |
768 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | |
769 | |
770 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | |
771 in the experiment irrespective of its sequence context. This applies to both forward and | |
772 reverse strands. Please be aware that this option may generate large temporary and output files | |
773 and may take a long time to sort (up to many hours). Default: OFF. | |
774 (i.e. Default = CpG context only). | |
775 | |
776 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. | |
777 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or | |
778 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. | |
779 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. | |
780 Defaults to 2G. | |
781 | |
782 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of | |
783 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to | |
784 individual chromosome files. These errors were caused by the operating system's limit | |
785 of the number of filehandle that can be written to at any one time (typically 1024; to | |
786 find out this limit on Linux, type: ulimit -a). | |
787 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort | |
788 methylation calls into individual chromosome files. Instead, all input files are | |
789 temporarily merged into a single file (unless there is only a single file), and this | |
790 file will then be sorted by both chromosome AND position using the Unix sort command. | |
791 Please be aware that this option might take a looooong time to complete, depending on | |
792 the size of the input files, and the memory you allocate to this process (see --buffer_size). | |
793 Nevertheless, it seems to be working. | |
794 | |
795 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will | |
796 instead use two arrays to sort methylated and unmethylated calls, respectively. This may result | |
797 in a faster sorting process for very large files, but this comes at the cost of a larger memory | |
798 footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly | |
799 250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads | |
800 of creating and looping through arrays this option might in fact be *slower* for small-ish | |
801 files (up to a few million alignments). Note also that this option is not currently compatible | |
802 with options '--scaffolds/--gazillion'. | |
803 | |
804 --zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start | |
805 and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, | |
806 instead of using 1-based coordinates throughout. Default: OFF. | |
807 | |
808 | |
809 | |
810 The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords): | |
811 ============================================================================================== | |
812 | |
813 track type=bedGraph (header line) | |
814 | |
815 <chromosome> <start position> <end position> <methylation percentage> | |
816 | |
817 | |
818 | |
819 The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'): | |
820 ======================================================================================================================================= | |
821 | |
822 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> | |
823 | |
824 | |
825 Script last modified: 09 December 2015 | |
826 | |
827 EOF | |
828 ; | |
829 exit 1; | |
830 } |