comparison bismark_methyl_extractor/bismark2bedGraph @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 my $bismark2bedGraph_version = 'v0.16.3';
25
26 my @bedfiles;
27 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
28 my @sorting_files;
29
30 my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir) = process_commandline();
31
32 warn "Using these input files: @sorting_files\n";
33 warn "\nSummary of parameters for bismark2bedGraph conversion:\n";
34 warn '='x54,"\n";
35 warn "bedGraph output:\t\t$bedGraph_output\n";
36 warn "output directory:\t\t>$output_dir<\n";
37 if ($remove){
38 warn "remove whitespaces:\t\tyes\n";
39 }
40 else{
41 warn "remove whitespaces:\t\tno\n";
42 }
43 if ($CX_context){
44 warn "CX context:\t\t\tyes\n";
45 }
46 else{
47 warn "CX context:\t\t\tno (CpG context only, default)\n";
48 }
49 if ($no_header){
50 warn "No-header selected:\t\tyes\n";
51 }
52 else{
53 warn "No-header selected:\t\tno\n";
54 }
55
56 if ($ample_mem){
57 warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n";
58 }
59 else{
60 warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n";
61 }
62 unless($ample_mem){
63 warn "Sort buffer size:\t\t$sort_size\n";
64 }
65 warn "Coverage threshold:\t\t$coverage_threshold\n";
66
67
68 warn "="x77,"\n";
69 warn "Methylation information will now be written into a bedGraph and coverage file\n";
70 warn "="x77,"\n\n";
71 sleep (2);
72
73 ### deciding which files to use for bedGraph conversion
74 foreach my $filename (@sorting_files){
75
76 ### DETERMINING THE FULL PATH OF INPUT FILES
77 if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again
78 # warn "folder name: $1\nfilename: $2\n\n";
79 chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path
80 $input_dir = getcwd(); # this will always be the full path
81 $filename = $2;
82 # warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1);
83 chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back
84 }
85 else{
86 $input_dir = $parent_dir;
87 }
88 $input_dir .= '/';
89
90 if ($CX_context){
91 # push @bedfiles,$output_dir.$filename;
92 push @bedfiles,$input_dir.$filename;
93 }
94 else{ ## CpG context only (default)
95 if ($filename =~ /^CpG/){ # only testing the actual filename without the path information
96 push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename
97 }
98 else{
99 # skipping CHH or CHG files
100 }
101 }
102 }
103
104 if (@bedfiles){
105 warn "Using the following files as Input:\n";
106 print join ("\t",@bedfiles),"\n\n";
107 sleep (2);
108 }
109 else{
110 die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n";
111 }
112
113 open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n";
114 warn "Writing bedGraph to file: $bedGraph_output\n";
115 print OUT "track type=bedGraph\n";
116
117 my $coverage_output = $bedGraph_output;
118 unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){
119 $coverage_output =~ s/$/.bismark.cov.gz/;
120 }
121
122 open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n";
123 warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n";
124
125 if ($zero){
126 my $zero_coverage_output = $bedGraph_output;
127 unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){
128 $zero_coverage_output =~ s/$/.bismark.zero.cov/;
129 }
130
131 open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n";
132 warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n";
133 }
134 warn "\n";
135
136 my %temp_fhs;
137 my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
138 my %chr_lengths; # storing chromosome lenghts in '--ample_memory' mode
139
140 ### changing to the output directory
141 unless ($output_dir eq ''){ # default
142 chdir $output_dir or die "Failed to change directory to $output_dir\n";
143 warn "Changed directory to $output_dir\n";
144 }
145
146 if ($gazillion){
147 if (scalar @bedfiles == 1){
148 warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n";
149 }
150 else{
151 warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n";
152 my $merge = "$bedGraph_output.methylation_calls.merged";
153 open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n";
154 warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2);
155 push @temp_files, $merge;
156 }
157 }
158
159 foreach my $infile (@bedfiles) {
160
161 # warn "Processing $infile\n";
162
163 if ($remove) {
164 warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
165
166 if ($infile =~ /gz$/){
167 open (READ,"gunzip -c $infile |") or die $!;
168 }
169 else{
170 open (READ,$infile) or die $!;
171 }
172
173 my $removed_spaces_outfile = $infile;
174 $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
175 $removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename
176
177 warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n";
178 open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
179
180 unless ($no_header){
181 $_ = <READ>; ### Bismark version header
182 print REM $_; ### Bismark version header
183 }
184
185 while (<READ>) {
186 chomp;
187 my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
188 $id =~ s/\s+/_/g;
189 print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
190 }
191
192 close READ or die $!;
193 close REM or die $!;
194
195 ### changing the infile name to the new file without spaces
196 $infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly
197 }
198
199 # opening infile
200 if ($infile =~ /gz$/){
201 open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n";
202 }
203 else{
204 open (IN,$infile) or die "Couldn't find file '$infile': $!\n";
205 }
206
207 if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again
208 $infile =~ s/.*\///;# replacing everything up to the last slash in the filename
209 # warn "Renamed Infile: $infile\n";
210 }
211
212 ### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes conist of up to half a million scaffolds instead of ~23 chromosomes. This
213 ### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this
214 ### limit (some even say there is a reason for the OS doing so...).
215 ### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the
216 ### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster).
217
218 if ($gazillion){
219 # using all infiles instead of sorting
220 if (scalar @bedfiles == 1){
221 push @temp_files, $infile;
222 }
223 else{
224 ## always ignoring the version header
225 unless ($no_header){
226 $_ = <IN>; ### Bismark version header
227 }
228
229 while (<IN>) {
230 if ($_ =~ /^Bismark /){
231 warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n";
232 next;
233 }
234 print MERGE;
235 }
236 warn "Finished writing methylation calls from $infile to merged temp file\n";
237 }
238 }
239 else{
240 warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n";
241
242 ## always ignoring the version header
243 unless ($no_header){
244 $_ = <IN>; ### Bismark version header
245 }
246
247 while (<IN>) {
248 if ($_ =~ /^Bismark /){
249 warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n";
250 next;
251 }
252
253 chomp;
254
255 my ($chr,$pos) = (split (/\t/))[2,3];
256
257 ### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step
258 if ($ample_mem){
259 ### setting the first position for this chromosome
260 unless (defined $chr_lengths{$chr} ){
261 $chr_lengths{$chr} = $pos;
262 }
263 # for all subsequent postions for this chromosome
264 if ($pos > $chr_lengths{$chr} ){
265 $chr_lengths{$chr} = $pos; # set the current position as the new highest position
266 }
267 }
268
269 # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
270 $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names
271 # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
272 unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time
273
274 my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp';
275 # warn "Using temp file name: $temp_file_name\n"; sleep(1);
276
277 open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!";
278 push @temp_files, $temp_file_name; # storing temp files as we open them instead
279 }
280
281 print {$temp_fhs{$chr}} "$_\n";
282 }
283
284 warn "Finished writing out individual chromosome files for $infile\n";
285 }
286 }
287
288 # closing temporary filehandles to force writing out buffered content
289 foreach my $temp_fh(keys %temp_fhs){
290 close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n";
291 }
292
293 ### printing out the determined maximum position for each chromosome
294 if ($ample_mem){
295 foreach my $chr (sort keys %chr_lengths){
296 warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n";
297 }
298 warn "\n";
299 }
300
301 unless ($gazillion){
302 warn "\n";
303 warn "Collecting temporary chromosome file information... Processing the following input file(s):\n";
304 warn join ("\n",@temp_files),"\n\n";
305 sleep (1);
306 }
307
308 if ($gazillion){
309 if (scalar @bedfiles > 1){
310 close (MERGE) or die "Failed to close filehandle MERGE: $!\n";
311 }
312 }
313
314 foreach my $in (@temp_files) {
315
316 if ($sort_size){
317 warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem);
318 }
319
320 my $ifh;
321
322 my $name;
323 my $meth_state;
324 my $chr = "";
325 my $pos = 0;
326 my $meth_state2;
327
328 my $last_pos;
329 my $last_chr;
330
331 ### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command
332 if ($ample_mem){
333 # warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n";
334
335 my @meth_count;
336 my @unmeth_count;
337
338 open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n";
339
340 while (my $line = <$ifh>){
341 next if ($line =~ /^Bismark/);
342 chomp $line;
343
344 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
345
346 unless ($last_pos and $last_chr){
347 $last_chr = $chr;
348 $last_pos = $pos;
349 }
350 unless (@meth_count and @unmeth_count){
351 warn "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n";
352 @meth_count = (0) x $chr_lengths{$chr};
353 @unmeth_count = (0) x $chr_lengths{$chr};
354 # warn "length of array meth count: ",scalar @meth_count,"\n";
355 warn "Finished generating arrays\n";
356 # sleep(1);
357 }
358 # warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1);
359 # print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n";
360 # sleep(1);
361
362 # if ($last_chr ne $chr) {
363 # die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n";
364 # }
365
366 my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway
367 unless($validated){
368 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
369 next;
370 }
371
372 if ($meth_state eq '+'){
373 # warn "increasing meth $pos by 1\n"; sleep(1);
374 $meth_count[$pos-1]++;
375 }
376 else{
377 $unmeth_count[$pos-1]++;
378 # warn "increasing unmeth $pos by 1\n"; sleep(1);
379 }
380 }
381
382 close $ifh or die $!;
383
384 warn "Now printing methylation information for this chromosome\n";
385 # warn "length of array meth count: ",scalar @meth_count,"\n";
386 # warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n";
387 foreach my $index (0..$#meth_count){
388 my $totalcount = $meth_count[$index] + $unmeth_count[$index];
389 if ($totalcount > 0){
390 # warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n";
391 # sleep(1);
392
393 my $bed_pos = $index; ### bedGraph coordinates are 0 based
394 my $one_based_pos = $bed_pos + 1;
395
396 my $meth_percentage;
397 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef);
398
399 if (defined $meth_percentage){
400
401 # as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file
402
403 # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?)
404 print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
405
406 # this is the coverage file. Coordinates are 1-based
407 print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
408
409 # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
410 if ($zero){
411 print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
412 }
413
414 }
415 }
416 }
417
418 @meth_count = ();
419 @unmeth_count = ();
420
421 }
422 ### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint
423 else{
424 my $sort_dir = './'; # there has been a cd into the output_directory already
425 # my $sort_dir = $output_dir;
426 # if ($sort_dir eq ''){
427 # $sort_dir = './';
428 # }
429
430 if ($gazillion){
431 if ($in =~ /gz$/){
432 open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n";
433 }
434 else{
435 open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n";
436 }
437 ### Comment by Volker Brendel, Indiana University
438 ### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros). Omit the V, and things go very wrong in the tallying of reads."
439 }
440 else{
441 ### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway,
442 ### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase
443 if ($in =~ /gz$/){
444 open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n";
445 }
446 else{
447 open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n";
448 }
449 }
450
451 while (my $line = <$ifh>) {
452 next if ($line =~ /^Bismark/);
453 chomp $line;
454 #warn "full line:\n$line\n";
455 $last_chr = $chr;
456 $last_pos = $pos;
457 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
458 #warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1);
459 if (($last_pos ne $pos) || ($last_chr ne $chr)) {
460 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
461 @methylcalls = qw (0 0 0);
462 }
463
464 my $validated = validate_methylation_call($meth_state, $meth_state2);
465 unless($validated){
466 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
467 next;
468 }
469 if ($meth_state eq "+") {
470 $methylcalls[0]++;
471 $methylcalls[2]++;
472 } else {
473 $methylcalls[1]++;
474 $methylcalls[2]++;
475 }
476 }
477
478
479 $last_chr = $chr;
480 $last_pos = $pos;
481 if ($methylcalls[2] > 0) {
482 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
483 }
484
485 close $ifh or die $!;
486
487 @methylcalls = qw (0 0 0); # resetting @methylcalls
488
489 }
490
491 ### deleting temporary files (only needed if --gazillion hasn't been specified
492 if ($gazillion and scalar @bedfiles == 1){
493 # if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed
494 }
495 else{
496 my $delete = unlink $in;
497 if ($delete) {
498 warn "Successfully deleted the temporary input file $in\n\n";
499 }
500 else {
501 warn "The temporary inputfile $in could not be deleted $!\n\n";
502 }
503 }
504 }
505
506 close OUT or die $!;
507 close COVERAGE or die $!;
508 if ($zero){
509 close ZEROCOVERAGE or die $!;
510 }
511
512 exit 0;
513
514
515
516 sub validate_methylation_call{
517 my $meth_state = shift;
518 croak "Missing (+/-) methylation call" unless defined $meth_state;
519 my $meth_state2 = shift;
520 croak "Missing alphabetical methylation call" unless defined $meth_state2;
521 my $is_consistent;
522 ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
523 : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
524 return 1 if $is_consistent;
525 return 0;
526 }
527
528 sub check_CpG_methylation_call{
529 my $meth1 = shift;
530 my $meth2 = shift;
531 return 1 if($meth1 eq "+" && $meth2 eq "Z");
532 return 1 if($meth1 eq "-" && $meth2 eq "z");
533 return 0;
534 }
535
536 sub check_nonCpG_methylation_call{
537 my $meth1 = shift;
538 my $meth2 = shift;
539 return 1 if($meth1 eq "+" && $meth2 eq "C");
540 return 1 if($meth1 eq "+" && $meth2 eq "X");
541 return 1 if($meth1 eq "+" && $meth2 eq "H");
542 return 1 if($meth1 eq "-" && $meth2 eq "c");
543 return 1 if($meth1 eq "-" && $meth2 eq "x");
544 return 1 if($meth1 eq "-" && $meth2 eq "h");
545 return 0;
546 }
547
548 sub generate_output{
549 my $methcount = $methylcalls[0];
550 my $nonmethcount = $methylcalls[1];
551 my $totalcount = $methylcalls[2];
552 my $last_chr = shift;
553 my $last_pos = shift;
554 croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0);
555 croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) );
556
557 my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
558 my $meth_percentage;
559 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
560 # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
561 if (defined $meth_percentage){
562
563 # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?)
564 my $one_based_pos = $bed_pos + 1;
565 print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
566
567 # this is the coverage file. Coordinates are 1-based
568 print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
569
570 # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
571 if ($zero){
572 print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
573 }
574 }
575
576 }
577
578 sub process_commandline{
579 my $help;
580 my $output_dir;
581 my $bedGraph_output;
582 my $no_header;
583 my $coverage_threshold; # Minimum number of reads covering before calling methylation status
584 my $remove;
585 my $counts;
586 my $CX_context;
587 my $sort_size;
588 my $version;
589 my $gazillion;
590 my $ample_mem;
591 my $zero;
592 my $input_dir;
593
594 my $command_line = GetOptions ('help|man' => \$help,
595 'dir=s' => \$output_dir,
596 'o|output=s' => \$bedGraph_output,
597 'no_header' => \$no_header,
598 "cutoff=i" => \$coverage_threshold,
599 "remove_spaces" => \$remove,
600 "counts" => \$counts,
601 "CX|CX_context" => \$CX_context,
602 "buffer_size=s" => \$sort_size,
603 'version' => \$version,
604 'gazillion|scaffolds' => \$gazillion,
605 'ample_memory' => \$ample_mem,
606 "zero_based" => \$zero,
607 );
608
609 ### EXIT ON ERROR if there were errors with any of the supplied options
610 unless ($command_line){
611 die "Please respecify command line options\n";
612 }
613
614 ### HELPFILE
615 if ($help){
616 print_helpfile();
617 exit;
618 }
619
620 if ($version){
621 print << "VERSION";
622
623
624 Bismark Methylation Extractor Module -
625 bismark2bedGraph
626
627 Bismark Extractor Version: $bismark2bedGraph_version
628 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
629 www.bioinformatics.babraham.ac.uk/projects/bismark/
630
631
632 VERSION
633 exit;
634 }
635
636 @sorting_files = @ARGV;
637
638 ### no files provided
639 unless (@sorting_files){
640 warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n";
641 sleep(2);
642
643 print_helpfile();
644 exit;
645 }
646
647 ### PARENT DIRECTORY
648 my $parent_dir = getcwd();
649 # warn "parent directory is: $parent_dir\n";
650
651 ### OUTPUT DIR PATH
652 if (defined $output_dir){
653
654 unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
655 unless ($output_dir =~ /\/$/){
656 $output_dir =~ s/$/\//;
657 }
658 unless (-d $output_dir){
659 mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n";
660 warn "Created output directory $output_dir\n";
661 }
662
663 ### want to get an absolute path for the output directory instead of a relative one
664 chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n";
665 $output_dir = getcwd();
666 unless ($output_dir =~ /\/$/){
667 $output_dir =~ s/$/\//;
668 }
669 # warn "output directory is: $output_dir\n";
670
671 # changing back to the parent directory
672 chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n";
673
674 }
675
676 }
677 else{
678 $output_dir = '';
679 }
680
681 unless (defined $bedGraph_output){
682 die "Please provide the name of the output file using the option -o/--output filename\n";
683 }
684
685 if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name
686 die "Please specify a file name without any path information (or use --dir if necessary)\n\n";
687 }
688
689 unless ($bedGraph_output =~ /\.gz$/){
690 $bedGraph_output = "${bedGraph_output}.gz"; ### 22 07 2015: Output will be gzip compressed
691 }
692
693 ### NO HEADER
694 unless ($no_header){
695 $no_header = 0;
696 }
697
698 ### remove white spaces in read ID (needed for sorting using the sort command
699 unless ($remove){
700 $remove = 0;
701 }
702
703 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
704 if (defined $coverage_threshold){
705 unless ($coverage_threshold > 0){
706 die "Please select a coverage greater than 0 (positive integers only)\n";
707 }
708 }
709 else{
710 $coverage_threshold = 1;
711 }
712
713 ### SORT buffer size
714 if (defined $sort_size){
715 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
716 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
717 }
718 }
719 else{
720 $sort_size = '2G';
721 }
722
723 unless ($CX_context){
724 $CX_context = 0;
725 }
726
727 unless ($counts){
728 $counts = 1;
729 }
730
731 if ($gazillion){
732 if ($ample_mem){
733 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
734 }
735 }
736
737 return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir);
738 }
739
740
741 sub print_helpfile{
742 print <<EOF
743
744 SYNOPSIS:
745
746 This script uses positional methylation data generated by the Bismark methylation extractor to generate
747 a bedGraph file as well as a coverage file which are both sorted by chromosomal position. The bedGraph
748 file uses 0-based genomic start and 1-based genomic end coordinates and should be UCSC compatible (if
749 UCSC genomes were used for the alignment step). In addition this module will write out a coverage file
750 which is similar to the bedGraph file, but uses 1-based genomic coordinates and also reports the count
751 of methylated and unmethylated cytosines for any covered position; this coverage file is required if you
752 wish to generate a genome-wide cytosine report with the module coverage2cytosine.
753
754 USAGE: bismark2bedGraph [options] -o <output> [methylation extractor input files]
755
756 Methylation extractor input files: These files are required to start with CpG... in order for the
757 script to correctly work out the sequence context when using CpG context only (default). If all cytosine
758 contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s).
759
760
761 -o/--output <filename> Name of the output file, mandatory.
762
763 --dir Output directory. Output is written to the current directory if not specified explicitly.
764
765 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
766 before its methylation percentage is reported. Default: 1.
767
768 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
769
770 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
771 in the experiment irrespective of its sequence context. This applies to both forward and
772 reverse strands. Please be aware that this option may generate large temporary and output files
773 and may take a long time to sort (up to many hours). Default: OFF.
774 (i.e. Default = CpG context only).
775
776 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
777 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
778 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
779 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
780 Defaults to 2G.
781
782 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
783 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
784 individual chromosome files. These errors were caused by the operating system's limit
785 of the number of filehandle that can be written to at any one time (typically 1024; to
786 find out this limit on Linux, type: ulimit -a).
787 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
788 methylation calls into individual chromosome files. Instead, all input files are
789 temporarily merged into a single file (unless there is only a single file), and this
790 file will then be sorted by both chromosome AND position using the Unix sort command.
791 Please be aware that this option might take a looooong time to complete, depending on
792 the size of the input files, and the memory you allocate to this process (see --buffer_size).
793 Nevertheless, it seems to be working.
794
795 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
796 instead use two arrays to sort methylated and unmethylated calls, respectively. This may result
797 in a faster sorting process for very large files, but this comes at the cost of a larger memory
798 footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly
799 250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads
800 of creating and looping through arrays this option might in fact be *slower* for small-ish
801 files (up to a few million alignments). Note also that this option is not currently compatible
802 with options '--scaffolds/--gazillion'.
803
804 --zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
805 and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
806 instead of using 1-based coordinates throughout. Default: OFF.
807
808
809
810 The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords):
811 ==============================================================================================
812
813 track type=bedGraph (header line)
814
815 <chromosome> <start position> <end position> <methylation percentage>
816
817
818
819 The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'):
820 =======================================================================================================================================
821
822 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
823
824
825 Script last modified: 09 December 2015
826
827 EOF
828 ;
829 exit 1;
830 }