comparison new/bismark_genome_preparation @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use Cwd;
5 # use File::Path qw(rmtree);
6 $|++;
7
8
9 ## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 use Getopt::Long;
25 use Cwd;
26
27 my $verbose;
28 my $help;
29 my $version;
30 my $man;
31 my $path_to_bowtie;
32 my $multi_fasta;
33 my $single_fasta;
34 my $bowtie2;
35
36 my $bismark_version = 'v0.14.3';
37
38 GetOptions ('verbose' => \$verbose,
39 'help' => \$help,
40 'man' => \$man,
41 'version' => \$version,
42 'path_to_bowtie:s' => \$path_to_bowtie,
43 'single_fasta' => \$single_fasta,
44 'bowtie2' => \$bowtie2,
45 );
46
47 if ($help or $man){
48 print_helpfile();
49 exit;
50 }
51
52 if ($version){
53 print << "VERSION";
54
55 Bismark - Bisulfite Mapper and Methylation Caller.
56
57 Bismark Genome Preparation Version: $bismark_version
58 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
59 www.bioinformatics.babraham.ac.uk/projects/
60
61 VERSION
62 exit;
63 }
64
65 my $genome_folder = shift @ARGV; # mandatory
66 my %chromosomes; # checking if chromosome names are unique (required)
67
68 # Ensuring a genome folder has been specified
69 if ($genome_folder){
70 unless ($genome_folder =~ /\/$/){
71 $genome_folder =~ s/$/\//;
72 }
73 $verbose and print "Path to genome folder specified as: $genome_folder\n";
74 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
75
76 # making the genome folder path abolsolute so it won't break if the path was specified relative
77 $genome_folder = getcwd;
78 unless ($genome_folder =~ /\/$/){
79 $genome_folder =~ s/$/\//;
80 }
81 }
82 else{
83 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
84 }
85
86
87 my $CT_dir;
88 my $GA_dir;
89
90
91 if ($single_fasta){
92 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
93 $multi_fasta = 0;
94 }
95 else{
96 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
97 $single_fasta = 0;
98 $multi_fasta = 1;
99 }
100
101 my @filenames = create_bisulfite_genome_folders();
102
103 process_sequence_files ();
104
105 launch_bowtie_indexer();
106
107 sub launch_bowtie_indexer{
108 if ($bowtie2){
109 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
110 }
111 else{
112 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
113 }
114 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
115 sleep(5);
116
117 ### if the path to bowtie was specfified explicitely
118 if ($path_to_bowtie){
119 if ($bowtie2){
120 $path_to_bowtie =~ s/$/bowtie2-build/;
121 }
122 else{
123 $path_to_bowtie =~ s/$/bowtie-build/;
124 }
125 }
126 ### otherwise we assume that bowtie-build is in the path
127 else{
128 if ($bowtie2){
129 $path_to_bowtie = 'bowtie2-build';
130 }
131 else{
132 $path_to_bowtie = 'bowtie-build';
133 }
134 }
135
136 $verbose and print "\n";
137
138 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
139 my $pid = fork();
140
141 # parent process
142 if ($pid){
143 sleep(1);
144 chdir $CT_dir or die "Unable to change directory: $!\n";
145 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
146 my @fasta_files = <*.fa>;
147 my $file_list = join (',',@fasta_files);
148 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
149 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
150
151 sleep (11);
152 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
153 }
154
155 # child process
156 elsif ($pid == 0){
157 sleep(2);
158 chdir $GA_dir or die "Unable to change directory: $!\n";
159 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
160 my @fasta_files = <*.fa>;
161 my $file_list = join (',',@fasta_files);
162 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
163 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
164 $verbose and print "(starting in 10 seconds)\n";
165 sleep(10);
166 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
167 }
168
169 # if the platform doesn't support the fork command we will run the indexing processes one after the other
170 else{
171 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
172 sleep(10);
173
174 ### moving to CT genome folder
175 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
176 chdir $CT_dir or die "Unable to change directory: $!\n";
177 my @fasta_files = <*.fa>;
178 my $file_list = join (',',@fasta_files);
179 $verbose and print "$file_list\n\n";
180 sleep(2);
181 system ("$path_to_bowtie","-f","$file_list","BS_CT");
182 @fasta_files=();
183 $file_list= '';
184
185 ### moving to GA genome folder
186 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
187 chdir $GA_dir or die "Unable to change directory: $!\n";
188 @fasta_files = <*.fa>;
189 $file_list = join (',',@fasta_files);
190 $verbose and print "$file_list\n\n";
191 sleep(2);
192 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
193 }
194 }
195
196
197 sub process_sequence_files {
198
199 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
200 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
201 sleep (3);
202
203 $verbose and print "conversions performed:\n";
204 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
205
206
207 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
208 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
209 ### This is now the default option
210
211 if ($multi_fasta){
212 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
213 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
214 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
215
216 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
217 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
218 }
219
220 foreach my $filename(@filenames){
221 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
222 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
223 # warn "Reading chromosome information from $filename\n\n";
224
225 ### first line needs to be a fastA header
226 my $first_line = <IN>;
227 chomp $first_line;
228
229 ### Extracting chromosome name from the FastA header
230 my $chromosome_name = extract_chromosome_name($first_line);
231
232 ### Exiting if a chromosome with the same name was present already
233 if (exists $chromosomes{$chromosome_name}){
234 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
235 }
236 else{
237 $chromosomes{$chromosome_name}++;
238 }
239
240 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
241 unless ($multi_fasta){
242 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
243 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
244 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
245
246 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
247 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
248 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
249 }
250
251 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
252 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
253
254
255 while (<IN>){
256
257 ### in case the line is a new fastA header
258 if ($_ =~ /^>/){
259 ### printing out the stats for the previous chromosome
260 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
261 ### resetting the chromosome transliteration counters
262 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
263
264 ### Extracting chromosome name from the additional FastA header
265 $chromosome_name = extract_chromosome_name($_);
266
267 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
268 unless ($multi_fasta){
269 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
270 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
271 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
272
273 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
274 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
275 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
276 }
277
278 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
279 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
280 }
281
282 else{
283 my $sequence = uc$_;
284
285 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
286
287 $sequence =~ s/[^ATCGN\n\r]/N/g;
288
289 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
290
291 my $CT_sequence = $sequence;
292 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
293 $total_CT_conversions += $CT_transliterations_performed;
294 $chromosome_CT_conversions += $CT_transliterations_performed;
295
296 print CT_CONVERT $CT_sequence;
297
298 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
299 ### complementing the forward strand and then C->T converting it)
300
301 my $GA_sequence = $sequence;
302 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
303 $total_GA_conversions += $GA_transliterations_performed;
304 $chromosome_GA_conversions += $GA_transliterations_performed;
305
306 print GA_CONVERT $GA_sequence;
307
308 }
309 }
310 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
311 }
312 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
313 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
314
315
316 print "\nTotal number of conversions performed:\n";
317 print "C->T:\t$total_CT_conversions\n";
318 print "G->A:\t$total_GA_conversions\n";
319
320 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
321 }
322
323 sub extract_chromosome_name {
324
325 my $header = shift;
326
327 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
328
329 if ($header =~ s/^>//){
330 my ($chromosome_name) = split (/\s+/,$header);
331 return $chromosome_name;
332 }
333 else{
334 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
335 }
336 }
337
338 sub create_bisulfite_genome_folders{
339
340 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
341
342 if ($path_to_bowtie){
343 unless ($path_to_bowtie =~ /\/$/){
344 $path_to_bowtie =~ s/$/\//;
345 }
346 if (chdir $path_to_bowtie){
347 if ($bowtie2){
348 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
349 }
350 else{
351 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
352 }
353 }
354 else{
355 die "There was an error with the path to bowtie: $!\n";
356 }
357 }
358
359 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
360
361
362 # Exiting unless there are fastA files in the folder
363 my @filenames = <*.fa>;
364
365 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
366 unless (@filenames){
367 @filenames = <*.fasta>;
368 }
369
370 unless (@filenames){
371 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
372 }
373
374 warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
375 sleep (3);
376
377 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
378 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
379 unless (-d $bisulfite_dir){
380 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
381 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
382 }
383 else{
384 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
385 sleep(5);
386 }
387
388 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
389 $CT_dir = "${bisulfite_dir}CT_conversion/";
390 $GA_dir = "${bisulfite_dir}GA_conversion/";
391
392 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
393 # converted version of the genome
394 unless (-d $CT_dir){
395 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
396 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
397 }
398 unless (-d $GA_dir){
399 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
400 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
401 }
402
403 # moving back to the original genome folder
404 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
405 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
406 warn "\nStep I - Prepare genome folders - completed\n\n\n";
407 return @filenames;
408 }
409
410 sub print_helpfile{
411 print << 'HOW_TO';
412
413
414 DESCRIPTION
415
416 This script is supposed to convert a specified reference genome into two different bisulfite
417 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
418 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
419 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
420 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
421 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
422 that the indexing process can take up to several hours; this will mainly depend on genome size
423 and system resources.
424
425
426
427 The following is a brief description of command line options and arguments to control the
428 Bismark Genome Preparation:
429
430
431 USAGE: bismark_genome_preparation [options] <arguments>
432
433
434 OPTIONS:
435
436 --help/--man Displays this help filea and exits.
437
438 --version Displays version information and exits.
439
440 --verbose Print verbose output for more details or debugging.
441
442 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
443 (depending on which aligner/indexer you intend to use). Unless this path
444 is specified it is assumed that Bowtie is in the PATH.
445
446 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
447
448 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
449 single-entry FastA files instead of making one multi-FastA file (MFA)
450 per chromosome. This might be useful if individual bisulfite converted
451 chromosomes are needed (e.g. for debugging), however it can cause a
452 problem with indexing if the number of chromosomes is vast (this is likely
453 to be in the range of several thousand files; the operating system can
454 only handle lists up to a certain length, and some newly assembled
455 genomes may contain 20000-50000 contigs of scaffold files which do exceed
456 this list length limit).
457
458
459 ARGUMENTS:
460
461 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
462 The Bismark Genome Preparation expects one or more fastA files in the folder
463 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
464
465
466 This script was last modified on 16 Oct 2014.
467 HOW_TO
468 }