comparison old/bismark_genome_preparation @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use Cwd;
5 use File::Path qw(rmtree);
6 $|++;
7
8
9 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 use Getopt::Long;
25 use Cwd;
26
27 my $verbose;
28 my $help;
29 my $version;
30 my $man;
31 my $path_to_bowtie;
32 my $multi_fasta;
33 my $single_fasta;
34 my $bowtie2;
35
36 my $bismark_version = 'v0.10.0';
37
38 GetOptions ('verbose' => \$verbose,
39 'help' => \$help,
40 'man' => \$man,
41 'version' => \$version,
42 'path_to_bowtie:s' => \$path_to_bowtie,
43 'single_fasta' => \$single_fasta,
44 'bowtie2' => \$bowtie2,
45 );
46
47 if ($help or $man){
48 print_helpfile();
49 exit;
50 }
51
52 if ($version){
53 print << "VERSION";
54
55 Bismark - Bisulfite Mapper and Methylation Caller.
56
57 Bismark Genome Preparation Version: $bismark_version
58 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
59 www.bioinformatics.babraham.ac.uk/projects/
60
61 VERSION
62 exit;
63 }
64
65 my $genome_folder = shift @ARGV; # mandatory
66
67 # Ensuring a genome folder has been specified
68 if ($genome_folder){
69 unless ($genome_folder =~ /\/$/){
70 $genome_folder =~ s/$/\//;
71 }
72 $verbose and print "Path to genome folder specified as: $genome_folder\n";
73 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
74
75 # making the genome folder path abolsolute so it won't break if the path was specified relative
76 $genome_folder = getcwd;
77 unless ($genome_folder =~ /\/$/){
78 $genome_folder =~ s/$/\//;
79 }
80 }
81 else{
82 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
83 }
84
85
86 my $CT_dir;
87 my $GA_dir;
88
89
90 if ($single_fasta){
91 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
92 $multi_fasta = 0;
93 }
94 else{
95 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
96 $single_fasta = 0;
97 $multi_fasta = 1;
98 }
99
100 my @filenames = create_bisulfite_genome_folders();
101
102 process_sequence_files ();
103
104 launch_bowtie_indexer();
105
106 sub launch_bowtie_indexer{
107 if ($bowtie2){
108 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
109 }
110 else{
111 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
112 }
113 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
114 sleep(5);
115
116 ### if the path to bowtie was specfified explicitely
117 if ($path_to_bowtie){
118 if ($bowtie2){
119 $path_to_bowtie =~ s/$/bowtie2-build/;
120 }
121 else{
122 $path_to_bowtie =~ s/$/bowtie-build/;
123 }
124 }
125 ### otherwise we assume that bowtie-build is in the path
126 else{
127 if ($bowtie2){
128 $path_to_bowtie = 'bowtie2-build';
129 }
130 else{
131 $path_to_bowtie = 'bowtie-build';
132 }
133 }
134
135 $verbose and print "\n";
136
137 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
138 my $pid = fork();
139
140 # parent process
141 if ($pid){
142 sleep(1);
143 chdir $CT_dir or die "Unable to change directory: $!\n";
144 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
145 my @fasta_files = <*.fa>;
146 my $file_list = join (',',@fasta_files);
147 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
148 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
149
150 sleep (11);
151 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
152 }
153
154 # child process
155 elsif ($pid == 0){
156 sleep(2);
157 chdir $GA_dir or die "Unable to change directory: $!\n";
158 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
159 my @fasta_files = <*.fa>;
160 my $file_list = join (',',@fasta_files);
161 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
162 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
163 $verbose and print "(starting in 10 seconds)\n";
164 sleep(10);
165 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
166 }
167
168 # if the platform doesn't support the fork command we will run the indexing processes one after the other
169 else{
170 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
171 sleep(10);
172
173 ### moving to CT genome folder
174 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
175 chdir $CT_dir or die "Unable to change directory: $!\n";
176 my @fasta_files = <*.fa>;
177 my $file_list = join (',',@fasta_files);
178 $verbose and print "$file_list\n\n";
179 sleep(2);
180 system ("$path_to_bowtie","-f","$file_list","BS_CT");
181 @fasta_files=();
182 $file_list= '';
183
184 ### moving to GA genome folder
185 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
186 chdir $GA_dir or die "Unable to change directory: $!\n";
187 @fasta_files = <*.fa>;
188 $file_list = join (',',@fasta_files);
189 $verbose and print "$file_list\n\n";
190 sleep(2);
191 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
192 }
193 }
194
195
196 sub process_sequence_files {
197
198 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
199 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
200 sleep (3);
201
202 $verbose and print "conversions performed:\n";
203 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
204
205
206 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
207 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
208 ### This is now the default option
209
210 if ($multi_fasta){
211 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
212 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
213 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
214
215 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
216 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
217 }
218
219 foreach my $filename(@filenames){
220 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
221 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
222 # warn "Reading chromosome information from $filename\n\n";
223
224 ### first line needs to be a fastA header
225 my $first_line = <IN>;
226 chomp $first_line;
227
228 ### Extracting chromosome name from the FastA header
229 my $chromosome_name = extract_chromosome_name($first_line);
230
231 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
232 unless ($multi_fasta){
233 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
234 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
235 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
236
237 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
238 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
239 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
240 }
241
242 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
243 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
244
245
246 while (<IN>){
247
248 ### in case the line is a new fastA header
249 if ($_ =~ /^>/){
250 ### printing out the stats for the previous chromosome
251 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
252 ### resetting the chromosome transliteration counters
253 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
254
255 ### Extracting chromosome name from the additional FastA header
256 $chromosome_name = extract_chromosome_name($_);
257
258 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
259 unless ($multi_fasta){
260 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
261 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
262 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
263
264 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
265 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
266 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
267 }
268
269 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
270 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
271 }
272
273 else{
274 my $sequence = uc$_;
275
276 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
277
278 $sequence =~ s/[^ATCGN\n\r]/N/g;
279
280 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
281
282 my $CT_sequence = $sequence;
283 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
284 $total_CT_conversions += $CT_transliterations_performed;
285 $chromosome_CT_conversions += $CT_transliterations_performed;
286
287 print CT_CONVERT $CT_sequence;
288
289 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
290 ### complementing the forward strand and then C->T converting it)
291
292 my $GA_sequence = $sequence;
293 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
294 $total_GA_conversions += $GA_transliterations_performed;
295 $chromosome_GA_conversions += $GA_transliterations_performed;
296
297 print GA_CONVERT $GA_sequence;
298
299 }
300 }
301 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
302 }
303 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
304 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
305
306
307 print "\nTotal number of conversions performed:\n";
308 print "C->T:\t$total_CT_conversions\n";
309 print "G->A:\t$total_GA_conversions\n";
310
311 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
312 }
313
314 sub extract_chromosome_name {
315
316 my $header = shift;
317
318 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
319
320 if ($header =~ s/^>//){
321 my ($chromosome_name) = split (/\s+/,$header);
322 return $chromosome_name;
323 }
324 else{
325 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
326 }
327 }
328
329 sub create_bisulfite_genome_folders{
330
331 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
332
333 if ($path_to_bowtie){
334 unless ($path_to_bowtie =~ /\/$/){
335 $path_to_bowtie =~ s/$/\//;
336 }
337 if (chdir $path_to_bowtie){
338 if ($bowtie2){
339 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
340 }
341 else{
342 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
343 }
344 }
345 else{
346 die "There was an error with the path to bowtie: $!\n";
347 }
348 }
349
350 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
351
352
353 # Exiting unless there are fastA files in the folder
354 my @filenames = <*.fa>;
355
356 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
357 unless (@filenames){
358 @filenames = <*.fasta>;
359 }
360
361 unless (@filenames){
362 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
363 }
364
365 warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
366 sleep (3);
367
368 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
369 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
370 unless (-d $bisulfite_dir){
371 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
372 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
373 }
374 else{
375 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
376 sleep(5);
377 }
378
379 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
380 $CT_dir = "${bisulfite_dir}CT_conversion/";
381 $GA_dir = "${bisulfite_dir}GA_conversion/";
382
383 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
384 # converted version of the genome
385 unless (-d $CT_dir){
386 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
387 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
388 }
389 unless (-d $GA_dir){
390 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
391 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
392 }
393
394 # moving back to the original genome folder
395 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
396 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
397 warn "\nStep I - Prepare genome folders - completed\n\n\n";
398 return @filenames;
399 }
400
401 sub print_helpfile{
402 print << 'HOW_TO';
403
404
405 DESCRIPTION
406
407 This script is supposed to convert a specified reference genome into two different bisulfite
408 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
409 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
410 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
411 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
412 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
413 that the indexing process can take up to several hours; this will mainly depend on genome size
414 and system resources.
415
416
417
418 The following is a brief description of command line options and arguments to control the
419 Bismark Genome Preparation:
420
421
422 USAGE: bismark_genome_preparation [options] <arguments>
423
424
425 OPTIONS:
426
427 --help/--man Displays this help filea and exits.
428
429 --version Displays version information and exits.
430
431 --verbose Print verbose output for more details or debugging.
432
433 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
434 (depending on which aligner/indexer you intend to use). Unless this path
435 is specified it is assumed that Bowtie is in the PATH.
436
437 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
438
439 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
440 single-entry FastA files instead of making one multi-FastA file (MFA)
441 per chromosome. This might be useful if individual bisulfite converted
442 chromosomes are needed (e.g. for debugging), however it can cause a
443 problem with indexing if the number of chromosomes is vast (this is likely
444 to be in the range of several thousand files; the operating system can
445 only handle lists up to a certain length, and some newly assembled
446 genomes may contain 20000-50000 contigs of scaffold files which do exceed
447 this list length limit).
448
449
450 ARGUMENTS:
451
452 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
453 The Bismark Genome Preparation expects one or more fastA files in the folder
454 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
455
456
457 This script was last modified on 19 Sept 2013.
458 HOW_TO
459 }