0
|
1 #!/usr/bin/perl --
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Cwd;
|
|
5 use File::Path qw(rmtree);
|
|
6 $|++;
|
|
7
|
|
8
|
|
9 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
|
|
10
|
|
11 ## This program is free software: you can redistribute it and/or modify
|
|
12 ## it under the terms of the GNU General Public License as published by
|
|
13 ## the Free Software Foundation, either version 3 of the License, or
|
|
14 ## (at your option) any later version.
|
|
15
|
|
16 ## This program is distributed in the hope that it will be useful,
|
|
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19 ## GNU General Public License for more details.
|
|
20
|
|
21 ## You should have received a copy of the GNU General Public License
|
|
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
23
|
|
24 use Getopt::Long;
|
|
25 use Cwd;
|
|
26
|
|
27 my $verbose;
|
|
28 my $help;
|
|
29 my $version;
|
|
30 my $man;
|
|
31 my $path_to_bowtie;
|
|
32 my $multi_fasta;
|
|
33 my $single_fasta;
|
|
34 my $bowtie2;
|
|
35
|
3
|
36 my $bismark_version = 'v0.10.0';
|
0
|
37
|
|
38 GetOptions ('verbose' => \$verbose,
|
|
39 'help' => \$help,
|
|
40 'man' => \$man,
|
|
41 'version' => \$version,
|
|
42 'path_to_bowtie:s' => \$path_to_bowtie,
|
|
43 'single_fasta' => \$single_fasta,
|
|
44 'bowtie2' => \$bowtie2,
|
|
45 );
|
|
46
|
|
47 if ($help or $man){
|
|
48 print_helpfile();
|
|
49 exit;
|
|
50 }
|
|
51
|
|
52 if ($version){
|
|
53 print << "VERSION";
|
|
54
|
|
55 Bismark - Bisulfite Mapper and Methylation Caller.
|
|
56
|
|
57 Bismark Genome Preparation Version: $bismark_version
|
|
58 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
|
|
59 www.bioinformatics.babraham.ac.uk/projects/
|
|
60
|
|
61 VERSION
|
|
62 exit;
|
|
63 }
|
|
64
|
3
|
65 my $genome_folder = shift @ARGV; # mandatory
|
|
66
|
|
67 # Ensuring a genome folder has been specified
|
|
68 if ($genome_folder){
|
|
69 unless ($genome_folder =~ /\/$/){
|
|
70 $genome_folder =~ s/$/\//;
|
|
71 }
|
|
72 $verbose and print "Path to genome folder specified as: $genome_folder\n";
|
|
73 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
|
|
74
|
|
75 # making the genome folder path abolsolute so it won't break if the path was specified relative
|
|
76 $genome_folder = getcwd;
|
|
77 unless ($genome_folder =~ /\/$/){
|
|
78 $genome_folder =~ s/$/\//;
|
|
79 }
|
|
80 }
|
|
81 else{
|
|
82 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
|
|
83 }
|
|
84
|
|
85
|
|
86 my $CT_dir;
|
|
87 my $GA_dir;
|
|
88
|
|
89
|
0
|
90 if ($single_fasta){
|
|
91 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
|
|
92 $multi_fasta = 0;
|
|
93 }
|
|
94 else{
|
|
95 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
|
|
96 $single_fasta = 0;
|
|
97 $multi_fasta = 1;
|
|
98 }
|
|
99
|
|
100 my @filenames = create_bisulfite_genome_folders();
|
|
101
|
|
102 process_sequence_files ();
|
|
103
|
|
104 launch_bowtie_indexer();
|
|
105
|
|
106 sub launch_bowtie_indexer{
|
|
107 if ($bowtie2){
|
|
108 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
|
|
109 }
|
|
110 else{
|
|
111 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
|
|
112 }
|
|
113 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
|
|
114 sleep(5);
|
|
115
|
|
116 ### if the path to bowtie was specfified explicitely
|
|
117 if ($path_to_bowtie){
|
|
118 if ($bowtie2){
|
|
119 $path_to_bowtie =~ s/$/bowtie2-build/;
|
|
120 }
|
|
121 else{
|
|
122 $path_to_bowtie =~ s/$/bowtie-build/;
|
|
123 }
|
|
124 }
|
|
125 ### otherwise we assume that bowtie-build is in the path
|
|
126 else{
|
|
127 if ($bowtie2){
|
|
128 $path_to_bowtie = 'bowtie2-build';
|
|
129 }
|
|
130 else{
|
|
131 $path_to_bowtie = 'bowtie-build';
|
|
132 }
|
|
133 }
|
|
134
|
|
135 $verbose and print "\n";
|
|
136
|
|
137 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
|
|
138 my $pid = fork();
|
|
139
|
|
140 # parent process
|
|
141 if ($pid){
|
|
142 sleep(1);
|
|
143 chdir $CT_dir or die "Unable to change directory: $!\n";
|
|
144 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
|
|
145 my @fasta_files = <*.fa>;
|
|
146 my $file_list = join (',',@fasta_files);
|
|
147 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
|
|
148 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
|
|
149
|
|
150 sleep (11);
|
|
151 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
|
|
152 }
|
|
153
|
|
154 # child process
|
|
155 elsif ($pid == 0){
|
|
156 sleep(2);
|
|
157 chdir $GA_dir or die "Unable to change directory: $!\n";
|
|
158 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
|
|
159 my @fasta_files = <*.fa>;
|
|
160 my $file_list = join (',',@fasta_files);
|
|
161 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
|
|
162 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
|
|
163 $verbose and print "(starting in 10 seconds)\n";
|
|
164 sleep(10);
|
|
165 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
|
|
166 }
|
|
167
|
|
168 # if the platform doesn't support the fork command we will run the indexing processes one after the other
|
|
169 else{
|
|
170 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
|
|
171 sleep(10);
|
|
172
|
|
173 ### moving to CT genome folder
|
|
174 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
|
|
175 chdir $CT_dir or die "Unable to change directory: $!\n";
|
|
176 my @fasta_files = <*.fa>;
|
|
177 my $file_list = join (',',@fasta_files);
|
|
178 $verbose and print "$file_list\n\n";
|
|
179 sleep(2);
|
|
180 system ("$path_to_bowtie","-f","$file_list","BS_CT");
|
|
181 @fasta_files=();
|
|
182 $file_list= '';
|
|
183
|
|
184 ### moving to GA genome folder
|
|
185 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
|
|
186 chdir $GA_dir or die "Unable to change directory: $!\n";
|
|
187 @fasta_files = <*.fa>;
|
|
188 $file_list = join (',',@fasta_files);
|
|
189 $verbose and print "$file_list\n\n";
|
|
190 sleep(2);
|
|
191 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
|
|
192 }
|
|
193 }
|
|
194
|
|
195
|
|
196 sub process_sequence_files {
|
|
197
|
|
198 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
|
|
199 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
|
|
200 sleep (3);
|
|
201
|
|
202 $verbose and print "conversions performed:\n";
|
|
203 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
|
|
204
|
|
205
|
|
206 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
|
|
207 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
|
|
208 ### This is now the default option
|
|
209
|
|
210 if ($multi_fasta){
|
|
211 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
|
|
212 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
|
|
213 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
|
|
214
|
|
215 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
|
|
216 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
|
|
217 }
|
|
218
|
|
219 foreach my $filename(@filenames){
|
|
220 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
|
|
221 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
|
|
222 # warn "Reading chromosome information from $filename\n\n";
|
|
223
|
|
224 ### first line needs to be a fastA header
|
|
225 my $first_line = <IN>;
|
|
226 chomp $first_line;
|
|
227
|
|
228 ### Extracting chromosome name from the FastA header
|
|
229 my $chromosome_name = extract_chromosome_name($first_line);
|
|
230
|
|
231 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
|
|
232 unless ($multi_fasta){
|
|
233 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
|
|
234 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
|
|
235 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
|
|
236
|
|
237 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
|
|
238 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
|
|
239 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
|
|
240 }
|
|
241
|
|
242 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
|
|
243 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
|
|
244
|
|
245
|
|
246 while (<IN>){
|
|
247
|
|
248 ### in case the line is a new fastA header
|
|
249 if ($_ =~ /^>/){
|
|
250 ### printing out the stats for the previous chromosome
|
|
251 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
|
|
252 ### resetting the chromosome transliteration counters
|
|
253 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
|
|
254
|
|
255 ### Extracting chromosome name from the additional FastA header
|
|
256 $chromosome_name = extract_chromosome_name($_);
|
|
257
|
|
258 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
|
|
259 unless ($multi_fasta){
|
|
260 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
|
|
261 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
|
|
262 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
|
|
263
|
|
264 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
|
|
265 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
|
|
266 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
|
|
267 }
|
|
268
|
|
269 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
|
|
270 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
|
|
271 }
|
|
272
|
|
273 else{
|
|
274 my $sequence = uc$_;
|
|
275
|
|
276 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
|
|
277
|
|
278 $sequence =~ s/[^ATCGN\n\r]/N/g;
|
|
279
|
|
280 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
|
|
281
|
|
282 my $CT_sequence = $sequence;
|
|
283 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
|
|
284 $total_CT_conversions += $CT_transliterations_performed;
|
|
285 $chromosome_CT_conversions += $CT_transliterations_performed;
|
|
286
|
|
287 print CT_CONVERT $CT_sequence;
|
|
288
|
|
289 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
|
|
290 ### complementing the forward strand and then C->T converting it)
|
|
291
|
|
292 my $GA_sequence = $sequence;
|
|
293 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
|
|
294 $total_GA_conversions += $GA_transliterations_performed;
|
|
295 $chromosome_GA_conversions += $GA_transliterations_performed;
|
|
296
|
|
297 print GA_CONVERT $GA_sequence;
|
|
298
|
|
299 }
|
|
300 }
|
|
301 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
|
|
302 }
|
|
303 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
|
|
304 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
|
|
305
|
|
306
|
|
307 print "\nTotal number of conversions performed:\n";
|
|
308 print "C->T:\t$total_CT_conversions\n";
|
|
309 print "G->A:\t$total_GA_conversions\n";
|
|
310
|
|
311 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
|
|
312 }
|
|
313
|
|
314 sub extract_chromosome_name {
|
|
315
|
|
316 my $header = shift;
|
|
317
|
|
318 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
|
|
319
|
|
320 if ($header =~ s/^>//){
|
|
321 my ($chromosome_name) = split (/\s+/,$header);
|
|
322 return $chromosome_name;
|
|
323 }
|
|
324 else{
|
|
325 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
|
|
326 }
|
|
327 }
|
|
328
|
|
329 sub create_bisulfite_genome_folders{
|
|
330
|
|
331 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
|
|
332
|
|
333 if ($path_to_bowtie){
|
|
334 unless ($path_to_bowtie =~ /\/$/){
|
|
335 $path_to_bowtie =~ s/$/\//;
|
|
336 }
|
|
337 if (chdir $path_to_bowtie){
|
|
338 if ($bowtie2){
|
|
339 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
|
|
340 }
|
|
341 else{
|
|
342 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
|
|
343 }
|
|
344 }
|
|
345 else{
|
|
346 die "There was an error with the path to bowtie: $!\n";
|
|
347 }
|
|
348 }
|
|
349
|
|
350 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
|
|
351
|
|
352
|
|
353 # Exiting unless there are fastA files in the folder
|
|
354 my @filenames = <*.fa>;
|
|
355
|
|
356 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
|
|
357 unless (@filenames){
|
|
358 @filenames = <*.fasta>;
|
|
359 }
|
|
360
|
|
361 unless (@filenames){
|
|
362 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
|
|
363 }
|
|
364
|
3
|
365 warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
|
0
|
366 sleep (3);
|
|
367
|
|
368 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
|
|
369 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
|
|
370 unless (-d $bisulfite_dir){
|
|
371 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
|
|
372 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
|
|
373 }
|
|
374 else{
|
3
|
375 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
|
|
376 sleep(5);
|
0
|
377 }
|
|
378
|
|
379 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
|
|
380 $CT_dir = "${bisulfite_dir}CT_conversion/";
|
|
381 $GA_dir = "${bisulfite_dir}GA_conversion/";
|
|
382
|
|
383 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
|
|
384 # converted version of the genome
|
|
385 unless (-d $CT_dir){
|
|
386 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
|
|
387 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
|
|
388 }
|
|
389 unless (-d $GA_dir){
|
|
390 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
|
|
391 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
|
|
392 }
|
|
393
|
|
394 # moving back to the original genome folder
|
|
395 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
|
|
396 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
|
|
397 warn "\nStep I - Prepare genome folders - completed\n\n\n";
|
|
398 return @filenames;
|
|
399 }
|
|
400
|
|
401 sub print_helpfile{
|
|
402 print << 'HOW_TO';
|
|
403
|
|
404
|
|
405 DESCRIPTION
|
|
406
|
|
407 This script is supposed to convert a specified reference genome into two different bisulfite
|
|
408 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
|
|
409 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
|
|
410 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
|
|
411 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
|
3
|
412 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
|
0
|
413 that the indexing process can take up to several hours; this will mainly depend on genome size
|
|
414 and system resources.
|
|
415
|
|
416
|
|
417
|
|
418 The following is a brief description of command line options and arguments to control the
|
3
|
419 Bismark Genome Preparation:
|
0
|
420
|
|
421
|
|
422 USAGE: bismark_genome_preparation [options] <arguments>
|
|
423
|
|
424
|
|
425 OPTIONS:
|
|
426
|
|
427 --help/--man Displays this help filea and exits.
|
|
428
|
|
429 --version Displays version information and exits.
|
|
430
|
|
431 --verbose Print verbose output for more details or debugging.
|
|
432
|
3
|
433 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
|
|
434 (depending on which aligner/indexer you intend to use). Unless this path
|
|
435 is specified it is assumed that Bowtie is in the PATH.
|
0
|
436
|
|
437 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
|
|
438
|
|
439 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
|
|
440 single-entry FastA files instead of making one multi-FastA file (MFA)
|
|
441 per chromosome. This might be useful if individual bisulfite converted
|
|
442 chromosomes are needed (e.g. for debugging), however it can cause a
|
|
443 problem with indexing if the number of chromosomes is vast (this is likely
|
|
444 to be in the range of several thousand files; the operating system can
|
|
445 only handle lists up to a certain length, and some newly assembled
|
|
446 genomes may contain 20000-50000 contigs of scaffold files which do exceed
|
|
447 this list length limit).
|
|
448
|
|
449
|
|
450 ARGUMENTS:
|
|
451
|
|
452 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
|
3
|
453 The Bismark Genome Preparation expects one or more fastA files in the folder
|
|
454 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
|
0
|
455
|
|
456
|
3
|
457 This script was last modified on 19 Sept 2013.
|
0
|
458 HOW_TO
|
|
459 }
|