Mercurial > repos > bgruening > bismark
comparison bismark_mapping/bismark_genome_preparation @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Cwd; | |
5 use Getopt::Long; | |
6 $|++; | |
7 | |
8 | |
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) | |
10 | |
11 ## This program is free software: you can redistribute it and/or modify | |
12 ## it under the terms of the GNU General Public License as published by | |
13 ## the Free Software Foundation, either version 3 of the License, or | |
14 ## (at your option) any later version. | |
15 | |
16 ## This program is distributed in the hope that it will be useful, | |
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ## GNU General Public License for more details. | |
20 | |
21 ## You should have received a copy of the GNU General Public License | |
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
23 | |
24 my $verbose; | |
25 my $help; | |
26 my $version; | |
27 my $man; | |
28 my $path_to_bowtie; | |
29 my $multi_fasta; | |
30 my $single_fasta; | |
31 my $bowtie2; | |
32 my $bowtie1; | |
33 my $parent_dir = getcwd(); | |
34 | |
35 my $genomic_composition; | |
36 my %genomic_freqs; # storing the genomic sequence composition | |
37 my %freqs; | |
38 | |
39 my $bismark_version = 'v0.16.3'; | |
40 | |
41 GetOptions ('verbose' => \$verbose, | |
42 'help' => \$help, | |
43 'man' => \$man, | |
44 'version' => \$version, | |
45 'path_to_bowtie:s' => \$path_to_bowtie, | |
46 'single_fasta' => \$single_fasta, | |
47 'bowtie2' => \$bowtie2, | |
48 'bowtie1' => \$bowtie1, | |
49 'genomic_composition' => \$genomic_composition, | |
50 ); | |
51 | |
52 if ($help or $man){ | |
53 print_helpfile(); | |
54 exit; | |
55 } | |
56 | |
57 if ($version){ | |
58 print << "VERSION"; | |
59 | |
60 Bismark - Bisulfite Mapper and Methylation Caller. | |
61 | |
62 Bismark Genome Preparation Version: $bismark_version | |
63 Copyright 2010-16 Felix Krueger, Babraham Bioinformatics | |
64 www.bioinformatics.babraham.ac.uk/projects/ | |
65 | |
66 VERSION | |
67 exit; | |
68 } | |
69 | |
70 my $genome_folder = shift @ARGV; # mandatory | |
71 my %chromosomes; # checking if chromosome names are unique (required) | |
72 | |
73 # Ensuring a genome folder has been specified | |
74 if ($genome_folder){ | |
75 unless ($genome_folder =~ /\/$/){ | |
76 $genome_folder =~ s/$/\//; | |
77 } | |
78 $verbose and print "Path to genome folder specified as: $genome_folder\n"; | |
79 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; | |
80 | |
81 # making the genome folder path abolsolute so it won't break if the path was specified relative | |
82 $genome_folder = getcwd(); | |
83 unless ($genome_folder =~ /\/$/){ | |
84 $genome_folder =~ s/$/\//; | |
85 } | |
86 } | |
87 else{ | |
88 die "Please specify a genome folder to be used for bisulfite conversion\n\n"; | |
89 } | |
90 | |
91 | |
92 my $CT_dir; | |
93 my $GA_dir; | |
94 | |
95 if ($bowtie1){ | |
96 if ($bowtie2){ | |
97 die "You may not select both --bowtie1 and --bowtie2. Make your pick! (default is Bowtie2)\n"; | |
98 } | |
99 $bowtie2 = 0; | |
100 $verbose and print "Aligner to be used: Bowtie (1)\n"; | |
101 } | |
102 else{ # Bowtie 2 is now the default mode (as of 27 July 2015) | |
103 if ($bowtie2){ | |
104 $verbose and print "Aligner to be used: Bowtie 2 (user-defined)\n"; | |
105 } | |
106 else{ | |
107 $verbose and print "Aligner to be used: Bowtie 2 (default)\n"; | |
108 } | |
109 $bowtie2 = 1; | |
110 } | |
111 | |
112 if ($single_fasta){ | |
113 warn "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; | |
114 $multi_fasta = 0; | |
115 } | |
116 else{ | |
117 warn "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; | |
118 $single_fasta = 0; | |
119 $multi_fasta = 1; | |
120 } | |
121 | |
122 my @filenames = create_bisulfite_genome_folders(); | |
123 | |
124 if ($genomic_composition){ | |
125 get_genomic_frequencies(); | |
126 warn "Finished processing genomic nucleotide frequencies\n\n"; | |
127 %chromosomes = (); # resetting | |
128 } | |
129 | |
130 process_sequence_files (); | |
131 | |
132 launch_bowtie_indexer(); | |
133 | |
134 sub launch_bowtie_indexer{ | |
135 if ($bowtie2){ | |
136 warn "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; | |
137 } | |
138 else{ | |
139 warn "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; | |
140 } | |
141 print "Please be aware that this process can - depending on genome size - take several hours!\n"; | |
142 sleep(1); | |
143 | |
144 ### if the path to bowtie was specfified explicitely | |
145 if ($path_to_bowtie){ | |
146 if ($bowtie2){ | |
147 $path_to_bowtie =~ s/$/bowtie2-build/; | |
148 } | |
149 else{ | |
150 $path_to_bowtie =~ s/$/bowtie-build/; | |
151 } | |
152 } | |
153 ### otherwise we assume that bowtie-build is in the path | |
154 else{ | |
155 if ($bowtie2){ | |
156 $path_to_bowtie = 'bowtie2-build'; | |
157 } | |
158 else{ | |
159 $path_to_bowtie = 'bowtie-build'; | |
160 } | |
161 } | |
162 | |
163 $verbose and print "\n"; | |
164 | |
165 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) | |
166 my $pid = fork(); | |
167 | |
168 # parent process | |
169 if ($pid){ | |
170 sleep(1); | |
171 chdir $CT_dir or die "Unable to change directory: $!\n"; | |
172 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; | |
173 my @fasta_files = <*.fa>; | |
174 my $file_list = join (',',@fasta_files); | |
175 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; | |
176 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; | |
177 | |
178 system ("$path_to_bowtie","-f","$file_list","BS_CT")== 0 or die "Parent process: Failed to build index\n"; | |
179 waitpid( $pid, 0 ); | |
180 $? == 0 or die "Parent process: Child process failed\n"; | |
181 } | |
182 | |
183 # child process | |
184 elsif ($pid == 0){ | |
185 sleep(2); | |
186 chdir $GA_dir or die "Unable to change directory: $!\n"; | |
187 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; | |
188 my @fasta_files = <*.fa>; | |
189 my $file_list = join (',',@fasta_files); | |
190 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; | |
191 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; | |
192 | |
193 exec ("$path_to_bowtie","-f","$file_list","BS_GA"); | |
194 } | |
195 | |
196 # if the platform doesn't support the fork command we will run the indexing processes one after the other | |
197 else{ | |
198 print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; | |
199 # sleep(10); | |
200 | |
201 ### moving to CT genome folder | |
202 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; | |
203 chdir $CT_dir or die "Unable to change directory: $!\n"; | |
204 my @fasta_files = <*.fa>; | |
205 my $file_list = join (',',@fasta_files); | |
206 $verbose and print "$file_list\n\n"; | |
207 # sleep(2); | |
208 system ("$path_to_bowtie","-f","$file_list","BS_CT"); | |
209 @fasta_files=(); | |
210 $file_list= ''; | |
211 | |
212 ### moving to GA genome folder | |
213 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; | |
214 chdir $GA_dir or die "Unable to change directory: $!\n"; | |
215 @fasta_files = <*.fa>; | |
216 $file_list = join (',',@fasta_files); | |
217 $verbose and print "$file_list\n\n"; | |
218 # sleep(2); | |
219 exec ("$path_to_bowtie","-f","$file_list","BS_GA"); | |
220 } | |
221 } | |
222 | |
223 | |
224 sub process_sequence_files { | |
225 | |
226 my ($total_CT_conversions,$total_GA_conversions) = (0,0); | |
227 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; | |
228 sleep (3); | |
229 | |
230 $verbose and print "conversions performed:\n"; | |
231 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; | |
232 | |
233 | |
234 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file | |
235 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle | |
236 ### This is now the default option | |
237 | |
238 if ($multi_fasta){ | |
239 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files | |
240 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; | |
241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
242 | |
243 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; | |
244 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
245 } | |
246 | |
247 foreach my $filename(@filenames){ | |
248 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); | |
249 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; | |
250 # warn "Reading chromosome information from $filename\n\n"; | |
251 | |
252 ### first line needs to be a fastA header | |
253 my $first_line = <IN>; | |
254 chomp $first_line; | |
255 | |
256 ### Extracting chromosome name from the FastA header | |
257 my $chromosome_name = extract_chromosome_name($first_line); | |
258 | |
259 ### Exiting if a chromosome with the same name was present already | |
260 if (exists $chromosomes{$chromosome_name}){ | |
261 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
262 } | |
263 else{ | |
264 $chromosomes{$chromosome_name}++; | |
265 } | |
266 | |
267 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. | |
268 unless ($multi_fasta){ | |
269 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; | |
270 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; | |
271 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
272 | |
273 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; | |
274 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; | |
275 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
276 } | |
277 | |
278 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry | |
279 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry | |
280 | |
281 | |
282 while (<IN>){ | |
283 | |
284 ### in case the line is a new fastA header | |
285 if ($_ =~ /^>/){ | |
286 ### printing out the stats for the previous chromosome | |
287 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; | |
288 ### resetting the chromosome transliteration counters | |
289 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); | |
290 | |
291 ### Extracting chromosome name from the additional FastA header | |
292 $chromosome_name = extract_chromosome_name($_); | |
293 | |
294 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. | |
295 unless ($multi_fasta){ | |
296 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; | |
297 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; | |
298 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
299 | |
300 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; | |
301 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; | |
302 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
303 } | |
304 | |
305 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; | |
306 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; | |
307 } | |
308 | |
309 else{ | |
310 my $sequence = uc$_; | |
311 | |
312 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) | |
313 | |
314 $sequence =~ s/[^ATCGN\n\r]/N/g; | |
315 | |
316 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) | |
317 | |
318 my $CT_sequence = $sequence; | |
319 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts | |
320 $total_CT_conversions += $CT_transliterations_performed; | |
321 $chromosome_CT_conversions += $CT_transliterations_performed; | |
322 | |
323 print CT_CONVERT $CT_sequence; | |
324 | |
325 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- | |
326 ### complementing the forward strand and then C->T converting it) | |
327 | |
328 my $GA_sequence = $sequence; | |
329 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand | |
330 $total_GA_conversions += $GA_transliterations_performed; | |
331 $chromosome_GA_conversions += $GA_transliterations_performed; | |
332 | |
333 print GA_CONVERT $GA_sequence; | |
334 | |
335 } | |
336 } | |
337 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; | |
338 } | |
339 close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; | |
340 close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; | |
341 | |
342 | |
343 print "\nTotal number of conversions performed:\n"; | |
344 print "C->T:\t$total_CT_conversions\n"; | |
345 print "G->A:\t$total_GA_conversions\n"; | |
346 | |
347 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; | |
348 } | |
349 | |
350 sub get_genomic_frequencies{ | |
351 | |
352 warn "Calculating genomic frequencies (this may take several minutes depending on genome size) ...\n"; | |
353 warn "="x164,"\n"; | |
354 read_genome_into_memory($parent_dir); | |
355 foreach my $chr (keys %chromosomes){ | |
356 warn "Processing chromosome >> $chr <<\n"; | |
357 process_sequence($chromosomes{$chr}); | |
358 } | |
359 | |
360 %genomic_freqs = %freqs; | |
361 | |
362 ### Attempting to write a genomic nucleotide frequency table out to the genome folder so we can re-use it next time without the need to re-calculate | |
363 ### if this fails | |
364 if ( open (FREQS,'>',"${genome_folder}genomic_nucleotide_frequencies.txt") ){ | |
365 warn "Writing genomic nucleotide frequencies to the file >${genome_folder}genomic_nucleotide_frequencies.txt< for future re-use\n"; | |
366 foreach my $f(sort keys %genomic_freqs){ | |
367 warn "Writing count of (di-)nucleotide: $f\t$genomic_freqs{$f}\n"; | |
368 print FREQS "$f\t$genomic_freqs{$f}\n"; | |
369 } | |
370 close FREQS or warn "Failed to close filehandle FREQS: $!\n\n"; | |
371 } | |
372 else{ | |
373 warn "Failed to write out file ${genome_folder}genomic_nucleotide_frequencies.txt because of: $!. Skipping writing out genomic frequency table\n\n"; | |
374 } | |
375 } | |
376 | |
377 | |
378 sub process_sequence{ | |
379 | |
380 my $seq = shift; | |
381 my $mono; | |
382 my $di; | |
383 | |
384 foreach my $index (0..(length$seq)-1){ | |
385 my $counted = 0; | |
386 if ($index%10000000==0){ | |
387 # warn "Current index number is $index\n"; | |
388 } | |
389 | |
390 $mono = substr($seq,$index,1); | |
391 unless ( $mono eq 'N'){ | |
392 $freqs{$mono}++; | |
393 } | |
394 | |
395 unless ( ($index + 2) > length$seq ){ | |
396 $di = substr($seq,$index,2); | |
397 if (index($di,'N') < 0) { | |
398 $freqs{$di}++; | |
399 } | |
400 } | |
401 } | |
402 | |
403 } | |
404 | |
405 sub extract_chromosome_name { | |
406 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well | |
407 my $fasta_header = shift; | |
408 if ($fasta_header =~ s/^>//){ | |
409 my ($chromosome_name) = split (/\s+/,$fasta_header); | |
410 return $chromosome_name; | |
411 } | |
412 else{ | |
413 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; | |
414 } | |
415 } | |
416 | |
417 | |
418 sub create_bisulfite_genome_folders{ | |
419 | |
420 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; | |
421 | |
422 if ($path_to_bowtie){ | |
423 unless ($path_to_bowtie =~ /\/$/){ | |
424 $path_to_bowtie =~ s/$/\//; | |
425 } | |
426 if (chdir $path_to_bowtie){ | |
427 if ($bowtie2){ | |
428 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; | |
429 } | |
430 else{ | |
431 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; | |
432 } | |
433 } | |
434 else{ | |
435 die "There was an error with the path to bowtie: $!\n"; | |
436 } | |
437 } | |
438 | |
439 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; | |
440 | |
441 | |
442 # Exiting unless there are fastA files in the folder | |
443 my @filenames = <*.fa>; | |
444 | |
445 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
446 unless (@filenames){ | |
447 @filenames = <*.fasta>; | |
448 } | |
449 | |
450 unless (@filenames){ | |
451 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; | |
452 } | |
453 | |
454 warn "Bisulfite Genome Indexer version $bismark_version (last modified 25 August 2015)\n\n"; | |
455 sleep (3); | |
456 | |
457 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists | |
458 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; | |
459 unless (-d $bisulfite_dir){ | |
460 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; | |
461 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; | |
462 } | |
463 else{ | |
464 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; | |
465 sleep(5); | |
466 } | |
467 | |
468 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; | |
469 $CT_dir = "${bisulfite_dir}CT_conversion/"; | |
470 $GA_dir = "${bisulfite_dir}GA_conversion/"; | |
471 | |
472 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) | |
473 # converted version of the genome | |
474 unless (-d $CT_dir){ | |
475 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; | |
476 $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; | |
477 } | |
478 unless (-d $GA_dir){ | |
479 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; | |
480 $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; | |
481 } | |
482 | |
483 # moving back to the original genome folder | |
484 chdir $genome_folder or die "Could't move to directory $genome_folder $!"; | |
485 # $verbose and print "Moved back to genome folder folder $genome_folder\n"; | |
486 warn "\nStep I - Prepare genome folders - completed\n\n\n"; | |
487 return @filenames; | |
488 } | |
489 | |
490 sub read_genome_into_memory{ | |
491 | |
492 ## reading in and storing the specified genome in the %chromosomes hash | |
493 chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; | |
494 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; | |
495 | |
496 my @chromosome_filenames = <*.fa>; | |
497 | |
498 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
499 unless (@chromosome_filenames){ | |
500 @chromosome_filenames = <*.fasta>; | |
501 } | |
502 unless (@chromosome_filenames){ | |
503 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; | |
504 } | |
505 | |
506 foreach my $chromosome_filename (@chromosome_filenames){ | |
507 | |
508 # skipping the tophat entire mouse genome fasta file | |
509 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); | |
510 | |
511 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; | |
512 ### first line needs to be a fastA header | |
513 my $first_line = <CHR_IN>; | |
514 chomp $first_line; | |
515 $first_line =~ s/\r//; # removing /r carriage returns | |
516 | |
517 ### Extracting chromosome name from the FastA header | |
518 my $chromosome_name = extract_chromosome_name($first_line); | |
519 | |
520 my $sequence; | |
521 while (<CHR_IN>){ | |
522 chomp; | |
523 $_ =~ s/\r//; # removing /r carriage returns | |
524 | |
525 if ($_ =~ /^>/){ | |
526 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) | |
527 if (exists $chromosomes{$chromosome_name}){ | |
528 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
529 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
530 } | |
531 else { | |
532 if (length($sequence) == 0){ | |
533 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; | |
534 } | |
535 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
536 $chromosomes{$chromosome_name} = $sequence; | |
537 } | |
538 ### resetting the sequence variable | |
539 $sequence = ''; | |
540 ### setting new chromosome name | |
541 $chromosome_name = extract_chromosome_name($_); | |
542 } | |
543 else{ | |
544 $sequence .= uc$_; | |
545 } | |
546 } | |
547 | |
548 if (exists $chromosomes{$chromosome_name}){ | |
549 warn "chr $chromosome_name (",length $sequence ," bp)\t"; | |
550 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; | |
551 } | |
552 else{ | |
553 if (length($sequence) == 0){ | |
554 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; | |
555 } | |
556 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
557 $chromosomes{$chromosome_name} = $sequence; | |
558 } | |
559 } | |
560 warn "\n"; | |
561 chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; | |
562 } | |
563 | |
564 | |
565 | |
566 | |
567 sub print_helpfile{ | |
568 print << 'HOW_TO'; | |
569 | |
570 | |
571 DESCRIPTION | |
572 | |
573 This script is supposed to convert a specified reference genome into two different bisulfite | |
574 converted versions and index them for alignments with Bowtie 2 (default), or Bowtie 1. The first | |
575 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs | |
576 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference | |
577 genome folder. Once the bisulfite conversion has been completed the program will fork and launch | |
578 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware | |
579 that the indexing process can take up to several hours; this will mainly depend on genome size | |
580 and system resources. | |
581 | |
582 | |
583 | |
584 The following is a brief description of command line options and arguments to control the | |
585 Bismark Genome Preparation: | |
586 | |
587 | |
588 USAGE: bismark_genome_preparation [options] <arguments> | |
589 | |
590 | |
591 OPTIONS: | |
592 | |
593 --help/--man Displays this help filea and exits. | |
594 | |
595 --version Displays version information and exits. | |
596 | |
597 --verbose Print verbose output for more details or debugging. | |
598 | |
599 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system | |
600 (depending on which aligner/indexer you intend to use). Unless this path | |
601 is specified it is assumed that Bowtie is in the PATH. | |
602 | |
603 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: ON). | |
604 | |
605 --bowtie1 This will create bisulfite indexes for Bowtie 1. (Default: OFF). | |
606 | |
607 --single_fasta Instruct the Bismark Indexer to write the converted genomes into | |
608 single-entry FastA files instead of making one multi-FastA file (MFA) | |
609 per chromosome. This might be useful if individual bisulfite converted | |
610 chromosomes are needed (e.g. for debugging), however it can cause a | |
611 problem with indexing if the number of chromosomes is vast (this is likely | |
612 to be in the range of several thousand files; the operating system can | |
613 only handle lists up to a certain length, and some newly assembled | |
614 genomes may contain 20000-500000 contigs of scaffold files which do exceed | |
615 this list length limit). | |
616 | |
617 --genomic_composition Calculate and extract the genomic sequence composition for mono and di-nucleotides | |
618 and write the genomic composition table 'genomic_nucleotide_frequencies.txt' to the | |
619 genome folder. This may be useful later on when using bam2nuc or the Bismark option | |
620 --nucleotide_coverage. | |
621 | |
622 | |
623 ARGUMENTS: | |
624 | |
625 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted. | |
626 The Bismark Genome Preparation expects one or more fastA files in the folder | |
627 (with the file extension: .fa or .fasta). Specifying this path is mandatory. | |
628 | |
629 | |
630 This script was last modified on 07 July 2016. | |
631 HOW_TO | |
632 } |