comparison bismark_mapping/bismark_genome_preparation @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Cwd;
5 use Getopt::Long;
6 $|++;
7
8
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 my $verbose;
25 my $help;
26 my $version;
27 my $man;
28 my $path_to_bowtie;
29 my $multi_fasta;
30 my $single_fasta;
31 my $bowtie2;
32 my $bowtie1;
33 my $parent_dir = getcwd();
34
35 my $genomic_composition;
36 my %genomic_freqs; # storing the genomic sequence composition
37 my %freqs;
38
39 my $bismark_version = 'v0.16.3';
40
41 GetOptions ('verbose' => \$verbose,
42 'help' => \$help,
43 'man' => \$man,
44 'version' => \$version,
45 'path_to_bowtie:s' => \$path_to_bowtie,
46 'single_fasta' => \$single_fasta,
47 'bowtie2' => \$bowtie2,
48 'bowtie1' => \$bowtie1,
49 'genomic_composition' => \$genomic_composition,
50 );
51
52 if ($help or $man){
53 print_helpfile();
54 exit;
55 }
56
57 if ($version){
58 print << "VERSION";
59
60 Bismark - Bisulfite Mapper and Methylation Caller.
61
62 Bismark Genome Preparation Version: $bismark_version
63 Copyright 2010-16 Felix Krueger, Babraham Bioinformatics
64 www.bioinformatics.babraham.ac.uk/projects/
65
66 VERSION
67 exit;
68 }
69
70 my $genome_folder = shift @ARGV; # mandatory
71 my %chromosomes; # checking if chromosome names are unique (required)
72
73 # Ensuring a genome folder has been specified
74 if ($genome_folder){
75 unless ($genome_folder =~ /\/$/){
76 $genome_folder =~ s/$/\//;
77 }
78 $verbose and print "Path to genome folder specified as: $genome_folder\n";
79 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
80
81 # making the genome folder path abolsolute so it won't break if the path was specified relative
82 $genome_folder = getcwd();
83 unless ($genome_folder =~ /\/$/){
84 $genome_folder =~ s/$/\//;
85 }
86 }
87 else{
88 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
89 }
90
91
92 my $CT_dir;
93 my $GA_dir;
94
95 if ($bowtie1){
96 if ($bowtie2){
97 die "You may not select both --bowtie1 and --bowtie2. Make your pick! (default is Bowtie2)\n";
98 }
99 $bowtie2 = 0;
100 $verbose and print "Aligner to be used: Bowtie (1)\n";
101 }
102 else{ # Bowtie 2 is now the default mode (as of 27 July 2015)
103 if ($bowtie2){
104 $verbose and print "Aligner to be used: Bowtie 2 (user-defined)\n";
105 }
106 else{
107 $verbose and print "Aligner to be used: Bowtie 2 (default)\n";
108 }
109 $bowtie2 = 1;
110 }
111
112 if ($single_fasta){
113 warn "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
114 $multi_fasta = 0;
115 }
116 else{
117 warn "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
118 $single_fasta = 0;
119 $multi_fasta = 1;
120 }
121
122 my @filenames = create_bisulfite_genome_folders();
123
124 if ($genomic_composition){
125 get_genomic_frequencies();
126 warn "Finished processing genomic nucleotide frequencies\n\n";
127 %chromosomes = (); # resetting
128 }
129
130 process_sequence_files ();
131
132 launch_bowtie_indexer();
133
134 sub launch_bowtie_indexer{
135 if ($bowtie2){
136 warn "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
137 }
138 else{
139 warn "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
140 }
141 print "Please be aware that this process can - depending on genome size - take several hours!\n";
142 sleep(1);
143
144 ### if the path to bowtie was specfified explicitely
145 if ($path_to_bowtie){
146 if ($bowtie2){
147 $path_to_bowtie =~ s/$/bowtie2-build/;
148 }
149 else{
150 $path_to_bowtie =~ s/$/bowtie-build/;
151 }
152 }
153 ### otherwise we assume that bowtie-build is in the path
154 else{
155 if ($bowtie2){
156 $path_to_bowtie = 'bowtie2-build';
157 }
158 else{
159 $path_to_bowtie = 'bowtie-build';
160 }
161 }
162
163 $verbose and print "\n";
164
165 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
166 my $pid = fork();
167
168 # parent process
169 if ($pid){
170 sleep(1);
171 chdir $CT_dir or die "Unable to change directory: $!\n";
172 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
173 my @fasta_files = <*.fa>;
174 my $file_list = join (',',@fasta_files);
175 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
176 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
177
178 system ("$path_to_bowtie","-f","$file_list","BS_CT")== 0 or die "Parent process: Failed to build index\n";
179 waitpid( $pid, 0 );
180 $? == 0 or die "Parent process: Child process failed\n";
181 }
182
183 # child process
184 elsif ($pid == 0){
185 sleep(2);
186 chdir $GA_dir or die "Unable to change directory: $!\n";
187 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
188 my @fasta_files = <*.fa>;
189 my $file_list = join (',',@fasta_files);
190 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
191 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
192
193 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
194 }
195
196 # if the platform doesn't support the fork command we will run the indexing processes one after the other
197 else{
198 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
199 # sleep(10);
200
201 ### moving to CT genome folder
202 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
203 chdir $CT_dir or die "Unable to change directory: $!\n";
204 my @fasta_files = <*.fa>;
205 my $file_list = join (',',@fasta_files);
206 $verbose and print "$file_list\n\n";
207 # sleep(2);
208 system ("$path_to_bowtie","-f","$file_list","BS_CT");
209 @fasta_files=();
210 $file_list= '';
211
212 ### moving to GA genome folder
213 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
214 chdir $GA_dir or die "Unable to change directory: $!\n";
215 @fasta_files = <*.fa>;
216 $file_list = join (',',@fasta_files);
217 $verbose and print "$file_list\n\n";
218 # sleep(2);
219 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
220 }
221 }
222
223
224 sub process_sequence_files {
225
226 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
227 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
228 sleep (3);
229
230 $verbose and print "conversions performed:\n";
231 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
232
233
234 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
235 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
236 ### This is now the default option
237
238 if ($multi_fasta){
239 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
240 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
242
243 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
244 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
245 }
246
247 foreach my $filename(@filenames){
248 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
249 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
250 # warn "Reading chromosome information from $filename\n\n";
251
252 ### first line needs to be a fastA header
253 my $first_line = <IN>;
254 chomp $first_line;
255
256 ### Extracting chromosome name from the FastA header
257 my $chromosome_name = extract_chromosome_name($first_line);
258
259 ### Exiting if a chromosome with the same name was present already
260 if (exists $chromosomes{$chromosome_name}){
261 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
262 }
263 else{
264 $chromosomes{$chromosome_name}++;
265 }
266
267 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
268 unless ($multi_fasta){
269 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
270 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
271 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
272
273 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
274 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
275 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
276 }
277
278 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
279 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
280
281
282 while (<IN>){
283
284 ### in case the line is a new fastA header
285 if ($_ =~ /^>/){
286 ### printing out the stats for the previous chromosome
287 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
288 ### resetting the chromosome transliteration counters
289 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
290
291 ### Extracting chromosome name from the additional FastA header
292 $chromosome_name = extract_chromosome_name($_);
293
294 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
295 unless ($multi_fasta){
296 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
297 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
298 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
299
300 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
301 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
302 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
303 }
304
305 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
306 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
307 }
308
309 else{
310 my $sequence = uc$_;
311
312 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
313
314 $sequence =~ s/[^ATCGN\n\r]/N/g;
315
316 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
317
318 my $CT_sequence = $sequence;
319 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
320 $total_CT_conversions += $CT_transliterations_performed;
321 $chromosome_CT_conversions += $CT_transliterations_performed;
322
323 print CT_CONVERT $CT_sequence;
324
325 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
326 ### complementing the forward strand and then C->T converting it)
327
328 my $GA_sequence = $sequence;
329 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
330 $total_GA_conversions += $GA_transliterations_performed;
331 $chromosome_GA_conversions += $GA_transliterations_performed;
332
333 print GA_CONVERT $GA_sequence;
334
335 }
336 }
337 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
338 }
339 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
340 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
341
342
343 print "\nTotal number of conversions performed:\n";
344 print "C->T:\t$total_CT_conversions\n";
345 print "G->A:\t$total_GA_conversions\n";
346
347 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
348 }
349
350 sub get_genomic_frequencies{
351
352 warn "Calculating genomic frequencies (this may take several minutes depending on genome size) ...\n";
353 warn "="x164,"\n";
354 read_genome_into_memory($parent_dir);
355 foreach my $chr (keys %chromosomes){
356 warn "Processing chromosome >> $chr <<\n";
357 process_sequence($chromosomes{$chr});
358 }
359
360 %genomic_freqs = %freqs;
361
362 ### Attempting to write a genomic nucleotide frequency table out to the genome folder so we can re-use it next time without the need to re-calculate
363 ### if this fails
364 if ( open (FREQS,'>',"${genome_folder}genomic_nucleotide_frequencies.txt") ){
365 warn "Writing genomic nucleotide frequencies to the file >${genome_folder}genomic_nucleotide_frequencies.txt< for future re-use\n";
366 foreach my $f(sort keys %genomic_freqs){
367 warn "Writing count of (di-)nucleotide: $f\t$genomic_freqs{$f}\n";
368 print FREQS "$f\t$genomic_freqs{$f}\n";
369 }
370 close FREQS or warn "Failed to close filehandle FREQS: $!\n\n";
371 }
372 else{
373 warn "Failed to write out file ${genome_folder}genomic_nucleotide_frequencies.txt because of: $!. Skipping writing out genomic frequency table\n\n";
374 }
375 }
376
377
378 sub process_sequence{
379
380 my $seq = shift;
381 my $mono;
382 my $di;
383
384 foreach my $index (0..(length$seq)-1){
385 my $counted = 0;
386 if ($index%10000000==0){
387 # warn "Current index number is $index\n";
388 }
389
390 $mono = substr($seq,$index,1);
391 unless ( $mono eq 'N'){
392 $freqs{$mono}++;
393 }
394
395 unless ( ($index + 2) > length$seq ){
396 $di = substr($seq,$index,2);
397 if (index($di,'N') < 0) {
398 $freqs{$di}++;
399 }
400 }
401 }
402
403 }
404
405 sub extract_chromosome_name {
406 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
407 my $fasta_header = shift;
408 if ($fasta_header =~ s/^>//){
409 my ($chromosome_name) = split (/\s+/,$fasta_header);
410 return $chromosome_name;
411 }
412 else{
413 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
414 }
415 }
416
417
418 sub create_bisulfite_genome_folders{
419
420 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
421
422 if ($path_to_bowtie){
423 unless ($path_to_bowtie =~ /\/$/){
424 $path_to_bowtie =~ s/$/\//;
425 }
426 if (chdir $path_to_bowtie){
427 if ($bowtie2){
428 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
429 }
430 else{
431 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
432 }
433 }
434 else{
435 die "There was an error with the path to bowtie: $!\n";
436 }
437 }
438
439 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
440
441
442 # Exiting unless there are fastA files in the folder
443 my @filenames = <*.fa>;
444
445 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
446 unless (@filenames){
447 @filenames = <*.fasta>;
448 }
449
450 unless (@filenames){
451 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
452 }
453
454 warn "Bisulfite Genome Indexer version $bismark_version (last modified 25 August 2015)\n\n";
455 sleep (3);
456
457 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
458 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
459 unless (-d $bisulfite_dir){
460 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
461 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
462 }
463 else{
464 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
465 sleep(5);
466 }
467
468 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
469 $CT_dir = "${bisulfite_dir}CT_conversion/";
470 $GA_dir = "${bisulfite_dir}GA_conversion/";
471
472 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
473 # converted version of the genome
474 unless (-d $CT_dir){
475 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
476 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
477 }
478 unless (-d $GA_dir){
479 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
480 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
481 }
482
483 # moving back to the original genome folder
484 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
485 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
486 warn "\nStep I - Prepare genome folders - completed\n\n\n";
487 return @filenames;
488 }
489
490 sub read_genome_into_memory{
491
492 ## reading in and storing the specified genome in the %chromosomes hash
493 chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
494 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
495
496 my @chromosome_filenames = <*.fa>;
497
498 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
499 unless (@chromosome_filenames){
500 @chromosome_filenames = <*.fasta>;
501 }
502 unless (@chromosome_filenames){
503 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
504 }
505
506 foreach my $chromosome_filename (@chromosome_filenames){
507
508 # skipping the tophat entire mouse genome fasta file
509 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
510
511 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
512 ### first line needs to be a fastA header
513 my $first_line = <CHR_IN>;
514 chomp $first_line;
515 $first_line =~ s/\r//; # removing /r carriage returns
516
517 ### Extracting chromosome name from the FastA header
518 my $chromosome_name = extract_chromosome_name($first_line);
519
520 my $sequence;
521 while (<CHR_IN>){
522 chomp;
523 $_ =~ s/\r//; # removing /r carriage returns
524
525 if ($_ =~ /^>/){
526 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
527 if (exists $chromosomes{$chromosome_name}){
528 warn "chr $chromosome_name (",length $sequence ," bp)\n";
529 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
530 }
531 else {
532 if (length($sequence) == 0){
533 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
534 }
535 warn "chr $chromosome_name (",length $sequence ," bp)\n";
536 $chromosomes{$chromosome_name} = $sequence;
537 }
538 ### resetting the sequence variable
539 $sequence = '';
540 ### setting new chromosome name
541 $chromosome_name = extract_chromosome_name($_);
542 }
543 else{
544 $sequence .= uc$_;
545 }
546 }
547
548 if (exists $chromosomes{$chromosome_name}){
549 warn "chr $chromosome_name (",length $sequence ," bp)\t";
550 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
551 }
552 else{
553 if (length($sequence) == 0){
554 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
555 }
556 warn "chr $chromosome_name (",length $sequence ," bp)\n";
557 $chromosomes{$chromosome_name} = $sequence;
558 }
559 }
560 warn "\n";
561 chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
562 }
563
564
565
566
567 sub print_helpfile{
568 print << 'HOW_TO';
569
570
571 DESCRIPTION
572
573 This script is supposed to convert a specified reference genome into two different bisulfite
574 converted versions and index them for alignments with Bowtie 2 (default), or Bowtie 1. The first
575 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
576 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
577 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
578 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
579 that the indexing process can take up to several hours; this will mainly depend on genome size
580 and system resources.
581
582
583
584 The following is a brief description of command line options and arguments to control the
585 Bismark Genome Preparation:
586
587
588 USAGE: bismark_genome_preparation [options] <arguments>
589
590
591 OPTIONS:
592
593 --help/--man Displays this help filea and exits.
594
595 --version Displays version information and exits.
596
597 --verbose Print verbose output for more details or debugging.
598
599 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
600 (depending on which aligner/indexer you intend to use). Unless this path
601 is specified it is assumed that Bowtie is in the PATH.
602
603 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: ON).
604
605 --bowtie1 This will create bisulfite indexes for Bowtie 1. (Default: OFF).
606
607 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
608 single-entry FastA files instead of making one multi-FastA file (MFA)
609 per chromosome. This might be useful if individual bisulfite converted
610 chromosomes are needed (e.g. for debugging), however it can cause a
611 problem with indexing if the number of chromosomes is vast (this is likely
612 to be in the range of several thousand files; the operating system can
613 only handle lists up to a certain length, and some newly assembled
614 genomes may contain 20000-500000 contigs of scaffold files which do exceed
615 this list length limit).
616
617 --genomic_composition Calculate and extract the genomic sequence composition for mono and di-nucleotides
618 and write the genomic composition table 'genomic_nucleotide_frequencies.txt' to the
619 genome folder. This may be useful later on when using bam2nuc or the Bismark option
620 --nucleotide_coverage.
621
622
623 ARGUMENTS:
624
625 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
626 The Bismark Genome Preparation expects one or more fastA files in the folder
627 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
628
629
630 This script was last modified on 07 July 2016.
631 HOW_TO
632 }