comparison bismark_genome_preparation @ 3:91f07ff056ca draft

Uploaded
author bgruening
date Mon, 14 Apr 2014 16:43:14 -0400
parents 62c6da72dd4a
children
comparison
equal deleted inserted replaced
2:82814a8a2395 3:91f07ff056ca
31 my $path_to_bowtie; 31 my $path_to_bowtie;
32 my $multi_fasta; 32 my $multi_fasta;
33 my $single_fasta; 33 my $single_fasta;
34 my $bowtie2; 34 my $bowtie2;
35 35
36 my $bismark_version = 'v0.7.12'; 36 my $bismark_version = 'v0.10.0';
37 37
38 GetOptions ('verbose' => \$verbose, 38 GetOptions ('verbose' => \$verbose,
39 'help' => \$help, 39 'help' => \$help,
40 'man' => \$man, 40 'man' => \$man,
41 'version' => \$version, 41 'version' => \$version,
42 'path_to_bowtie:s' => \$path_to_bowtie, 42 'path_to_bowtie:s' => \$path_to_bowtie,
43 'single_fasta' => \$single_fasta, 43 'single_fasta' => \$single_fasta,
44 'bowtie2' => \$bowtie2, 44 'bowtie2' => \$bowtie2,
45 ); 45 );
46 46
47 my $genome_folder = shift @ARGV; # mandatory
48 my $CT_dir;
49 my $GA_dir;
50
51 if ($help or $man){ 47 if ($help or $man){
52 print_helpfile(); 48 print_helpfile();
53 exit; 49 exit;
54 } 50 }
55 51
63 www.bioinformatics.babraham.ac.uk/projects/ 59 www.bioinformatics.babraham.ac.uk/projects/
64 60
65 VERSION 61 VERSION
66 exit; 62 exit;
67 } 63 }
64
65 my $genome_folder = shift @ARGV; # mandatory
66
67 # Ensuring a genome folder has been specified
68 if ($genome_folder){
69 unless ($genome_folder =~ /\/$/){
70 $genome_folder =~ s/$/\//;
71 }
72 $verbose and print "Path to genome folder specified as: $genome_folder\n";
73 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
74
75 # making the genome folder path abolsolute so it won't break if the path was specified relative
76 $genome_folder = getcwd;
77 unless ($genome_folder =~ /\/$/){
78 $genome_folder =~ s/$/\//;
79 }
80 }
81 else{
82 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
83 }
84
85
86 my $CT_dir;
87 my $GA_dir;
88
68 89
69 if ($single_fasta){ 90 if ($single_fasta){
70 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; 91 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
71 $multi_fasta = 0; 92 $multi_fasta = 0;
72 } 93 }
307 328
308 sub create_bisulfite_genome_folders{ 329 sub create_bisulfite_genome_folders{
309 330
310 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; 331 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
311 332
312 # Ensuring a genome folder has been specified
313 if ($genome_folder){
314 unless ($genome_folder =~ /\/$/){
315 $genome_folder =~ s/$/\//;
316 }
317 $verbose and print "Path to genome folder specified: $genome_folder\n";
318 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
319
320 # making the genome folder path abolsolute so it won't break if the path was specified relative
321 $genome_folder = getcwd;
322 unless ($genome_folder =~ /\/$/){
323 $genome_folder =~ s/$/\//;
324 }
325 }
326
327 else{
328 $verbose and print "Genome folder was not provided as argument ";
329 while (1){
330 print "Please specify a genome folder to be bisulfite converted:\n";
331 $genome_folder = <STDIN>;
332 chomp $genome_folder;
333
334 # adding a trailing slash unless already present
335 unless ($genome_folder =~ /\/$/){
336 $genome_folder =~ s/$/\//;
337 }
338 if (chdir $genome_folder){
339 last;
340 }
341 else{
342 warn "Could't move to directory $genome_folder! $!";
343 }
344 }
345 }
346
347 if ($path_to_bowtie){ 333 if ($path_to_bowtie){
348 unless ($path_to_bowtie =~ /\/$/){ 334 unless ($path_to_bowtie =~ /\/$/){
349 $path_to_bowtie =~ s/$/\//; 335 $path_to_bowtie =~ s/$/\//;
350 } 336 }
351 if (chdir $path_to_bowtie){ 337 if (chdir $path_to_bowtie){
374 360
375 unless (@filenames){ 361 unless (@filenames){
376 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; 362 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
377 } 363 }
378 364
379 warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n"; 365 warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
380 sleep (3); 366 sleep (3);
381 367
382 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists 368 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
383 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; 369 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
384 unless (-d $bisulfite_dir){ 370 unless (-d $bisulfite_dir){
385 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; 371 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
386 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; 372 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
387 } 373 }
388 else{ 374 else{
389 while (1){ 375 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
390 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t"; 376 sleep(5);
391 my $proceed = <STDIN>; 377 }
392 chomp $proceed;
393 if ($proceed =~ /^y/i ){
394 last;
395 }
396 elsif ($proceed =~ /^n/i){
397 die "Terminated by user\n\n";
398 }
399 }
400 }
401
402 ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already
403 # removing any existing files and subfolders in the bisulfite directory (the specified directory won't be deleted)
404 # rmtree($bisulfite_dir, {verbose => 1,keep_root => 1});
405 # unless (-d $bisulfite_dir){ # had to add this after changing remove_tree to rmtree // suggested by Samantha Cooper @ Illumina
406 # mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
407 # }
408 # }
409 378
410 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; 379 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
411 $CT_dir = "${bisulfite_dir}CT_conversion/"; 380 $CT_dir = "${bisulfite_dir}CT_conversion/";
412 $GA_dir = "${bisulfite_dir}GA_conversion/"; 381 $GA_dir = "${bisulfite_dir}GA_conversion/";
413 382
438 This script is supposed to convert a specified reference genome into two different bisulfite 407 This script is supposed to convert a specified reference genome into two different bisulfite
439 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first 408 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
440 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs 409 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
441 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference 410 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
442 genome folder. Once the bisulfite conversion has been completed the program will fork and launch 411 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
443 two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware 412 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
444 that the indexing process can take up to several hours; this will mainly depend on genome size 413 that the indexing process can take up to several hours; this will mainly depend on genome size
445 and system resources. 414 and system resources.
446 415
447 416
448 417
449
450 The following is a brief description of command line options and arguments to control the 418 The following is a brief description of command line options and arguments to control the
451 Bismark Genome Preparation script: 419 Bismark Genome Preparation:
452 420
453 421
454 USAGE: bismark_genome_preparation [options] <arguments> 422 USAGE: bismark_genome_preparation [options] <arguments>
455 423
456 424
460 428
461 --version Displays version information and exits. 429 --version Displays version information and exits.
462 430
463 --verbose Print verbose output for more details or debugging. 431 --verbose Print verbose output for more details or debugging.
464 432
465 --path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system.If 433 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
466 the path </../../> is not provided as an option you will be prompted for it. 434 (depending on which aligner/indexer you intend to use). Unless this path
435 is specified it is assumed that Bowtie is in the PATH.
467 436
468 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1). 437 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
469 438
470 --single_fasta Instruct the Bismark Indexer to write the converted genomes into 439 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
471 single-entry FastA files instead of making one multi-FastA file (MFA) 440 single-entry FastA files instead of making one multi-FastA file (MFA)
479 448
480 449
481 ARGUMENTS: 450 ARGUMENTS:
482 451
483 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted. 452 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
484 At the current time Bismark Genome Preparation expects one or more fastA 453 The Bismark Genome Preparation expects one or more fastA files in the folder
485 files in the folder (with the file extension: .fa or .fasta). If the path 454 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
486 is not provided as an argument you will be prompted for it. 455
487 456
488 457 This script was last modified on 19 Sept 2013.
489
490 This script was last modified on 18 Nov 2011.
491 HOW_TO 458 HOW_TO
492 } 459 }