annotate bismark_wrapper/bismark_genome_preparation @ 1:183de9d00131 draft

add indices.loc files
author bjoern-gruening
date Tue, 25 Dec 2012 05:52:28 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1 #!/usr/bin/perl --
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2 use strict;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3 use warnings;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4 use Cwd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5 use File::Path qw(rmtree);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6 $|++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
7
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
8
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
9 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
10
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
11 ## This program is free software: you can redistribute it and/or modify
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
12 ## it under the terms of the GNU General Public License as published by
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
13 ## the Free Software Foundation, either version 3 of the License, or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
14 ## (at your option) any later version.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
15
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
16 ## This program is distributed in the hope that it will be useful,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
19 ## GNU General Public License for more details.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
20
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
21 ## You should have received a copy of the GNU General Public License
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
23
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
24 use Getopt::Long;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
25 use Cwd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
26
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
27 my $verbose;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
28 my $help;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
29 my $version;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
30 my $man;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
31 my $path_to_bowtie;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
32 my $multi_fasta;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
33 my $single_fasta;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
34 my $bowtie2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
35
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
36 my $bismark_version = 'v0.7.7';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
37
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
38 GetOptions ('verbose' => \$verbose,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
39 'help' => \$help,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
40 'man' => \$man,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
41 'version' => \$version,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
42 'path_to_bowtie:s' => \$path_to_bowtie,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
43 'single_fasta' => \$single_fasta,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
44 'bowtie2' => \$bowtie2,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
45 );
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
46
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
47 my $genome_folder = shift @ARGV; # mandatory
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
48 my $CT_dir;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
49 my $GA_dir;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
50
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
51 if ($help or $man){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
52 print_helpfile();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
53 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
54 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
55
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
56 if ($version){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
57 print << "VERSION";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
58
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
59 Bismark - Bisulfite Mapper and Methylation Caller.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
60
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
61 Bismark Genome Preparation Version: $bismark_version
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
62 Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
63 www.bioinformatics.babraham.ac.uk/projects/
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
64
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
65 VERSION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
66 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
67 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
68
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
69 if ($single_fasta){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
70 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
71 $multi_fasta = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
72 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
73 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
74 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
75 $single_fasta = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
76 $multi_fasta = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
77 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
78
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
79 my @filenames = create_bisulfite_genome_folders();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
80
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
81 process_sequence_files ();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
82
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
83 launch_bowtie_indexer();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
84
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
85 sub launch_bowtie_indexer{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
86 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
87 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
88 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
89 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
90 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
91 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
92 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
93 sleep(5);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
94
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
95 ### if the path to bowtie was specfified explicitely
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
96 if ($path_to_bowtie){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
97 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
98 $path_to_bowtie =~ s/$/bowtie2-build/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
99 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
100 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
101 $path_to_bowtie =~ s/$/bowtie-build/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
102 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
103 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
104 ### otherwise we assume that bowtie-build is in the path
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
105 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
106 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
107 $path_to_bowtie = 'bowtie2-build';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
108 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
109 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
110 $path_to_bowtie = 'bowtie-build';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
111 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
112 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
113
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
114 $verbose and print "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
115
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
116 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
117 my $pid = fork();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
118
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
119 # parent process
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
120 if ($pid){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
121 sleep(1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
122 chdir $CT_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
123 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
124 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
125 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
126 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
127 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
128
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
129 sleep (11);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
130 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
131 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
132
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
133 # child process
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
134 elsif ($pid == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
135 sleep(2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
136 chdir $GA_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
137 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
138 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
139 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
140 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
141 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
142 $verbose and print "(starting in 10 seconds)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
143 sleep(10);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
144 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
145 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
146
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
147 # if the platform doesn't support the fork command we will run the indexing processes one after the other
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
148 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
149 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
150 sleep(10);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
151
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
152 ### moving to CT genome folder
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
153 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
154 chdir $CT_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
155 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
156 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
157 $verbose and print "$file_list\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
158 sleep(2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
159 system ("$path_to_bowtie","-f","$file_list","BS_CT");
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
160 @fasta_files=();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
161 $file_list= '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
162
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
163 ### moving to GA genome folder
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
164 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
165 chdir $GA_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
166 @fasta_files = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
167 $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
168 $verbose and print "$file_list\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
169 sleep(2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
170 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
171 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
172 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
173
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
174
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
175 sub process_sequence_files {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
176
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
177 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
178 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
179 sleep (3);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
180
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
181 $verbose and print "conversions performed:\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
182 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
183
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
184
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
185 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
186 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
187 ### This is now the default option
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
188
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
189 if ($multi_fasta){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
190 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
191 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
192 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
193
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
194 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
195 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
196 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
197
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
198 foreach my $filename(@filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
199 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
200 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
201 # warn "Reading chromosome information from $filename\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
202
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
203 ### first line needs to be a fastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
204 my $first_line = <IN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
205 chomp $first_line;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
206
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
207 ### Extracting chromosome name from the FastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
208 my $chromosome_name = extract_chromosome_name($first_line);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
209
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
210 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
211 unless ($multi_fasta){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
212 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
213 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
214 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
215
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
216 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
217 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
218 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
219 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
220
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
221 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
222 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
223
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
224
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
225 while (<IN>){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
226
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
227 ### in case the line is a new fastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
228 if ($_ =~ /^>/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
229 ### printing out the stats for the previous chromosome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
230 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
231 ### resetting the chromosome transliteration counters
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
232 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
233
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
234 ### Extracting chromosome name from the additional FastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
235 $chromosome_name = extract_chromosome_name($_);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
236
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
237 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
238 unless ($multi_fasta){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
239 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
240 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
242
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
243 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
244 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
245 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
246 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
247
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
248 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
249 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
250 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
251
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
252 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
253 my $sequence = uc$_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
254
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
255 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
256
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
257 $sequence =~ s/[^ATCGN\n\r]/N/g;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
258
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
259 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
260
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
261 my $CT_sequence = $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
262 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
263 $total_CT_conversions += $CT_transliterations_performed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
264 $chromosome_CT_conversions += $CT_transliterations_performed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
265
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
266 print CT_CONVERT $CT_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
267
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
268 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
269 ### complementing the forward strand and then C->T converting it)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
270
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
271 my $GA_sequence = $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
272 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
273 $total_GA_conversions += $GA_transliterations_performed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
274 $chromosome_GA_conversions += $GA_transliterations_performed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
275
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
276 print GA_CONVERT $GA_sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
277
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
278 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
279 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
280 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
281 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
282 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
283 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
284
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
285
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
286 print "\nTotal number of conversions performed:\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
287 print "C->T:\t$total_CT_conversions\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
288 print "G->A:\t$total_GA_conversions\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
289
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
290 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
291 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
292
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
293 sub extract_chromosome_name {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
294
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
295 my $header = shift;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
296
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
297 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
298
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
299 if ($header =~ s/^>//){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
300 my ($chromosome_name) = split (/\s+/,$header);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
301 return $chromosome_name;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
302 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
303 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
304 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
305 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
306 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
307
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
308 sub create_bisulfite_genome_folders{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
309
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
310 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
311
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
312 # Ensuring a genome folder has been specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
313 if ($genome_folder){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
314 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
315 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
316 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
317 $verbose and print "Path to genome folder specified: $genome_folder\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
318 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
319
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
320 # making the genome folder path abolsolute so it won't break if the path was specified relative
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
321 $genome_folder = getcwd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
322 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
323 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
324 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
325 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
326
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
327 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
328 $verbose and print "Genome folder was not provided as argument ";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
329 while (1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
330 print "Please specify a genome folder to be bisulfite converted:\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
331 $genome_folder = <STDIN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
332 chomp $genome_folder;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
333
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
334 # adding a trailing slash unless already present
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
335 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
336 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
337 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
338 if (chdir $genome_folder){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
339 last;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
340 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
341 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
342 warn "Could't move to directory $genome_folder! $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
343 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
344 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
345 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
346
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
347 if ($path_to_bowtie){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
348 unless ($path_to_bowtie =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
349 $path_to_bowtie =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
350 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
351 if (chdir $path_to_bowtie){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
352 if ($bowtie2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
353 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
354 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
355 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
356 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
357 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
358 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
359 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
360 die "There was an error with the path to bowtie: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
361 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
362 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
363
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
364 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
365
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
366
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
367 # Exiting unless there are fastA files in the folder
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
368 my @filenames = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
369
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
370 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
371 unless (@filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
372 @filenames = <*.fasta>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
373 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
374
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
375 unless (@filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
376 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
377 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
378
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
379 warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
380 sleep (3);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
381
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
382 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
383 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
384 unless (-d $bisulfite_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
385 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
386 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
387 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
388 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
389 while (1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
390 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
391 my $proceed = <STDIN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
392 chomp $proceed;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
393 if ($proceed =~ /^y/i ){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
394 last;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
395 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
396 elsif ($proceed =~ /^n/i){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
397 die "Terminated by user\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
398 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
399 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
400 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
401
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
402 ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
403 # removing any existing files and subfolders in the bisulfite directory (the specified directory won't be deleted)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
404 # rmtree($bisulfite_dir, {verbose => 1,keep_root => 1});
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
405 # unless (-d $bisulfite_dir){ # had to add this after changing remove_tree to rmtree // suggested by Samantha Cooper @ Illumina
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
406 # mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
407 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
408 # }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
409
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
410 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
411 $CT_dir = "${bisulfite_dir}CT_conversion/";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
412 $GA_dir = "${bisulfite_dir}GA_conversion/";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
413
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
414 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
415 # converted version of the genome
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
416 unless (-d $CT_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
417 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
418 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
419 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
420 unless (-d $GA_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
421 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
422 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
423 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
424
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
425 # moving back to the original genome folder
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
426 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
427 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
428 warn "\nStep I - Prepare genome folders - completed\n\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
429 return @filenames;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
430 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
431
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
432 sub print_helpfile{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
433 print << 'HOW_TO';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
434
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
435
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
436 DESCRIPTION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
437
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
438 This script is supposed to convert a specified reference genome into two different bisulfite
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
439 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
440 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
441 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
442 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
443 two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
444 that the indexing process can take up to several hours; this will mainly depend on genome size
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
445 and system resources.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
446
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
447
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
448
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
449
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
450 The following is a brief description of command line options and arguments to control the
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
451 Bismark Genome Preparation script:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
452
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
453
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
454 USAGE: bismark_genome_preparation [options] <arguments>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
455
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
456
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
457 OPTIONS:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
458
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
459 --help/--man Displays this help filea and exits.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
460
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
461 --version Displays version information and exits.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
462
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
463 --verbose Print verbose output for more details or debugging.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
464
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
465 --path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system.If
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
466 the path </../../> is not provided as an option you will be prompted for it.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
467
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
468 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
469
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
470 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
471 single-entry FastA files instead of making one multi-FastA file (MFA)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
472 per chromosome. This might be useful if individual bisulfite converted
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
473 chromosomes are needed (e.g. for debugging), however it can cause a
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
474 problem with indexing if the number of chromosomes is vast (this is likely
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
475 to be in the range of several thousand files; the operating system can
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
476 only handle lists up to a certain length, and some newly assembled
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
477 genomes may contain 20000-50000 contigs of scaffold files which do exceed
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
478 this list length limit).
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
479
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
480
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
481 ARGUMENTS:
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
482
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
483 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
484 At the current time Bismark Genome Preparation expects one or more fastA
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
485 files in the folder (with the file extension: .fa or .fasta). If the path
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
486 is not provided as an argument you will be prompted for it.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
487
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
488
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
489
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
490 This script was last modified on 18 Nov 2011.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
491 HOW_TO
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
492 }