Mercurial > repos > stheil > metavelvet_wrapper
comparison velvet.pl @ 3:c979f8682b21 draft
Uploaded
author | stheil |
---|---|
date | Thu, 24 Sep 2015 10:42:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:1bb80c25b379 | 3:c979f8682b21 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Logger::Logger; | |
5 use Getopt::Long; | |
6 use Tools::Fasta; | |
7 use Pod::Usage; | |
8 | |
9 my $directory; | |
10 my $hashLength; | |
11 my $fileString; | |
12 my $performMetagenomicAssembly = 1; | |
13 my $man; | |
14 my $help; | |
15 | |
16 my $velvethOptions = {}; | |
17 my $velvetgOptions = {}; | |
18 my $velvetgmOptions = {}; | |
19 my $metaVelvetgOptions = {}; | |
20 my $lastOptFile =''; | |
21 | |
22 GetOptions( | |
23 | |
24 'd|directory=s' => \$directory, | |
25 'hash_length=s' => \$hashLength, | |
26 'm|meta!' => \$performMetagenomicAssembly, | |
27 'man' => \$man, | |
28 'h|help' => \$help, | |
29 'short:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
30 'short2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
31 'short3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
32 'shortPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
33 'shortPaired2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
34 'shortPaired3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
35 'long:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
36 'longPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
37 'reference:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
38 'fasta:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
39 'fastq:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
40 'raw:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
41 'fasta_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
42 'fastq_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
43 'raw_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
44 'sam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
45 'bam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
46 'fmtAuto:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
47 'interleaved:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
48 'separate:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, | |
49 'strand_specific' => sub{registerOnOffOption($velvethOptions, @_)}, | |
50 'reuse_Sequences:s' => sub{registerOnOffOption($velvethOptions, @_)}, | |
51 'reuse_binary:s' => sub{registerOnOffOption($velvethOptions, @_)}, | |
52 'noHash:s' => sub{registerOnOffOption($velvethOptions, @_)}, | |
53 'create_binary:s' => sub{registerOnOffOption($velvethOptions, @_)}, | |
54 | |
55 'cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
56 'ins_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
57 'read_trkg=s' => sub{registerYesNoOption($velvetgOptions, @_)}, | |
58 'min_contig_lgth=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
59 'amos_file=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
60 'exp_cov=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
61 'long_cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
62 'ins_length_long=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
63 'ins_length2=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
64 'ins_length_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
65 'ins_length_long_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
66 'ins_length2_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
67 'scaffolding=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
68 'max_branch_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
69 'max_divergence=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
70 'max_gap_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
71 'min_pair_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
72 'max_coverage=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
73 'coverage_mask=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
74 'long_mult_cutoff=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, | |
75 'unused_reads=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, | |
76 'alignments=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, | |
77 'exportFiltered=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, | |
78 'clean=s' => sub{registerYesNoOption($velvetgOptions, @_)}, | |
79 'very_clean=s' => sub{registerYesNoOption($velvetgOptions, @_)}, | |
80 'paired_exp_fraction=f' => sub{registerYesNoOption($velvetgmOptions, @_)}, | |
81 'shortMatePaired=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, | |
82 'conserveLong=s' => sub{registerYesNoOption($velvetgOptions, @_)}, | |
83 | |
84 'discard_chimera=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, | |
85 'max_chimera_rate=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
86 'repeat_cov_sd=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
87 'min_split_length=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
88 'valid_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
89 'noise_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
90 'use_connections=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, | |
91 'report_split_detail=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, | |
92 'report_subgraph=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, | |
93 'exp_covs=s' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
94 'min_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
95 'max_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
96 'histo_bin_width=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
97 'histo_sn_ratio=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, | |
98 | |
99 ) or pod2usage( "Try '$0 --help' for more information." ); | |
100 | |
101 pod2usage( -verbose => 2 ) if $man; | |
102 pod2usage( -verbose => 1 ) if ($help || ! defined $fileString); | |
103 | |
104 Logger::Logger->changeMode(0); | |
105 | |
106 if(!defined $directory || $directory eq ''){ | |
107 $directory = '.'; | |
108 } | |
109 | |
110 $directory .= '/'; | |
111 | |
112 #Running velveth | |
113 my $velvethCommand = 'velveth ' . $directory . ' ' . $hashLength . ' ' . $fileString . ' ' . convertOptionHashToCommandLine($velvethOptions); | |
114 $logger->info('Running velveth...'); | |
115 $logger->info($velvethCommand); | |
116 `$velvethCommand`; | |
117 | |
118 #Running velvetg | |
119 if($performMetagenomicAssembly){ | |
120 $velvetgOptions->{'exp_cov'} = 'auto'; | |
121 } | |
122 my $velvetgCommand = 'velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($velvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions); | |
123 $logger->info('Running velvetg...'); | |
124 $logger->info($velvetgCommand); | |
125 `$velvetgCommand`; | |
126 | |
127 #Running meta-velvetg | |
128 if ($performMetagenomicAssembly){ | |
129 my $metaVelvetCommand = 'meta-velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($metaVelvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions); | |
130 $logger->info('Running meta-velvetg...'); | |
131 $logger->info($metaVelvetCommand); | |
132 `$metaVelvetCommand`; | |
133 } | |
134 | |
135 if(exists $velvetgmOptions->{'unused_reads'} && $velvetgmOptions->{'unused_reads'} eq 'yes'){ | |
136 createSingleFile($directory.'UnusedReads.fa', $directory.'Sequences', $directory.'Singlets.fasta'); | |
137 } | |
138 | |
139 sub convertOptionHashToCommandLine{ | |
140 my ($optionHash) = @_; | |
141 my $commandLineString = ''; | |
142 | |
143 foreach my $opt (keys %$optionHash){ | |
144 if(defined $optionHash->{$opt}){ | |
145 if(ref $optionHash->{$opt}){ | |
146 $commandLineString .= '-' . $opt . ' ' . join(' -'.$opt.' ',@{$optionHash->{$opt}}) . ' '; | |
147 } | |
148 else{ | |
149 $commandLineString .= '-' . $opt . ' ' . $optionHash->{$opt} . ' '; | |
150 } | |
151 } | |
152 } | |
153 | |
154 return $commandLineString; | |
155 } | |
156 | |
157 sub registerYesNoOption{ | |
158 my ($hash, $optionName, $optionValue) = @_; | |
159 | |
160 if(defined $optionValue && ($optionValue eq 'no' || $optionValue eq 'yes')){ | |
161 registerScalarOptionHash(@_); | |
162 } | |
163 else{ | |
164 $logger->logdie("Option '$optionName' must be yes or no\n"); | |
165 } | |
166 } | |
167 | |
168 sub registerOnOffOption{ | |
169 my ($hash, $optionName, $optionValue) = @_; | |
170 | |
171 if(! defined $optionValue || ($optionValue eq '')){ | |
172 registerScalarOptionHash(@_); | |
173 } | |
174 else{ | |
175 $logger->logdie("No value allowed for option '$optionName'\n"); | |
176 } | |
177 } | |
178 | |
179 sub registerScalarOptionHash{ | |
180 my ($hash, $optionName, $optionValue) = @_; | |
181 $hash->{$optionName} = $optionValue; | |
182 } | |
183 | |
184 sub registerVelvetFileOptionHash{ | |
185 my ($fileString, $lastOptFile, $optionName, $optionValue) = @_; | |
186 | |
187 if($$lastOptFile ne $optionName){ | |
188 $$fileString .= ' -' . $optionName; | |
189 } | |
190 $$fileString .= ' ' . $optionValue; | |
191 $$lastOptFile = $optionName; | |
192 } | |
193 | |
194 sub createSingleFile{ | |
195 my ($velvetUnusedReads, $sequencesFile, $outputFile) = @_; | |
196 | |
197 if(! defined $outputFile){ | |
198 $outputFile = ''; | |
199 } | |
200 | |
201 my $sequenceNumber; | |
202 my %singlets; | |
203 my $id; | |
204 my $number; | |
205 | |
206 open(UNUSED_FILE, $velvetUnusedReads) or $logger->logdie("Unable to open velvet UnusedReads file $velvetUnusedReads : $!"); | |
207 open(SEQUENCES_FILE, $sequencesFile) or $logger->logdie("Unable to open velvet Sequences file $sequencesFile : $!"); | |
208 open(OUTPUT_FILE, '>'.$outputFile) or $logger->logdie("Unable to create output file $outputFile : $!"); | |
209 | |
210 my $sequenceFileObject = Tools::Fasta->new(file => $velvetUnusedReads); | |
211 | |
212 while(my $line=<UNUSED_FILE>){ | |
213 if($line =~ /^>SEQUENCE_([^_]+)/){ | |
214 $singlets{$1} = 1; | |
215 } | |
216 } | |
217 close UNUSED_FILE; | |
218 | |
219 while(my $line=<SEQUENCES_FILE>){ | |
220 if($line =~ /^>/){ | |
221 ($id, $number) = split("\t", $line); | |
222 $line = $id . "\n"; | |
223 } | |
224 if(exists $singlets{$number} && $singlets{$number} == 1){ | |
225 print OUTPUT_FILE $line; | |
226 } | |
227 } | |
228 close SEQUENCES_FILE; | |
229 close OUTPUT_FILE; | |
230 } | |
231 | |
232 =head1 NAME | |
233 | |
234 velvet.pl | |
235 | |
236 =head1 SYNOPSIS | |
237 | |
238 perl velvet.pl -hash_length HASH_LENGTH [-directory OUTPUT_DIRECTORY] [-meta] [OPTIONS VELVETH / VELVETG / METAVELVETG] | |
239 | |
240 =head1 DESCRIPTION | |
241 | |
242 Run velvet or meta-velvetg assembly. | |
243 | |
244 =head1 OPTIONS | |
245 | |
246 =over 4 | |
247 | |
248 =item -directory [DIRECTORY] | |
249 | |
250 directory path for output files | |
251 | |
252 Default is . | |
253 | |
254 =item -hash_length [INTEGER|m,M,s] | |
255 | |
256 EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced) | |
257 | |
258 OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s | |
259 | |
260 =item -meta|nometa | |
261 | |
262 Perform a metagenomic assembly using meta-velvetg ? | |
263 | |
264 Default is -meta | |
265 | |
266 =back | |
267 | |
268 =head1 VELVETH OPTIONS | |
269 | |
270 File format options: | |
271 | |
272 -fasta -fastq -raw -fasta_gz -fastq_gz -raw_gz -sam -bam -fmtAuto | |
273 | |
274 (Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression : gunzip, pbunzip2, bunzip2 | |
275 | |
276 File layout options for paired reads (only for fasta and fastq formats): | |
277 | |
278 -interleaved : File contains paired reads interleaved in the one file (default) | |
279 | |
280 -separate : Read 2 separate files for paired reads | |
281 | |
282 Read type options: | |
283 | |
284 -short -shortPaired | |
285 | |
286 -short2 -shortPaired2 | |
287 | |
288 -long -longPaired | |
289 | |
290 -reference | |
291 | |
292 Options: | |
293 | |
294 -strand_specific : for strand specific transcriptome sequencing data (default: off) | |
295 | |
296 -reuse_Sequences : reuse Sequences file (or link) already in directory (no need to provide original filenames in this case (default: off) | |
297 | |
298 -reuse_binary : reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case (default: off) | |
299 | |
300 -noHash : simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off) | |
301 | |
302 -create_binary : create binary CnyUnifiedSeq file (default: off) | |
303 | |
304 Outputs: | |
305 | |
306 directory/Roadmaps | |
307 | |
308 directory/Sequences | |
309 | |
310 [Both files are picked up by graph, so please leave them there] | |
311 | |
312 =head1 VELVETG OPTIONS | |
313 | |
314 Standard options: | |
315 | |
316 -cov_cutoff <floating-point|auto> : removal of low coverage nodes AFTER tour bus or allow the system to infer it. (default: no removal) | |
317 | |
318 -ins_length <integer> : expected distance between two paired end reads (default: no read pairing) | |
319 | |
320 -read_trkg <yes|no> : tracking of short read positions in assembly (default: no tracking) | |
321 | |
322 -min_contig_lgth <integer> : minimum contig length exported to contigs.fa file (default: hash length * 2) | |
323 | |
324 -amos_file <yes|no> : export assembly to AMOS file (default: no export) | |
325 | |
326 -exp_cov <floating point|auto> : expected coverage of unique regions or allow the system to infer it (default: no long or paired-end read resolution) | |
327 | |
328 In case of metagenomic analysis, exp_cov value will be set to auto. | |
329 | |
330 -long_cov_cutoff <floating-point>: removal of nodes with low long-read coverage AFTER tour bus (default: no removal) | |
331 | |
332 Advanced options: | |
333 | |
334 -ins_length* <integer> : expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing) | |
335 | |
336 -ins_length_long <integer> : expected distance between two long paired-end reads (default: no read pairing) | |
337 | |
338 -ins_length*_sd <integer> : est. standard deviation of respective dataset (default: 10% of corresponding length) | |
339 | |
340 [replace '*' by nothing, '2' or '_long' as necessary] | |
341 | |
342 -scaffolding <yes|no> : scaffolding of contigs used paired end information (default: on) | |
343 | |
344 -max_branch_length <integer> : maximum length in base pair of bubble (default: 100) | |
345 | |
346 -max_divergence <floating-point>: maximum divergence rate between two branches in a bubble (default: 0.2) | |
347 | |
348 -max_gap_count <integer> : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3) | |
349 | |
350 -min_pair_count <integer> : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5) | |
351 | |
352 -max_coverage <floating point> : removal of high coverage nodes AFTER tour bus (default: no removal) | |
353 | |
354 -coverage_mask <int> : minimum coverage required for confident regions of contigs (default: 1) | |
355 | |
356 -long_mult_cutoff <int> : minimum number of long reads required to merge contigs (default: 2) | |
357 | |
358 -unused_reads <yes|no> : export unused reads in UnusedReads.fa file (default: no) | |
359 | |
360 -alignments <yes|no> : export a summary of contig alignment to the reference sequences (default: no) | |
361 | |
362 -exportFiltered <yes|no> : export the long nodes which were eliminated by the coverage filters (default: no) | |
363 | |
364 -clean <yes|no> : remove all the intermediary files which are useless for recalculation (default : no) | |
365 | |
366 -very_clean <yes|no> : remove all the intermediary files (no recalculation possible) (default: no) | |
367 | |
368 -paired_exp_fraction <double> : remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1) | |
369 | |
370 -shortMatePaired* <yes|no> : for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no) | |
371 | |
372 -conserveLong <yes|no> : preserve sequences with long reads in them (default no) | |
373 | |
374 Output: | |
375 | |
376 directory/contigs.fa : fasta file of contigs longer than twice hash length | |
377 | |
378 directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff | |
379 | |
380 directory/LastGraph : special formatted file with all the information on the final graph | |
381 | |
382 directory/velvet_asm.afg : (if requested) AMOS compatible assembly file | |
383 | |
384 =head1 META-VELVETG OPTIONS | |
385 | |
386 Graph-splitting options (metagenome-specific): | |
387 | |
388 -discard_chimera <yes|no> : discard chimera sub-graph (default: no) | |
389 | |
390 -max_chimera_rate <double> : maximum allowable chimera rate (default: 0.0) | |
391 | |
392 -repeat_cov_sd : standard deviation of repeat node coverages (default: 0.1) | |
393 | |
394 -min_split_length <int> : minimum node length required for repeat resolution (default: 0) | |
395 | |
396 -valid_connections <int> : minimum allowable number of consistent paired-end connections (default: 1) | |
397 | |
398 -noise_connections <int> : maximum allowable number of inconsistent paired-end connections (default: 0) | |
399 | |
400 -use_connections <yes|no> : use paired-end connections for graph splitting (default: yes) | |
401 | |
402 -report_split_detail <yes|no> : report sequences around repeat nodes (default: no) | |
403 | |
404 -report_subgraph <yes|no> : report node sequences for each subgraph (default: no) | |
405 | |
406 Peak detection options (metagenome-specific): | |
407 | |
408 -exp_covs <string|auto> : expected coverages for each species in microbiome (default: auto) | |
409 | |
410 ex : -exp_covs 214_122_70_43_25_13.5 | |
411 | |
412 coverage values should be sorted in a descending order | |
413 | |
414 -min_peak_cov <double> : minimum peak coverage (default: 0) | |
415 | |
416 -max_peak_cov <double> : maximum peak coverage (default: 500) | |
417 | |
418 -histo_bin_width <double> : bin width of peak coverage histogram (default: 1) | |
419 | |
420 -histo_sn_ratio <double> : signal-noise ratio to remove peak noises (default: 10) | |
421 | |
422 Output: | |
423 | |
424 directory/meta-velvetg.contigs.fa : fasta file of contigs longer than twice hash length | |
425 | |
426 directory/meta-velvetg.LastGraph : special formatted file with all the information on the final graph | |
427 | |
428 directory/meta-velvetg.Graph2-stats.txt : stats file (tab-delimited) useful for optimizing coverage peak values | |
429 | |
430 directory/meta-velvetg.split-stats.txt : stats file (tab-delimited) useful for optimizing graph-splitting parameters | |
431 | |
432 =cut |