3
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Logger::Logger;
|
|
5 use Getopt::Long;
|
|
6 use Tools::Fasta;
|
|
7 use Pod::Usage;
|
|
8
|
|
9 my $directory;
|
|
10 my $hashLength;
|
|
11 my $fileString;
|
|
12 my $performMetagenomicAssembly = 1;
|
|
13 my $man;
|
|
14 my $help;
|
|
15
|
|
16 my $velvethOptions = {};
|
|
17 my $velvetgOptions = {};
|
|
18 my $velvetgmOptions = {};
|
|
19 my $metaVelvetgOptions = {};
|
|
20 my $lastOptFile ='';
|
|
21
|
|
22 GetOptions(
|
|
23
|
|
24 'd|directory=s' => \$directory,
|
|
25 'hash_length=s' => \$hashLength,
|
|
26 'm|meta!' => \$performMetagenomicAssembly,
|
|
27 'man' => \$man,
|
|
28 'h|help' => \$help,
|
|
29 'short:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
30 'short2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
31 'short3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
32 'shortPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
33 'shortPaired2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
34 'shortPaired3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
35 'long:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
36 'longPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
37 'reference:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
38 'fasta:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
39 'fastq:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
40 'raw:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
41 'fasta_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
42 'fastq_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
43 'raw_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
44 'sam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
45 'bam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
46 'fmtAuto:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
47 'interleaved:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
48 'separate:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
|
|
49 'strand_specific' => sub{registerOnOffOption($velvethOptions, @_)},
|
|
50 'reuse_Sequences:s' => sub{registerOnOffOption($velvethOptions, @_)},
|
|
51 'reuse_binary:s' => sub{registerOnOffOption($velvethOptions, @_)},
|
|
52 'noHash:s' => sub{registerOnOffOption($velvethOptions, @_)},
|
|
53 'create_binary:s' => sub{registerOnOffOption($velvethOptions, @_)},
|
|
54
|
|
55 'cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
56 'ins_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
57 'read_trkg=s' => sub{registerYesNoOption($velvetgOptions, @_)},
|
|
58 'min_contig_lgth=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
59 'amos_file=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
60 'exp_cov=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
61 'long_cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
62 'ins_length_long=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
63 'ins_length2=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
64 'ins_length_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
65 'ins_length_long_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
66 'ins_length2_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
67 'scaffolding=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
68 'max_branch_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
69 'max_divergence=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
70 'max_gap_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
71 'min_pair_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
72 'max_coverage=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
73 'coverage_mask=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
74 'long_mult_cutoff=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
|
|
75 'unused_reads=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
|
|
76 'alignments=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
|
|
77 'exportFiltered=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
|
|
78 'clean=s' => sub{registerYesNoOption($velvetgOptions, @_)},
|
|
79 'very_clean=s' => sub{registerYesNoOption($velvetgOptions, @_)},
|
|
80 'paired_exp_fraction=f' => sub{registerYesNoOption($velvetgmOptions, @_)},
|
|
81 'shortMatePaired=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
|
|
82 'conserveLong=s' => sub{registerYesNoOption($velvetgOptions, @_)},
|
|
83
|
|
84 'discard_chimera=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
|
|
85 'max_chimera_rate=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
86 'repeat_cov_sd=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
87 'min_split_length=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
88 'valid_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
89 'noise_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
90 'use_connections=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
|
|
91 'report_split_detail=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
|
|
92 'report_subgraph=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
|
|
93 'exp_covs=s' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
94 'min_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
95 'max_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
96 'histo_bin_width=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
97 'histo_sn_ratio=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
|
|
98
|
|
99 ) or pod2usage( "Try '$0 --help' for more information." );
|
|
100
|
|
101 pod2usage( -verbose => 2 ) if $man;
|
|
102 pod2usage( -verbose => 1 ) if ($help || ! defined $fileString);
|
|
103
|
|
104 Logger::Logger->changeMode(0);
|
|
105
|
|
106 if(!defined $directory || $directory eq ''){
|
|
107 $directory = '.';
|
|
108 }
|
|
109
|
|
110 $directory .= '/';
|
|
111
|
|
112 #Running velveth
|
|
113 my $velvethCommand = 'velveth ' . $directory . ' ' . $hashLength . ' ' . $fileString . ' ' . convertOptionHashToCommandLine($velvethOptions);
|
|
114 $logger->info('Running velveth...');
|
|
115 $logger->info($velvethCommand);
|
|
116 `$velvethCommand`;
|
|
117
|
|
118 #Running velvetg
|
|
119 if($performMetagenomicAssembly){
|
|
120 $velvetgOptions->{'exp_cov'} = 'auto';
|
|
121 }
|
|
122 my $velvetgCommand = 'velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($velvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions);
|
|
123 $logger->info('Running velvetg...');
|
|
124 $logger->info($velvetgCommand);
|
|
125 `$velvetgCommand`;
|
|
126
|
|
127 #Running meta-velvetg
|
|
128 if ($performMetagenomicAssembly){
|
|
129 my $metaVelvetCommand = 'meta-velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($metaVelvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions);
|
|
130 $logger->info('Running meta-velvetg...');
|
|
131 $logger->info($metaVelvetCommand);
|
|
132 `$metaVelvetCommand`;
|
|
133 }
|
|
134
|
|
135 if(exists $velvetgmOptions->{'unused_reads'} && $velvetgmOptions->{'unused_reads'} eq 'yes'){
|
|
136 createSingleFile($directory.'UnusedReads.fa', $directory.'Sequences', $directory.'Singlets.fasta');
|
|
137 }
|
|
138
|
|
139 sub convertOptionHashToCommandLine{
|
|
140 my ($optionHash) = @_;
|
|
141 my $commandLineString = '';
|
|
142
|
|
143 foreach my $opt (keys %$optionHash){
|
|
144 if(defined $optionHash->{$opt}){
|
|
145 if(ref $optionHash->{$opt}){
|
|
146 $commandLineString .= '-' . $opt . ' ' . join(' -'.$opt.' ',@{$optionHash->{$opt}}) . ' ';
|
|
147 }
|
|
148 else{
|
|
149 $commandLineString .= '-' . $opt . ' ' . $optionHash->{$opt} . ' ';
|
|
150 }
|
|
151 }
|
|
152 }
|
|
153
|
|
154 return $commandLineString;
|
|
155 }
|
|
156
|
|
157 sub registerYesNoOption{
|
|
158 my ($hash, $optionName, $optionValue) = @_;
|
|
159
|
|
160 if(defined $optionValue && ($optionValue eq 'no' || $optionValue eq 'yes')){
|
|
161 registerScalarOptionHash(@_);
|
|
162 }
|
|
163 else{
|
|
164 $logger->logdie("Option '$optionName' must be yes or no\n");
|
|
165 }
|
|
166 }
|
|
167
|
|
168 sub registerOnOffOption{
|
|
169 my ($hash, $optionName, $optionValue) = @_;
|
|
170
|
|
171 if(! defined $optionValue || ($optionValue eq '')){
|
|
172 registerScalarOptionHash(@_);
|
|
173 }
|
|
174 else{
|
|
175 $logger->logdie("No value allowed for option '$optionName'\n");
|
|
176 }
|
|
177 }
|
|
178
|
|
179 sub registerScalarOptionHash{
|
|
180 my ($hash, $optionName, $optionValue) = @_;
|
|
181 $hash->{$optionName} = $optionValue;
|
|
182 }
|
|
183
|
|
184 sub registerVelvetFileOptionHash{
|
|
185 my ($fileString, $lastOptFile, $optionName, $optionValue) = @_;
|
|
186
|
|
187 if($$lastOptFile ne $optionName){
|
|
188 $$fileString .= ' -' . $optionName;
|
|
189 }
|
|
190 $$fileString .= ' ' . $optionValue;
|
|
191 $$lastOptFile = $optionName;
|
|
192 }
|
|
193
|
|
194 sub createSingleFile{
|
|
195 my ($velvetUnusedReads, $sequencesFile, $outputFile) = @_;
|
|
196
|
|
197 if(! defined $outputFile){
|
|
198 $outputFile = '';
|
|
199 }
|
|
200
|
|
201 my $sequenceNumber;
|
|
202 my %singlets;
|
|
203 my $id;
|
|
204 my $number;
|
|
205
|
|
206 open(UNUSED_FILE, $velvetUnusedReads) or $logger->logdie("Unable to open velvet UnusedReads file $velvetUnusedReads : $!");
|
|
207 open(SEQUENCES_FILE, $sequencesFile) or $logger->logdie("Unable to open velvet Sequences file $sequencesFile : $!");
|
|
208 open(OUTPUT_FILE, '>'.$outputFile) or $logger->logdie("Unable to create output file $outputFile : $!");
|
|
209
|
|
210 my $sequenceFileObject = Tools::Fasta->new(file => $velvetUnusedReads);
|
|
211
|
|
212 while(my $line=<UNUSED_FILE>){
|
|
213 if($line =~ /^>SEQUENCE_([^_]+)/){
|
|
214 $singlets{$1} = 1;
|
|
215 }
|
|
216 }
|
|
217 close UNUSED_FILE;
|
|
218
|
|
219 while(my $line=<SEQUENCES_FILE>){
|
|
220 if($line =~ /^>/){
|
|
221 ($id, $number) = split("\t", $line);
|
|
222 $line = $id . "\n";
|
|
223 }
|
|
224 if(exists $singlets{$number} && $singlets{$number} == 1){
|
|
225 print OUTPUT_FILE $line;
|
|
226 }
|
|
227 }
|
|
228 close SEQUENCES_FILE;
|
|
229 close OUTPUT_FILE;
|
|
230 }
|
|
231
|
|
232 =head1 NAME
|
|
233
|
|
234 velvet.pl
|
|
235
|
|
236 =head1 SYNOPSIS
|
|
237
|
|
238 perl velvet.pl -hash_length HASH_LENGTH [-directory OUTPUT_DIRECTORY] [-meta] [OPTIONS VELVETH / VELVETG / METAVELVETG]
|
|
239
|
|
240 =head1 DESCRIPTION
|
|
241
|
|
242 Run velvet or meta-velvetg assembly.
|
|
243
|
|
244 =head1 OPTIONS
|
|
245
|
|
246 =over 4
|
|
247
|
|
248 =item -directory [DIRECTORY]
|
|
249
|
|
250 directory path for output files
|
|
251
|
|
252 Default is .
|
|
253
|
|
254 =item -hash_length [INTEGER|m,M,s]
|
|
255
|
|
256 EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced)
|
|
257
|
|
258 OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s
|
|
259
|
|
260 =item -meta|nometa
|
|
261
|
|
262 Perform a metagenomic assembly using meta-velvetg ?
|
|
263
|
|
264 Default is -meta
|
|
265
|
|
266 =back
|
|
267
|
|
268 =head1 VELVETH OPTIONS
|
|
269
|
|
270 File format options:
|
|
271
|
|
272 -fasta -fastq -raw -fasta_gz -fastq_gz -raw_gz -sam -bam -fmtAuto
|
|
273
|
|
274 (Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression : gunzip, pbunzip2, bunzip2
|
|
275
|
|
276 File layout options for paired reads (only for fasta and fastq formats):
|
|
277
|
|
278 -interleaved : File contains paired reads interleaved in the one file (default)
|
|
279
|
|
280 -separate : Read 2 separate files for paired reads
|
|
281
|
|
282 Read type options:
|
|
283
|
|
284 -short -shortPaired
|
|
285
|
|
286 -short2 -shortPaired2
|
|
287
|
|
288 -long -longPaired
|
|
289
|
|
290 -reference
|
|
291
|
|
292 Options:
|
|
293
|
|
294 -strand_specific : for strand specific transcriptome sequencing data (default: off)
|
|
295
|
|
296 -reuse_Sequences : reuse Sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)
|
|
297
|
|
298 -reuse_binary : reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)
|
|
299
|
|
300 -noHash : simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off)
|
|
301
|
|
302 -create_binary : create binary CnyUnifiedSeq file (default: off)
|
|
303
|
|
304 Outputs:
|
|
305
|
|
306 directory/Roadmaps
|
|
307
|
|
308 directory/Sequences
|
|
309
|
|
310 [Both files are picked up by graph, so please leave them there]
|
|
311
|
|
312 =head1 VELVETG OPTIONS
|
|
313
|
|
314 Standard options:
|
|
315
|
|
316 -cov_cutoff <floating-point|auto> : removal of low coverage nodes AFTER tour bus or allow the system to infer it. (default: no removal)
|
|
317
|
|
318 -ins_length <integer> : expected distance between two paired end reads (default: no read pairing)
|
|
319
|
|
320 -read_trkg <yes|no> : tracking of short read positions in assembly (default: no tracking)
|
|
321
|
|
322 -min_contig_lgth <integer> : minimum contig length exported to contigs.fa file (default: hash length * 2)
|
|
323
|
|
324 -amos_file <yes|no> : export assembly to AMOS file (default: no export)
|
|
325
|
|
326 -exp_cov <floating point|auto> : expected coverage of unique regions or allow the system to infer it (default: no long or paired-end read resolution)
|
|
327
|
|
328 In case of metagenomic analysis, exp_cov value will be set to auto.
|
|
329
|
|
330 -long_cov_cutoff <floating-point>: removal of nodes with low long-read coverage AFTER tour bus (default: no removal)
|
|
331
|
|
332 Advanced options:
|
|
333
|
|
334 -ins_length* <integer> : expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing)
|
|
335
|
|
336 -ins_length_long <integer> : expected distance between two long paired-end reads (default: no read pairing)
|
|
337
|
|
338 -ins_length*_sd <integer> : est. standard deviation of respective dataset (default: 10% of corresponding length)
|
|
339
|
|
340 [replace '*' by nothing, '2' or '_long' as necessary]
|
|
341
|
|
342 -scaffolding <yes|no> : scaffolding of contigs used paired end information (default: on)
|
|
343
|
|
344 -max_branch_length <integer> : maximum length in base pair of bubble (default: 100)
|
|
345
|
|
346 -max_divergence <floating-point>: maximum divergence rate between two branches in a bubble (default: 0.2)
|
|
347
|
|
348 -max_gap_count <integer> : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)
|
|
349
|
|
350 -min_pair_count <integer> : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5)
|
|
351
|
|
352 -max_coverage <floating point> : removal of high coverage nodes AFTER tour bus (default: no removal)
|
|
353
|
|
354 -coverage_mask <int> : minimum coverage required for confident regions of contigs (default: 1)
|
|
355
|
|
356 -long_mult_cutoff <int> : minimum number of long reads required to merge contigs (default: 2)
|
|
357
|
|
358 -unused_reads <yes|no> : export unused reads in UnusedReads.fa file (default: no)
|
|
359
|
|
360 -alignments <yes|no> : export a summary of contig alignment to the reference sequences (default: no)
|
|
361
|
|
362 -exportFiltered <yes|no> : export the long nodes which were eliminated by the coverage filters (default: no)
|
|
363
|
|
364 -clean <yes|no> : remove all the intermediary files which are useless for recalculation (default : no)
|
|
365
|
|
366 -very_clean <yes|no> : remove all the intermediary files (no recalculation possible) (default: no)
|
|
367
|
|
368 -paired_exp_fraction <double> : remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)
|
|
369
|
|
370 -shortMatePaired* <yes|no> : for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)
|
|
371
|
|
372 -conserveLong <yes|no> : preserve sequences with long reads in them (default no)
|
|
373
|
|
374 Output:
|
|
375
|
|
376 directory/contigs.fa : fasta file of contigs longer than twice hash length
|
|
377
|
|
378 directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff
|
|
379
|
|
380 directory/LastGraph : special formatted file with all the information on the final graph
|
|
381
|
|
382 directory/velvet_asm.afg : (if requested) AMOS compatible assembly file
|
|
383
|
|
384 =head1 META-VELVETG OPTIONS
|
|
385
|
|
386 Graph-splitting options (metagenome-specific):
|
|
387
|
|
388 -discard_chimera <yes|no> : discard chimera sub-graph (default: no)
|
|
389
|
|
390 -max_chimera_rate <double> : maximum allowable chimera rate (default: 0.0)
|
|
391
|
|
392 -repeat_cov_sd : standard deviation of repeat node coverages (default: 0.1)
|
|
393
|
|
394 -min_split_length <int> : minimum node length required for repeat resolution (default: 0)
|
|
395
|
|
396 -valid_connections <int> : minimum allowable number of consistent paired-end connections (default: 1)
|
|
397
|
|
398 -noise_connections <int> : maximum allowable number of inconsistent paired-end connections (default: 0)
|
|
399
|
|
400 -use_connections <yes|no> : use paired-end connections for graph splitting (default: yes)
|
|
401
|
|
402 -report_split_detail <yes|no> : report sequences around repeat nodes (default: no)
|
|
403
|
|
404 -report_subgraph <yes|no> : report node sequences for each subgraph (default: no)
|
|
405
|
|
406 Peak detection options (metagenome-specific):
|
|
407
|
|
408 -exp_covs <string|auto> : expected coverages for each species in microbiome (default: auto)
|
|
409
|
|
410 ex : -exp_covs 214_122_70_43_25_13.5
|
|
411
|
|
412 coverage values should be sorted in a descending order
|
|
413
|
|
414 -min_peak_cov <double> : minimum peak coverage (default: 0)
|
|
415
|
|
416 -max_peak_cov <double> : maximum peak coverage (default: 500)
|
|
417
|
|
418 -histo_bin_width <double> : bin width of peak coverage histogram (default: 1)
|
|
419
|
|
420 -histo_sn_ratio <double> : signal-noise ratio to remove peak noises (default: 10)
|
|
421
|
|
422 Output:
|
|
423
|
|
424 directory/meta-velvetg.contigs.fa : fasta file of contigs longer than twice hash length
|
|
425
|
|
426 directory/meta-velvetg.LastGraph : special formatted file with all the information on the final graph
|
|
427
|
|
428 directory/meta-velvetg.Graph2-stats.txt : stats file (tab-delimited) useful for optimizing coverage peak values
|
|
429
|
|
430 directory/meta-velvetg.split-stats.txt : stats file (tab-delimited) useful for optimizing graph-splitting parameters
|
|
431
|
|
432 =cut
|