#!/usr/bin/perl

# velvet.pl
#
# Wrapper around the velvet assembler: runs velveth, then velvetg and,
# unless -nometa is given, meta-velvetg for a metagenomic assembly.
# Optionally collects the reads velvet left unused into a singlets file.
#
# Reconstructed from a whitespace-mangled hg changeset
# (c979f8682b21, user stheil, 2015-09-24); the patch framing was dropped.

use strict;
use warnings;
use Logger::Logger;   # NOTE(review): $logger is used below but never declared;
                      # presumably exported by Logger::Logger — confirm.
use Getopt::Long;
use Tools::Fasta;
use Pod::Usage;

# Command-line state.
my $directory;                          # -d : output directory (default '.')
my $hashLength;                         # velveth hash length (required)
my $fileString;                         # accumulated velveth file/format arguments
my $performMetagenomicAssembly = 1;     # -meta / -nometa
my $man;
my $help;

# Options routed to each tool. The 'gm' hash is shared by velvetg AND
# meta-velvetg; the others are tool-specific.
my $velvethOptions     = {};
my $velvetgOptions     = {};
my $velvetgmOptions    = {};
my $metaVelvetgOptions = {};
my $lastOptFile        = '';            # last file-type option seen, so repeated
                                        # values are grouped under one flag

GetOptions(
    'd|directory=s' => \$directory,
    'hash_length=s' => \$hashLength,
    'm|meta!'       => \$performMetagenomicAssembly,
    'man'           => \$man,
    'h|help'        => \$help,

    # velveth read-type / format / layout options: all share the same
    # registration callback, so generate the spec pairs in one place.
    (map { ("$_:s{,}" => sub { registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_) }) }
        qw(short short2 short3 shortPaired shortPaired2 shortPaired3
           long longPaired reference fasta fastq raw sam bam
           fmtAuto interleaved separate)),

    # Compressed formats: s/_/./ turns e.g. 'fasta_gz' into velveth's
    # '-fasta.gz' flag before registration.
    (map { ("$_:s{,}" => sub { $_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_) }) }
        qw(fasta_gz fastq_gz raw_gz)),

    # velveth on/off flags (no value allowed).
    # FIX: strand_specific originally had no ':s' spec, so Getopt::Long passed
    # value 1 and registerOnOffOption rejected the bare flag with logdie;
    # ':s' matches its siblings and makes the flag usable.
    (map { ("$_:s" => sub { registerOnOffOption($velvethOptions, @_) }) }
        qw(strand_specific reuse_Sequences reuse_binary noHash create_binary)),

    # Options shared by velvetg and meta-velvetg (typed values).
    'cov_cutoff=f'           => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length=i'           => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'min_contig_lgth=i'      => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'amos_file=s'            => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'exp_cov=s'              => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'long_cov_cutoff=f'      => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length_long=i'      => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length2=i'          => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length_sd=i'        => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length_long_sd=i'   => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'ins_length2_sd=i'       => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'scaffolding=s'          => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'max_branch_length=i'    => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'max_divergence=f'       => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'max_gap_count=i'        => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'min_pair_count=i'       => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'max_coverage=f'         => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'coverage_mask=i'        => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    'long_mult_cutoff=i'     => sub { registerScalarOptionHash($velvetgmOptions, @_) },
    # FIX: paired_exp_fraction is a float ('=f') but was routed through
    # registerYesNoOption, which dies unless the value is 'yes'/'no' —
    # any legal value was rejected. It is a plain scalar option.
    'paired_exp_fraction=f'  => sub { registerScalarOptionHash($velvetgmOptions, @_) },

    # Shared yes/no options.
    (map { ("$_=s" => sub { registerYesNoOption($velvetgmOptions, @_) }) }
        qw(unused_reads alignments exportFiltered shortMatePaired)),

    # velvetg-only yes/no options.
    (map { ("$_=s" => sub { registerYesNoOption($velvetgOptions, @_) }) }
        qw(read_trkg clean very_clean conserveLong)),

    # meta-velvetg-only options.
    'max_chimera_rate=f'     => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'repeat_cov_sd=f'        => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'min_split_length=i'     => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'valid_connections=i'    => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'noise_connections=i'    => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'exp_covs=s'             => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'min_peak_cov=f'         => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'max_peak_cov=f'         => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'histo_bin_width=f'      => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    'histo_sn_ratio=f'       => sub { registerScalarOptionHash($metaVelvetgOptions, @_) },
    (map { ("$_=s" => sub { registerYesNoOption($metaVelvetgOptions, @_) }) }
        qw(discard_chimera use_connections report_split_detail report_subgraph)),

) or pod2usage( "Try '$0 --help' for more information." );

pod2usage( -verbose => 2 ) if $man;
# FIX: -hash_length is mandatory for velveth (see SYNOPSIS) but was never
# checked; a missing value produced an undef-interpolation warning and a
# broken velveth command line.
pod2usage( -verbose => 1 ) if ($help || ! defined $fileString || ! defined $hashLength);

Logger::Logger->changeMode(0);

# Default to the current directory and make sure the path ends with '/'.
if(!defined $directory || $directory eq ''){
    $directory = '.';
}
$directory .= '/';

# --- Running velveth -------------------------------------------------------
my $velvethCommand = 'velveth ' . $directory . ' ' . $hashLength . ' '
                   . $fileString . ' '
                   . convertOptionHashToCommandLine($velvethOptions);
$logger->info('Running velveth...');
$logger->info($velvethCommand);
# NOTE(review): exit status is not checked; a velveth failure goes unnoticed.
`$velvethCommand`;

# --- Running velvetg -------------------------------------------------------
# For metagenomic assemblies, expected coverage is always inferred.
if($performMetagenomicAssembly){
    $velvetgOptions->{'exp_cov'} = 'auto';
}
my $velvetgCommand = 'velvetg ' . $directory . ' '
                   . convertOptionHashToCommandLine($velvetgOptions) . ' '
                   . convertOptionHashToCommandLine($velvetgmOptions);
$logger->info('Running velvetg...');
$logger->info($velvetgCommand);
`$velvetgCommand`;

# --- Running meta-velvetg --------------------------------------------------
if ($performMetagenomicAssembly){
    my $metaVelvetCommand = 'meta-velvetg ' . $directory . ' '
                          . convertOptionHashToCommandLine($metaVelvetgOptions) . ' '
                          . convertOptionHashToCommandLine($velvetgmOptions);
    $logger->info('Running meta-velvetg...');
    $logger->info($metaVelvetCommand);
    `$metaVelvetCommand`;
}

# Collect the unused reads into a singlets fasta file when requested.
if(exists $velvetgmOptions->{'unused_reads'} && $velvetgmOptions->{'unused_reads'} eq 'yes'){
    createSingleFile($directory.'UnusedReads.fa', $directory.'Sequences', $directory.'Singlets.fasta');
}

# Turn an option hash into a '-opt value' command-line fragment.
# Array-ref values repeat the flag before every element; undef values are
# skipped. Returns the fragment with a trailing space.
sub convertOptionHashToCommandLine{
    my ($optionHash) = @_;
    my $commandLineString = '';

    foreach my $opt (keys %$optionHash){
        next unless defined $optionHash->{$opt};
        if(ref $optionHash->{$opt}){
            $commandLineString .= '-' . $opt . ' '
                               . join(' -' . $opt . ' ', @{ $optionHash->{$opt} }) . ' ';
        }
        else{
            $commandLineString .= '-' . $opt . ' ' . $optionHash->{$opt} . ' ';
        }
    }

    return $commandLineString;
}

# Register an option whose only legal values are 'yes' or 'no'; dies otherwise.
sub registerYesNoOption{
    my ($hash, $optionName, $optionValue) = @_;

    if(defined $optionValue && ($optionValue eq 'no' || $optionValue eq 'yes')){
        registerScalarOptionHash(@_);
    }
    else{
        $logger->logdie("Option '$optionName' must be yes or no\n");
    }
}

# Register a bare flag; dies if the user supplied a value.
sub registerOnOffOption{
    my ($hash, $optionName, $optionValue) = @_;

    if(! defined $optionValue || ($optionValue eq '')){
        registerScalarOptionHash(@_);
    }
    else{
        $logger->logdie("No value allowed for option '$optionName'\n");
    }
}

# Store a single option value in the given option hash.
sub registerScalarOptionHash{
    my ($hash, $optionName, $optionValue) = @_;
    $hash->{$optionName} = $optionValue;
}

# Append a file argument to the velveth file string, emitting the '-option'
# flag only when it differs from the previous one so that consecutive values
# for the same option are grouped under a single flag.
sub registerVelvetFileOptionHash{
    my ($fileString, $lastOptFile, $optionName, $optionValue) = @_;

    if($$lastOptFile ne $optionName){
        $$fileString .= ' -' . $optionName;
    }
    $$fileString .= ' ' . $optionValue;
    $$lastOptFile = $optionName;
}

# Extract from the velvet Sequences file the reads listed in UnusedReads.fa
# and write them (with their original headers, tab-fields stripped) to
# $outputFile.
sub createSingleFile{
    my ($velvetUnusedReads, $sequencesFile, $outputFile) = @_;

    # Preserve the original (odd) default of an empty output path.
    if(! defined $outputFile){
        $outputFile = '';
    }

    my %singlets;
    my $id;
    my $number;

    # FIX: the readline operators were lost in transit ('while(my $line=){');
    # restored with lexical filehandles and 3-arg open.
    open(my $unused_fh, '<', $velvetUnusedReads)
        or $logger->logdie("Unable to open velvet UnusedReads file $velvetUnusedReads : $!");
    # UnusedReads.fa headers look like '>SEQUENCE_<n>...'; remember each n.
    while(my $line = <$unused_fh>){
        if($line =~ /^>SEQUENCE_([^_]+)/){
            $singlets{$1} = 1;
        }
    }
    close $unused_fh;

    open(my $seq_fh, '<', $sequencesFile)
        or $logger->logdie("Unable to open velvet Sequences file $sequencesFile : $!");
    open(my $out_fh, '>', $outputFile)
        or $logger->logdie("Unable to create output file $outputFile : $!");

    # Sequences headers are tab-delimited: '>name\t<number>\t<category>'.
    # $number persists across the sequence lines that follow its header.
    while(my $line = <$seq_fh>){
        if($line =~ /^>/){
            # FIX: chomp before splitting so a 2-field header cannot leave a
            # trailing newline on $number (which would never match %singlets).
            chomp(my $header = $line);
            ($id, $number) = split("\t", $header);
            $line = $id . "\n";
        }
        if(defined $number && exists $singlets{$number} && $singlets{$number} == 1){
            print {$out_fh} $line;
        }
    }
    close $seq_fh;
    close $out_fh;
}

=head1 NAME

velvet.pl

=head1 SYNOPSIS

perl velvet.pl -hash_length HASH_LENGTH [-directory OUTPUT_DIRECTORY] [-meta] [OPTIONS VELVETH / VELVETG / METAVELVETG]

=head1 DESCRIPTION

Run velvet or meta-velvetg assembly.

=head1 OPTIONS

=over 4

=item -directory [DIRECTORY]

directory path for output files

Default is .

=item -hash_length [INTEGER|m,M,s]

EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced)

OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s

=item -meta|nometa

Perform a metagenomic assembly using meta-velvetg ?

Default is -meta

=back

=head1 VELVETH OPTIONS

File format options:

  -fasta -fastq -raw -fasta_gz -fastq_gz -raw_gz -sam -bam -fmtAuto

  (Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression : gunzip, pbunzip2, bunzip2)

File layout options for paired reads (only for fasta and fastq formats):

  -interleaved : File contains paired reads interleaved in the one file (default)

  -separate : Read 2 separate files for paired reads

Read type options:

  -short -shortPaired

  -short2 -shortPaired2

  -long -longPaired

  -reference

Options:

  -strand_specific : for strand specific transcriptome sequencing data (default: off)

  -reuse_Sequences : reuse Sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)

  -reuse_binary : reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)

  -noHash : simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off)

  -create_binary : create binary CnyUnifiedSeq file (default: off)

Outputs:

  directory/Roadmaps

  directory/Sequences

  [Both files are picked up by graph, so please leave them there]

=head1 VELVETG OPTIONS

Standard options:

  -cov_cutoff : removal of low coverage nodes AFTER tour bus or allow the system to infer it. (default: no removal)

  -ins_length : expected distance between two paired end reads (default: no read pairing)

  -read_trkg : tracking of short read positions in assembly (default: no tracking)

  -min_contig_lgth : minimum contig length exported to contigs.fa file (default: hash length * 2)

  -amos_file : export assembly to AMOS file (default: no export)

  -exp_cov : expected coverage of unique regions or allow the system to infer it (default: no long or paired-end read resolution)

  In case of metagenomic analysis, exp_cov value will be set to auto.

  -long_cov_cutoff : removal of nodes with low long-read coverage AFTER tour bus (default: no removal)

Advanced options:

  -ins_length* : expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing)

  -ins_length_long : expected distance between two long paired-end reads (default: no read pairing)

  -ins_length*_sd : est. standard deviation of respective dataset (default: 10% of corresponding length)

  [replace '*' by nothing, '2' or '_long' as necessary]

  -scaffolding : scaffolding of contigs using paired end information (default: on)

  -max_branch_length : maximum length in base pair of bubble (default: 100)

  -max_divergence : maximum divergence rate between two branches in a bubble (default: 0.2)

  -max_gap_count : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)

  -min_pair_count : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5)

  -max_coverage : removal of high coverage nodes AFTER tour bus (default: no removal)

  -coverage_mask : minimum coverage required for confident regions of contigs (default: 1)

  -long_mult_cutoff : minimum number of long reads required to merge contigs (default: 2)

  -unused_reads : export unused reads in UnusedReads.fa file (default: no)

  -alignments : export a summary of contig alignment to the reference sequences (default: no)

  -exportFiltered : export the long nodes which were eliminated by the coverage filters (default: no)

  -clean : remove all the intermediary files which are useless for recalculation (default : no)

  -very_clean : remove all the intermediary files (no recalculation possible) (default: no)

  -paired_exp_fraction : remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)

  -shortMatePaired* : for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)

  -conserveLong : preserve sequences with long reads in them (default no)

Output:

  directory/contigs.fa : fasta file of contigs longer than twice hash length

  directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff

  directory/LastGraph : special formatted file with all the information on the final graph

  directory/velvet_asm.afg : (if requested) AMOS compatible assembly file

=head1 META-VELVETG OPTIONS

Graph-splitting options (metagenome-specific):

  -discard_chimera : discard chimera sub-graph (default: no)

  -max_chimera_rate : maximum allowable chimera rate (default: 0.0)

  -repeat_cov_sd : standard deviation of repeat node coverages (default: 0.1)

  -min_split_length : minimum node length required for repeat resolution (default: 0)

  -valid_connections : minimum allowable number of consistent paired-end connections (default: 1)

  -noise_connections : maximum allowable number of inconsistent paired-end connections (default: 0)

  -use_connections : use paired-end connections for graph splitting (default: yes)

  -report_split_detail : report sequences around repeat nodes (default: no)

  -report_subgraph : report node sequences for each subgraph (default: no)

Peak detection options (metagenome-specific):

  -exp_covs : expected coverages for each species in microbiome (default: auto)

  ex : -exp_covs 214_122_70_43_25_13.5

  coverage values should be sorted in a descending order

  -min_peak_cov : minimum peak coverage (default: 0)

  -max_peak_cov : maximum peak coverage (default: 500)

  -histo_bin_width : bin width of peak coverage histogram (default: 1)

  -histo_sn_ratio : signal-noise ratio to remove peak noises (default: 10)

Output:

  directory/meta-velvetg.contigs.fa : fasta file of contigs longer than twice hash length

  directory/meta-velvetg.LastGraph : special formatted file with all the information on the final graph

  directory/meta-velvetg.Graph2-stats.txt : stats file (tab-delimited) useful for optimizing coverage peak values

  directory/meta-velvetg.split-stats.txt : stats file (tab-delimited) useful for optimizing graph-splitting parameters

=cut