Mercurial > repos > stheil > metavelvet_wrapper
view velvet.pl @ 3:c979f8682b21 draft
Uploaded
author | stheil |
---|---|
date | Thu, 24 Sep 2015 10:42:55 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use Logger::Logger; use Getopt::Long; use Tools::Fasta; use Pod::Usage; my $directory; my $hashLength; my $fileString; my $performMetagenomicAssembly = 1; my $man; my $help; my $velvethOptions = {}; my $velvetgOptions = {}; my $velvetgmOptions = {}; my $metaVelvetgOptions = {}; my $lastOptFile =''; GetOptions( 'd|directory=s' => \$directory, 'hash_length=s' => \$hashLength, 'm|meta!' => \$performMetagenomicAssembly, 'man' => \$man, 'h|help' => \$help, 'short:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'short2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'short3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'shortPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'shortPaired2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'shortPaired3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'long:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'longPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'reference:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'fasta:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'fastq:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'raw:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'fasta_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'fastq_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'raw_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'sam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'bam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'fmtAuto:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'interleaved:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'separate:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)}, 'strand_specific' => sub{registerOnOffOption($velvethOptions, @_)}, 'reuse_Sequences:s' => sub{registerOnOffOption($velvethOptions, @_)}, 'reuse_binary:s' => sub{registerOnOffOption($velvethOptions, @_)}, 'noHash:s' => sub{registerOnOffOption($velvethOptions, @_)}, 'create_binary:s' => sub{registerOnOffOption($velvethOptions, @_)}, 'cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'read_trkg=s' => sub{registerYesNoOption($velvetgOptions, @_)}, 'min_contig_lgth=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'amos_file=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'exp_cov=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'long_cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length_long=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length2=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length_long_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'ins_length2_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'scaffolding=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'max_branch_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'max_divergence=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'max_gap_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'min_pair_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'max_coverage=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'coverage_mask=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'long_mult_cutoff=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)}, 'unused_reads=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, 'alignments=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, 'exportFiltered=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, 'clean=s' => sub{registerYesNoOption($velvetgOptions, @_)}, 'very_clean=s' => sub{registerYesNoOption($velvetgOptions, @_)}, 'paired_exp_fraction=f' => sub{registerYesNoOption($velvetgmOptions, @_)}, 'shortMatePaired=s' => sub{registerYesNoOption($velvetgmOptions, @_)}, 'conserveLong=s' => sub{registerYesNoOption($velvetgOptions, @_)}, 'discard_chimera=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, 'max_chimera_rate=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'repeat_cov_sd=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'min_split_length=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'valid_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'noise_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'use_connections=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, 'report_split_detail=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, 'report_subgraph=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)}, 'exp_covs=s' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'min_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'max_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'histo_bin_width=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, 'histo_sn_ratio=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)}, ) or pod2usage( "Try '$0 --help' for more information." ); pod2usage( -verbose => 2 ) if $man; pod2usage( -verbose => 1 ) if ($help || ! defined $fileString); Logger::Logger->changeMode(0); if(!defined $directory || $directory eq ''){ $directory = '.'; } $directory .= '/'; #Running velveth my $velvethCommand = 'velveth ' . $directory . ' ' . $hashLength . ' ' . $fileString . ' ' . convertOptionHashToCommandLine($velvethOptions); $logger->info('Running velveth...'); $logger->info($velvethCommand); `$velvethCommand`; #Running velvetg if($performMetagenomicAssembly){ $velvetgOptions->{'exp_cov'} = 'auto'; } my $velvetgCommand = 'velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($velvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions); $logger->info('Running velvetg...'); $logger->info($velvetgCommand); `$velvetgCommand`; #Running meta-velvetg if ($performMetagenomicAssembly){ my $metaVelvetCommand = 'meta-velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($metaVelvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions); $logger->info('Running meta-velvetg...'); $logger->info($metaVelvetCommand); `$metaVelvetCommand`; } if(exists $velvetgmOptions->{'unused_reads'} && $velvetgmOptions->{'unused_reads'} eq 'yes'){ createSingleFile($directory.'UnusedReads.fa', $directory.'Sequences', $directory.'Singlets.fasta'); } sub convertOptionHashToCommandLine{ my ($optionHash) = @_; my $commandLineString = ''; foreach my $opt (keys %$optionHash){ if(defined $optionHash->{$opt}){ if(ref $optionHash->{$opt}){ $commandLineString .= '-' . $opt . ' ' . join(' -'.$opt.' ',@{$optionHash->{$opt}}) . ' '; } else{ $commandLineString .= '-' . $opt . ' ' . $optionHash->{$opt} . ' '; } } } return $commandLineString; } sub registerYesNoOption{ my ($hash, $optionName, $optionValue) = @_; if(defined $optionValue && ($optionValue eq 'no' || $optionValue eq 'yes')){ registerScalarOptionHash(@_); } else{ $logger->logdie("Option '$optionName' must be yes or no\n"); } } sub registerOnOffOption{ my ($hash, $optionName, $optionValue) = @_; if(! defined $optionValue || ($optionValue eq '')){ registerScalarOptionHash(@_); } else{ $logger->logdie("No value allowed for option '$optionName'\n"); } } sub registerScalarOptionHash{ my ($hash, $optionName, $optionValue) = @_; $hash->{$optionName} = $optionValue; } sub registerVelvetFileOptionHash{ my ($fileString, $lastOptFile, $optionName, $optionValue) = @_; if($$lastOptFile ne $optionName){ $$fileString .= ' -' . $optionName; } $$fileString .= ' ' . $optionValue; $$lastOptFile = $optionName; } sub createSingleFile{ my ($velvetUnusedReads, $sequencesFile, $outputFile) = @_; if(! defined $outputFile){ $outputFile = ''; } my $sequenceNumber; my %singlets; my $id; my $number; open(UNUSED_FILE, $velvetUnusedReads) or $logger->logdie("Unable to open velvet UnusedReads file $velvetUnusedReads : $!"); open(SEQUENCES_FILE, $sequencesFile) or $logger->logdie("Unable to open velvet Sequences file $sequencesFile : $!"); open(OUTPUT_FILE, '>'.$outputFile) or $logger->logdie("Unable to create output file $outputFile : $!"); my $sequenceFileObject = Tools::Fasta->new(file => $velvetUnusedReads); while(my $line=<UNUSED_FILE>){ if($line =~ /^>SEQUENCE_([^_]+)/){ $singlets{$1} = 1; } } close UNUSED_FILE; while(my $line=<SEQUENCES_FILE>){ if($line =~ /^>/){ ($id, $number) = split("\t", $line); $line = $id . "\n"; } if(exists $singlets{$number} && $singlets{$number} == 1){ print OUTPUT_FILE $line; } } close SEQUENCES_FILE; close OUTPUT_FILE; } =head1 NAME velvet.pl =head1 SYNOPSIS perl velvet.pl -hash_length HASH_LENGTH [-directory OUTPUT_DIRECTORY] [-meta] [OPTIONS VELVETH / VELVETG / METAVELVETG] =head1 DESCRIPTION Run velvet or meta-velvetg assembly. =head1 OPTIONS =over 4 =item -directory [DIRECTORY] directory path for output files Default is . =item -hash_length [INTEGER|m,M,s] EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced) OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s =item -meta|nometa Perform a metagenomic assembly using meta-velvetg ? Default is -meta =back =head1 VELVETH OPTIONS File format options: -fasta -fastq -raw -fasta_gz -fastq_gz -raw_gz -sam -bam -fmtAuto (Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression : gunzip, pbunzip2, bunzip2 File layout options for paired reads (only for fasta and fastq formats): -interleaved : File contains paired reads interleaved in the one file (default) -separate : Read 2 separate files for paired reads Read type options: -short -shortPaired -short2 -shortPaired2 -long -longPaired -reference Options: -strand_specific : for strand specific transcriptome sequencing data (default: off) -reuse_Sequences : reuse Sequences file (or link) already in directory (no need to provide original filenames in this case (default: off) -reuse_binary : reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case (default: off) -noHash : simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off) -create_binary : create binary CnyUnifiedSeq file (default: off) Outputs: directory/Roadmaps directory/Sequences [Both files are picked up by graph, so please leave them there] =head1 VELVETG OPTIONS Standard options: -cov_cutoff <floating-point|auto> : removal of low coverage nodes AFTER tour bus or allow the system to infer it. (default: no removal) -ins_length <integer> : expected distance between two paired end reads (default: no read pairing) -read_trkg <yes|no> : tracking of short read positions in assembly (default: no tracking) -min_contig_lgth <integer> : minimum contig length exported to contigs.fa file (default: hash length * 2) -amos_file <yes|no> : export assembly to AMOS file (default: no export) -exp_cov <floating point|auto> : expected coverage of unique regions or allow the system to infer it (default: no long or paired-end read resolution) In case of metagenomic analysis, exp_cov value will be set to auto. -long_cov_cutoff <floating-point>: removal of nodes with low long-read coverage AFTER tour bus (default: no removal) Advanced options: -ins_length* <integer> : expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing) -ins_length_long <integer> : expected distance between two long paired-end reads (default: no read pairing) -ins_length*_sd <integer> : est. standard deviation of respective dataset (default: 10% of corresponding length) [replace '*' by nothing, '2' or '_long' as necessary] -scaffolding <yes|no> : scaffolding of contigs used paired end information (default: on) -max_branch_length <integer> : maximum length in base pair of bubble (default: 100) -max_divergence <floating-point>: maximum divergence rate between two branches in a bubble (default: 0.2) -max_gap_count <integer> : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3) -min_pair_count <integer> : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5) -max_coverage <floating point> : removal of high coverage nodes AFTER tour bus (default: no removal) -coverage_mask <int> : minimum coverage required for confident regions of contigs (default: 1) -long_mult_cutoff <int> : minimum number of long reads required to merge contigs (default: 2) -unused_reads <yes|no> : export unused reads in UnusedReads.fa file (default: no) -alignments <yes|no> : export a summary of contig alignment to the reference sequences (default: no) -exportFiltered <yes|no> : export the long nodes which were eliminated by the coverage filters (default: no) -clean <yes|no> : remove all the intermediary files which are useless for recalculation (default : no) -very_clean <yes|no> : remove all the intermediary files (no recalculation possible) (default: no) -paired_exp_fraction <double> : remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1) -shortMatePaired* <yes|no> : for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no) -conserveLong <yes|no> : preserve sequences with long reads in them (default no) Output: directory/contigs.fa : fasta file of contigs longer than twice hash length directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff directory/LastGraph : special formatted file with all the information on the final graph directory/velvet_asm.afg : (if requested) AMOS compatible assembly file =head1 META-VELVETG OPTIONS Graph-splitting options (metagenome-specific): -discard_chimera <yes|no> : discard chimera sub-graph (default: no) -max_chimera_rate <double> : maximum allowable chimera rate (default: 0.0) -repeat_cov_sd : standard deviation of repeat node coverages (default: 0.1) -min_split_length <int> : minimum node length required for repeat resolution (default: 0) -valid_connections <int> : minimum allowable number of consistent paired-end connections (default: 1) -noise_connections <int> : maximum allowable number of inconsistent paired-end connections (default: 0) -use_connections <yes|no> : use paired-end connections for graph splitting (default: yes) -report_split_detail <yes|no> : report sequences around repeat nodes (default: no) -report_subgraph <yes|no> : report node sequences for each subgraph (default: no) Peak detection options (metagenome-specific): -exp_covs <string|auto> : expected coverages for each species in microbiome (default: auto) ex : -exp_covs 214_122_70_43_25_13.5 coverage values should be sorted in a descending order -min_peak_cov <double> : minimum peak coverage (default: 0) -max_peak_cov <double> : maximum peak coverage (default: 500) -histo_bin_width <double> : bin width of peak coverage histogram (default: 1) -histo_sn_ratio <double> : signal-noise ratio to remove peak noises (default: 10) Output: directory/meta-velvetg.contigs.fa : fasta file of contigs longer than twice hash length directory/meta-velvetg.LastGraph : special formatted file with all the information on the final graph directory/meta-velvetg.Graph2-stats.txt : stats file (tab-delimited) useful for optimizing coverage peak values directory/meta-velvetg.split-stats.txt : stats file (tab-delimited) useful for optimizing graph-splitting parameters =cut