changeset 3:c979f8682b21 draft

Uploaded
author stheil
date Thu, 24 Sep 2015 10:42:55 -0400
parents 1bb80c25b379
children 34ea8f113018
files velvet.pl
diffstat 1 files changed, 432 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/velvet.pl	Thu Sep 24 10:42:55 2015 -0400
@@ -0,0 +1,432 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Logger::Logger;
+use Getopt::Long;
+use Tools::Fasta;
+use Pod::Usage;
+
+my $directory;
+my $hashLength;
+my $fileString;
+my $performMetagenomicAssembly = 1;
+my $man;
+my $help;
+
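+# One option hash per downstream command: velveth-only flags, velvetg-only flags,
+# flags shared by velvetg and meta-velvetg, and meta-velvetg-only flags.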
+my $velvethOptions = {};
+my $velvetgOptions = {};
+my $velvetgmOptions = {};
+my $metaVelvetgOptions = {};
+my $lastOptFile ='';
+
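+# Input file arguments are accumulated into a single string that is passed to velveth
+# verbatim; every other option is dispatched into one of the per-tool hashes above.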
+GetOptions(
+
+    'd|directory=s' => \$directory,
+    'hash_length=s' => \$hashLength,
+    'm|meta!' => \$performMetagenomicAssembly,
+    'man' => \$man,
+    'h|help' => \$help,
+    'short:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'short2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'short3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'shortPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'shortPaired2:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'shortPaired3:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'long:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'longPaired:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'reference:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'fasta:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'fastq:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'raw:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'fasta_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'fastq_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'raw_gz:s{,}' => sub {$_[0] =~ s/_/./; registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'sam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'bam:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'fmtAuto:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'interleaved:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'separate:s{,}' => sub {registerVelvetFileOptionHash(\$fileString, \$lastOptFile, @_)},
+    'strand_specific:s' => sub{registerOnOffOption($velvethOptions, @_)},
+    'reuse_Sequences:s' => sub{registerOnOffOption($velvethOptions, @_)},
+    'reuse_binary:s' => sub{registerOnOffOption($velvethOptions, @_)},
+    'noHash:s' => sub{registerOnOffOption($velvethOptions, @_)},
+    'create_binary:s' => sub{registerOnOffOption($velvethOptions, @_)},
+
+    'cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'read_trkg=s' => sub{registerYesNoOption($velvetgOptions, @_)},
+    'min_contig_lgth=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'amos_file=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'exp_cov=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'long_cov_cutoff=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length_long=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length2=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length_long_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'ins_length2_sd=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'scaffolding=s' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'max_branch_length=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'max_divergence=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'max_gap_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'min_pair_count=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'max_coverage=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'coverage_mask=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'long_mult_cutoff=i' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'unused_reads=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
+    'alignments=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
+    'exportFiltered=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
+    'clean=s' => sub{registerYesNoOption($velvetgOptions, @_)},
+    'very_clean=s' => sub{registerYesNoOption($velvetgOptions, @_)},
+    'paired_exp_fraction=f' => sub{registerScalarOptionHash($velvetgmOptions, @_)},
+    'shortMatePaired=s' => sub{registerYesNoOption($velvetgmOptions, @_)},
+    'conserveLong=s' => sub{registerYesNoOption($velvetgOptions, @_)},
+
+    'discard_chimera=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
+    'max_chimera_rate=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'repeat_cov_sd=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'min_split_length=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'valid_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'noise_connections=i' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'use_connections=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
+    'report_split_detail=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
+    'report_subgraph=s' => sub{registerYesNoOption($metaVelvetgOptions, @_)},
+    'exp_covs=s' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'min_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'max_peak_cov=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'histo_bin_width=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+    'histo_sn_ratio=f' => sub{registerScalarOptionHash($metaVelvetgOptions, @_)},
+
+) or pod2usage( "Try '$0 --help' for more information." );
+
+pod2usage( -verbose => 2 ) if $man;
+pod2usage( -verbose => 1 ) if ($help || ! defined $fileString || ! defined $hashLength);
+
+Logger::Logger->changeMode(0);
+
+if(!defined $directory || $directory eq ''){
+  $directory = '.';
+}
+
+$directory .= '/';
+
+#Running velveth
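+# Command shape: velveth <directory> <hash_length> <input file flags/files> <velveth-only options>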
+my $velvethCommand = 'velveth ' . $directory . ' ' . $hashLength . ' ' . $fileString . ' ' . convertOptionHashToCommandLine($velvethOptions);
+$logger->info('Running velveth...');
+$logger->info($velvethCommand);
+`$velvethCommand`;
+
+#Running velvetg
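+# For a metagenomic assembly, velvetg is run with -exp_cov auto (see the -exp_cov note in the POD).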
+if($performMetagenomicAssembly){
+  $velvetgOptions->{'exp_cov'} = 'auto';
+}
+my $velvetgCommand = 'velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($velvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions);
+$logger->info('Running velvetg...');
+$logger->info($velvetgCommand);
+`$velvetgCommand`;
+
+#Running meta-velvetg
+if ($performMetagenomicAssembly){
+  my $metaVelvetCommand = 'meta-velvetg ' . $directory . ' ' . convertOptionHashToCommandLine($metaVelvetgOptions) . ' ' . convertOptionHashToCommandLine($velvetgmOptions);
+  $logger->info('Running meta-velvetg...');
+  $logger->info($metaVelvetCommand);
+  `$metaVelvetCommand`;
+}
+
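+# When -unused_reads yes was requested, collect the unused reads into a Singlets.fasta file.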
+if(exists $velvetgmOptions->{'unused_reads'} && $velvetgmOptions->{'unused_reads'} eq 'yes'){
+  createSingleFile($directory.'UnusedReads.fa', $directory.'Sequences', $directory.'Singlets.fasta');
+}
+
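+# Turn an option hash into a command-line fragment, e.g.
+# { cov_cutoff => 'auto', min_contig_lgth => 200 } becomes
+# "-cov_cutoff auto -min_contig_lgth 200 " (hash key order is not guaranteed).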
+sub convertOptionHashToCommandLine{
+  my ($optionHash) = @_;
+  my $commandLineString = '';
+
+  foreach my $opt (keys %$optionHash){
+    if(defined $optionHash->{$opt}){
+      if(ref $optionHash->{$opt}){
+        $commandLineString .= '-' . $opt . ' ' . join(' -'.$opt.' ',@{$optionHash->{$opt}}) . ' ';
+      }
+      else{
+        $commandLineString .= '-' . $opt . ' ' . $optionHash->{$opt} . ' ';
+      }
+    }
+  }
+
+  return $commandLineString;
+}
+
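+# Store an option that only accepts 'yes' or 'no'; abort otherwise.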
+sub registerYesNoOption{
+  my ($hash, $optionName, $optionValue) = @_;
+
+  if(defined $optionValue && ($optionValue eq 'no' || $optionValue eq 'yes')){
+    registerScalarOptionHash(@_);
+  }
+  else{
+    $logger->logdie("Option '$optionName' must be yes or no\n");
+  }
+}
+
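+# Store a flag-style (on/off) option; abort if a value was supplied.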
+sub registerOnOffOption{
+  my ($hash, $optionName, $optionValue) = @_;
+
+  if(! defined $optionValue || ($optionValue eq '')){
+    registerScalarOptionHash(@_);
+  }
+  else{
+    $logger->logdie("No value allowed for option '$optionName'\n");
+  }
+}
+
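+# Store a plain option/value pair in the given option hash.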
+sub registerScalarOptionHash{
+  my ($hash, $optionName, $optionValue) = @_;
+  $hash->{$optionName} = $optionValue;
+}
+
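+# Append an input file (preceded by its read-type/format flag) to the velveth file string;
+# the flag is only repeated when it changes, e.g. --shortPaired r1.fq r2.fq contributes
+# " -shortPaired r1.fq r2.fq".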
+sub registerVelvetFileOptionHash{
+  my ($fileString, $lastOptFile, $optionName, $optionValue) = @_;
+
+  if($$lastOptFile ne $optionName){
+    $$fileString .= ' -' . $optionName;
+  }
+  $$fileString .= ' ' . $optionValue;
+  $$lastOptFile = $optionName;
+}
+
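+# Extract the reads listed in velvet's UnusedReads.fa from the Sequences file
+# (matched on velvet's internal sequence number) and write them to a separate singlets FASTA file.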
+sub createSingleFile{
+  my ($velvetUnusedReads, $sequencesFile, $outputFile) = @_;
+
+  if(! defined $outputFile){
+    $outputFile = '';
+  }
+
+  my $sequenceNumber;
+  my %singlets;
+  my $id;
+  my $number;
+
+  open(UNUSED_FILE, $velvetUnusedReads) or $logger->logdie("Unable to open velvet UnusedReads file $velvetUnusedReads : $!");
+  open(SEQUENCES_FILE, $sequencesFile) or $logger->logdie("Unable to open velvet Sequences file $sequencesFile : $!");
+  open(OUTPUT_FILE, '>'.$outputFile) or $logger->logdie("Unable to create output file $outputFile : $!");
+
+  my $sequenceFileObject = Tools::Fasta->new(file => $velvetUnusedReads);
+
+  while(my $line=<UNUSED_FILE>){
+    if($line =~ /^>SEQUENCE_([^_]+)/){
+      $singlets{$1} = 1;
+    }
+  }
+  close UNUSED_FILE;
+
+  while(my $line=<SEQUENCES_FILE>){
+    if($line =~ /^>/){
+      ($id, $number) = split("\t", $line);
+      $line = $id . "\n";
+    }
+    if(exists $singlets{$number} && $singlets{$number} == 1){
+      print OUTPUT_FILE $line;
+    }
+  }
+  close SEQUENCES_FILE;
+  close OUTPUT_FILE;
+}
+
+=head1 NAME
+
+velvet.pl
+
+=head1 SYNOPSIS
+
+perl velvet.pl -hash_length HASH_LENGTH [-directory OUTPUT_DIRECTORY] [-meta] [OPTIONS VELVETH / VELVETG / METAVELVETG]
+
+=head1 DESCRIPTION
+
+Run a velvet assembly (velveth followed by velvetg) and, unless -nometa is given, a meta-velvetg metagenomic assembly on top of it.
+
+=head1 OPTIONS
+
+=over 4
+
+=item -d|directory [DIRECTORY]
+
+Directory path for output files.
+
+Default is .
+
+=item -hash_length  [INTEGER|m,M,s]
+
+EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced)
+
+OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s
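+
+For example, -hash_length 21,31,2 asks velveth to explore odd k-mer lengths starting at 21 and increasing in steps of 2 up to the bound of 31.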
+
+=item -meta|nometa
+
+Perform a metagenomic assembly using meta-velvetg (use -nometa to disable).
+
+Default is -meta
+
+=back
+
+=head1 VELVETH OPTIONS
+
+File format options:
+
+	-fasta	-fastq	-raw	-fasta_gz	-fastq_gz	-raw_gz	-sam	-bam	-fmtAuto
+
+	(Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression: gunzip, pbunzip2, bunzip2)
+
+File layout options for paired reads (only for fasta and fastq formats):
+
+	-interleaved	: File contains paired reads interleaved in the one file (default)
+
+	-separate	: Read 2 separate files for paired reads
+
+Read type options:
+
+	-short	-shortPaired
+
+	-short2	-shortPaired2
+
+	-short3	-shortPaired3
+
+	-long	-longPaired
+
+	-reference
+
+Options:
+
+	-strand_specific	: for strand specific transcriptome sequencing data (default: off)
+
+	-reuse_Sequences	: reuse Sequences file (or link) already in directory (no need to provide original filenames in this case) (default: off)
+
+	-reuse_binary    	: reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case) (default: off)
+
+	-noHash			: simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off)
+
+	-create_binary  	: create binary CnyUnifiedSeq file (default: off)
+
+Outputs:
+
+	directory/Roadmaps
+
+	directory/Sequences
+
+	[Both files are picked up by graph, so please leave them there]
+
+=head1 VELVETG OPTIONS
+
+  Standard options:
+
+	-cov_cutoff <floating-point|auto>	: removal of low coverage nodes AFTER tour bus or allow the system to infer it. (default: no removal)
+
+	-ins_length <integer>		: expected distance between two paired end reads (default: no read pairing)
+
+	-read_trkg <yes|no>		: tracking of short read positions in assembly (default: no tracking)
+
+	-min_contig_lgth <integer>	: minimum contig length exported to contigs.fa file (default: hash length * 2)
+
+	-amos_file <yes|no>		: export assembly to AMOS file (default: no export)
+
+	-exp_cov <floating point|auto>	: expected coverage of unique regions or allow the system to infer it (default: no long or paired-end read resolution)
+
+	Note: when a metagenomic assembly is requested (-meta), this wrapper forces exp_cov to auto.
+
+	-long_cov_cutoff <floating-point>: removal of nodes with low long-read coverage AFTER tour bus (default: no removal)
+
+Advanced options:
+
+	-ins_length* <integer>		: expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing)
+
+	-ins_length_long <integer>	: expected distance between two long paired-end reads (default: no read pairing)
+
+	-ins_length*_sd <integer>	: est. standard deviation of respective dataset (default: 10% of corresponding length)
+
+	[replace '*' by nothing, '2' or '_long' as necessary]
+
+	-scaffolding <yes|no>		: scaffolding of contigs using paired end information (default: on)
+
+	-max_branch_length <integer>	: maximum length in base pair of bubble (default: 100)
+
+	-max_divergence <floating-point>: maximum divergence rate between two branches in a bubble (default: 0.2)
+
+	-max_gap_count <integer>	: maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)
+
+	-min_pair_count <integer>	: minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5)
+
+	-max_coverage <floating point>	: removal of high coverage nodes AFTER tour bus (default: no removal)
+
+	-coverage_mask <int>            : minimum coverage required for confident regions of contigs (default: 1)
+
+	-long_mult_cutoff <int>		: minimum number of long reads required to merge contigs (default: 2)
+
+	-unused_reads <yes|no>		: export unused reads in UnusedReads.fa file (default: no)
+
+	-alignments <yes|no>		: export a summary of contig alignment to the reference sequences (default: no)
+
+	-exportFiltered <yes|no>	: export the long nodes which were eliminated by the coverage filters (default: no)
+
+	-clean <yes|no>			: remove all the intermediary files which are useless for recalculation (default : no)
+
+	-very_clean <yes|no>		: remove all the intermediary files (no recalculation possible) (default: no)
+
+	-paired_exp_fraction <double>	: remove all the paired end connections with less than the specified fraction of the expected count (default: 0.1)
+
+	-shortMatePaired* <yes|no>	: for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default: no)
+
+	-conserveLong <yes|no>		: preserve sequences with long reads in them (default: no)
+
+Output:
+
+	directory/contigs.fa		: fasta file of contigs longer than twice hash length
+
+	directory/stats.txt		: stats file (tab-spaced) useful for determining appropriate coverage cutoff
+
+	directory/LastGraph		: special formatted file with all the information on the final graph
+
+	directory/velvet_asm.afg	: (if requested) AMOS compatible assembly file
+
+=head1 META-VELVETG OPTIONS
+
+  Graph-splitting options (metagenome-specific):
+
+	-discard_chimera <yes|no>    	: discard chimera sub-graph (default: no)
+
+	-max_chimera_rate <double>   	: maximum allowable chimera rate (default: 0.0)
+
+	-repeat_cov_sd               	: standard deviation of repeat node coverages (default: 0.1)
+
+	-min_split_length <int>      	: minimum node length required for repeat resolution (default: 0)
+
+	-valid_connections <int>     	: minimum allowable number of consistent paired-end connections (default: 1)
+
+	-noise_connections <int>     	: maximum allowable number of inconsistent paired-end connections (default: 0)
+
+	-use_connections <yes|no>    	: use paired-end connections for graph splitting (default: yes)
+
+	-report_split_detail <yes|no>	: report sequences around repeat nodes (default: no)
+
+	-report_subgraph <yes|no>    	: report node sequences for each subgraph (default: no)
+
+  Peak detection options (metagenome-specific):
+
+	-exp_covs <string|auto>      	: expected coverages for each species in microbiome (default: auto)
+
+	ex : -exp_covs 214_122_70_43_25_13.5
+
+	coverage values should be sorted in a descending order
+
+	-min_peak_cov <double>       	: minimum peak coverage (default: 0)
+
+	-max_peak_cov <double>       	: maximum peak coverage (default: 500)
+
+	-histo_bin_width <double>    	: bin width of peak coverage histogram (default: 1)
+
+	-histo_sn_ratio <double>     	: signal-noise ratio to remove peak noises (default: 10)
+
+  Output:
+
+	directory/meta-velvetg.contigs.fa       	: fasta file of contigs longer than twice hash length
+
+	directory/meta-velvetg.LastGraph        	: special formatted file with all the information on the final graph
+
+	directory/meta-velvetg.Graph2-stats.txt 	: stats file (tab-delimited) useful for optimizing coverage peak values
+
+	directory/meta-velvetg.split-stats.txt  	: stats file (tab-delimited) useful for optimizing graph-splitting parameters
+
+=cut