view VelvetOptimiser-2.1.7_modified/VelvetOpt/Assembly.pm @ 0:50ae1360fbbe default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author konradpaszkiewicz
date Tue, 07 Jun 2011 18:07:56 -0400
parents
children
line wrap: on
line source

#       VelvetOpt::Assembly.pm
#
#       Copyright 2008,2009 Simon Gladman <simon.gladman@csiro.au>
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.

#	Version 2.1.2
#
#	Changes for 2.0.1
#	*Bug fix in CalcAssemblyScore.  Now returns 0 if there is no calculable score instead of crashing.
#
#	Changes for 2.1.0
#	*Added 2 stage optimisation functions for optimising kmer size and cov_cutoff differently if required.
#
#	Changes for 2.1.1
#	*Allowed for non-word characters in prefix names.  (. - etc.)  Still no spaces allowed in prefix name or any filenames.
#
#	Changes for 2.1.2
#	*Now warns nicely of optimisation function returning undef or 0. Suggests you choose and alternative.
#
#	Changes for 2.1.3
#	*Now prints the velvet calculated insert sizes and standard deviations in the Assembly summaries (both log files and screen).

package VelvetOpt::Assembly;

=head1 NAME

VelvetOpt::Assembly.pm - Velvet assembly container class.

=head1 AUTHOR

Simon Gladman, CSIRO, 2007, 2008.

=head1 LICENSE

Copyright 2008, 2009 Simon Gladman <simon.gladman@csiro.au>

       This program is free software; you can redistribute it and/or modify
       it under the terms of the GNU General Public License as published by
       the Free Software Foundation; either version 2 of the License, or
       (at your option) any later version.

       This program is distributed in the hope that it will be useful,
       but WITHOUT ANY WARRANTY; without even the implied warranty of
       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       GNU General Public License for more details.

       You should have received a copy of the GNU General Public License
       along with this program; if not, write to the Free Software
       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
       MA 02110-1301, USA.

=head1 SYNOPSIS

    use VelvetOpt::Assembly;
    my $object = VelvetOpt::Assembly->new(
        timestamph => "23 November 2008 15:00:00",
        ass_id => "1",
        versionh => "0.7.04",
        ass_dir => "/home/gla048/Desktop/newVelvetOptimiser/data_1"
    );
    print $object->toString();

=head1 DESCRIPTION

A container class to hold the results of a Velvet assembly.  Includes timestamps,
version information, parameter strings and assembly output metrics.

Version 1.1

=head2 Uses

=over 8

=item strict

=item warnings

=item Carp

=back

=head2 Fields

=over 8

=item assmscore

The assembly score metric for this object

=item timstamph

The timestamp of the start of the velveth run for this assembly

=item timestampg

The date and time of the end of the velvetg run.

=item ass_id

The assembly id number.  Sequential for all the runs for this optimisation.

=item versionh

The version number of velveth used in this assembly

=item versiong

The version number of velvetg used in this assembly

=item readfilename

The name of the file containing all the reads (or a qw of them if more than one...)

=item pstringh

The velveth parameter string used in this assembly

=item pstringg

The velvetg parameter string used in this assembly

=item ass_dir

The assembly directory path (full)

=item hashval

The hash value used for this assembly

=item rmapfs

The roadmap file size

=item sequences

The total number of sequences in the input files

=item nconts

The number of contigs in the final assembly

=item totalbp

The total number of bases in the contigs

=item n50

The n50 of the assembly

=item maxlength

The length of the longest contig in the assembly

=item maxcont

The size of the largest contig in the assembly

=item nconts1k

The number of contigs greater than 1k in size

=item totalbp1k

the sum of the length of contigs > 1k in size

=item velvethout

The velveth output

=item velvetgout

The velvetg output

=back

=head2 Methods

=over 8

=item new

Returns a new VelvetAssembly object.

=item accessor methods

Accessor methods for all fields.

=item calcAssemblyScore

Calculates the assembly score of the object (after velvetg has been run.) and stores it in self.

=item getHashingDetails

Gets the details of the outputs from the velveth run and stores it in self.

=item getAssemblyDetails

Gets the details of the outputs from the velvetg run and stores it in self.

=item toString

Returns a string representation of the object's contents.

=item toStringNoV

Returns a string representation of the object's contents without the velvet outputs which are large.

=item opt_func_toString

Returns the usage of the optimisation function.

=back

=cut

use strict;
use lib "/usr/local/lib/perl5/site_perl/5.8.8";
use Carp;
use warnings;
#use base "Storable";
use Cwd;
use Bio::SeqIO;

my $interested = 0;



#constructor
sub new {
    my $class = shift;
    my $self = {@_};
    bless ($self, $class);
    return $self;
}

#optimisation function options...
my %f_opts;
	$f_opts{'ncon'}->{'intname'} = 'nconts';
	$f_opts{'ncon'}->{'desc'} = "The total number of contigs";
	$f_opts{'n50'}->{'intname'} = 'n50';
	$f_opts{'n50'}->{'desc'} = "The n50";
	$f_opts{'max'}->{'intname'} = 'maxlength';
	$f_opts{'max'}->{'desc'} = "The length of the longest contig";
	$f_opts{'Lcon'}->{'intname'} = 'nconts1k';
	$f_opts{'Lcon'}->{'desc'} = "The number of large contigs";
	$f_opts{'tbp'}->{'intname'} = 'totalbp';
	$f_opts{'tbp'}->{'desc'} = "The total number of basepairs in contigs";
	$f_opts{'Lbp'}->{'intname'} = 'totalbp1k';
	$f_opts{'Lbp'}->{'desc'} = "The total number of base pairs in large contigs";

#accessor methods
sub assmscore{ $_[0]->{assmscore}=$_[1] if defined $_[1]; $_[0]->{assmscore}}
sub timestamph{ $_[0]->{timestamph}=$_[1] if defined $_[1]; $_[0]->{timestamph}}
sub timestampg{ $_[0]->{timestampg}=$_[1] if defined $_[1]; $_[0]->{timestampg}}
sub ass_id{ $_[0]->{ass_id}=$_[1] if defined $_[1]; $_[0]->{ass_id}}
sub versionh{ $_[0]->{versionh}=$_[1] if defined $_[1]; $_[0]->{versionh}}
sub versiong{ $_[0]->{versiong}=$_[1] if defined $_[1]; $_[0]->{versiong}}
sub readfilename{ $_[0]->{readfilename}=$_[1] if defined $_[1]; $_[0]->{readfilename}}
sub pstringh{ $_[0]->{pstringh}=$_[1] if defined $_[1]; $_[0]->{pstringh}}
sub pstringg{ $_[0]->{pstringg}=$_[1] if defined $_[1]; $_[0]->{pstringg}}
sub ass_dir{ $_[0]->{ass_dir}=$_[1] if defined $_[1]; $_[0]->{ass_dir}}
sub hashval{ $_[0]->{hashval}=$_[1] if defined $_[1]; $_[0]->{hashval}}
sub rmapfs{ $_[0]->{rmapfs}=$_[1] if defined $_[1]; $_[0]->{rmapfs}}
sub nconts{ $_[0]->{nconts}=$_[1] if defined $_[1]; $_[0]->{nconts}}
sub n50{ $_[0]->{n50}=$_[1] if defined $_[1]; $_[0]->{n50}}
sub maxlength{ $_[0]->{maxlength}=$_[1] if defined $_[1]; $_[0]->{maxlength}}
sub nconts1k{ $_[0]->{nconts1k}=$_[1] if defined $_[1]; $_[0]->{nconts1k}}
sub totalbp{ $_[0]->{totalbp}=$_[1] if defined $_[1]; $_[0]->{totalbp}}
sub totalbp1k{ $_[0]->{totalbp1k}=$_[1] if defined $_[1]; $_[0]->{totalbp1k}}
sub velvethout{ $_[0]->{velvethout}=$_[1] if defined $_[1]; $_[0]->{velvethout}}
sub velvetgout{ $_[0]->{velvetgout}=$_[1] if defined $_[1]; $_[0]->{velvetgout}}
sub sequences{ $_[0]->{sequences}=$_[1] if defined $_[1]; $_[0]->{sequences}}
sub assmfunc{ $_[0]->{assmfunc}=$_[1] if defined $_[1]; $_[0]->{assmfunc}}
sub assmfunc2{ $_[0]->{assmfunc2}=$_[1] if defined $_[1]; $_[0]->{assmfunc2}}

#assemblyScoreCalculator
sub calcAssemblyScore {
    use Safe;
	
	my $self = shift;
	my $func = shift;
	
	my $cpt = new Safe;
	
	#Basic variable IO and traversal
	$cpt->permit(qw(null scalar const padany lineseq leaveeval rv2sv rv2hv helem hslice each values keys exists delete rv2cv));
	#Comparators
	$cpt->permit(qw(lt i_lt gt i_gt le i_le ge i_ge eq i_eq ne i_ne ncmp i_ncmp slt sgt sle sge seq sne scmp));
	#Base math
	$cpt->permit(qw(preinc i_preinc predec i_predec postinc i_postinc postdec i_postdec int hex oct abs pow multiply i_multiply divide i_divide modulo i_modulo add i_add subtract i_subtract));
	#Binary math
	$cpt->permit(qw(left_shift right_shift bit_and bit_xor bit_or negate i_negate not complement));
	#Regex
	$cpt->permit(qw(match split qr));
	#Conditionals
	$cpt->permit(qw(cond_expr flip flop andassign orassign and or xor));
	#Advanced math
	$cpt->permit(qw(atan2 sin cos exp log sqrt rand srand));

	foreach my $key (keys %f_opts){
		print "\nkey: $key\tintname: ", $f_opts{$key}->{'intname'}, "\n" if $interested;
		
		$func =~ s/\b$key\b/$self->{$f_opts{$key}->{'intname'}}/g;
	}
		
	my $r = $cpt->reval($func);
	warn $@ if $@;
	$self->{assmscore} = $r;
	unless($r =~ /^\d+/){ 
		warn "Optimisation function did not return a single float.\nOptimisation function was not evaluatable.\nOptfunc: $func";
		warn "Setting assembly score to 0\n"; 
		$self->{assmscore} = 0;
	}
	if($r == 0){
		print STDERR "**********\n";
		print STDERR "Warning: Assembly score for assembly_id " . $self->{ass_id} .  " is 0\n";
		print STDERR "You may want to consider choosing a different optimisation variable or function.\n";
		print STDERR "Current optimisation functions are ", $self->{assmfunc}, " for k value and ", $self->{assmfunc2}, " for cov_cutoff\n";
		print STDERR "**********\n";
	}
	return 1;
}

#getHashingDetails
sub getHashingDetails {
    my $self = shift;
    unless(!$self->timestamph || !$self->pstringh){
        my $programPath = cwd;
        $self->pstringh =~ /^(\S+)\s+(\d+)\s+(.*)$/;
        $self->{ass_dir} = $programPath . "/" . $1;
        $self->{rmapfs} = -s $self->ass_dir . "/Roadmaps";
        $self->{hashval} = $2;
        $self->{readfilename} = $3;
        my @t = split /\n/, $self->velvethout;
        foreach(@t){
            if(/^(\d+).*total\.$/){
                $self->{sequences} = $1;
                last;
            }
        }
        return 1;
    }
    return 0;
}

#getAssemblyDetails
sub getAssemblyDetails {
    my $self = shift;
	my $file = $self->ass_dir . "/contigs.fa";
    unless(!(-e $file)){
		
		my $all = &contigStats($file,1);
		my $large = &contigStats($file,1000);
		
		$self->{nconts} = defined $all->{numSeqs} ? $all->{numSeqs} : 0;
		$self->{n50} = defined $all->{n50} ? $all->{n50} : 0;
		$self->{maxlength} = defined $all->{maxLen} ? $all->{maxLen} : 0;
		$self->{nconts1k} = defined $large->{numSeqs} ? $large->{numSeqs} : 0;
		$self->{totalbp} = defined $all->{numBases} ? $all->{numBases} : 0;
		$self->{totalbp1k} = defined $large->{numBases} ? $large->{numBases} : 0;
		
		if($self->pstringg =~ m/cov_cutoff/){
			$self->calcAssemblyScore($self->{assmfunc2});
		}
		else {
			$self->calcAssemblyScore($self->{assmfunc});
		}

        return 1;
	}
    return 0;
}

#contigStats
#Original script fa-show.pl by Torsten Seemann (Monash University, Melbourne, Australia)
#Modified by Simon Gladman to suit.
sub contigStats {
	
	my $file = shift;
	my $minsize = shift;
	
	print "In contigStats with $file, $minsize\n" if $interested;
	
	my $numseq=0;
	my $avglen=0;
	my $minlen=1E9;
	my $maxlen=0;
	my @len;
	my $toosmall=0;
	my $nn=0;
	
	my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta');
	while(my $seq = $in->next_seq()){
		my $L = $seq->length;
		#check > minsize
		if($L < $minsize){
			$toosmall ++;
			next;
		}
		#count Ns
		my $s = $seq->seq;
		my $n = $s =~ s/N/N/gi;
		$n ||= 0;
		$nn += $n;
		#count seqs and other stats
		$numseq ++;
		$avglen += $L;
		$maxlen = $L if $L > $maxlen;
		$minlen = $L if $L < $minlen;
		push @len, $L;
	}
	@len = sort { $a <=> $b } @len;
	my $cum = 0;
	my $n50 = 0;
	for my $i (0 .. $#len){
		$cum += $len[$i];
		if($cum >= $avglen/2) {
			$n50 = $len[$i];
			last;
		}
	}
	
	my %out;
	if($numseq > 0){
		$out{numSeqs} = $numseq;
		$out{numBases} = $avglen;
		$out{numOK} = ($avglen - $nn);
		$out{numNs} = $nn;
		$out{minLen} = $minlen;
		$out{avgLen} = $avglen/$numseq;
		$out{maxLen} = $maxlen;
		$out{n50} = $n50;
		$out{minsize} = $minsize;
		$out{numTooSmall} = $toosmall;
	}
	else {
		$out{$numseq} = 0;
	}
	
	print "Leaving contigstats!\n" if $interested;
	return (\%out);
}


#toString method
sub toString {
    my $self = shift;
    my $tmp = $self->toStringNoV();
    if(defined $self->velvethout){
        $tmp .= "Velveth Output:\n" . $self->velvethout() . "\n";
    }
    if(defined $self->velvetgout){
        $tmp .= "Velvetg Output:\n" . $self->velvetgout() . "\n";
    }
    $tmp .= "**********************************************************\n";
    return $tmp;
}


#toStringNoV method
sub toStringNoV {
    my $self = shift;
    my $tmp = "********************************************************\n";
    if($self->ass_id()){
        $tmp .= "Assembly id: " . $self->ass_id(). "\n";
    }
    if($self->assmscore()){
        $tmp .= "Assembly score: " .$self->assmscore(). "\n";
    }
    if($self->timestamph()){
        $tmp .= "Velveth timestamp: " . $self->timestamph(). "\n";
    }
    if($self->timestampg()){
        $tmp .= "Velvetg timestamp: " . $self->timestampg(). "\n";
    }
    if(defined $self->versionh){
        $tmp .= "Velveth version: " . $self->versionh(). "\n";
    }
    if(defined $self->versiong){
        $tmp .= "Velvetg version: " . $self->versiong(). "\n";
    }
    if(defined $self->readfilename){
        $tmp .= "Readfile(s): " . $self->readfilename(). "\n";
    }
    if(defined $self->pstringh){
        $tmp .= "Velveth parameter string: " . $self->pstringh(). "\n";
    }
    if(defined $self->pstringg){
        $tmp .= "Velvetg parameter string: " . $self->pstringg(). "\n";
    }
    if(defined $self->ass_dir){
        $tmp .= "Assembly directory: " . $self->ass_dir(). "\n";
    }
    if(defined $self->hashval){
        $tmp .= "Velvet hash value: " . $self->hashval(). "\n";
    }
    if(defined $self->rmapfs){
        $tmp .= "Roadmap file size: " . $self->rmapfs(). "\n";
    }
    if(defined $self->sequences){
        $tmp .= "Total number of sequences: " . $self->sequences(). "\n";
    }
    if(defined $self->nconts){
        $tmp .= "Total number of contigs: " . $self->nconts(). "\n";
    }
    if(defined $self->n50){
        $tmp .= "n50: " . $self->n50(). "\n";
    }
    if(defined $self->maxlength){
        $tmp .= "length of longest contig: " . $self->maxlength(). "\n";
    }
    if(defined $self->totalbp){
        $tmp .= "Total bases in contigs: " . $self->totalbp(). "\n";
    }
    if(defined $self->nconts1k){
        $tmp .= "Number of contigs > 1k: " . $self->nconts1k(). "\n";
    }
    if(defined $self->totalbp1k){
        $tmp .= "Total bases in contigs > 1k: " . $self->totalbp1k(). "\n";
    }
    if($self->pstringh =~ /Pair/ && defined $self->pstringg && $self->pstringg =~ /-exp_cov/){
		$tmp .= "Paired Library insert stats:\n";
		my @x = split /\n/, $self->velvetgout;
		foreach(@x){
			chomp;
			if(/^Paired-end library \d+ has/){
				$tmp .= "$_\n";
			}
		}
	}
    $tmp .= "**********************************************************\n";
    return $tmp;
}

sub opt_func_toString {
	my $out = "\nVelvet optimiser assembly optimisation function can be built from the following variables.\n";
	foreach my $key (sort keys %f_opts){
		$out .= "\t$key = " . $f_opts{$key}->{'desc'} . "\n";
	}
	$out .= "Examples are:\n\t'Lbp' = Just the total basepairs in contigs longer than 1kb\n";
	$out .= "\t'n50*Lcon' = The n50 times the number of long contigs.\n";
	$out .= "\t'n50*Lcon/tbp+log(Lbp)' = The n50 times the number of long contigs divided\n\t\tby the total bases in all contigs plus the log of the number of bases\n\t\tin long contigs.\n";
	return $out
}

1;