diff lib/CPT/Bio.pm @ 1:97ef96676b48 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:26 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/CPT/Bio.pm	Mon Jun 05 02:51:26 2023 +0000
@@ -0,0 +1,337 @@
+package CPT::Bio;
+use Moose;
+use strict;
+use warnings;
+use autodie;
+use CPT::FiletypeDetector;
+use CPT::BioData;
+my $bd = CPT::BioData->new();
+
+my $filetype = CPT::FiletypeDetector->new();
+
+has 'var_translate' => ( is => 'rw', isa => 'Bool');
+has 'var_header' => ( is => 'rw', isa => 'Bool');
+has codonTable => (
+	is	    => 'rw',
+	isa 	=> 'Any',
+	default => sub {
+		$bd->getTranslationTable(11)
+	},
+);
+
+sub set_codon_table {
+	my ($self, $num) = @_;
+	$self->codonTable($bd->getTranslationTable($num));
+}
+
+
+sub _getFeatureTag {
+	my ( $self, $feat, $tag ) = @_;
+	if(! defined($feat)){
+		warn "Undefined feature";
+	}
+	return $feat->has_tag($tag)
+	  ? ( join( ',', $feat->get_tag_values($tag) ) )
+	  : '';
+}
+
+sub _getIdentifier {
+	my ( $self, $feat ) = @_;
+	my $line;
+	if ( ref $feat eq 'Bio::Seq::RichSeq' || ref $feat eq 'Bio::Seq' ) {
+		return $feat->display_id;
+	}
+	else {
+		my $locus_tag = $self->_getFeatureTag( $feat, 'locus_tag' );
+		if ($locus_tag) {
+			return $locus_tag;
+		}
+		my $gene = $self->_getFeatureTag( $feat, 'gene' );
+		if ($gene) {
+			return $gene;
+		}
+		my $product = $self->_getFeatureTag( $feat, 'product' );
+		if ($product) {
+			return $product;
+		}
+	}
+	return sprintf("%s_%s_%s", $feat->start(), $feat->end(), ($feat->strand() == 1 ? 'sense':'antisense'));
+}
+
+
+sub requestCopy {
+	my ( $self, %data ) = @_;
+	use Bio::SeqIO;
+	if ($data{'file'} ) {
+		my ($guessed_type) = $filetype->detect( $data{'file'} );
+		my $seqio = Bio::SeqIO->new(
+			-file   => $data{'file'},
+			-format => $guessed_type
+		);
+		my @results;
+		while ( my $seqobj = $seqio->next_seq() ) {
+			return \$seqobj;
+		}
+	}
+	else {
+		die "No file specified";
+	}
+}
+
+
+sub getSeqIO {
+	my ( $self, $file ) = @_;
+	use Bio::SeqIO;
+	if ($file ) {
+		my ($guessed_type) = $filetype->detect( $file );
+		my $seqio = Bio::SeqIO->new(
+			-file   => $file,
+			-format => $guessed_type
+		);
+		return $seqio;
+	}
+	else {
+		die "No file specified";
+	}
+}
+
+
+sub parseFile {
+	my ( $self, %data ) = @_;
+	use Bio::SeqIO;
+
+	my ($guessed_type) = $filetype->detect( $data{'file'} );
+	my $seqio = Bio::SeqIO->new(
+		-file   => $data{'file'},
+		-format => $guessed_type
+	);
+
+	# Are we to translate this
+	$self->var_translate(defined($data{translate}) && $data{translate});
+	$self->var_header(defined($data{header}) && $data{header});
+
+	my @results;
+	if ( not defined $data{'subset'} ) {
+		$data{'subset'} = 'all';
+	}
+	while ( my $seqobj = $seqio->next_seq() ) {
+		if (
+			(ref $data{'subset'} ne 'ARRAY'
+			&& $data{'subset'} eq 'whole' ) # Want the whole thing for a richseq
+			||
+			(ref $seqobj eq 'Bio::Seq' || ref $seqobj eq 'Bio::Seq::fasta')
+			# or it's a fasta type sequence
+		)
+		{
+			push( @results, $self->handle_seq($seqobj));
+		}
+		else                    #data subset eq sometag
+		{
+			my %wanted_tags;
+			if ( ref $data{'subset'} eq 'ARRAY' ) {
+				%wanted_tags =
+				  map { $_ => 1 } @{ $data{'subset'} };
+			}
+			else {
+				$wanted_tags{ $data{'subset'} }++;
+			}
+			foreach my $feat ( $seqobj->get_SeqFeatures ) {
+				if (
+					$wanted_tags{ $feat->primary_tag }
+					|| (       $wanted_tags{'all'}
+						&& $feat->primary_tag ne
+						"source" )
+				  )
+				{
+					push( @results, $self->handle_seq($feat));
+				}
+			}
+		}
+	}
+	if ( $data{'callback'} ) {
+		$data{'callback'}->( \@results );
+	}
+	else {
+		return \@results;
+	}
+}
+
+sub handle_seq {
+	my ($self, $obj) = @_;
+
+	my @line;
+	if ( $self->var_header() ){
+		$line[0] = '>' . $self->_getIdentifier($obj);
+	}
+	
+	# Get our sequence
+	$line[1] = $self->intelligent_get_seq($obj);
+
+	if ( $self->var_translate() ) {
+		$line[1] = $self->translate($line[1]);
+	}
+	return \@line;
+}
+
+sub intelligent_get_seq {
+	my ($self, $obj, %extra) = @_;
+	# Top level, e.g., fasta/gbk file, "extra" doesn't apply to these
+	if ( ref $obj eq 'Bio::Seq::RichSeq' || ref $obj eq 'Bio::Seq' ) {
+		return $obj->seq;
+	}else{
+		return $self->get_seq_from_feature($obj, %extra);
+	}
+}
+sub get_seq_from_feature {
+	my ($self, $feat, %extra) = @_;
+	my $seq;
+	my $l;
+	if($extra{parent}){
+		$l = $extra{parent}->length();
+	}
+
+	if($extra{upstream}){
+		if($feat->strand < 0){
+			my $y = $feat->end + 1;
+			my $z = $feat->end + $extra{upstream};
+			if($y < $l){
+				if($z > $l){
+					$z = $l;
+				}
+				$seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
+			}
+		}else{
+			my $y = $feat->start - $extra{upstream};
+			my $z = $feat->start - 1;
+			if($z > 0){
+				if($y < 1){
+					$y = 1;
+				}
+				$seq .= $extra{parent}->trunc($y, $z)->seq;
+			}
+		}
+	}
+	if(ref($feat->location) eq 'Bio::Location::Simple'){
+		$seq .= $feat->seq->seq();
+	}else{
+		$seq .= $feat->spliced_seq->seq();
+	}
+	if($extra{downstream}){
+		if($feat->strand < 0){
+			my $y = $feat->start - $extra{downstream};
+			my $z = $feat->start - 1;
+			if($z > 0){
+				if($y < 1){
+					$y = 1;
+				}
+				$seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
+			}
+		}else{
+			my $y = $feat->end + 1;
+			my $z = $feat->end + $extra{downstream};
+			if($y < $l){
+				if($z > $l){
+					$z = $l;
+				}
+				$seq .= $extra{parent}->trunc($y, $z)->seq;
+			}
+		}
+	}
+
+	return $seq;
+}
+
+
+sub translate {
+	my ($self, $seq) = @_;
+	if($seq =~ /^[ACTGN]+$/){
+		my %ct = %{$self->codonTable};
+		$seq  = join( '' , map { if($ct{$_}){ $ct{$_} }else{ () } } unpack("(A3)*", $seq));
+	}
+	return $seq;
+}
+
+no Moose;
+1;
+
+__END__
+
+=pod
+
+=encoding UTF-8
+
+=head1 NAME
+
+CPT::Bio
+
+=head1 VERSION
+
+version 1.99.4
+
+=head2 _getFeatureTag
+
+	my $tag = $libCPT->_getFeatureTag($feature,'note');
+
+returns all values of the given tag, joined with ','.
+
+=head2 requestCopy
+
+	my $seqobj = $libCPT->requestCopy('file'=>'test.gbk');
+
+requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
+
+=head2 getSeqIO
+
+	my $seqio = $libCPT->getSeqIO('file'=>'test.gbk');
+
+requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
+
+=head2 parseFile
+
+	$libCPT->parseFile(
+		'file'      => $options{'file'},
+		'callback'  => \&func,
+		'translate' => 1,
+		'header'    => 1,
+		'subset'    => ['CDS', $options{'tag'}],
+	);
+
+Arguably the most important function in this library, wraps a lot of functionality in a clean wrapper, since most of the scripts we have are written around data munging.
+
+=over 4
+
+=item *
+
+file - the Bio::SeqIO file to process
+
+=item *
+
+callback - the function to send our data to. Done all at once, in an array
+
+=item *
+
+translate - should we translate the sequence to amino acids if it's not already.
+
+=item *
+
+subset - either "whole", a valid tag, or an array of valid tags
+
+=item *
+
+header - Do we want a header (FASTA) with our result set
+
+=back
+
+=head1 AUTHOR
+
+Eric Rasche <rasche.eric@yandex.ru>
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is Copyright (c) 2014 by Eric Rasche.
+
+This is free software, licensed under:
+
+  The GNU General Public License, Version 3, June 2007
+
+=cut