Mercurial > repos > cpt > cpt_psm_recombine
comparison lib/CPT/Bio.pm @ 1:97ef96676b48 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:51:26 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:b18e8268bf4e | 1:97ef96676b48 |
|---|---|
| 1 package CPT::Bio; | |
| 2 use Moose; | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use autodie; | |
| 6 use CPT::FiletypeDetector; | |
| 7 use CPT::BioData; | |
| 8 my $bd = CPT::BioData->new(); | |
| 9 | |
| 10 my $filetype = CPT::FiletypeDetector->new(); | |
| 11 | |
| 12 has 'var_translate' => ( is => 'rw', isa => 'Bool'); | |
| 13 has 'var_header' => ( is => 'rw', isa => 'Bool'); | |
| 14 has codonTable => ( | |
| 15 is => 'rw', | |
| 16 isa => 'Any', | |
| 17 default => sub { | |
| 18 $bd->getTranslationTable(11) | |
| 19 }, | |
| 20 ); | |
| 21 | |
| 22 sub set_codon_table { | |
| 23 my ($self, $num) = @_; | |
| 24 $self->codonTable($bd->getTranslationTable($num)); | |
| 25 } | |
| 26 | |
| 27 | |
| 28 sub _getFeatureTag { | |
| 29 my ( $self, $feat, $tag ) = @_; | |
| 30 if(! defined($feat)){ | |
| 31 warn "Undefined feature"; | |
| 32 } | |
| 33 return $feat->has_tag($tag) | |
| 34 ? ( join( ',', $feat->get_tag_values($tag) ) ) | |
| 35 : ''; | |
| 36 } | |
| 37 | |
| 38 sub _getIdentifier { | |
| 39 my ( $self, $feat ) = @_; | |
| 40 my $line; | |
| 41 if ( ref $feat eq 'Bio::Seq::RichSeq' || ref $feat eq 'Bio::Seq' ) { | |
| 42 return $feat->display_id; | |
| 43 } | |
| 44 else { | |
| 45 my $locus_tag = $self->_getFeatureTag( $feat, 'locus_tag' ); | |
| 46 if ($locus_tag) { | |
| 47 return $locus_tag; | |
| 48 } | |
| 49 my $gene = $self->_getFeatureTag( $feat, 'gene' ); | |
| 50 if ($gene) { | |
| 51 return $gene; | |
| 52 } | |
| 53 my $product = $self->_getFeatureTag( $feat, 'product' ); | |
| 54 if ($product) { | |
| 55 return $product; | |
| 56 } | |
| 57 } | |
| 58 return sprintf("%s_%s_%s", $feat->start(), $feat->end(), ($feat->strand() == 1 ? 'sense':'antisense')); | |
| 59 } | |
| 60 | |
| 61 | |
| 62 sub requestCopy { | |
| 63 my ( $self, %data ) = @_; | |
| 64 use Bio::SeqIO; | |
| 65 if ($data{'file'} ) { | |
| 66 my ($guessed_type) = $filetype->detect( $data{'file'} ); | |
| 67 my $seqio = Bio::SeqIO->new( | |
| 68 -file => $data{'file'}, | |
| 69 -format => $guessed_type | |
| 70 ); | |
| 71 my @results; | |
| 72 while ( my $seqobj = $seqio->next_seq() ) { | |
| 73 return \$seqobj; | |
| 74 } | |
| 75 } | |
| 76 else { | |
| 77 die "No file specified"; | |
| 78 } | |
| 79 } | |
| 80 | |
| 81 | |
| 82 sub getSeqIO { | |
| 83 my ( $self, $file ) = @_; | |
| 84 use Bio::SeqIO; | |
| 85 if ($file ) { | |
| 86 my ($guessed_type) = $filetype->detect( $file ); | |
| 87 my $seqio = Bio::SeqIO->new( | |
| 88 -file => $file, | |
| 89 -format => $guessed_type | |
| 90 ); | |
| 91 return $seqio; | |
| 92 } | |
| 93 else { | |
| 94 die "No file specified"; | |
| 95 } | |
| 96 } | |
| 97 | |
| 98 | |
| 99 sub parseFile { | |
| 100 my ( $self, %data ) = @_; | |
| 101 use Bio::SeqIO; | |
| 102 | |
| 103 my ($guessed_type) = $filetype->detect( $data{'file'} ); | |
| 104 my $seqio = Bio::SeqIO->new( | |
| 105 -file => $data{'file'}, | |
| 106 -format => $guessed_type | |
| 107 ); | |
| 108 | |
| 109 # Are we to translate this | |
| 110 $self->var_translate(defined($data{translate}) && $data{translate}); | |
| 111 $self->var_header(defined($data{header}) && $data{header}); | |
| 112 | |
| 113 my @results; | |
| 114 if ( not defined $data{'subset'} ) { | |
| 115 $data{'subset'} = 'all'; | |
| 116 } | |
| 117 while ( my $seqobj = $seqio->next_seq() ) { | |
| 118 if ( | |
| 119 (ref $data{'subset'} ne 'ARRAY' | |
| 120 && $data{'subset'} eq 'whole' ) # Want the whole thing for a richseq | |
| 121 || | |
| 122 (ref $seqobj eq 'Bio::Seq' || ref $seqobj eq 'Bio::Seq::fasta') | |
| 123 # or it's a fasta type sequence | |
| 124 ) | |
| 125 { | |
| 126 push( @results, $self->handle_seq($seqobj)); | |
| 127 } | |
| 128 else #data subset eq sometag | |
| 129 { | |
| 130 my %wanted_tags; | |
| 131 if ( ref $data{'subset'} eq 'ARRAY' ) { | |
| 132 %wanted_tags = | |
| 133 map { $_ => 1 } @{ $data{'subset'} }; | |
| 134 } | |
| 135 else { | |
| 136 $wanted_tags{ $data{'subset'} }++; | |
| 137 } | |
| 138 foreach my $feat ( $seqobj->get_SeqFeatures ) { | |
| 139 if ( | |
| 140 $wanted_tags{ $feat->primary_tag } | |
| 141 || ( $wanted_tags{'all'} | |
| 142 && $feat->primary_tag ne | |
| 143 "source" ) | |
| 144 ) | |
| 145 { | |
| 146 push( @results, $self->handle_seq($feat)); | |
| 147 } | |
| 148 } | |
| 149 } | |
| 150 } | |
| 151 if ( $data{'callback'} ) { | |
| 152 $data{'callback'}->( \@results ); | |
| 153 } | |
| 154 else { | |
| 155 return \@results; | |
| 156 } | |
| 157 } | |
| 158 | |
| 159 sub handle_seq { | |
| 160 my ($self, $obj) = @_; | |
| 161 | |
| 162 my @line; | |
| 163 if ( $self->var_header() ){ | |
| 164 $line[0] = '>' . $self->_getIdentifier($obj); | |
| 165 } | |
| 166 | |
| 167 # Get our sequence | |
| 168 $line[1] = $self->intelligent_get_seq($obj); | |
| 169 | |
| 170 if ( $self->var_translate() ) { | |
| 171 $line[1] = $self->translate($line[1]); | |
| 172 } | |
| 173 return \@line; | |
| 174 } | |
| 175 | |
| 176 sub intelligent_get_seq { | |
| 177 my ($self, $obj, %extra) = @_; | |
| 178 # Top level, e.g., fasta/gbk file, "extra" doesn't apply to these | |
| 179 if ( ref $obj eq 'Bio::Seq::RichSeq' || ref $obj eq 'Bio::Seq' ) { | |
| 180 return $obj->seq; | |
| 181 }else{ | |
| 182 return $self->get_seq_from_feature($obj, %extra); | |
| 183 } | |
| 184 } | |
| 185 sub get_seq_from_feature { | |
| 186 my ($self, $feat, %extra) = @_; | |
| 187 my $seq; | |
| 188 my $l; | |
| 189 if($extra{parent}){ | |
| 190 $l = $extra{parent}->length(); | |
| 191 } | |
| 192 | |
| 193 if($extra{upstream}){ | |
| 194 if($feat->strand < 0){ | |
| 195 my $y = $feat->end + 1; | |
| 196 my $z = $feat->end + $extra{upstream}; | |
| 197 if($y < $l){ | |
| 198 if($z > $l){ | |
| 199 $z = $l; | |
| 200 } | |
| 201 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq; | |
| 202 } | |
| 203 }else{ | |
| 204 my $y = $feat->start - $extra{upstream}; | |
| 205 my $z = $feat->start - 1; | |
| 206 if($z > 0){ | |
| 207 if($y < 1){ | |
| 208 $y = 1; | |
| 209 } | |
| 210 $seq .= $extra{parent}->trunc($y, $z)->seq; | |
| 211 } | |
| 212 } | |
| 213 } | |
| 214 if(ref($feat->location) eq 'Bio::Location::Simple'){ | |
| 215 $seq .= $feat->seq->seq(); | |
| 216 }else{ | |
| 217 $seq .= $feat->spliced_seq->seq(); | |
| 218 } | |
| 219 if($extra{downstream}){ | |
| 220 if($feat->strand < 0){ | |
| 221 my $y = $feat->start - $extra{downstream}; | |
| 222 my $z = $feat->start - 1; | |
| 223 if($z > 0){ | |
| 224 if($y < 1){ | |
| 225 $y = 1; | |
| 226 } | |
| 227 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq; | |
| 228 } | |
| 229 }else{ | |
| 230 my $y = $feat->end + 1; | |
| 231 my $z = $feat->end + $extra{downstream}; | |
| 232 if($y < $l){ | |
| 233 if($z > $l){ | |
| 234 $z = $l; | |
| 235 } | |
| 236 $seq .= $extra{parent}->trunc($y, $z)->seq; | |
| 237 } | |
| 238 } | |
| 239 } | |
| 240 | |
| 241 return $seq; | |
| 242 } | |
| 243 | |
| 244 | |
| 245 sub translate { | |
| 246 my ($self, $seq) = @_; | |
| 247 if($seq =~ /^[ACTGN]+$/){ | |
| 248 my %ct = %{$self->codonTable}; | |
| 249 $seq = join( '' , map { if($ct{$_}){ $ct{$_} }else{ () } } unpack("(A3)*", $seq)); | |
| 250 } | |
| 251 return $seq; | |
| 252 } | |
| 253 | |
| 254 no Moose; | |
| 255 1; | |
| 256 | |
| 257 __END__ | |
| 258 | |
| 259 =pod | |
| 260 | |
| 261 =encoding UTF-8 | |
| 262 | |
| 263 =head1 NAME | |
| 264 | |
| 265 CPT::Bio | |
| 266 | |
| 267 =head1 VERSION | |
| 268 | |
| 269 version 1.99.4 | |
| 270 | |
| 271 =head2 _getFeatureTag | |
| 272 | |
| 273 my $tag = $libCPT->_getFeatureTag($feature,'note'); | |
| 274 | |
| 275 returns all values of the given tag, joined with ','. | |
| 276 | |
| 277 =head2 requestCopy | |
| 278 | |
| 279 my $seqobj = $libCPT->requestCopy('file'=>'test.gbk'); | |
| 280 | |
| 281 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file. | |
| 282 | |
| 283 =head2 getSeqIO | |
| 284 | |
| 285 my $seqio = $libCPT->getSeqIO('file'=>'test.gbk'); | |
| 286 | |
| 287 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file. | |
| 288 | |
| 289 =head2 parseFile | |
| 290 | |
| 291 $libCPT->parseFile( | |
| 292 'file' => $options{'file'}, | |
| 293 'callback' => \&func, | |
| 294 'translate' => 1, | |
| 295 'header' => 1, | |
| 296 'subset' => ['CDS', $options{'tag'}], | |
| 297 ); | |
| 298 | |
| 299 Arguably the most important function in this library, wraps a lot of functionality in a clean wrapper, since most of the scripts we have are written around data munging. | |
| 300 | |
| 301 =over 4 | |
| 302 | |
| 303 =item * | |
| 304 | |
| 305 file - the Bio::SeqIO file to process | |
| 306 | |
| 307 =item * | |
| 308 | |
| 309 callback - the function to send our data to. Done all at once, in an array | |
| 310 | |
| 311 =item * | |
| 312 | |
| 313 translate - should we translate the sequence to amino acids if it's not already. | |
| 314 | |
| 315 =item * | |
| 316 | |
| 317 subset - either "whole", a valid tag, or an array of valid tags | |
| 318 | |
| 319 =item * | |
| 320 | |
| 321 header - Do we want a header (FASTA) with our result set | |
| 322 | |
| 323 =back | |
| 324 | |
| 325 =head1 AUTHOR | |
| 326 | |
| 327 Eric Rasche <rasche.eric@yandex.ru> | |
| 328 | |
| 329 =head1 COPYRIGHT AND LICENSE | |
| 330 | |
| 331 This software is Copyright (c) 2014 by Eric Rasche. | |
| 332 | |
| 333 This is free software, licensed under: | |
| 334 | |
| 335 The GNU General Public License, Version 3, June 2007 | |
| 336 | |
| 337 =cut |
