Mercurial > repos > cpt > cpt_psm_plotter
comparison lib/CPT/GBK2GFF3.pm @ 1:8691c1c61a8e draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt | 
|---|---|
| date | Mon, 05 Jun 2023 02:48:47 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:54c7a3ea81e2 | 1:8691c1c61a8e | 
|---|---|
| 1 package CPT::GBK2GFF3; | |
| 2 use strict; | |
| 3 use warnings; | |
| 4 use CPT; | |
| 5 use Data::Dumper; | |
| 6 use autodie; | |
| 7 use Bio::SeqIO; | |
| 8 use Moose; | |
| 9 | |
| 10 has data => ( | |
| 11 is => 'rw', | |
| 12 isa => 'ArrayRef', | |
| 13 default => sub { [] }, | |
| 14 ); | |
| 15 has header => ( | |
| 16 is => 'rw', | |
| 17 isa => 'ArrayRef', | |
| 18 ); | |
| 19 has sequence => ( | |
| 20 is => 'rw', | |
| 21 isa => 'Str', | |
| 22 ); | |
| 23 has seqid => ( | |
| 24 is => 'rw', | |
| 25 isa => 'Str', | |
| 26 ); | |
| 27 has genbank => ( | |
| 28 is => 'rw', | |
| 29 isa => 'Str', | |
| 30 ); | |
| 31 has is_circular => ( | |
| 32 is => 'rw', | |
| 33 isa => 'Str', | |
| 34 ); | |
| 35 has id_prefix => ( is => 'rw', isa => 'Str' ); | |
| 36 has global_feat_idx => ( is => 'rw', isa => 'Int', default => sub { 0 } ); | |
| 37 has source => ( is => 'rw', isa => 'ArrayRef'); | |
| 38 | |
| 39 my %feat_type_count; | |
| 40 my %reserved_keys = map { $_ => 1 } | |
| 41 qw/ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term/; | |
| 42 | |
| 43 use CPT::Bio; | |
| 44 my $bio = CPT::Bio->new(); | |
| 45 my %key_mapping = ( | |
| 46 "-" => [ "located_sequence_feature", "SO:0000110" ], | |
| 47 "-10_signal" => [ "minus_10_signal", "SO:0000175" ], | |
| 48 "-35_signal" => [ "minus_35_signal", "SO:0000176" ], | |
| 49 "3'UTR" => [ "three_prime_UTR", "SO:0000205" ], | |
| 50 "3'clip" => [ "three_prime_clip", "SO:0000557" ], | |
| 51 "5'UTR" => [ "five_prime_UTR", "SO:0000204" ], | |
| 52 "5'clip" => [ "five_prime_clip", "SO:0000555" ], | |
| 53 "CAAT_signal" => [ "CAAT_signal", "SO:0000172" ], | |
| 54 "CDS" => [ "CDS", "SO:0000316" ], | |
| 55 "D-loop" => [ "D_loop", "SO:0000297" ], | |
| 56 "D_segment" => [ "D_gene", "SO:0000458" ], | |
| 57 "GC_signal" => [ "GC_rich_region", "SO:0000173" ], | |
| 58 "LTR" => [ "long_terminal_repeat", "SO:0000286" ], | |
| 59 "RBS" => [ "ribosome_entry_site", "SO:0000139" ], | |
| 60 "STS" => [ "STS", "SO:0000331" ], | |
| 61 "TATA_signal" => [ "TATA_box", "SO:0000174" ], | |
| 62 "attenuator" => [ "attenuator", "SO:0000140" ], | |
| 63 "enhancer" => [ "enhancer", "SO:0000165" ], | |
| 64 "exon" => [ "exon", "SO:0000147" ], | |
| 65 "gap" => [ "gap", "SO:0000730" ], | |
| 66 "gene" => [ "gene", "SO:0000704" ], | |
| 67 "iDNA" => [ "iDNA", "SO:0000723" ], | |
| 68 "intron" => [ "intron", "SO:0000188" ], | |
| 69 "mRNA" => [ "mRNA", "SO:0000234" ], | |
| 70 "mat_peptide" => [ "mature_protein_region", "SO:0000419" ], | |
| 71 "misc_RNA" => [ "transcript", "SO:0000673" ], | |
| 72 "misc_binding" => [ "binding_site", "SO:0000409" ], | |
| 73 "misc_difference" => [ "sequence_difference", "SO:0000413" ], | |
| 74 "misc_feature" => [ "region", "SO:0000001" ], | |
| 75 "misc_recomb" => [ "recombination_feature", "SO:0000298" ], | |
| 76 "misc_signal" => [ "regulatory_region", "SO:0005836" ], | |
| 77 "misc_structure" => [ "sequence_secondary_structure", "SO:0000002" ], | |
| 78 "modified_base" => [ "modified_DNA_base", "SO:0000305" ], | |
| 79 "operon" => [ "operon", "SO:0000178" ], | |
| 80 "oriT" => [ "origin_of_transfer", "SO:0000724" ], | |
| 81 "polyA_signal" => [ "polyA_signal_sequence", "SO:0000551" ], | |
| 82 "polyA_site" => [ "polyA_site", "SO:0000553" ], | |
| 83 "precursor_RNA" => [ "primary_transcript", "SO:0000185" ], | |
| 84 "prim_transcript" => [ "primary_transcript", "SO:0000185" ], | |
| 85 "primer_bind" => [ "primer_binding_site", "SO:0005850" ], | |
| 86 "promoter" => [ "promoter", "SO:0000167" ], | |
| 87 "protein_bind" => [ "protein_binding_site", "SO:0000410" ], | |
| 88 "rRNA" => [ "rRNA", "SO:0000252" ], | |
| 89 "repeat_region" => [ "repeat_region", "SO:0000657" ], | |
| 90 "repeat_unit" => [ "repeat_unit", "SO:0000726" ], | |
| 91 "satellite" => [ "satellite_DNA", "SO:0000005" ], | |
| 92 "scRNA" => [ "scRNA", "SO:0000013" ], | |
| 93 "sig_peptide" => [ "signal_peptide", "SO:0000418" ], | |
| 94 "snRNA" => [ "snRNA", "SO:0000274" ], | |
| 95 "snoRNA" => [ "snoRNA", "SO:0000275" ], | |
| 96 "source" => [ "contig", "SO:0000149" ], # manually modified | |
| 97 "stem_loop" => [ "stem_loop", "SO:0000313" ], | |
| 98 "tRNA" => [ "tRNA", "SO:0000253" ], | |
| 99 "terminator" => [ "terminator", "SO:0000141" ], | |
| 100 "transit_peptide" => [ "transit_peptide", "SO:0000725" ], | |
| 101 "variation" => [ "sequence_variant", "SO:0000109" ], | |
| 102 ); | |
| 103 | |
| 104 | |
| 105 sub get_gff3_file { | |
| 106 my ($self) = @_; | |
| 107 my @output; | |
| 108 for my $header_line ( @{ $self->header() } ) { | |
| 109 push( @output, $header_line ); | |
| 110 } | |
| 111 push( @output, join( "\t", @{ $self->get_source } ) ); | |
| 112 for my $data_line ( @{ $self->data() }) { | |
| 113 push( @output, join( "\t", @{$data_line} ) ); | |
| 114 } | |
| 115 push( @output, '##FASTA' ); | |
| 116 push( @output, '>' . $self->seqid() ); | |
| 117 my $seq = $self->sequence(); | |
| 118 $seq =~ s/(.{80})/$1\n/g; | |
| 119 push( @output, $seq ); | |
| 120 return join( "\n", @output ); | |
| 121 } | |
| 122 | |
| 123 sub escape { | |
| 124 my ($self, $str) = @_; | |
| 125 $str =~ s/,/%2C/g; | |
| 126 $str =~ s/=/%3D/g; | |
| 127 $str =~ s/;/%3B/g; | |
| 128 $str =~ s/\t/%09/g; | |
| 129 return $str; | |
| 130 } | |
| 131 | |
| 132 sub FT_SO_map { | |
| 133 my ( $self, $key ) = @_; | |
| 134 if ( $key_mapping{$key} ) { | |
| 135 my @result = @{ $key_mapping{$key} }; | |
| 136 return $result[0]; | |
| 137 } | |
| 138 else { | |
| 139 return 'region'; | |
| 140 } | |
| 141 } | |
| 142 | |
| 143 sub source_map { | |
| 144 my ( $self, $type ) = @_; | |
| 145 if ( $self->{'override_source'} ) { | |
| 146 return $self->{'override_source'}; | |
| 147 } | |
| 148 if ( $self->{'annotation_software'}{$type} ) { | |
| 149 return $self->{'annotation_software'}{$type}; | |
| 150 } | |
| 151 else { | |
| 152 return '.'; | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 sub get_attrs { | |
| 157 my ( $self, %data ) = @_; | |
| 158 my $feature = $data{'feat'}; | |
| 159 my $parents_ref = $data{'parents'}; | |
| 160 | |
| 161 my %attrs = (); | |
| 162 $attrs{'ID'} = | |
| 163 $self->id_prefix() . '.' . ($self->global_feat_idx()); | |
| 164 | |
| 165 $self->global_feat_idx($self->global_feat_idx()+1); | |
| 166 | |
| 167 # Handle Identifier | |
| 168 my $identifier = $bio->_getIdentifier($feature); | |
| 169 if ( $identifier ne 'ERROR' ) { | |
| 170 $attrs{'Name'} = $identifier . '.' . $feature->primary_tag; | |
| 171 } | |
| 172 | |
| 173 # Handle parents, if there are any | |
| 174 if ($parents_ref) { | |
| 175 if ( ref($parents_ref) eq 'ARRAY' ) { | |
| 176 $attrs{'Parent'} = $parents_ref; | |
| 177 } | |
| 178 else { | |
| 179 $attrs{'Parent'} = [$parents_ref]; | |
| 180 } | |
| 181 } | |
| 182 | |
| 183 # These are otherwise "Special" keys that need to be handled differently. | |
| 184 if ( $feature->has_tag('note') ) { | |
| 185 my @notes = $feature->get_tag_values('note'); | |
| 186 $attrs{'Note'} = \@notes; | |
| 187 } | |
| 188 if ( $feature->has_tag('db_xref') ) { | |
| 189 my @dbxref = $feature->get_tag_values('db_xref'); | |
| 190 $attrs{'Dbxref'} = \@dbxref; | |
| 191 } | |
| 192 | |
| 193 # Do the rest | |
| 194 for my $tag ( $feature->get_all_tags() ) { | |
| 195 | |
| 196 # If not one of the specially handled ones | |
| 197 if ( $tag ne 'name' && $tag ne 'note' && $tag ne 'db_xref' ) { | |
| 198 | |
| 199 # If not a reserved_key | |
| 200 if ( !$reserved_keys{$tag} ) { | |
| 201 my @vals = $feature->get_tag_values($tag); | |
| 202 $attrs{lc($tag)} = \@vals; | |
| 203 } | |
| 204 else { | |
| 205 warn | |
| 206 "Trying to set a reserved key $tag with value $attrs{$tag}"; | |
| 207 } | |
| 208 } | |
| 209 } | |
| 210 my %response = ( | |
| 211 id => $attrs{'ID'}, | |
| 212 attr_str => $self->post_process_attribute_string(%attrs), | |
| 213 ); | |
| 214 return %response; | |
| 215 } | |
| 216 | |
| 217 sub post_process_attribute_string { | |
| 218 my ($self,%attrs) = @_; | |
| 219 my @parts = (); | |
| 220 for my $k ( keys %attrs ) { | |
| 221 | |
| 222 # IGNORED TAGS | |
| 223 if ( $k ne 'translation' && $k ne 'product' ) { | |
| 224 my $joined = | |
| 225 $self->escape_and_join_attribute_subpart( $attrs{$k} ); | |
| 226 push @parts, "$k=$joined"; | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 #print STDERR join("\t",@parts),"\n"; | |
| 231 return join( ";", @parts ); | |
| 232 | |
| 233 } | |
| 234 | |
| 235 sub escape_and_join_attribute_subpart { | |
| 236 my ($self,$ref) = @_; | |
| 237 if ( ref($ref) eq 'ARRAY' ) { | |
| 238 my @attrs = @{$ref}; | |
| 239 return join( ",", map { $self->escape($_) } @attrs ); | |
| 240 } | |
| 241 else { #scalar | |
| 242 return $self->escape($ref); | |
| 243 } | |
| 244 } | |
| 245 | |
| 246 sub get_source { | |
| 247 my ($self) = @_; | |
| 248 if ( !$self->source() ) { | |
| 249 $self->auto_source(); | |
| 250 } | |
| 251 return $self->source(); | |
| 252 } | |
| 253 | |
| 254 sub auto_source { | |
| 255 | |
| 256 # Auto generate a source feature, in case there isn't one. | |
| 257 my ($self) = @_; | |
| 258 my @region = ( | |
| 259 $self->seqid(), | |
| 260 ( $self->genbank() ? 'Genbank' : 'Assembly' ), | |
| 261 'contig', | |
| 262 1, | |
| 263 $self->get_length, | |
| 264 '.', | |
| 265 '.', | |
| 266 '.', | |
| 267 sprintf( "ID=%s;Name=%s", $self->seqid(), $self->seqid() ) | |
| 268 . ( $self->is_circular() ? ";Is_circular=True" : "" ) | |
| 269 ); | |
| 270 $self->source(\@region); | |
| 271 } | |
| 272 | |
| 273 sub add_feature { | |
| 274 my ( $self, $feat ) = @_; | |
| 275 my $primary_tag = $feat->primary_tag; | |
| 276 if ( $primary_tag eq 'CDS' ) { | |
| 277 my ( $id, $data_0 ) = $self->_add_gene( feat => $feat, ); | |
| 278 my ($data_1) = $self->_add_feature( | |
| 279 feat => $feat, | |
| 280 parent => $id, | |
| 281 ); | |
| 282 $self->_low_level_add_feature($data_0); | |
| 283 $self->_low_level_add_feature($data_1); | |
| 284 } | |
| 285 elsif ( $primary_tag eq 'source' ) { | |
| 286 my ($data) = $self->_add_feature( feat => $feat, ); | |
| 287 | |
| 288 # YUCK. | |
| 289 my @z = @{$data}; | |
| 290 my $seqid = $self->seqid(); | |
| 291 $z[8] =~ s/ID=[^;]*;/ID=$seqid;/g; | |
| 292 $self->source( \@z ); | |
| 293 } | |
| 294 elsif ( $primary_tag ne 'gene' ) { | |
| 295 my ($data) = $self->_add_feature( feat => $feat, ); | |
| 296 $self->_low_level_add_feature($data); | |
| 297 } | |
| 298 | |
| 299 } | |
| 300 | |
| 301 sub _low_level_add_feature { | |
| 302 my ( $self, $data ) = @_; | |
| 303 push( @{ $self->data() }, $data ); | |
| 304 } | |
| 305 | |
| 306 sub _add_feature { | |
| 307 my ( $self, %data ) = @_; | |
| 308 my %attrs = | |
| 309 $self->get_attrs( feat => $data{feat}, parents => $data{parent} ); | |
| 310 my @data = ( | |
| 311 $self->seqid(), | |
| 312 $self->source_map( $data{feat}->primary_tag ), | |
| 313 $self->FT_SO_map( $data{feat}->primary_tag ), | |
| 314 $data{feat}->start, | |
| 315 $data{feat}->end, | |
| 316 '.', | |
| 317 ( $data{feat}->strand == 1 ? '+' : '-' ), | |
| 318 '.', | |
| 319 $attrs{attr_str} | |
| 320 ); | |
| 321 | |
| 322 #$self->_low_level_add_feature(\@data); | |
| 323 return \@data; | |
| 324 } | |
| 325 | |
| 326 sub _add_gene { | |
| 327 my ( $self, %data ) = @_; | |
| 328 my $id = $self->id_prefix() . '.' . ( $self->global_feat_idx()); | |
| 329 $self->global_feat_idx($self->global_feat_idx()+1); | |
| 330 my $gene_count = ++$feat_type_count{'gene'}; | |
| 331 my @data = ( | |
| 332 $self->seqid(), | |
| 333 $self->source_map( $data{feat}->primary_tag ), | |
| 334 $self->FT_SO_map('gene'), | |
| 335 $data{feat}->start, | |
| 336 $data{feat}->end, | |
| 337 '.', | |
| 338 ( $data{feat}->strand == 1 ? '+' : '-' ), | |
| 339 '.', | |
| 340 "ID=$id;Name=Gene$gene_count" | |
| 341 ); | |
| 342 | |
| 343 #$self->_low_level_add_feature(\@data); | |
| 344 return ( $id, \@data ); | |
| 345 } | |
| 346 | |
| 347 no Moose; | |
| 348 1; | |
| 349 | |
| 350 __END__ | |
| 351 | |
| 352 =pod | |
| 353 | |
| 354 =encoding UTF-8 | |
| 355 | |
| 356 =head1 NAME | |
| 357 | |
| 358 CPT::GBK2GFF3 | |
| 359 | |
| 360 =head1 VERSION | |
| 361 | |
| 362 version 1.99.4 | |
| 363 | |
| 364 =head1 AUTHOR | |
| 365 | |
| 366 Eric Rasche <rasche.eric@yandex.ru> | |
| 367 | |
| 368 =head1 COPYRIGHT AND LICENSE | |
| 369 | |
| 370 This software is Copyright (c) 2014 by Eric Rasche. | |
| 371 | |
| 372 This is free software, licensed under: | |
| 373 | |
| 374 The GNU General Public License, Version 3, June 2007 | |
| 375 | |
| 376 =cut | 
