comparison lib/CPT/GBK2GFF3.pm @ 1:97ef96676b48 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:26 +0000
parents
children
comparison
equal deleted inserted replaced
0:b18e8268bf4e 1:97ef96676b48
1 package CPT::GBK2GFF3;
2 use strict;
3 use warnings;
4 use CPT;
5 use Data::Dumper;
6 use autodie;
7 use Bio::SeqIO;
8 use Moose;
9
10 has data => (
11 is => 'rw',
12 isa => 'ArrayRef',
13 default => sub { [] },
14 );
15 has header => (
16 is => 'rw',
17 isa => 'ArrayRef',
18 );
19 has sequence => (
20 is => 'rw',
21 isa => 'Str',
22 );
23 has seqid => (
24 is => 'rw',
25 isa => 'Str',
26 );
27 has genbank => (
28 is => 'rw',
29 isa => 'Str',
30 );
31 has is_circular => (
32 is => 'rw',
33 isa => 'Str',
34 );
35 has id_prefix => ( is => 'rw', isa => 'Str' );
36 has global_feat_idx => ( is => 'rw', isa => 'Int', default => sub { 0 } );
37 has source => ( is => 'rw', isa => 'ArrayRef');
38
39 my %feat_type_count;
40 my %reserved_keys = map { $_ => 1 }
41 qw/ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term/;
42
43 use CPT::Bio;
44 my $bio = CPT::Bio->new();
45 my %key_mapping = (
46 "-" => [ "located_sequence_feature", "SO:0000110" ],
47 "-10_signal" => [ "minus_10_signal", "SO:0000175" ],
48 "-35_signal" => [ "minus_35_signal", "SO:0000176" ],
49 "3'UTR" => [ "three_prime_UTR", "SO:0000205" ],
50 "3'clip" => [ "three_prime_clip", "SO:0000557" ],
51 "5'UTR" => [ "five_prime_UTR", "SO:0000204" ],
52 "5'clip" => [ "five_prime_clip", "SO:0000555" ],
53 "CAAT_signal" => [ "CAAT_signal", "SO:0000172" ],
54 "CDS" => [ "CDS", "SO:0000316" ],
55 "D-loop" => [ "D_loop", "SO:0000297" ],
56 "D_segment" => [ "D_gene", "SO:0000458" ],
57 "GC_signal" => [ "GC_rich_region", "SO:0000173" ],
58 "LTR" => [ "long_terminal_repeat", "SO:0000286" ],
59 "RBS" => [ "ribosome_entry_site", "SO:0000139" ],
60 "STS" => [ "STS", "SO:0000331" ],
61 "TATA_signal" => [ "TATA_box", "SO:0000174" ],
62 "attenuator" => [ "attenuator", "SO:0000140" ],
63 "enhancer" => [ "enhancer", "SO:0000165" ],
64 "exon" => [ "exon", "SO:0000147" ],
65 "gap" => [ "gap", "SO:0000730" ],
66 "gene" => [ "gene", "SO:0000704" ],
67 "iDNA" => [ "iDNA", "SO:0000723" ],
68 "intron" => [ "intron", "SO:0000188" ],
69 "mRNA" => [ "mRNA", "SO:0000234" ],
70 "mat_peptide" => [ "mature_protein_region", "SO:0000419" ],
71 "misc_RNA" => [ "transcript", "SO:0000673" ],
72 "misc_binding" => [ "binding_site", "SO:0000409" ],
73 "misc_difference" => [ "sequence_difference", "SO:0000413" ],
74 "misc_feature" => [ "region", "SO:0000001" ],
75 "misc_recomb" => [ "recombination_feature", "SO:0000298" ],
76 "misc_signal" => [ "regulatory_region", "SO:0005836" ],
77 "misc_structure" => [ "sequence_secondary_structure", "SO:0000002" ],
78 "modified_base" => [ "modified_DNA_base", "SO:0000305" ],
79 "operon" => [ "operon", "SO:0000178" ],
80 "oriT" => [ "origin_of_transfer", "SO:0000724" ],
81 "polyA_signal" => [ "polyA_signal_sequence", "SO:0000551" ],
82 "polyA_site" => [ "polyA_site", "SO:0000553" ],
83 "precursor_RNA" => [ "primary_transcript", "SO:0000185" ],
84 "prim_transcript" => [ "primary_transcript", "SO:0000185" ],
85 "primer_bind" => [ "primer_binding_site", "SO:0005850" ],
86 "promoter" => [ "promoter", "SO:0000167" ],
87 "protein_bind" => [ "protein_binding_site", "SO:0000410" ],
88 "rRNA" => [ "rRNA", "SO:0000252" ],
89 "repeat_region" => [ "repeat_region", "SO:0000657" ],
90 "repeat_unit" => [ "repeat_unit", "SO:0000726" ],
91 "satellite" => [ "satellite_DNA", "SO:0000005" ],
92 "scRNA" => [ "scRNA", "SO:0000013" ],
93 "sig_peptide" => [ "signal_peptide", "SO:0000418" ],
94 "snRNA" => [ "snRNA", "SO:0000274" ],
95 "snoRNA" => [ "snoRNA", "SO:0000275" ],
96 "source" => [ "contig", "SO:0000149" ], # manually modified
97 "stem_loop" => [ "stem_loop", "SO:0000313" ],
98 "tRNA" => [ "tRNA", "SO:0000253" ],
99 "terminator" => [ "terminator", "SO:0000141" ],
100 "transit_peptide" => [ "transit_peptide", "SO:0000725" ],
101 "variation" => [ "sequence_variant", "SO:0000109" ],
102 );
103
104
105 sub get_gff3_file {
106 my ($self) = @_;
107 my @output;
108 for my $header_line ( @{ $self->header() } ) {
109 push( @output, $header_line );
110 }
111 push( @output, join( "\t", @{ $self->get_source } ) );
112 for my $data_line ( @{ $self->data() }) {
113 push( @output, join( "\t", @{$data_line} ) );
114 }
115 push( @output, '##FASTA' );
116 push( @output, '>' . $self->seqid() );
117 my $seq = $self->sequence();
118 $seq =~ s/(.{80})/$1\n/g;
119 push( @output, $seq );
120 return join( "\n", @output );
121 }
122
123 sub escape {
124 my ($self, $str) = @_;
125 $str =~ s/,/%2C/g;
126 $str =~ s/=/%3D/g;
127 $str =~ s/;/%3B/g;
128 $str =~ s/\t/%09/g;
129 return $str;
130 }
131
132 sub FT_SO_map {
133 my ( $self, $key ) = @_;
134 if ( $key_mapping{$key} ) {
135 my @result = @{ $key_mapping{$key} };
136 return $result[0];
137 }
138 else {
139 return 'region';
140 }
141 }
142
143 sub source_map {
144 my ( $self, $type ) = @_;
145 if ( $self->{'override_source'} ) {
146 return $self->{'override_source'};
147 }
148 if ( $self->{'annotation_software'}{$type} ) {
149 return $self->{'annotation_software'}{$type};
150 }
151 else {
152 return '.';
153 }
154 }
155
156 sub get_attrs {
157 my ( $self, %data ) = @_;
158 my $feature = $data{'feat'};
159 my $parents_ref = $data{'parents'};
160
161 my %attrs = ();
162 $attrs{'ID'} =
163 $self->id_prefix() . '.' . ($self->global_feat_idx());
164
165 $self->global_feat_idx($self->global_feat_idx()+1);
166
167 # Handle Identifier
168 my $identifier = $bio->_getIdentifier($feature);
169 if ( $identifier ne 'ERROR' ) {
170 $attrs{'Name'} = $identifier . '.' . $feature->primary_tag;
171 }
172
173 # Handle parents, if there are any
174 if ($parents_ref) {
175 if ( ref($parents_ref) eq 'ARRAY' ) {
176 $attrs{'Parent'} = $parents_ref;
177 }
178 else {
179 $attrs{'Parent'} = [$parents_ref];
180 }
181 }
182
183 # These are otherwise "Special" keys that need to be handled differently.
184 if ( $feature->has_tag('note') ) {
185 my @notes = $feature->get_tag_values('note');
186 $attrs{'Note'} = \@notes;
187 }
188 if ( $feature->has_tag('db_xref') ) {
189 my @dbxref = $feature->get_tag_values('db_xref');
190 $attrs{'Dbxref'} = \@dbxref;
191 }
192
193 # Do the rest
194 for my $tag ( $feature->get_all_tags() ) {
195
196 # If not one of the specially handled ones
197 if ( $tag ne 'name' && $tag ne 'note' && $tag ne 'db_xref' ) {
198
199 # If not a reserved_key
200 if ( !$reserved_keys{$tag} ) {
201 my @vals = $feature->get_tag_values($tag);
202 $attrs{lc($tag)} = \@vals;
203 }
204 else {
205 warn
206 "Trying to set a reserved key $tag with value $attrs{$tag}";
207 }
208 }
209 }
210 my %response = (
211 id => $attrs{'ID'},
212 attr_str => $self->post_process_attribute_string(%attrs),
213 );
214 return %response;
215 }
216
217 sub post_process_attribute_string {
218 my ($self,%attrs) = @_;
219 my @parts = ();
220 for my $k ( keys %attrs ) {
221
222 # IGNORED TAGS
223 if ( $k ne 'translation' && $k ne 'product' ) {
224 my $joined =
225 $self->escape_and_join_attribute_subpart( $attrs{$k} );
226 push @parts, "$k=$joined";
227 }
228 }
229
230 #print STDERR join("\t",@parts),"\n";
231 return join( ";", @parts );
232
233 }
234
235 sub escape_and_join_attribute_subpart {
236 my ($self,$ref) = @_;
237 if ( ref($ref) eq 'ARRAY' ) {
238 my @attrs = @{$ref};
239 return join( ",", map { $self->escape($_) } @attrs );
240 }
241 else { #scalar
242 return $self->escape($ref);
243 }
244 }
245
246 sub get_source {
247 my ($self) = @_;
248 if ( !$self->source() ) {
249 $self->auto_source();
250 }
251 return $self->source();
252 }
253
254 sub auto_source {
255
256 # Auto generate a source feature, in case there isn't one.
257 my ($self) = @_;
258 my @region = (
259 $self->seqid(),
260 ( $self->genbank() ? 'Genbank' : 'Assembly' ),
261 'contig',
262 1,
263 $self->get_length,
264 '.',
265 '.',
266 '.',
267 sprintf( "ID=%s;Name=%s", $self->seqid(), $self->seqid() )
268 . ( $self->is_circular() ? ";Is_circular=True" : "" )
269 );
270 $self->source(\@region);
271 }
272
273 sub add_feature {
274 my ( $self, $feat ) = @_;
275 my $primary_tag = $feat->primary_tag;
276 if ( $primary_tag eq 'CDS' ) {
277 my ( $id, $data_0 ) = $self->_add_gene( feat => $feat, );
278 my ($data_1) = $self->_add_feature(
279 feat => $feat,
280 parent => $id,
281 );
282 $self->_low_level_add_feature($data_0);
283 $self->_low_level_add_feature($data_1);
284 }
285 elsif ( $primary_tag eq 'source' ) {
286 my ($data) = $self->_add_feature( feat => $feat, );
287
288 # YUCK.
289 my @z = @{$data};
290 my $seqid = $self->seqid();
291 $z[8] =~ s/ID=[^;]*;/ID=$seqid;/g;
292 $self->source( \@z );
293 }
294 elsif ( $primary_tag ne 'gene' ) {
295 my ($data) = $self->_add_feature( feat => $feat, );
296 $self->_low_level_add_feature($data);
297 }
298
299 }
300
301 sub _low_level_add_feature {
302 my ( $self, $data ) = @_;
303 push( @{ $self->data() }, $data );
304 }
305
306 sub _add_feature {
307 my ( $self, %data ) = @_;
308 my %attrs =
309 $self->get_attrs( feat => $data{feat}, parents => $data{parent} );
310 my @data = (
311 $self->seqid(),
312 $self->source_map( $data{feat}->primary_tag ),
313 $self->FT_SO_map( $data{feat}->primary_tag ),
314 $data{feat}->start,
315 $data{feat}->end,
316 '.',
317 ( $data{feat}->strand == 1 ? '+' : '-' ),
318 '.',
319 $attrs{attr_str}
320 );
321
322 #$self->_low_level_add_feature(\@data);
323 return \@data;
324 }
325
326 sub _add_gene {
327 my ( $self, %data ) = @_;
328 my $id = $self->id_prefix() . '.' . ( $self->global_feat_idx());
329 $self->global_feat_idx($self->global_feat_idx()+1);
330 my $gene_count = ++$feat_type_count{'gene'};
331 my @data = (
332 $self->seqid(),
333 $self->source_map( $data{feat}->primary_tag ),
334 $self->FT_SO_map('gene'),
335 $data{feat}->start,
336 $data{feat}->end,
337 '.',
338 ( $data{feat}->strand == 1 ? '+' : '-' ),
339 '.',
340 "ID=$id;Name=Gene$gene_count"
341 );
342
343 #$self->_low_level_add_feature(\@data);
344 return ( $id, \@data );
345 }
346
347 no Moose;
348 1;
349
350 __END__
351
352 =pod
353
354 =encoding UTF-8
355
356 =head1 NAME
357
358 CPT::GBK2GFF3
359
360 =head1 VERSION
361
362 version 1.99.4
363
364 =head1 AUTHOR
365
366 Eric Rasche <rasche.eric@yandex.ru>
367
368 =head1 COPYRIGHT AND LICENSE
369
370 This software is Copyright (c) 2014 by Eric Rasche.
371
372 This is free software, licensed under:
373
374 The GNU General Public License, Version 3, June 2007
375
376 =cut