comparison Perl/bp_genbank2gff3.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/opt/anaconda1anaconda2anaconda3/bin/perl
2
3 eval 'exec /opt/anaconda1anaconda2anaconda3/bin/perl -S $0 ${1+"$@"}'
4 if 0; # not running under some shell
5
6
7
8 =pod
9
10 =head1 NAME
11
12 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
13
14 =head1 SYNOPSIS
15
16 bp_genbank2gff3.pl [options] filename(s)
17
18 # process a directory containing GenBank flatfiles
19 perl bp_genbank2gff3.pl --dir path_to_files --zip
20
21 # process a single file, ignore explicit exons and introns
22 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
23
24 # process a list of files
25 perl bp_genbank2gff3.pl *gbk.gz
26
27 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
28 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
29 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
30 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
31
32 Options:
33 --noinfer -r don't infer exon/mRNA subfeatures
34 --conf -i path to the curation configuration file that contains user preferences
35 for Genbank entries (must be YAML format)
36 (if --manual is passed without --ini, user will be prompted to
37 create the file if any manual input is saved)
38 --sofile -l path to to the so.obo file to use for feature type mapping
39 (--sofile live will download the latest online revision)
40 --manual -m when trying to guess the proper SO term, if more than
41 one option matches the primary tag, the converter will
42 wait for user input to choose the correct one
43 (only works with --sofile)
44 --dir -d path to a list of genbank flatfiles
45 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)
46 --zip -z compress GFF3 output files with gzip
47 --summary -s print a summary of the features in each contig
48 --filter -x genbank feature type(s) to ignore
49 --split -y split output to separate GFF and fasta files for
50 each genbank record
51 --nolump -n separate file for each reference sequence
52 (default is to lump all records together into one
53 output file for each input file)
54 --ethresh -e error threshold for unflattener
55 set this high (>2) to ignore all unflattener errors
56 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
57 model. --CDS is default. Use --CDS to keep default GFF gene model,
58 use --noCDS to convert to g-r-p-e.
59 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
60 (GenBank is default)
61 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
62 --quiet don't talk about what is being processed
63 --typesource SO sequence type for source (e.g. chromosome; region; contig)
64 --help -h display this message
65
66
67 =head1 DESCRIPTION
68
69 This script uses Bio::SeqFeature::Tools::Unflattener and
70 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
71 containment hierarchies mapped for optimal display in gbrowse.
72
73 The input files are assumed to be gzipped GenBank flatfiles for refseq
74 contigs. The files may contain multiple GenBank records. Either a
75 single file or an entire directory can be processed. By default, the
76 DNA sequence is embedded in the GFF but it can be saved into separate
77 fasta file with the --split(-y) option.
78
79 If an input file contains multiple records, the default behaviour is
80 to dump all GFF and sequence to a file of the same name (with .gff
81 appended). Using the 'nolump' option will create a separate file for
82 each genbank record. Using the 'split' option will create separate
83 GFF and Fasta files for each genbank record.
84
85
86 =head2 Notes
87
88 =head3 'split' and 'nolump' produce many files
89
90 In cases where the input files contain many GenBank records (for
91 example, the chromosome files for the mouse genome build), a very
92 large number of output files will be produced if the 'split' or
93 'nolump' options are selected. If you do have lists of files E<gt> 6000,
94 use the --long_list option in bp_bulk_load_gff.pl or
95 bp_fast_load_gff.pl to load the gff and/ or fasta files.
96
97 =head3 Designed for RefSeq
98
99 This script is designed for RefSeq genomic sequence entries. It may
100 work for third party annotations but this has not been tested.
101 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
102 if you don't mind some gene model unflattener errors (dgg).
103
104 =head3 G-R-P-E Gene Model
105
106 Don Gilbert worked this over with needs to produce GFF3 suited to
107 loading to GMOD Chado databases. Most of the changes I believe are
108 suited for general use. One main chado-specific addition is the
109 --[no]cds2protein flag
110
111 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
112 For general use it probably should be OFF, enabled with --cds2prot.
113
114 This writes GFF with an alternate, but useful Gene model,
115 instead of the consensus model for GFF3
116
117 [ gene > mRNA> (exon,CDS,UTR) ]
118
119 This alternate is
120
121 gene > mRNA > polypeptide > exon
122
123 means the only feature with dna bases is the exon. The others
124 specify only location ranges on a genome. Exon of course is a child
125 of mRNA and protein/peptide.
126
127 The protein/polypeptide feature is an important one, having all the
128 annotations of the GenBank CDS feature, protein ID, translation, GO
129 terms, Dbxrefs to other proteins.
130
131 UTRs, introns, CDS-exons are all inferred from the primary exon bases
132 inside/outside appropriate higher feature ranges. Other special gene
133 model features remain the same.
134
135 Several other improvements and bugfixes, minor but useful are included
136
137 * IO pipes now work:
138 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
139
140 * GenBank main record fields are added to source feature, e.g. organism, date,
141 and the sourcetype, commonly chromosome for genomes, is used.
142
143 * Gene Model handling for ncRNA, pseudogenes are added.
144
145 * GFF header is cleaner, more informative.
146 --GFF_VERSION flag allows choice of v2 as well as default v3
147
148 * GFF ##FASTA inclusion is improved, and
149 CDS translation sequence is moved to FASTA records.
150
151 * FT -> GFF attribute mapping is improved.
152
153 * --format choice of SeqIO input formats (GenBank default).
154 Uniprot/Swissprot and EMBL work and produce useful GFF.
155
156 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
157 and more flexible usage.
158
159 =head1 TODO
160
161 =head2 Are these additions desired?
162
163 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY
164 * handle Entrezgene, other non-sequence SeqIO structures (really should change
165 those parsers to produce consistent annotation tags).
166
167 =head2 Related bugfixes/tests
168
169 These items from Bioperl mail were tested (sample data generating
170 errors), and found corrected:
171
172 From: Ed Green <green <at> eva.mpg.de>
173 Subject: genbank2gff3.pl on new human RefSeq
174 Date: 2006-03-13 21:22:26 GMT
175 -- unspecified errors (sample data works now).
176
177 From: Eric Just <e-just <at> northwestern.edu>
178 Subject: genbank2gff3.pl
179 Date: 2007-01-26 17:08:49 GMT
180 -- bug fixed in genbank2gff3 for multi-record handling
181
182 This error is for a /trans_splice gene that is hard to handle, and
183 unflattner/genbank2 doesn't
184
185 From: Chad Matsalla <chad <at> dieselwurks.com>
186 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
187 Date: 2005-07-15 19:51:48 GMT
188
189 =head1 AUTHOR
190
191 Sheldon McKay (mckays@cshl.edu)
192
193 Copyright (c) 2004 Cold Spring Harbor Laboratory.
194
195 =head2 AUTHOR of hacks for GFF2Chado loading
196
197 Don Gilbert (gilbertd@indiana.edu)
198
199
200 =cut
201
202 use strict;
203 use warnings;
204
205 use lib "$ENV{HOME}/bioperl-live";
206 # chad put this here to enable situations when this script is tested
207 # against bioperl compiled into blib along with other programs using blib
208 BEGIN {
209 unshift(@INC,'blib/lib');
210 };
211 use Pod::Usage;
212 use Bio::Root::RootI;
213 use Bio::SeqIO;
214 use File::Spec;
215 use Bio::SeqFeature::Tools::Unflattener;
216 use Bio::SeqFeature::Tools::TypeMapper;
217 use Bio::SeqFeature::Tools::IDHandler;
218 use Bio::Location::SplitLocationI;
219 use Bio::Location::Simple;
220 use Bio::Tools::GFF;
221 use Getopt::Long;
222 use List::Util qw(first);
223 use Bio::OntologyIO;
224 use YAML qw(Dump LoadFile DumpFile);
225 use File::Basename;
226
227 use vars qw/$split @filter $zip $outdir $help $ethresh
228 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
229 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
230 $file @files $dir $summary $nolump
231 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
232 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
233
234 use constant SO_URL =>
235 'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
236 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
237 use constant ALPHABET_TO_NUMBER => {
238 a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
239 j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
240 r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
241 z => 25,
242 };
243 use constant ALPHABET_DIVISOR => 26;
244 use constant GM_NEW_TOPLEVEL => 2;
245 use constant GM_NEW_PART => 1;
246 use constant GM_DUP_PART => 0;
247 use constant GM_NOT_PART => -1;
248
249 # Options cycle in multiples of 2 because of left side/right side pairing.
250 # You can make this number odd, but displayed matches will still round up
251 use constant OPTION_CYCLE => 6;
252
253 $GFF_VERSION = 3; # allow v2 ...
254 $verbose = 1; # right default? -nov to turn off
255
256 # dgg: change the gene model to Gene/mRNA/Polypeptide/exons...
257 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()
258 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
259 # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support
260
261 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
262 my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work
263 # other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG
264
265
266 my %TAG_MAP = (
267 db_xref => 'Dbxref',
268 name => 'Name',
269 note => 'Note', # also pull GO: ids into Ontology_term
270 synonym => 'Alias',
271 symbol => 'Alias', # is symbol still used?
272 # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
273 # translation: handled in gene_features
274 );
275
276
277 $| = 1;
278 my $quiet= !$verbose;
279 my $ok= GetOptions( 'd|dir|input:s' => \$dir,
280 'z|zip' => \$zip,
281 'h|help' => \$help,
282 's|summary' => \$summary,
283 'r|noinfer' => \$noinfer,
284 'i|conf=s' => \$CONF,
285 'sofile=s' => \$SO_FILE,
286 'm|manual' => \$MANUAL,
287 'o|outdir|output:s'=> \$outdir,
288 'x|filter:s'=> \@filter,
289 'y|split' => \$split,
290 "ethresh|e=s"=>\$ethresh,
291 'c|CDS!' => \$CDSkeep,
292 'f|format=s' => \$FORMAT,
293 'typesource=s' => \$source_type,
294 'GFF_VERSION=s' => \$GFF_VERSION,
295 'quiet!' => \$quiet, # swap quiet to verbose
296 'DEBUG!' => \$DEBUG,
297 'n|nolump' => \$nolump);
298
299 my $lump = 1 unless $nolump || $split;
300 $verbose= !$quiet;
301
302 # look for help request
303 pod2usage(2) if $help || !$ok;
304
305 # keep SOURCEID as-is and change FORMAT for SeqIO types;
306 # note SeqIO uses file.suffix to guess type; not useful here
307 $SOURCEID= $FORMAT;
308 $FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/;
309 $verbose =1 if($DEBUG);
310
311 # initialize handlers
312 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
313 $unflattener->error_threshold($ethresh) if $ethresh;
314 $unflattener->verbose(1) if($DEBUG);
315 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
316 # ensembl parsing is still problematic, forget this
317
318 my $tm = Bio::SeqFeature::Tools::TypeMapper->new;
319 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
320
321 # dgg
322 $source_type ||= "region"; # should really parse from FT.source contents below
323
324 #my $FTSOmap = $tm->FT_SO_map();
325 my $FTSOmap;
326 my $FTSOsynonyms;
327
328 if (defined($SO_FILE) && $SO_FILE eq 'live') {
329 print "\nDownloading the latest SO file from ".SO_URL."\n\n";
330 use LWP::UserAgent;
331 my $ua = LWP::UserAgent->new(timeout => 30);
332 my $request = HTTP::Request->new(GET => SO_URL);
333 my $response = $ua->request($request);
334
335
336 if ($response->status_line =~ /200/) {
337 use File::Temp qw/ tempfile /;
338 my ($fh, $fn) = tempfile();
339 print $fh $response->content;
340 $SO_FILE = $fn;
341 } else {
342 print "Couldn't download SO file online...skipping validation.\n"
343 . "HTTP Status was " . $response->status_line . "\n"
344 and undef $SO_FILE
345 }
346 }
347
348 if ($SO_FILE) {
349
350
351 my (%terms, %syn);
352
353 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
354 $ONTOLOGY = $parser->next_ontology();
355
356 for ($ONTOLOGY->get_all_terms) {
357 my $feat = $_;
358
359 $terms{$feat->name} = $feat->name;
360 #$terms{$feat->name} = $feat;
361
362 my @syn = $_->each_synonym;
363
364 push @{$syn{$_}}, $feat->name for @syn;
365 #push @{$syn{$_}}, $feat for @syn;
366 }
367
368 $FTSOmap = \%terms;
369 $FTSOsynonyms = \%syn;
370
371 my %hardTerms = %{ $tm->FT_SO_map() };
372 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
373
374 } else {
375 my %terms = %{ $tm->FT_SO_map() };
376 while (my ($k,$v) = each %terms) {
377 $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
378 }
379 }
380
381 $TYPE_MAP = $FTSOmap;
382 $SYN_MAP = $FTSOsynonyms;
383
384
385 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
386
387 # stringify filter list if applicable
388 my $filter = join ' ', @filter if @filter;
389
390 # determine input files
391 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
392 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
393 $stdin=1; $dir=''; @files=('stdin');
394
395 } elsif ( $dir ) {
396 if ( -d $dir ) {
397 opendir DIR, $dir or die "could not open $dir for reading: $!";
398 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
399 closedir DIR;
400 }
401 else {
402 die "$dir is not a directory\n";
403 }
404 }
405 else {
406 @files = @ARGV;
407 $dir = '';
408 }
409
410 # we should have some files by now
411 pod2usage(2) unless @files;
412
413
414 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
415 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
416 warn("std. output chosen: cannot split\n") if($split);
417 warn("std. output chosen: cannot zip\n") if($zip);
418 warn("std. output chosen: cannot nolump\n") if($nolump);
419 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
420
421 } elsif ( $outdir && !-e $outdir ) {
422 mkdir($outdir) or die "could not create directory $outdir: $!\n";
423 }
424 elsif ( !$outdir ) {
425 $outdir = $dir || '.';
426 }
427
428 for my $file ( @files ) {
429 # dgg ; allow 'stdin' / '-' input ?
430 chomp $file;
431 die "$! $file" unless($stdin || -e $file);
432 print "# Input: $file\n" if($verbose);
433
434 my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
435 if ($stdout) {
436 $lump_fh= *STDOUT; $lump="stdout$$";
437 $outfa= "stdout$$.fa"; # this is a temp file ... see below
438 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
439
440 } elsif ( $lump ) {
441 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
442 $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
443 ($outfa = $lump) =~ s/\.gff/\.fa/;
444 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
445 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
446
447 }
448
449 # open input file, unzip if req'd
450 if ($stdin) {
451 *FH = *STDIN;
452 } elsif ( $file =~ /\.gz/ ) {
453 open FH, "gunzip -c $file |";
454 }
455 else {
456 open FH, '<', $file;
457 }
458
459 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
460 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
461
462 while ( my $seq = $in->next_seq() ) {
463 my $seq_name = $seq->accession_number;
464 my $end = $seq->length;
465 my @to_print;
466
467 # arrange disposition of GFF output
468 $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
469 my $out;
470
471 if ( $lump ) {
472 $outfile = $lump;
473 $out = $lump_fh;
474 }
475 else {
476 $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
477 open $out, ">$outfile";
478 }
479
480 # filter out unwanted features
481 my $source_feat= undef;
482 my @source= filter($seq); $source_feat= $source[0];
483
484 ($source_type,$source_feat)=
485 getSourceInfo( $seq, $source_type, $source_feat ) ;
486 # always; here we build main prot $source_feat; # if @source;
487
488 # abort if there are no features
489 warn "$seq_name has no features, skipping\n" and next
490 if !$seq->all_SeqFeatures;
491
492
493 $FTSOmap->{'source'} = $source_type;
494 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
495
496 # construct a GFF header
497 # add: get source_type from attributes of source feature? chromosome=X tag
498 # also combine 1st ft line here with source ft from $seq ..
499 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
500 print $out $header;
501 print "# working on $info\n" if($verbose);
502
503 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
504 unflatten_seq($seq);
505
506 # Note that we use our own get_all_SeqFeatures function
507 # to rescue cloned exons
508
509 @GFF_LINE_FEAT = ();
510 for my $feature ( get_all_SeqFeatures($seq) ) {
511
512 my $method = $feature->primary_tag;
513 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
514
515 $feature->seq_id($seq->id) unless($feature->seq_id);
516 $feature->source_tag($SOURCEID);
517
518 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
519 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
520 maptags2gff($feature);
521
522 # current gene name. The unflattened gene features should be in order so any
523 # exons, CDSs, etc that follow will belong to this gene
524 my $gene_name;
525 if ( $method eq 'gene' || $method eq 'pseudogene' ) {
526 @to_print= print_held($out, $gffio, \@to_print);
527 $gene_id = $gene_name= gene_name($feature);
528 } else {
529 $gene_name= gene_name($feature);
530 }
531
532 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
533 # be converted to or added as Name=xxx (if not ID= or as well)
534 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
535 convert_to_name($feature);
536
537 ## dgg: extended to protein|polypeptide
538 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
539 ## in yeast have that genbank tag; why?
540 ## these include pseudogene ...
541
542 ## Note we also have mapped types to SO, so these RNA's are now transcripts:
543 # pseudomRNA => "pseudogenic_transcript",
544 # pseudotranscript" => "pseudogenic_transcript",
545 # misc_RNA=>'processed_transcript',
546
547 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
548
549 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
550 || ( $gene_id && $gene_name eq $gene_id ) ) {
551
552 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result
553 if ($action == GM_DUP_PART) {
554 # ignore, this is dupl. exon with new parent ...
555
556 } elsif ($action == GM_NOT_PART) {
557 add_generic_id( $feature, $gene_name, "nocount");
558 my $gff = $gffio->gff_string($feature);
559 push @GFF_LINE_FEAT, $feature;
560 #print $out "$gff\n" if $gff;
561
562 } elsif ($action > 0) {
563 # hold off print because exon etc. may get 2nd, 3rd parents
564 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
565 push(@to_print, $feature);
566 }
567
568 }
569
570 # otherwise handle as generic feats with IDHandler labels
571 else {
572 add_generic_id( $feature, $gene_name, "");
573 my $gff= $gffio->gff_string($feature);
574 push @GFF_LINE_FEAT, $feature;
575 #print $out "$gff\n" if $gff;
576 }
577 }
578
579 # don't like doing this after others; do after each new gene id?
580 @to_print= print_held($out, $gffio, \@to_print);
581
582 gff_validate(@GFF_LINE_FEAT);
583
584 for my $feature (@GFF_LINE_FEAT) {
585 my $gff= $gffio->gff_string($feature);
586 print $out "$gff\n" if $gff;
587 }
588
589 # deal with the corresponding DNA
590 my ($fa_out,$fa_outfile);
591 my $dna = $seq->seq;
592 if($dna || %proteinfa) {
593 $method{'RESIDUES'} += length($dna);
594 $dna =~ s/(\S{60})/$1\n/g;
595 $dna .= "\n";
596
597 if ($split) {
598 $fa_outfile = $outfile;
599 $fa_outfile =~ s/gff$/fa/;
600 open $fa_out, ">$fa_outfile" or die $!;
601 print $fa_out ">$seq_name\n$dna" if $dna;
602 foreach my $aid (sort keys %proteinfa) {
603 my $aa= delete $proteinfa{$aid};
604 $method{'RESIDUES(tr)'} += length($aa);
605 $aa =~ s/(\S{60})/$1\n/g;
606 print $fa_out ">$aid\n$aa\n";
607 }
608
609 }
610 else {
611 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
612 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
613 ## maybe write this to temp .fa then cat to end of lumped gff $out
614 print $lumpfa_fh ">$seq_name\n$dna" if $dna;
615 foreach my $aid (sort keys %proteinfa) {
616 my $aa= delete $proteinfa{$aid};
617 $method{'RESIDUES(tr)'} += length($aa);
618 $aa =~ s/(\S{60})/$1\n/g;
619 print $lumpfa_fh ">$aid\n$aa\n";
620 }
621 }
622
623 %proteinfa=();
624 }
625
626 if ( $zip && !$lump ) {
627 system "gzip -f $outfile";
628 system "gzip -f $fa_outfile" if($fa_outfile);
629 $outfile .= '.gz';
630 $fa_outfile .= '.gz' if $split;
631 }
632
633 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
634 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
635 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
636
637 # dgg: moved to after all inputs; here it prints cumulative sum for each record
638 #if ( $summary ) {
639 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
640 #
641 # for ( keys %method ) {
642 # print "# $_ $method{$_}\n";
643 # }
644 # print "# \n";
645 # }
646
647 }
648
649 print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
650 if ( $summary ) {
651 print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
652 for ( keys %method ) {
653 print "# $_ $method{$_}\n";
654 }
655 print "# \n";
656 }
657
658 ## FIXME for piped output w/ split FA files ...
659 close($lumpfa_fh) if $lumpfa_fh;
660 if (!$split && $outfa && $lump_fh) {
661 print $lump_fh "##FASTA\n"; # GFF3 spec
662 open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
663 while( <$lumpfa_fh>) {
664 print $lump_fh $_;
665 } # is $lump_fh still open?
666 close($lumpfa_fh);
667 unlink($outfa);
668 }
669
670
671 if ( $zip && $lump ) {
672 system "gzip -f $lump";
673 }
674
675 close FH;
676 }
677
678
679
680
681
682 sub typeorder {
683 return 1 if ($_[0] =~ /gene/);
684 return 2 if ($_[0] =~ /RNA|transcript/);
685 return 3 if ($_[0] =~ /protein|peptide/);
686 return 4 if ($_[0] =~ /exon|CDS/);
687 return 3; # default before exon (smallest part)
688 }
689
690 sub sort_by_feattype {
691 my($at,$bt)= ($a->primary_tag, $b->primary_tag);
692 return (typeorder($at) <=> typeorder($bt))
693 or ($at cmp $bt);
694 ## or ($a->name() cmp $b->name());
695 }
696
697 sub print_held {
698 my($out,$gffio,$to_print)= @_;
699 return unless(@$to_print);
700 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes
701 while ( my $feature = shift @$to_print) {
702 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
703 push @GFF_LINE_FEAT, $feature;
704 #print $out "$gff\n";
705 }
706 return (); # @to_print
707 }
708
709 sub maptags2gff {
710 my $f = shift;
711 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
712 # but see below /gene /locus_tag usage
713 foreach my $tag (keys %TAG_MAP) {
714 if ($f->has_tag($tag)) {
715 my $newtag= $TAG_MAP{$tag};
716 my @v= $f->get_tag_values($tag);
717 $f->remove_tag($tag);
718 $f->add_tag_value($newtag,@v);
719
720 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
721 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
722 if ($tag eq 'note') {
723 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
724 my @go= map { m/(GO:\d+)/g } @v;
725 $f->add_tag_value('Ontology_term',@go) if(@go);
726 }
727
728 }
729 }
730 }
731
732
733 sub getSourceInfo {
734 my ($seq, $source_type, $sf) = @_;
735
736 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
737 my $is_gene = ($SOURCEID =~/entrezgene/i);
738 my $is_rich = (ref($seq) =~ /RichSeq/);
739 my $seq_name= $seq->accession_number();
740
741 unless($sf) { # make one
742 $source_type= $is_swiss ? $PROTEIN_TYPE
743 : $is_gene ? "eneg" # "gene" # "region" #
744 : $is_rich ? $seq->molecule : $source_type;
745 $sf = Bio::SeqFeature::Generic->direct_new();
746
747 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
748 my $loc= $seq->can('location') ? $seq->location()
749 : new Bio::Location::Simple( -start => $start, -end => $len);
750 $sf->location( $loc );
751 $sf->primary_tag($source_type);
752 $sf->source_tag($SOURCEID);
753 $sf->seq_id( $seq_name);
754 #? $sf->display_name($seq->id()); ## Name or Alias ?
755 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
756 $seq->add_SeqFeature($sf);
757 ## $source_feat= $sf;
758 }
759
760 if ($sf->has_tag("chromosome")) {
761 $source_type= "chromosome";
762 my ($chrname) = $sf->get_tag_values("chromosome");
763 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
764 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
765 $sf->add_tag_value( Alias => $chrname );
766 }
767
768 # pull GB Comment, Description for source ft ...
769 # add reference - can be long, not plain string...
770 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
771 # GenBank fields: keyword,comment,reference,date_changed
772 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
773
774 # is this just for main $seq object or for all seqfeatures ?
775 my %AnnotTagMap= (
776 'gene_name' => 'Alias',
777 'ALIAS_SYMBOL' => 'Alias', # Entrezgene
778 'LOCUS_SYNONYM' => 'Alias', #?
779 'symbol' => 'Alias',
780 'synonym' => 'Alias',
781 'dblink' => 'Dbxref',
782 'product' => 'product',
783 'Reference' => 'reference',
784 'OntologyTerm' => 'Ontology_term',
785 'comment' => 'Note',
786 'comment1' => 'Note',
787 # various map-type locations
788 # gene accession tag is named per source db !??
789 # 'Index terms' => keywords ??
790 );
791
792
793 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
794 my ($date)= $seq->annotation->get_Annotations("dates")
795 || $seq->annotation->get_Annotations("update-date")
796 || $is_rich ? $seq->get_dates() : ();
797 my ($comment)= $seq->annotation->get_Annotations("comment");
798 my ($species)= $seq->annotation->get_Annotations("species");
799 if (!$species
800 && $seq->can('species')
801 && defined $seq->species()
802 && $seq->species()->can('binomial') ) {
803 $species= $seq->species()->binomial();
804 }
805
806 # update source feature with main GB fields
807 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
808 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
809 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
810 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
811 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
812
813 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
814
815 foreach my $atag (sort keys %AnnotTagMap) {
816 my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
817 my @anno = map{
818 if (ref $_ && $_->can('get_all_values')) {
819 split( /[,;] */, join ";", $_->get_all_values)
820 }
821 elsif (ref $_ && $_->can('display_text')) {
822 split( /[,;] */, $_->display_text)
823 }
824 elsif (ref $_ && $_->can('value')) {
825 split( /[,;] */, $_->value)
826 }
827 else {
828 ();
829 }
830 } $seq->annotation->get_Annotations($atag);
831 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
832 }
833
834 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
835 #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
836 #
837 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
838 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
839
840 return (wantarray)? ($source_type,$sf) : $source_type; #?
841 }
842
843
844 sub gene_features {
845 my ($f, $gene_id, $genelinkID) = @_;
846 local $_ = $f->primary_tag;
847 $method{$_}++;
848
849 if ( /gene/ ) {
850 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
851 $tnum = $rnum= 0; $ncrna_id= $rna_id = '';
852 return GM_NEW_TOPLEVEL;
853
854 } elsif ( /mRNA/ ) {
855 return GM_NOT_PART unless $gene_id;
856 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
857 ($rna_id = $gene_id ) =~ s/gene/mRNA/;
858 $rna_id .= '.t0' . ++$tnum;
859 $f->add_tag_value( ID => $rna_id );
860 $f->add_tag_value( Parent => $gene_id );
861
862 } elsif ( /RNA|transcript/) {
863 ## misc_RNA here; missing exons ... flattener problem?
864 # all of {t,nc,sn}RNA can have gene models now
865 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
866 ## CDS needs to use mRNA, not misc_RNA, rna_id ...
867 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
868
869 if($gene_id) {
870 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
871 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
872 $ncrna_id .= '.r0' . ++$rnum;
873 $f->add_tag_value( Parent => $gene_id );
874 $f->add_tag_value( ID => $ncrna_id );
875 } else {
876 unless ($f->has_tag('ID')) {
877 if($genelinkID) {
878 $f->add_tag_value( ID => $genelinkID ) ;
879 } else {
880 $idh->generate_unique_persistent_id($f);
881 }
882 }
883 ($ncrna_id)= $f->get_tag_values('ID');
884 return GM_NEW_TOPLEVEL;
885 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
886 }
887
888 } elsif ( /exon/ ) { # can belong to any kind of RNA
889 return GM_NOT_PART unless ($rna_id||$ncrna_id);
890 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
891 ## we are getting duplicate Parents here, which chokes chado loader, with reason...
892 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
893 ## check all Parents
894 for my $expar ($rna_id, $ncrna_id) {
895 next unless($expar);
896 if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
897 my @vals = $f->get_tag_values('Parent');
898 next if (grep {$expar eq $_} @vals);
899 }
900 $exonpar{$expar}++;
901 $f->add_tag_value( Parent => $expar);
902 # last; #? could be both
903 }
904 # now we can skip cloned exons
905 # dgg note: multiple parents get added and printed for each unique exon
906 return GM_DUP_PART if ++$seen{$f} > 1;
907
908 } elsif ( /CDS|protein|polypeptide/ ) {
909 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
910 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
911 (my $pro_id = $rna_id) =~ s/\.t/\.p/;
912
913 if( ! $CDSkeep && /CDS/) {
914 $f->primary_tag($PROTEIN_TYPE);
915
916 ## duplicate problem is Location ..
917 if ($f->location->isa("Bio::Location::SplitLocationI")) {
918 # my($b,$e)=($f->start, $f->end); # is this all we need?
919 my($b,$e)=(-1,0);
920 foreach my $l ($f->location->each_Location) {
921 $b = $l->start if($b<0 || $b > $l->start);
922 $e = $l->end if($e < $l->end);
923 }
924 $f->location( Bio::Location::Simple->new(
925 -start => $b, -end => $e, -strand => $f->strand) );
926 }
927
928 $f->add_tag_value( Derives_from => $rna_id );
929 }
930 else {
931 $f->add_tag_value( Parent => $rna_id );
932 }
933
934 $f->add_tag_value( ID => $pro_id );
935
936 move_translation_fasta($f, $pro_id);
937 #if( $f->has_tag('translation')) {
938 # my ($aa) = $f->get_tag_values("translation");
939 # $proteinfa{$pro_id}= $aa;
940 # $f->remove_tag("translation");
941 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
942 #}
943 } elsif ( /region/ ) {
944 $f->primary_tag('gene_component_region');
945 $f->add_tag_value( Parent => $gene_id );
946 } else {
947 return GM_NOT_PART unless $gene_id;
948 $f->add_tag_value( Parent => $gene_id );
949 }
950
951 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
952
953 return GM_NEW_PART;
954 }
955
956 ## was generic_features > add_generic_id
957 sub add_generic_id {
958 my ($f, $ft_name, $flags) = @_;
959 my $method = $f->primary_tag;
960 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
961
962 if ($f->has_tag('ID')) {
963
964 }
965 elsif ( $f->has_tag($method) ) {
966 my ($name) = $f->get_tag_values($method);
967 $f->add_tag_value( ID => "$method:$name" );
968 }
969 elsif($ft_name) { # is this unique ?
970 $f->add_tag_value( ID => $ft_name );
971 }
972 else {
973 $idh->generate_unique_persistent_id($f);
974 }
975
976 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
977 if($method =~ /CDS/);
978
979 # return $io->gff_string($f);
980 }
981
982 sub move_translation_fasta {
983 my ($f, $ft_id) = @_;
984 if( $ft_id && $f->has_tag('translation') ) {
985 my ($aa) = $f->get_tag_values("translation");
986 if($aa && $aa !~ /^length/) {
987 $proteinfa{$ft_id}= $aa;
988 $f->remove_tag("translation");
989 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
990 }
991 }
992 }
993
994 sub gff_header {
995 my ($name, $end, $source_type, $source_feat) = @_;
996 $source_type ||= "region";
997
998 my $info = "$source_type:$name";
999 my $head = "##gff-version $GFF_VERSION\n".
1000 "##sequence-region $name 1 $end\n".
1001 "# conversion-by bp_genbank2gff3.pl\n";
1002 if ($source_feat) {
1003 ## dgg: these header comment fields are not useful when have multi-records, diff organisms
1004 for my $key (qw(organism Note date)) {
1005 my $value;
1006 if ($source_feat->has_tag($key)) {
1007 ($value) = $source_feat->get_tag_values($key);
1008 }
1009 if ($value) {
1010 $head .= "# $key $value\n";
1011 $info .= ", $value";
1012 }
1013 }
1014 $head = "" if $didheader;
1015 } else {
1016 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
1017 }
1018 $didheader++;
1019 return (wantarray) ? ($head,$info) : $head;
1020 }
1021
1022 sub unflatten_seq {
1023 my $seq = shift;
1024
1025 ## print "# working on $source_type:", $seq->accession, "\n";
1026 my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number .
1027 ": consult STDERR\n";
1028
1029 eval {
1030 $unflattener->unflatten_seq( -seq => $seq,
1031 -noinfer => $noinfer,
1032 -use_magic => 1 );
1033 };
1034
1035 # deal with unflattening errors
1036 if ( $@ ) {
1037 warn $seq->accession_number . " Unflattening error:\n";
1038 warn "Details: $@\n";
1039 print "# ".$uh_oh;
1040 }
1041
1042 return 0 if !$seq || !$seq->all_SeqFeatures;
1043
1044 # map feature types to the sequence ontology
1045 ## $tm->map_types_to_SO( -seq => $seq );
1046 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
1047
1048 map_types(
1049 $tm,
1050 -seq => $seq,
1051 -type_map => $FTSOmap,
1052 -syn_map => $FTSOsynonyms,
1053 -undefined => "region"
1054 ); #nml
1055
1056 }
1057
1058 sub filter {
1059 my $seq = shift;
1060 ## return unless $filter;
1061 my @feats;
1062 my @sources; # dgg; pick source features here; only 1 always?
1063 if ($filter) {
1064 for my $f ( $seq->remove_SeqFeatures ) {
1065 my $m = $f->primary_tag;
1066 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1067 push @feats, $f unless $filter =~ /$m/i;
1068 }
1069 $seq->add_SeqFeature($_) foreach @feats;
1070 } else {
1071 for my $f ( $seq->get_SeqFeatures ){
1072 my $m = $f->primary_tag;
1073 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1074 }
1075 }
1076
1077 return @sources;
1078 }
1079
1080
1081 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
1082 # changed to filter out cloned features. We have to implement the old
1083 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
1084 sub get_all_SeqFeatures {
1085 my $seq = shift;
1086 my @flatarr;
1087
1088 foreach my $feat ( $seq->get_SeqFeatures ){
1089 push(@flatarr,$feat);
1090 _add_flattened_SeqFeatures(\@flatarr,$feat);
1091 }
1092 return @flatarr;
1093 }
1094
1095 sub gene_name {
1096 my $g = shift;
1097 my $gene_id = ''; # zero it;
1098
1099 if ($g->has_tag('locus_tag')) {
1100 ($gene_id) = $g->get_tag_values('locus_tag');
1101 }
1102
1103 elsif ($g->has_tag('gene')) {
1104 ($gene_id) = $g->get_tag_values('gene');
1105 }
1106 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
1107 ($gene_id) = $g->get_tag_values('ID');
1108 }
1109
1110 ## See Unflattener comment:
1111 # on rare occasions, records will have no /gene or /locus_tag
1112 # but it WILL have /product tags. These serve the same purpose
1113 # for grouping. For an example, see AY763288 (also in t/data)
1114 # eg. product=tRNA-Asp ; product=similar to crooked neck protein
1115 elsif ($g->has_tag('product')) {
1116 my ($name)= $g->get_tag_values('product');
1117 ($gene_id) = $name unless($name =~ / /); # a description not name
1118 }
1119
1120 ## dgg; also handle transposon=xxxx ID/name
1121 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
1122 elsif ($g->has_tag('transposon')) {
1123 my ($name)= $g->get_tag_values('transposon');
1124 ($gene_id) = $name unless($name =~ / /); # a description not name
1125 }
1126
1127 return $gene_id;
1128 }
1129
1130 # same list as gene_name .. change tag to generic Name
1131 sub convert_to_name {
1132 my $g = shift;
1133 my $gene_id = ''; # zero it;
1134
1135 if ($g->has_tag('gene')) {
1136 ($gene_id) = $g->get_tag_values('gene');
1137 $g->remove_tag('gene');
1138 $g->add_tag_value('Name', $gene_id);
1139 }
1140 elsif ($g->has_tag('locus_tag')) {
1141 ($gene_id) = $g->get_tag_values('locus_tag');
1142 $g->remove_tag('locus_tag');
1143 $g->add_tag_value('Name', $gene_id);
1144 }
1145
1146 elsif ($g->has_tag('product')) {
1147 my ($name)= $g->get_tag_values('product');
1148 ($gene_id) = $name unless($name =~ / /); # a description not name
1149 ## $g->remove_tag('product');
1150 $g->add_tag_value('Name', $gene_id);
1151 }
1152
1153 elsif ($g->has_tag('transposon')) {
1154 my ($name)= $g->get_tag_values('transposon');
1155 ($gene_id) = $name unless($name =~ / /); # a description not name
1156 ## $g->remove_tag('transposon');
1157 $g->add_tag_value('Name', $gene_id);
1158 }
1159 elsif ($g->has_tag('ID')) {
1160 my ($name)= $g->get_tag_values('ID');
1161 $g->add_tag_value('Name', $name);
1162 }
1163 return $gene_id;
1164 }
1165
1166
1167 sub _add_flattened_SeqFeatures {
1168 my ($arrayref,$feat) = @_;
1169 my @subs = ();
1170
1171 if ($feat->isa("Bio::FeatureHolderI")) {
1172 @subs = $feat->get_SeqFeatures;
1173 }
1174 elsif ($feat->isa("Bio::SeqFeatureI")) {
1175 @subs = $feat->sub_SeqFeature;
1176 }
1177 else {
1178 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
1179 "Don't know how to flatten.";
1180 }
1181
1182 for my $sub (@subs) {
1183 push(@$arrayref,$sub);
1184 _add_flattened_SeqFeatures($arrayref,$sub);
1185 }
1186
1187 }
1188
1189 sub map_types {
1190
1191 my ($self, @args) = @_;
1192
1193 my($sf, $seq, $type_map, $syn_map, $undefmap) =
1194 $self->_rearrange([qw(FEATURE
1195 SEQ
1196 TYPE_MAP
1197 SYN_MAP
1198 UNDEFINED
1199 )],
1200 @args);
1201
1202 if (!$sf && !$seq) {
1203 $self->throw("you need to pass in either -feature or -seq");
1204 }
1205
1206 my @sfs = ($sf);
1207 if ($seq) {
1208 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1209 @sfs = $seq->get_all_SeqFeatures;
1210 }
1211 $type_map = $type_map || $self->typemap; # dgg: was type_map;
1212 foreach my $feat (@sfs) {
1213
1214 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1215 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1216
1217 my $primary_tag = $feat->primary_tag;
1218
1219 #if ($primary_tag =~ /^pseudo(.*)$/) {
1220 # $primary_tag = $1;
1221 # $feat->primary_tag($primary_tag);
1222 #}
1223
1224 my $mtype = $type_map->{$primary_tag};
1225 if ($mtype) {
1226 if (ref($mtype)) {
1227 if (ref($mtype) eq 'ARRAY') {
1228 my $soID;
1229 ($mtype, $soID) = @$mtype;
1230
1231 if ($soID && ref($ONTOLOGY)) {
1232 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1233 $mtype = $term->name if $term;
1234 }
1235 # if SO ID is undefined AND we have an ontology to search, we want to delete
1236 # the feature type hash entry in order to force a fuzzy search
1237 elsif (! defined $soID && ref($ONTOLOGY)) {
1238 undef $mtype;
1239 delete $type_map->{$primary_tag};
1240 }
1241 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1242 $mtype= $undefmap;
1243 }
1244
1245 $type_map->{$primary_tag} = $mtype if $mtype;
1246 }
1247 elsif (ref($mtype) eq 'CODE') {
1248 $mtype = $mtype->($feat);
1249 }
1250 else {
1251 $self->throw('must be scalar or CODE ref');
1252 }
1253 }
1254 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1255 $mtype= $undefmap;
1256 }
1257 $feat->primary_tag($mtype);
1258 }
1259
1260 if ($CONF) {
1261 conf_read();
1262 my %perfect_matches;
1263 while (my ($p_tag,$rules) = each %$YAML) {
1264 RULE:
1265 for my $rule (@$rules) {
1266 for my $tags (@$rule) {
1267 while (my ($tag,$values) = each %$tags) {
1268 for my $value (@$values) {
1269 if ($feat->has_tag($tag)) {
1270 for ($feat->get_tag_values($tag)) {
1271 next RULE unless $_ =~ /\Q$value\E/;
1272 }
1273 } elsif ($tag eq 'primary_tag') {
1274 next RULE unless $value eq
1275 $feat->primary_tag;
1276 } elsif ($tag eq 'location') {
1277 next RULE unless $value eq
1278 $feat->start.'..'.$feat->end;
1279 } else { next RULE }
1280 }
1281 }
1282 }
1283 $perfect_matches{$p_tag}++;
1284 }
1285 }
1286 if (scalar(keys %perfect_matches) == 1) {
1287 $mtype = $_ for keys %perfect_matches;
1288 } elsif (scalar(keys %perfect_matches) > 1) {
1289 warn "There are conflicting rules in the config file for the" .
1290 " following types: ";
1291 warn "\t$_\n" for keys %perfect_matches;
1292 warn "Until conflict resolution is built into the converter," .
1293 " you will have to manually edit the config file to remove the" .
1294 " conflict. Sorry :(. Skipping user preference for this entry";
1295 sleep(2);
1296 }
1297 }
1298
1299 if ( ! $mtype && $syn_map) {
1300 if ($feat->has_tag('note')) {
1301
1302 my @all_matches;
1303
1304 my @note = $feat->each_tag_value('note');
1305
1306 for my $k (keys %$syn_map) {
1307
1308 if ($k =~ /"(.+)"/) {
1309
1310 my $syn = $1;
1311
1312 for my $note (@note) {
1313
1314 # look through the notes to see if the description
1315 # is an exact match for synonyms
1316 if ( $syn eq $note ) {
1317
1318 my @map = @{$syn_map->{$k}};
1319
1320
1321 my $best_guess = $map[0];
1322
1323 unshift @{$all_matches[-1]}, [$best_guess];
1324
1325 $mtype = $MANUAL
1326 ? manual_curation($feat, $best_guess, \@all_matches)
1327 : $best_guess;
1328
1329 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1330 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1331 . '#' x 78 . "\n\n";
1332
1333 } else {
1334 # check both primary tag and and note against
1335 # SO synonyms for best matching description
1336
1337 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1338 }
1339
1340 }
1341 }
1342 }
1343
1344 #unless ($mtype) {
1345 for my $note (@note) {
1346 for my $name (values %$type_map) {
1347 # check primary tag against SO names for best matching
1348 # descriptions //NML also need to check against
1349 # definition && camel case split terms
1350
1351 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1352 }
1353 }
1354 #}
1355
1356 if (scalar(@all_matches) && !$mtype) {
1357
1358 my $top_matches = first { defined $_ } @{$all_matches[-1]};
1359
1360 my $best_guess = $top_matches->[0];
1361
1362
1363
1364 # if guess has quotes, it is a synonym term. we need to
1365 # look up the corresponding name term
1366 # otherwise, guess is a name, so we can use it directly
1367 if ($best_guess =~ /"(.+)"/) {
1368
1369 $best_guess = $syn_map->{$best_guess}->[0];
1370
1371 }
1372
1373 @RETURN = @all_matches;
1374 $mtype = $MANUAL
1375 ? manual_curation($feat, $best_guess, \@all_matches)
1376 : $best_guess;
1377
1378 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1379 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1380 . '#' x 78 . "\n\n";
1381
1382 }
1383 }
1384 $mtype ||= $undefmap;
1385 $feat->primary_tag($mtype);
1386 }
1387 }
1388
1389
1390 }
1391
1392 sub SO_fuzzy_match {
1393
1394 my $candidate = shift;
1395 my $primary_tag = shift;
1396 my $note = shift;
1397 my $SO_terms = shift;
1398 my $best_matches_ref = shift;
1399 my $modifier = shift;
1400
1401 $modifier ||= '';
1402
1403 my @feat_terms;
1404
1405 for ( split(" |_", $primary_tag) ) {
1406 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1407 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1408 push @feat_terms, @camelCase;
1409 }
1410
1411 for ( split(" |_", $note) ) {
1412 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1413 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1414 (my $word = $_) =~ s/[;:.,]//g;
1415 push @feat_terms, $word;
1416 }
1417
1418
1419 my @SO_terms = split(" |_", $SO_terms);
1420
1421 # fuzzy match works on a simple point system. When 2 words match,
1422 # the $plus counter adds one. When they don't, the $minus counter adds
1423 # one. This is used to sort similar matches together. Better matches
1424 # are found at the end of the array, near the top.
1425
1426 # NML: can we improve best match by using synonym tags
1427 # EXACT,RELATED,NARROW,BROAD?
1428
1429 my ($plus, $minus) = (0, 0);
1430 my %feat_terms;
1431 my %SO_terms;
1432
1433 #unique terms
1434 map {$feat_terms{$_} = 1} @feat_terms;
1435 map {$SO_terms{$_} = 1} @SO_terms;
1436
1437 for my $st (keys %SO_terms) {
1438 for my $ft (keys %feat_terms) {
1439
1440 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1441
1442 }
1443 }
1444
1445 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1446
1447 }
1448
1449 sub manual_curation {
1450
1451 my ($feat, $default_opt, $all_matches) = @_;
1452
1453 my @all_matches = @$all_matches;
1454
1455 # convert all SO synonyms into names and filter
1456 # all matches into unique term list because
1457 # synonyms can map to multiple duplicate names
1458
1459 my (@unique_SO_terms, %seen);
1460 for (reverse @all_matches) {
1461 for (@$_) {
1462 for (@$_) {
1463 #my @names;
1464 if ($_ =~ /"(.+)"/) {
1465 for (@{$SYN_MAP->{$_}}) {
1466 push @unique_SO_terms, $_ unless $seen{$_};
1467 $seen{$_}++;
1468 }
1469 } else {
1470 push @unique_SO_terms, $_ unless $seen{$_};
1471 $seen{$_}++;
1472 }
1473 }
1474 }
1475 }
1476
1477 my $s = scalar(@unique_SO_terms);
1478
1479 my $choice = 0;
1480
1481 my $more =
1482 "[a]uto : automatic input (selects best guess for remaining entries)\r" .
1483 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
1484 "[i]nput : add a specific term\r" .
1485 "[r]eset : reset to the beginning of matches\r" .
1486 "[s]kip : skip this entry (selects best guess for this entry)\r"
1487 ;
1488
1489 $more .=
1490 "[n]ext : view the next ".OPTION_CYCLE." terms\r" .
1491 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1492
1493 my $msg = #"\n\n" . '-' x 156 . "\n"
1494 "The converter found $s possible matches for the following GenBank entry: ";
1495
1496 my $directions =
1497 "Type a number to select the SO term that best matches"
1498 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1499
1500
1501 # lookup filtered list to pull out definitions
1502 my @options = map {
1503 my $term = $_;
1504 my %term;
1505 for (['name', 'name'], ['def', 'definition'], ['synonym',
1506 'each_synonym']) {
1507 my ($label, $method) = @$_;
1508 $term{$label} = \@{[$term->$method]};
1509 }
1510 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1511 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1512
1513
1514 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1515 $default_opt, @options);
1516
1517 if ($option eq 'skip') { return $default_opt
1518 } elsif ($option eq 'auto') {
1519 $MANUAL = 0;
1520 return $default_opt;
1521 } else { return $option }
1522
1523 }
1524
1525 sub options_cycle {
1526
1527 my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1528
1529 #NML: really should only call GenBank_entry once. Will need to change
1530 #method to return array & shift off header
1531 my $entry = GenBank_entry($feat, "\r");
1532
1533 my $total = scalar(@opt);
1534
1535 ($start,$stop) = (0, OPTION_CYCLE)
1536 if ( ($start < 0) && ($stop > 0) );
1537
1538 ($start,$stop) = (0, OPTION_CYCLE)
1539 if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1540
1541 ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1542 ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1543
1544 $stop = $total if $stop > $total;
1545
1546 my $dir_copy = $directions;
1547 my $msg_copy = $msg;
1548 my $format = "format STDOUT = \n" .
1549 '-' x 156 . "\n" .
1550 '^' . '<' x 77 . '| Available Commands:' . "\n" .
1551 '$msg_copy' . "\n" .
1552 '-' x 156 . "\n" .
1553 ' ' x 78 . "|\n" .
1554 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1555 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1556 (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1557 ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";
1558
1559 {
1560 # eval throws redefined warning that breaks formatting.
1561 # Turning off warnings just for the eval to fix this.
1562 no warnings 'redefine';
1563 eval $format;
1564 }
1565
1566 write;
1567
1568 print '-' x 156 . "\n" .
1569 'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1570 " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1571 "\n" . '-' x 156 . "\n";
1572
1573 for (my $i = $start; $i < $stop; $i+=2) {
1574
1575 my ($left, $right) = @opt[$i,$i+1];
1576
1577 my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1578
1579 #odd numbered lists can cause fatal undefined errors, so check
1580 #to make sure we have data
1581
1582 my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1583
1584
1585 my $format = "format STDOUT = \n";
1586
1587 $format .=
1588 ' ' x 78 . "|\n" .
1589
1590 '@>>: name: ^' . '<' x 64 . '~' . ' |' .
1591 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
1592 '$nL,' . ' ' x 7 . '$nmL,' .
1593 ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .
1594
1595 ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1596 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1597 ' ' x 11 . '$nmL,' .
1598 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1599 #' ' x 78 . '|' . "\n" .
1600
1601
1602 ' def: ^' . '<' x 65 . ' |' .
1603 (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
1604 ' ' x 11 . '$descL,' .
1605 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1606
1607
1608 (' ^' . '<' x 65 . ' |' .
1609 (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
1610 ' ' x 11 . '$descL,' .
1611 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1612
1613
1614 ' ^' . '<' x 61 . '...~ |' .
1615 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1616 ' ' x 11 . '$descL,' .
1617 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1618
1619 ".\n";
1620
1621 {
1622 # eval throws redefined warning that breaks formatting.
1623 # Turning off warnings just for the eval to fix this.
1624 no warnings 'redefine';
1625 eval $format;
1626 }
1627 write;
1628
1629 }
1630 print '-' x 156 . "\nenter a command:";
1631
1632 while (<STDIN>) {
1633
1634 (my $input = $_) =~ s/\s+$//;
1635
1636 if ($input =~ /^\d+$/) {
1637 if ( $input && defined $opt[$input-1] ) {
1638 return $opt[$input-1]->[1]
1639 } else {
1640 print "\nThat number is not an option. Please enter a valid number.\n:";
1641 }
1642 } elsif ($input =~ /^n/i | $input =~ /next/i ) {
1643 return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1644 $feat, $directions, $best_guess, @opt)
1645 } elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1646 return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1647 $feat, $directions, $best_guess, @opt)
1648 } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1649 } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1650 } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1651 return manual_curation($feat, $best_guess, \@RETURN );
1652 } elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1653
1654 my ($query, @query_results);
1655
1656 if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1657 } else {
1658
1659 #do a SO search
1660 print "Type your search query\n:";
1661 while (<STDIN>) { chomp($query = $_); last }
1662 }
1663
1664 for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1665 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1666 }
1667
1668 return manual_curation($feat, $best_guess, \@query_results);
1669
1670 } elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1671
1672 #NML fast input for later
1673 #my $query;
1674 #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1675
1676 #manual input
1677 print "Type the term you want to use\n:";
1678 while (<STDIN>) {
1679 chomp(my $input = $_);
1680
1681 if (! $TYPE_MAP->{$input}) {
1682
1683 print "\"$input\" doesn't appear to be a valid SO term. Are ".
1684 "you sure you want to use it? (y or n)\n:";
1685
1686 while (<STDIN>) {
1687
1688 chomp(my $choice = $_);
1689
1690 if ($choice eq 'y') {
1691 print
1692 "\nWould you like to save your preference for " .
1693 "future use (so you don't have to redo manual " .
1694 "curation for this feature everytime you run " .
1695 "the converter)? (y or n)\n";
1696
1697 #NML: all these while loops are a mess. Really should condense it.
1698 while (<STDIN>) {
1699
1700 chomp(my $choice = $_);
1701
1702 if ($choice eq 'y') {
1703 curation_save($feat, $input);
1704 return $input;
1705 } elsif ($choice eq 'n') {
1706 return $input
1707 } else {
1708 print "\nDidn't recognize that command. Please " .
1709 "type y or n.\n:"
1710 }
1711 }
1712
1713
1714 } elsif ($choice eq 'n') {
1715 return options_cycle($start, $stop, $msg, $feat,
1716 $directions, $best_guess, @opt)
1717 } else {
1718 print "\nDidn't recognize that command. Please " .
1719 "type y or n.\n:"
1720 }
1721 }
1722
1723 } else {
1724 print
1725 "\nWould you like to save your preference for " .
1726 "future use (so you don't have to redo manual " .
1727 "curation for this feature everytime you run " .
1728 "the converter)? (y or n)\n";
1729
1730 #NML: all these while loops are a mess. Really should condense it.
1731 while (<STDIN>) {
1732
1733 chomp(my $choice = $_);
1734
1735 if ($choice eq 'y') {
1736 curation_save($feat, $input);
1737 return $input;
1738 } elsif ($choice eq 'n') {
1739 return $input
1740 } else {
1741 print "\nDidn't recognize that command. Please " .
1742 "type y or n.\n:"
1743 }
1744 }
1745
1746 }
1747
1748 }
1749 } else {
1750 print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1751 }
1752 }
1753
1754 }
1755
1756 sub GenBank_entry {
1757 my ($f, $delimiter, $num) = @_;
1758
1759 $delimiter ||= "\n";
1760
1761
1762 my $entry =
1763
1764 ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1765 . ($num
1766 ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1767 : ' ' x (15 - length $f->primary_tag)
1768 )
1769 . $f->start.'..'.$f->end
1770
1771 . "$delimiter";
1772
1773 if ($num) {
1774 words_tag($f, \$entry);
1775 } else {
1776 for my $tag ($f->all_tags) {
1777 for my $val ( $f->each_tag_value($tag) ) {
1778 $entry .= ' ' x 20;
1779 #$entry .= "/$tag=\"$val\"$delimiter";
1780 $entry .= $val eq '_no_value'
1781 ? "/$tag$delimiter"
1782 : "/$tag=\"$val\"$delimiter";
1783 }
1784 }
1785
1786 }
1787
1788 return $entry;
1789 }
1790
1791
1792 sub gff_validate {
1793 warn "Validating GFF file\n" if $DEBUG;
1794 my @feat = @_;
1795
1796 my (%parent2child, %all_ids, %descendants, %reserved);
1797
1798 for my $f (@feat) {
1799 for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1800 map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1801 if $f->has_tag($$aTags[0]);
1802 }
1803 }
1804
1805 if ($SO_FILE) {
1806 while (my ($parentID, $aChildren) = each %parent2child) {
1807 parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1808 }
1809 }
1810
1811 id_validate(\%all_ids, \%reserved);
1812 }
1813
1814 sub parent_validate {
1815 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1816
1817 my $aParents = $hAll->{$parentID};
1818
1819 map {
1820 my $child = $_;
1821 $child->add_tag_value( validation_error =>
1822 "feature tried to add Parent tag, but no Parent found with ID $parentID"
1823 );
1824 my %parents;
1825 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1826 delete $parents{$parentID};
1827 my @parents = keys %parents;
1828
1829 $child->remove_tag('Parent');
1830
1831 unless ($child->has_tag('ID')) {
1832 my $id = gene_name($child);
1833 $child->add_tag_value('ID', $id);
1834 push @{$hAll->{$id}}, $child
1835 }
1836
1837 $child->add_tag_value('Parent', @parents) if @parents;
1838
1839 } @$aChildren and return unless scalar(@$aParents);
1840
1841 my $par = join(',', map { $_->primary_tag } @$aParents);
1842 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1843
1844 #NML manual curation needs to happen here
1845
1846
1847 my %parentsToRemove;
1848
1849 CHILD:
1850 for my $child (@$aChildren) {
1851 my $childType = $child->primary_tag;
1852
1853 warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1854 if $DEBUG;
1855
1856 for (my $i = 0; $i < scalar(@$aParents); $i++) {
1857 my $parent = $aParents->[$i];
1858 my $parentType = $parent->primary_tag;
1859
1860 warn "CHECKING $childType against $parentType" if $DEBUG;
1861
1862 #cache descendants so we don't have to do repeat searches
1863 unless ($hDescendants->{$parentType}) {
1864
1865 for my $term ($ONTOLOGY->find_terms(
1866 -name => $parentType
1867 ) ) {
1868
1869 map {
1870 $hDescendants->{$parentType}{$_->name}++
1871 } $ONTOLOGY->get_descendant_terms($term);
1872
1873 }
1874
1875 # NML: hopefully temporary fix.
1876 # SO doesn't consider exon/CDS to be a child of mRNA
1877 # even though common knowledge dictates that they are
1878 # This cheat fixes the false positives for now
1879 if ($parentType eq 'mRNA') {
1880 $hDescendants->{$parentType}{'exon'} = 1;
1881 $hDescendants->{$parentType}{'CDS'} = 1;
1882 }
1883
1884 }
1885
1886 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1887 " be a child of $parentType?" if $DEBUG;
1888 if ($hDescendants->{$parentType}{$childType}) {
1889 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1890
1891 #NML need to deal with multiple children matched to multiple different
1892 #parents. This model only assumes the first parent id that matches a child will
1893 #be the reserved feature.
1894
1895 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
1896 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1897
1898 #mark parent for later removal from all IDs
1899 #so we don't accidentally change any parents
1900
1901 $parentsToRemove{$i}++;
1902
1903 next CHILD;
1904 }
1905 }
1906
1907
1908
1909 #NML shouldn't have to check this; somehow child can lose Parent
1910 #it's happening W3110
1911 #need to track this down
1912 if ( $child->has_tag('Parent') ) {
1913
1914 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1915 if $DEBUG;
1916
1917 my %parents;
1918
1919 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1920
1921 delete $parents{$parentID};
1922 my @parents = keys %parents;
1923
1924 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1925 ' to ' . $child->end . " cannot be a child of ID $parentID"
1926 if $DEBUG;
1927
1928 $child->add_tag_value( validation_error =>
1929 "feature cannot be a child of $parentID");
1930
1931 $child->remove_tag('Parent');
1932
1933 unless ($child->has_tag('ID')) {
1934 my $id = gene_name($child);
1935 $child->add_tag_value('ID', $id);
1936 push @{$hAll->{$id}}, $child
1937 }
1938
1939 $child->add_tag_value('Parent', @parents) if @parents;
1940 }
1941
1942 }
1943
1944 #delete $aParents->[$_] for keys %parentsToRemove;
1945 splice(@$aParents, $_, 1) for keys %parentsToRemove;
1946 }
1947
1948 sub id_validate {
1949 my ($hAll, $hReserved) = @_;
1950
1951
1952 for my $id (keys %$hAll) {
1953
1954 #since 1 feature can have this id,
1955 #let's just shift it off and uniquify
1956 #the rest (unless it's reserved)
1957
1958 shift @{$hAll->{$id}} unless $hReserved->{$id};
1959 for my $feat (@{$hAll->{$id}}) {
1960 id_uniquify(0, $id, $feat, $hAll);
1961 }
1962 }
1963
1964 for my $parentID (keys %$hReserved) {
1965
1966 my @keys = keys %{$hReserved->{$parentID}};
1967
1968 shift @keys;
1969
1970 for my $k (@keys) {
1971 my $parent = $hReserved->{$parentID}{$k}{'parent'};
1972 my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1973
1974 my $value = id_uniquify(0, $parentID, $parent, $hAll);
1975 for my $child (@$aChildren) {
1976
1977 my %parents;
1978 map { $parents{$_}++ } $child->get_tag_values('Parent');
1979 $child->remove_tag('Parent');
1980 delete $parents{$parentID};
1981 $parents{$value}++;
1982 my @parents = keys %parents;
1983 $child->add_tag_value('Parent', @parents);
1984 }
1985
1986 }
1987 }
1988 }
1989
1990 sub id_uniquify {
1991 my ($count, $value, $feat, $hAll) = @_;
1992
1993 warn "UNIQUIFYING $value" if $DEBUG;
1994
1995 if (! $count) {
1996 $feat->add_tag_value(Alias => $value);
1997 $value .= ('.' . $feat->primary_tag)
1998 } elsif ($count == 1) {
1999 $value .= ".$count"
2000 } else {
2001 chop $value;
2002 $value .= $count
2003 }
2004 $count++;
2005
2006 warn "ENDED UP WITH $value" if $DEBUG;
2007 if ( $hAll->{$value} ) {
2008 warn "$value IS ALREADY TAKEN" if $DEBUG;
2009 id_uniquify($count, $value, $feat, $hAll);
2010 } else {
2011 #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2012 $feat->remove_tag('ID');
2013 $feat->add_tag_value('ID', $value);
2014 push @{$hAll->{$value}}, $value;
2015 }
2016
2017 $value;
2018 }
2019
2020 sub conf_read {
2021
2022 print "\nCannot read $CONF. Change file permissions and retry, " .
2023 "or enter another file\n" and conf_locate() unless -r $CONF;
2024
2025 print "\nCannot write $CONF. Change file permissions and retry, " .
2026 "or enter another file\n" and conf_locate() unless -w $CONF;
2027
2028 $YAML = LoadFile($CONF);
2029
2030 }
2031
2032 sub conf_create {
2033
2034 my ($path, $input) = @_;
2035
2036 print "Cannot write to $path. Change directory permissions and retry " .
2037 "or enter another save path\n" and conf_locate() unless -w $path;
2038
2039 $CONF = $input;
2040
2041 open(FH, '>', $CONF);
2042 close(FH);
2043 conf_read();
2044
2045
2046 }
2047
2048 sub conf_write { DumpFile($CONF, $YAML) }
2049
2050 sub conf_locate {
2051
2052 print "\nEnter the location of a previously saved config, or a new " .
2053 "path and file name to create a new config (this step is " .
2054 "necessary to save any preferences)";
2055
2056 print "\n\nenter a command:";
2057
2058 while (<STDIN>) {
2059 chomp(my $input = $_);
2060 my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2061
2062 if (-e $input && (! -d $input)) {
2063
2064 print "\nReading $input...\n";
2065 $CONF = $input;
2066
2067 conf_read();
2068 last;
2069
2070 } elsif (! -d $input && $fn.$suffix) {
2071
2072 print "Creating $input...\n";
2073 conf_create($path, $input);
2074 last;
2075
2076 } elsif (-e $input && -d $input) {
2077 print "You only entered a directory. " .
2078 "Please enter BOTH a directory and filename\n";
2079 } else {
2080 print "$input does not appear to be a valid path. Please enter a " .
2081 "valid directory and filename\n";
2082 }
2083 print "\nenter a command:";
2084 }
2085 }
2086
2087 sub curation_save {
2088
2089 my ($feat, $input) = @_;
2090
2091 #my $error = "Enter the location of a previously saved config, or a new " .
2092 # "path and file name to create a new config (this step is " .
2093 # "necessary to save any preferences)\n";
2094
2095 if (!$CONF) {
2096 print "\n\n";
2097 conf_locate();
2098 } elsif (! -e $CONF) {
2099 print "\n\nThe config file you have chosen doesn't exist.\n";
2100 conf_locate();
2101 } else { conf_read() }
2102
2103 my $entry = GenBank_entry($feat, "\r", 1);
2104
2105 my $msg = "Term entered: $input";
2106 my $directions = "Please select any/all tags that provide evidence for the term you
2107 have entered. You may enter multiple tags by separating them by commas/dashes
2108 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2109 the option of either selecting the entire note as evidence, or specific
2110 keywords. If a tag has multiple keywords, they will be tagged alphabetically
2111 for selection. To select a specific keyword in a tag field, you must enter the
2112 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2113 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2114 to be to match your curation. To match the GenBank entry exactly as it
2115 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2116 preference, you will no longer be prompted to choose a feature type for any
2117 matching entries until you edit the curation.ini file.";
2118 my $msg_copy = $msg;
2119 my $dir_copy = $directions;
2120
2121 my $format = "format STDOUT = \n" .
2122 '-' x 156 . "\n" .
2123 '^' . '<' x 77 . '| Directions:' . "\n" .
2124 '$msg_copy' . "\n" .
2125 '-' x 156 . "\n" .
2126 ' ' x 78 . "|\n" .
2127 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2128 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2129 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2130 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";
2131
2132 {
2133 # eval throws redefined warning that breaks formatting.
2134 # Turning off warnings just for the eval to fix this.
2135 no warnings 'redefine';
2136 eval $format;
2137 }
2138
2139 write;
2140 print '-' x 156 . "\nenter a command:";
2141
2142 my @tags = words_tag($feat);
2143
2144 my $final = {};
2145 my $choices;
2146 while (<STDIN>) {
2147
2148 chomp(my $choice = $_);
2149
2150 if (scalar(keys %$final) && $choice =~ /^y/i) { last
2151
2152 } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2153
2154 } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2155
2156 } elsif ($choice eq 'all') {
2157
2158 $choice = '';
2159 for (my $i=1; $i < scalar(@tags); $i++) {
2160 $choice .= "$i,";
2161 }
2162 chop $choice;
2163 }
2164 #print "CHOICE [$choice]";
2165
2166 my @selections;
2167 for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2168 if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2169
2170 for ($1..$2) { push @selections, $_ }
2171
2172 my $dangling_alphas = $3;
2173 alpha_expand($dangling_alphas, \@selections);
2174
2175 } else {
2176 alpha_expand($_, \@selections);
2177 }
2178 }
2179
2180 foreach my $numbers (@selections) {
2181
2182 my @c = split(/(?=[\w])/, $numbers);
2183 s/\W+//g foreach @c;
2184 my $num;
2185
2186 {
2187 $^W = 0;
2188 $num = 0 + shift @c;
2189 }
2190
2191 my $tag = $tags[$num];
2192 if (ref $tag && scalar(@c)) {
2193 my $no_value;
2194 foreach (@c) {
2195 if (defined $tag->{$_}) {
2196 $choices .= "${num}[$_] ";
2197 my ($t,$v) = @{$tag->{$_}};
2198 push @{${$final->{$input}}[0]{$t}}, $v;
2199
2200 } else { $no_value++ }
2201 }
2202
2203 if ($no_value) {
2204 _selection_add($tag,$final,$input,\$choices,$num);
2205 #my ($t,$v) = @{$tag->{'all'}};
2206 #unless (defined ${$final->{$input}}[0]{$t}) {
2207 #$choices .= "$num, ";
2208 #push @{${$final->{$input}}[0]{$t}}, $v
2209 #}
2210 }
2211
2212 $choices = substr($choices, 0, -2);
2213 $choices .= ', ';
2214
2215 } elsif (ref $tag) {
2216 _selection_add($tag,$final,$input,\$choices,$num);
2217 #my ($t,$v) = @{$tag->{'all'}};
2218 #unless (defined ${$final->{$input}}[0]{$t}) {
2219 #$choices .= "$num, ";
2220 #push @{${$final->{$input}}[0]{$t}}, $v
2221 #}
2222 }
2223 }
2224 $choices = substr($choices, 0, -2) if $choices;
2225 if ($final) {
2226 print "\nYou have chosen the following tags:\n$choices\n";
2227 print "This will be written to the config file as:\n";
2228 print Dump $final;
2229 print "\nIs this correct? (y or n)\n";
2230 } else { print "\nInvalid selection. Please try again\n" }
2231 }
2232 push @{$YAML->{$input}}, $final->{$input};
2233 conf_write();
2234 }
2235
2236 # words_tag() splits each tag value string into multiple words so that the
2237 # user can select the parts he/she wants to use for curation
2238 # it can tag 702 (a - zz) separate words; this should be enough
2239
2240 sub words_tag {
2241
2242 my ($feat, $entry) = @_;
2243
2244 my @tags;
2245
2246 @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2247 my $i = 3;
2248 foreach my $tag ($feat->all_tags) {
2249 foreach my $value ($feat->each_tag_value($tag)) {
2250
2251 my ($string, $tagged_string);
2252
2253 my @words = split(/(?=\w+?)/, $value);
2254
2255 my $pos = 0;
2256
2257
2258 foreach my $word (@words) {
2259
2260 (my $sanitized_word = $word) =~ s/\W+?//g;
2261 $string .= $word;
2262
2263 my $lead = int($pos/ALPHABET_DIVISOR);
2264 my $lag = $pos % ALPHABET_DIVISOR;
2265
2266 my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';
2267 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2268
2269 $tagged_string .= " ($a) $word";
2270
2271 $tags[$i]{$a} = [$tag, $sanitized_word];
2272 $pos++;
2273 }
2274
2275 $value = $tagged_string if scalar(@words) > 1;
2276
2277 $$entry .= "[$i] /$tag=\"$value\"\r";
2278
2279 $tags[$i]{'all'} = [$tag, $string];
2280 }
2281 $i++;
2282 }
2283
2284 return @tags;
2285
2286 }
2287
2288 sub alpha_expand {
2289
2290 my ($dangling_alphas, $selections) = @_;
2291
2292 if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2293
2294 my $digit = $1;
2295 push @$selections, $digit if $digit;
2296
2297 my $start = $2;
2298 my $stop = $3;
2299
2300 my @starts = split('', $start);
2301 my @stops = split('', $stop);
2302
2303 my ($final_start, $final_stop);
2304
2305 for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2306
2307 my ($final, $splits) = @$_;
2308
2309 my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2310 my $rem;
2311
2312
2313 if ($$splits[1]) {
2314 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2315 $int++
2316 } else {
2317 $rem = $int;
2318 $int = 0;
2319 }
2320
2321
2322 $$final = $int * ALPHABET_DIVISOR;
2323 $$final += $rem;
2324
2325 }
2326
2327 my $last_number = pop @$selections;
2328 for my $pos ($final_start..$final_stop) {
2329 my $lead = int($pos/ALPHABET_DIVISOR);
2330 my $lag = $pos % ALPHABET_DIVISOR;
2331 my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
2332 $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2333 push @$selections, $last_number.$alpha;
2334 }
2335
2336 } elsif (defined($dangling_alphas)) {
2337 if ($dangling_alphas =~ /^\d/) {
2338 push @$selections, $dangling_alphas;
2339 } elsif ($dangling_alphas =~ /^\D/) {
2340 #print "$dangling_alphas ".Dumper @$selections;
2341 my $last_number = pop @$selections;
2342 $last_number ||= '';
2343 push @$selections, $last_number.$dangling_alphas;
2344 #$$selections[-1] .= $dangling_alphas;
2345 }
2346 }
2347
2348 }
2349
2350 sub _selection_add {
2351
2352 my ($tag, $final, $input, $choices, $num) = @_;
2353 my ($t,$v) = @{$tag->{'all'}};
2354 unless (defined ${$final->{$input}}[0]{$t}) {
2355 $$choices .= "$num, ";
2356 push @{${$final->{$input}}[0]{$t}}, $v
2357 }
2358
2359 }