Mercurial > repos > dereeper > pangenome_explorer
comparison Perl/bp_genbank2gff3.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/opt/anaconda1anaconda2anaconda3/bin/perl | |
2 | |
3 eval 'exec /opt/anaconda1anaconda2anaconda3/bin/perl -S $0 ${1+"$@"}' | |
4 if 0; # not running under some shell | |
5 | |
6 | |
7 | |
8 =pod | |
9 | |
10 =head1 NAME | |
11 | |
12 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3 | |
13 | |
14 =head1 SYNOPSIS | |
15 | |
16 bp_genbank2gff3.pl [options] filename(s) | |
17 | |
18 # process a directory containing GenBank flatfiles | |
19 perl bp_genbank2gff3.pl --dir path_to_files --zip | |
20 | |
21 # process a single file, ignore explicit exons and introns | |
22 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz | |
23 | |
24 # process a list of files | |
25 perl bp_genbank2gff3.pl *gbk.gz | |
26 | |
27 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader | |
28 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \ | |
29 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \ | |
30 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata | |
31 | |
32 Options: | |
33 --noinfer -r don't infer exon/mRNA subfeatures | |
34 --conf -i path to the curation configuration file that contains user preferences | |
35 for Genbank entries (must be YAML format) | |
36 (if --manual is passed without --ini, user will be prompted to | |
37 create the file if any manual input is saved) | |
38 --sofile -l path to to the so.obo file to use for feature type mapping | |
39 (--sofile live will download the latest online revision) | |
40 --manual -m when trying to guess the proper SO term, if more than | |
41 one option matches the primary tag, the converter will | |
42 wait for user input to choose the correct one | |
43 (only works with --sofile) | |
44 --dir -d path to a list of genbank flatfiles | |
45 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe) | |
46 --zip -z compress GFF3 output files with gzip | |
47 --summary -s print a summary of the features in each contig | |
48 --filter -x genbank feature type(s) to ignore | |
49 --split -y split output to separate GFF and fasta files for | |
50 each genbank record | |
51 --nolump -n separate file for each reference sequence | |
52 (default is to lump all records together into one | |
53 output file for each input file) | |
54 --ethresh -e error threshold for unflattener | |
55 set this high (>2) to ignore all unflattener errors | |
56 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon | |
57 model. --CDS is default. Use --CDS to keep default GFF gene model, | |
58 use --noCDS to convert to g-r-p-e. | |
59 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work | |
60 (GenBank is default) | |
61 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available | |
62 --quiet don't talk about what is being processed | |
63 --typesource SO sequence type for source (e.g. chromosome; region; contig) | |
64 --help -h display this message | |
65 | |
66 | |
67 =head1 DESCRIPTION | |
68 | |
69 This script uses Bio::SeqFeature::Tools::Unflattener and | |
70 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene | |
71 containment hierarchies mapped for optimal display in gbrowse. | |
72 | |
73 The input files are assumed to be gzipped GenBank flatfiles for refseq | |
74 contigs. The files may contain multiple GenBank records. Either a | |
75 single file or an entire directory can be processed. By default, the | |
76 DNA sequence is embedded in the GFF but it can be saved into separate | |
77 fasta file with the --split(-y) option. | |
78 | |
79 If an input file contains multiple records, the default behaviour is | |
80 to dump all GFF and sequence to a file of the same name (with .gff | |
81 appended). Using the 'nolump' option will create a separate file for | |
82 each genbank record. Using the 'split' option will create separate | |
83 GFF and Fasta files for each genbank record. | |
84 | |
85 | |
86 =head2 Notes | |
87 | |
88 =head3 'split' and 'nolump' produce many files | |
89 | |
90 In cases where the input files contain many GenBank records (for | |
91 example, the chromosome files for the mouse genome build), a very | |
92 large number of output files will be produced if the 'split' or | |
93 'nolump' options are selected. If you do have lists of files E<gt> 6000, | |
94 use the --long_list option in bp_bulk_load_gff.pl or | |
95 bp_fast_load_gff.pl to load the gff and/ or fasta files. | |
96 | |
97 =head3 Designed for RefSeq | |
98 | |
99 This script is designed for RefSeq genomic sequence entries. It may | |
100 work for third party annotations but this has not been tested. | |
101 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl | |
102 if you don't mind some gene model unflattener errors (dgg). | |
103 | |
104 =head3 G-R-P-E Gene Model | |
105 | |
106 Don Gilbert worked this over with needs to produce GFF3 suited to | |
107 loading to GMOD Chado databases. Most of the changes I believe are | |
108 suited for general use. One main chado-specific addition is the | |
109 --[no]cds2protein flag | |
110 | |
111 My favorite GFF is to set the above as ON by default (disable with --nocds2prot) | |
112 For general use it probably should be OFF, enabled with --cds2prot. | |
113 | |
114 This writes GFF with an alternate, but useful Gene model, | |
115 instead of the consensus model for GFF3 | |
116 | |
117 [ gene > mRNA> (exon,CDS,UTR) ] | |
118 | |
119 This alternate is | |
120 | |
121 gene > mRNA > polypeptide > exon | |
122 | |
123 means the only feature with dna bases is the exon. The others | |
124 specify only location ranges on a genome. Exon of course is a child | |
125 of mRNA and protein/peptide. | |
126 | |
127 The protein/polypeptide feature is an important one, having all the | |
128 annotations of the GenBank CDS feature, protein ID, translation, GO | |
129 terms, Dbxrefs to other proteins. | |
130 | |
131 UTRs, introns, CDS-exons are all inferred from the primary exon bases | |
132 inside/outside appropriate higher feature ranges. Other special gene | |
133 model features remain the same. | |
134 | |
135 Several other improvements and bugfixes, minor but useful are included | |
136 | |
137 * IO pipes now work: | |
138 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ... | |
139 | |
140 * GenBank main record fields are added to source feature, e.g. organism, date, | |
141 and the sourcetype, commonly chromosome for genomes, is used. | |
142 | |
143 * Gene Model handling for ncRNA, pseudogenes are added. | |
144 | |
145 * GFF header is cleaner, more informative. | |
146 --GFF_VERSION flag allows choice of v2 as well as default v3 | |
147 | |
148 * GFF ##FASTA inclusion is improved, and | |
149 CDS translation sequence is moved to FASTA records. | |
150 | |
151 * FT -> GFF attribute mapping is improved. | |
152 | |
153 * --format choice of SeqIO input formats (GenBank default). | |
154 Uniprot/Swissprot and EMBL work and produce useful GFF. | |
155 | |
156 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions | |
157 and more flexible usage. | |
158 | |
159 =head1 TODO | |
160 | |
161 =head2 Are these additions desired? | |
162 | |
163 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY | |
164 * handle Entrezgene, other non-sequence SeqIO structures (really should change | |
165 those parsers to produce consistent annotation tags). | |
166 | |
167 =head2 Related bugfixes/tests | |
168 | |
169 These items from Bioperl mail were tested (sample data generating | |
170 errors), and found corrected: | |
171 | |
172 From: Ed Green <green <at> eva.mpg.de> | |
173 Subject: genbank2gff3.pl on new human RefSeq | |
174 Date: 2006-03-13 21:22:26 GMT | |
175 -- unspecified errors (sample data works now). | |
176 | |
177 From: Eric Just <e-just <at> northwestern.edu> | |
178 Subject: genbank2gff3.pl | |
179 Date: 2007-01-26 17:08:49 GMT | |
180 -- bug fixed in genbank2gff3 for multi-record handling | |
181 | |
182 This error is for a /trans_splice gene that is hard to handle, and | |
183 unflattner/genbank2 doesn't | |
184 | |
185 From: Chad Matsalla <chad <at> dieselwurks.com> | |
186 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order? | |
187 Date: 2005-07-15 19:51:48 GMT | |
188 | |
189 =head1 AUTHOR | |
190 | |
191 Sheldon McKay (mckays@cshl.edu) | |
192 | |
193 Copyright (c) 2004 Cold Spring Harbor Laboratory. | |
194 | |
195 =head2 AUTHOR of hacks for GFF2Chado loading | |
196 | |
197 Don Gilbert (gilbertd@indiana.edu) | |
198 | |
199 | |
200 =cut | |
201 | |
202 use strict; | |
203 use warnings; | |
204 | |
205 use lib "$ENV{HOME}/bioperl-live"; | |
206 # chad put this here to enable situations when this script is tested | |
207 # against bioperl compiled into blib along with other programs using blib | |
208 BEGIN { | |
209 unshift(@INC,'blib/lib'); | |
210 }; | |
211 use Pod::Usage; | |
212 use Bio::Root::RootI; | |
213 use Bio::SeqIO; | |
214 use File::Spec; | |
215 use Bio::SeqFeature::Tools::Unflattener; | |
216 use Bio::SeqFeature::Tools::TypeMapper; | |
217 use Bio::SeqFeature::Tools::IDHandler; | |
218 use Bio::Location::SplitLocationI; | |
219 use Bio::Location::Simple; | |
220 use Bio::Tools::GFF; | |
221 use Getopt::Long; | |
222 use List::Util qw(first); | |
223 use Bio::OntologyIO; | |
224 use YAML qw(Dump LoadFile DumpFile); | |
225 use File::Basename; | |
226 | |
227 use vars qw/$split @filter $zip $outdir $help $ethresh | |
228 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT | |
229 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE | |
230 $file @files $dir $summary $nolump | |
231 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION | |
232 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/; | |
233 | |
234 use constant SO_URL => | |
235 'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo'; | |
236 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)]; | |
237 use constant ALPHABET_TO_NUMBER => { | |
238 a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8, | |
239 j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16, | |
240 r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24, | |
241 z => 25, | |
242 }; | |
243 use constant ALPHABET_DIVISOR => 26; | |
244 use constant GM_NEW_TOPLEVEL => 2; | |
245 use constant GM_NEW_PART => 1; | |
246 use constant GM_DUP_PART => 0; | |
247 use constant GM_NOT_PART => -1; | |
248 | |
249 # Options cycle in multiples of 2 because of left side/right side pairing. | |
250 # You can make this number odd, but displayed matches will still round up | |
251 use constant OPTION_CYCLE => 6; | |
252 | |
253 $GFF_VERSION = 3; # allow v2 ... | |
254 $verbose = 1; # right default? -nov to turn off | |
255 | |
256 # dgg: change the gene model to Gene/mRNA/Polypeptide/exons... | |
257 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features() | |
258 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep; | |
259 # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support | |
260 | |
261 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID ** | |
262 my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work | |
263 # other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG | |
264 | |
265 | |
266 my %TAG_MAP = ( | |
267 db_xref => 'Dbxref', | |
268 name => 'Name', | |
269 note => 'Note', # also pull GO: ids into Ontology_term | |
270 synonym => 'Alias', | |
271 symbol => 'Alias', # is symbol still used? | |
272 # protein_id => 'Dbxref', also seen Dbxref tags: EC_number | |
273 # translation: handled in gene_features | |
274 ); | |
275 | |
276 | |
277 $| = 1; | |
278 my $quiet= !$verbose; | |
279 my $ok= GetOptions( 'd|dir|input:s' => \$dir, | |
280 'z|zip' => \$zip, | |
281 'h|help' => \$help, | |
282 's|summary' => \$summary, | |
283 'r|noinfer' => \$noinfer, | |
284 'i|conf=s' => \$CONF, | |
285 'sofile=s' => \$SO_FILE, | |
286 'm|manual' => \$MANUAL, | |
287 'o|outdir|output:s'=> \$outdir, | |
288 'x|filter:s'=> \@filter, | |
289 'y|split' => \$split, | |
290 "ethresh|e=s"=>\$ethresh, | |
291 'c|CDS!' => \$CDSkeep, | |
292 'f|format=s' => \$FORMAT, | |
293 'typesource=s' => \$source_type, | |
294 'GFF_VERSION=s' => \$GFF_VERSION, | |
295 'quiet!' => \$quiet, # swap quiet to verbose | |
296 'DEBUG!' => \$DEBUG, | |
297 'n|nolump' => \$nolump); | |
298 | |
299 my $lump = 1 unless $nolump || $split; | |
300 $verbose= !$quiet; | |
301 | |
302 # look for help request | |
303 pod2usage(2) if $help || !$ok; | |
304 | |
305 # keep SOURCEID as-is and change FORMAT for SeqIO types; | |
306 # note SeqIO uses file.suffix to guess type; not useful here | |
307 $SOURCEID= $FORMAT; | |
308 $FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/; | |
309 $verbose =1 if($DEBUG); | |
310 | |
311 # initialize handlers | |
312 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1); | |
313 $unflattener->error_threshold($ethresh) if $ethresh; | |
314 $unflattener->verbose(1) if($DEBUG); | |
315 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only? | |
316 # ensembl parsing is still problematic, forget this | |
317 | |
318 my $tm = Bio::SeqFeature::Tools::TypeMapper->new; | |
319 my $idh = Bio::SeqFeature::Tools::IDHandler->new; | |
320 | |
321 # dgg | |
322 $source_type ||= "region"; # should really parse from FT.source contents below | |
323 | |
324 #my $FTSOmap = $tm->FT_SO_map(); | |
325 my $FTSOmap; | |
326 my $FTSOsynonyms; | |
327 | |
328 if (defined($SO_FILE) && $SO_FILE eq 'live') { | |
329 print "\nDownloading the latest SO file from ".SO_URL."\n\n"; | |
330 use LWP::UserAgent; | |
331 my $ua = LWP::UserAgent->new(timeout => 30); | |
332 my $request = HTTP::Request->new(GET => SO_URL); | |
333 my $response = $ua->request($request); | |
334 | |
335 | |
336 if ($response->status_line =~ /200/) { | |
337 use File::Temp qw/ tempfile /; | |
338 my ($fh, $fn) = tempfile(); | |
339 print $fh $response->content; | |
340 $SO_FILE = $fn; | |
341 } else { | |
342 print "Couldn't download SO file online...skipping validation.\n" | |
343 . "HTTP Status was " . $response->status_line . "\n" | |
344 and undef $SO_FILE | |
345 } | |
346 } | |
347 | |
348 if ($SO_FILE) { | |
349 | |
350 | |
351 my (%terms, %syn); | |
352 | |
353 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE ); | |
354 $ONTOLOGY = $parser->next_ontology(); | |
355 | |
356 for ($ONTOLOGY->get_all_terms) { | |
357 my $feat = $_; | |
358 | |
359 $terms{$feat->name} = $feat->name; | |
360 #$terms{$feat->name} = $feat; | |
361 | |
362 my @syn = $_->each_synonym; | |
363 | |
364 push @{$syn{$_}}, $feat->name for @syn; | |
365 #push @{$syn{$_}}, $feat for @syn; | |
366 } | |
367 | |
368 $FTSOmap = \%terms; | |
369 $FTSOsynonyms = \%syn; | |
370 | |
371 my %hardTerms = %{ $tm->FT_SO_map() }; | |
372 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms; | |
373 | |
374 } else { | |
375 my %terms = %{ $tm->FT_SO_map() }; | |
376 while (my ($k,$v) = each %terms) { | |
377 $FTSOmap->{$k} = ref($v) ? shift @$v : $v; | |
378 } | |
379 } | |
380 | |
381 $TYPE_MAP = $FTSOmap; | |
382 $SYN_MAP = $FTSOsynonyms; | |
383 | |
384 | |
385 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region") | |
386 | |
387 # stringify filter list if applicable | |
388 my $filter = join ' ', @filter if @filter; | |
389 | |
390 # determine input files | |
391 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use | |
392 if ($dir && ($dir eq '-' || $dir eq 'stdin')) { | |
393 $stdin=1; $dir=''; @files=('stdin'); | |
394 | |
395 } elsif ( $dir ) { | |
396 if ( -d $dir ) { | |
397 opendir DIR, $dir or die "could not open $dir for reading: $!"; | |
398 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR; | |
399 closedir DIR; | |
400 } | |
401 else { | |
402 die "$dir is not a directory\n"; | |
403 } | |
404 } | |
405 else { | |
406 @files = @ARGV; | |
407 $dir = ''; | |
408 } | |
409 | |
410 # we should have some files by now | |
411 pod2usage(2) unless @files; | |
412 | |
413 | |
414 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use | |
415 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) { | |
416 warn("std. output chosen: cannot split\n") if($split); | |
417 warn("std. output chosen: cannot zip\n") if($zip); | |
418 warn("std. output chosen: cannot nolump\n") if($nolump); | |
419 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip | |
420 | |
421 } elsif ( $outdir && !-e $outdir ) { | |
422 mkdir($outdir) or die "could not create directory $outdir: $!\n"; | |
423 } | |
424 elsif ( !$outdir ) { | |
425 $outdir = $dir || '.'; | |
426 } | |
427 | |
428 for my $file ( @files ) { | |
429 # dgg ; allow 'stdin' / '-' input ? | |
430 chomp $file; | |
431 die "$! $file" unless($stdin || -e $file); | |
432 print "# Input: $file\n" if($verbose); | |
433 | |
434 my ($lump_fh, $lumpfa_fh, $outfile, $outfa); | |
435 if ($stdout) { | |
436 $lump_fh= *STDOUT; $lump="stdout$$"; | |
437 $outfa= "stdout$$.fa"; # this is a temp file ... see below | |
438 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n"; | |
439 | |
440 } elsif ( $lump ) { | |
441 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file); | |
442 $lump = File::Spec->catfile($outdir, $fileonly.'.gff'); | |
443 ($outfa = $lump) =~ s/\.gff/\.fa/; | |
444 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n"; | |
445 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n"; | |
446 | |
447 } | |
448 | |
449 # open input file, unzip if req'd | |
450 if ($stdin) { | |
451 *FH = *STDIN; | |
452 } elsif ( $file =~ /\.gz/ ) { | |
453 open FH, "gunzip -c $file |"; | |
454 } | |
455 else { | |
456 open FH, '<', $file; | |
457 } | |
458 | |
459 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG); | |
460 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION ); | |
461 | |
462 while ( my $seq = $in->next_seq() ) { | |
463 my $seq_name = $seq->accession_number; | |
464 my $end = $seq->length; | |
465 my @to_print; | |
466 | |
467 # arrange disposition of GFF output | |
468 $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff'); | |
469 my $out; | |
470 | |
471 if ( $lump ) { | |
472 $outfile = $lump; | |
473 $out = $lump_fh; | |
474 } | |
475 else { | |
476 $outfile = File::Spec->catfile($outdir, $seq_name.'.gff'); | |
477 open $out, ">$outfile"; | |
478 } | |
479 | |
480 # filter out unwanted features | |
481 my $source_feat= undef; | |
482 my @source= filter($seq); $source_feat= $source[0]; | |
483 | |
484 ($source_type,$source_feat)= | |
485 getSourceInfo( $seq, $source_type, $source_feat ) ; | |
486 # always; here we build main prot $source_feat; # if @source; | |
487 | |
488 # abort if there are no features | |
489 warn "$seq_name has no features, skipping\n" and next | |
490 if !$seq->all_SeqFeatures; | |
491 | |
492 | |
493 $FTSOmap->{'source'} = $source_type; | |
494 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features | |
495 | |
496 # construct a GFF header | |
497 # add: get source_type from attributes of source feature? chromosome=X tag | |
498 # also combine 1st ft line here with source ft from $seq .. | |
499 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat); | |
500 print $out $header; | |
501 print "# working on $info\n" if($verbose); | |
502 | |
503 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper .. | |
504 unflatten_seq($seq); | |
505 | |
506 # Note that we use our own get_all_SeqFeatures function | |
507 # to rescue cloned exons | |
508 | |
509 @GFF_LINE_FEAT = (); | |
510 for my $feature ( get_all_SeqFeatures($seq) ) { | |
511 | |
512 my $method = $feature->primary_tag; | |
513 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type); | |
514 | |
515 $feature->seq_id($seq->id) unless($feature->seq_id); | |
516 $feature->source_tag($SOURCEID); | |
517 | |
518 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref; | |
519 ## also, pull any GO:000 ids from /note tag and put into Ontology_term | |
520 maptags2gff($feature); | |
521 | |
522 # current gene name. The unflattened gene features should be in order so any | |
523 # exons, CDSs, etc that follow will belong to this gene | |
524 my $gene_name; | |
525 if ( $method eq 'gene' || $method eq 'pseudogene' ) { | |
526 @to_print= print_held($out, $gffio, \@to_print); | |
527 $gene_id = $gene_name= gene_name($feature); | |
528 } else { | |
529 $gene_name= gene_name($feature); | |
530 } | |
531 | |
532 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx | |
533 # be converted to or added as Name=xxx (if not ID= or as well) | |
534 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags | |
535 convert_to_name($feature); | |
536 | |
537 ## dgg: extended to protein|polypeptide | |
538 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes | |
539 ## in yeast have that genbank tag; why? | |
540 ## these include pseudogene ... | |
541 | |
542 ## Note we also have mapped types to SO, so these RNA's are now transcripts: | |
543 # pseudomRNA => "pseudogenic_transcript", | |
544 # pseudotranscript" => "pseudogenic_transcript", | |
545 # misc_RNA=>'processed_transcript', | |
546 | |
547 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG; | |
548 | |
549 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/ | |
550 || ( $gene_id && $gene_name eq $gene_id ) ) { | |
551 | |
552 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result | |
553 if ($action == GM_DUP_PART) { | |
554 # ignore, this is dupl. exon with new parent ... | |
555 | |
556 } elsif ($action == GM_NOT_PART) { | |
557 add_generic_id( $feature, $gene_name, "nocount"); | |
558 my $gff = $gffio->gff_string($feature); | |
559 push @GFF_LINE_FEAT, $feature; | |
560 #print $out "$gff\n" if $gff; | |
561 | |
562 } elsif ($action > 0) { | |
563 # hold off print because exon etc. may get 2nd, 3rd parents | |
564 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL); | |
565 push(@to_print, $feature); | |
566 } | |
567 | |
568 } | |
569 | |
570 # otherwise handle as generic feats with IDHandler labels | |
571 else { | |
572 add_generic_id( $feature, $gene_name, ""); | |
573 my $gff= $gffio->gff_string($feature); | |
574 push @GFF_LINE_FEAT, $feature; | |
575 #print $out "$gff\n" if $gff; | |
576 } | |
577 } | |
578 | |
579 # don't like doing this after others; do after each new gene id? | |
580 @to_print= print_held($out, $gffio, \@to_print); | |
581 | |
582 gff_validate(@GFF_LINE_FEAT); | |
583 | |
584 for my $feature (@GFF_LINE_FEAT) { | |
585 my $gff= $gffio->gff_string($feature); | |
586 print $out "$gff\n" if $gff; | |
587 } | |
588 | |
589 # deal with the corresponding DNA | |
590 my ($fa_out,$fa_outfile); | |
591 my $dna = $seq->seq; | |
592 if($dna || %proteinfa) { | |
593 $method{'RESIDUES'} += length($dna); | |
594 $dna =~ s/(\S{60})/$1\n/g; | |
595 $dna .= "\n"; | |
596 | |
597 if ($split) { | |
598 $fa_outfile = $outfile; | |
599 $fa_outfile =~ s/gff$/fa/; | |
600 open $fa_out, ">$fa_outfile" or die $!; | |
601 print $fa_out ">$seq_name\n$dna" if $dna; | |
602 foreach my $aid (sort keys %proteinfa) { | |
603 my $aa= delete $proteinfa{$aid}; | |
604 $method{'RESIDUES(tr)'} += length($aa); | |
605 $aa =~ s/(\S{60})/$1\n/g; | |
606 print $fa_out ">$aid\n$aa\n"; | |
607 } | |
608 | |
609 } | |
610 else { | |
611 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out | |
612 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk | |
613 ## maybe write this to temp .fa then cat to end of lumped gff $out | |
614 print $lumpfa_fh ">$seq_name\n$dna" if $dna; | |
615 foreach my $aid (sort keys %proteinfa) { | |
616 my $aa= delete $proteinfa{$aid}; | |
617 $method{'RESIDUES(tr)'} += length($aa); | |
618 $aa =~ s/(\S{60})/$1\n/g; | |
619 print $lumpfa_fh ">$aid\n$aa\n"; | |
620 } | |
621 } | |
622 | |
623 %proteinfa=(); | |
624 } | |
625 | |
626 if ( $zip && !$lump ) { | |
627 system "gzip -f $outfile"; | |
628 system "gzip -f $fa_outfile" if($fa_outfile); | |
629 $outfile .= '.gz'; | |
630 $fa_outfile .= '.gz' if $split; | |
631 } | |
632 | |
633 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA | |
634 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump); | |
635 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump); | |
636 | |
637 # dgg: moved to after all inputs; here it prints cumulative sum for each record | |
638 #if ( $summary ) { | |
639 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n"; | |
640 # | |
641 # for ( keys %method ) { | |
642 # print "# $_ $method{$_}\n"; | |
643 # } | |
644 # print "# \n"; | |
645 # } | |
646 | |
647 } | |
648 | |
649 print "# GFF3 saved to $outfile\n" if( $verbose && $lump); | |
650 if ( $summary ) { | |
651 print "# Summary:\n# Feature\tCount\n# -------\t-----\n"; | |
652 for ( keys %method ) { | |
653 print "# $_ $method{$_}\n"; | |
654 } | |
655 print "# \n"; | |
656 } | |
657 | |
658 ## FIXME for piped output w/ split FA files ... | |
659 close($lumpfa_fh) if $lumpfa_fh; | |
660 if (!$split && $outfa && $lump_fh) { | |
661 print $lump_fh "##FASTA\n"; # GFF3 spec | |
662 open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!"; | |
663 while( <$lumpfa_fh>) { | |
664 print $lump_fh $_; | |
665 } # is $lump_fh still open? | |
666 close($lumpfa_fh); | |
667 unlink($outfa); | |
668 } | |
669 | |
670 | |
671 if ( $zip && $lump ) { | |
672 system "gzip -f $lump"; | |
673 } | |
674 | |
675 close FH; | |
676 } | |
677 | |
678 | |
679 | |
680 | |
681 | |
682 sub typeorder { | |
683 return 1 if ($_[0] =~ /gene/); | |
684 return 2 if ($_[0] =~ /RNA|transcript/); | |
685 return 3 if ($_[0] =~ /protein|peptide/); | |
686 return 4 if ($_[0] =~ /exon|CDS/); | |
687 return 3; # default before exon (smallest part) | |
688 } | |
689 | |
690 sub sort_by_feattype { | |
691 my($at,$bt)= ($a->primary_tag, $b->primary_tag); | |
692 return (typeorder($at) <=> typeorder($bt)) | |
693 or ($at cmp $bt); | |
694 ## or ($a->name() cmp $b->name()); | |
695 } | |
696 | |
697 sub print_held { | |
698 my($out,$gffio,$to_print)= @_; | |
699 return unless(@$to_print); | |
700 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes | |
701 while ( my $feature = shift @$to_print) { | |
702 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode | |
703 push @GFF_LINE_FEAT, $feature; | |
704 #print $out "$gff\n"; | |
705 } | |
706 return (); # @to_print | |
707 } | |
708 | |
709 sub maptags2gff { | |
710 my $f = shift; | |
711 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already | |
712 # but see below /gene /locus_tag usage | |
713 foreach my $tag (keys %TAG_MAP) { | |
714 if ($f->has_tag($tag)) { | |
715 my $newtag= $TAG_MAP{$tag}; | |
716 my @v= $f->get_tag_values($tag); | |
717 $f->remove_tag($tag); | |
718 $f->add_tag_value($newtag,@v); | |
719 | |
720 ## also, pull any GO:000 ids from /note tag and put into Ontology_term | |
721 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]' | |
722 if ($tag eq 'note') { | |
723 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v; | |
724 my @go= map { m/(GO:\d+)/g } @v; | |
725 $f->add_tag_value('Ontology_term',@go) if(@go); | |
726 } | |
727 | |
728 } | |
729 } | |
730 } | |
731 | |
732 | |
733 sub getSourceInfo { | |
734 my ($seq, $source_type, $sf) = @_; | |
735 | |
736 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i); | |
737 my $is_gene = ($SOURCEID =~/entrezgene/i); | |
738 my $is_rich = (ref($seq) =~ /RichSeq/); | |
739 my $seq_name= $seq->accession_number(); | |
740 | |
741 unless($sf) { # make one | |
742 $source_type= $is_swiss ? $PROTEIN_TYPE | |
743 : $is_gene ? "eneg" # "gene" # "region" # | |
744 : $is_rich ? $seq->molecule : $source_type; | |
745 $sf = Bio::SeqFeature::Generic->direct_new(); | |
746 | |
747 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1); | |
748 my $loc= $seq->can('location') ? $seq->location() | |
749 : new Bio::Location::Simple( -start => $start, -end => $len); | |
750 $sf->location( $loc ); | |
751 $sf->primary_tag($source_type); | |
752 $sf->source_tag($SOURCEID); | |
753 $sf->seq_id( $seq_name); | |
754 #? $sf->display_name($seq->id()); ## Name or Alias ? | |
755 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession | |
756 $seq->add_SeqFeature($sf); | |
757 ## $source_feat= $sf; | |
758 } | |
759 | |
760 if ($sf->has_tag("chromosome")) { | |
761 $source_type= "chromosome"; | |
762 my ($chrname) = $sf->get_tag_values("chromosome"); | |
763 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead | |
764 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now | |
765 $sf->add_tag_value( Alias => $chrname ); | |
766 } | |
767 | |
768 # pull GB Comment, Description for source ft ... | |
769 # add reference - can be long, not plain string... | |
770 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG; | |
771 # GenBank fields: keyword,comment,reference,date_changed | |
772 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM | |
773 | |
774 # is this just for main $seq object or for all seqfeatures ? | |
775 my %AnnotTagMap= ( | |
776 'gene_name' => 'Alias', | |
777 'ALIAS_SYMBOL' => 'Alias', # Entrezgene | |
778 'LOCUS_SYNONYM' => 'Alias', #? | |
779 'symbol' => 'Alias', | |
780 'synonym' => 'Alias', | |
781 'dblink' => 'Dbxref', | |
782 'product' => 'product', | |
783 'Reference' => 'reference', | |
784 'OntologyTerm' => 'Ontology_term', | |
785 'comment' => 'Note', | |
786 'comment1' => 'Note', | |
787 # various map-type locations | |
788 # gene accession tag is named per source db !?? | |
789 # 'Index terms' => keywords ?? | |
790 ); | |
791 | |
792 | |
793 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() ); | |
794 my ($date)= $seq->annotation->get_Annotations("dates") | |
795 || $seq->annotation->get_Annotations("update-date") | |
796 || $is_rich ? $seq->get_dates() : (); | |
797 my ($comment)= $seq->annotation->get_Annotations("comment"); | |
798 my ($species)= $seq->annotation->get_Annotations("species"); | |
799 if (!$species | |
800 && $seq->can('species') | |
801 && defined $seq->species() | |
802 && $seq->species()->can('binomial') ) { | |
803 $species= $seq->species()->binomial(); | |
804 } | |
805 | |
806 # update source feature with main GB fields | |
807 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID'); | |
808 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note')); | |
809 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism')); | |
810 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1')); | |
811 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date')); | |
812 | |
813 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene; | |
814 | |
815 foreach my $atag (sort keys %AnnotTagMap) { | |
816 my $gtag= $AnnotTagMap{$atag}; next unless($gtag); | |
817 my @anno = map{ | |
818 if (ref $_ && $_->can('get_all_values')) { | |
819 split( /[,;] */, join ";", $_->get_all_values) | |
820 } | |
821 elsif (ref $_ && $_->can('display_text')) { | |
822 split( /[,;] */, $_->display_text) | |
823 } | |
824 elsif (ref $_ && $_->can('value')) { | |
825 split( /[,;] */, $_->value) | |
826 } | |
827 else { | |
828 (); | |
829 } | |
830 } $seq->annotation->get_Annotations($atag); | |
831 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); } | |
832 } | |
833 | |
834 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name'); | |
835 #$sf->add_tag_value( Alias => $_ ) foreach(@genes); | |
836 # | |
837 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all | |
838 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink); | |
839 | |
840 return (wantarray)? ($source_type,$sf) : $source_type; #? | |
841 } | |
842 | |
843 | |
844 sub gene_features { | |
845 my ($f, $gene_id, $genelinkID) = @_; | |
846 local $_ = $f->primary_tag; | |
847 $method{$_}++; | |
848 | |
849 if ( /gene/ ) { | |
850 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!? | |
851 $tnum = $rnum= 0; $ncrna_id= $rna_id = ''; | |
852 return GM_NEW_TOPLEVEL; | |
853 | |
854 } elsif ( /mRNA/ ) { | |
855 return GM_NOT_PART unless $gene_id; | |
856 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
857 ($rna_id = $gene_id ) =~ s/gene/mRNA/; | |
858 $rna_id .= '.t0' . ++$tnum; | |
859 $f->add_tag_value( ID => $rna_id ); | |
860 $f->add_tag_value( Parent => $gene_id ); | |
861 | |
862 } elsif ( /RNA|transcript/) { | |
863 ## misc_RNA here; missing exons ... flattener problem? | |
864 # all of {t,nc,sn}RNA can have gene models now | |
865 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag | |
866 ## CDS needs to use mRNA, not misc_RNA, rna_id ... | |
867 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level | |
868 | |
869 if($gene_id) { | |
870 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
871 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/; | |
872 $ncrna_id .= '.r0' . ++$rnum; | |
873 $f->add_tag_value( Parent => $gene_id ); | |
874 $f->add_tag_value( ID => $ncrna_id ); | |
875 } else { | |
876 unless ($f->has_tag('ID')) { | |
877 if($genelinkID) { | |
878 $f->add_tag_value( ID => $genelinkID ) ; | |
879 } else { | |
880 $idh->generate_unique_persistent_id($f); | |
881 } | |
882 } | |
883 ($ncrna_id)= $f->get_tag_values('ID'); | |
884 return GM_NEW_TOPLEVEL; | |
885 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons? | |
886 } | |
887 | |
888 } elsif ( /exon/ ) { # can belong to any kind of RNA | |
889 return GM_NOT_PART unless ($rna_id||$ncrna_id); | |
890 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
891 ## we are getting duplicate Parents here, which chokes chado loader, with reason... | |
892 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice | |
893 ## check all Parents | |
894 for my $expar ($rna_id, $ncrna_id) { | |
895 next unless($expar); | |
896 if ( $exonpar{$expar} and $f->has_tag('Parent') ) { | |
897 my @vals = $f->get_tag_values('Parent'); | |
898 next if (grep {$expar eq $_} @vals); | |
899 } | |
900 $exonpar{$expar}++; | |
901 $f->add_tag_value( Parent => $expar); | |
902 # last; #? could be both | |
903 } | |
904 # now we can skip cloned exons | |
905 # dgg note: multiple parents get added and printed for each unique exon | |
906 return GM_DUP_PART if ++$seen{$f} > 1; | |
907 | |
908 } elsif ( /CDS|protein|polypeptide/ ) { | |
909 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ?? | |
910 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #?? | |
911 (my $pro_id = $rna_id) =~ s/\.t/\.p/; | |
912 | |
913 if( ! $CDSkeep && /CDS/) { | |
914 $f->primary_tag($PROTEIN_TYPE); | |
915 | |
916 ## duplicate problem is Location .. | |
917 if ($f->location->isa("Bio::Location::SplitLocationI")) { | |
918 # my($b,$e)=($f->start, $f->end); # is this all we need? | |
919 my($b,$e)=(-1,0); | |
920 foreach my $l ($f->location->each_Location) { | |
921 $b = $l->start if($b<0 || $b > $l->start); | |
922 $e = $l->end if($e < $l->end); | |
923 } | |
924 $f->location( Bio::Location::Simple->new( | |
925 -start => $b, -end => $e, -strand => $f->strand) ); | |
926 } | |
927 | |
928 $f->add_tag_value( Derives_from => $rna_id ); | |
929 } | |
930 else { | |
931 $f->add_tag_value( Parent => $rna_id ); | |
932 } | |
933 | |
934 $f->add_tag_value( ID => $pro_id ); | |
935 | |
936 move_translation_fasta($f, $pro_id); | |
937 #if( $f->has_tag('translation')) { | |
938 # my ($aa) = $f->get_tag_values("translation"); | |
939 # $proteinfa{$pro_id}= $aa; | |
940 # $f->remove_tag("translation"); | |
941 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem | |
942 #} | |
943 } elsif ( /region/ ) { | |
944 $f->primary_tag('gene_component_region'); | |
945 $f->add_tag_value( Parent => $gene_id ); | |
946 } else { | |
947 return GM_NOT_PART unless $gene_id; | |
948 $f->add_tag_value( Parent => $gene_id ); | |
949 } | |
950 | |
951 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1; | |
952 | |
953 return GM_NEW_PART; | |
954 } | |
955 | |
956 ## was generic_features > add_generic_id | |
957 sub add_generic_id { | |
958 my ($f, $ft_name, $flags) = @_; | |
959 my $method = $f->primary_tag; | |
960 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above | |
961 | |
962 if ($f->has_tag('ID')) { | |
963 | |
964 } | |
965 elsif ( $f->has_tag($method) ) { | |
966 my ($name) = $f->get_tag_values($method); | |
967 $f->add_tag_value( ID => "$method:$name" ); | |
968 } | |
969 elsif($ft_name) { # is this unique ? | |
970 $f->add_tag_value( ID => $ft_name ); | |
971 } | |
972 else { | |
973 $idh->generate_unique_persistent_id($f); | |
974 } | |
975 | |
976 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] ) | |
977 if($method =~ /CDS/); | |
978 | |
979 # return $io->gff_string($f); | |
980 } | |
981 | |
982 sub move_translation_fasta { | |
983 my ($f, $ft_id) = @_; | |
984 if( $ft_id && $f->has_tag('translation') ) { | |
985 my ($aa) = $f->get_tag_values("translation"); | |
986 if($aa && $aa !~ /^length/) { | |
987 $proteinfa{$ft_id}= $aa; | |
988 $f->remove_tag("translation"); | |
989 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem | |
990 } | |
991 } | |
992 } | |
993 | |
994 sub gff_header { | |
995 my ($name, $end, $source_type, $source_feat) = @_; | |
996 $source_type ||= "region"; | |
997 | |
998 my $info = "$source_type:$name"; | |
999 my $head = "##gff-version $GFF_VERSION\n". | |
1000 "##sequence-region $name 1 $end\n". | |
1001 "# conversion-by bp_genbank2gff3.pl\n"; | |
1002 if ($source_feat) { | |
1003 ## dgg: these header comment fields are not useful when have multi-records, diff organisms | |
1004 for my $key (qw(organism Note date)) { | |
1005 my $value; | |
1006 if ($source_feat->has_tag($key)) { | |
1007 ($value) = $source_feat->get_tag_values($key); | |
1008 } | |
1009 if ($value) { | |
1010 $head .= "# $key $value\n"; | |
1011 $info .= ", $value"; | |
1012 } | |
1013 } | |
1014 $head = "" if $didheader; | |
1015 } else { | |
1016 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n"; | |
1017 } | |
1018 $didheader++; | |
1019 return (wantarray) ? ($head,$info) : $head; | |
1020 } | |
1021 | |
1022 sub unflatten_seq { | |
1023 my $seq = shift; | |
1024 | |
1025 ## print "# working on $source_type:", $seq->accession, "\n"; | |
1026 my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number . | |
1027 ": consult STDERR\n"; | |
1028 | |
1029 eval { | |
1030 $unflattener->unflatten_seq( -seq => $seq, | |
1031 -noinfer => $noinfer, | |
1032 -use_magic => 1 ); | |
1033 }; | |
1034 | |
1035 # deal with unflattening errors | |
1036 if ( $@ ) { | |
1037 warn $seq->accession_number . " Unflattening error:\n"; | |
1038 warn "Details: $@\n"; | |
1039 print "# ".$uh_oh; | |
1040 } | |
1041 | |
1042 return 0 if !$seq || !$seq->all_SeqFeatures; | |
1043 | |
1044 # map feature types to the sequence ontology | |
1045 ## $tm->map_types_to_SO( -seq => $seq ); | |
1046 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg | |
1047 | |
1048 map_types( | |
1049 $tm, | |
1050 -seq => $seq, | |
1051 -type_map => $FTSOmap, | |
1052 -syn_map => $FTSOsynonyms, | |
1053 -undefined => "region" | |
1054 ); #nml | |
1055 | |
1056 } | |
1057 | |
1058 sub filter { | |
1059 my $seq = shift; | |
1060 ## return unless $filter; | |
1061 my @feats; | |
1062 my @sources; # dgg; pick source features here; only 1 always? | |
1063 if ($filter) { | |
1064 for my $f ( $seq->remove_SeqFeatures ) { | |
1065 my $m = $f->primary_tag; | |
1066 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ? | |
1067 push @feats, $f unless $filter =~ /$m/i; | |
1068 } | |
1069 $seq->add_SeqFeature($_) foreach @feats; | |
1070 } else { | |
1071 for my $f ( $seq->get_SeqFeatures ){ | |
1072 my $m = $f->primary_tag; | |
1073 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ? | |
1074 } | |
1075 } | |
1076 | |
1077 return @sources; | |
1078 } | |
1079 | |
1080 | |
1081 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures | |
1082 # changed to filter out cloned features. We have to implement the old | |
1083 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI | |
1084 sub get_all_SeqFeatures { | |
1085 my $seq = shift; | |
1086 my @flatarr; | |
1087 | |
1088 foreach my $feat ( $seq->get_SeqFeatures ){ | |
1089 push(@flatarr,$feat); | |
1090 _add_flattened_SeqFeatures(\@flatarr,$feat); | |
1091 } | |
1092 return @flatarr; | |
1093 } | |
1094 | |
1095 sub gene_name { | |
1096 my $g = shift; | |
1097 my $gene_id = ''; # zero it; | |
1098 | |
1099 if ($g->has_tag('locus_tag')) { | |
1100 ($gene_id) = $g->get_tag_values('locus_tag'); | |
1101 } | |
1102 | |
1103 elsif ($g->has_tag('gene')) { | |
1104 ($gene_id) = $g->get_tag_values('gene'); | |
1105 } | |
1106 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene | |
1107 ($gene_id) = $g->get_tag_values('ID'); | |
1108 } | |
1109 | |
1110 ## See Unflattener comment: | |
1111 # on rare occasions, records will have no /gene or /locus_tag | |
1112 # but it WILL have /product tags. These serve the same purpose | |
1113 # for grouping. For an example, see AY763288 (also in t/data) | |
1114 # eg. product=tRNA-Asp ; product=similar to crooked neck protein | |
1115 elsif ($g->has_tag('product')) { | |
1116 my ($name)= $g->get_tag_values('product'); | |
1117 ($gene_id) = $name unless($name =~ / /); # a description not name | |
1118 } | |
1119 | |
1120 ## dgg; also handle transposon=xxxx ID/name | |
1121 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746 | |
1122 elsif ($g->has_tag('transposon')) { | |
1123 my ($name)= $g->get_tag_values('transposon'); | |
1124 ($gene_id) = $name unless($name =~ / /); # a description not name | |
1125 } | |
1126 | |
1127 return $gene_id; | |
1128 } | |
1129 | |
1130 # same list as gene_name .. change tag to generic Name | |
1131 sub convert_to_name { | |
1132 my $g = shift; | |
1133 my $gene_id = ''; # zero it; | |
1134 | |
1135 if ($g->has_tag('gene')) { | |
1136 ($gene_id) = $g->get_tag_values('gene'); | |
1137 $g->remove_tag('gene'); | |
1138 $g->add_tag_value('Name', $gene_id); | |
1139 } | |
1140 elsif ($g->has_tag('locus_tag')) { | |
1141 ($gene_id) = $g->get_tag_values('locus_tag'); | |
1142 $g->remove_tag('locus_tag'); | |
1143 $g->add_tag_value('Name', $gene_id); | |
1144 } | |
1145 | |
1146 elsif ($g->has_tag('product')) { | |
1147 my ($name)= $g->get_tag_values('product'); | |
1148 ($gene_id) = $name unless($name =~ / /); # a description not name | |
1149 ## $g->remove_tag('product'); | |
1150 $g->add_tag_value('Name', $gene_id); | |
1151 } | |
1152 | |
1153 elsif ($g->has_tag('transposon')) { | |
1154 my ($name)= $g->get_tag_values('transposon'); | |
1155 ($gene_id) = $name unless($name =~ / /); # a description not name | |
1156 ## $g->remove_tag('transposon'); | |
1157 $g->add_tag_value('Name', $gene_id); | |
1158 } | |
1159 elsif ($g->has_tag('ID')) { | |
1160 my ($name)= $g->get_tag_values('ID'); | |
1161 $g->add_tag_value('Name', $name); | |
1162 } | |
1163 return $gene_id; | |
1164 } | |
1165 | |
1166 | |
1167 sub _add_flattened_SeqFeatures { | |
1168 my ($arrayref,$feat) = @_; | |
1169 my @subs = (); | |
1170 | |
1171 if ($feat->isa("Bio::FeatureHolderI")) { | |
1172 @subs = $feat->get_SeqFeatures; | |
1173 } | |
1174 elsif ($feat->isa("Bio::SeqFeatureI")) { | |
1175 @subs = $feat->sub_SeqFeature; | |
1176 } | |
1177 else { | |
1178 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ". | |
1179 "Don't know how to flatten."; | |
1180 } | |
1181 | |
1182 for my $sub (@subs) { | |
1183 push(@$arrayref,$sub); | |
1184 _add_flattened_SeqFeatures($arrayref,$sub); | |
1185 } | |
1186 | |
1187 } | |
1188 | |
1189 sub map_types { | |
1190 | |
1191 my ($self, @args) = @_; | |
1192 | |
1193 my($sf, $seq, $type_map, $syn_map, $undefmap) = | |
1194 $self->_rearrange([qw(FEATURE | |
1195 SEQ | |
1196 TYPE_MAP | |
1197 SYN_MAP | |
1198 UNDEFINED | |
1199 )], | |
1200 @args); | |
1201 | |
1202 if (!$sf && !$seq) { | |
1203 $self->throw("you need to pass in either -feature or -seq"); | |
1204 } | |
1205 | |
1206 my @sfs = ($sf); | |
1207 if ($seq) { | |
1208 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI"); | |
1209 @sfs = $seq->get_all_SeqFeatures; | |
1210 } | |
1211 $type_map = $type_map || $self->typemap; # dgg: was type_map; | |
1212 foreach my $feat (@sfs) { | |
1213 | |
1214 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI"); | |
1215 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI"); | |
1216 | |
1217 my $primary_tag = $feat->primary_tag; | |
1218 | |
1219 #if ($primary_tag =~ /^pseudo(.*)$/) { | |
1220 # $primary_tag = $1; | |
1221 # $feat->primary_tag($primary_tag); | |
1222 #} | |
1223 | |
1224 my $mtype = $type_map->{$primary_tag}; | |
1225 if ($mtype) { | |
1226 if (ref($mtype)) { | |
1227 if (ref($mtype) eq 'ARRAY') { | |
1228 my $soID; | |
1229 ($mtype, $soID) = @$mtype; | |
1230 | |
1231 if ($soID && ref($ONTOLOGY)) { | |
1232 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID); | |
1233 $mtype = $term->name if $term; | |
1234 } | |
1235 # if SO ID is undefined AND we have an ontology to search, we want to delete | |
1236 # the feature type hash entry in order to force a fuzzy search | |
1237 elsif (! defined $soID && ref($ONTOLOGY)) { | |
1238 undef $mtype; | |
1239 delete $type_map->{$primary_tag}; | |
1240 } | |
1241 elsif ($undefmap && $mtype eq 'undefined') { # dgg | |
1242 $mtype= $undefmap; | |
1243 } | |
1244 | |
1245 $type_map->{$primary_tag} = $mtype if $mtype; | |
1246 } | |
1247 elsif (ref($mtype) eq 'CODE') { | |
1248 $mtype = $mtype->($feat); | |
1249 } | |
1250 else { | |
1251 $self->throw('must be scalar or CODE ref'); | |
1252 } | |
1253 } | |
1254 elsif ($undefmap && $mtype eq 'undefined') { # dgg | |
1255 $mtype= $undefmap; | |
1256 } | |
1257 $feat->primary_tag($mtype); | |
1258 } | |
1259 | |
1260 if ($CONF) { | |
1261 conf_read(); | |
1262 my %perfect_matches; | |
1263 while (my ($p_tag,$rules) = each %$YAML) { | |
1264 RULE: | |
1265 for my $rule (@$rules) { | |
1266 for my $tags (@$rule) { | |
1267 while (my ($tag,$values) = each %$tags) { | |
1268 for my $value (@$values) { | |
1269 if ($feat->has_tag($tag)) { | |
1270 for ($feat->get_tag_values($tag)) { | |
1271 next RULE unless $_ =~ /\Q$value\E/; | |
1272 } | |
1273 } elsif ($tag eq 'primary_tag') { | |
1274 next RULE unless $value eq | |
1275 $feat->primary_tag; | |
1276 } elsif ($tag eq 'location') { | |
1277 next RULE unless $value eq | |
1278 $feat->start.'..'.$feat->end; | |
1279 } else { next RULE } | |
1280 } | |
1281 } | |
1282 } | |
1283 $perfect_matches{$p_tag}++; | |
1284 } | |
1285 } | |
1286 if (scalar(keys %perfect_matches) == 1) { | |
1287 $mtype = $_ for keys %perfect_matches; | |
1288 } elsif (scalar(keys %perfect_matches) > 1) { | |
1289 warn "There are conflicting rules in the config file for the" . | |
1290 " following types: "; | |
1291 warn "\t$_\n" for keys %perfect_matches; | |
1292 warn "Until conflict resolution is built into the converter," . | |
1293 " you will have to manually edit the config file to remove the" . | |
1294 " conflict. Sorry :(. Skipping user preference for this entry"; | |
1295 sleep(2); | |
1296 } | |
1297 } | |
1298 | |
1299 if ( ! $mtype && $syn_map) { | |
1300 if ($feat->has_tag('note')) { | |
1301 | |
1302 my @all_matches; | |
1303 | |
1304 my @note = $feat->each_tag_value('note'); | |
1305 | |
1306 for my $k (keys %$syn_map) { | |
1307 | |
1308 if ($k =~ /"(.+)"/) { | |
1309 | |
1310 my $syn = $1; | |
1311 | |
1312 for my $note (@note) { | |
1313 | |
1314 # look through the notes to see if the description | |
1315 # is an exact match for synonyms | |
1316 if ( $syn eq $note ) { | |
1317 | |
1318 my @map = @{$syn_map->{$k}}; | |
1319 | |
1320 | |
1321 my $best_guess = $map[0]; | |
1322 | |
1323 unshift @{$all_matches[-1]}, [$best_guess]; | |
1324 | |
1325 $mtype = $MANUAL | |
1326 ? manual_curation($feat, $best_guess, \@all_matches) | |
1327 : $best_guess; | |
1328 | |
1329 print '#' x 78 . "\nGuessing the proper SO term for GenBank" | |
1330 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n" | |
1331 . '#' x 78 . "\n\n"; | |
1332 | |
1333 } else { | |
1334 # check both primary tag and and note against | |
1335 # SO synonyms for best matching description | |
1336 | |
1337 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches); | |
1338 } | |
1339 | |
1340 } | |
1341 } | |
1342 } | |
1343 | |
1344 #unless ($mtype) { | |
1345 for my $note (@note) { | |
1346 for my $name (values %$type_map) { | |
1347 # check primary tag against SO names for best matching | |
1348 # descriptions //NML also need to check against | |
1349 # definition && camel case split terms | |
1350 | |
1351 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches); | |
1352 } | |
1353 } | |
1354 #} | |
1355 | |
1356 if (scalar(@all_matches) && !$mtype) { | |
1357 | |
1358 my $top_matches = first { defined $_ } @{$all_matches[-1]}; | |
1359 | |
1360 my $best_guess = $top_matches->[0]; | |
1361 | |
1362 | |
1363 | |
1364 # if guess has quotes, it is a synonym term. we need to | |
1365 # look up the corresponding name term | |
1366 # otherwise, guess is a name, so we can use it directly | |
1367 if ($best_guess =~ /"(.+)"/) { | |
1368 | |
1369 $best_guess = $syn_map->{$best_guess}->[0]; | |
1370 | |
1371 } | |
1372 | |
1373 @RETURN = @all_matches; | |
1374 $mtype = $MANUAL | |
1375 ? manual_curation($feat, $best_guess, \@all_matches) | |
1376 : $best_guess; | |
1377 | |
1378 print '#' x 78 . "\nGuessing the proper SO term for GenBank" | |
1379 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n" | |
1380 . '#' x 78 . "\n\n"; | |
1381 | |
1382 } | |
1383 } | |
1384 $mtype ||= $undefmap; | |
1385 $feat->primary_tag($mtype); | |
1386 } | |
1387 } | |
1388 | |
1389 | |
1390 } | |
1391 | |
1392 sub SO_fuzzy_match { | |
1393 | |
1394 my $candidate = shift; | |
1395 my $primary_tag = shift; | |
1396 my $note = shift; | |
1397 my $SO_terms = shift; | |
1398 my $best_matches_ref = shift; | |
1399 my $modifier = shift; | |
1400 | |
1401 $modifier ||= ''; | |
1402 | |
1403 my @feat_terms; | |
1404 | |
1405 for ( split(" |_", $primary_tag) ) { | |
1406 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; | |
1407 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g; | |
1408 push @feat_terms, @camelCase; | |
1409 } | |
1410 | |
1411 for ( split(" |_", $note) ) { | |
1412 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; | |
1413 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g; | |
1414 (my $word = $_) =~ s/[;:.,]//g; | |
1415 push @feat_terms, $word; | |
1416 } | |
1417 | |
1418 | |
1419 my @SO_terms = split(" |_", $SO_terms); | |
1420 | |
1421 # fuzzy match works on a simple point system. When 2 words match, | |
1422 # the $plus counter adds one. When they don't, the $minus counter adds | |
1423 # one. This is used to sort similar matches together. Better matches | |
1424 # are found at the end of the array, near the top. | |
1425 | |
1426 # NML: can we improve best match by using synonym tags | |
1427 # EXACT,RELATED,NARROW,BROAD? | |
1428 | |
1429 my ($plus, $minus) = (0, 0); | |
1430 my %feat_terms; | |
1431 my %SO_terms; | |
1432 | |
1433 #unique terms | |
1434 map {$feat_terms{$_} = 1} @feat_terms; | |
1435 map {$SO_terms{$_} = 1} @SO_terms; | |
1436 | |
1437 for my $st (keys %SO_terms) { | |
1438 for my $ft (keys %feat_terms) { | |
1439 | |
1440 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++; | |
1441 | |
1442 } | |
1443 } | |
1444 | |
1445 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus; | |
1446 | |
1447 } | |
1448 | |
1449 sub manual_curation { | |
1450 | |
1451 my ($feat, $default_opt, $all_matches) = @_; | |
1452 | |
1453 my @all_matches = @$all_matches; | |
1454 | |
1455 # convert all SO synonyms into names and filter | |
1456 # all matches into unique term list because | |
1457 # synonyms can map to multiple duplicate names | |
1458 | |
1459 my (@unique_SO_terms, %seen); | |
1460 for (reverse @all_matches) { | |
1461 for (@$_) { | |
1462 for (@$_) { | |
1463 #my @names; | |
1464 if ($_ =~ /"(.+)"/) { | |
1465 for (@{$SYN_MAP->{$_}}) { | |
1466 push @unique_SO_terms, $_ unless $seen{$_}; | |
1467 $seen{$_}++; | |
1468 } | |
1469 } else { | |
1470 push @unique_SO_terms, $_ unless $seen{$_}; | |
1471 $seen{$_}++; | |
1472 } | |
1473 } | |
1474 } | |
1475 } | |
1476 | |
1477 my $s = scalar(@unique_SO_terms); | |
1478 | |
1479 my $choice = 0; | |
1480 | |
1481 my $more = | |
1482 "[a]uto : automatic input (selects best guess for remaining entries)\r" . | |
1483 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" . | |
1484 "[i]nput : add a specific term\r" . | |
1485 "[r]eset : reset to the beginning of matches\r" . | |
1486 "[s]kip : skip this entry (selects best guess for this entry)\r" | |
1487 ; | |
1488 | |
1489 $more .= | |
1490 "[n]ext : view the next ".OPTION_CYCLE." terms\r" . | |
1491 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE); | |
1492 | |
1493 my $msg = #"\n\n" . '-' x 156 . "\n" | |
1494 "The converter found $s possible matches for the following GenBank entry: "; | |
1495 | |
1496 my $directions = | |
1497 "Type a number to select the SO term that best matches" | |
1498 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more"; | |
1499 | |
1500 | |
1501 # lookup filtered list to pull out definitions | |
1502 my @options = map { | |
1503 my $term = $_; | |
1504 my %term; | |
1505 for (['name', 'name'], ['def', 'definition'], ['synonym', | |
1506 'each_synonym']) { | |
1507 my ($label, $method) = @$_; | |
1508 $term{$label} = \@{[$term->$method]}; | |
1509 } | |
1510 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ]; | |
1511 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms; | |
1512 | |
1513 | |
1514 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions, | |
1515 $default_opt, @options); | |
1516 | |
1517 if ($option eq 'skip') { return $default_opt | |
1518 } elsif ($option eq 'auto') { | |
1519 $MANUAL = 0; | |
1520 return $default_opt; | |
1521 } else { return $option } | |
1522 | |
1523 } | |
1524 | |
1525 sub options_cycle { | |
1526 | |
1527 my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_; | |
1528 | |
1529 #NML: really should only call GenBank_entry once. Will need to change | |
1530 #method to return array & shift off header | |
1531 my $entry = GenBank_entry($feat, "\r"); | |
1532 | |
1533 my $total = scalar(@opt); | |
1534 | |
1535 ($start,$stop) = (0, OPTION_CYCLE) | |
1536 if ( ($start < 0) && ($stop > 0) ); | |
1537 | |
1538 ($start,$stop) = (0, OPTION_CYCLE) | |
1539 if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total); | |
1540 | |
1541 ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0; | |
1542 ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total; | |
1543 | |
1544 $stop = $total if $stop > $total; | |
1545 | |
1546 my $dir_copy = $directions; | |
1547 my $msg_copy = $msg; | |
1548 my $format = "format STDOUT = \n" . | |
1549 '-' x 156 . "\n" . | |
1550 '^' . '<' x 77 . '| Available Commands:' . "\n" . | |
1551 '$msg_copy' . "\n" . | |
1552 '-' x 156 . "\n" . | |
1553 ' ' x 78 . "|\n" . | |
1554 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" . | |
1555 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" . | |
1556 (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" . | |
1557 ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n"; | |
1558 | |
1559 { | |
1560 # eval throws redefined warning that breaks formatting. | |
1561 # Turning off warnings just for the eval to fix this. | |
1562 no warnings 'redefine'; | |
1563 eval $format; | |
1564 } | |
1565 | |
1566 write; | |
1567 | |
1568 print '-' x 156 . "\n" . | |
1569 'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) . | |
1570 " - $stop of possible SO term matches: (best guess is \"$best_guess\")" . | |
1571 "\n" . '-' x 156 . "\n"; | |
1572 | |
1573 for (my $i = $start; $i < $stop; $i+=2) { | |
1574 | |
1575 my ($left, $right) = @opt[$i,$i+1]; | |
1576 | |
1577 my ($nL, $nmL, $descL, $termL, @synL) = @$left; | |
1578 | |
1579 #odd numbered lists can cause fatal undefined errors, so check | |
1580 #to make sure we have data | |
1581 | |
1582 my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef); | |
1583 | |
1584 | |
1585 my $format = "format STDOUT = \n"; | |
1586 | |
1587 $format .= | |
1588 ' ' x 78 . "|\n" . | |
1589 | |
1590 '@>>: name: ^' . '<' x 64 . '~' . ' |' . | |
1591 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" . | |
1592 '$nL,' . ' ' x 7 . '$nmL,' . | |
1593 ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" . | |
1594 | |
1595 ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' . | |
1596 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" . | |
1597 ' ' x 11 . '$nmL,' . | |
1598 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" . | |
1599 #' ' x 78 . '|' . "\n" . | |
1600 | |
1601 | |
1602 ' def: ^' . '<' x 65 . ' |' . | |
1603 (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" . | |
1604 ' ' x 11 . '$descL,' . | |
1605 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" . | |
1606 | |
1607 | |
1608 (' ^' . '<' x 65 . ' |' . | |
1609 (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" . | |
1610 ' ' x 11 . '$descL,' . | |
1611 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 . | |
1612 | |
1613 | |
1614 ' ^' . '<' x 61 . '...~ |' . | |
1615 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" . | |
1616 ' ' x 11 . '$descL,' . | |
1617 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" . | |
1618 | |
1619 ".\n"; | |
1620 | |
1621 { | |
1622 # eval throws redefined warning that breaks formatting. | |
1623 # Turning off warnings just for the eval to fix this. | |
1624 no warnings 'redefine'; | |
1625 eval $format; | |
1626 } | |
1627 write; | |
1628 | |
1629 } | |
1630 print '-' x 156 . "\nenter a command:"; | |
1631 | |
1632 while (<STDIN>) { | |
1633 | |
1634 (my $input = $_) =~ s/\s+$//; | |
1635 | |
1636 if ($input =~ /^\d+$/) { | |
1637 if ( $input && defined $opt[$input-1] ) { | |
1638 return $opt[$input-1]->[1] | |
1639 } else { | |
1640 print "\nThat number is not an option. Please enter a valid number.\n:"; | |
1641 } | |
1642 } elsif ($input =~ /^n/i | $input =~ /next/i ) { | |
1643 return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg, | |
1644 $feat, $directions, $best_guess, @opt) | |
1645 } elsif ($input =~ /^p/i | $input =~ /prev/i ) { | |
1646 return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg, | |
1647 $feat, $directions, $best_guess, @opt) | |
1648 } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip' | |
1649 } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto' | |
1650 } elsif ( $input =~ /^r/i || $input =~ /reset/i ) { | |
1651 return manual_curation($feat, $best_guess, \@RETURN ); | |
1652 } elsif ( $input =~ /^f/i || $input =~ /find/i ) { | |
1653 | |
1654 my ($query, @query_results); | |
1655 | |
1656 if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1; | |
1657 } else { | |
1658 | |
1659 #do a SO search | |
1660 print "Type your search query\n:"; | |
1661 while (<STDIN>) { chomp($query = $_); last } | |
1662 } | |
1663 | |
1664 for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) { | |
1665 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)'); | |
1666 } | |
1667 | |
1668 return manual_curation($feat, $best_guess, \@query_results); | |
1669 | |
1670 } elsif ( $input =~ /^i/i || $input =~ /input/i ) { | |
1671 | |
1672 #NML fast input for later | |
1673 #my $query; | |
1674 #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 }; | |
1675 | |
1676 #manual input | |
1677 print "Type the term you want to use\n:"; | |
1678 while (<STDIN>) { | |
1679 chomp(my $input = $_); | |
1680 | |
1681 if (! $TYPE_MAP->{$input}) { | |
1682 | |
1683 print "\"$input\" doesn't appear to be a valid SO term. Are ". | |
1684 "you sure you want to use it? (y or n)\n:"; | |
1685 | |
1686 while (<STDIN>) { | |
1687 | |
1688 chomp(my $choice = $_); | |
1689 | |
1690 if ($choice eq 'y') { | |
1691 print | |
1692 "\nWould you like to save your preference for " . | |
1693 "future use (so you don't have to redo manual " . | |
1694 "curation for this feature everytime you run " . | |
1695 "the converter)? (y or n)\n"; | |
1696 | |
1697 #NML: all these while loops are a mess. Really should condense it. | |
1698 while (<STDIN>) { | |
1699 | |
1700 chomp(my $choice = $_); | |
1701 | |
1702 if ($choice eq 'y') { | |
1703 curation_save($feat, $input); | |
1704 return $input; | |
1705 } elsif ($choice eq 'n') { | |
1706 return $input | |
1707 } else { | |
1708 print "\nDidn't recognize that command. Please " . | |
1709 "type y or n.\n:" | |
1710 } | |
1711 } | |
1712 | |
1713 | |
1714 } elsif ($choice eq 'n') { | |
1715 return options_cycle($start, $stop, $msg, $feat, | |
1716 $directions, $best_guess, @opt) | |
1717 } else { | |
1718 print "\nDidn't recognize that command. Please " . | |
1719 "type y or n.\n:" | |
1720 } | |
1721 } | |
1722 | |
1723 } else { | |
1724 print | |
1725 "\nWould you like to save your preference for " . | |
1726 "future use (so you don't have to redo manual " . | |
1727 "curation for this feature everytime you run " . | |
1728 "the converter)? (y or n)\n"; | |
1729 | |
1730 #NML: all these while loops are a mess. Really should condense it. | |
1731 while (<STDIN>) { | |
1732 | |
1733 chomp(my $choice = $_); | |
1734 | |
1735 if ($choice eq 'y') { | |
1736 curation_save($feat, $input); | |
1737 return $input; | |
1738 } elsif ($choice eq 'n') { | |
1739 return $input | |
1740 } else { | |
1741 print "\nDidn't recognize that command. Please " . | |
1742 "type y or n.\n:" | |
1743 } | |
1744 } | |
1745 | |
1746 } | |
1747 | |
1748 } | |
1749 } else { | |
1750 print "\nDidn't recognize that command. Please re-enter your choice.\n:" | |
1751 } | |
1752 } | |
1753 | |
1754 } | |
1755 | |
1756 sub GenBank_entry { | |
1757 my ($f, $delimiter, $num) = @_; | |
1758 | |
1759 $delimiter ||= "\n"; | |
1760 | |
1761 | |
1762 my $entry = | |
1763 | |
1764 ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag | |
1765 . ($num | |
1766 ? ' ' x (12 - length $f->primary_tag ) . ' [2] ' | |
1767 : ' ' x (15 - length $f->primary_tag) | |
1768 ) | |
1769 . $f->start.'..'.$f->end | |
1770 | |
1771 . "$delimiter"; | |
1772 | |
1773 if ($num) { | |
1774 words_tag($f, \$entry); | |
1775 } else { | |
1776 for my $tag ($f->all_tags) { | |
1777 for my $val ( $f->each_tag_value($tag) ) { | |
1778 $entry .= ' ' x 20; | |
1779 #$entry .= "/$tag=\"$val\"$delimiter"; | |
1780 $entry .= $val eq '_no_value' | |
1781 ? "/$tag$delimiter" | |
1782 : "/$tag=\"$val\"$delimiter"; | |
1783 } | |
1784 } | |
1785 | |
1786 } | |
1787 | |
1788 return $entry; | |
1789 } | |
1790 | |
1791 | |
1792 sub gff_validate { | |
1793 warn "Validating GFF file\n" if $DEBUG; | |
1794 my @feat = @_; | |
1795 | |
1796 my (%parent2child, %all_ids, %descendants, %reserved); | |
1797 | |
1798 for my $f (@feat) { | |
1799 for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) { | |
1800 map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0]) | |
1801 if $f->has_tag($$aTags[0]); | |
1802 } | |
1803 } | |
1804 | |
1805 if ($SO_FILE) { | |
1806 while (my ($parentID, $aChildren) = each %parent2child) { | |
1807 parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved); | |
1808 } | |
1809 } | |
1810 | |
1811 id_validate(\%all_ids, \%reserved); | |
1812 } | |
1813 | |
1814 sub parent_validate { | |
1815 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_; | |
1816 | |
1817 my $aParents = $hAll->{$parentID}; | |
1818 | |
1819 map { | |
1820 my $child = $_; | |
1821 $child->add_tag_value( validation_error => | |
1822 "feature tried to add Parent tag, but no Parent found with ID $parentID" | |
1823 ); | |
1824 my %parents; | |
1825 map { $parents{$_} = 1 } $child->get_tag_values('Parent'); | |
1826 delete $parents{$parentID}; | |
1827 my @parents = keys %parents; | |
1828 | |
1829 $child->remove_tag('Parent'); | |
1830 | |
1831 unless ($child->has_tag('ID')) { | |
1832 my $id = gene_name($child); | |
1833 $child->add_tag_value('ID', $id); | |
1834 push @{$hAll->{$id}}, $child | |
1835 } | |
1836 | |
1837 $child->add_tag_value('Parent', @parents) if @parents; | |
1838 | |
1839 } @$aChildren and return unless scalar(@$aParents); | |
1840 | |
1841 my $par = join(',', map { $_->primary_tag } @$aParents); | |
1842 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG; | |
1843 | |
1844 #NML manual curation needs to happen here | |
1845 | |
1846 | |
1847 my %parentsToRemove; | |
1848 | |
1849 CHILD: | |
1850 for my $child (@$aChildren) { | |
1851 my $childType = $child->primary_tag; | |
1852 | |
1853 warn "WORKING ON $childType at ".$child->start.' to '.$child->end | |
1854 if $DEBUG; | |
1855 | |
1856 for (my $i = 0; $i < scalar(@$aParents); $i++) { | |
1857 my $parent = $aParents->[$i]; | |
1858 my $parentType = $parent->primary_tag; | |
1859 | |
1860 warn "CHECKING $childType against $parentType" if $DEBUG; | |
1861 | |
1862 #cache descendants so we don't have to do repeat searches | |
1863 unless ($hDescendants->{$parentType}) { | |
1864 | |
1865 for my $term ($ONTOLOGY->find_terms( | |
1866 -name => $parentType | |
1867 ) ) { | |
1868 | |
1869 map { | |
1870 $hDescendants->{$parentType}{$_->name}++ | |
1871 } $ONTOLOGY->get_descendant_terms($term); | |
1872 | |
1873 } | |
1874 | |
1875 # NML: hopefully temporary fix. | |
1876 # SO doesn't consider exon/CDS to be a child of mRNA | |
1877 # even though common knowledge dictates that they are | |
1878 # This cheat fixes the false positives for now | |
1879 if ($parentType eq 'mRNA') { | |
1880 $hDescendants->{$parentType}{'exon'} = 1; | |
1881 $hDescendants->{$parentType}{'CDS'} = 1; | |
1882 } | |
1883 | |
1884 } | |
1885 | |
1886 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end . | |
1887 " be a child of $parentType?" if $DEBUG; | |
1888 if ($hDescendants->{$parentType}{$childType}) { | |
1889 warn "\tYES, $childType can be a child of $parentType" if $DEBUG; | |
1890 | |
1891 #NML need to deal with multiple children matched to multiple different | |
1892 #parents. This model only assumes the first parent id that matches a child will | |
1893 #be the reserved feature. | |
1894 | |
1895 $hReserved->{$parentID}{$parent}{'parent'} = $parent; | |
1896 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child; | |
1897 | |
1898 #mark parent for later removal from all IDs | |
1899 #so we don't accidentally change any parents | |
1900 | |
1901 $parentsToRemove{$i}++; | |
1902 | |
1903 next CHILD; | |
1904 } | |
1905 } | |
1906 | |
1907 | |
1908 | |
1909 #NML shouldn't have to check this; somehow child can lose Parent | |
1910 #it's happening W3110 | |
1911 #need to track this down | |
1912 if ( $child->has_tag('Parent') ) { | |
1913 | |
1914 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID" | |
1915 if $DEBUG; | |
1916 | |
1917 my %parents; | |
1918 | |
1919 map { $parents{$_} = 1 } $child->get_tag_values('Parent'); | |
1920 | |
1921 delete $parents{$parentID}; | |
1922 my @parents = keys %parents; | |
1923 | |
1924 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start . | |
1925 ' to ' . $child->end . " cannot be a child of ID $parentID" | |
1926 if $DEBUG; | |
1927 | |
1928 $child->add_tag_value( validation_error => | |
1929 "feature cannot be a child of $parentID"); | |
1930 | |
1931 $child->remove_tag('Parent'); | |
1932 | |
1933 unless ($child->has_tag('ID')) { | |
1934 my $id = gene_name($child); | |
1935 $child->add_tag_value('ID', $id); | |
1936 push @{$hAll->{$id}}, $child | |
1937 } | |
1938 | |
1939 $child->add_tag_value('Parent', @parents) if @parents; | |
1940 } | |
1941 | |
1942 } | |
1943 | |
1944 #delete $aParents->[$_] for keys %parentsToRemove; | |
1945 splice(@$aParents, $_, 1) for keys %parentsToRemove; | |
1946 } | |
1947 | |
1948 sub id_validate { | |
1949 my ($hAll, $hReserved) = @_; | |
1950 | |
1951 | |
1952 for my $id (keys %$hAll) { | |
1953 | |
1954 #since 1 feature can have this id, | |
1955 #let's just shift it off and uniquify | |
1956 #the rest (unless it's reserved) | |
1957 | |
1958 shift @{$hAll->{$id}} unless $hReserved->{$id}; | |
1959 for my $feat (@{$hAll->{$id}}) { | |
1960 id_uniquify(0, $id, $feat, $hAll); | |
1961 } | |
1962 } | |
1963 | |
1964 for my $parentID (keys %$hReserved) { | |
1965 | |
1966 my @keys = keys %{$hReserved->{$parentID}}; | |
1967 | |
1968 shift @keys; | |
1969 | |
1970 for my $k (@keys) { | |
1971 my $parent = $hReserved->{$parentID}{$k}{'parent'}; | |
1972 my $aChildren= $hReserved->{$parentID}{$k}{'children'}; | |
1973 | |
1974 my $value = id_uniquify(0, $parentID, $parent, $hAll); | |
1975 for my $child (@$aChildren) { | |
1976 | |
1977 my %parents; | |
1978 map { $parents{$_}++ } $child->get_tag_values('Parent'); | |
1979 $child->remove_tag('Parent'); | |
1980 delete $parents{$parentID}; | |
1981 $parents{$value}++; | |
1982 my @parents = keys %parents; | |
1983 $child->add_tag_value('Parent', @parents); | |
1984 } | |
1985 | |
1986 } | |
1987 } | |
1988 } | |
1989 | |
1990 sub id_uniquify { | |
1991 my ($count, $value, $feat, $hAll) = @_; | |
1992 | |
1993 warn "UNIQUIFYING $value" if $DEBUG; | |
1994 | |
1995 if (! $count) { | |
1996 $feat->add_tag_value(Alias => $value); | |
1997 $value .= ('.' . $feat->primary_tag) | |
1998 } elsif ($count == 1) { | |
1999 $value .= ".$count" | |
2000 } else { | |
2001 chop $value; | |
2002 $value .= $count | |
2003 } | |
2004 $count++; | |
2005 | |
2006 warn "ENDED UP WITH $value" if $DEBUG; | |
2007 if ( $hAll->{$value} ) { | |
2008 warn "$value IS ALREADY TAKEN" if $DEBUG; | |
2009 id_uniquify($count, $value, $feat, $hAll); | |
2010 } else { | |
2011 #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end; | |
2012 $feat->remove_tag('ID'); | |
2013 $feat->add_tag_value('ID', $value); | |
2014 push @{$hAll->{$value}}, $value; | |
2015 } | |
2016 | |
2017 $value; | |
2018 } | |
2019 | |
2020 sub conf_read { | |
2021 | |
2022 print "\nCannot read $CONF. Change file permissions and retry, " . | |
2023 "or enter another file\n" and conf_locate() unless -r $CONF; | |
2024 | |
2025 print "\nCannot write $CONF. Change file permissions and retry, " . | |
2026 "or enter another file\n" and conf_locate() unless -w $CONF; | |
2027 | |
2028 $YAML = LoadFile($CONF); | |
2029 | |
2030 } | |
2031 | |
2032 sub conf_create { | |
2033 | |
2034 my ($path, $input) = @_; | |
2035 | |
2036 print "Cannot write to $path. Change directory permissions and retry " . | |
2037 "or enter another save path\n" and conf_locate() unless -w $path; | |
2038 | |
2039 $CONF = $input; | |
2040 | |
2041 open(FH, '>', $CONF); | |
2042 close(FH); | |
2043 conf_read(); | |
2044 | |
2045 | |
2046 } | |
2047 | |
2048 sub conf_write { DumpFile($CONF, $YAML) } | |
2049 | |
2050 sub conf_locate { | |
2051 | |
2052 print "\nEnter the location of a previously saved config, or a new " . | |
2053 "path and file name to create a new config (this step is " . | |
2054 "necessary to save any preferences)"; | |
2055 | |
2056 print "\n\nenter a command:"; | |
2057 | |
2058 while (<STDIN>) { | |
2059 chomp(my $input = $_); | |
2060 my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/); | |
2061 | |
2062 if (-e $input && (! -d $input)) { | |
2063 | |
2064 print "\nReading $input...\n"; | |
2065 $CONF = $input; | |
2066 | |
2067 conf_read(); | |
2068 last; | |
2069 | |
2070 } elsif (! -d $input && $fn.$suffix) { | |
2071 | |
2072 print "Creating $input...\n"; | |
2073 conf_create($path, $input); | |
2074 last; | |
2075 | |
2076 } elsif (-e $input && -d $input) { | |
2077 print "You only entered a directory. " . | |
2078 "Please enter BOTH a directory and filename\n"; | |
2079 } else { | |
2080 print "$input does not appear to be a valid path. Please enter a " . | |
2081 "valid directory and filename\n"; | |
2082 } | |
2083 print "\nenter a command:"; | |
2084 } | |
2085 } | |
2086 | |
2087 sub curation_save { | |
2088 | |
2089 my ($feat, $input) = @_; | |
2090 | |
2091 #my $error = "Enter the location of a previously saved config, or a new " . | |
2092 # "path and file name to create a new config (this step is " . | |
2093 # "necessary to save any preferences)\n"; | |
2094 | |
2095 if (!$CONF) { | |
2096 print "\n\n"; | |
2097 conf_locate(); | |
2098 } elsif (! -e $CONF) { | |
2099 print "\n\nThe config file you have chosen doesn't exist.\n"; | |
2100 conf_locate(); | |
2101 } else { conf_read() } | |
2102 | |
2103 my $entry = GenBank_entry($feat, "\r", 1); | |
2104 | |
2105 my $msg = "Term entered: $input"; | |
2106 my $directions = "Please select any/all tags that provide evidence for the term you | |
2107 have entered. You may enter multiple tags by separating them by commas/dashes | |
2108 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have | |
2109 the option of either selecting the entire note as evidence, or specific | |
2110 keywords. If a tag has multiple keywords, they will be tagged alphabetically | |
2111 for selection. To select a specific keyword in a tag field, you must enter the | |
2112 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be | |
2113 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have | |
2114 to be to match your curation. To match the GenBank entry exactly as it | |
2115 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your | |
2116 preference, you will no longer be prompted to choose a feature type for any | |
2117 matching entries until you edit the curation.ini file."; | |
2118 my $msg_copy = $msg; | |
2119 my $dir_copy = $directions; | |
2120 | |
2121 my $format = "format STDOUT = \n" . | |
2122 '-' x 156 . "\n" . | |
2123 '^' . '<' x 77 . '| Directions:' . "\n" . | |
2124 '$msg_copy' . "\n" . | |
2125 '-' x 156 . "\n" . | |
2126 ' ' x 78 . "|\n" . | |
2127 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" . | |
2128 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" . | |
2129 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" . | |
2130 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n"; | |
2131 | |
2132 { | |
2133 # eval throws redefined warning that breaks formatting. | |
2134 # Turning off warnings just for the eval to fix this. | |
2135 no warnings 'redefine'; | |
2136 eval $format; | |
2137 } | |
2138 | |
2139 write; | |
2140 print '-' x 156 . "\nenter a command:"; | |
2141 | |
2142 my @tags = words_tag($feat); | |
2143 | |
2144 my $final = {}; | |
2145 my $choices; | |
2146 while (<STDIN>) { | |
2147 | |
2148 chomp(my $choice = $_); | |
2149 | |
2150 if (scalar(keys %$final) && $choice =~ /^y/i) { last | |
2151 | |
2152 } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input) | |
2153 | |
2154 } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n"; | |
2155 | |
2156 } elsif ($choice eq 'all') { | |
2157 | |
2158 $choice = ''; | |
2159 for (my $i=1; $i < scalar(@tags); $i++) { | |
2160 $choice .= "$i,"; | |
2161 } | |
2162 chop $choice; | |
2163 } | |
2164 #print "CHOICE [$choice]"; | |
2165 | |
2166 my @selections; | |
2167 for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) { | |
2168 if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) { | |
2169 | |
2170 for ($1..$2) { push @selections, $_ } | |
2171 | |
2172 my $dangling_alphas = $3; | |
2173 alpha_expand($dangling_alphas, \@selections); | |
2174 | |
2175 } else { | |
2176 alpha_expand($_, \@selections); | |
2177 } | |
2178 } | |
2179 | |
2180 foreach my $numbers (@selections) { | |
2181 | |
2182 my @c = split(/(?=[\w])/, $numbers); | |
2183 s/\W+//g foreach @c; | |
2184 my $num; | |
2185 | |
2186 { | |
2187 $^W = 0; | |
2188 $num = 0 + shift @c; | |
2189 } | |
2190 | |
2191 my $tag = $tags[$num]; | |
2192 if (ref $tag && scalar(@c)) { | |
2193 my $no_value; | |
2194 foreach (@c) { | |
2195 if (defined $tag->{$_}) { | |
2196 $choices .= "${num}[$_] "; | |
2197 my ($t,$v) = @{$tag->{$_}}; | |
2198 push @{${$final->{$input}}[0]{$t}}, $v; | |
2199 | |
2200 } else { $no_value++ } | |
2201 } | |
2202 | |
2203 if ($no_value) { | |
2204 _selection_add($tag,$final,$input,\$choices,$num); | |
2205 #my ($t,$v) = @{$tag->{'all'}}; | |
2206 #unless (defined ${$final->{$input}}[0]{$t}) { | |
2207 #$choices .= "$num, "; | |
2208 #push @{${$final->{$input}}[0]{$t}}, $v | |
2209 #} | |
2210 } | |
2211 | |
2212 $choices = substr($choices, 0, -2); | |
2213 $choices .= ', '; | |
2214 | |
2215 } elsif (ref $tag) { | |
2216 _selection_add($tag,$final,$input,\$choices,$num); | |
2217 #my ($t,$v) = @{$tag->{'all'}}; | |
2218 #unless (defined ${$final->{$input}}[0]{$t}) { | |
2219 #$choices .= "$num, "; | |
2220 #push @{${$final->{$input}}[0]{$t}}, $v | |
2221 #} | |
2222 } | |
2223 } | |
2224 $choices = substr($choices, 0, -2) if $choices; | |
2225 if ($final) { | |
2226 print "\nYou have chosen the following tags:\n$choices\n"; | |
2227 print "This will be written to the config file as:\n"; | |
2228 print Dump $final; | |
2229 print "\nIs this correct? (y or n)\n"; | |
2230 } else { print "\nInvalid selection. Please try again\n" } | |
2231 } | |
2232 push @{$YAML->{$input}}, $final->{$input}; | |
2233 conf_write(); | |
2234 } | |
2235 | |
2236 # words_tag() splits each tag value string into multiple words so that the | |
2237 # user can select the parts he/she wants to use for curation | |
2238 # it can tag 702 (a - zz) separate words; this should be enough | |
2239 | |
2240 sub words_tag { | |
2241 | |
2242 my ($feat, $entry) = @_; | |
2243 | |
2244 my @tags; | |
2245 | |
2246 @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]}); | |
2247 my $i = 3; | |
2248 foreach my $tag ($feat->all_tags) { | |
2249 foreach my $value ($feat->each_tag_value($tag)) { | |
2250 | |
2251 my ($string, $tagged_string); | |
2252 | |
2253 my @words = split(/(?=\w+?)/, $value); | |
2254 | |
2255 my $pos = 0; | |
2256 | |
2257 | |
2258 foreach my $word (@words) { | |
2259 | |
2260 (my $sanitized_word = $word) =~ s/\W+?//g; | |
2261 $string .= $word; | |
2262 | |
2263 my $lead = int($pos/ALPHABET_DIVISOR); | |
2264 my $lag = $pos % ALPHABET_DIVISOR; | |
2265 | |
2266 my $a = $lead ? ${(ALPHABET)}[$lead-1] : ''; | |
2267 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a'; | |
2268 | |
2269 $tagged_string .= " ($a) $word"; | |
2270 | |
2271 $tags[$i]{$a} = [$tag, $sanitized_word]; | |
2272 $pos++; | |
2273 } | |
2274 | |
2275 $value = $tagged_string if scalar(@words) > 1; | |
2276 | |
2277 $$entry .= "[$i] /$tag=\"$value\"\r"; | |
2278 | |
2279 $tags[$i]{'all'} = [$tag, $string]; | |
2280 } | |
2281 $i++; | |
2282 } | |
2283 | |
2284 return @tags; | |
2285 | |
2286 } | |
2287 | |
2288 sub alpha_expand { | |
2289 | |
2290 my ($dangling_alphas, $selections) = @_; | |
2291 | |
2292 if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) { | |
2293 | |
2294 my $digit = $1; | |
2295 push @$selections, $digit if $digit; | |
2296 | |
2297 my $start = $2; | |
2298 my $stop = $3; | |
2299 | |
2300 my @starts = split('', $start); | |
2301 my @stops = split('', $stop); | |
2302 | |
2303 my ($final_start, $final_stop); | |
2304 | |
2305 for ([\$final_start, \@starts], [\$final_stop, \@stops]) { | |
2306 | |
2307 my ($final, $splits) = @$_; | |
2308 | |
2309 my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]}; | |
2310 my $rem; | |
2311 | |
2312 | |
2313 if ($$splits[1]) { | |
2314 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]}; | |
2315 $int++ | |
2316 } else { | |
2317 $rem = $int; | |
2318 $int = 0; | |
2319 } | |
2320 | |
2321 | |
2322 $$final = $int * ALPHABET_DIVISOR; | |
2323 $$final += $rem; | |
2324 | |
2325 } | |
2326 | |
2327 my $last_number = pop @$selections; | |
2328 for my $pos ($final_start..$final_stop) { | |
2329 my $lead = int($pos/ALPHABET_DIVISOR); | |
2330 my $lag = $pos % ALPHABET_DIVISOR; | |
2331 my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : ''; | |
2332 $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a'; | |
2333 push @$selections, $last_number.$alpha; | |
2334 } | |
2335 | |
2336 } elsif (defined($dangling_alphas)) { | |
2337 if ($dangling_alphas =~ /^\d/) { | |
2338 push @$selections, $dangling_alphas; | |
2339 } elsif ($dangling_alphas =~ /^\D/) { | |
2340 #print "$dangling_alphas ".Dumper @$selections; | |
2341 my $last_number = pop @$selections; | |
2342 $last_number ||= ''; | |
2343 push @$selections, $last_number.$dangling_alphas; | |
2344 #$$selections[-1] .= $dangling_alphas; | |
2345 } | |
2346 } | |
2347 | |
2348 } | |
2349 | |
2350 sub _selection_add { | |
2351 | |
2352 my ($tag, $final, $input, $choices, $num) = @_; | |
2353 my ($t,$v) = @{$tag->{'all'}}; | |
2354 unless (defined ${$final->{$input}}[0]{$t}) { | |
2355 $$choices .= "$num, "; | |
2356 push @{${$final->{$input}}[0]{$t}}, $v | |
2357 } | |
2358 | |
2359 } |