#!/opt/anaconda1anaconda2anaconda3/bin/perl

eval 'exec /opt/anaconda1anaconda2anaconda3/bin/perl -S $0 ${1+"$@"}'
    if 0; # not running under some shell



=pod

=head1 NAME

bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3

=head1 SYNOPSIS

  bp_genbank2gff3.pl [options] filename(s)

  # process a directory containing GenBank flatfiles
  perl bp_genbank2gff3.pl --dir path_to_files --zip

  # process a single file, ignore explicit exons and introns
  perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz

  # process a list of files
  perl bp_genbank2gff3.pl *gbk.gz

  # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
  curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
  | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
  | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata

  Options:
    --noinfer  -r  don't infer exon/mRNA subfeatures
    --conf     -i  path to the curation configuration file that contains user preferences
                   for Genbank entries (must be YAML format)
                   (if --manual is passed without --conf, user will be prompted to
                   create the file if any manual input is saved)
    --sofile   -l  path to the so.obo file to use for feature type mapping
                   (--sofile live will download the latest online revision)
    --manual   -m  when trying to guess the proper SO term, if more than
                   one option matches the primary tag, the converter will
                   wait for user input to choose the correct one
                   (only works with --sofile)
    --dir      -d  path to a directory of genbank flatfiles
    --outdir   -o  location to write GFF files (can be 'stdout' or '-' for pipe)
    --zip      -z  compress GFF3 output files with gzip
    --summary  -s  print a summary of the features in each contig
    --filter   -x  genbank feature type(s) to ignore
    --split    -y  split output to separate GFF and fasta files for
                   each genbank record
    --nolump   -n  separate file for each reference sequence
                   (default is to lump all records together into one
                   output file for each input file)
    --ethresh  -e  error threshold for unflattener
                   set this high (>2) to ignore all unflattener errors
    --[no]CDS  -c  Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
                   model. --CDS is default. Use --CDS to keep the default GFF gene model,
                   use --noCDS to convert to g-r-p-e.
    --format   -f  Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
                   (GenBank is default)
    --GFF_VERSION  3 is default; 2, 2.5 and other Bio::Tools::GFF versions available
    --quiet        don't talk about what is being processed
    --typesource   SO sequence type for source (e.g. chromosome; region; contig)
    --help     -h  display this message


=head1 DESCRIPTION

This script uses Bio::SeqFeature::Tools::Unflattener and
Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
containment hierarchies mapped for optimal display in gbrowse.

The input files are assumed to be gzipped GenBank flatfiles for refseq
contigs.  The files may contain multiple GenBank records.  Either a
single file or an entire directory can be processed.  By default, the
DNA sequence is embedded in the GFF, but it can be saved into a separate
fasta file with the --split (-y) option.

If an input file contains multiple records, the default behaviour is
to dump all GFF and sequence to a file of the same name (with .gff
appended).  Using the 'nolump' option will create a separate file for
each genbank record.  Using the 'split' option will create separate
GFF and Fasta files for each genbank record.
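
As a rough illustration (the file and record names below are only
examples), an input file mm_ref_chr19.gbk holding the two records
NT_082868 and NT_039687 would be written out approximately as follows:

  # default (lump): one GFF3 file per input file
  perl bp_genbank2gff3.pl mm_ref_chr19.gbk            # -> mm_ref_chr19.gbk.gff
  # --nolump: one GFF3 file per GenBank record, named by accession
  perl bp_genbank2gff3.pl --nolump mm_ref_chr19.gbk   # -> NT_082868.gff, NT_039687.gff
  # --split: per-record GFF3 plus a separate Fasta file
  perl bp_genbank2gff3.pl --split  mm_ref_chr19.gbk   # -> NT_082868.gff + NT_082868.fa, ...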

=head2 Notes

=head3 'split' and 'nolump' produce many files

In cases where the input files contain many GenBank records (for
example, the chromosome files for the mouse genome build), a very
large number of output files will be produced if the 'split' or
'nolump' options are selected.  If you do have lists of files E<gt> 6000,
use the --long_list option in bp_bulk_load_gff.pl or
bp_fast_load_gff.pl to load the gff and/or fasta files.

=head3 Designed for RefSeq

This script is designed for RefSeq genomic sequence entries.  It may
work for third party annotations but this has not been tested.
But see below: Uniprot/Swissprot works, and EMBL (and possibly EMBL/Ensembl)
works if you don't mind some gene model unflattener errors (dgg).

=head3 G-R-P-E Gene Model

Don Gilbert reworked this to produce GFF3 suited to
loading into GMOD Chado databases.  Most of the changes, I believe, are
suited for general use.  One main Chado-specific addition is the
--[no]cds2protein flag.

My favorite GFF is to set the above ON by default (disable with --nocds2prot).
For general use it probably should be OFF, enabled with --cds2prot.

This writes GFF with an alternate, but useful, gene model,
instead of the consensus model for GFF3:

  [ gene > mRNA > (exon,CDS,UTR) ]

This alternate is

  gene > mRNA > polypeptide > exon

which means the only feature with DNA bases is the exon.  The others
specify only location ranges on a genome.  The exon, of course, is a child
of both the mRNA and the protein/peptide.

The protein/polypeptide feature is an important one, having all the
annotations of the GenBank CDS feature: protein ID, translation, GO
terms, and Dbxrefs to other proteins.

UTRs, introns, and CDS-exons are all inferred from the primary exon bases
inside/outside appropriate higher feature ranges.  Other special gene
model features remain the same.
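
A minimal, made-up sketch of what the --noCDS (g-r-p-e) output looks like
(sequence name, feature IDs and coordinates below are purely illustrative):

  chrX  GenBank  gene         1000  3000  .  +  .  ID=abc-1
  chrX  GenBank  mRNA         1000  3000  .  +  .  ID=abc-1.t01;Parent=abc-1
  chrX  GenBank  polypeptide  1200  2800  .  +  .  ID=abc-1.p01;Derives_from=abc-1.t01
  chrX  GenBank  exon         1000  1500  .  +  .  Parent=abc-1.t01
  chrX  GenBank  exon         2000  3000  .  +  .  Parent=abc-1.t01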

Several other improvements and bugfixes, minor but useful, are included:

  * IO pipes now work:
    curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...

  * GenBank main record fields are added to the source feature, e.g. organism, date,
    and the sourcetype, commonly chromosome for genomes, is used.

  * Gene Model handling for ncRNA and pseudogenes is added.

  * GFF header is cleaner, more informative.
    --GFF_VERSION flag allows choice of v2 as well as default v3.

  * GFF ##FASTA inclusion is improved, and
    CDS translation sequence is moved to FASTA records.

  * FT -> GFF attribute mapping is improved.

  * --format choice of SeqIO input formats (GenBank default).
    Uniprot/Swissprot and EMBL work and produce useful GFF.

  * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
    and more flexible usage.

=head1 TODO

=head2 Are these additions desired?

  * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY)
  * handle Entrezgene, other non-sequence SeqIO structures (really should change
    those parsers to produce consistent annotation tags).

=head2 Related bugfixes/tests

These items from Bioperl mail were tested (sample data generating
errors), and found corrected:

  From: Ed Green <green <at> eva.mpg.de>
  Subject: genbank2gff3.pl on new human RefSeq
  Date: 2006-03-13 21:22:26 GMT
  -- unspecified errors (sample data works now).

  From: Eric Just <e-just <at> northwestern.edu>
  Subject: genbank2gff3.pl
  Date: 2007-01-26 17:08:49 GMT
  -- bug fixed in genbank2gff3 for multi-record handling

This error is for a /trans_splice gene that is hard to handle, and which
the unflattener/genbank2gff3 doesn't yet handle:

  From: Chad Matsalla <chad <at> dieselwurks.com>
  Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
  Date: 2005-07-15 19:51:48 GMT

=head1 AUTHOR

Sheldon McKay (mckays@cshl.edu)

Copyright (c) 2004 Cold Spring Harbor Laboratory.

=head2 AUTHOR of hacks for GFF2Chado loading

Don Gilbert (gilbertd@indiana.edu)


=cut

use strict;
use warnings;

use lib "$ENV{HOME}/bioperl-live";
# chad put this here to enable situations when this script is tested
# against bioperl compiled into blib along with other programs using blib
BEGIN {
    unshift(@INC,'blib/lib');
};
use Pod::Usage;
use Bio::Root::RootI;
use Bio::SeqIO;
use File::Spec;
use Bio::SeqFeature::Tools::Unflattener;
use Bio::SeqFeature::Tools::TypeMapper;
use Bio::SeqFeature::Tools::IDHandler;
use Bio::Location::SplitLocationI;
use Bio::Location::Simple;
use Bio::Tools::GFF;
use Getopt::Long;
use List::Util qw(first);
use Bio::OntologyIO;
use YAML qw(Dump LoadFile DumpFile);
use File::Basename;

use vars qw/$split @filter $zip $outdir $help $ethresh
            $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
            $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
            $file @files $dir $summary $nolump
            $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
            $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;

use constant SO_URL =>
    'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
use constant ALPHABET_TO_NUMBER => {
    a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
    j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
    r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
    z => 25,
};
use constant ALPHABET_DIVISOR => 26;
use constant GM_NEW_TOPLEVEL => 2;
use constant GM_NEW_PART => 1;
use constant GM_DUP_PART => 0;
use constant GM_NOT_PART => -1;

# Options cycle in multiples of 2 because of left side/right side pairing.
# You can make this number odd, but displayed matches will still round up
use constant OPTION_CYCLE => 6;

$GFF_VERSION = 3; # allow v2 ...
$verbose = 1;     # right default? -nov to turn off

# dgg: change the gene model to Gene/mRNA/Polypeptide/exons...
my $CDSkeep = 1;  # default should be ON (prior behavior), see gene_features()
my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
    # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support

my $FORMAT   = "GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
my $SOURCEID = $FORMAT;   # "UniProt" "GenBank" "EMBL" should work
    # other Bio::SeqIO formats may work.  TEST: EntrezGene < problematic tags; InterPro KEGG


my %TAG_MAP = (
    db_xref => 'Dbxref',
    name    => 'Name',
    note    => 'Note',  # also pull GO: ids into Ontology_term
    synonym => 'Alias',
    symbol  => 'Alias', # is symbol still used?
    # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
    # translation: handled in gene_features
);
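
# Illustrative example (tag values below are hypothetical): GenBank qualifiers
#   /db_xref="GeneID:1234"   /note="similar to a kinase"
# become the GFF3 attributes
#   Dbxref=GeneID:1234;Note=similar to a kinase
# (see maptags2gff() below, which applies this map and also lifts GO: ids
# found in /note into Ontology_term).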

$| = 1;
my $quiet = !$verbose;
my $ok = GetOptions( 'd|dir|input:s' => \$dir,
    'z|zip'      => \$zip,
    'h|help'     => \$help,
    's|summary'  => \$summary,
    'r|noinfer'  => \$noinfer,
    'i|conf=s'   => \$CONF,
    'sofile=s'   => \$SO_FILE,
    'm|manual'   => \$MANUAL,
    'o|outdir|output:s' => \$outdir,
    'x|filter:s' => \@filter,
    'y|split'    => \$split,
    "ethresh|e=s" => \$ethresh,
    'c|CDS!'     => \$CDSkeep,
    'f|format=s' => \$FORMAT,
    'typesource=s'  => \$source_type,
    'GFF_VERSION=s' => \$GFF_VERSION,
    'quiet!'     => \$quiet, # swap quiet to verbose
    'DEBUG!'     => \$DEBUG,
    'n|nolump'   => \$nolump);

my $lump = 1 unless $nolump || $split;
$verbose = !$quiet;

# look for help request
pod2usage(2) if $help || !$ok;

# keep SOURCEID as-is and change FORMAT for SeqIO types;
# note SeqIO uses file.suffix to guess type; not useful here
$SOURCEID = $FORMAT;
$FORMAT   = "swiss" if $FORMAT =~ /UniProt|trembl/;
$verbose  = 1 if($DEBUG);

# initialize handlers
my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
$unflattener->error_threshold($ethresh) if $ethresh;
$unflattener->verbose(1) if($DEBUG);
# $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
# ensembl parsing is still problematic, forget this

my $tm  = Bio::SeqFeature::Tools::TypeMapper->new;
my $idh = Bio::SeqFeature::Tools::IDHandler->new;

# dgg
$source_type ||= "region"; # should really parse from FT.source contents below

#my $FTSOmap = $tm->FT_SO_map();
my $FTSOmap;
my $FTSOsynonyms;

if (defined($SO_FILE) && $SO_FILE eq 'live') {
    print "\nDownloading the latest SO file from ".SO_URL."\n\n";
    use LWP::UserAgent;
    my $ua = LWP::UserAgent->new(timeout => 30);
    my $request = HTTP::Request->new(GET => SO_URL);
    my $response = $ua->request($request);

    if ($response->status_line =~ /200/) {
        use File::Temp qw/ tempfile /;
        my ($fh, $fn) = tempfile();
        print $fh $response->content;
        $SO_FILE = $fn;
    } else {
        print "Couldn't download SO file online...skipping validation.\n"
            . "HTTP Status was " . $response->status_line . "\n"
            and undef $SO_FILE
    }
}

if ($SO_FILE) {

    my (%terms, %syn);

    my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
    $ONTOLOGY = $parser->next_ontology();

    for ($ONTOLOGY->get_all_terms) {
        my $feat = $_;

        $terms{$feat->name} = $feat->name;
        #$terms{$feat->name} = $feat;

        my @syn = $_->each_synonym;

        push @{$syn{$_}}, $feat->name for @syn;
        #push @{$syn{$_}}, $feat for @syn;
    }

    $FTSOmap = \%terms;
    $FTSOsynonyms = \%syn;

    my %hardTerms = %{ $tm->FT_SO_map() };
    map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;

} else {
    my %terms = %{ $tm->FT_SO_map() };
    while (my ($k,$v) = each %terms) {
        $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
    }
}

$TYPE_MAP = $FTSOmap;
$SYN_MAP  = $FTSOsynonyms;
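
# Rough sketch of what these two maps hold (actual values depend on the SO
# file or TypeMapper release in use): $TYPE_MAP is keyed by feature-table
# primary tags and yields SO term names, along the lines of
#   misc_RNA   => 'processed_transcript',
#   pseudomRNA => 'pseudogenic_transcript',
# while $SYN_MAP (only populated when an SO file was parsed) maps quoted SO
# synonym strings to the corresponding SO term names.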


# #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")

# stringify filter list if applicable
my $filter = join ' ', @filter if @filter;

# determine input files
my $stdin = 0; # dgg: let dir == stdin == '-' for pipe use
if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
    $stdin = 1; $dir = ''; @files = ('stdin');

} elsif ( $dir ) {
    if ( -d $dir ) {
        opendir DIR, $dir or die "could not open $dir for reading: $!";
        @files = map { "$dir/$_"; } grep { /\.gb.*/ } readdir DIR;
        closedir DIR;
    }
    else {
        die "$dir is not a directory\n";
    }
}
else {
    @files = @ARGV;
    $dir = '';
}

# we should have some files by now
pod2usage(2) unless @files;


my $stdout = 0; # dgg: let outdir == stdout == '-' for pipe use
if ($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
    warn("std. output chosen: cannot split\n") if($split);
    warn("std. output chosen: cannot zip\n") if($zip);
    warn("std. output chosen: cannot nolump\n") if($nolump);
    $stdout = 1; $lump = 1; $split = 0; $zip = 0; # unless we pipe stdout thru gzip

} elsif ( $outdir && !-e $outdir ) {
    mkdir($outdir) or die "could not create directory $outdir: $!\n";
}
elsif ( !$outdir ) {
    $outdir = $dir || '.';
}

for my $file ( @files ) {
    # dgg ; allow 'stdin' / '-' input ?
    chomp $file;
    die "$! $file" unless($stdin || -e $file);
    print "# Input: $file\n" if($verbose);

    my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
    if ($stdout) {
        $lump_fh = *STDOUT; $lump = "stdout$$";
        $outfa = "stdout$$.fa";  # this is a temp file ... see below
        open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

    } elsif ( $lump ) {
        my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
        $lump   = File::Spec->catfile($outdir, $fileonly.'.gff');
        ($outfa = $lump) =~ s/\.gff/\.fa/;
        open $lump_fh,   ">$lump"  or die "Could not create a lump outfile called ($lump) because ($!)\n";
        open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

    }

    # open input file, unzip if req'd
    if ($stdin) {
        *FH = *STDIN;
    } elsif ( $file =~ /\.gz/ ) {
        open FH, "gunzip -c $file |";
    }
    else {
        open FH, '<', $file;
    }

    my $in    = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
    my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );

    while ( my $seq = $in->next_seq() ) {
        my $seq_name = $seq->accession_number;
        my $end = $seq->length;
        my @to_print;

        # arrange disposition of GFF output
        $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
        my $out;

        if ( $lump ) {
            $outfile = $lump;
            $out = $lump_fh;
        }
        else {
            $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
            open $out, ">$outfile";
        }

        # filter out unwanted features
        my $source_feat = undef;
        my @source = filter($seq); $source_feat = $source[0];

        ($source_type,$source_feat) =
            getSourceInfo( $seq, $source_type, $source_feat ) ;
        # always; here we build main prot $source_feat; # if @source;

        # abort if there are no features
        warn "$seq_name has no features, skipping\n" and next
            if !$seq->all_SeqFeatures;


        $FTSOmap->{'source'} = $source_type;
        ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features

        # construct a GFF header
        # add: get source_type from attributes of source feature? chromosome=X tag
        # also combine 1st ft line here with source ft from $seq ..
        my($header,$info) = gff_header($seq_name, $end, $source_type, $source_feat);
        print $out $header;
        print "# working on $info\n" if($verbose);

        # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
        unflatten_seq($seq);

        # Note that we use our own get_all_SeqFeatures function
        # to rescue cloned exons

        @GFF_LINE_FEAT = ();
        for my $feature ( get_all_SeqFeatures($seq) ) {

            my $method = $feature->primary_tag;
            next if($SOURCEID =~ /UniProt|swiss|trembl/i && $method ne $source_type);

            $feature->seq_id($seq->id) unless($feature->seq_id);
            $feature->source_tag($SOURCEID);

            # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
            ## also, pull any GO:000 ids from /note tag and put into Ontology_term
            maptags2gff($feature);

            # current gene name.  The unflattened gene features should be in order so any
            # exons, CDSs, etc that follow will belong to this gene
            my $gene_name;
            if ( $method eq 'gene' || $method eq 'pseudogene' ) {
                @to_print = print_held($out, $gffio, \@to_print);
                $gene_id = $gene_name = gene_name($feature);
            } else {
                $gene_name = gene_name($feature);
            }

            #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
            # be converted to or added as Name=xxx (if not ID= or as well)
            ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
            convert_to_name($feature);

            ## dgg: extended to protein|polypeptide
            ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
            ## in yeast have that genbank tag; why?
            ## these include pseudogene ...

            ## Note we also have mapped types to SO, so these RNA's are now transcripts:
            # pseudomRNA => "pseudogenic_transcript",
            # pseudotranscript" => "pseudogenic_transcript",
            # misc_RNA=>'processed_transcript',

            warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;

            if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
                 || ( $gene_id && $gene_name eq $gene_id ) ) {

                my $action = gene_features($feature, $gene_id, $gene_name);  # -1, 0, 1, 2 result
                if ($action == GM_DUP_PART) {
                    # ignore, this is dupl. exon with new parent ...

                } elsif ($action == GM_NOT_PART) {
                    add_generic_id( $feature, $gene_name, "nocount");
                    my $gff = $gffio->gff_string($feature);
                    push @GFF_LINE_FEAT, $feature;
                    #print $out "$gff\n" if $gff;

                } elsif ($action > 0) {
                    # hold off print because exon etc. may get 2nd, 3rd parents
                    @to_print = print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
                    push(@to_print, $feature);
                }

            }

            # otherwise handle as generic feats with IDHandler labels
            else {
                add_generic_id( $feature, $gene_name, "");
                my $gff = $gffio->gff_string($feature);
                push @GFF_LINE_FEAT, $feature;
                #print $out "$gff\n" if $gff;
            }
        }

        # don't like doing this after others; do after each new gene id?
        @to_print = print_held($out, $gffio, \@to_print);

        gff_validate(@GFF_LINE_FEAT);

        for my $feature (@GFF_LINE_FEAT) {
            my $gff = $gffio->gff_string($feature);
            print $out "$gff\n" if $gff;
        }

        # deal with the corresponding DNA
        my ($fa_out,$fa_outfile);
        my $dna = $seq->seq;
        if ($dna || %proteinfa) {
            $method{'RESIDUES'} += length($dna);
            $dna  =~ s/(\S{60})/$1\n/g;
            $dna .= "\n";

            if ($split) {
                $fa_outfile = $outfile;
                $fa_outfile =~ s/gff$/fa/;
                open $fa_out, ">$fa_outfile" or die $!;
                print $fa_out ">$seq_name\n$dna" if $dna;
                foreach my $aid (sort keys %proteinfa) {
                    my $aa = delete $proteinfa{$aid};
                    $method{'RESIDUES(tr)'} += length($aa);
                    $aa =~ s/(\S{60})/$1\n/g;
                    print $fa_out ">$aid\n$aa\n";
                }

            }
            else {
                ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
                ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
                ## maybe write this to temp .fa then cat to end of lumped gff $out
                print $lumpfa_fh ">$seq_name\n$dna" if $dna;
                foreach my $aid (sort keys %proteinfa) {
                    my $aa = delete $proteinfa{$aid};
                    $method{'RESIDUES(tr)'} += length($aa);
                    $aa =~ s/(\S{60})/$1\n/g;
                    print $lumpfa_fh ">$aid\n$aa\n";
                }
            }

            %proteinfa = ();
        }

        if ( $zip && !$lump ) {
            system "gzip -f $outfile";
            system "gzip -f $fa_outfile" if($fa_outfile);
            $outfile .= '.gz';
            $fa_outfile .= '.gz' if $split;
        }

        # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
        print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
        print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout || $lump);

        # dgg: moved to after all inputs; here it prints cumulative sum for each record
        #if ( $summary ) {
        #    print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
        #
        #    for ( keys %method ) {
        #        print "# $_  $method{$_}\n";
        #    }
        #    print "# \n";
        #}

    }

    print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
    if ( $summary ) {
        print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
        for ( keys %method ) {
            print "# $_  $method{$_}\n";
        }
        print "# \n";
    }

    ## FIXME for piped output w/ split FA files ...
    close($lumpfa_fh) if $lumpfa_fh;
    if (!$split && $outfa && $lump_fh) {
        print $lump_fh "##FASTA\n"; # GFF3 spec
        open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
        while ( <$lumpfa_fh> ) {
            print $lump_fh $_;
        } # is $lump_fh still open?
        close($lumpfa_fh);
        unlink($outfa);
    }


    if ( $zip && $lump ) {
        system "gzip -f $lump";
    }

    close FH;
}
|
|
677
|
|
678
|
|
679
|
|
680
|
|
681
|
|
sub typeorder {
    return 1 if ($_[0] =~ /gene/);
    return 2 if ($_[0] =~ /RNA|transcript/);
    return 3 if ($_[0] =~ /protein|peptide/);
    return 4 if ($_[0] =~ /exon|CDS/);
    return 3; # default before exon (smallest part)
}

sub sort_by_feattype {
    my($at,$bt) = ($a->primary_tag, $b->primary_tag);
    # note: 'return A or B' binds as '(return A) or B', so use || for the tie-break
    return (typeorder($at) <=> typeorder($bt))
        || ($at cmp $bt);
    ## or ($a->name() cmp $b->name());
}
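
# Example of the resulting feature order (primary tags only):
#   gene, mRNA, ncRNA, polypeptide, CDS, exon
# i.e. parent features always precede their exons/CDSs, which is what the
# chado bulk loader expects (see print_held below).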
|
|
696
|
|
697 sub print_held {
|
|
698 my($out,$gffio,$to_print)= @_;
|
|
699 return unless(@$to_print);
|
|
700 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes
|
|
701 while ( my $feature = shift @$to_print) {
|
|
702 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
|
|
703 push @GFF_LINE_FEAT, $feature;
|
|
704 #print $out "$gff\n";
|
|
705 }
|
|
706 return (); # @to_print
|
|
707 }
|
|
708
|
|
709 sub maptags2gff {
|
|
710 my $f = shift;
|
|
711 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
|
|
712 # but see below /gene /locus_tag usage
|
|
713 foreach my $tag (keys %TAG_MAP) {
|
|
714 if ($f->has_tag($tag)) {
|
|
715 my $newtag= $TAG_MAP{$tag};
|
|
716 my @v= $f->get_tag_values($tag);
|
|
717 $f->remove_tag($tag);
|
|
718 $f->add_tag_value($newtag,@v);
|
|
719
|
|
720 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
|
|
721 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
|
|
722 if ($tag eq 'note') {
|
|
723 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
|
|
724 my @go= map { m/(GO:\d+)/g } @v;
|
|
725 $f->add_tag_value('Ontology_term',@go) if(@go);
|
|
726 }
|
|
727
|
|
728 }
|
|
729 }
|
|
730 }
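
# Illustrative example (the note text is hypothetical): a feature carrying
#   /note="hypothetical protein [goid 0005886]"
# keeps that text as a Note attribute and, in addition, gains
#   Ontology_term=GO:0005886
# (bare "[goid NNNNNNN]" ids are normalised to the GO: form before extraction).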
|
|
731
|
|
732
|
|
733 sub getSourceInfo {
|
|
734 my ($seq, $source_type, $sf) = @_;
|
|
735
|
|
736 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
|
|
737 my $is_gene = ($SOURCEID =~/entrezgene/i);
|
|
738 my $is_rich = (ref($seq) =~ /RichSeq/);
|
|
739 my $seq_name= $seq->accession_number();
|
|
740
|
|
741 unless($sf) { # make one
|
|
742 $source_type= $is_swiss ? $PROTEIN_TYPE
|
|
743 : $is_gene ? "eneg" # "gene" # "region" #
|
|
744 : $is_rich ? $seq->molecule : $source_type;
|
|
745 $sf = Bio::SeqFeature::Generic->direct_new();
|
|
746
|
|
747 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
|
|
748 my $loc= $seq->can('location') ? $seq->location()
|
|
749 : new Bio::Location::Simple( -start => $start, -end => $len);
|
|
750 $sf->location( $loc );
|
|
751 $sf->primary_tag($source_type);
|
|
752 $sf->source_tag($SOURCEID);
|
|
753 $sf->seq_id( $seq_name);
|
|
754 #? $sf->display_name($seq->id()); ## Name or Alias ?
|
|
755 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
|
|
756 $seq->add_SeqFeature($sf);
|
|
757 ## $source_feat= $sf;
|
|
758 }
|
|
759
|
|
760 if ($sf->has_tag("chromosome")) {
|
|
761 $source_type= "chromosome";
|
|
762 my ($chrname) = $sf->get_tag_values("chromosome");
|
|
763 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
|
|
764 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
|
|
765 $sf->add_tag_value( Alias => $chrname );
|
|
766 }
|
|
767
|
|
768 # pull GB Comment, Description for source ft ...
|
|
769 # add reference - can be long, not plain string...
|
|
770 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
|
|
771 # GenBank fields: keyword,comment,reference,date_changed
|
|
772 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
|
|
773
|
|
774 # is this just for main $seq object or for all seqfeatures ?
|
|
775 my %AnnotTagMap= (
|
|
776 'gene_name' => 'Alias',
|
|
777 'ALIAS_SYMBOL' => 'Alias', # Entrezgene
|
|
778 'LOCUS_SYNONYM' => 'Alias', #?
|
|
779 'symbol' => 'Alias',
|
|
780 'synonym' => 'Alias',
|
|
781 'dblink' => 'Dbxref',
|
|
782 'product' => 'product',
|
|
783 'Reference' => 'reference',
|
|
784 'OntologyTerm' => 'Ontology_term',
|
|
785 'comment' => 'Note',
|
|
786 'comment1' => 'Note',
|
|
787 # various map-type locations
|
|
788 # gene accession tag is named per source db !??
|
|
789 # 'Index terms' => keywords ??
|
|
790 );
|
|
791
|
|
792
|
|
793 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
|
|
794 my ($date)= $seq->annotation->get_Annotations("dates")
|
|
795 || $seq->annotation->get_Annotations("update-date")
|
|
796 || $is_rich ? $seq->get_dates() : ();
|
|
797 my ($comment)= $seq->annotation->get_Annotations("comment");
|
|
798 my ($species)= $seq->annotation->get_Annotations("species");
|
|
799 if (!$species
|
|
800 && $seq->can('species')
|
|
801 && defined $seq->species()
|
|
802 && $seq->species()->can('binomial') ) {
|
|
803 $species= $seq->species()->binomial();
|
|
804 }
|
|
805
|
|
806 # update source feature with main GB fields
|
|
807 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
|
|
808 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
|
|
809 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
|
|
810 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
|
|
811 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
|
|
812
|
|
813 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
|
|
814
|
|
815 foreach my $atag (sort keys %AnnotTagMap) {
|
|
816 my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
|
|
817 my @anno = map{
|
|
818 if (ref $_ && $_->can('get_all_values')) {
|
|
819 split( /[,;] */, join ";", $_->get_all_values)
|
|
820 }
|
|
821 elsif (ref $_ && $_->can('display_text')) {
|
|
822 split( /[,;] */, $_->display_text)
|
|
823 }
|
|
824 elsif (ref $_ && $_->can('value')) {
|
|
825 split( /[,;] */, $_->value)
|
|
826 }
|
|
827 else {
|
|
828 ();
|
|
829 }
|
|
830 } $seq->annotation->get_Annotations($atag);
|
|
831 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
|
|
832 }
|
|
833
|
|
834 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
|
|
835 #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
|
|
836 #
|
|
837 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
|
|
838 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
|
|
839
|
|
840 return (wantarray)? ($source_type,$sf) : $source_type; #?
|
|
841 }
|
|
842
|
|
843
|
|
844 sub gene_features {
|
|
845 my ($f, $gene_id, $genelinkID) = @_;
|
|
846 local $_ = $f->primary_tag;
|
|
847 $method{$_}++;
|
|
848
|
|
849 if ( /gene/ ) {
|
|
850 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
|
|
851 $tnum = $rnum= 0; $ncrna_id= $rna_id = '';
|
|
852 return GM_NEW_TOPLEVEL;
|
|
853
|
|
854 } elsif ( /mRNA/ ) {
|
|
855 return GM_NOT_PART unless $gene_id;
|
|
856 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
|
|
857 ($rna_id = $gene_id ) =~ s/gene/mRNA/;
|
|
858 $rna_id .= '.t0' . ++$tnum;
|
|
859 $f->add_tag_value( ID => $rna_id );
|
|
860 $f->add_tag_value( Parent => $gene_id );
|
|
861
|
|
862 } elsif ( /RNA|transcript/) {
|
|
863 ## misc_RNA here; missing exons ... flattener problem?
|
|
864 # all of {t,nc,sn}RNA can have gene models now
|
|
865 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
|
|
866 ## CDS needs to use mRNA, not misc_RNA, rna_id ...
|
|
867 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
|
|
868
|
|
869 if($gene_id) {
|
|
870 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
|
|
871 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
|
|
872 $ncrna_id .= '.r0' . ++$rnum;
|
|
873 $f->add_tag_value( Parent => $gene_id );
|
|
874 $f->add_tag_value( ID => $ncrna_id );
|
|
875 } else {
|
|
876 unless ($f->has_tag('ID')) {
|
|
877 if($genelinkID) {
|
|
878 $f->add_tag_value( ID => $genelinkID ) ;
|
|
879 } else {
|
|
880 $idh->generate_unique_persistent_id($f);
|
|
881 }
|
|
882 }
|
|
883 ($ncrna_id)= $f->get_tag_values('ID');
|
|
884 return GM_NEW_TOPLEVEL;
|
|
885 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
|
|
886 }
|
|
887
|
|
888 } elsif ( /exon/ ) { # can belong to any kind of RNA
|
|
889 return GM_NOT_PART unless ($rna_id||$ncrna_id);
|
|
890 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
|
|
891 ## we are getting duplicate Parents here, which chokes chado loader, with reason...
|
|
892 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
|
|
893 ## check all Parents
|
|
894 for my $expar ($rna_id, $ncrna_id) {
|
|
895 next unless($expar);
|
|
896 if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
|
|
897 my @vals = $f->get_tag_values('Parent');
|
|
898 next if (grep {$expar eq $_} @vals);
|
|
899 }
|
|
900 $exonpar{$expar}++;
|
|
901 $f->add_tag_value( Parent => $expar);
|
|
902 # last; #? could be both
|
|
903 }
|
|
904 # now we can skip cloned exons
|
|
905 # dgg note: multiple parents get added and printed for each unique exon
|
|
906 return GM_DUP_PART if ++$seen{$f} > 1;
|
|
907
|
|
908 } elsif ( /CDS|protein|polypeptide/ ) {
|
|
909 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
|
|
910 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
|
|
911 (my $pro_id = $rna_id) =~ s/\.t/\.p/;
|
|
912
|
|
913 if( ! $CDSkeep && /CDS/) {
|
|
914 $f->primary_tag($PROTEIN_TYPE);
|
|
915
|
|
916 ## duplicate problem is Location ..
|
|
917 if ($f->location->isa("Bio::Location::SplitLocationI")) {
|
|
918 # my($b,$e)=($f->start, $f->end); # is this all we need?
|
|
919 my($b,$e)=(-1,0);
|
|
920 foreach my $l ($f->location->each_Location) {
|
|
921 $b = $l->start if($b<0 || $b > $l->start);
|
|
922 $e = $l->end if($e < $l->end);
|
|
923 }
|
|
924 $f->location( Bio::Location::Simple->new(
|
|
925 -start => $b, -end => $e, -strand => $f->strand) );
|
|
926 }
|
|
927
|
|
928 $f->add_tag_value( Derives_from => $rna_id );
|
|
929 }
|
|
930 else {
|
|
931 $f->add_tag_value( Parent => $rna_id );
|
|
932 }
|
|
933
|
|
934 $f->add_tag_value( ID => $pro_id );
|
|
935
|
|
936 move_translation_fasta($f, $pro_id);
|
|
937 #if( $f->has_tag('translation')) {
|
|
938 # my ($aa) = $f->get_tag_values("translation");
|
|
939 # $proteinfa{$pro_id}= $aa;
|
|
940 # $f->remove_tag("translation");
|
|
941 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
|
|
942 #}
|
|
943 } elsif ( /region/ ) {
|
|
944 $f->primary_tag('gene_component_region');
|
|
945 $f->add_tag_value( Parent => $gene_id );
|
|
946 } else {
|
|
947 return GM_NOT_PART unless $gene_id;
|
|
948 $f->add_tag_value( Parent => $gene_id );
|
|
949 }
|
|
950
|
|
951 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
|
|
952
|
|
953 return GM_NEW_PART;
|
|
954 }
|
|
955
|
|
956 ## was generic_features > add_generic_id
|
|
957 sub add_generic_id {
|
|
958 my ($f, $ft_name, $flags) = @_;
|
|
959 my $method = $f->primary_tag;
|
|
960 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
|
|
961
|
|
962 if ($f->has_tag('ID')) {
|
|
963
|
|
964 }
|
|
965 elsif ( $f->has_tag($method) ) {
|
|
966 my ($name) = $f->get_tag_values($method);
|
|
967 $f->add_tag_value( ID => "$method:$name" );
|
|
968 }
|
|
969 elsif($ft_name) { # is this unique ?
|
|
970 $f->add_tag_value( ID => $ft_name );
|
|
971 }
|
|
972 else {
|
|
973 $idh->generate_unique_persistent_id($f);
|
|
974 }
|
|
975
|
|
976 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
|
|
977 if($method =~ /CDS/);
|
|
978
|
|
979 # return $io->gff_string($f);
|
|
980 }
|
|
981
|
|
982 sub move_translation_fasta {
|
|
983 my ($f, $ft_id) = @_;
|
|
984 if( $ft_id && $f->has_tag('translation') ) {
|
|
985 my ($aa) = $f->get_tag_values("translation");
|
|
986 if($aa && $aa !~ /^length/) {
|
|
987 $proteinfa{$ft_id}= $aa;
|
|
988 $f->remove_tag("translation");
|
|
989 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
|
|
990 }
|
|
991 }
|
|
992 }
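
# Illustrative effect (ID and sequence are hypothetical): a CDS feature with
# ID abc-1.p01 and /translation="MSTN...K" has the raw peptide removed from
# its GFF attributes (replaced by translation=length.<aa length>), while the
# peptide itself is stored in %proteinfa and later written as a
#   >abc-1.p01
# record in the GFF ##FASTA section or the --split .fa file.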
|
|
993
|
|
994 sub gff_header {
|
|
995 my ($name, $end, $source_type, $source_feat) = @_;
|
|
996 $source_type ||= "region";
|
|
997
|
|
998 my $info = "$source_type:$name";
|
|
999 my $head = "##gff-version $GFF_VERSION\n".
|
|
1000 "##sequence-region $name 1 $end\n".
|
|
1001 "# conversion-by bp_genbank2gff3.pl\n";
|
|
1002 if ($source_feat) {
|
|
1003 ## dgg: these header comment fields are not useful when have multi-records, diff organisms
|
|
1004 for my $key (qw(organism Note date)) {
|
|
1005 my $value;
|
|
1006 if ($source_feat->has_tag($key)) {
|
|
1007 ($value) = $source_feat->get_tag_values($key);
|
|
1008 }
|
|
1009 if ($value) {
|
|
1010 $head .= "# $key $value\n";
|
|
1011 $info .= ", $value";
|
|
1012 }
|
|
1013 }
|
|
1014 $head = "" if $didheader;
|
|
1015 } else {
|
|
1016 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
|
|
1017 }
|
|
1018 $didheader++;
|
|
1019 return (wantarray) ? ($head,$info) : $head;
|
|
1020 }
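
# Sketch of the header this returns for the first record of an output file:
#   ##gff-version 3
#   ##sequence-region <accession> 1 <sequence length>
#   # conversion-by bp_genbank2gff3.pl
#   # organism <organism>   (organism/Note/date lines appear only when the
#                            source feature carries those tags)
# When no source feature is supplied, a one-line <source_type> feature
# spanning the whole sequence is appended instead.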
|
|
1021
|
|
1022 sub unflatten_seq {
|
|
1023 my $seq = shift;
|
|
1024
|
|
1025 ## print "# working on $source_type:", $seq->accession, "\n";
|
|
    my $uh_oh = "Possible gene unflattening error with " . $seq->accession_number .
        ": consult STDERR\n";
|
|
1028
|
|
1029 eval {
|
|
1030 $unflattener->unflatten_seq( -seq => $seq,
|
|
1031 -noinfer => $noinfer,
|
|
1032 -use_magic => 1 );
|
|
1033 };
|
|
1034
|
|
1035 # deal with unflattening errors
|
|
1036 if ( $@ ) {
|
|
1037 warn $seq->accession_number . " Unflattening error:\n";
|
|
1038 warn "Details: $@\n";
|
|
1039 print "# ".$uh_oh;
|
|
1040 }
|
|
1041
|
|
1042 return 0 if !$seq || !$seq->all_SeqFeatures;
|
|
1043
|
|
1044 # map feature types to the sequence ontology
|
|
1045 ## $tm->map_types_to_SO( -seq => $seq );
|
|
1046 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
|
|
1047
|
|
1048 map_types(
|
|
1049 $tm,
|
|
1050 -seq => $seq,
|
|
1051 -type_map => $FTSOmap,
|
|
1052 -syn_map => $FTSOsynonyms,
|
|
1053 -undefined => "region"
|
|
1054 ); #nml
|
|
1055
|
|
1056 }
|
|
1057
|
|
1058 sub filter {
|
|
1059 my $seq = shift;
|
|
1060 ## return unless $filter;
|
|
1061 my @feats;
|
|
1062 my @sources; # dgg; pick source features here; only 1 always?
|
|
1063 if ($filter) {
|
|
1064 for my $f ( $seq->remove_SeqFeatures ) {
|
|
1065 my $m = $f->primary_tag;
|
|
1066 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
|
|
1067 push @feats, $f unless $filter =~ /$m/i;
|
|
1068 }
|
|
1069 $seq->add_SeqFeature($_) foreach @feats;
|
|
1070 } else {
|
|
1071 for my $f ( $seq->get_SeqFeatures ){
|
|
1072 my $m = $f->primary_tag;
|
|
1073 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
|
|
1074 }
|
|
1075 }
|
|
1076
|
|
1077 return @sources;
|
|
1078 }
|
|
1079
|
|
1080
|
|
1081 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
|
|
1082 # changed to filter out cloned features. We have to implement the old
|
|
1083 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
|
|
1084 sub get_all_SeqFeatures {
|
|
1085 my $seq = shift;
|
|
1086 my @flatarr;
|
|
1087
|
|
1088 foreach my $feat ( $seq->get_SeqFeatures ){
|
|
1089 push(@flatarr,$feat);
|
|
1090 _add_flattened_SeqFeatures(\@flatarr,$feat);
|
|
1091 }
|
|
1092 return @flatarr;
|
|
1093 }
|
|
1094
|
|
1095 sub gene_name {
|
|
1096 my $g = shift;
|
|
1097 my $gene_id = ''; # zero it;
|
|
1098
|
|
1099 if ($g->has_tag('locus_tag')) {
|
|
1100 ($gene_id) = $g->get_tag_values('locus_tag');
|
|
1101 }
|
|
1102
|
|
1103 elsif ($g->has_tag('gene')) {
|
|
1104 ($gene_id) = $g->get_tag_values('gene');
|
|
1105 }
|
|
1106 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
|
|
1107 ($gene_id) = $g->get_tag_values('ID');
|
|
1108 }
|
|
1109
|
|
1110 ## See Unflattener comment:
|
|
1111 # on rare occasions, records will have no /gene or /locus_tag
|
|
1112 # but it WILL have /product tags. These serve the same purpose
|
|
1113 # for grouping. For an example, see AY763288 (also in t/data)
|
|
1114 # eg. product=tRNA-Asp ; product=similar to crooked neck protein
|
|
1115 elsif ($g->has_tag('product')) {
|
|
1116 my ($name)= $g->get_tag_values('product');
|
|
1117 ($gene_id) = $name unless($name =~ / /); # a description not name
|
|
1118 }
|
|
1119
|
|
1120 ## dgg; also handle transposon=xxxx ID/name
|
|
1121 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
|
|
1122 elsif ($g->has_tag('transposon')) {
|
|
1123 my ($name)= $g->get_tag_values('transposon');
|
|
1124 ($gene_id) = $name unless($name =~ / /); # a description not name
|
|
1125 }
|
|
1126
|
|
1127 return $gene_id;
|
|
1128 }
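
# Illustrative precedence (tag values are made up): for a feature carrying
# both /locus_tag="ABC0001" and /gene="abc-1", gene_name() returns the
# locus_tag value "ABC0001"; /gene, ID, /product and /transposon are only
# consulted, in that order, when the earlier tags are absent.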
|
|
1129
|
|
1130 # same list as gene_name .. change tag to generic Name
|
|
1131 sub convert_to_name {
|
|
1132 my $g = shift;
|
|
1133 my $gene_id = ''; # zero it;
|
|
1134
|
|
1135 if ($g->has_tag('gene')) {
|
|
1136 ($gene_id) = $g->get_tag_values('gene');
|
|
1137 $g->remove_tag('gene');
|
|
1138 $g->add_tag_value('Name', $gene_id);
|
|
1139 }
|
|
1140 elsif ($g->has_tag('locus_tag')) {
|
|
1141 ($gene_id) = $g->get_tag_values('locus_tag');
|
|
1142 $g->remove_tag('locus_tag');
|
|
1143 $g->add_tag_value('Name', $gene_id);
|
|
1144 }
|
|
1145
|
|
1146 elsif ($g->has_tag('product')) {
|
|
1147 my ($name)= $g->get_tag_values('product');
|
|
1148 ($gene_id) = $name unless($name =~ / /); # a description not name
|
|
1149 ## $g->remove_tag('product');
|
|
1150 $g->add_tag_value('Name', $gene_id);
|
|
1151 }
|
|
1152
|
|
1153 elsif ($g->has_tag('transposon')) {
|
|
1154 my ($name)= $g->get_tag_values('transposon');
|
|
1155 ($gene_id) = $name unless($name =~ / /); # a description not name
|
|
1156 ## $g->remove_tag('transposon');
|
|
1157 $g->add_tag_value('Name', $gene_id);
|
|
1158 }
|
|
1159 elsif ($g->has_tag('ID')) {
|
|
1160 my ($name)= $g->get_tag_values('ID');
|
|
1161 $g->add_tag_value('Name', $name);
|
|
1162 }
|
|
1163 return $gene_id;
|
|
1164 }
|
|
1165
|
|
1166
|
|
1167 sub _add_flattened_SeqFeatures {
|
|
1168 my ($arrayref,$feat) = @_;
|
|
1169 my @subs = ();
|
|
1170
|
|
1171 if ($feat->isa("Bio::FeatureHolderI")) {
|
|
1172 @subs = $feat->get_SeqFeatures;
|
|
1173 }
|
|
1174 elsif ($feat->isa("Bio::SeqFeatureI")) {
|
|
1175 @subs = $feat->sub_SeqFeature;
|
|
1176 }
|
|
1177 else {
|
|
1178 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
|
|
1179 "Don't know how to flatten.";
|
|
1180 }
|
|
1181
|
|
1182 for my $sub (@subs) {
|
|
1183 push(@$arrayref,$sub);
|
|
1184 _add_flattened_SeqFeatures($arrayref,$sub);
|
|
1185 }
|
|
1186
|
|
1187 }
|
|
1188
|
|
1189 sub map_types {
|
|
1190
|
|
1191 my ($self, @args) = @_;
|
|
1192
|
|
1193 my($sf, $seq, $type_map, $syn_map, $undefmap) =
|
|
1194 $self->_rearrange([qw(FEATURE
|
|
1195 SEQ
|
|
1196 TYPE_MAP
|
|
1197 SYN_MAP
|
|
1198 UNDEFINED
|
|
1199 )],
|
|
1200 @args);
|
|
1201
|
|
1202 if (!$sf && !$seq) {
|
|
1203 $self->throw("you need to pass in either -feature or -seq");
|
|
1204 }
|
|
1205
|
|
1206 my @sfs = ($sf);
|
|
1207 if ($seq) {
|
|
1208 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
|
|
1209 @sfs = $seq->get_all_SeqFeatures;
|
|
1210 }
|
|
1211 $type_map = $type_map || $self->typemap; # dgg: was type_map;
|
|
1212 foreach my $feat (@sfs) {
|
|
1213
|
|
1214 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
|
|
1215 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
|
|
1216
|
|
1217 my $primary_tag = $feat->primary_tag;
|
|
1218
|
|
1219 #if ($primary_tag =~ /^pseudo(.*)$/) {
|
|
1220 # $primary_tag = $1;
|
|
1221 # $feat->primary_tag($primary_tag);
|
|
1222 #}
|
|
1223
|
|
1224 my $mtype = $type_map->{$primary_tag};
|
|
1225 if ($mtype) {
|
|
1226 if (ref($mtype)) {
|
|
1227 if (ref($mtype) eq 'ARRAY') {
|
|
1228 my $soID;
|
|
1229 ($mtype, $soID) = @$mtype;
|
|
1230
|
|
1231 if ($soID && ref($ONTOLOGY)) {
|
|
1232 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
|
|
1233 $mtype = $term->name if $term;
|
|
1234 }
|
|
1235 # if SO ID is undefined AND we have an ontology to search, we want to delete
|
|
1236 # the feature type hash entry in order to force a fuzzy search
|
|
1237 elsif (! defined $soID && ref($ONTOLOGY)) {
|
|
1238 undef $mtype;
|
|
1239 delete $type_map->{$primary_tag};
|
|
1240 }
|
|
1241 elsif ($undefmap && $mtype eq 'undefined') { # dgg
|
|
1242 $mtype= $undefmap;
|
|
1243 }
|
|
1244
|
|
1245 $type_map->{$primary_tag} = $mtype if $mtype;
|
|
1246 }
|
|
1247 elsif (ref($mtype) eq 'CODE') {
|
|
1248 $mtype = $mtype->($feat);
|
|
1249 }
|
|
1250 else {
|
|
1251 $self->throw('must be scalar or CODE ref');
|
|
1252 }
|
|
1253 }
|
|
1254 elsif ($undefmap && $mtype eq 'undefined') { # dgg
|
|
1255 $mtype= $undefmap;
|
|
1256 }
|
|
1257 $feat->primary_tag($mtype);
|
|
1258 }
|
|
1259
|
|
1260 if ($CONF) {
|
|
1261 conf_read();
|
|
1262 my %perfect_matches;
|
|
1263 while (my ($p_tag,$rules) = each %$YAML) {
|
|
1264 RULE:
|
|
1265 for my $rule (@$rules) {
|
|
1266 for my $tags (@$rule) {
|
|
1267 while (my ($tag,$values) = each %$tags) {
|
|
1268 for my $value (@$values) {
|
|
1269 if ($feat->has_tag($tag)) {
|
|
1270 for ($feat->get_tag_values($tag)) {
|
|
1271 next RULE unless $_ =~ /\Q$value\E/;
|
|
1272 }
|
|
1273 } elsif ($tag eq 'primary_tag') {
|
|
1274 next RULE unless $value eq
|
|
1275 $feat->primary_tag;
|
|
1276 } elsif ($tag eq 'location') {
|
|
1277 next RULE unless $value eq
|
|
1278 $feat->start.'..'.$feat->end;
|
|
1279 } else { next RULE }
|
|
1280 }
|
|
1281 }
|
|
1282 }
|
|
1283 $perfect_matches{$p_tag}++;
|
|
1284 }
|
|
1285 }
|
|
1286 if (scalar(keys %perfect_matches) == 1) {
|
|
1287 $mtype = $_ for keys %perfect_matches;
|
|
1288 } elsif (scalar(keys %perfect_matches) > 1) {
|
|
1289 warn "There are conflicting rules in the config file for the" .
|
|
1290 " following types: ";
|
|
1291 warn "\t$_\n" for keys %perfect_matches;
|
|
1292 warn "Until conflict resolution is built into the converter," .
|
|
1293 " you will have to manually edit the config file to remove the" .
|
|
1294 " conflict. Sorry :(. Skipping user preference for this entry";
|
|
1295 sleep(2);
|
|
1296 }
|
|
1297 }
|
|
1298
|
|
1299 if ( ! $mtype && $syn_map) {
|
|
1300 if ($feat->has_tag('note')) {
|
|
1301
|
|
1302 my @all_matches;
|
|
1303
|
|
1304 my @note = $feat->each_tag_value('note');
|
|
1305
|
|
1306 for my $k (keys %$syn_map) {
|
|
1307
|
|
1308 if ($k =~ /"(.+)"/) {
|
|
1309
|
|
1310 my $syn = $1;
|
|
1311
|
|
1312 for my $note (@note) {
|
|
1313
|
|
1314 # look through the notes to see if the description
|
|
1315 # is an exact match for synonyms
|
|
1316 if ( $syn eq $note ) {
|
|
1317
|
|
1318 my @map = @{$syn_map->{$k}};
|
|
1319
|
|
1320
|
|
1321 my $best_guess = $map[0];
|
|
1322
|
|
1323 unshift @{$all_matches[-1]}, [$best_guess];
|
|
1324
|
|
1325 $mtype = $MANUAL
|
|
1326 ? manual_curation($feat, $best_guess, \@all_matches)
|
|
1327 : $best_guess;
|
|
1328
|
|
1329 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
|
|
1330 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
|
|
1331 . '#' x 78 . "\n\n";
|
|
1332
|
|
1333 } else {
|
|
1334 # check both primary tag and and note against
|
|
1335 # SO synonyms for best matching description
|
|
1336
|
|
1337 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
|
|
1338 }
|
|
1339
|
|
1340 }
|
|
1341 }
|
|
1342 }
|
|
1343
|
|
1344 #unless ($mtype) {
|
|
1345 for my $note (@note) {
|
|
1346 for my $name (values %$type_map) {
|
|
1347 # check primary tag against SO names for best matching
|
|
1348 # descriptions //NML also need to check against
|
|
1349 # definition && camel case split terms
|
|
1350
|
|
1351 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
|
|
1352 }
|
|
1353 }
|
|
1354 #}
|
|
1355
|
|
1356 if (scalar(@all_matches) && !$mtype) {
|
|
1357
|
|
1358 my $top_matches = first { defined $_ } @{$all_matches[-1]};
|
|
1359
|
|
1360 my $best_guess = $top_matches->[0];
|
|
1361
|
|
1362
|
|
1363
|
|
1364 # if guess has quotes, it is a synonym term. we need to
|
|
1365 # look up the corresponding name term
|
|
1366 # otherwise, guess is a name, so we can use it directly
|
|
1367 if ($best_guess =~ /"(.+)"/) {
|
|
1368
|
|
1369 $best_guess = $syn_map->{$best_guess}->[0];
|
|
1370
|
|
1371 }
|
|
1372
|
|
1373 @RETURN = @all_matches;
|
|
1374 $mtype = $MANUAL
|
|
1375 ? manual_curation($feat, $best_guess, \@all_matches)
|
|
1376 : $best_guess;
|
|
1377
|
|
1378 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
|
|
1379 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
|
|
1380 . '#' x 78 . "\n\n";
|
|
1381
|
|
1382 }
|
|
1383 }
|
|
1384 $mtype ||= $undefmap;
|
|
1385 $feat->primary_tag($mtype);
|
|
1386 }
|
|
1387 }
|
|
1388
|
|
1389
|
|
1390 }
|
|
1391
|
|
1392 sub SO_fuzzy_match {
|
|
1393
|
|
1394 my $candidate = shift;
|
|
1395 my $primary_tag = shift;
|
|
1396 my $note = shift;
|
|
1397 my $SO_terms = shift;
|
|
1398 my $best_matches_ref = shift;
|
|
1399 my $modifier = shift;
|
|
1400
|
|
1401 $modifier ||= '';
|
|
1402
|
|
1403 my @feat_terms;
|
|
1404
|
|
1405 for ( split(" |_", $primary_tag) ) {
|
|
1406 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
|
|
1407 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
|
|
1408 push @feat_terms, @camelCase;
|
|
1409 }
|
|
1410
|
|
1411 for ( split(" |_", $note) ) {
|
|
1412 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
|
|
1413 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
|
|
1414 (my $word = $_) =~ s/[;:.,]//g;
|
|
1415 push @feat_terms, $word;
|
|
1416 }
|
|
1417
|
|
1418
|
|
1419 my @SO_terms = split(" |_", $SO_terms);
|
|
1420
|
|
1421 # fuzzy match works on a simple point system. When 2 words match,
|
|
1422 # the $plus counter adds one. When they don't, the $minus counter adds
|
|
1423 # one. This is used to sort similar matches together. Better matches
|
|
1424 # are found at the end of the array, near the top.
|
|
1425
|
|
1426 # NML: can we improve best match by using synonym tags
|
|
1427 # EXACT,RELATED,NARROW,BROAD?
|
|
1428
|
|
1429 my ($plus, $minus) = (0, 0);
|
|
1430 my %feat_terms;
|
|
1431 my %SO_terms;
|
|
1432
|
|
1433 #unique terms
|
|
1434 map {$feat_terms{$_} = 1} @feat_terms;
|
|
1435 map {$SO_terms{$_} = 1} @SO_terms;
|
|
1436
|
|
1437 for my $st (keys %SO_terms) {
|
|
1438 for my $ft (keys %feat_terms) {
|
|
1439
|
|
1440 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
|
|
1441
|
|
1442 }
|
|
1443 }
|
|
1444
|
|
1445 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
|
|
1446
|
|
1447 }
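
# Worked example of the scoring (hypothetical words): with feature words
# {stem, loop} and SO term words {stem, loop} (from "stem_loop"), every pair
# is compared, giving $plus = 2 and $minus = 2, so the candidate is pushed
# onto $$best_matches_ref[2][2]; a candidate sharing only one of its two
# words would land at [1][3] and therefore rank as a weaker match.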
|
|
1448
|
|
1449 sub manual_curation {
|
|
1450
|
|
1451 my ($feat, $default_opt, $all_matches) = @_;
|
|
1452
|
|
1453 my @all_matches = @$all_matches;
|
|
1454
|
|
1455 # convert all SO synonyms into names and filter
|
|
1456 # all matches into unique term list because
|
|
1457 # synonyms can map to multiple duplicate names
|
|
1458
|
|
1459 my (@unique_SO_terms, %seen);
|
|
1460 for (reverse @all_matches) {
|
|
1461 for (@$_) {
|
|
1462 for (@$_) {
|
|
1463 #my @names;
|
|
1464 if ($_ =~ /"(.+)"/) {
|
|
1465 for (@{$SYN_MAP->{$_}}) {
|
|
1466 push @unique_SO_terms, $_ unless $seen{$_};
|
|
1467 $seen{$_}++;
|
|
1468 }
|
|
1469 } else {
|
|
1470 push @unique_SO_terms, $_ unless $seen{$_};
|
|
1471 $seen{$_}++;
|
|
1472 }
|
|
1473 }
|
|
1474 }
|
|
1475 }
|
|
1476
|
|
1477 my $s = scalar(@unique_SO_terms);
|
|
1478
|
|
1479 my $choice = 0;
|
|
1480
|
|
1481 my $more =
|
|
1482 "[a]uto : automatic input (selects best guess for remaining entries)\r" .
|
|
1483 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
|
|
1484 "[i]nput : add a specific term\r" .
|
|
1485 "[r]eset : reset to the beginning of matches\r" .
|
|
1486 "[s]kip : skip this entry (selects best guess for this entry)\r"
|
|
1487 ;
|
|
1488
|
|
1489 $more .=
|
|
1490 "[n]ext : view the next ".OPTION_CYCLE." terms\r" .
|
|
1491 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
|
|
1492
|
|
1493 my $msg = #"\n\n" . '-' x 156 . "\n"
|
|
1494 "The converter found $s possible matches for the following GenBank entry: ";
|
|
1495
|
|
1496 my $directions =
|
|
1497 "Type a number to select the SO term that best matches"
|
|
1498 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
|
|
1499
|
|
1500
|
|
1501 # lookup filtered list to pull out definitions
|
|
1502 my @options = map {
|
|
1503 my $term = $_;
|
|
1504 my %term;
|
|
1505 for (['name', 'name'], ['def', 'definition'], ['synonym',
|
|
1506 'each_synonym']) {
|
|
1507 my ($label, $method) = @$_;
|
|
1508 $term{$label} = \@{[$term->$method]};
|
|
1509 }
|
|
1510 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
|
|
1511 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
|
|
1512
|
|
1513
|
|
1514 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
|
|
1515 $default_opt, @options);
|
|
1516
|
|
1517 if ($option eq 'skip') { return $default_opt
|
|
1518 } elsif ($option eq 'auto') {
|
|
1519 $MANUAL = 0;
|
|
1520 return $default_opt;
|
|
1521 } else { return $option }
|
|
1522
|
|
1523 }
|
|
1524
|
|
sub options_cycle {

    my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;

    #NML: really should only call GenBank_entry once. Will need to change
    #method to return array & shift off header
    my $entry = GenBank_entry($feat, "\r");

    my $total = scalar(@opt);

    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ($start < 0) && ($stop > 0) );

    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);

    ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
    ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;

    $stop = $total if $stop > $total;

    my $dir_copy = $directions;
    my $msg_copy = $msg;
    my $format = "format STDOUT = \n" .
        '-' x 156 . "\n" .
        '^' . '<' x 77 . '| Available Commands:' . "\n" .
        '$msg_copy' . "\n" .
        '-' x 156 . "\n" .
        ' ' x 78 . "|\n" .
        '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
        '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
        (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
        ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;

    print '-' x 156 . "\n" .
        'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
        " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
        "\n" . '-' x 156 . "\n";

    for (my $i = $start; $i < $stop; $i+=2) {

        my ($left, $right) = @opt[$i,$i+1];

        my ($nL, $nmL, $descL, $termL, @synL) = @$left;

        #odd numbered lists can cause fatal undefined errors, so check
        #to make sure we have data
        my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);


        my $format = "format STDOUT = \n";

        $format .=
            ' ' x 78 . "|\n" .

            '@>>: name: ^' . '<' x 64 . '~' . ' |' .
            ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
            '$nL,' . ' ' x 7 . '$nmL,' .
            ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .

            ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$nmL,' .
            (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
            #' ' x 78 . '|' . "\n" .

            ' def: ^' . '<' x 65 . ' |' .
            (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

            (' ^' . '<' x 65 . ' |' .
            (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .

            ' ^' . '<' x 61 . '...~ |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

            ".\n";

        {
            # eval throws redefined warning that breaks formatting.
            # Turning off warnings just for the eval to fix this.
            no warnings 'redefine';
            eval $format;
        }
        write;

    }
    print '-' x 156 . "\nenter a command:";

    while (<STDIN>) {

        (my $input = $_) =~ s/\s+$//;

        if ($input =~ /^\d+$/) {
            if ( $input && defined $opt[$input-1] ) {
                return $opt[$input-1]->[1]
            } else {
                print "\nThat number is not an option. Please enter a valid number.\n:";
            }
        } elsif ($input =~ /^n/i || $input =~ /next/i ) {
            return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
                $feat, $directions, $best_guess, @opt)
        } elsif ($input =~ /^p/i || $input =~ /prev/i ) {
            return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
                $feat, $directions, $best_guess, @opt)
        } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
        } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
        } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
            return manual_curation($feat, $best_guess, \@RETURN );
        } elsif ( $input =~ /^f/i || $input =~ /find/i ) {

            my ($query, @query_results);

            if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
            } else {

                #do a SO search
                print "Type your search query\n:";
                while (<STDIN>) { chomp($query = $_); last }
            }

            for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
                SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
            }

            return manual_curation($feat, $best_guess, \@query_results);

        } elsif ( $input =~ /^i/i || $input =~ /input/i ) {

            #NML fast input for later
            #my $query;
            #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };

            #manual input
            print "Type the term you want to use\n:";
            while (<STDIN>) {
                chomp(my $input = $_);

                if (! $TYPE_MAP->{$input}) {

                    print "\"$input\" doesn't appear to be a valid SO term. Are ".
                        "you sure you want to use it? (y or n)\n:";

                    while (<STDIN>) {

                        chomp(my $choice = $_);

                        if ($choice eq 'y') {
                            print
                                "\nWould you like to save your preference for " .
                                "future use (so you don't have to redo manual " .
                                "curation for this feature every time you run " .
                                "the converter)? (y or n)\n";

                            #NML: all these while loops are a mess. Really should condense it.
                            while (<STDIN>) {

                                chomp(my $choice = $_);

                                if ($choice eq 'y') {
                                    curation_save($feat, $input);
                                    return $input;
                                } elsif ($choice eq 'n') {
                                    return $input
                                } else {
                                    print "\nDidn't recognize that command. Please " .
                                        "type y or n.\n:"
                                }
                            }

                        } elsif ($choice eq 'n') {
                            return options_cycle($start, $stop, $msg, $feat,
                                $directions, $best_guess, @opt)
                        } else {
                            print "\nDidn't recognize that command. Please " .
                                "type y or n.\n:"
                        }
                    }

                } else {
                    print
                        "\nWould you like to save your preference for " .
                        "future use (so you don't have to redo manual " .
                        "curation for this feature every time you run " .
                        "the converter)? (y or n)\n";

                    #NML: all these while loops are a mess. Really should condense it.
                    while (<STDIN>) {

                        chomp(my $choice = $_);

                        if ($choice eq 'y') {
                            curation_save($feat, $input);
                            return $input;
                        } elsif ($choice eq 'n') {
                            return $input
                        } else {
                            print "\nDidn't recognize that command. Please " .
                                "type y or n.\n:"
                        }
                    }

                }

            }
        } else {
            print "\nDidn't recognize that command. Please re-enter your choice.\n:"
        }
    }

}

sub GenBank_entry {
    my ($f, $delimiter, $num) = @_;

    $delimiter ||= "\n";

    my $entry =
        ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
        . ($num
            ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
            : ' ' x (15 - length $f->primary_tag)
        )
        . $f->start.'..'.$f->end
        . "$delimiter";

    if ($num) {
        words_tag($f, \$entry);
    } else {
        for my $tag ($f->all_tags) {
            for my $val ( $f->each_tag_value($tag) ) {
                $entry .= ' ' x 20;
                #$entry .= "/$tag=\"$val\"$delimiter";
                $entry .= $val eq '_no_value'
                    ? "/$tag$delimiter"
                    : "/$tag=\"$val\"$delimiter";
            }
        }

    }

    return $entry;
}
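# Illustrative note (added commentary, not part of the converter's logic): for a
# typical feature, GenBank_entry() returns a flatfile-style block roughly like
#
#      gene           687..3158
#                     /gene="AXL2"
#                     /locus_tag="YJL157C"
#
# (feature type, coordinates and qualifier values here are hypothetical, and
# spacing is approximate). Lines are joined with $delimiter, which defaults to
# "\n" and is passed as "\r" for the screen-formatted prompts; when $num is set,
# the header fields are numbered ("[1]", "[2]") and words_tag() labels every
# qualifier so it can be selected during manual curation.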


sub gff_validate {
    warn "Validating GFF file\n" if $DEBUG;
    my @feat = @_;

    my (%parent2child, %all_ids, %descendants, %reserved);

    for my $f (@feat) {
        for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
            map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
                if $f->has_tag($$aTags[0]);
        }
    }

    if ($SO_FILE) {
        while (my ($parentID, $aChildren) = each %parent2child) {
            parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
        }
    }

    id_validate(\%all_ids, \%reserved);
}
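# Added summary comment: gff_validate() indexes every feature by its Parent and
# ID attributes, lets parent_validate() confirm via the SO ontology that each
# child's type is actually allowed under the feature named as its parent
# (recording validated pairs in %reserved), and finally calls id_validate() to
# uniquify any duplicated IDs that are not reserved for a validated pair.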

sub parent_validate {
    my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;

    my $aParents = $hAll->{$parentID};

    map {
        my $child = $_;
        $child->add_tag_value( validation_error =>
            "feature tried to add Parent tag, but no Parent found with ID $parentID"
        );
        my %parents;
        map { $parents{$_} = 1 } $child->get_tag_values('Parent');
        delete $parents{$parentID};
        my @parents = keys %parents;

        $child->remove_tag('Parent');

        unless ($child->has_tag('ID')) {
            my $id = gene_name($child);
            $child->add_tag_value('ID', $id);
            push @{$hAll->{$id}}, $child
        }

        $child->add_tag_value('Parent', @parents) if @parents;

    } @$aChildren and return unless scalar(@$aParents);

    my $par = join(',', map { $_->primary_tag } @$aParents);
    warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;

    #NML manual curation needs to happen here

    my %parentsToRemove;

    CHILD:
    for my $child (@$aChildren) {
        my $childType = $child->primary_tag;

        warn "WORKING ON $childType at ".$child->start.' to '.$child->end
            if $DEBUG;

        for (my $i = 0; $i < scalar(@$aParents); $i++) {
            my $parent = $aParents->[$i];
            my $parentType = $parent->primary_tag;

            warn "CHECKING $childType against $parentType" if $DEBUG;

            #cache descendants so we don't have to do repeat searches
            unless ($hDescendants->{$parentType}) {

                for my $term ($ONTOLOGY->find_terms(
                    -name => $parentType
                ) ) {

                    map {
                        $hDescendants->{$parentType}{$_->name}++
                    } $ONTOLOGY->get_descendant_terms($term);

                }

                # NML: hopefully temporary fix.
                # SO doesn't consider exon/CDS to be a child of mRNA
                # even though common knowledge dictates that they are.
                # This cheat fixes the false positives for now.
                if ($parentType eq 'mRNA') {
                    $hDescendants->{$parentType}{'exon'} = 1;
                    $hDescendants->{$parentType}{'CDS'} = 1;
                }

            }

            warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
                " be a child of $parentType?" if $DEBUG;
            if ($hDescendants->{$parentType}{$childType}) {
                warn "\tYES, $childType can be a child of $parentType" if $DEBUG;

                #NML need to deal with multiple children matched to multiple different
                #parents. This model just assumes the first parent id that matches a
                #child will be the reserved feature.

                $hReserved->{$parentID}{$parent}{'parent'} = $parent;
                push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;

                #mark parent for later removal from all IDs
                #so we don't accidentally change any parents
                $parentsToRemove{$i}++;

                next CHILD;
            }
        }

        #NML shouldn't have to check this; somehow child can lose Parent
        #it's happening W3110
        #need to track this down
        if ( $child->has_tag('Parent') ) {

            warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
                if $DEBUG;

            my %parents;

            map { $parents{$_} = 1 } $child->get_tag_values('Parent');

            delete $parents{$parentID};
            my @parents = keys %parents;

            warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
                ' to ' . $child->end . " cannot be a child of ID $parentID"
                if $DEBUG;

            $child->add_tag_value( validation_error =>
                "feature cannot be a child of $parentID");

            $child->remove_tag('Parent');

            unless ($child->has_tag('ID')) {
                my $id = gene_name($child);
                $child->add_tag_value('ID', $id);
                push @{$hAll->{$id}}, $child
            }

            $child->add_tag_value('Parent', @parents) if @parents;
        }

    }

    #delete $aParents->[$_] for keys %parentsToRemove;
    #splice from the highest index down so earlier removals don't shift
    #the positions of parents that still need to be removed
    splice(@$aParents, $_, 1) for sort { $b <=> $a } keys %parentsToRemove;
}

sub id_validate {
    my ($hAll, $hReserved) = @_;

    for my $id (keys %$hAll) {

        #since 1 feature can keep this id,
        #let's just shift it off and uniquify
        #the rest (unless it's reserved)
        shift @{$hAll->{$id}} unless $hReserved->{$id};
        for my $feat (@{$hAll->{$id}}) {
            id_uniquify(0, $id, $feat, $hAll);
        }
    }

    for my $parentID (keys %$hReserved) {

        my @keys = keys %{$hReserved->{$parentID}};

        shift @keys;

        for my $k (@keys) {
            my $parent = $hReserved->{$parentID}{$k}{'parent'};
            my $aChildren = $hReserved->{$parentID}{$k}{'children'};

            my $value = id_uniquify(0, $parentID, $parent, $hAll);
            for my $child (@$aChildren) {

                my %parents;
                map { $parents{$_}++ } $child->get_tag_values('Parent');
                $child->remove_tag('Parent');
                delete $parents{$parentID};
                $parents{$value}++;
                my @parents = keys %parents;
                $child->add_tag_value('Parent', @parents);
            }

        }
    }
}

sub id_uniquify {
    my ($count, $value, $feat, $hAll) = @_;

    warn "UNIQUIFYING $value" if $DEBUG;

    if (! $count) {
        $feat->add_tag_value(Alias => $value);
        $value .= ('.' . $feat->primary_tag)
    } elsif ($count == 1) {
        $value .= ".$count"
    } else {
        chop $value;
        $value .= $count
    }
    $count++;

    warn "ENDED UP WITH $value" if $DEBUG;
    if ( $hAll->{$value} ) {
        warn "$value IS ALREADY TAKEN" if $DEBUG;
        #keep the value that the recursion finally settles on, so the caller
        #gets the ID that was actually assigned to the feature
        $value = id_uniquify($count, $value, $feat, $hAll);
    } else {
        #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
        $feat->remove_tag('ID');
        $feat->add_tag_value('ID', $value);
        push @{$hAll->{$value}}, $value;
    }

    $value;
}
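# Added commentary: id_uniquify() first records the duplicated ID as an Alias,
# then derives candidate IDs until it finds one that is not already in %$hAll.
# Assuming a duplicated ID "YJL157C" on an mRNA feature (hypothetical values),
# the successive candidates look like
#   YJL157C -> YJL157C.mRNA -> YJL157C.mRNA.1 -> YJL157C.mRNA.2 -> ...
# and the first free candidate becomes the feature's new ID, recorded in %$hAll
# so later collisions see it.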

sub conf_read {

    print "\nCannot read $CONF. Change file permissions and retry, " .
        "or enter another file\n" and conf_locate() unless -r $CONF;

    print "\nCannot write $CONF. Change file permissions and retry, " .
        "or enter another file\n" and conf_locate() unless -w $CONF;

    $YAML = LoadFile($CONF);

}

sub conf_create {

    my ($path, $input) = @_;

    print "Cannot write to $path. Change directory permissions and retry " .
        "or enter another save path\n" and conf_locate() unless -w $path;

    $CONF = $input;

    open(FH, '>', $CONF);
    close(FH);
    conf_read();

}

sub conf_write { DumpFile($CONF, $YAML) }

sub conf_locate {

    print "\nEnter the location of a previously saved config, or a new " .
        "path and file name to create a new config (this step is " .
        "necessary to save any preferences)";

    print "\n\nenter a command:";

    while (<STDIN>) {
        chomp(my $input = $_);
        my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);

        if (-e $input && (! -d $input)) {

            print "\nReading $input...\n";
            $CONF = $input;

            conf_read();
            last;

        } elsif (! -d $input && $fn.$suffix) {

            print "Creating $input...\n";
            conf_create($path, $input);
            last;

        } elsif (-e $input && -d $input) {
            print "You only entered a directory. " .
                "Please enter BOTH a directory and filename\n";
        } else {
            print "$input does not appear to be a valid path. Please enter a " .
                "valid directory and filename\n";
        }
        print "\nenter a command:";
    }
}

sub curation_save {

    my ($feat, $input) = @_;

    #my $error = "Enter the location of a previously saved config, or a new " .
    #    "path and file name to create a new config (this step is " .
    #    "necessary to save any preferences)\n";

    if (!$CONF) {
        print "\n\n";
        conf_locate();
    } elsif (! -e $CONF) {
        print "\n\nThe config file you have chosen doesn't exist.\n";
        conf_locate();
    } else { conf_read() }

    my $entry = GenBank_entry($feat, "\r", 1);

    my $msg = "Term entered: $input";
    my $directions = "Please select any/all tags that provide evidence for the term you
have entered. You may enter multiple tags by separating them by commas/dashes
(e.g. 1,3,5-7). For tags with more than one word value (i.e. 'note'), you have
the option of either selecting the entire note as evidence, or specific
keywords. If a tag has multiple keywords, they will be tagged alphabetically
for selection. To select a specific keyword in a tag field, you must enter the
tag number followed by the keyword letter (e.g. 3a). Multiple keywords may be
selected by entering each letter separated by commas/dashes (e.g. 3b,f,4a-c).
The more tags you select, the more specific the GenBank entry will have to be
to match your curation. To match the GenBank entry exactly as it appears, type
every number (start-end), or just type 'all'. Remember, once the converter
saves your preference, you will no longer be prompted to choose a feature type
for any matching entries until you edit the curation.ini file.";
    my $msg_copy = $msg;
    my $dir_copy = $directions;

    my $format = "format STDOUT = \n" .
        '-' x 156 . "\n" .
        '^' . '<' x 77 . '| Directions:' . "\n" .
        '$msg_copy' . "\n" .
        '-' x 156 . "\n" .
        ' ' x 78 . "|\n" .
        '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
        '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
        (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
        ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;
    print '-' x 156 . "\nenter a command:";

    my @tags = words_tag($feat);

    my $final = {};
    my $choices;
    while (<STDIN>) {

        chomp(my $choice = $_);

        if (scalar(keys %$final) && $choice =~ /^y/i) { last

        } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)

        } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";

        } elsif ($choice eq 'all') {

            $choice = '';
            for (my $i=1; $i < scalar(@tags); $i++) {
                $choice .= "$i,";
            }
            chop $choice;
        }
        #print "CHOICE [$choice]";

        my @selections;
        for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
            if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {

                for ($1..$2) { push @selections, $_ }

                my $dangling_alphas = $3;
                alpha_expand($dangling_alphas, \@selections);

            } else {
                alpha_expand($_, \@selections);
            }
        }

        foreach my $numbers (@selections) {

            my @c = split(/(?=[\w])/, $numbers);
            s/\W+//g foreach @c;
            my $num;

            {
                $^W = 0;
                $num = 0 + shift @c;
            }

            my $tag = $tags[$num];
            if (ref $tag && scalar(@c)) {
                my $no_value;
                foreach (@c) {
                    if (defined $tag->{$_}) {
                        $choices .= "${num}[$_] ";
                        my ($t,$v) = @{$tag->{$_}};
                        push @{${$final->{$input}}[0]{$t}}, $v;

                    } else { $no_value++ }
                }

                if ($no_value) {
                    _selection_add($tag,$final,$input,\$choices,$num);
                    #my ($t,$v) = @{$tag->{'all'}};
                    #unless (defined ${$final->{$input}}[0]{$t}) {
                    #    $choices .= "$num, ";
                    #    push @{${$final->{$input}}[0]{$t}}, $v
                    #}
                }

                $choices = substr($choices, 0, -2);
                $choices .= ', ';

            } elsif (ref $tag) {
                _selection_add($tag,$final,$input,\$choices,$num);
                #my ($t,$v) = @{$tag->{'all'}};
                #unless (defined ${$final->{$input}}[0]{$t}) {
                #    $choices .= "$num, ";
                #    push @{${$final->{$input}}[0]{$t}}, $v
                #}
            }
        }
        $choices = substr($choices, 0, -2) if $choices;
        if (scalar keys %$final) {
            print "\nYou have chosen the following tags:\n$choices\n";
            print "This will be written to the config file as:\n";
            print Dump $final;
            print "\nIs this correct? (y or n)\n";
        } else { print "\nInvalid selection. Please try again\n" }
    }
    push @{$YAML->{$input}}, $final->{$input};
    conf_write();
}
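# Added commentary: the saved preference ends up in the YAML config keyed by the
# chosen SO term, as a list of evidence sets (each a hash of qualifier tag =>
# list of selected values). Approximately, with hypothetical values:
#
#   tRNA:
#     - - product:
#           - tRNA-Arg
#         note:
#           - anticodon
#
# Later GenBank features whose qualifiers match the listed evidence are mapped
# to that term without prompting, until the config file is edited.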

# words_tag() splits each tag value string into multiple words so that the
# user can select the parts he/she wants to use for curation.
# It can tag 702 (a - zz) separate words; this should be enough.

sub words_tag {

    my ($feat, $entry) = @_;

    my @tags;

    @tags[1,2] = (
        {'all' => ['primary_tag', $feat->primary_tag]},
        {'all' => ['location', $feat->start.'..'.$feat->end]}
    );
    my $i = 3;
    foreach my $tag ($feat->all_tags) {
        foreach my $value ($feat->each_tag_value($tag)) {

            my ($string, $tagged_string);

            my @words = split(/(?=\w+?)/, $value);

            my $pos = 0;

            foreach my $word (@words) {

                (my $sanitized_word = $word) =~ s/\W+?//g;
                $string .= $word;

                my $lead = int($pos/ALPHABET_DIVISOR);
                my $lag = $pos % ALPHABET_DIVISOR;

                my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';
                $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';

                $tagged_string .= " ($a) $word";

                $tags[$i]{$a} = [$tag, $sanitized_word];
                $pos++;
            }

            $value = $tagged_string if scalar(@words) > 1;

            $$entry .= "[$i] /$tag=\"$value\"\r";

            $tags[$i]{'all'} = [$tag, $string];
        }
        $i++;
    }

    return @tags;

}
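# Added commentary (illustrative, tag numbers hypothetical): each selectable
# piece of a qualifier value is labelled a..z, then aa..zz (ALPHABET_DIVISOR
# letters per position, 702 labels in all), and the whole value is always
# reachable through the tag number alone. At the curation_save() prompt an
# answer like "5a,c" therefore means "pieces (a) and (c) of tag [5]", while a
# plain "5" uses the entire value of tag [5] as evidence.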

sub alpha_expand {

    my ($dangling_alphas, $selections) = @_;

    if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {

        my $digit = $1;
        push @$selections, $digit if $digit;

        my $start = $2;
        my $stop = $3;

        my @starts = split('', $start);
        my @stops = split('', $stop);

        my ($final_start, $final_stop);

        for ([\$final_start, \@starts], [\$final_stop, \@stops]) {

            my ($final, $splits) = @$_;

            my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
            my $rem;

            if ($$splits[1]) {
                $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
                $int++
            } else {
                $rem = $int;
                $int = 0;
            }

            $$final = $int * ALPHABET_DIVISOR;
            $$final += $rem;

        }

        my $last_number = pop @$selections;
        for my $pos ($final_start..$final_stop) {
            my $lead = int($pos/ALPHABET_DIVISOR);
            my $lag = $pos % ALPHABET_DIVISOR;
            my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
            $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
            push @$selections, $last_number.$alpha;
        }

    } elsif (defined($dangling_alphas)) {
        if ($dangling_alphas =~ /^\d/) {
            push @$selections, $dangling_alphas;
        } elsif ($dangling_alphas =~ /^\D/) {
            #print "$dangling_alphas ".Dumper @$selections;
            my $last_number = pop @$selections;
            $last_number ||= '';
            push @$selections, $last_number.$dangling_alphas;
            #$$selections[-1] .= $dangling_alphas;
        }
    }

}
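# Added commentary: alpha_expand() turns a letter range typed at the prompt into
# individual keyword selections. For example (hypothetical input), "3a-c"
# becomes the selections 3a, 3b, 3c; and a letter left dangling after a numeric
# range, as in "1-3a", is re-attached to the last number, giving 1, 2, 3a.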

sub _selection_add {

    my ($tag, $final, $input, $choices, $num) = @_;
    my ($t,$v) = @{$tag->{'all'}};
    unless (defined ${$final->{$input}}[0]{$t}) {
        $$choices .= "$num, ";
        push @{${$final->{$input}}[0]{$t}}, $v
    }

}