annotate Perl/bp_genbank2gff3.pl @ 14:5a5c9a6b047b draft

Uploaded
author dereeper
date Tue, 10 Dec 2024 16:20:53 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/opt/anaconda1anaconda2anaconda3/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 eval 'exec /opt/anaconda1anaconda2anaconda3/bin/perl -S $0 ${1+"$@"}'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 if 0; # not running under some shell
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 bp_genbank2gff3.pl [options] filename(s)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 # process a directory containing GenBank flatfiles
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 perl bp_genbank2gff3.pl --dir path_to_files --zip
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 # process a single file, ignore explicit exons and introns
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 # process a list of files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 perl bp_genbank2gff3.pl *gbk.gz
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 Options:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 --noinfer -r don't infer exon/mRNA subfeatures
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 --conf -i path to the curation configuration file that contains user preferences
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 for Genbank entries (must be YAML format)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 (if --manual is passed without --conf, user will be prompted to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 create the file if any manual input is saved)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 --sofile -l path to the so.obo file to use for feature type mapping
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 (--sofile live will download the latest online revision)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 --manual -m when trying to guess the proper SO term, if more than
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 one option matches the primary tag, the converter will
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 wait for user input to choose the correct one
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 (only works with --sofile)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 --dir -d path to a directory of genbank flatfiles
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 --zip -z compress GFF3 output files with gzip
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 --summary -s print a summary of the features in each contig
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 --filter -x genbank feature type(s) to ignore
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 --split -y split output to separate GFF and fasta files for
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 each genbank record
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 --nolump -n separate file for each reference sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 (default is to lump all records together into one
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 output file for each input file)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 --ethresh -e error threshold for unflattener
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 set this high (>2) to ignore all unflattener errors
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 model. --CDS is default. Use --CDS to keep default GFF gene model,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 use --noCDS to convert to g-r-p-e.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 (GenBank is default)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 --quiet don't talk about what is being processed
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 --typesource SO sequence type for source (e.g. chromosome; region; contig)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 --help -h display this message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 This script uses Bio::SeqFeature::Tools::Unflattener and
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 containment hierarchies mapped for optimal display in gbrowse.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 The input files are assumed to be gzipped GenBank flatfiles for refseq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 contigs. The files may contain multiple GenBank records. Either a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 single file or an entire directory can be processed. By default, the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 DNA sequence is embedded in the GFF but it can be saved into separate
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 fasta file with the --split(-y) option.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 If an input file contains multiple records, the default behaviour is
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 to dump all GFF and sequence to a file of the same name (with .gff
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 appended). Using the 'nolump' option will create a separate file for
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 each genbank record. Using the 'split' option will create separate
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 GFF and Fasta files for each genbank record.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 =head2 Notes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 =head3 'split' and 'nolump' produce many files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 In cases where the input files contain many GenBank records (for
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 example, the chromosome files for the mouse genome build), a very
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 large number of output files will be produced if the 'split' or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 'nolump' options are selected. If you do have lists of files E<gt> 6000,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 use the --long_list option in bp_bulk_load_gff.pl or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 bp_fast_load_gff.pl to load the gff and/or fasta files.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 =head3 Designed for RefSeq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 This script is designed for RefSeq genomic sequence entries. It may
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 work for third party annotations but this has not been tested.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 if you don't mind some gene model unflattener errors (dgg).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 =head3 G-R-P-E Gene Model
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 Don Gilbert worked this over with needs to produce GFF3 suited to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 loading to GMOD Chado databases. Most of the changes I believe are
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 suited for general use. One main chado-specific addition is the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 --[no]cds2protein flag
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 For general use it probably should be OFF, enabled with --cds2prot.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 This writes GFF with an alternate, but useful Gene model,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 instead of the consensus model for GFF3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 [ gene > mRNA > (exon,CDS,UTR) ]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 This alternate is
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 gene > mRNA > polypeptide > exon
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 This means the only feature with DNA bases is the exon. The others
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 specify only location ranges on a genome. Exon of course is a child
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 of mRNA and protein/peptide.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 The protein/polypeptide feature is an important one, having all the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 annotations of the GenBank CDS feature, protein ID, translation, GO
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 terms, Dbxrefs to other proteins.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 UTRs, introns, CDS-exons are all inferred from the primary exon bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 inside/outside appropriate higher feature ranges. Other special gene
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 model features remain the same.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 Several other improvements and bugfixes, minor but useful are included
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 * IO pipes now work:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 * GenBank main record fields are added to source feature, e.g. organism, date,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 and the sourcetype, commonly chromosome for genomes, is used.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 * Gene Model handling for ncRNA, pseudogenes are added.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 * GFF header is cleaner, more informative.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 --GFF_VERSION flag allows choice of v2 as well as default v3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 * GFF ##FASTA inclusion is improved, and
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 CDS translation sequence is moved to FASTA records.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 * FT -> GFF attribute mapping is improved.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 * --format choice of SeqIO input formats (GenBank default).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 Uniprot/Swissprot and EMBL work and produce useful GFF.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 and more flexible usage.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 =head1 TODO
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 =head2 Are these additions desired?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 * handle Entrezgene, other non-sequence SeqIO structures (really should change
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 those parsers to produce consistent annotation tags).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 =head2 Related bugfixes/tests
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 These items from Bioperl mail were tested (sample data generating
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 errors), and found corrected:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 From: Ed Green <green <at> eva.mpg.de>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 Subject: genbank2gff3.pl on new human RefSeq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 Date: 2006-03-13 21:22:26 GMT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 -- unspecified errors (sample data works now).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 From: Eric Just <e-just <at> northwestern.edu>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 Subject: genbank2gff3.pl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 Date: 2007-01-26 17:08:49 GMT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 -- bug fixed in genbank2gff3 for multi-record handling
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 This error is for a /trans_splice gene that is hard to handle, and
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 unflattener/genbank2gff3 doesn't handle it:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 From: Chad Matsalla <chad <at> dieselwurks.com>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 Date: 2005-07-15 19:51:48 GMT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191 Sheldon McKay (mckays@cshl.edu)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 Copyright (c) 2004 Cold Spring Harbor Laboratory.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 =head2 AUTHOR of hacks for GFF2Chado loading
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 Don Gilbert (gilbertd@indiana.edu)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 use lib "$ENV{HOME}/bioperl-live";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 # chad put this here to enable situations when this script is tested
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 # against bioperl compiled into blib along with other programs using blib
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 BEGIN {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 unshift(@INC,'blib/lib');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 };
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 use Bio::Root::RootI;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 use Bio::SeqIO;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 use File::Spec;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 use Bio::SeqFeature::Tools::Unflattener;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 use Bio::SeqFeature::Tools::TypeMapper;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 use Bio::SeqFeature::Tools::IDHandler;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 use Bio::Location::SplitLocationI;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 use Bio::Location::Simple;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 use Bio::Tools::GFF;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 use List::Util qw(first);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 use Bio::OntologyIO;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 use YAML qw(Dump LoadFile DumpFile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 use File::Basename;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227 use vars qw/$split @filter $zip $outdir $help $ethresh
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 $file @files $dir $summary $nolump
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# URL of the Sequence Ontology OBO file; downloaded when --sofile is 'live'.
use constant SO_URL =>
    'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';

# Lower-case Latin alphabet as an array ref, the reverse lookup
# (letter => zero-based position), and the number of letters.
use constant ALPHABET           => [ 'a' .. 'z' ];
use constant ALPHABET_TO_NUMBER => {
    map { ( ALPHABET->[$_] => $_ ) } 0 .. 25
};
use constant ALPHABET_DIVISOR   => 26;

# GM_* status codes.
# NOTE(review): presumably gene-model classification results; their use is
# not visible in this part of the file -- confirm against the callers.
use constant {
    GM_NEW_TOPLEVEL => 2,
    GM_NEW_PART     => 1,
    GM_DUP_PART     => 0,
    GM_NOT_PART     => -1,
};

# Options cycle in multiples of 2 because of left side/right side pairing.
# You can make this number odd, but displayed matches will still round up
use constant OPTION_CYCLE => 6;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Defaults for the package globals that the command line may override:
# GFF version 3 (--GFF_VERSION selects another) and verbose output
# (switched off by --quiet).
( $GFF_VERSION, $verbose ) = ( 3, 1 );

# dgg: gene model selection.  $CDSkeep true (the default, prior behaviour)
# keeps CDS features; --noCDS switches to Gene/mRNA/Polypeptide/exons,
# see gene_features().
# $PROTEIN_TYPE is the feature type used when CDS is not kept:
# 'protein' is flybase chado usage; GMOD Perls use 'polypeptide' with
# software support.
my ( $CDSkeep, $PROTEIN_TYPE ) = ( 1, 'polypeptide' );

# Input format handed to SeqIO (swiss ; embl; genbank), and the source id
# derived from it ("UniProt" "GenBank" "EMBL" should work).
# Other Bio::SeqIO formats may work.
# TEST: EntrezGene < problematic tags; InterPro KEGG
my $FORMAT   = 'GenBank';
my $SOURCEID = $FORMAT;

# Feature-table tag name => GFF3 attribute name.
#   note:        GO: ids are also pulled into Ontology_term
#   symbol:      is symbol still used?
#   protein_id:  not mapped here (=> 'Dbxref'?); also seen Dbxref tags: EC_number
#   translation: handled in gene_features
my %TAG_MAP = (
    'db_xref' => 'Dbxref',
    'name'    => 'Name',
    'note'    => 'Note',
    ( map { ( $_ => 'Alias' ) } qw(synonym symbol) ),
);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Command-line handling.
#
# Reads the options into the package globals declared with `use vars`
# plus the lexicals $CDSkeep and $FORMAT, then derives $lump, $verbose,
# $SOURCEID and the SeqIO $FORMAT from them.  Prints usage and exits with
# status 2 on --help or on an option-parsing error.

# Turn off output buffering on the currently selected handle.
$| = 1;

# --quiet/--noquiet is the user-facing switch; seed it from the $verbose
# default and convert back once the options have been read.
my $quiet = !$verbose;

my $ok = GetOptions(
    'd|dir|input:s'     => \$dir,
    'z|zip'             => \$zip,
    'h|help'            => \$help,
    's|summary'         => \$summary,
    'r|noinfer'         => \$noinfer,
    'i|conf=s'          => \$CONF,
    # -l is the short form promised in the POD Options list; it was
    # missing here, so "-l file" was rejected as an unknown option.
    'l|sofile=s'        => \$SO_FILE,
    'm|manual'          => \$MANUAL,
    'o|outdir|output:s' => \$outdir,
    'x|filter:s'        => \@filter,
    'y|split'           => \$split,
    'ethresh|e=s'       => \$ethresh,
    'c|CDS!'            => \$CDSkeep,
    'f|format=s'        => \$FORMAT,
    'typesource=s'      => \$source_type,
    'GFF_VERSION=s'     => \$GFF_VERSION,
    'quiet!'            => \$quiet,     # swap quiet to verbose
    'DEBUG!'            => \$DEBUG,
    'n|nolump'          => \$nolump,
);

# Lump all records into one output file unless --nolump or --split was
# given.  Declared separately from the conditional assignment because
# `my $x = ... unless ...` has undefined behaviour (see perlsyn).
my $lump;
$lump = 1 unless $nolump || $split;

$verbose = !$quiet;

# look for help request
pod2usage(2) if $help || !$ok;

# keep SOURCEID as-is and change FORMAT for SeqIO types;
# note SeqIO uses file.suffix to guess type; not useful here
$SOURCEID = $FORMAT;

# Case-insensitive so that the documented spelling "Uniprot" (as well as
# "UniProt", "uniprot", "TrEMBL", ...) selects the swiss parser.
$FORMAT = "swiss" if $FORMAT =~ /UniProt|trembl/i;

# --DEBUG implies verbose output.
$verbose = 1 if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311 # initialize handlers
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313 $unflattener->error_threshold($ethresh) if $ethresh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 $unflattener->verbose(1) if($DEBUG);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 # ensembl parsing is still problematic, forget this
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 my $tm = Bio::SeqFeature::Tools::TypeMapper->new;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 # dgg
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322 $source_type ||= "region"; # should really parse from FT.source contents below
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 #my $FTSOmap = $tm->FT_SO_map();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 my $FTSOmap;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326 my $FTSOsynonyms;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# --sofile live: download the current SO OBO file from SO_URL into a
# temporary file and point $SO_FILE at it.  If the download does not
# return HTTP 200, report the status and clear $SO_FILE.
if (defined($SO_FILE) && $SO_FILE eq 'live') {
    print "\nDownloading the latest SO file from ".SO_URL."\n\n";

    # NOTE: `use` is resolved at compile time, so both modules are loaded
    # (and tempfile is imported) even when this branch is not taken.
    use LWP::UserAgent;
    use File::Temp qw/ tempfile /;

    my $ua       = LWP::UserAgent->new(timeout => 30);
    my $request  = HTTP::Request->new(GET => SO_URL);
    my $response = $ua->request($request);

    # Compare the numeric status code.  Matching /200/ against the whole
    # status line could also hit "200" inside an error message.
    if ($response->code == 200) {
        my ($fh, $fn) = tempfile();

        print {$fh} $response->content
            or die "Could not write SO file to $fn: $!";

        # Close (and thereby flush) the handle before $SO_FILE is opened
        # again by name; otherwise buffered data may not be on disk yet.
        close $fh
            or die "Could not close SO file $fn: $!";

        $SO_FILE = $fn;
    }
    else {
        print "Couldn't download SO file online...skipping validation.\n"
            . "HTTP Status was " . $response->status_line . "\n";

        # Always drop the 'live' placeholder, whether or not the print
        # above succeeded, so it is never mistaken for a file name.
        undef $SO_FILE;
    }
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
347
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
348 if ($SO_FILE) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
349
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
350
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
351 my (%terms, %syn);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
352
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
353 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
354 $ONTOLOGY = $parser->next_ontology();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
355
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
356 for ($ONTOLOGY->get_all_terms) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
357 my $feat = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
358
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
359 $terms{$feat->name} = $feat->name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
360 #$terms{$feat->name} = $feat;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
361
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
362 my @syn = $_->each_synonym;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
363
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
364 push @{$syn{$_}}, $feat->name for @syn;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
365 #push @{$syn{$_}}, $feat for @syn;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
366 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
367
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
368 $FTSOmap = \%terms;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
369 $FTSOsynonyms = \%syn;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
370
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
371 my %hardTerms = %{ $tm->FT_SO_map() };
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
372 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
373
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
374 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
375 my %terms = %{ $tm->FT_SO_map() };
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
376 while (my ($k,$v) = each %terms) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
377 $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
378 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
379 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
380
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
381 $TYPE_MAP = $FTSOmap;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
382 $SYN_MAP = $FTSOsynonyms;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
383
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
384
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
385 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
386
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Stringify the filter list if applicable; $filter stays undef when no
# filters were given.  The value is chosen with a ternary because the
# "my $x = ... if COND" form has undefined behaviour in Perl.
my $filter = @filter ? join( ' ', @filter ) : undef;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
389
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
390 # determine input files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
391 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
392 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
393 $stdin=1; $dir=''; @files=('stdin');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
394
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
395 } elsif ( $dir ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
396 if ( -d $dir ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
397 opendir DIR, $dir or die "could not open $dir for reading: $!";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
398 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
399 closedir DIR;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
400 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
401 else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
402 die "$dir is not a directory\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
403 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
404 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
405 else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
406 @files = @ARGV;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
407 $dir = '';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
408 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
409
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
410 # we should have some files by now
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
411 pod2usage(2) unless @files;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
412
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
413
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
414 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
415 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
416 warn("std. output chosen: cannot split\n") if($split);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
417 warn("std. output chosen: cannot zip\n") if($zip);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
418 warn("std. output chosen: cannot nolump\n") if($nolump);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
419 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
420
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
421 } elsif ( $outdir && !-e $outdir ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
422 mkdir($outdir) or die "could not create directory $outdir: $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
423 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
424 elsif ( !$outdir ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
425 $outdir = $dir || '.';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
426 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
427
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
428 for my $file ( @files ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
429 # dgg ; allow 'stdin' / '-' input ?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
430 chomp $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
431 die "$! $file" unless($stdin || -e $file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
432 print "# Input: $file\n" if($verbose);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
433
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
434 my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
435 if ($stdout) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
436 $lump_fh= *STDOUT; $lump="stdout$$";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
437 $outfa= "stdout$$.fa"; # this is a temp file ... see below
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
438 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
439
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
440 } elsif ( $lump ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
441 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
442 $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
443 ($outfa = $lump) =~ s/\.gff/\.fa/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
444 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
445 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
446
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
447 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
448
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Open the input on the bareword handle FH (it is passed as \*FH to
# Bio::SeqIO and closed with "close FH" further down): alias STDIN,
# decompress through gunzip, or read the file directly.
if ($stdin) {
    *FH = *STDIN;
}
elsif ( $file =~ /\.gz/ ) {

    # List-form pipe open: the file name is handed to gunzip as a single
    # argument and never passes through a shell, so names containing
    # spaces or shell metacharacters are safe.
    open FH, '-|', 'gunzip', '-c', $file
        or die "could not run gunzip on $file: $!\n";
}
else {
    open FH, '<', $file
        or die "could not open $file for reading: $!\n";
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
458
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
459 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
460 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
461
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
462 while ( my $seq = $in->next_seq() ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
463 my $seq_name = $seq->accession_number;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
464 my $end = $seq->length;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
465 my @to_print;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
466
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Arrange disposition of GFF output: in lump mode every record goes to the
# already-open lump handle; otherwise each record gets its own
# <accession>.gff file inside $outdir.
my $out;

if ($lump) {
    $outfile = $lump;
    $out     = $lump_fh;
}
else {
    $outfile = File::Spec->catfile( $outdir, $seq_name . '.gff' );

    # Three-argument open with an error check, so a failure to create the
    # file stops here instead of surfacing later as prints to an unopened
    # handle.
    open $out, '>', $outfile
        or die "could not create GFF output file $outfile: $!\n";
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
479
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
480 # filter out unwanted features
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
481 my $source_feat= undef;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
482 my @source= filter($seq); $source_feat= $source[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
483
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
484 ($source_type,$source_feat)=
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
485 getSourceInfo( $seq, $source_type, $source_feat ) ;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
486 # always; here we build main prot $source_feat; # if @source;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
487
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
488 # abort if there are no features
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
489 warn "$seq_name has no features, skipping\n" and next
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
490 if !$seq->all_SeqFeatures;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
491
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
492
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
493 $FTSOmap->{'source'} = $source_type;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
494 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
495
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
496 # construct a GFF header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
497 # add: get source_type from attributes of source feature? chromosome=X tag
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
498 # also combine 1st ft line here with source ft from $seq ..
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
499 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
500 print $out $header;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
501 print "# working on $info\n" if($verbose);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
502
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
503 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
504 unflatten_seq($seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
505
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
506 # Note that we use our own get_all_SeqFeatures function
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
507 # to rescue cloned exons
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
508
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
509 @GFF_LINE_FEAT = ();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
510 for my $feature ( get_all_SeqFeatures($seq) ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
511
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
512 my $method = $feature->primary_tag;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
513 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
514
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
515 $feature->seq_id($seq->id) unless($feature->seq_id);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
516 $feature->source_tag($SOURCEID);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
517
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
518 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
519 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
520 maptags2gff($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
521
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
522 # current gene name. The unflattened gene features should be in order so any
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
523 # exons, CDSs, etc that follow will belong to this gene
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
524 my $gene_name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
525 if ( $method eq 'gene' || $method eq 'pseudogene' ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
526 @to_print= print_held($out, $gffio, \@to_print);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
527 $gene_id = $gene_name= gene_name($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
528 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
529 $gene_name= gene_name($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
530 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
531
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
532 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
533 # be converted to or added as Name=xxx (if not ID= or as well)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
534 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
535 convert_to_name($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
536
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
537 ## dgg: extended to protein|polypeptide
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
538 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
539 ## in yeast have that genbank tag; why?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
540 ## these include pseudogene ...
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
541
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
542 ## Note we also have mapped types to SO, so these RNA's are now transcripts:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
543 # pseudomRNA => "pseudogenic_transcript",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
544 # pseudotranscript" => "pseudogenic_transcript",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
545 # misc_RNA=>'processed_transcript',
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
546
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
547 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
548
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
549 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
550 || ( $gene_id && $gene_name eq $gene_id ) ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
551
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
552 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
553 if ($action == GM_DUP_PART) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
554 # ignore, this is dupl. exon with new parent ...
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
555
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
556 } elsif ($action == GM_NOT_PART) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
557 add_generic_id( $feature, $gene_name, "nocount");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
558 my $gff = $gffio->gff_string($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
559 push @GFF_LINE_FEAT, $feature;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
560 #print $out "$gff\n" if $gff;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
561
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
562 } elsif ($action > 0) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
563 # hold off print because exon etc. may get 2nd, 3rd parents
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
564 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
565 push(@to_print, $feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
566 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
567
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
568 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
569
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
570 # otherwise handle as generic feats with IDHandler labels
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
571 else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
572 add_generic_id( $feature, $gene_name, "");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
573 my $gff= $gffio->gff_string($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
574 push @GFF_LINE_FEAT, $feature;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
575 #print $out "$gff\n" if $gff;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
576 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
577 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
578
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
579 # don't like doing this after others; do after each new gene id?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
580 @to_print= print_held($out, $gffio, \@to_print);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
581
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
582 gff_validate(@GFF_LINE_FEAT);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
583
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
584 for my $feature (@GFF_LINE_FEAT) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
585 my $gff= $gffio->gff_string($feature);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
586 print $out "$gff\n" if $gff;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
587 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
588
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
589 # deal with the corresponding DNA
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
590 my ($fa_out,$fa_outfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
591 my $dna = $seq->seq;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
592 if($dna || %proteinfa) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
593 $method{'RESIDUES'} += length($dna);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
594 $dna =~ s/(\S{60})/$1\n/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
595 $dna .= "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
596
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
597 if ($split) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
598 $fa_outfile = $outfile;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
599 $fa_outfile =~ s/gff$/fa/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
600 open $fa_out, ">$fa_outfile" or die $!;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
601 print $fa_out ">$seq_name\n$dna" if $dna;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
602 foreach my $aid (sort keys %proteinfa) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
603 my $aa= delete $proteinfa{$aid};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
604 $method{'RESIDUES(tr)'} += length($aa);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
605 $aa =~ s/(\S{60})/$1\n/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
606 print $fa_out ">$aid\n$aa\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
607 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
608
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
609 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
610 else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
611 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
612 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
613 ## maybe write this to temp .fa then cat to end of lumped gff $out
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
614 print $lumpfa_fh ">$seq_name\n$dna" if $dna;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
615 foreach my $aid (sort keys %proteinfa) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
616 my $aa= delete $proteinfa{$aid};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
617 $method{'RESIDUES(tr)'} += length($aa);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
618 $aa =~ s/(\S{60})/$1\n/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
619 print $lumpfa_fh ">$aid\n$aa\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
620 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
621 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
622
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
623 %proteinfa=();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
624 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
625
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Compress the per-record output files (non-lump mode only).
if ( $zip && !$lump ) {

    # Close the handles first so everything buffered is on disk before
    # gzip reads the files and removes the originals.
    close $out or warn "could not close $outfile: $!\n";
    if ($fa_outfile) {
        close $fa_out or warn "could not close $fa_outfile: $!\n";
    }

    # List-form system: file names are passed to gzip verbatim, not
    # through a shell.
    system( 'gzip', '-f', $outfile ) == 0
        or warn "gzip failed for $outfile (status $?)\n";
    if ($fa_outfile) {
        system( 'gzip', '-f', $fa_outfile ) == 0
            or warn "gzip failed for $fa_outfile (status $?)\n";
    }

    $outfile    .= '.gz';
    $fa_outfile .= '.gz' if $split;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
632
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
633 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
634 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
635 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
636
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
637 # dgg: moved to after all inputs; here it prints cumulative sum for each record
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
638 #if ( $summary ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
639 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
640 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
641 # for ( keys %method ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
642 # print "# $_ $method{$_}\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
643 # }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
644 # print "# \n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
645 # }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
646
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
647 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
648
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
649 print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
650 if ( $summary ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
651 print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
652 for ( keys %method ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
653 print "# $_ $method{$_}\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
654 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
655 print "# \n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
656 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
657
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
## FIXME for piped output w/ split FA files ...
# Finish the current input file: append the temporary FASTA file to the
# lumped GFF under a "##FASTA" directive, optionally gzip the lumped file,
# and close the input handle.
close($lumpfa_fh) if $lumpfa_fh;
if ( !$split && $outfa && $lump_fh ) {
    print $lump_fh "##FASTA\n";    # GFF3 spec

    # Re-open the temporary FASTA file for reading (explicit '<' mode) and
    # copy it only if the open succeeded.
    if ( open $lumpfa_fh, '<', $outfa ) {
        while ( my $fa_line = <$lumpfa_fh> ) {
            print $lump_fh $fa_line;
        }
        close($lumpfa_fh);
    }
    else {
        warn "reading FA $outfa: $!";
    }
    unlink($outfa);
}

if ( $zip && $lump ) {

    # Close the lump file before compressing it so the buffered GFF and
    # FASTA text is on disk when gzip reads it.  $zip is forced to 0 when
    # writing to stdout, so STDOUT is never closed here.
    if ($lump_fh) {
        close $lump_fh or warn "could not close $lump: $!\n";
    }

    # List-form system: the file name is not interpreted by a shell.
    system( 'gzip', '-f', $lump ) == 0
        or warn "gzip failed for $lump (status $?)\n";
}

close FH;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
676 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
677
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
678
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
679
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
680
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
681
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Rank a feature type name for output ordering; lower ranks sort first.
# The rules are tried in order and the first pattern found anywhere in the
# name wins: gene => 1, RNA/transcript => 2, protein/peptide => 3,
# exon/CDS => 4.  Anything else ranks 3, i.e. before exons and CDS.
sub typeorder {
    my $type = $_[0];

    my @rules = (
        [ qr/gene/,            1 ],
        [ qr/RNA|transcript/,  2 ],
        [ qr/protein|peptide/, 3 ],
        [ qr/exon|CDS/,        4 ],
    );

    for my $rule (@rules) {
        my ( $pattern, $rank ) = @$rule;
        return $rank if $type =~ $pattern;
    }

    return 3;    # default before exon (smallest part)
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
689
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Sort comparator (reads the package variables $a and $b set by sort):
# orders features by typeorder() rank of their primary tag, then
# alphabetically by primary tag within the same rank.
sub sort_by_feattype {
    my ( $at, $bt ) = ( $a->primary_tag, $b->primary_tag );

    # '||' rather than 'or': with the low-precedence 'or' the statement
    # parsed as "(return RANK_CMP) or (TAG_CMP)", so the tag tie-breaker
    # was never evaluated.
    return ( typeorder($at) <=> typeorder($bt) )
        || ( $at cmp $bt );
    ## or ($a->name() cmp $b->name());
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
696
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Flush the features held back for the current gene model.
#
# Arguments: $out   - output handle (currently unused; printing happens
#                     later from @GFF_LINE_FEAT),
#            $gffio - GFF formatter object,
#            $held  - array ref of held features; it is emptied in place.
#
# The held features are sorted with sort_by_feattype and appended to the
# file-level @GFF_LINE_FEAT.  Returns an empty list in every case, which
# callers assign back to their hold array.
sub print_held {
    my ( $out, $gffio, $held ) = @_;
    return if !@$held;

    # put exons after mRNA, otherwise chado loader chokes
    @$held = sort sort_by_feattype @$held;

    while (@$held) {
        my $feature = shift @$held;
        last unless $feature;

        # The formatted string is not printed here; the call is kept
        # because the original made it for every held feature.
        my $gff_line = $gffio->gff_string($feature);
        push @GFF_LINE_FEAT, $feature;
    }

    return ();
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
708
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Rename feature tags according to the file-level %TAG_MAP (old tag name =>
# new tag name).  For each mapped tag present on the feature, its values
# are moved to the new tag name.
#
# For the 'note' tag, GO identifiers found in the values are also added as
# 'Ontology_term' values.  NCBI writes these as '[goid GO:0005886]' or
# '[goid 0005624]'; the bare-number form gets a 'GO:' prefix before the
# identifiers are extracted.
#
## should copy/move locus_tag to Alias, if not ID/Name/Alias already
# but see below /gene /locus_tag usage
sub maptags2gff {
    my ($feat) = @_;

    for my $old_tag ( keys %TAG_MAP ) {
        next unless $feat->has_tag($old_tag);

        my $new_tag = $TAG_MAP{$old_tag};
        my @values  = $feat->get_tag_values($old_tag);
        $feat->remove_tag($old_tag);
        $feat->add_tag_value( $new_tag, @values );

        next if $old_tag ne 'note';

        # Only the local copy is normalised; the values already stored on
        # the feature under the new tag keep their original text.
        s/\[goid (\d+)/\[goid GO:$1/g for @values;

        my @go_ids = map { m/(GO:\d+)/g } @values;
        $feat->add_tag_value( 'Ontology_term', @go_ids ) if @go_ids;
    }
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
731
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
732
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Make sure a sequence record has a source feature carrying the record-level
# metadata (ID, description, organism, comment, date, aliases, dbxrefs ...),
# creating and attaching one when the caller did not find one.
#
#   $seq         - sequence object (accession_number, annotation, desc, ...)
#   $source_type - feature type to use when nothing better can be derived
#   $sf          - existing source feature, or false to have one built
#
# Returns ($source_type, $sf) in list context, $source_type in scalar context.
# Reads $SOURCEID, $PROTEIN_TYPE and $DEBUG from the enclosing script.
sub getSourceInfo {
    my ($seq, $source_type, $sf) = @_;

    my $is_swiss = ($SOURCEID =~ /UniProt|swiss|trembl/i);
    my $is_gene  = ($SOURCEID =~ /entrezgene/i);
    my $is_rich  = (ref($seq) =~ /RichSeq/);
    my $seq_name = $seq->accession_number();

    unless ($sf) {    # make one
        $source_type = $is_swiss ? $PROTEIN_TYPE
                     : $is_gene  ? "eneg"    # "gene" # "region" #
                     : $is_rich  ? $seq->molecule
                     :             $source_type;
        $sf = Bio::SeqFeature::Generic->direct_new();

        my $len = $seq->length();
        $len = 1 if ($len < 1);
        my $start = 1;    ##$start= $len if ($len<1);

        # Direct method call instead of indirect-object "new Class(...)".
        my $loc = $seq->can('location')
            ? $seq->location()
            : Bio::Location::Simple->new( -start => $start, -end => $len );
        $sf->location($loc);
        $sf->primary_tag($source_type);
        $sf->source_tag($SOURCEID);
        $sf->seq_id($seq_name);
        #? $sf->display_name($seq->id()); ## Name or Alias ?
        $sf->add_tag_value( Alias => $seq->id() );    # unless id == accession
        $seq->add_SeqFeature($sf);
        ## $source_feat= $sf;
    }

    if ($sf->has_tag("chromosome")) {
        $source_type = "chromosome";
        my ($chrname) = $sf->get_tag_values("chromosome");
        ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
        ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
        $sf->add_tag_value( Alias => $chrname );
    }

    # pull GB Comment, Description for source ft ...
    # add reference - can be long, not plain string...
    warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()), "\n" if $DEBUG;
    # GenBank fields: keyword,comment,reference,date_changed
    # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM

    # is this just for main $seq object or for all seqfeatures ?
    my %AnnotTagMap = (
        'gene_name'     => 'Alias',
        'ALIAS_SYMBOL'  => 'Alias',    # Entrezgene
        'LOCUS_SYNONYM' => 'Alias',    #?
        'symbol'        => 'Alias',
        'synonym'       => 'Alias',
        'dblink'        => 'Dbxref',
        'product'       => 'product',
        'Reference'     => 'reference',
        'OntologyTerm'  => 'Ontology_term',
        'comment'       => 'Note',
        'comment1'      => 'Note',
        # various map-type locations
        # gene accession tag is named per source db !??
        # 'Index terms' => keywords ??
    );

    # Each lookup is done in list context and the first hit is kept.  The
    # earlier "get_Annotations(...) || ..." form evaluated the call in scalar
    # context, so it did not yield the annotation itself.
    my ($desc) = $seq->annotation->get_Annotations("desc");
    $desc = $seq->desc() unless $desc;

    # Fallback chain: "dates" annotation, then "update-date", then (rich
    # sequences only) get_dates().  Written as separate statements because
    # "a || b || $is_rich ? x : ()" parses as "(a || b || $is_rich) ? x : ()",
    # which called get_dates() on non-rich sequences and ignored a and b.
    my ($date) = $seq->annotation->get_Annotations("dates");
    ($date) = $seq->annotation->get_Annotations("update-date") unless $date;
    ($date) = $seq->get_dates() if !$date && $is_rich;

    my ($comment) = $seq->annotation->get_Annotations("comment");
    my ($species) = $seq->annotation->get_Annotations("species");
    if (   !$species
        && $seq->can('species')
        && defined $seq->species()
        && $seq->species()->can('binomial') )
    {
        $species = $seq->species()->binomial();
    }

    # update source feature with main GB fields
    $sf->add_tag_value( ID       => $seq_name ) unless $sf->has_tag('ID');
    $sf->add_tag_value( Note     => $desc )     if ($desc && !$sf->has_tag('Note'));
    $sf->add_tag_value( organism => $species )  if ($species && !$sf->has_tag('organism'));
    $sf->add_tag_value( comment1 => $comment )  if (!$is_swiss && $comment && !$sf->has_tag('comment1'));
    $sf->add_tag_value( date     => $date )     if ($date && !$sf->has_tag('date'));

    $sf->add_tag_value( Dbxref => $SOURCEID . ':' . $seq_name ) if $is_swiss || $is_gene;

    # Copy the mapped record-level annotations onto the source feature,
    # splitting multi-valued text on ',' or ';'.
    foreach my $atag (sort keys %AnnotTagMap) {
        my $gtag = $AnnotTagMap{$atag};
        next unless ($gtag);
        my @anno = map {
            if (ref $_ && $_->can('get_all_values')) {
                split( /[,;] */, join ";", $_->get_all_values )
            }
            elsif (ref $_ && $_->can('display_text')) {
                split( /[,;] */, $_->display_text )
            }
            elsif (ref $_ && $_->can('value')) {
                split( /[,;] */, $_->value )
            }
            else {
                ();
            }
        } $seq->annotation->get_Annotations($atag);
        foreach (@anno) { $sf->add_tag_value( $gtag => $_ ); }
    }

    #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
    #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
    #
    #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
    #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);

    return (wantarray) ? ($source_type, $sf) : $source_type;    #?
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
842
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
843
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Classify one feature as part of a gene model and attach the GFF3 linking
# attributes (ID, Parent, Derives_from) that tie it to the current gene.
#
#   $f          - the feature; its primary_tag decides which branch runs
#   $gene_id    - ID of the gene currently being assembled (may be false)
#   $genelinkID - gene identifier carried by the feature itself, if any; when
#                 set and different from $gene_id the feature is rejected
#
# Returns one of the GM_* constants:
#   GM_NEW_TOPLEVEL - feature starts a new model (a gene, or an RNA with no gene)
#   GM_NEW_PART     - feature was linked into the current model
#   GM_DUP_PART     - exon object already seen once before
#   GM_NOT_PART     - feature does not belong to the current model
#
# The primary tag is tested with unanchored regexes in this order: gene, mRNA,
# RNA|transcript, exon, CDS|protein|polypeptide, region, anything else.
#
# Uses state declared outside this sub: %method (per-type counter), $tnum and
# $rnum (transcript / ncRNA counters), $rna_id and $ncrna_id (ids of the most
# recent mRNA / other RNA), %exonpar, %seen, $idh, $CDSkeep, $PROTEIN_TYPE.
sub gene_features {
    my ($f, $gene_id, $genelinkID) = @_;
    local $_ = $f->primary_tag;   # lets the bare /.../ matches below test the type
    $method{$_}++;

    if ( /gene/ ) {
        $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
        # a new gene resets the per-gene transcript counters and RNA ids
        $tnum = $rnum= 0; $ncrna_id= $rna_id = '';
        return GM_NEW_TOPLEVEL;

    } elsif ( /mRNA/ ) {
        return GM_NOT_PART unless $gene_id;
        return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
        # mRNA id = gene id with the first "gene" replaced by "mRNA", plus ".t0<n>"
        ($rna_id = $gene_id ) =~ s/gene/mRNA/;
        $rna_id .= '.t0' . ++$tnum;
        $f->add_tag_value( ID => $rna_id );
        $f->add_tag_value( Parent => $gene_id );

    } elsif ( /RNA|transcript/) {
        ## misc_RNA here; missing exons ... flattener problem?
        # all of {t,nc,sn}RNA can have gene models now
        ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
        ## CDS needs to use mRNA, not misc_RNA, rna_id ...
        ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level

        if($gene_id) {
            return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
            # ncRNA id = gene id with the first "gene" replaced by "ncRNA", plus ".r0<n>"
            ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
            $ncrna_id .= '.r0' . ++$rnum;
            $f->add_tag_value( Parent => $gene_id );
            $f->add_tag_value( ID => $ncrna_id );
        } else {
            # no enclosing gene: give the RNA an ID of its own and promote it
            unless ($f->has_tag('ID')) {
                if($genelinkID) {
                    $f->add_tag_value( ID => $genelinkID ) ;
                } else {
                    $idh->generate_unique_persistent_id($f);
                }
            }
            ($ncrna_id)= $f->get_tag_values('ID');
            return GM_NEW_TOPLEVEL;
            # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
        }

    } elsif ( /exon/ ) { # can belong to any kind of RNA
        return GM_NOT_PART unless ($rna_id||$ncrna_id);
        return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
        ## we are getting duplicate Parents here, which chokes chado loader, with reason...
        ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
        ## check all Parents
        for my $expar ($rna_id, $ncrna_id) {
            next unless($expar);
            # skip a parent id that this exon already lists
            if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
                my @vals = $f->get_tag_values('Parent');
                next if (grep {$expar eq $_} @vals);
            }
            $exonpar{$expar}++;
            $f->add_tag_value( Parent => $expar);
            # last; #? could be both
        }
        # now we can skip cloned exons
        # dgg note: multiple parents get added and printed for each unique exon
        # %seen is keyed by the stringified feature reference
        return GM_DUP_PART if ++$seen{$f} > 1;

    } elsif ( /CDS|protein|polypeptide/ ) {
        return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
        return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
        # protein id = mRNA id with the first ".t" replaced by ".p"
        (my $pro_id = $rna_id) =~ s/\.t/\.p/;

        if( ! $CDSkeep && /CDS/) {
            # convert the CDS into a $PROTEIN_TYPE feature derived from the mRNA
            $f->primary_tag($PROTEIN_TYPE);

            ## duplicate problem is Location ..
            if ($f->location->isa("Bio::Location::SplitLocationI")) {
                # my($b,$e)=($f->start, $f->end); # is this all we need?
                # collapse the split location to one span from the smallest
                # sub-location start to the largest sub-location end
                my($b,$e)=(-1,0);
                foreach my $l ($f->location->each_Location) {
                    $b = $l->start if($b<0 || $b > $l->start);
                    $e = $l->end if($e < $l->end);
                }
                $f->location( Bio::Location::Simple->new(
                    -start => $b, -end => $e, -strand => $f->strand) );
            }

            $f->add_tag_value( Derives_from => $rna_id );
        }
        else {
            $f->add_tag_value( Parent => $rna_id );
        }

        $f->add_tag_value( ID => $pro_id );

        move_translation_fasta($f, $pro_id);
        #if( $f->has_tag('translation')) {
        #  my ($aa) = $f->get_tag_values("translation");
        #  $proteinfa{$pro_id}= $aa;
        #  $f->remove_tag("translation");
        #  $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
        #}
    } elsif ( /region/ ) {
        # NOTE(review): unlike the final branch, $gene_id is not checked here
        # before being used as Parent -- confirm this is intended
        $f->primary_tag('gene_component_region');
        $f->add_tag_value( Parent => $gene_id );
    } else {
        return GM_NOT_PART unless $gene_id;
        $f->add_tag_value( Parent => $gene_id );
    }

    ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;

    return GM_NEW_PART;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
955
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
956 ## was generic_features > add_generic_id
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Give a feature that is not handled as part of a gene model a GFF3 ID, and
# count it by type.
#
#   $f       - the feature
#   $ft_name - fallback ID when the feature has no tag named after its type
#   $flags   - string; if it contains "nocount" the per-type counter %method
#              is left alone (presumably because the feature was already
#              counted in gene_features(), which would double count it)
#
# ID choice, first match wins: an existing ID tag is kept; else
# "<type>:<first value of the tag named <type>>"; else $ft_name; else an id
# generated by $idh.  For types matching /CDS/ the translation is then moved
# out via move_translation_fasta().
sub add_generic_id {
    my ($f, $ft_name, $flags) = @_;
    my $type = $f->primary_tag;
    $method{$type}++ if $flags !~ /nocount/;

    if (!$f->has_tag('ID')) {
        if ($f->has_tag($type)) {
            my ($label) = $f->get_tag_values($type);
            $f->add_tag_value( ID => "$type:$label" );
        }
        elsif ($ft_name) {    # is this unique ?
            $f->add_tag_value( ID => $ft_name );
        }
        else {
            $idh->generate_unique_persistent_id($f);
        }
    }

    move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
        if ($type =~ /CDS/);

    # return $io->gff_string($f);
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
981
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Move a feature's protein translation out of its tags and into %proteinfa,
# keyed by the feature id, leaving a short "length.<n>" placeholder behind.
#
#   $f     - the feature
#   $ft_id - id to store the sequence under; nothing happens when it is false
#
# Nothing is changed when the feature has no translation tag, when its first
# translation value is empty, or when that value already starts with "length"
# (i.e. it was processed before).
sub move_translation_fasta {
    my ($f, $ft_id) = @_;
    return unless $ft_id && $f->has_tag('translation');

    my ($peptide) = $f->get_tag_values("translation");
    return if !$peptide || $peptide =~ /^length/;

    $proteinfa{$ft_id} = $peptide;
    $f->remove_tag("translation");

    # hack for odd chado gbl problem
    $f->add_tag_value("translation", "length." . length($peptide));
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
993
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Build the GFF header text for one sequence record.
#
#   $name        - sequence id used in ##sequence-region
#   $end         - sequence end coordinate (start is always 1)
#   $source_type - type of the source feature; defaults to "region"
#   $source_feat - source feature, if the record has one
#
# With a source feature, its organism / Note / date tags (first value each)
# are appended as "# key value" comment lines and to the info string, and the
# whole header is blanked if a header was already produced ($didheader).
# Without one, a source line for the sequence is appended to the header.
#
# Increments $didheader.  Returns ($head, $info) in list context and $head in
# scalar context; $info is "type:name" followed by ", value" per tag found.
sub gff_header {
    my ($name, $end, $source_type, $source_feat) = @_;
    $source_type = "region" unless $source_type;

    my $info = "$source_type:$name";
    my $head = join '',
        "##gff-version $GFF_VERSION\n",
        "##sequence-region $name 1 $end\n",
        "# conversion-by bp_genbank2gff3.pl\n";

    if (!$source_feat) {
        $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
    }
    else {
        ## dgg: these header comment fields are not useful when have multi-records, diff organisms
        foreach my $key (qw(organism Note date)) {
            next unless $source_feat->has_tag($key);
            my ($value) = $source_feat->get_tag_values($key);
            next unless $value;
            $head .= "# $key $value\n";
            $info .= ", $value";
        }
        $head = "" if $didheader;
    }

    $didheader++;
    return wantarray ? ($head, $info) : $head;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1021
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Run the gene unflattener over a sequence record and then map its feature
# types through map_types().
#
#   $seq - sequence object
#
# Unflattening errors are not fatal: they are reported with warn(), a
# "# Possible gene unflattening error ..." comment line is printed to the
# currently selected output handle, and processing continues.
#
# Returns 0 when no sequence was given or the sequence has no features;
# otherwise returns whatever map_types() returns.
# Uses $unflattener, $noinfer, $tm, $FTSOmap and $FTSOsynonyms from the
# enclosing script.
sub unflatten_seq {
    my $seq = shift;

    # Guard before the first method call on $seq; checking afterwards can
    # never trigger.
    return 0 unless $seq;

    ## print "# working on $source_type:", $seq->accession, "\n";
    my $accession = $seq->accession_number;

    # A space now separates "with" from the accession.
    my $uh_oh = "Possible gene unflattening error with $accession: consult STDERR\n";

    eval {
        $unflattener->unflatten_seq(
            -seq       => $seq,
            -noinfer   => $noinfer,
            -use_magic => 1
        );
    };

    # deal with unflattening errors; $@ is copied at once so that no later
    # call can overwrite it before it is reported
    if ( my $error = $@ ) {
        warn $accession . " Unflattening error:\n";
        warn "Details: $error\n";
        print "# " . $uh_oh;
    }

    return 0 if !$seq->all_SeqFeatures;

    # map feature types to the sequence ontology
    ## $tm->map_types_to_SO( -seq => $seq );
    #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
    return map_types(
        $tm,
        -seq       => $seq,
        -type_map  => $FTSOmap,
        -syn_map   => $FTSOsynonyms,
        -undefined => "region"
    );    #nml
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1057
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Drop unwanted feature types from a sequence and report its source features.
#
#   $seq - sequence object
#
# When the script-level $filter string is set, every top-level feature whose
# primary tag occurs in $filter (case-insensitive, literal substring) is
# removed from $seq; all others are re-attached in their original order.
# When $filter is not set, the sequence is left untouched.
#
# Returns the list of features whose primary tag is exactly 'source'
# (including any that were filtered out).
sub filter {
    my $seq = shift;
    ## return unless $filter;
    my @feats;
    my @sources;    # dgg; pick source features here; only 1 always?
    if ($filter) {
        for my $f ( $seq->remove_SeqFeatures ) {
            my $m = $f->primary_tag;
            push @sources, $f if ($m eq 'source');    # dgg? but leave in @feats ?

            # \Q...\E: the tag comes from the input file and must be matched
            # literally; unquoted, a tag containing regex metacharacters
            # either dies at match time or matches the wrong text.
            push @feats, $f unless $filter =~ /\Q$m\E/i;
        }
        $seq->add_SeqFeature($_) foreach @feats;
    } else {
        for my $f ( $seq->get_SeqFeatures ) {
            my $m = $f->primary_tag;
            push @sources, $f if ($m eq 'source');    # dgg? but leave in @feats ?
        }
    }

    return @sources;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1079
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1080
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1081 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1082 # changed to filter out cloned features. We have to implement the old
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1083 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Return every feature held by $holder as one flat list: each top-level
# feature (from get_SeqFeatures) is followed immediately by everything
# _add_flattened_SeqFeatures collects beneath it.
sub get_all_SeqFeatures {
    my ($holder) = @_;

    my @flat = map {
        my $top = $_;
        my @below;
        _add_flattened_SeqFeatures( \@below, $top );
        ( $top, @below );
    } $holder->get_SeqFeatures;

    return @flat;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1094
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Pick an identifier for a feature from its tags.
#
# Argument:
#   $g - object answering has_tag($tag) and get_tag_values($tag).
#
# Only the first tag that is present is consulted, in this order:
#   locus_tag, gene, ID   - first value is returned as-is
#                           (ID is for non-Genbank > Entrezgene)
#   product, transposon   - first value is returned unless it contains a
#                           space (then it is a description, not a name,
#                           and '' is returned)
# Returns '' when none of the tags is present.
#
# See Unflattener comment:
#  on rare occasions, records will have no /gene or /locus_tag
#  but it WILL have /product tags. These serve the same purpose
#  for grouping. For an example, see AY763288 (also in t/data)
#  eg. product=tRNA-Asp ;  product=similar to crooked neck protein
# dgg; transposon example:
#  ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
sub gene_name {
    my $g = shift;

    # Tags whose value is an identifier already.
    for my $tag (qw(locus_tag gene ID)) {
        next unless $g->has_tag($tag);
        my ($value) = $g->get_tag_values($tag);
        return $value;
    }

    # Tags that may hold free text; accept only single-word values.
    for my $tag (qw(product transposon)) {
        next unless $g->has_tag($tag);
        my ($label) = $g->get_tag_values($tag);
        return $label =~ / / ? '' : $label;
    }

    return '';
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1129
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1130 # same list as gene_name .. change tag to generic Name
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Give a feature a generic 'Name' tag derived from one of its other tags.
#
# Argument:
#   $g - object answering has_tag, get_tag_values, remove_tag and
#        add_tag_value.
#
# Only the first tag that is present is used, in this order:
#   gene, locus_tag      - first value becomes Name, the tag itself is
#                          removed, and the value is returned
#   product, transposon  - tag is kept; Name is set to the first value, or
#                          to '' when that value contains a space (a
#                          description, not a name); that string is returned
#   ID                   - tag is kept; Name is set to the first value, but
#                          '' is returned
# Returns '' when none of the tags is present.
#
# NOTE(review): a product/transposon value containing a space results in an
# empty-string Name being added -- confirm whether that is intended.
sub convert_to_name {
    my $g = shift;

    # Identifier tags are replaced by Name.
    for my $tag (qw(gene locus_tag)) {
        next unless $g->has_tag($tag);
        my ($id) = $g->get_tag_values($tag);
        $g->remove_tag($tag);
        $g->add_tag_value( 'Name', $id );
        return $id;
    }

    # Free-text tags are kept alongside Name.
    for my $tag (qw(product transposon)) {
        next unless $g->has_tag($tag);
        my ($label) = $g->get_tag_values($tag);
        my $id = $label =~ / / ? '' : $label;
        $g->add_tag_value( 'Name', $id );
        return $id;
    }

    if ( $g->has_tag('ID') ) {
        my ($label) = $g->get_tag_values('ID');
        $g->add_tag_value( 'Name', $label );
    }

    return '';
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1165
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1166
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Append every descendant of $feat to @$arrayref in depth-first order
# (each child, then that child's own descendants, then the next child).
# $feat itself is not appended.
#
# Children are read with get_SeqFeatures when the object isa
# Bio::FeatureHolderI, otherwise with sub_SeqFeature when it isa
# Bio::SeqFeatureI; any other object produces a warning and contributes
# no children.
sub _add_flattened_SeqFeatures {
    my ($arrayref, $feat) = @_;

    # Explicit work stack; each entry is [ node, is_starting_node ].
    my @pending = ( [ $feat, 1 ] );

    while (@pending) {
        my ($node, $is_start) = @{ pop @pending };
        push @$arrayref, $node unless $is_start;

        my @children = ();
        if ( $node->isa("Bio::FeatureHolderI") ) {
            @children = $node->get_SeqFeatures;
        }
        elsif ( $node->isa("Bio::SeqFeatureI") ) {
            @children = $node->sub_SeqFeature;
        }
        else {
            warn ref($node)." is neither a FeatureHolderI nor a SeqFeatureI. ".
                "Don't know how to flatten.";
        }

        # Reverse so the first child is popped (and therefore visited) first.
        push @pending, map { [ $_, 0 ] } reverse @children;
    }

}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1188
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Rewrite the primary tag of one feature, or of every feature of a
# sequence, to a mapped type name.
#
# Called with an object ($self) that provides _rearrange, throw and
# typemap, followed by named arguments:
#   -feature   => a single feature to map
#   -seq       => a Bio::SeqI; all of its features are mapped
#                 (at least one of -feature / -seq is required)
#   -type_map  => hashref primary_tag => name | [name, SO id] | CODE ref;
#                 defaults to $self->typemap
#   -syn_map   => hashref of quoted synonym => arrayref of names, used for
#                 guessing when the type map gives nothing
#   -undefined => name to use when nothing else is found
#
# Per feature:
#   1. look the primary tag up in the type map (resolving [name, id]
#      through $ONTOLOGY when one is loaded, or calling a CODE ref);
#   2. when $CONF is set, evaluate the rules in $YAML;
#   3. when there is still no type and a synonym map was given, guess from
#      the feature's 'note' values (exact synonym match, then
#      SO_fuzzy_match scoring), optionally via manual_curation when
#      $MANUAL is set, and fall back to -undefined.
# Throws through $self->throw on bad arguments or unsupported map entries.
# Side effects: may modify the type map in place, sets @RETURN, and prints
# each guess.
sub map_types {

    my ($self, @args) = @_;

    my($sf, $seq, $type_map, $syn_map, $undefmap) =
        $self->_rearrange([qw(FEATURE
                              SEQ
                              TYPE_MAP
                              SYN_MAP
                              UNDEFINED
                              )],
                          @args);

    if (!$sf && !$seq) {
        $self->throw("you need to pass in either -feature or -seq");
    }

    my @sfs = ($sf);
    if ($seq) {
        $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
        @sfs = $seq->get_all_SeqFeatures;
    }
    $type_map = $type_map || $self->typemap; # dgg: was type_map;
    foreach my $feat (@sfs) {

        $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
        $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");

        my $primary_tag = $feat->primary_tag;

        #if ($primary_tag =~ /^pseudo(.*)$/) {
        #    $primary_tag = $1;
        #    $feat->primary_tag($primary_tag);
        #}

        my $mtype = $type_map->{$primary_tag};
        if ($mtype) {
            if (ref($mtype)) {
                if (ref($mtype) eq 'ARRAY') {
                    my $soID;
                    ($mtype, $soID) = @$mtype;

                    if ($soID && ref($ONTOLOGY)) {
                        my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
                        $mtype = $term->name if $term;
                    }
                    # if SO ID is undefined AND we have an ontology to search, we want to delete
                    # the feature type hash entry in order to force a fuzzy search
                    elsif (! defined $soID && ref($ONTOLOGY)) {
                        undef $mtype;
                        delete $type_map->{$primary_tag};
                    }
                    elsif ($undefmap && $mtype eq 'undefined') { # dgg
                        $mtype= $undefmap;
                    }

                    # cache the resolved name so later features skip the lookup
                    $type_map->{$primary_tag} = $mtype if $mtype;
                }
                elsif (ref($mtype) eq 'CODE') {
                    $mtype = $mtype->($feat);
                }
                else {
                    $self->throw('must be scalar or CODE ref');
                }
            }
            elsif ($undefmap && $mtype eq 'undefined') { # dgg
                $mtype= $undefmap;
            }
            $feat->primary_tag($mtype);
        }

        if ($CONF) {
            conf_read();
            my %perfect_matches;
            while (my ($p_tag,$rules) = each %$YAML) {
              RULE:
                for my $rule (@$rules) {
                    for my $tags (@$rule) {
                        # Iterate with keys(), not each(): 'next RULE' leaves
                        # this loop early, and an abandoned each() iterator
                        # would make a later pass over the same hash start
                        # part-way through and skip entries.
                        for my $tag (keys %$tags) {
                            my $values = $tags->{$tag};
                            for my $value (@$values) {
                                if ($feat->has_tag($tag)) {
                                    for ($feat->get_tag_values($tag)) {
                                        next RULE unless $_ =~ /\Q$value\E/;
                                    }
                                } elsif ($tag eq 'primary_tag') {
                                    next RULE unless $value eq
                                        $feat->primary_tag;
                                } elsif ($tag eq 'location') {
                                    next RULE unless $value eq
                                        $feat->start.'..'.$feat->end;
                                } else { next RULE }
                            }
                        }
                    }
                    # every condition of this rule held
                    $perfect_matches{$p_tag}++;
                }
            }
            # NOTE(review): a type chosen here is only stored in $mtype; the
            # feature's primary_tag is not updated afterwards in this sub
            # (the block below is skipped once $mtype is set) -- confirm intent.
            if (scalar(keys %perfect_matches) == 1) {
                $mtype = $_ for keys %perfect_matches;
            } elsif (scalar(keys %perfect_matches) > 1) {
                warn "There are conflicting rules in the config file for the" .
                    " following types: ";
                warn "\t$_\n" for keys %perfect_matches;
                warn "Until conflict resolution is built into the converter," .
                    " you will have to manually edit the config file to remove the" .
                    " conflict. Sorry :(. Skipping user preference for this entry";
                sleep(2);
            }
        }

        if ( ! $mtype && $syn_map) {
            if ($feat->has_tag('note')) {

                # candidates indexed as $all_matches[plus][minus] = [names];
                # filled by SO_fuzzy_match, best scores at the end
                my @all_matches;

                my @note = $feat->each_tag_value('note');

                for my $k (keys %$syn_map) {

                    if ($k =~ /"(.+)"/) {

                        my $syn = $1;

                        for my $note (@note) {

                            # look through the notes to see if the description
                            # is an exact match for synonyms
                            if ( $syn eq $note ) {

                                my @map = @{$syn_map->{$k}};

                                my $best_guess = $map[0];

                                # $all_matches[-1] cannot be autovivified on an
                                # empty array (fatal "Modification of
                                # non-creatable array value"), so make sure a
                                # top bucket exists before putting the exact
                                # match at its front.
                                push @all_matches, [] unless @all_matches;
                                unshift @{$all_matches[-1]}, [$best_guess];

                                $mtype = $MANUAL
                                    ? manual_curation($feat, $best_guess, \@all_matches)
                                    : $best_guess;

                                print '#' x 78 . "\nGuessing the proper SO term for GenBank"
                                    . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
                                    . '#' x 78 . "\n\n";

                            } else {
                                # check both primary tag and and note against
                                # SO synonyms for best matching description

                                SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
                            }

                        }
                    }
                }

                #unless ($mtype) {
                for my $note (@note) {
                    for my $name (values %$type_map) {
                        # check primary tag against SO names for best matching
                        # descriptions //NML also need to check against
                        # definition && camel case split terms

                        SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
                    }
                }
                #}

                if (scalar(@all_matches) && !$mtype) {

                    my $top_matches = first { defined $_ } @{$all_matches[-1]};

                    my $best_guess = $top_matches->[0];

                    # if guess has quotes, it is a synonym term. we need to
                    # look up the corresponding name term
                    # otherwise, guess is a name, so we can use it directly
                    if ($best_guess =~ /"(.+)"/) {

                        $best_guess = $syn_map->{$best_guess}->[0];

                    }

                    @RETURN = @all_matches;
                    $mtype = $MANUAL
                        ? manual_curation($feat, $best_guess, \@all_matches)
                        : $best_guess;

                    print '#' x 78 . "\nGuessing the proper SO term for GenBank"
                        . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
                        . '#' x 78 . "\n\n";

                }
            }
            $mtype ||= $undefmap;
            $feat->primary_tag($mtype);
        }
    }


}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1391
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Score how well a candidate SO term fits a feature and file it by score.
#
# Positional arguments:
#   $candidate        - value to record when the score is non-zero
#   $primary_tag      - the feature's primary tag; split on space/underscore
#                       and then into camelCase pieces
#   $note             - one note string; split on space/underscore, with
#                       the characters ; : . , removed from each word
#   $SO_terms         - SO name or synonym text; split on space/underscore
#   $best_matches_ref - arrayref that receives the result
#   $modifier         - optional text prefixed to each match pattern
#                       (defaults to '')
#
# Every unique SO word is tested against every unique feature word; a
# feature word found inside the SO word counts as a plus, anything else as
# a minus. When there is at least one plus, $candidate is pushed onto
# $best_matches_ref->[plus][minus].
sub SO_fuzzy_match {

    my $candidate        = shift;
    my $primary_tag      = shift;
    my $note             = shift;
    my $SO_terms         = shift;
    my $best_matches_ref = shift;
    my $modifier         = shift;

    $modifier ||= '';

    my @feat_terms;

    for ( split(" |_", $primary_tag) ) {
        #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
        my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
        push @feat_terms, @camelCase;
    }

    for ( split(" |_", $note) ) {
        (my $word = $_) =~ s/[;:.,]//g;

        # Consecutive separators or punctuation-only words leave an empty
        # string. With an empty $modifier that would make the match pattern
        # below empty, which Perl interprets as "reuse the last successful
        # pattern" and so scores unpredictably. Skip such words.
        next unless length $word;

        push @feat_terms, $word;
    }

    my @SO_terms = split(" |_", $SO_terms);

    # fuzzy match works on a simple point system. When 2 words match,
    # the $plus counter adds one. When they don't, the $minus counter adds
    # one. This is used to sort similar matches together. Better matches
    # are found at the end of the array, near the top.

    # NML: can we improve best match by using synonym tags
    # EXACT,RELATED,NARROW,BROAD?

    my ($plus, $minus) = (0, 0);

    #unique terms
    my %feat_terms = map { ($_ => 1) } @feat_terms;
    my %SO_terms   = map { ($_ => 1) } @SO_terms;

    for my $st (keys %SO_terms) {
        for my $ft (keys %feat_terms) {

            if ($st =~ m/$modifier\Q$ft\E/) {
                $plus++;
            }
            else {
                $minus++;
            }

        }
    }

    push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;

}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1448
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Ask the user to pick an SO term for one GenBank feature.
#
#   $feat         feature being curated (passed through to options_cycle)
#   $default_opt  best-guess term; returned when the user skips or
#                 switches to automatic mode
#   $all_matches  array ref of match buckets, walked as [outer][inner]
#                 lists of candidate strings
#
# Returns the chosen term name, or $default_opt for 'skip'/'auto'.
# Choosing 'auto' also clears the global $MANUAL.
sub manual_curation {

    my ($feat, $default_opt, $all_matches) = @_;

    # Work on a shallow copy: iterating with foreach can autovivify empty
    # outer slots, and that should not leak into the caller's array.
    my @buckets = @$all_matches;

    # Candidates containing a double-quoted part are looked up in
    # $SYN_MAP and replaced by the names stored there; anything else is
    # kept as is. Several candidates can lead to the same name, so each
    # name is only kept the first time it is seen.
    my (@so_names, %seen);
    for my $outer (reverse @buckets) {
        for my $inner (@$outer) {
            for my $candidate (@$inner) {
                if ($candidate =~ /"(.+)"/) {
                    for my $name (@{ $SYN_MAP->{$candidate} }) {
                        push @so_names, $name unless $seen{$name}++;
                    }
                    next;
                }
                push @so_names, $candidate unless $seen{$candidate}++;
            }
        }
    }

    my $s = scalar(@so_names);

    # Command help shown next to the entry; "\r" separates the lines.
    my $more = join '', map { "$_\r" } (
        "[a]uto : automatic input (selects best guess for remaining entries)",
        "[f]ind : search for other SO terms matching your query (e.g. f gene)",
        "[i]nput : add a specific term",
        "[r]eset : reset to the beginning of matches",
        "[s]kip : skip this entry (selects best guess for this entry)",
    );

    # Paging commands are only offered when there is more than one page.
    if ($s > OPTION_CYCLE) {
        $more .= "[n]ext : view the next " . OPTION_CYCLE . " terms\r"
               . "[p]rev : view the previous " . OPTION_CYCLE . " terms";
    }

    my $msg =
        "The converter found $s possible matches for the following GenBank entry: ";

    my $directions =
          "Type a number to select the SO term that best matches"
        . " the genbank entry, or use any of the following options:\r"
        . '_' x 76
        . "\r$more";

    # Resolve every name to its ontology term(s) first, then build one
    # numbered option record per term:
    #   [ number, name, definition-or-'none', \%details, synonyms... ]
    my @terms = map { $ONTOLOGY->find_terms(-name => $_) } @so_names;

    my $choice = 0;
    my @options;
    for my $term (@terms) {
        my %term = (
            name    => [ $term->name ],
            def     => [ $term->definition ],
            synonym => [ $term->each_synonym ],
        );
        push @options, [
            ++$choice,
            $term->name,
            ($term->definition || 'none'),
            \%term,
            $term->each_synonym,
        ];
    }

    my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
                               $default_opt, @options);

    return $default_opt if $option eq 'skip';

    if ($option eq 'auto') {
        $MANUAL = 0;
        return $default_opt;
    }

    return $option;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1524
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Show one page of candidate SO terms for a GenBank feature and read the
# user's command from STDIN.
#
#   $start, $stop  requested window into @opt (normalised below)
#   $msg           headline printed above the entry
#   $feat          feature being curated
#   $directions    help text printed beside the entry
#   $best_guess    default term, shown in the page header
#   @opt           option records; element [1] is the term name that is
#                  returned when the user types that option's number
#
# Returns a term name, or one of the command words 'skip' / 'auto'.
# Paging (n/p), reset (r) and find (f) recurse and return whatever the
# nested call returns. When STDIN reaches end-of-file before a decision
# is made, 'skip' is returned so the caller falls back to its default
# instead of receiving a value that is not a command.
#
# All reads use lexical line variables rather than the implicit $_, so
# the caller's $_ is left untouched.
sub options_cycle {

    my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;

    #NML: really should only call GenBank_entry once. Will need to change
    #method to return array & shift off header
    my $entry = GenBank_entry($feat, "\r");

    my $total = scalar(@opt);

    # Normalise the requested window: wrap around at either end and never
    # run past the last option.
    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ($start < 0) && ($stop > 0) );

    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);

    ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
    ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;

    $stop = $total if $stop > $total;

    # The ^<<< fields of a format consume the text they print from the
    # variable (see perlform), so the format is given copies.
    # NOTE(review): the picture lines below are reproduced exactly as
    # found; column alignment depends on the exact width of each literal.
    my $dir_copy = $directions;
    my $msg_copy = $msg;
    my $format = "format STDOUT = \n" .
        '-' x 156 . "\n" .
        '^' . '<' x 77 . '| Available Commands:' . "\n" .
        '$msg_copy' . "\n" .
        '-' x 156 . "\n" .
        ' ' x 78 . "|\n" .
        '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
        '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
        (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
        ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;

    print '-' x 156 . "\n" .
        'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
        " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
        "\n" . '-' x 156 . "\n";

    # Options are laid out two per row: left column, then right column.
    for (my $i = $start; $i < $stop; $i+=2) {

        my ($left, $right) = @opt[$i,$i+1];

        # Only number, name and definition are referenced (by name) from
        # the format text below.
        my ($nL, $nmL, $descL) = @$left;

        #odd numbered lists can cause fatal undefined errors, so check
        #to make sure we have data

        my ($nR, $nmR, $descR) = ref($right) ? @$right : (undef, undef, undef);

        my $row_format = "format STDOUT = \n";

        $row_format .=
            ' ' x 78 . "|\n" .

            '@>>: name: ^' . '<' x 64 . '~' . ' |' .
            ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
            '$nL,' . ' ' x 7 . '$nmL,' .
            ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .

            ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$nmL,' .
            (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
            #' ' x 78 . '|' . "\n" .


            ' def: ^' . '<' x 65 . ' |' .
            (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .


            (' ^' . '<' x 65 . ' |' .
            (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .


            ' ^' . '<' x 61 . '...~ |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

            ".\n";

        {
            # eval throws redefined warning that breaks formatting.
            # Turning off warnings just for the eval to fix this.
            no warnings 'redefine';
            eval $row_format;
        }
        write;

    }
    print '-' x 156 . "\nenter a command:";

    while (defined(my $line = <STDIN>)) {

        (my $input = $line) =~ s/\s+$//;

        if ($input =~ /^\d+$/) {
            # Option numbers are 1-based; "0" and out-of-range numbers
            # are rejected.
            if ( $input && defined $opt[$input-1] ) {
                return $opt[$input-1]->[1]
            } else {
                print "\nThat number is not an option. Please enter a valid number.\n:";
            }
        } elsif ( $input =~ /^n/i || $input =~ /next/i ) {
            return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
                                 $feat, $directions, $best_guess, @opt)
        } elsif ( $input =~ /^p/i || $input =~ /prev/i ) {
            return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
                                 $feat, $directions, $best_guess, @opt)
        } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
        } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
        } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
            return manual_curation($feat, $best_guess, \@RETURN );
        } elsif ( $input =~ /^f/i || $input =~ /find/i ) {

            my ($query, @query_results);

            if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
            } else {

                #do a SO search
                print "Type your search query\n:";
                my $query_line = <STDIN>;
                # no more input: let the caller use its default
                return 'skip' unless defined $query_line;
                chomp($query = $query_line);
            }

            for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
                SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
            }

            return manual_curation($feat, $best_guess, \@query_results);

        } elsif ( $input =~ /^i/i || $input =~ /input/i ) {

            #NML fast input for later
            #my $query;
            #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };

            #manual input
            print "Type the term you want to use\n:";
            while (defined(my $term_line = <STDIN>)) {
                chomp(my $term = $term_line);

                # A term present in $TYPE_MAP needs no confirmation.
                return _prompt_curation_save($feat, $term)
                    if $TYPE_MAP->{$term};

                print "\"$term\" doesn't appear to be a valid SO term. Are ".
                      "you sure you want to use it? (y or n)\n:";

                while (defined(my $answer = <STDIN>)) {

                    chomp $answer;

                    if ($answer eq 'y') {
                        return _prompt_curation_save($feat, $term);
                    } elsif ($answer eq 'n') {
                        # back to the same page of options
                        return options_cycle($start, $stop, $msg, $feat,
                                             $directions, $best_guess, @opt)
                    } else {
                        print "\nDidn't recognize that command. Please " .
                              "type y or n.\n:"
                    }
                }
            }
        } else {
            print "\nDidn't recognize that command. Please re-enter your choice.\n:"
        }
    }

    # STDIN ran out before the user decided anything.
    return 'skip';
}

# Ask whether the user's term choice should be remembered, and return the
# term. On 'y' the feature and term are handed to curation_save first; on
# 'n', or when STDIN ends without an answer, nothing is saved.
sub _prompt_curation_save {

    my ($feat, $term) = @_;

    print
        "\nWould you like to save your preference for " .
        "future use (so you don't have to redo manual " .
        "curation for this feature everytime you run " .
        "the converter)? (y or n)\n";

    while (defined(my $answer = <STDIN>)) {

        chomp $answer;

        if ($answer eq 'y') {
            curation_save($feat, $term);
            return $term;
        } elsif ($answer eq 'n') {
            return $term
        } else {
            print "\nDidn't recognize that command. Please " .
                  "type y or n.\n:"
        }
    }

    return $term;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1755
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Render a feature as text laid out like a GenBank feature-table entry.
#
#   $f          feature object; primary_tag, start, end, all_tags and
#               each_tag_value are called on it
#   $delimiter  line terminator, "\n" when not given (or false)
#   $num        when true, the header carries "[1]"/"[2]" markers and the
#               qualifier lines are appended by words_tag instead
#
# Returns the entry as a single string.
sub GenBank_entry {
    my ($f, $delimiter, $num) = @_;

    $delimiter = "\n" unless $delimiter;

    my $key      = $f->primary_tag;
    my $location = $f->start . '..' . $f->end;

    if ($num) {
        # Numbered form: words_tag receives a reference to the header
        # and extends it in place.
        my $entry = ' [1] ' . $key
                  . ' ' x (12 - length $key) . ' [2] '
                  . $location
                  . $delimiter;
        words_tag($f, \$entry);
        return $entry;
    }

    # Plain form: header line, then one indented qualifier line per value.
    my $header = ' ' x 5 . $key
               . ' ' x (15 - length $key)
               . $location
               . $delimiter;

    my @qualifier_lines;
    for my $tag ($f->all_tags) {
        for my $val ( $f->each_tag_value($tag) ) {
            # '_no_value' marks a qualifier that is printed without "=..."
            my $qualifier = $val eq '_no_value'
                          ? "/$tag"
                          : "/$tag=\"$val\"";
            push @qualifier_lines, ' ' x 20 . $qualifier . $delimiter;
        }
    }

    return $header . join('', @qualifier_lines);
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1790
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1791
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Index the given features by their Parent and ID tag values and run the
# validators over those indexes.
#
#   @_  list of feature objects
#
# parent_validate is called once per referenced parent ID, but only when
# $SO_FILE is set; id_validate is always called, and its result is what
# this sub returns.
sub gff_validate {
    warn "Validating GFF file\n" if $DEBUG;
    my @feat = @_;

    my (%parent2child, %all_ids, %descendants, %reserved);

    # tag name => index that collects the features carrying that value
    my @indexes = (
        [ 'Parent', \%parent2child ],
        [ 'ID',     \%all_ids ],
    );

    for my $f (@feat) {
        for my $index (@indexes) {
            my ($tag, $features_by_value) = @$index;
            next unless $f->has_tag($tag);

            for my $value ($f->get_tag_values($tag)) {
                push @{ $features_by_value->{$value} }, $f;
            }
        }
    }

    if ($SO_FILE) {
        for my $parentID (keys %parent2child) {
            parent_validate($parentID, $parent2child{$parentID},
                            \%all_ids, \%descendants, \%reserved);
        }
    }

    return id_validate(\%all_ids, \%reserved);
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1813
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1814 sub parent_validate {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1815 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1816
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1817 my $aParents = $hAll->{$parentID};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1818
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1819 map {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1820 my $child = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1821 $child->add_tag_value( validation_error =>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1822 "feature tried to add Parent tag, but no Parent found with ID $parentID"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1823 );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1824 my %parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1825 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1826 delete $parents{$parentID};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1827 my @parents = keys %parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1828
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1829 $child->remove_tag('Parent');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1830
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1831 unless ($child->has_tag('ID')) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1832 my $id = gene_name($child);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1833 $child->add_tag_value('ID', $id);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1834 push @{$hAll->{$id}}, $child
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1835 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1836
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1837 $child->add_tag_value('Parent', @parents) if @parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1838
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1839 } @$aChildren and return unless scalar(@$aParents);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1840
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1841 my $par = join(',', map { $_->primary_tag } @$aParents);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1842 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1843
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1844 #NML manual curation needs to happen here
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1845
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1846
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1847 my %parentsToRemove;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1848
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1849 CHILD:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1850 for my $child (@$aChildren) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1851 my $childType = $child->primary_tag;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1852
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1853 warn "WORKING ON $childType at ".$child->start.' to '.$child->end
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1854 if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1855
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1856 for (my $i = 0; $i < scalar(@$aParents); $i++) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1857 my $parent = $aParents->[$i];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1858 my $parentType = $parent->primary_tag;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1859
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1860 warn "CHECKING $childType against $parentType" if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1861
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1862 #cache descendants so we don't have to do repeat searches
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1863 unless ($hDescendants->{$parentType}) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1864
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1865 for my $term ($ONTOLOGY->find_terms(
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1866 -name => $parentType
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1867 ) ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1868
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1869 map {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1870 $hDescendants->{$parentType}{$_->name}++
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1871 } $ONTOLOGY->get_descendant_terms($term);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1872
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1873 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1874
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1875 # NML: hopefully temporary fix.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1876 # SO doesn't consider exon/CDS to be a child of mRNA
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1877 # even though common knowledge dictates that they are
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1878 # This cheat fixes the false positives for now
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1879 if ($parentType eq 'mRNA') {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1880 $hDescendants->{$parentType}{'exon'} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1881 $hDescendants->{$parentType}{'CDS'} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1882 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1883
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1884 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1885
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1886 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1887 " be a child of $parentType?" if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1888 if ($hDescendants->{$parentType}{$childType}) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1889 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1890
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1891 #NML need to deal with multiple children matched to multiple different
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1892 #parents. This model only assumes the first parent id that matches a child will
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1893 #be the reserved feature.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1894
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1895 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1896 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1897
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1898 #mark parent for later removal from all IDs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1899 #so we don't accidentally change any parents
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1900
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1901 $parentsToRemove{$i}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1902
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1903 next CHILD;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1904 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1905 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1906
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1907
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1908
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1909 #NML shouldn't have to check this; somehow child can lose Parent
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1910 #it's happening W3110
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1911 #need to track this down
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1912 if ( $child->has_tag('Parent') ) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1913
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1914 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1915 if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1916
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1917 my %parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1918
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1919 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1920
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1921 delete $parents{$parentID};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1922 my @parents = keys %parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1923
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1924 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1925 ' to ' . $child->end . " cannot be a child of ID $parentID"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1926 if $DEBUG;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1927
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1928 $child->add_tag_value( validation_error =>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1929 "feature cannot be a child of $parentID");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1930
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1931 $child->remove_tag('Parent');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1932
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1933 unless ($child->has_tag('ID')) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1934 my $id = gene_name($child);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1935 $child->add_tag_value('ID', $id);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1936 push @{$hAll->{$id}}, $child
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1937 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1938
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1939 $child->add_tag_value('Parent', @parents) if @parents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1940 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1941
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1942 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1943
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1944 #delete $aParents->[$_] for keys %parentsToRemove;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1945 splice(@$aParents, $_, 1) for keys %parentsToRemove;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1946 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1947
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# id_validate(\%all, \%reserved)
#
# Resolves ID collisions between features.
#
#   $hAll      - hash ref: ID => array ref of the features carrying that ID.
#   $hReserved - hash ref: parent ID => { key => { parent   => feature,
#                                                  children => [features] } }.
#
# Pass 1: for each ID, one feature is allowed to keep it (the first in the
# list, which is removed from the list unless the ID is reserved); every
# feature left in the list is handed to id_uniquify().
#
# Pass 2: for each reserved parent ID, the first candidate returned by keys()
# keeps the ID. Every other candidate parent is renamed through id_uniquify()
# and the 'Parent' tag of its children is rewritten to the value that
# id_uniquify() returned.
sub id_validate {
    my ($hAll, $hReserved) = @_;

    foreach my $id (keys %$hAll) {
        my $features = $hAll->{$id};

        # One feature may own this ID outright; a reserved ID is left alone.
        shift @$features unless $hReserved->{$id};

        foreach my $feature (@$features) {
            id_uniquify(0, $id, $feature, $hAll);
        }
    }

    foreach my $parentID (keys %$hReserved) {
        my $claims = $hReserved->{$parentID};

        # The first key keeps the original ID; the others are duplicates.
        my (undef, @duplicates) = keys %$claims;

        foreach my $key (@duplicates) {
            my ($parent, $aChildren) = @{ $claims->{$key} }{qw(parent children)};

            my $newID = id_uniquify(0, $parentID, $parent, $hAll);

            foreach my $child (@$aChildren) {
                # Swap the old parent ID for the new one, keeping any others.
                my %seen = map { ($_ => 1) } $child->get_tag_values('Parent');
                $child->remove_tag('Parent');
                delete $seen{$parentID};
                $seen{$newID} = 1;
                $child->add_tag_value('Parent', keys %seen);
            }
        }
    }
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1989
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# id_uniquify($count, $value, $feat, \%all)
#
# Gives $feat an ID that is not yet a key of %$hAll and returns that ID.
#
#   $count - attempt counter; callers in this file pass 0 for a fresh request.
#   $value - the ID to start from.
#   $feat  - feature object (uses add_tag_value, primary_tag, has_tag,
#            remove_tag).
#   $hAll  - hash ref: ID => array ref of features; the new ID is registered
#            here.
#
# Candidates are tried in this order: "<value>.<primary_tag>", then
# "<value>.<primary_tag>.1", ".2", ... On the first attempt the starting
# value is also recorded on the feature as an 'Alias' tag.
#
# Fixes relative to the previous recursive version:
#   * the ID actually assigned is returned (the old code returned the first
#     candidate even when it was already taken);
#   * the feature, not the ID string, is stored in %$hAll;
#   * the whole numeric suffix is replaced, so counters above 10 no longer
#     produce suffixes such as ".111";
#   * remove_tag('ID') is only called when the feature has an ID tag.
sub id_uniquify {
    my ($count, $value, $feat, $hAll) = @_;

    while (1) {
        warn "UNIQUIFYING $value" if $DEBUG;

        if (!$count) {
            $feat->add_tag_value(Alias => $value);
            $value .= '.' . $feat->primary_tag;
        }
        elsif ($count == 1) {
            $value .= ".$count";
        }
        else {
            # Replace the previous counter; append if there is none.
            $value =~ s/\.\d+\z/.$count/ or $value .= ".$count";
        }
        $count++;

        warn "ENDED UP WITH $value" if $DEBUG;
        last unless $hAll->{$value};
        warn "$value IS ALREADY TAKEN" if $DEBUG;
    }

    $feat->remove_tag('ID') if $feat->has_tag('ID');
    $feat->add_tag_value('ID', $value);
    push @{ $hAll->{$value} }, $feat;

    return $value;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2019
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# conf_read()
#
# Loads the curation config named by the global $CONF into the global $YAML
# using LoadFile(). The file must be both readable and writable; for each
# test that fails a message is printed and conf_locate() is called so the
# user can supply another file.
sub conf_read {

    if (!-r $CONF) {
        conf_locate()
            if print "\nCannot read $CONF. Change file permissions and retry, " .
                     "or enter another file\n";
    }

    if (!-w $CONF) {
        conf_locate()
            if print "\nCannot write $CONF. Change file permissions and retry, " .
                     "or enter another file\n";
    }

    $YAML = LoadFile($CONF);
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2031
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# conf_create($path, $input)
#
# Creates an empty config file at $input, makes it the active config ($CONF)
# and loads it with conf_read().
#
#   $path  - directory part of $input; it must be writable.
#   $input - full path of the file to create.
#
# If the directory is not writable, or the file cannot be created, the user
# is sent back to conf_locate() and this call returns without touching $CONF.
# Previously the sub carried on after conf_locate() returned and overwrote
# the config the user had just chosen with the rejected path; the open() was
# also unchecked.
sub conf_create {

    my ($path, $input) = @_;

    unless (-w $path) {
        print "Cannot write to $path. Change directory permissions and retry " .
              "or enter another save path\n";
        conf_locate();
        return;
    }

    # Create (or truncate) the file so that conf_read() finds it.
    my $fh;
    unless (open($fh, '>', $input)) {
        print "Cannot create $input: $!\n";
        conf_locate();
        return;
    }
    close($fh);

    $CONF = $input;

    return conf_read();
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2047
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# conf_write()
#
# Writes the global $YAML structure to the file named by the global $CONF
# using DumpFile().
sub conf_write {
    return DumpFile($CONF, $YAML);
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2049
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# conf_locate()
#
# Interactively asks (on STDIN/STDOUT) for the path of a config file until a
# usable answer is given or STDIN is exhausted.
#
#   * an existing non-directory  -> becomes $CONF and is loaded by conf_read()
#   * a non-existing file name   -> handed to conf_create()
#   * an existing directory      -> rejected, the user is asked again
#   * anything else              -> rejected, the user is asked again
#
# The input line is read into a lexical variable, so the caller's $_ is no
# longer overwritten by the read loop.
sub conf_locate {

    print "\nEnter the location of a previously saved config, or a new " .
          "path and file name to create a new config (this step is " .
          "necessary to save any preferences)";

    print "\n\nenter a command:";

    while (defined(my $input = <STDIN>)) {
        chomp $input;

        my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
        my $exists = -e $input;
        my $is_dir = -d $input;

        if ($exists && !$is_dir) {

            print "\nReading $input...\n";
            $CONF = $input;

            conf_read();
            last;

        }
        elsif (!$is_dir && $fn.$suffix) {

            print "Creating $input...\n";
            conf_create($path, $input);
            last;

        }
        elsif ($is_dir) {
            print "You only entered a directory. " .
                  "Please enter BOTH a directory and filename\n";
        }
        else {
            print "$input does not appear to be a valid path. Please enter a " .
                  "valid directory and filename\n";
        }

        print "\nenter a command:";
    }

    return;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2086
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# curation_save($feat, $input)
#
# Interactively records which tags of $feat are evidence for the term $input
# and appends that rule to the curation config.
#
#   $feat  - the feature being curated (passed to GenBank_entry() and
#            words_tag()).
#   $input - the term the user entered for this feature.
#
# Steps:
#   1. make sure a config is loaded (conf_locate() / conf_read());
#   2. print the GenBank entry next to the directions using a run-time
#      generated STDOUT format;
#   3. read a selection such as "1,3,5-7", "3a,b" or "all" from STDIN and
#      collect the chosen tag/value pairs in %$final;
#   4. once something is selected, ask for confirmation: 'y' saves, 'n'
#      starts over, anything else repeats the question;
#   5. push the rule onto $YAML->{$input} and write the config.
#
# Fixes relative to the previous version:
#   * an empty selection is now reported as invalid (the old test was on the
#     hash reference itself, which is always true);
#   * 'n' restarts and then returns, so the rejected selection can no longer
#     be saved by the outer loop afterwards;
#   * an unrecognised answer to the y/n question is no longer parsed as a
#     tag selection;
#   * warnings are silenced lexically for the numeric conversion instead of
#     switching $^W off for the rest of the program;
#   * the read loop uses a lexical variable instead of the global $_;
#   * nothing is written when STDIN ends before any tag was selected.
sub curation_save {

    my ($feat, $input) = @_;

    if (!$CONF) {
        print "\n\n";
        conf_locate();
    }
    elsif (!-e $CONF) {
        print "\n\nThe config file you have chosen doesn't exist.\n";
        conf_locate();
    }
    else {
        conf_read();
    }

    my $entry = GenBank_entry($feat, "\r", 1);

    my $msg = "Term entered: $input";
    my $directions = "Please select any/all tags that provide evidence for the term you
have entered. You may enter multiple tags by separating them by commas/dashes
(e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
the option of either selecting the entire note as evidence, or specific
keywords. If a tag has multiple keywords, they will be tagged alphabetically
for selection. To select a specific keyword in a tag field, you must enter the
tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
to be to match your curation. To match the GenBank entry exactly as it
appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
preference, you will no longer be prompted to choose a feature type for any
matching entries until you edit the curation.ini file.";

    # The format below refers to $msg_copy, $entry and $dir_copy by name, so
    # these lexicals must keep exactly these names.
    my $msg_copy = $msg;
    my $dir_copy = $directions;

    my $format = "format STDOUT = \n" .
                 '-' x 156 . "\n" .
                 '^' . '<' x 77 . '| Directions:' . "\n" .
                 '$msg_copy' . "\n" .
                 '-' x 156 . "\n" .
                 ' ' x 78 . "|\n" .
                 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
                 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
                 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
                 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;
    print '-' x 156 . "\nenter a command:";

    my @tags = words_tag($feat);

    my $final = {};
    my $choices;

    while (defined(my $choice = <STDIN>)) {

        chomp $choice;

        # Something is already selected: this line answers "Is this correct?"
        if (keys %$final) {
            last if $choice =~ /^y/i;
            return curation_save($feat, $input) if $choice =~ /^n/i;

            print "\nInvalid selection. Please try again\n";
            next;
        }

        # 'all' is shorthand for every tag number, 1 .. last index of @tags.
        $choice = join(',', 1 .. $#tags) if $choice eq 'all';

        #print "CHOICE [$choice]";

        my @selections;
        for my $part (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
            if ($part =~ /(\d+)(?:\D*)-(\d+)(.*)/) {

                my ($from, $to, $dangling_alphas) = ($1, $2, $3);

                push @selections, $from .. $to;
                alpha_expand($dangling_alphas, \@selections);

            }
            else {
                alpha_expand($part, \@selections);
            }
        }

        foreach my $numbers (@selections) {

            my @c = split(/(?=[\w])/, $numbers);
            s/\W+//g foreach @c;

            my $num;
            {
                # The leading piece may be empty or non-numeric; treat it as 0
                # without emitting a warning.
                no warnings qw(numeric uninitialized);
                $num = 0 + shift @c;
            }

            my $tag = $tags[$num];

            if (ref($tag) && scalar(@c)) {

                my $no_value;
                foreach my $letter (@c) {
                    if (defined $tag->{$letter}) {
                        $choices .= $num . "[$letter] ";
                        my ($t, $v) = @{ $tag->{$letter} };
                        push @{ $final->{$input}[0]{$t} }, $v;
                    }
                    else {
                        $no_value++;
                    }
                }

                # A keyword that does not exist for this tag: hand the tag to
                # _selection_add() instead.
                if ($no_value) {
                    _selection_add($tag, $final, $input, \$choices, $num);
                }

                $choices = substr($choices, 0, -2);
                $choices .= ', ';

            }
            elsif (ref $tag) {
                _selection_add($tag, $final, $input, \$choices, $num);
            }
        }

        $choices = substr($choices, 0, -2) if $choices;

        if (keys %$final) {
            print "\nYou have chosen the following tags:\n$choices\n";
            print "This will be written to the config file as:\n";
            print Dump($final);
            print "\nIs this correct? (y or n)\n";
        }
        else {
            print "\nInvalid selection. Please try again\n";
        }
    }

    # STDIN ended before anything was selected: there is nothing to save.
    return unless keys %$final;

    push @{ $YAML->{$input} }, $final->{$input};
    conf_write();
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2235
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2236 # words_tag() splits each tag value string into multiple words so that the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2237 # user can select the parts he/she wants to use for curation
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2238 # it can tag 702 (a - zz) separate words; this should be enough
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2239
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# words_tag($feat, $entry)
#
# Builds the selection table used for interactive curation of one feature.
#
#   $feat  - object answering primary_tag, start, end, all_tags and
#            each_tag_value (presumably a Bio::SeqFeatureI -- confirm
#            against the caller).
#   $entry - reference to a string; one "[N] /tag="value"" line, terminated
#            by a carriage return, is appended for every tag value.
#
# Returns the list @tags.  Slots 1 and 2 hold the primary tag and the
# location; slot 0 is never filled here.  From slot 3 on there is one slot
# per tag value, a hash whose 'all' key holds [tag, whole value] and whose
# letter keys ('a', 'b', ... 'aa', ...) hold [tag, word stripped of non-word
# characters].
#
# NOTE(review): the letter labels are derived from ALPHABET and
# ALPHABET_DIVISOR, which are defined elsewhere in this file; a value with
# more words than those constants can encode is not guarded against here.
sub words_tag {
    my ($feat, $entry) = @_;

    my @tags;

    @tags[1, 2] = (
        { 'all' => ['primary_tag', $feat->primary_tag] },
        { 'all' => ['location', $feat->start . '..' . $feat->end] },
    );

    my $i = 3;
    foreach my $tag ($feat->all_tags) {
        foreach my $value ($feat->each_tag_value($tag)) {

            my ($string, $tagged_string);

            # Split only where a word starts (a non-word character followed
            # by a word character).  Splitting on a bare lookahead for \w
            # would break the value before every single word character.
            my @words = split(/(?<=\W)(?=\w)/, $value);

            my $pos = 0;
            foreach my $word (@words) {

                # Each chunk keeps its trailing separators for display; the
                # selectable word itself is stored without them.
                (my $sanitized_word = $word) =~ s/\W+//g;
                $string .= $word;

                # Position -> letter label: a..z for the first ALPHABET_DIVISOR
                # words, then two-letter labels (aa, ab, ...).
                my $lead = int($pos / ALPHABET_DIVISOR);
                my $lag  = $pos % ALPHABET_DIVISOR;

                my $label = $lead ? ${(ALPHABET)}[$lead - 1] : '';
                $label .= $lag ? ${(ALPHABET)}[$lag] : 'a';

                $tagged_string .= " ($label) $word";

                $tags[$i]{$label} = [$tag, $sanitized_word];
                $pos++;
            }

            # Only show the per-word labels when there is a choice to make.
            my $shown = scalar(@words) > 1 ? $tagged_string : $value;

            $$entry .= "[$i] /$tag=\"$shown\"\r";

            $tags[$i]{'all'} = [$tag, $string];

            # One number per displayed value, so that several values of the
            # same tag do not share (and overwrite) a single slot.
            $i++;
        }
    }

    return @tags;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2287
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# alpha_expand($dangling_alphas, $selections)
#
# Folds one token of the user's selection into @$selections (modified in
# place).
#
#   $dangling_alphas - the token; nothing happens when it is undefined.
#   $selections      - array reference collecting the expanded selections.
#
# Three token shapes are handled:
#   * a letter range, optionally preceded by a tag number ("12a-c", "b-e"):
#     the number, if given, is pushed first; then the last selection is
#     popped and re-pushed once per letter label in the range, e.g.
#     12a, 12b, 12c;
#   * a token starting with a digit: pushed unchanged;
#   * a token starting with a non-digit: appended to the last selection.
#
# Letter labels are converted with ALPHABET, ALPHABET_TO_NUMBER and
# ALPHABET_DIVISOR, which are defined elsewhere in this file.  Only the
# first two letters of each range end are considered.
sub alpha_expand {
    my ($dangling_alphas, $selections) = @_;

    # (\d*) keeps the complete tag number; a quantified group such as (\d)*
    # would retain only its last iteration, i.e. the final digit.
    if (defined($dangling_alphas)
        && $dangling_alphas =~ /(\d*)([[:alpha:]]+)-([[:alpha:]]+)/) {

        my ($digit, $start, $stop) = ($1, $2, $3);
        push @$selections, $digit if $digit;

        # Letter label -> zero-based position: a single letter maps straight
        # through ALPHABET_TO_NUMBER, a two-letter label is
        # (first + 1) * ALPHABET_DIVISOR + second.
        my $to_position = sub {
            my ($first, $second) = split('', $_[0]);
            my $position = ${(ALPHABET_TO_NUMBER)}{$first};
            if ($second) {
                $position = ($position + 1) * ALPHABET_DIVISOR
                          + ${(ALPHABET_TO_NUMBER)}{$second};
            }
            return $position;
        };

        my $final_start = $to_position->($start);
        my $final_stop  = $to_position->($stop);

        # The popped selection is the tag number the letters belong to; it
        # may be missing when the range came without a preceding number.
        my $last_number = pop @$selections;
        $last_number = '' unless defined $last_number;

        for my $pos ($final_start .. $final_stop) {
            my $lead  = int($pos / ALPHABET_DIVISOR);
            my $lag   = $pos % ALPHABET_DIVISOR;
            my $alpha = $lead ? ${(ALPHABET)}[$lead - 1] : '';
            $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
            push @$selections, $last_number . $alpha;
        }

    } elsif (defined($dangling_alphas)) {
        if ($dangling_alphas =~ /^\d/) {
            push @$selections, $dangling_alphas;
        } elsif ($dangling_alphas =~ /^\D/) {
            # Letters on their own qualify the most recent selection.
            my $last_number = pop @$selections;
            $last_number ||= '';
            push @$selections, $last_number . $dangling_alphas;
        }
    }

}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2349
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# _selection_add($tag, $final, $input, $choices, $num)
#
# Records the whole-value ('all') entry of a selected tag.
#
#   $tag     - hash reference whose 'all' key holds [tag name, value].
#   $final   - hash reference of pending selections; the value is pushed
#              onto $final->{$input}[0]{tag name}.
#   $input   - key under which the selection is stored in $final.
#   $choices - reference to the string listing the chosen numbers; "$num, "
#              is appended to it.
#   $num     - number the user typed for this tag.
#
# Nothing is recorded when an entry for that tag name already exists, so
# the first selection of a tag wins.
sub _selection_add {
    my ($tag, $final, $input, $choices, $num) = @_;

    my ($name, $content) = @{ $tag->{'all'} };

    unless (defined $final->{$input}[0]{$name}) {
        ${$choices} .= "$num, ";
        push @{ $final->{$input}[0]{$name} }, $content;
    }

}