Mercurial > repos > iuc > jbrowse
comparison gff3_rebase.py @ 1:497c6bb3b717 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f
author | iuc |
---|---|
date | Thu, 18 Jun 2015 12:10:51 -0400 |
parents | |
children | 7342f467507b |
comparison
equal
deleted
inserted
replaced
0:2c9e5136b416 | 1:497c6bb3b717 |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import logging | |
4 logging.basicConfig(level=logging.INFO) | |
5 import argparse | |
6 from BCBio import GFF | |
7 from Bio.SeqFeature import FeatureLocation | |
8 log = logging.getLogger(__name__) | |
9 | |
10 __author__ = "Eric Rasche" | |
11 __version__ = "0.4.0" | |
12 __maintainer__ = "Eric Rasche" | |
13 __email__ = "esr@tamu.edu" | |
14 | |
15 | |
16 def __get_features(child, interpro=False): | |
17 child_features = {} | |
18 for rec in GFF.parse(child): | |
19 for feature in rec.features: | |
20 parent_feature_id = rec.id | |
21 if interpro: | |
22 if feature.type == 'polypeptide': | |
23 continue | |
24 if '_' in parent_feature_id: | |
25 parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] | |
26 | |
27 try: | |
28 child_features[parent_feature_id].append(feature) | |
29 except KeyError: | |
30 child_features[parent_feature_id] = [feature] | |
31 return child_features | |
32 | |
33 | |
34 def __update_feature_location(feature, parent, protein2dna): | |
35 start = feature.location.start | |
36 end = feature.location.end | |
37 if protein2dna: | |
38 start *= 3 | |
39 end *= 3 | |
40 | |
41 if parent.location.strand >= 0: | |
42 ns = parent.location.start + start | |
43 ne = parent.location.start + end | |
44 st = +1 | |
45 else: | |
46 ns = parent.location.end - end | |
47 ne = parent.location.end - start | |
48 st = -1 | |
49 | |
50 # Don't let start/stops be less than zero. It's technically valid for them | |
51 # to be (at least in the model I'm working with) but it causes numerous | |
52 # issues. | |
53 # | |
54 # Instead, we'll replace with %3 to try and keep it in the same reading | |
55 # frame that it should be in. | |
56 if ns < 0: | |
57 ns %= 3 | |
58 if ne < 0: | |
59 ne %= 3 | |
60 | |
61 feature.location = FeatureLocation(ns, ne, strand=st) | |
62 | |
63 if hasattr(feature, 'sub_features'): | |
64 for subfeature in feature.sub_features: | |
65 __update_feature_location(subfeature, parent, protein2dna) | |
66 | |
67 | |
68 def rebase(parent, child, interpro=False, protein2dna=False): | |
69 child_features = __get_features(child, interpro=interpro) | |
70 | |
71 for rec in GFF.parse(parent): | |
72 # TODO, replace with recursion in case it's matched against a | |
73 # non-parent feature. We're cheating a bit here right now... | |
74 replacement_features = [] | |
75 for feature in rec.features: | |
76 if feature.id in child_features: | |
77 new_subfeatures = child_features[feature.id] | |
78 # TODO: update starts | |
79 fixed_subfeatures = [] | |
80 for x in new_subfeatures: | |
81 # Then update the location of the actual feature | |
82 __update_feature_location(x, feature, protein2dna) | |
83 | |
84 if interpro: | |
85 for y in ('status', 'Target'): | |
86 try: | |
87 del x.qualifiers[y] | |
88 except: | |
89 pass | |
90 | |
91 fixed_subfeatures.append(x) | |
92 replacement_features.extend(fixed_subfeatures) | |
93 # We do this so we don't include the original set of features that we | |
94 # were rebasing against in our result. | |
95 rec.features = replacement_features | |
96 GFF.write([rec], sys.stdout) | |
97 | |
98 | |
99 if __name__ == '__main__': | |
100 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="") | |
101 parser.add_argument('parent', type=file, help='Parent GFF3 annotations') | |
102 parser.add_argument('child', help='Child GFF3 annotations to rebase against parent') | |
103 parser.add_argument('--interpro', action='store_true', | |
104 help='Interpro specific modifications') | |
105 parser.add_argument('--protein2dna', action='store_true', | |
106 help='Map protein translated results to original DNA data') | |
107 args = parser.parse_args() | |
108 rebase(**vars(args)) |