comparison gff3_rebase.py @ 0:d78175596286 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit cd77dffaad652cfb75b98bde5231beaa6d63cd5b-dirty
author fubar
date Mon, 08 Jan 2024 09:20:33 +0000
parents
children 4c201a3d4755
#!/usr/bin/env python
import argparse
import copy
import logging
import sys

from BCBio import GFF
from Bio.SeqFeature import FeatureLocation

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

__author__ = "Eric Rasche"
__version__ = "0.4.0"
__maintainer__ = "Eric Rasche"
__email__ = "esr@tamu.edu"


def feature_lambda(feature_list, test, test_kwargs, subfeatures=True):
    """Recursively search through features, testing each with a test function, yielding matches.

    GFF3 is a hierarchical data structure, so we need to be able to recursively
    search through features. E.g. if you're looking for a feature with
    ID='bob.42', you can't just do a simple list comprehension with a test
    case: you don't know how deeply buried bob.42 will be in the feature tree.
    This is where feature_lambda steps in.

    :type feature_list: list
    :param feature_list: an iterable of features

    :type test: function reference
    :param test: a closure with the method signature (feature, **kwargs) where
                 the kwargs are those passed in the next argument. This
                 function should return True or False: True if the feature is
                 to be yielded as part of the main feature_lambda function, or
                 False if it is to be ignored. This function CAN mutate the
                 features passed to it (think "apply").

    :type test_kwargs: dictionary
    :param test_kwargs: kwargs to pass to your closure when it is called.

    :type subfeatures: boolean
    :param subfeatures: when a feature is matched, should just that feature be
                        yielded to the caller, or should the entire sub_feature
                        tree for that feature be included? subfeatures=True is
                        useful in cases such as searching for a gene feature,
                        and wanting to know what RBS/Shine_Dalgarno_sequences
                        are in the sub_feature tree (which can be accomplished
                        with two feature_lambda calls). subfeatures=False is
                        useful in cases when you want to process (and possibly
                        return) the entire feature tree, such as applying a
                        qualifier to every single feature.

    :rtype: generator
    :return: yields matching features one at a time.
    """
    # Either the top level set of [features] or the subfeature attribute
    for feature in feature_list:
        if test(feature, **test_kwargs):
            if not subfeatures:
                feature_copy = copy.deepcopy(feature)
                feature_copy.sub_features = []
                yield feature_copy
            else:
                yield feature

        if hasattr(feature, "sub_features"):
            for x in feature_lambda(
                feature.sub_features, test, test_kwargs, subfeatures=subfeatures
            ):
                yield x


def feature_test_qual_value(feature, **kwargs):
    """Test qualifier values.

    For every feature, check that at least one value in
    feature.qualifiers[kwargs['qualifier']] is in kwargs['attribute_list'].
    """
    for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []):
        if attribute_value in kwargs["attribute_list"]:
            return True
    return False


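# Illustrative usage of feature_lambda() / feature_test_qual_value()
# (defined above; this snippet is not part of the original tool). It mirrors
# the call made later in rebase(): yield top-level parent features whose ID
# qualifier matches a known value, without their sub_feature trees. `rec` is
# assumed to be a record produced by BCBio's GFF.parse().
#
#   for hit in feature_lambda(
#       rec.features,
#       feature_test_qual_value,
#       {"qualifier": "ID", "attribute_list": ["bob.42"]},
#       subfeatures=False,
#   ):
#       print(hit.id)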
def __get_features(child, interpro=False):
    child_features = {}
    for rec in GFF.parse(child):
        # Only top level
        for feature in rec.features:
            # Get the record id as parent_feature_id (since this is how it
            # will be looked up during remapping)
            parent_feature_id = rec.id
            # If it's an InterPro-specific gff3 file
            if interpro:
                # Then we ignore polypeptide features as they're useless
                if feature.type == "polypeptide":
                    continue
                # If there's an underscore, we strip up to that underscore?
                # I do not know the rationale for this, removing.
                # if '_' in parent_feature_id:
                #     parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]

            try:
                child_features[parent_feature_id].append(feature)
            except KeyError:
                child_features[parent_feature_id] = [feature]
    # Keep a list of feature objects keyed by parent record id
    return child_features


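# For reference, __get_features() above returns a plain dict mapping each
# child record id to the list of that record's top-level features, e.g.
# (illustrative ids and feature types, not real output):
#
#   {"protein_A": [SeqFeature(type="protein_match", ...)],
#    "protein_B": [SeqFeature(type="protein_match", ...)]}
#
# rebase() later matches these keys against the `map_by` qualifier
# (default "ID") of features in the parent GFF3.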
def __update_feature_location(feature, parent, protein2dna):
    start = feature.location.start
    end = feature.location.end
    if protein2dna:
        start *= 3
        end *= 3

    if parent.location.strand >= 0:
        ns = parent.location.start + start
        ne = parent.location.start + end
        st = +1
    else:
        ns = parent.location.end - end
        ne = parent.location.end - start
        st = -1

    # Don't let start/stops be less than zero. It's technically valid for them
    # to be (at least in the model I'm working with) but it causes numerous
    # issues.
    #
    # Instead, we'll replace with %3 to try and keep it in the same reading
    # frame that it should be in.
    if ns < 0:
        ns %= 3
    if ne < 0:
        ne %= 3

    feature.location = FeatureLocation(ns, ne, strand=st)

    if hasattr(feature, "sub_features"):
        for subfeature in feature.sub_features:
            __update_feature_location(subfeature, parent, protein2dna)


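# Worked example for __update_feature_location() above (illustrative numbers):
# with protein2dna=True, a child feature covering protein residues [10, 50)
# is first scaled to nucleotides [30, 150). If the parent feature spans
# [1000, 2000) on the + strand, the rebased location is [1030, 1150), strand +1;
# on the - strand it becomes [2000 - 150, 2000 - 30) = [1850, 1970), strand -1.
# Any coordinate that would drop below zero is reduced modulo 3 to stay in frame.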
def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"):
    # Get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
            rec.features,
            # Filter features in the parent genome by those that are
            # "interesting", i.e. have results in the child_features dict.
            # Probably an unnecessary optimisation.
            feature_test_qual_value,
            {
                "qualifier": map_by,
                "attribute_list": child_features.keys(),
            },
            subfeatures=False,
        ):
            # Features which will be re-mapped
            to_remap = child_features[feature.id]
            # TODO: update starts
            fixed_features = []
            for x in to_remap:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ("status", "Target"):
                        try:
                            del x.qualifiers[y]
                        except Exception:
                            pass

                fixed_features.append(x)
            replacement_features.extend(fixed_features)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="rebase gff3 features against parent locations", epilog=""
    )
    parser.add_argument(
        "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations"
    )
    parser.add_argument(
        "child",
        type=argparse.FileType("r"),
        help="Child GFF3 annotations to rebase against parent",
    )
    parser.add_argument(
        "--interpro", action="store_true", help="Interpro specific modifications"
    )
    parser.add_argument(
        "--protein2dna",
        action="store_true",
        help="Map protein translated results to original DNA data",
    )
    parser.add_argument("--map_by", help="Map by key", default="ID")
    args = parser.parse_args()
    rebase(**vars(args))
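# Illustrative invocation (filenames are placeholders, not part of the tool):
#
#   python gff3_rebase.py parent.gff3 interproscan_results.gff3 \
#       --interpro --protein2dna --map_by ID > rebased.gff3
#
# The rebased annotations are written to stdout, so redirect them to a file.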