Mercurial > repos > iuc > gff3_rebase
comparison gff3_rebase.py @ 2:238981ed43b7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 908f16ea4eb082227437dc93e06e8cb742f5a257
author | iuc |
---|---|
date | Wed, 15 Nov 2017 15:15:12 -0500 |
parents | ea35a85b941d |
children |
comparison
equal
deleted
inserted
replaced
1:ea35a85b941d | 2:238981ed43b7 |
---|---|
81 | 81 |
82 | 82 |
83 def __get_features(child, interpro=False): | 83 def __get_features(child, interpro=False): |
84 child_features = {} | 84 child_features = {} |
85 for rec in GFF.parse(child): | 85 for rec in GFF.parse(child): |
86 # Only top level | |
86 for feature in rec.features: | 87 for feature in rec.features: |
88 # Get the record id as parent_feature_id (since this is how it will be during remapping) | |
87 parent_feature_id = rec.id | 89 parent_feature_id = rec.id |
90 # If it's an interpro specific gff3 file | |
88 if interpro: | 91 if interpro: |
92 # Then we ignore polypeptide features as they're useless | |
89 if feature.type == 'polypeptide': | 93 if feature.type == 'polypeptide': |
90 continue | 94 continue |
91 if '_' in parent_feature_id: | 95 # If there's an underscore, we strip up to that underscore? |
92 parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] | 96 # I do not know the rationale for this, removing. |
97 # if '_' in parent_feature_id: | |
98 # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] | |
93 | 99 |
94 try: | 100 try: |
95 child_features[parent_feature_id].append(feature) | 101 child_features[parent_feature_id].append(feature) |
96 except KeyError: | 102 except KeyError: |
97 child_features[parent_feature_id] = [feature] | 103 child_features[parent_feature_id] = [feature] |
104 # Keep a list of feature objects keyed by parent record id | |
98 return child_features | 105 return child_features |
99 | 106 |
100 | 107 |
101 def __update_feature_location(feature, parent, protein2dna): | 108 def __update_feature_location(feature, parent, protein2dna): |
102 start = feature.location.start | 109 start = feature.location.start |
130 if hasattr(feature, 'sub_features'): | 137 if hasattr(feature, 'sub_features'): |
131 for subfeature in feature.sub_features: | 138 for subfeature in feature.sub_features: |
132 __update_feature_location(subfeature, parent, protein2dna) | 139 __update_feature_location(subfeature, parent, protein2dna) |
133 | 140 |
134 | 141 |
135 def rebase(parent, child, interpro=False, protein2dna=False): | 142 def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'): |
143 # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID | |
136 child_features = __get_features(child, interpro=interpro) | 144 child_features = __get_features(child, interpro=interpro) |
137 | 145 |
138 for rec in GFF.parse(parent): | 146 for rec in GFF.parse(parent): |
139 replacement_features = [] | 147 replacement_features = [] |
140 for feature in feature_lambda( | 148 for feature in feature_lambda( |
141 rec.features, | 149 rec.features, |
150 # Filter features in the parent genome by those that are | |
151 # "interesting", i.e. have results in child_features array. | |
152 # Probably an unnecessary optimisation. | |
142 feature_test_qual_value, | 153 feature_test_qual_value, |
143 { | 154 { |
144 'qualifier': 'ID', | 155 'qualifier': map_by, |
145 'attribute_list': child_features.keys(), | 156 'attribute_list': child_features.keys(), |
146 }, | 157 }, |
147 subfeatures=False): | 158 subfeatures=False): |
148 | 159 |
149 new_subfeatures = child_features[feature.id] | 160 # Features which will be re-mapped |
150 fixed_subfeatures = [] | 161 to_remap = child_features[feature.id] |
151 for x in new_subfeatures: | 162 # TODO: update starts |
163 fixed_features = [] | |
164 for x in to_remap: | |
152 # Then update the location of the actual feature | 165 # Then update the location of the actual feature |
153 __update_feature_location(x, feature, protein2dna) | 166 __update_feature_location(x, feature, protein2dna) |
154 | 167 |
155 if interpro: | 168 if interpro: |
156 for y in ('status', 'Target'): | 169 for y in ('status', 'Target'): |
157 try: | 170 try: |
158 del x.qualifiers[y] | 171 del x.qualifiers[y] |
159 except Exception: | 172 except Exception: |
160 pass | 173 pass |
161 | 174 |
162 fixed_subfeatures.append(x) | 175 fixed_features.append(x) |
163 replacement_features.extend(fixed_subfeatures) | 176 replacement_features.extend(fixed_features) |
164 # We do this so we don't include the original set of features that we | 177 # We do this so we don't include the original set of features that we |
165 # were rebasing against in our result. | 178 # were rebasing against in our result. |
166 rec.features = replacement_features | 179 rec.features = replacement_features |
167 rec.annotations = {} | 180 rec.annotations = {} |
168 GFF.write([rec], sys.stdout) | 181 GFF.write([rec], sys.stdout) |
174 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent') | 187 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent') |
175 parser.add_argument('--interpro', action='store_true', | 188 parser.add_argument('--interpro', action='store_true', |
176 help='Interpro specific modifications') | 189 help='Interpro specific modifications') |
177 parser.add_argument('--protein2dna', action='store_true', | 190 parser.add_argument('--protein2dna', action='store_true', |
178 help='Map protein translated results to original DNA data') | 191 help='Map protein translated results to original DNA data') |
192 parser.add_argument('--map_by', help='Map by key', default='ID') | |
179 args = parser.parse_args() | 193 args = parser.parse_args() |
180 rebase(**vars(args)) | 194 rebase(**vars(args)) |