diff gff3_rebase.py @ 2:238981ed43b7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 908f16ea4eb082227437dc93e06e8cb742f5a257
author iuc
date Wed, 15 Nov 2017 15:15:12 -0500
parents ea35a85b941d
children
line wrap: on
line diff
--- a/gff3_rebase.py	Mon Oct 23 13:26:48 2017 -0400
+++ b/gff3_rebase.py	Wed Nov 15 15:15:12 2017 -0500
@@ -83,18 +83,25 @@
 def __get_features(child, interpro=False):
     child_features = {}
     for rec in GFF.parse(child):
+        # Only the top-level features of each record are considered
         for feature in rec.features:
+            # Use the record ID as parent_feature_id, since that is the key looked up during remapping
             parent_feature_id = rec.id
+            # If it's an interpro specific gff3 file
             if interpro:
+                # Then we ignore polypeptide features as they're useless
                 if feature.type == 'polypeptide':
                     continue
-                if '_' in parent_feature_id:
-                    parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
+                # Previously, everything up to and including the first underscore was
+                # stripped from the ID. The rationale is unknown, so this is now disabled.
+                # if '_' in parent_feature_id:
+                    # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
 
             try:
                 child_features[parent_feature_id].append(feature)
             except KeyError:
                 child_features[parent_feature_id] = [feature]
+            # Keep a list of feature objects keyed by parent record id
     return child_features
 
 
@@ -132,23 +139,29 @@
             __update_feature_location(subfeature, parent, protein2dna)
 
 
-def rebase(parent, child, interpro=False, protein2dna=False):
+def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
+    # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
     child_features = __get_features(child, interpro=interpro)
 
     for rec in GFF.parse(parent):
         replacement_features = []
         for feature in feature_lambda(
                 rec.features,
+                # Filter features in the parent genome down to those that are
+                # "interesting", i.e. have results in the child_features dict.
+                # Probably an unnecessary optimisation.
                 feature_test_qual_value,
                 {
-                    'qualifier': 'ID',
+                    'qualifier': map_by,
                     'attribute_list': child_features.keys(),
                 },
                 subfeatures=False):
 
-            new_subfeatures = child_features[feature.id]
-            fixed_subfeatures = []
-            for x in new_subfeatures:
+            # Features which will be re-mapped (NOTE(review): looked up by feature.id even when map_by is not 'ID' - confirm)
+            to_remap = child_features[feature.id]
+            # TODO: update starts
+            fixed_features = []
+            for x in to_remap:
                 # Then update the location of the actual feature
                 __update_feature_location(x, feature, protein2dna)
 
@@ -159,8 +172,8 @@
                         except Exception:
                             pass
 
-                fixed_subfeatures.append(x)
-            replacement_features.extend(fixed_subfeatures)
+                fixed_features.append(x)
+            replacement_features.extend(fixed_features)
         # We do this so we don't include the original set of features that we
         # were rebasing against in our result.
         rec.features = replacement_features
@@ -176,5 +189,6 @@
                         help='Interpro specific modifications')
     parser.add_argument('--protein2dna', action='store_true',
                         help='Map protein translated results to original DNA data')
+    parser.add_argument('--map_by', help='Map by key', default='ID')
     args = parser.parse_args()
     rebase(**vars(args))