Repository 'cpt_fix_aragorn'
hg clone https://toolshed.g2.bx.psu.edu/repos/cpt/cpt_fix_aragorn

Changeset 0:8e124760f3cc (2022-05-13)
Next changeset 1:0b22a90c660e (2022-05-20)
Commit message:
Uploaded
added:
cpt_fix_aragorn/cpt-macros.xml
cpt_fix_aragorn/fix-aragorn-gff3.py
cpt_fix_aragorn/fix-aragorn-gff3.xml
cpt_fix_aragorn/gff3.py
cpt_fix_aragorn/macros.xml
cpt_fix_aragorn/test-data/FixAra_In.gff3
cpt_fix_aragorn/test-data/FixAra_Out.gff3
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/cpt-macros.xml Fri May 13 17:59:38 2022 +0000
[
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+ <xml name="gff_requirements">
+ <requirements>
+ <requirement type="package" version="2.7">python</requirement>
+ <requirement type="package" version="1.65">biopython</requirement>
+ <requirement type="package" version="2.12.1">requests</requirement>
+ <yield/>
+ </requirements>
+ <version_command>
+ <![CDATA[
+ cd $__tool_directory__ && git rev-parse HEAD
+ ]]>
+ </version_command>
+ </xml>
+ <xml name="citation/mijalisrasche">
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+ <citation type="bibtex">@unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+ </xml>
+ <xml name="citations">
+ <citations>
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+ <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation> 
+ <yield/>
+ </citations>
+ </xml>
+     <xml name="citations-crr">
+ <citations>
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+ <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+ <yield/>
+ </citations>
+ </xml>
+        <xml name="citations-2020">
+ <citations>
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+ <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+                        <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+                        </citation>
+                        <yield/>
+ </citations>
+ </xml>
+        <xml name="citations-2020-AJC-solo">
+ <citations>
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+                        </citation>
+                        <yield/>
+ </citations>
+ </xml>
+        <xml name="citations-clm">
+ <citations>
+ <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+ <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+                        <yield/>
+ </citations>
+ </xml>
+        <xml name="sl-citations-clm">
+ <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+                        <yield/>
+ </xml>
+</macros>
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/fix-aragorn-gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/fix-aragorn-gff3.py Fri May 13 17:59:38 2022 +0000
[
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+import sys
+import logging
+import argparse
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from Bio.SeqFeature import SeqFeature
+from gff3 import feature_lambda, feature_test_type
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def fixed_feature(rec):
+    for idx, feature in enumerate(
+        feature_lambda(
+            rec.features, feature_test_type, {"types": ["tRNA", "tmRNA"]}, subfeatures=True
+        )
+    ):
+        
+        fid = "%s-%03d" % (feature.type, 1 + idx)
+        try:
+            name = [feature.type + "-" + feature.qualifiers["Codon"][0]]
+        except KeyError:
+            name = [feature.qualifiers['product'][0]]
+        try:
+          origSource = feature.qualifiers["source"][0]
+        except:
+          origSource = "."
+        gene = gffSeqFeature(
+            location=feature.location,
+            type="gene",
+            qualifiers={"ID": [fid + ".gene"], "source": [origSource], "Name": name},
+        )
+        feature.qualifiers["Name"] = name
+        # Below that we have an mRNA
+        exon = gffSeqFeature(
+            location=feature.location,
+            type="exon",
+            qualifiers={"source": [origSource], "ID": ["%s.exon" % fid], "Name": name},
+        )
+        feature.qualifiers["ID"] = [fid]
+        exon.qualifiers["Parent"] = [fid]
+        feature.qualifiers["Parent"] = [fid + ".gene"]
+        # gene -> trna -> exon
+        feature.sub_features = [exon]
+        gene.sub_features = [feature]
+        yield gene
+
+
+def gff_filter(gff3):
+    found_gff = False
+    for rec in gffParse(gff3):
+        found_gff = True
+        rec.features = sorted(list(fixed_feature(rec)), key=lambda x: x.location.start)
+        rec.annotations = {}
+        gffWrite([rec], sys.stdout)
+    if not found_gff:
+        print("##gff-version 3")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="add parent gene features to CDSs")
+    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+    args = parser.parse_args()
+    gff_filter(**vars(args))
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/fix-aragorn-gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/fix-aragorn-gff3.xml Fri May 13 17:59:38 2022 +0000
[
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<tool id="edu.tamu.cpt.external.aragorn-gff3" name="Fix tRNA model" version="19.1.0.0">
+  <description></description>
+  <macros>
+    <import>macros.xml</import>
+    <import>cpt-macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command detect_errors="aggressive"><![CDATA[
+$__tool_directory__/fix-aragorn-gff3.py
+@INPUT_GFF@
+> $default]]></command>
+  <inputs>
+      <expand macro="gff3_input" />
+  </inputs>
+  <outputs>
+    <data format="gff3" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="gff3_data" value="FixAra_In.gff3"/>
+      <output name="default" file="FixAra_Out.gff3"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+For an input GFF3 file containing tRNAs predicted by ARAGORN, or converted from
+tRNAscan-SE output, this tool modifies the gene model to reflect a
+gene-tRNA-exon hierarchy. This change is needed to allow proper tRNA features
+to be created in Apollo.
+      ]]></help>
+ <expand macro="citations" />
+</tool>
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/gff3.py Fri May 13 17:59:38 2022 +0000
[
b'@@ -0,0 +1,346 @@\n+import copy\n+import logging\n+\n+log = logging.getLogger()\n+log.setLevel(logging.WARN)\n+\n+\n+def feature_lambda(\n+    feature_list,\n+    test,\n+    test_kwargs,\n+    subfeatures=True,\n+    parent=None,\n+    invert=False,\n+    recurse=True,\n+):\n+    """Recursively search through features, testing each with a test function, yielding matches.\n+\n+    GFF3 is a hierachical data structure, so we need to be able to recursively\n+    search through features. E.g. if you\'re looking for a feature with\n+    ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n+    case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n+\n+    :type feature_list: list\n+    :param feature_list: an iterable of features\n+\n+    :type test: function reference\n+    :param test: a closure with the method signature (feature, **kwargs) where\n+                 the kwargs are those passed in the next argument. This\n+                 function should return True or False, True if the feature is\n+                 to be yielded as part of the main feature_lambda function, or\n+                 False if it is to be ignored. This function CAN mutate the\n+                 features passed to it (think "apply").\n+\n+    :type test_kwargs: dictionary\n+    :param test_kwargs: kwargs to pass to your closure when it is called.\n+\n+    :type subfeatures: boolean\n+    :param subfeatures: when a feature is matched, should just that feature be\n+                        yielded to the caller, or should the entire sub_feature\n+                        tree for that feature be included? 
subfeatures=True is\n+                        useful in cases such as searching for a gene feature,\n+                        and wanting to know what RBS/Shine_Dalgarno_sequences\n+                        are in the sub_feature tree (which can be accomplished\n+                        with two feature_lambda calls). subfeatures=False is\n+                        useful in cases when you want to process (and possibly\n+                        return) the entire feature tree, such as applying a\n+                        qualifier to every single feature.\n+\n+    :type invert: boolean\n+    :param invert: Negate/invert the result of the filter.\n+\n+    :rtype: yielded list\n+    :return: Yields a list of matching features.\n+    """\n+    # Either the top level set of [features] or the subfeature attribute\n+    for feature in feature_list:\n+        feature._parent = parent\n+        if not parent:\n+            # Set to self so we cannot go above root.\n+            feature._parent = feature\n+        test_result = test(feature, **test_kwargs)\n+        # if (not invert and test_result) or (invert and not test_result):\n+        if invert ^ test_result:\n+            if not subfeatures:\n+                feature_copy = copy.deepcopy(feature)\n+                feature_copy.sub_features = list()\n+                yield feature_copy\n+            else:\n+                yield feature\n+\n+        if recurse and hasattr(feature, "sub_features"):\n+            for x in feature_lambda(\n+                feature.sub_features,\n+                test,\n+                test_kwargs,\n+                subfeatures=subfeatures,\n+                parent=feature,\n+                invert=invert,\n+                recurse=recurse,\n+            ):\n+                yield x\n+\n+\n+def fetchParent(feature):\n+    if not hasattr(feature, "_parent") or feature._parent is None:\n+        return feature\n+    else:\n+        return fetchParent(feature._parent)\n+\n+\n+def 
feature_test_true(feature, **kwargs):\n+    return True\n+\n+\n+def feature_test_type(feature, **kwargs):\n+    if "type" in kwargs:\n+        return str(feature.type).upper() == str(kwargs["type"]).upper()\n+    elif "types" in kwargs:\n+      for x in kwargs["types"]:\n+        if str(feature.type).upper() == str(x).upper():\n+          return True\n+      return False\n+    raise Exception("Incorrect feature_test_type call, ne'..b'feature.location.start,\n+        # feature.location.end,\n+        # feature.location.strand\n+        # )\n+    return result\n+\n+\n+def get_gff3_id(gene):\n+    return gene.qualifiers.get("Name", [gene.id])[0]\n+\n+\n+def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n+    # This prevents frameshift errors\n+    while start < 0:\n+        start += 3\n+    while end < 0:\n+        end += 3\n+    while start > parent_length:\n+        start -= 3\n+    while end > parent_length:\n+        end -= 3\n+    return (start, end)\n+\n+\n+def coding_genes(feature_list):\n+    for x in genes(feature_list):\n+        if (\n+            len(\n+                list(\n+                    feature_lambda(\n+                        x.sub_features,\n+                        feature_test_type,\n+                        {"type": "CDS"},\n+                        subfeatures=False,\n+                    )\n+                )\n+            )\n+            > 0\n+        ):\n+            yield x\n+\n+\n+def genes(feature_list, feature_type="gene", sort=False):\n+    """\n+    Simple filter to extract gene features from the feature set.\n+    """\n+\n+    if not sort:\n+        for x in feature_lambda(\n+            feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n+        ):\n+            yield x\n+    else:\n+        data = list(genes(feature_list, feature_type=feature_type, sort=False))\n+        data = sorted(data, key=lambda feature: feature.location.start)\n+        for x in data:\n+            yield 
x\n+\n+\n+def wa_unified_product_name(feature):\n+    """\n+    Try and figure out a name. We gave conflicting instructions, so\n+    this isn\'t as trivial as it should be. Sometimes it will be in\n+    \'product\' or \'Product\', othertimes in \'Name\'\n+    """\n+    # Manually applied tags.\n+    protein_product = feature.qualifiers.get(\n+        "product", feature.qualifiers.get("Product", [None])\n+    )[0]\n+\n+    # If neither of those are available ...\n+    if protein_product is None:\n+        # And there\'s a name...\n+        if "Name" in feature.qualifiers:\n+            if not is_uuid(feature.qualifiers["Name"][0]):\n+                protein_product = feature.qualifiers["Name"][0]\n+\n+    return protein_product\n+\n+\n+def is_uuid(name):\n+    return name.count("-") == 4 and len(name) == 36\n+\n+\n+def get_rbs_from(gene):\n+    # Normal RBS annotation types\n+    rbs_rbs = list(\n+        feature_lambda(\n+            gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n+        )\n+    )\n+    rbs_sds = list(\n+        feature_lambda(\n+            gene.sub_features,\n+            feature_test_type,\n+            {"type": "Shine_Dalgarno_sequence"},\n+            subfeatures=False,\n+        )\n+    )\n+    # Fraking apollo\n+    apollo_exons = list(\n+        feature_lambda(\n+            gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False\n+        )\n+    )\n+    apollo_exons = [x for x in apollo_exons if len(x) < 10]\n+    # These are more NCBI\'s style\n+    regulatory_elements = list(\n+        feature_lambda(\n+            gene.sub_features,\n+            feature_test_type,\n+            {"type": "regulatory"},\n+            subfeatures=False,\n+        )\n+    )\n+    rbs_regulatory = list(\n+        feature_lambda(\n+            regulatory_elements,\n+            feature_test_quals,\n+            {"regulatory_class": ["ribosome_binding_site"]},\n+            subfeatures=False,\n+        )\n+    
)\n+    # Here\'s hoping you find just one ;)\n+    return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n+\n+\n+def nice_name(record):\n+    """\n+    get the real name rather than NCBI IDs and so on. If fails, will return record.id\n+    """\n+    name = record.id\n+    likely_parental_contig = list(genes(record.features, feature_type="contig"))\n+    if len(likely_parental_contig) == 1:\n+        name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n+    return name\n+\n+\n+def fsort(it):\n+    for i in sorted(it, key=lambda x: int(x.location.start)):\n+        yield i\n'
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/macros.xml Fri May 13 17:59:38 2022 +0000
b
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+<macros>
+ <xml name="requirements">
+ <requirements>
+ <requirement type="package" version="3.7">python</requirement>
+ <requirement type="package" version="1.77">biopython</requirement>
+ <requirement type="package" version="1.1.3">cpt_gffparser</requirement>  
+ <yield/>
+ </requirements>
+ </xml>
+ <xml name="genome_selector">
+ <conditional name="reference_genome">
+ <param name="reference_genome_source" type="select" label="Reference Genome">
+ <option value="history" selected="True">From History</option>
+ <option value="cached">Locally Cached</option>
+ </param>
+ <when value="cached">
+ <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+ <options from_data_table="all_fasta"/>
+ </param>
+ </when>
+ <when value="history">
+ <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+ </when>
+ </conditional>
+ </xml>
+ <xml name="gff3_input">
+ <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+ </xml>
+ <xml name="input/gff3+fasta">
+ <expand macro="gff3_input" />
+ <expand macro="genome_selector" />
+ </xml>
+ <token name="@INPUT_GFF@">
+ "$gff3_data"
+ </token>
+ <token name="@INPUT_FASTA@">
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+ </token>
+ <token name="@GENOME_SELECTOR_PRE@">
+#if $reference_genome.reference_genome_source == 'history':
+ ln -s $reference_genome.genome_fasta genomeref.fa;
+#end if
+ </token>
+ <token name="@GENOME_SELECTOR@">
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+ </token>
+        <xml name="input/fasta">
+ <param label="Fasta file" name="sequences" type="data" format="fasta"/>
+ </xml>
+
+ <token name="@SEQUENCE@">
+ "$sequences"
+ </token>
+ <xml name="input/fasta/protein">
+ <param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
+ </xml>
+</macros>
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/test-data/FixAra_In.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/test-data/FixAra_In.gff3 Fri May 13 17:59:38 2022 +0000
b
@@ -0,0 +1,2 @@
+##gff-version-3
+Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly"
b
diff -r 000000000000 -r 8e124760f3cc cpt_fix_aragorn/test-data/FixAra_Out.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fix_aragorn/test-data/FixAra_Out.gff3 Fri May 13 17:59:38 2022 +0000
b
@@ -0,0 +1,4 @@
+##gff-version 3
+Phriendly feature gene 48610 48685 . - . ID=tRNA-001.gene;source=.;Name=tRNA-"Gly";
+Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly";Name=tRNA-"Gly";ID=tRNA-001;Parent=tRNA-001.gene;
+Phriendly feature exon 48610 48685 . - . source=.;ID=tRNA-001.exon;Name=tRNA-"Gly";Parent=tRNA-001;