Galaxy |

Changeset 3:f0f0ab9db43f (2023-06-05)

Previous changeset 2:dc22c76a57bd (2022-05-20) Next changeset 4:733cb0807083 (2023-08-30)

Commit message:
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c

added:
cpt-macros.xml
fix-aragorn-gff3.py
fix-aragorn-gff3.xml
gff3.py
macros.xml
test-data/FixAra_In.gff3
test-data/FixAra_Out.gff3

removed:
cpt_fix_aragorn/cpt-macros.xml
cpt_fix_aragorn/fix-aragorn-gff3.py
cpt_fix_aragorn/fix-aragorn-gff3.xml
cpt_fix_aragorn/gff3.py
cpt_fix_aragorn/macros.xml
cpt_fix_aragorn/test-data/FixAra_In.gff3
cpt_fix_aragorn/test-data/FixAra_Out.gff3

diff -r dc22c76a57bd -r f0f0ab9db43f cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:42:12 2023 +0000

[

@@ -0,0 +1,115 @@
+<macros>
+    <!-- Macro "gff_requirements": declares the packages python 2.7, biopython 1.65,
+         requests 2.12.1 and cpt_gffparser 1.2.2, followed by a <yield/> placeholder
+         (in Galaxy macros, the point where content nested in the expanding tag is
+         inserted), plus a version_command that changes into the tool directory and
+         prints the git HEAD revision.
+         NOTE(review): fix-aragorn-gff3.xml expands the macro named "requirements",
+         not this one; confirm whether any tool still uses "gff_requirements". -->
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+ <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+ <![CDATA[
+ cd '$__tool_directory__' && git rev-parse HEAD
+ ]]>
+ </version_command>
+    </xml>
+    <!-- Macro "citation/mijalisrasche": two bare <citation> elements, a DOI and a
+         BibTeX entry crediting E. Mijalis and H. Rasche. There is no <citations>
+         wrapper and no <yield/>; presumably it is meant to be expanded inside an
+         existing <citations> element (confirm against callers). -->
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+    </xml>
+    <!-- Macro "citations": a complete <citations> block holding the DOI citation and
+         the BibTeX entry for E. Mijalis and H. Rasche, followed by a <yield/>
+         placeholder so an expanding tool can append further citations.
+         fix-aragorn-gff3.xml expands this macro. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Macro "citations-crr": same layout as "citations" (DOI, one BibTeX entry,
+         <yield/>), but the BibTeX entry credits C. Ross with the open-ended year
+         range "2020-". -->
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Macro "citations-2020": a <citations> block with the DOI citation and two
+         BibTeX entries, one crediting E. Mijalis and H. Rasche (2013-2017) and one
+         crediting A. Criscione (2019-2021), followed by <yield/>.
+         NOTE(review): both BibTeX entries use the same key "galaxyTools"; confirm
+         that consumers of the citation list tolerate duplicate keys. -->
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Macro "citations-2020-AJC-solo": a <citations> block with the DOI citation
+         and a single BibTeX entry crediting A. Criscione (2019-2021), followed by
+         <yield/>. -->
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Macro "citations-clm": a <citations> block with the DOI citation and a
+         single BibTeX entry crediting C. Maughmer (2017-2020), followed by
+         <yield/>. -->
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Macro "sl-citations-clm": one bare BibTeX <citation> crediting C. Maughmer
+         (2017-2020) followed by <yield/>. Unlike "citations-clm" it has no
+         <citations> wrapper and no DOI citation; presumably it is meant to be
+         expanded inside an existing <citations> element (confirm against callers). -->
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+ </citation>
+        <yield/>
+    </xml>
+</macros>

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/cpt-macros.xml
--- a/cpt_fix_aragorn/cpt-macros.xml Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
- <xml name="gff_requirements">
- <requirements>
- <requirement type="package" version="2.7">python</requirement>
- <requirement type="package" version="1.65">biopython</requirement>
- <requirement type="package" version="2.12.1">requests</requirement>
- <yield/>
- </requirements>
- <version_command>
- <![CDATA[
- cd $__tool_directory__ && git rev-parse HEAD
- ]]>
- </version_command>
- </xml>
- <xml name="citation/mijalisrasche">
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">@unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
- </xml>
- <xml name="citations">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
- <yield/>
- </citations>
- </xml>
-     <xml name="citations-crr">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
- <yield/>
- </citations>
- </xml>
-        <xml name="citations-2020">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-                        </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="citations-2020-AJC-solo">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-                        </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="citations-clm">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="sl-citations-clm">
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <yield/>
- </xml>
-</macros>

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/fix-aragorn-gff3.py
--- a/cpt_fix_aragorn/fix-aragorn-gff3.py Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,65 +0,0 @@
-#!/usr/bin/env python
-import sys
-import logging
-import argparse
-from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
-from Bio.SeqFeature import SeqFeature
-from gff3 import feature_lambda, feature_test_type
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-
-def fixed_feature(rec):
-    for idx, feature in enumerate(
-        feature_lambda(
-            rec.features, feature_test_type, {"types": ["tRNA", "tmRNA"]}, subfeatures=True
-        )
-    ):
-
-        fid = "%s-%03d" % (feature.type, 1 + idx)
-        try:
-            name = [feature.type + "-" + feature.qualifiers["Codon"][0]]
-        except KeyError:
-            name = [feature.qualifiers['product'][0]]
-        try:
-          origSource = feature.qualifiers["source"][0]
-        except:
-          origSource = "."
-        gene = gffSeqFeature(
-            location=feature.location,
-            type="gene",
-            qualifiers={"ID": [fid + ".gene"], "source": [origSource], "Name": name},
-        )
-        feature.qualifiers["Name"] = name
-        # Below that we have an mRNA
-        exon = gffSeqFeature(
-            location=feature.location,
-            type="exon",
-            qualifiers={"source": [origSource], "ID": ["%s.exon" % fid], "Name": name},
-        )
-        feature.qualifiers["ID"] = [fid]
-        exon.qualifiers["Parent"] = [fid]
-        feature.qualifiers["Parent"] = [fid + ".gene"]
-        # gene -> trna -> exon
-        feature.sub_features = [exon]
-        gene.sub_features = [feature]
-        yield gene
-
-
-def gff_filter(gff3):
-    found_gff = False
-    for rec in gffParse(gff3):
-        found_gff = True
-        rec.features = sorted(list(fixed_feature(rec)), key=lambda x: x.location.start)
-        rec.annotations = {}
-        gffWrite([rec], sys.stdout)
-    if not found_gff:
-        print("##gff-version 3")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="add parent gene features to CDSs")
-    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
-    args = parser.parse_args()
-    gff_filter(**vars(args))

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/fix-aragorn-gff3.xml
--- a/cpt_fix_aragorn/fix-aragorn-gff3.xml Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,33 +0,0 @@
-<?xml version="1.0"?>
-<tool id="edu.tamu.cpt.external.aragorn-gff3" name="Fix tRNA model" version="19.1.0.0">
-  <description></description>
-  <macros>
-    <import>macros.xml</import>
-    <import>cpt-macros.xml</import>
-  </macros>
-  <expand macro="requirements"/>
-  <command detect_errors="aggressive"><![CDATA[
-$__tool_directory__/fix-aragorn-gff3.py
-@INPUT_GFF@
-> $default]]></command>
-  <inputs>
-      <expand macro="gff3_input" />
-  </inputs>
-  <outputs>
-    <data format="gff3" name="default"/>
-  </outputs>
-  <tests>
-    <test>
-      <param name="gff3_data" value="FixAra_In.gff3"/>
-      <output name="default" file="FixAra_Out.gff3"/>
-    </test>
-  </tests>
-  <help><![CDATA[
-**What it does**
-
-For an input GFF3 file with tRNAs from the Aragorn or converted from the tRNAscan-SE tools, this tool modifies
-the gene model to reflect a gene-tRNA-exon hierarchy. That change is needed
-to allow for creation of proper tRNA features in Apollo.
-      ]]></help>
- <expand macro="citations" />
-</tool>

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/gff3.py
--- a/cpt_fix_aragorn/gff3.py Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,346 +0,0 @@\n-import copy\n-import logging\n-\n-log = logging.getLogger()\n-log.setLevel(logging.WARN)\n-\n-\n-def feature_lambda(\n- feature_list,\n- test,\n- test_kwargs,\n- subfeatures=True,\n- parent=None,\n- invert=False,\n- recurse=True,\n-):\n- """Recursively search through features, testing each with a test function, yielding matches.\n-\n- GFF3 is a hierachical data structure, so we need to be able to recursively\n- search through features. E.g. if you\'re looking for a feature with\n- ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n- case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n-\n- :type feature_list: list\n- :param feature_list: an iterable of features\n-\n- :type test: function reference\n- :param test: a closure with the method signature (feature, **kwargs) where\n- the kwargs are those passed in the next argument. This\n- function should return True or False, True if the feature is\n- to be yielded as part of the main feature_lambda function, or\n- False if it is to be ignored. This function CAN mutate the\n- features passed to it (think "apply").\n-\n- :type test_kwargs: dictionary\n- :param test_kwargs: kwargs to pass to your closure when it is called.\n-\n- :type subfeatures: boolean\n- :param subfeatures: when a feature is matched, should just that feature be\n- yielded to the caller, or should the entire sub_feature\n- tree for that feature be included? subfeatures=True is\n- useful in cases such as searching for a gene feature,\n- and wanting to know what RBS/Shine_Dalgarno_sequences\n- are in the sub_feature tree (which can be accomplished\n- with two feature_lambda calls). 
subfeatures=False is\n- useful in cases when you want to process (and possibly\n- return) the entire feature tree, such as applying a\n- qualifier to every single feature.\n-\n- :type invert: boolean\n- :param invert: Negate/invert the result of the filter.\n-\n- :rtype: yielded list\n- :return: Yields a list of matching features.\n- """\n- # Either the top level set of [features] or the subfeature attribute\n- for feature in feature_list:\n- feature._parent = parent\n- if not parent:\n- # Set to self so we cannot go above root.\n- feature._parent = feature\n- test_result = test(feature, **test_kwargs)\n- # if (not invert and test_result) or (invert and not test_result):\n- if invert ^ test_result:\n- if not subfeatures:\n- feature_copy = copy.deepcopy(feature)\n- feature_copy.sub_features = list()\n- yield feature_copy\n- else:\n- yield feature\n-\n- if recurse and hasattr(feature, "sub_features"):\n- for x in feature_lambda(\n- feature.sub_features,\n- test,\n- test_kwargs,\n- subfeatures=subfeatures,\n- parent=feature,\n- invert=invert,\n- recurse=recurse,\n- ):\n- yield x\n-\n-\n-def fetchParent(feature):\n- if not hasattr(feature, "_parent") or feature._parent is None:\n- return feature\n- else:\n- return fetchParent(feature._parent)\n-\n-\n-def feature_test_true(feature, **kwargs):\n- return True\n-\n-\n-def feature_test_type(feature, **kwargs):\n- if "type" in kwargs:\n- return str(feature.type).upper() == str(kwargs["type"]).upper()\n- elif "types" in kwargs:\n- for x in kwargs["types"]:\n- if str(feature.type).upper() == str(x).upper():\n- return True\n- return False\n- raise Exception("Incorrect feature_test_type call, ne'..b'feature.location.start,\n- # feature.location.end,\n- # feature.location.strand\n- # )\n- return result\n-\n-\n-def get_gff3_id(gene):\n- return gene.qualifiers.get("Name", [gene.id])[0]\n-\n-\n-def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n- # This prevents frameshift errors\n- while start < 0:\n- start += 3\n- 
while end < 0:\n- end += 3\n- while start > parent_length:\n- start -= 3\n- while end > parent_length:\n- end -= 3\n- return (start, end)\n-\n-\n-def coding_genes(feature_list):\n- for x in genes(feature_list):\n- if (\n- len(\n- list(\n- feature_lambda(\n- x.sub_features,\n- feature_test_type,\n- {"type": "CDS"},\n- subfeatures=False,\n- )\n- )\n- )\n- > 0\n- ):\n- yield x\n-\n-\n-def genes(feature_list, feature_type="gene", sort=False):\n- """\n- Simple filter to extract gene features from the feature set.\n- """\n-\n- if not sort:\n- for x in feature_lambda(\n- feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n- ):\n- yield x\n- else:\n- data = list(genes(feature_list, feature_type=feature_type, sort=False))\n- data = sorted(data, key=lambda feature: feature.location.start)\n- for x in data:\n- yield x\n-\n-\n-def wa_unified_product_name(feature):\n- """\n- Try and figure out a name. We gave conflicting instructions, so\n- this isn\'t as trivial as it should be. 
Sometimes it will be in\n- \'product\' or \'Product\', othertimes in \'Name\'\n- """\n- # Manually applied tags.\n- protein_product = feature.qualifiers.get(\n- "product", feature.qualifiers.get("Product", [None])\n- )[0]\n-\n- # If neither of those are available ...\n- if protein_product is None:\n- # And there\'s a name...\n- if "Name" in feature.qualifiers:\n- if not is_uuid(feature.qualifiers["Name"][0]):\n- protein_product = feature.qualifiers["Name"][0]\n-\n- return protein_product\n-\n-\n-def is_uuid(name):\n- return name.count("-") == 4 and len(name) == 36\n-\n-\n-def get_rbs_from(gene):\n- # Normal RBS annotation types\n- rbs_rbs = list(\n- feature_lambda(\n- gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n- )\n- )\n- rbs_sds = list(\n- feature_lambda(\n- gene.sub_features,\n- feature_test_type,\n- {"type": "Shine_Dalgarno_sequence"},\n- subfeatures=False,\n- )\n- )\n- # Fraking apollo\n- apollo_exons = list(\n- feature_lambda(\n- gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False\n- )\n- )\n- apollo_exons = [x for x in apollo_exons if len(x) < 10]\n- # These are more NCBI\'s style\n- regulatory_elements = list(\n- feature_lambda(\n- gene.sub_features,\n- feature_test_type,\n- {"type": "regulatory"},\n- subfeatures=False,\n- )\n- )\n- rbs_regulatory = list(\n- feature_lambda(\n- regulatory_elements,\n- feature_test_quals,\n- {"regulatory_class": ["ribosome_binding_site"]},\n- subfeatures=False,\n- )\n- )\n- # Here\'s hoping you find just one ;)\n- return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n-\n-\n-def nice_name(record):\n- """\n- get the real name rather than NCBI IDs and so on. 
If fails, will return record.id\n- """\n- name = record.id\n- likely_parental_contig = list(genes(record.features, feature_type="contig"))\n- if len(likely_parental_contig) == 1:\n- name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n- return name\n-\n-\n-def fsort(it):\n- for i in sorted(it, key=lambda x: int(x.location.start)):\n- yield i\n'

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/macros.xml
--- a/cpt_fix_aragorn/macros.xml Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,66 +0,0 @@
-<?xml version="1.0"?>
-<macros>
- <xml name="requirements">
- <requirements>
- <requirement type="package" version="3.8.13">python</requirement>
- <requirement type="package" version="1.79">biopython</requirement>
- <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
- <yield/>
- </requirements>
- </xml>
- <xml name="genome_selector">
- <conditional name="reference_genome">
- <param name="reference_genome_source" type="select" label="Reference Genome">
- <option value="history" selected="True">From History</option>
- <option value="cached">Locally Cached</option>
- </param>
- <when value="cached">
- <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
- <options from_data_table="all_fasta"/>
- </param>
- </when>
- <when value="history">
- <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
- </when>
- </conditional>
- </xml>
- <xml name="gff3_input">
- <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
- </xml>
- <xml name="input/gff3+fasta">
- <expand macro="gff3_input" />
- <expand macro="genome_selector" />
- </xml>
- <token name="@INPUT_GFF@">
- "$gff3_data"
- </token>
- <token name="@INPUT_FASTA@">
-#if str($reference_genome.reference_genome_source) == 'cached':
- "${reference_genome.fasta_indexes.fields.path}"
-#else if str($reference_genome.reference_genome_source) == 'history':
- genomeref.fa
-#end if
- </token>
- <token name="@GENOME_SELECTOR_PRE@">
-#if $reference_genome.reference_genome_source == 'history':
- ln -s $reference_genome.genome_fasta genomeref.fa;
-#end if
- </token>
- <token name="@GENOME_SELECTOR@">
-#if str($reference_genome.reference_genome_source) == 'cached':
- "${reference_genome.fasta_indexes.fields.path}"
-#else if str($reference_genome.reference_genome_source) == 'history':
- genomeref.fa
-#end if
- </token>
- <xml name="input/fasta">
- <param label="Fasta file" name="sequences" type="data" format="fasta"/>
- </xml>
-
- <token name="@SEQUENCE@">
- "$sequences"
- </token>
- <xml name="input/fasta/protein">
- <param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
- </xml>
-</macros>

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/test-data/FixAra_In.gff3
--- a/cpt_fix_aragorn/test-data/FixAra_In.gff3 Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,2 +0,0 @@
-##gff-version-3
-Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly"

diff -r dc22c76a57bd -r f0f0ab9db43f cpt_fix_aragorn/test-data/FixAra_Out.gff3
--- a/cpt_fix_aragorn/test-data/FixAra_Out.gff3 Fri May 20 08:46:58 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,4 +0,0 @@
-##gff-version 3
-Phriendly feature gene 48610 48685 . - . ID=tRNA-001.gene;source=.;Name=tRNA-"Gly";
-Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly";Name=tRNA-"Gly";ID=tRNA-001;Parent=tRNA-001.gene;
-Phriendly feature exon 48610 48685 . - . source=.;ID=tRNA-001.exon;Name=tRNA-"Gly";Parent=tRNA-001;

diff -r dc22c76a57bd -r f0f0ab9db43f fix-aragorn-gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fix-aragorn-gff3.py Mon Jun 05 02:42:12 2023 +0000

[

@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+import sys
+import logging
+import argparse
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from Bio.SeqFeature import SeqFeature
+from gff3 import feature_lambda, feature_test_type
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def fixed_feature(rec):
+    for idx, feature in enumerate(
+        feature_lambda(
+            rec.features,
+            feature_test_type,
+            {"types": ["tRNA", "tmRNA"]},
+            subfeatures=True,
+        )
+    ):
+
+        fid = "%s-%03d" % (feature.type, 1 + idx)
+        try:
+            name = [feature.type + "-" + feature.qualifiers["Codon"][0]]
+        except KeyError:
+            name = [feature.qualifiers["product"][0]]
+        try:
+            origSource = feature.qualifiers["source"][0]
+        except:
+            origSource = "."
+        gene = gffSeqFeature(
+            location=feature.location,
+            type="gene",
+            qualifiers={"ID": [fid + ".gene"], "source": [origSource], "Name": name},
+        )
+        feature.qualifiers["Name"] = name
+        # Below that we have an mRNA
+        exon = gffSeqFeature(
+            location=feature.location,
+            type="exon",
+            qualifiers={"source": [origSource], "ID": ["%s.exon" % fid], "Name": name},
+        )
+        feature.qualifiers["ID"] = [fid]
+        exon.qualifiers["Parent"] = [fid]
+        feature.qualifiers["Parent"] = [fid + ".gene"]
+        # gene -> trna -> exon
+        feature.sub_features = [exon]
+        gene.sub_features = [feature]
+        yield gene
+
+
+def gff_filter(gff3):
+    found_gff = False
+    for rec in gffParse(gff3):
+        found_gff = True
+        rec.features = sorted(list(fixed_feature(rec)), key=lambda x: x.location.start)
+        rec.annotations = {}
+        gffWrite([rec], sys.stdout)
+    if not found_gff:
+        print("##gff-version 3")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="add parent gene features to CDSs")
+    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+    args = parser.parse_args()
+    gff_filter(**vars(args))

diff -r dc22c76a57bd -r f0f0ab9db43f fix-aragorn-gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fix-aragorn-gff3.xml Mon Jun 05 02:42:12 2023 +0000

[

@@ -0,0 +1,32 @@
+<!-- Galaxy tool wrapper for fix-aragorn-gff3.py. It takes one GFF3 dataset
+     (parameter gff3_data, supplied by the gff3_input macro) and redirects the
+     script's standard output into the gff3 output named "default". -->
+<tool id="edu.tamu.cpt.external.aragorn-gff3" name="Fix tRNA model" version="19.1.0.0">
+  <description/>
+  <macros>
+    <import>macros.xml</import>
+    <import>cpt-macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <!-- The script is started through the python interpreter so that the tool does
+       not depend on the script file keeping its executable bit after upload or
+       installation. -->
+  <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/fix-aragorn-gff3.py'
+@INPUT_GFF@
+> '$default']]></command>
+  <inputs>
+    <expand macro="gff3_input"/>
+  </inputs>
+  <outputs>
+    <data format="gff3" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="gff3_data" value="FixAra_In.gff3"/>
+      <output name="default" file="FixAra_Out.gff3"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+For an input GFF3 file with tRNAs from the Aragorn or converted from the tRNAscan-SE tools, this tool modifies
+the gene model to reflect a gene-tRNA-exon hierarchy. That change is needed
+to allow for creation of proper tRNA features in Apollo.
+      ]]></help>
+  <expand macro="citations"/>
+</tool>

diff -r dc22c76a57bd -r f0f0ab9db43f gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3.py Mon Jun 05 02:42:12 2023 +0000

[

b'@@ -0,0 +1,346 @@\n+import copy\n+import logging\n+\n+log = logging.getLogger()\n+log.setLevel(logging.WARN)\n+\n+\n+def feature_lambda(\n+ feature_list,\n+ test,\n+ test_kwargs,\n+ subfeatures=True,\n+ parent=None,\n+ invert=False,\n+ recurse=True,\n+):\n+ """Recursively search through features, testing each with a test function, yielding matches.\n+\n+ GFF3 is a hierachical data structure, so we need to be able to recursively\n+ search through features. E.g. if you\'re looking for a feature with\n+ ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n+ case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n+\n+ :type feature_list: list\n+ :param feature_list: an iterable of features\n+\n+ :type test: function reference\n+ :param test: a closure with the method signature (feature, **kwargs) where\n+ the kwargs are those passed in the next argument. This\n+ function should return True or False, True if the feature is\n+ to be yielded as part of the main feature_lambda function, or\n+ False if it is to be ignored. This function CAN mutate the\n+ features passed to it (think "apply").\n+\n+ :type test_kwargs: dictionary\n+ :param test_kwargs: kwargs to pass to your closure when it is called.\n+\n+ :type subfeatures: boolean\n+ :param subfeatures: when a feature is matched, should just that feature be\n+ yielded to the caller, or should the entire sub_feature\n+ tree for that feature be included? subfeatures=True is\n+ useful in cases such as searching for a gene feature,\n+ and wanting to know what RBS/Shine_Dalgarno_sequences\n+ are in the sub_feature tree (which can be accomplished\n+ with two feature_lambda calls). 
subfeatures=False is\n+ useful in cases when you want to process (and possibly\n+ return) the entire feature tree, such as applying a\n+ qualifier to every single feature.\n+\n+ :type invert: boolean\n+ :param invert: Negate/invert the result of the filter.\n+\n+ :rtype: yielded list\n+ :return: Yields a list of matching features.\n+ """\n+ # Either the top level set of [features] or the subfeature attribute\n+ for feature in feature_list:\n+ feature._parent = parent\n+ if not parent:\n+ # Set to self so we cannot go above root.\n+ feature._parent = feature\n+ test_result = test(feature, **test_kwargs)\n+ # if (not invert and test_result) or (invert and not test_result):\n+ if invert ^ test_result:\n+ if not subfeatures:\n+ feature_copy = copy.deepcopy(feature)\n+ feature_copy.sub_features = list()\n+ yield feature_copy\n+ else:\n+ yield feature\n+\n+ if recurse and hasattr(feature, "sub_features"):\n+ for x in feature_lambda(\n+ feature.sub_features,\n+ test,\n+ test_kwargs,\n+ subfeatures=subfeatures,\n+ parent=feature,\n+ invert=invert,\n+ recurse=recurse,\n+ ):\n+ yield x\n+\n+\n+def fetchParent(feature):\n+ if not hasattr(feature, "_parent") or feature._parent is None:\n+ return feature\n+ else:\n+ return fetchParent(feature._parent)\n+\n+\n+def feature_test_true(feature, **kwargs):\n+ return True\n+\n+\n+def feature_test_type(feature, **kwargs):\n+ if "type" in kwargs:\n+ return str(feature.type).upper() == str(kwargs["type"]).upper()\n+ elif "types" in kwargs:\n+ for x in kwargs["types"]:\n+ if str(feature.type).upper() == str(x).upper():\n+ return True\n+ return False\n+ raise Exception("Incorrect feature_test'..b'feature.location.start,\n+ # feature.location.end,\n+ # feature.location.strand\n+ # )\n+ return result\n+\n+\n+def get_gff3_id(gene):\n+ return gene.qualifiers.get("Name", [gene.id])[0]\n+\n+\n+def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n+ # This prevents frameshift errors\n+ while start < 0:\n+ start += 3\n+ while end < 
0:\n+ end += 3\n+ while start > parent_length:\n+ start -= 3\n+ while end > parent_length:\n+ end -= 3\n+ return (start, end)\n+\n+\n+def coding_genes(feature_list):\n+ for x in genes(feature_list):\n+ if (\n+ len(\n+ list(\n+ feature_lambda(\n+ x.sub_features,\n+ feature_test_type,\n+ {"type": "CDS"},\n+ subfeatures=False,\n+ )\n+ )\n+ )\n+ > 0\n+ ):\n+ yield x\n+\n+\n+def genes(feature_list, feature_type="gene", sort=False):\n+ """\n+ Simple filter to extract gene features from the feature set.\n+ """\n+\n+ if not sort:\n+ for x in feature_lambda(\n+ feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n+ ):\n+ yield x\n+ else:\n+ data = list(genes(feature_list, feature_type=feature_type, sort=False))\n+ data = sorted(data, key=lambda feature: feature.location.start)\n+ for x in data:\n+ yield x\n+\n+\n+def wa_unified_product_name(feature):\n+ """\n+ Try and figure out a name. We gave conflicting instructions, so\n+ this isn\'t as trivial as it should be. Sometimes it will be in\n+ \'product\' or \'Product\', othertimes in \'Name\'\n+ """\n+ # Manually applied tags.\n+ protein_product = feature.qualifiers.get(\n+ "product", feature.qualifiers.get("Product", [None])\n+ )[0]\n+\n+ # If neither of those are available ...\n+ if protein_product is None:\n+ # And there\'s a name...\n+ if "Name" in feature.qualifiers:\n+ if not is_uuid(feature.qualifiers["Name"][0]):\n+ protein_product = feature.qualifiers["Name"][0]\n+\n+ return protein_product\n+\n+\n+def is_uuid(name):\n+ return name.count("-") == 4 and len(name) == 36\n+\n+\n+def get_rbs_from(gene):\n+ # Normal RBS annotation types\n+ rbs_rbs = list(\n+ feature_lambda(\n+ gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n+ )\n+ )\n+ rbs_sds = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "Shine_Dalgarno_sequence"},\n+ subfeatures=False,\n+ )\n+ )\n+ # Fraking apollo\n+ apollo_exons = list(\n+ feature_lambda(\n+ gene.sub_features, 
feature_test_type, {"type": "exon"}, subfeatures=False\n+ )\n+ )\n+ apollo_exons = [x for x in apollo_exons if len(x) < 10]\n+ # These are more NCBI\'s style\n+ regulatory_elements = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "regulatory"},\n+ subfeatures=False,\n+ )\n+ )\n+ rbs_regulatory = list(\n+ feature_lambda(\n+ regulatory_elements,\n+ feature_test_quals,\n+ {"regulatory_class": ["ribosome_binding_site"]},\n+ subfeatures=False,\n+ )\n+ )\n+ # Here\'s hoping you find just one ;)\n+ return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n+\n+\n+def nice_name(record):\n+ """\n+ get the real name rather than NCBI IDs and so on. If fails, will return record.id\n+ """\n+ name = record.id\n+ likely_parental_contig = list(genes(record.features, feature_type="contig"))\n+ if len(likely_parental_contig) == 1:\n+ name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n+ return name\n+\n+\n+def fsort(it):\n+ for i in sorted(it, key=lambda x: int(x.location.start)):\n+ yield i\n'

diff -r dc22c76a57bd -r f0f0ab9db43f macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:42:12 2023 +0000

@@ -0,0 +1,74 @@
+<macros>
+    <!-- Macro "requirements": packages installed for tools that expand it, followed
+         by a <yield/> placeholder for tool-specific additions. cpt_gffparser is
+         listed because fix-aragorn-gff3.py imports CPT_GFFParser; the version pin
+         1.2.2 matches the one used by "gff_requirements" in cpt-macros.xml.
+         NOTE(review): the script also imports Bio (biopython), which is not listed
+         here; confirm it is pulled in transitively by the packages below.
+         NOTE(review): progressivemauve carries no version pin. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">progressivemauve</requirement>
+            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <!-- Token @WRAPPER_VERSION@ expands to the literal "2.4.0".
+         NOTE(review): fix-aragorn-gff3.xml hard-codes version="19.1.0.0" and does
+         not reference this token. -->
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <!-- Macros "citation/progressive_mauve" and "citation/gepard": one bare DOI
+         <citation> each, without a <citations> wrapper. -->
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <!-- Token @XMFA_INPUT@ expands to the single-quoted value of the "xmfa"
+         parameter, which the "xmfa_input" macro below declares. The macro's
+         token_formats="xmfa" attribute supplies the default for the @FORMATS@
+         placeholder used in the param's format attribute. -->
+    <token name="@XMFA_INPUT@">
+ '$xmfa'
+ </token>
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
+    </xml>
+    <!-- Token @XMFA_FA_INPUT@ expands to the single-quoted value of the
+         "sequences" parameter, a fasta dataset declared by the "xmfa_fa_input"
+         macro below. -->
+    <token name="@XMFA_FA_INPUT@">
+ '$sequences'
+ </token>
+    <xml name="xmfa_fa_input">
+        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <!-- Macro "genome_selector": a conditional named "reference_genome" whose select
+         parameter "reference_genome_source" chooses between "history" (the default,
+         selected="True") and "cached". The "cached" branch offers a select named
+         "fasta_indexes" filled from the all_fasta data table; the "history" branch
+         offers a fasta dataset parameter named "genome_fasta". -->
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source" type="select" label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <when value="cached">
+                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Macro "gff3_input": a single gff3 dataset parameter named "gff3_data".
+         fix-aragorn-gff3.xml expands it inside its <inputs>. -->
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <!-- Macro "input/gff3+fasta": expands "gff3_input" followed by
+         "genome_selector". -->
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <!-- Token @INPUT_GFF@ expands to the single-quoted value of the "gff3_data"
+         parameter; fix-aragorn-gff3.xml passes it to the script as its only
+         argument. -->
+    <token name="@INPUT_GFF@">
+     '$gff3_data'
+ </token>
+    <!-- Token @INPUT_FASTA@: template snippet that expands to the single-quoted path
+         field of the selected "fasta_indexes" entry when the reference source is
+         'cached', and to the literal file name genomeref.fa when it is 'history'
+         (the name of the symlink created by @GENOME_SELECTOR_PRE@). Its text is
+         identical to @GENOME_SELECTOR@. -->
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+ </token>
+    <!-- Token @GENOME_SELECTOR_PRE@: when the reference source is 'history', emits a
+         command that symlinks the chosen "genome_fasta" dataset to genomeref.fa in
+         the working directory; emits nothing otherwise. The parameter is wrapped in
+         str() before the comparison, matching the other reference_genome_source
+         tests in this file. -->
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if str($reference_genome.reference_genome_source) == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+ </token>
+    <!-- Token @GENOME_SELECTOR@: template snippet that expands to the single-quoted
+         path field of the selected "fasta_indexes" entry when the reference source
+         is 'cached', and to the literal file name genomeref.fa when it is 'history'.
+         Its text is identical to @INPUT_FASTA@. -->
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+ </token>
+</macros>

diff -r dc22c76a57bd -r f0f0ab9db43f test-data/FixAra_In.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/FixAra_In.gff3 Mon Jun 05 02:42:12 2023 +0000

@@ -0,0 +1,2 @@
+##gff-version-3
+Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly"

diff -r dc22c76a57bd -r f0f0ab9db43f test-data/FixAra_Out.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/FixAra_Out.gff3 Mon Jun 05 02:42:12 2023 +0000

@@ -0,0 +1,4 @@
+##gff-version 3
+Phriendly feature gene 48610 48685 . - . ID=tRNA-001.gene;source=.;Name=tRNA-"Gly";
+Phriendly aragorn tRNA 48610 48685 . - . Anticodon="tcc";Codon="Gly";Name=tRNA-"Gly";ID=tRNA-001;Parent=tRNA-001.gene;
+Phriendly feature exon 48610 48685 . - . source=.;ID=tRNA-001.exon;Name=tRNA-"Gly";Parent=tRNA-001;