Repository 'cpt_gbk_to_gff'
hg clone https://toolshed.g2.bx.psu.edu/repos/cpt/cpt_gbk_to_gff

Changeset 1:bb6332a85aa6 (2023-06-05)
Previous changeset 0:a68f32350196 (2022-06-17) Next changeset 2:a921d6148d88 (2024-01-05)
Commit message:
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
added:
cpt-macros.xml
cpt_gbkToGff3.xml
gbk_to_gff3.py
macros.xml
removed:
cpt_gbk_to_gff/cpt-macros.xml
cpt_gbk_to_gff/cpt_gbkToGff3.xml
cpt_gbk_to_gff/gbk_to_gff3.py
cpt_gbk_to_gff/macros.xml
b
diff -r a68f32350196 -r bb6332a85aa6 cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:43:04 2023 +0000
[
@@ -0,0 +1,136 @@
+<macros>
+    <!-- Package requirements (python 2.7, biopython 1.65, requests 2.12.1,
+         cpt_gffparser 1.2.2) plus a version_command that prints the git HEAD
+         commit of the tool directory. Callers can append further requirement
+         elements through the yield placeholder.
+         NOTE(review): the version_command only works when the tool directory
+         is part of a git checkout; confirm behaviour on Tool Shed (Mercurial)
+         installs. -->
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+            <![CDATA[
+            cd '$__tool_directory__' && git rev-parse HEAD
+            ]]>
+        </version_command>
+    </xml>
+    <!-- Paper DOI plus the Mijalis/Rasche tools entry, emitted without a
+         surrounding citations element. -->
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+            author = {E. Mijalis and H. Rasche},
+            title = {CPT Galaxy Tools},
+            year = {2013-2017},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </xml>
+    <!-- Full citations element: paper DOI plus the Mijalis/Rasche tools entry;
+         extra citation elements can be added through the yield placeholder. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis and H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Full citations element: paper DOI plus the C. Ross tools entry;
+         extra citation elements can be added through the yield placeholder. -->
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Ross},
+                title = {CPT Galaxy Tools},
+                year = {2020-},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Full citations element: paper DOI plus the Mijalis/Rasche and the
+         A. Criscione tools entries; extensible through the yield placeholder. -->
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis and H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Full citations element: paper DOI plus only the A. Criscione tools
+         entry; extensible through the yield placeholder. -->
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Full citations element: paper DOI plus the C. Maughmer tools entry;
+         extensible through the yield placeholder. -->
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Only the C. Maughmer tools entry, without the DOI and without a
+         surrounding citations element; followed by a yield placeholder. -->
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+        @unpublished{galaxyTools,
+            author = {C. Maughmer},
+            title = {CPT Galaxy Tools},
+            year = {2017-2020},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+        }
+        </citation>
+        <yield/>
+    </xml>
+</macros>
b
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbkToGff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbkToGff3.xml Mon Jun 05 02:43:04 2023 +0000
[
@@ -0,0 +1,57 @@
+<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0">
+  <description> CPT made Biopython-based solution</description>
+  <macros>
+    <import>macros.xml</import>
+    <import>cpt-macros.xml</import>
+  </macros>
+  <!-- Dependencies are declared inline: gbk_to_gff3.py imports Bio and
+       CPT_GFFParser, which the "requirements" macro in this changeset's
+       macros.xml (progressivemauve, bcbiogff) does not provide. Versions match
+       the macros.xml shipped with the previous changeset. -->
+  <requirements>
+    <requirement type="package" version="3.8.13">python</requirement>
+    <requirement type="package" version="1.79">biopython</requirement>
+    <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+  </requirements>
+  <!-- The two boolean flags are deliberately left unquoted: when unchecked they
+       render as an empty string, and quoting them would hand the script an
+       empty positional argument that argparse rejects. -->
+  <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/gbk_to_gff3.py'
+'$gbkIn'
+$makeMRNA
+$makeGene
+--identifier '$qualID'
+--fastaFile '$fastaOut'
+> '$default']]></command>
+  <inputs>
+    <param label="GenBank file" name="gbkIn" type="data" format="genbank"/>
+    <param checked="true" label="Automatically generate any missing Gene features if CDS/RBS has none" name="makeGene" type="boolean" truevalue="--makeGene" falsevalue=""/>
+    <param checked="true" label="Automatically generate missing mRNA features for genes" name="makeMRNA" type="boolean" truevalue="--makeMRNA" falsevalue=""/>
+    <param label="Qualifier to derive GFF ID from" name="qualID" type="text" value="locus_tag"/>
+  </inputs>
+  <outputs>
+    <data format="gff3" hidden="false" name="default"/>
+    <data format="fasta" hidden="false" name="fastaOut"/>
+  </outputs>
+  <tests>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl solution.
+
+Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature.
+]]></help>
+  <citations>
+    <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+    <citation type="bibtex">
+    @unpublished{galaxyTools,
+        author = {A. Criscione},
+        title = {CPT Galaxy Tools},
+        year = {2019-2021},
+        note = {https://github.com/tamu-cpt/galaxy-tools/}
+    }
+    </citation>
+  </citations>
+</tool>
b
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt-macros.xml
--- a/cpt_gbk_to_gff/cpt-macros.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
- <xml name="gff_requirements">
- <requirements>
- <requirement type="package" version="2.7">python</requirement>
- <requirement type="package" version="1.65">biopython</requirement>
- <requirement type="package" version="2.12.1">requests</requirement>
- <yield/>
- </requirements>
- <version_command>
- <![CDATA[
- cd $__tool_directory__ && git rev-parse HEAD
- ]]>
- </version_command>
- </xml>
- <xml name="citation/mijalisrasche">
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">@unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
- </xml>
- <xml name="citations">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation> 
- <yield/>
- </citations>
- </xml>
-     <xml name="citations-crr">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
- <yield/>
- </citations>
- </xml>
-        <xml name="citations-2020">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-                        </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="citations-2020-AJC-solo">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-                        </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="citations-clm">
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <yield/>
- </citations>
- </xml>
-        <xml name="sl-citations-clm">
- <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
- </citation>
-                        <yield/>
- </xml>
-</macros>
b
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt_gbkToGff3.xml
--- a/cpt_gbk_to_gff/cpt_gbkToGff3.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,49 +0,0 @@
-<?xml version="1.0"?>
-<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0">
-  <description> CPT made Biobython-based solution</description>
-  <macros>
-    <import>macros.xml</import>
- <import>cpt-macros.xml</import>
-  </macros>
-  <expand macro="requirements"/>
-  <command detect_errors="aggressive"><![CDATA[
-$__tool_directory__/gbk_to_gff3.py
-$gbkIn
-$makeMRNA
-$makeGene
---identifier "$qualID"
---fastaFile $fastaOut
-> $default]]></command>
-  <inputs>
-    <param label="GenBank file" name="gbkIn" type="data" format="genbank"/>
-    <param checked="true" label="Automatically generate any missing Gene features if CDS/RBS has none" name="makeGene"
-        type="boolean" truevalue="--makeGene" falsevalue=""/>
-    <param checked="true" label="Automatically generate missing mRNA features for genes" name="makeMRNA"
-        type="boolean" truevalue="--makeMRNA" falsevalue=""/>
-    <param label="Qualifier to derive GFF ID from" name="qualID" type="text" value="locus_tag"/>
-  </inputs>
-  <outputs>
-    <data format="gff3" hidden="false" name="default"/>
-    <data format="fasta" hidden="false" name="fastaOut"/>
-  </outputs>
-  <tests>
-  </tests>
-  <help><![CDATA[
-**What it does**
-
-A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl  solution. 
-
-Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature.
-]]></help>
- <citations>
- <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-                        </citation>
- </citations>
-</tool>
b
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/gbk_to_gff3.py
--- a/cpt_gbk_to_gff/gbk_to_gff3.py Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,274 +0,0 @@\n-#!/usr/bin/env python\n-\n-import argparse\n-import sys\n-\n-from Bio import SeqIO\n-from Bio.SeqRecord import SeqRecord\n-from Bio.SeqFeature import FeatureLocation\n-from CPT_GFFParser import gffSeqFeature, gffWrite\n-\n-bottomFeatTypes = ["exon", "RBS", "CDS"]\n-\n-def makeGffFeat(inFeat, num, recName, identifier):\n-    if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"):\n-      inFeat.type = "Shine_Dalgarno_sequence"\n-    if "codon_start" in inFeat.qualifiers.keys():\n-      shift = int(inFeat.qualifiers["codon_start"][0]) - 1\n-    else:\n-      shift = "."\n-    if identifier in inFeat.qualifiers.keys():\n-      name = inFeat.qualifiers[identifier][0] + "." + inFeat.type \n-      if num > 0:\n-        name += "." + str(num)\n-    else:\n-      name = recName + "." + inFeat.type + "." + str(num)\n-    \n-    outFeat = gffSeqFeature(inFeat.location, inFeat.type, \'\', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff")\n-    outFeat.qualifiers["ID"] = [name]  \n-    return outFeat\n-\n-def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):\n-\n-    ofh = sys.stdout\n-    if outFile:\n-        ofh = outFile\n-\n-    outRec = []\n-    failed = 0\n-    for rec in SeqIO.parse(inFile, "genbank"):\n-        recID = rec.name\n-\n-        if len(str(rec.seq)) > 0:\n-            seqs_pending_writes = True\n-            outSeq = str(rec.seq)\n-            seqLen = len(outSeq)\n-\n-        locBucket = {}\n-        outFeats = []\n-        topTypeDict = {}\n-        seekingParent = []\n-        geneNum = 0\n-        autoGeneNum = 0\n-        for feat in rec.features:\n-            if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) 
- AJC\n-              if feat.type in bottomFeatTypes:\n-                seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates]\n-                continue\n-              elif feat.type not in topTypeDict.keys():\n-                topTypeDict[feat.type] = 1\n-              else:\n-                topTypeDict[feat.type] += 1\n-              outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier))\n-              continue\n-            elif feat.qualifiers[identifier][0] not in locBucket.keys():\n-              locBucket[feat.qualifiers[identifier][0]] = []\n-            locBucket[feat.qualifiers[identifier][0]].append(feat)\n-\n-        for locus in locBucket.keys():\n-          minLoc = locBucket[locus][0].location.start\n-          maxLoc = locBucket[locus][0].location.end\n-          for feat in locBucket[locus]:\n-            minLoc = min(minLoc, feat.location.start)\n-            maxLoc = max(maxLoc, feat.location.end)\n-          for x in seekingParent:\n-            if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:\n-              x[1].append(locus)\n-            if x[0].location.start == minLoc or x[0].location.end == maxLoc:\n-              x[2].append(locus)\n-\n-        for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free]\n-          if len(x[2]) == 1:\n-            finList = ""\n-            if len(x[1]) > 1:\n-              for loc in x[1]:\n-                if loc != x[2][0]:\n-                  finList += loc + ", "\n-              finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. 
Other, weaker candidate(s) were " + finList[0:-2] + "."\n-            else:\n-              finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."\n-            if "Notes" not in x[0].qualifiers.keys():\n-              x[0].qualifiers["Notes"] = []\n-            x[0].qualifiers["Notes"].append(finList)\n-            x[1] = x[2][0]\n-          elif len(x[2]) > 1:\n-            candidate = x[2][0] #Arbitrarily choose first one\n-            finList = ""\n-            strongList = ""\n-            f'..b'peDict[x] = 1\n-          \n-          if not topFeat:\n-            if makeGene:\n-              if midFeat:\n-                possibleStrand = midFeat.strand\n-              else:\n-                possibleStrand = bottomFeats[0].strand\n-              tempName = recID + ".gene." + str(geneNum)\n-              tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}\n-              topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), \'gene\', \'\', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")\n-            else:\n-              sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \\n" % (minLoc, maxLoc))\n-              for x in locBucket[locus]:\n-                sys.stderr.write(str(x))\n-                sys.stderr.write(\'\\n\')\n-                failed = 1\n-              continue\n-\n-          outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))\n-          if not midFeat and topFeat.type == "gene" and makeMRNA:\n-              if identifier in topFeat.qualifiers.keys():\n-                tempName = topFeat.qualifiers[identifier][0] + ".mRNA"\n-                tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}\n-              else:\n-                tempName = 
outFeats[-1].ID + ".mRNA"\n-                tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}\n-              midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), \'mRNA\', \'\', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")\n-              \n-          if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features\n-              outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier))\n-              outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n-              for x in bottomFeats:\n-                typeDict[x.type] += 1\n-                outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))\n-                outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id]\n-          else: # No midFeat, append bottom feats directly to top feats \n-              for x in bottomFeats:\n-                typeDict[x.type] += 1\n-                outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))\n-                outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n-                \n-        outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations))\n-        SeqIO.write([outRec[-1]], fastaFile, "fasta")\n-    gffWrite(outRec, ofh)    \n-    exit(failed) # 0 if all features handled, 1 if unable to handle some\n-\n-\n-if __name__ == \'__main__\':\n-    parser = argparse.ArgumentParser( description=\'Biopython solution to Gbk to GFF conversion\')\n-\n-    parser.add_argument(\'inFile\', type=argparse.FileType("r"), help=\'Path to an input GBK file\' )\n-    parser.add_argument(\'--makeMRNA\', 
action="store_true", required=False, help="Automatically create mRNA features")\n-    parser.add_argument(\'--makeGene\', action="store_true", required=False, help="Automatically create missing Gene features")\n-    parser.add_argument(\'--identifier\', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from")\n-    parser.add_argument(\'--fastaFile\', type=argparse.FileType("w"),  help=\'Fasta output for sequences\' )\n-    parser.add_argument(\'--outFile\', type=argparse.FileType("w"),  help=\'GFF feature output\' )\n-    args = parser.parse_args()\n-    main(**vars(args))\n-\n-\n-\n-\n-\n-\n-\n-\n'
b
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/macros.xml
--- a/cpt_gbk_to_gff/macros.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,105 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-  <xml name="requirements">
-    <requirements>
- <requirement type="package" version="3.8.13">python</requirement>
- <requirement type="package" version="1.79">biopython</requirement>
- <requirement type="package" version="1.2.2">cpt_gffparser</requirement>  
- <yield/>
-    </requirements>
-  </xml>
-  <xml name="ldap_ref"
-    token_name="dn_ref"
-    token_label="Pick a DN"
-    token_fromfile="ldap_people.loc">
-        <repeat name="repeat_@NAME@" title="@LABEL@">
-          <param name="@NAME@" label="Select a @LABEL@" type="select">
-            <options from_file="@FROMFILE@">
-                <column name="name" index="0"/>
-                <column name="value" index="1"/>
-            </options>
-          </param>
-        </repeat>
-    </xml>
-  <xml name="ldap_ref_single"
-    token_name="dn_ref"
-    token_label="Pick a DN"
-    token_fromfile="ldap_people.loc">
-          <param name="@NAME@" label="Select a @LABEL@" type="select">
-            <options from_file="@FROMFILE@">
-                <column name="name" index="0"/>
-                <column name="value" index="1"/>
-            </options>
-          </param>
-    </xml>
- <xml name="gbk_feature_type"
- token_label="Feature type to remove"
- token_multiple="True"
- token_optional="False"
- token_name="positional_2">
-    <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select">
-      <option value="-10_signal">-10_signal</option>
-      <option value="-35_signal">-35_signal</option>
-      <option value="3'UTR">3'UTR</option>
-      <option value="5'UTR">5'UTR</option>
-      <option value="CAAT_signal">CAAT_signal</option>
-      <option selected="true" value="CDS">CDS</option>
-      <option value="C_region">C_region</option>
-      <option value="D-loop">D-loop</option>
-      <option value="D_segment">D_segment</option>
-      <option value="GC_signal">GC_signal</option>
-      <option value="J_segment">J_segment</option>
-      <option value="LTR">LTR</option>
-      <option value="N_region">N_region</option>
-      <option value="RBS">RBS</option>
-      <option value="STS">STS</option>
-      <option value="S_region">S_region</option>
-      <option value="TATA_signal">TATA_signal</option>
-      <option value="V_region">V_region</option>
-      <option value="V_segment">V_segment</option>
-      <option value="all">all</option>
-      <option value="assembly_gap">assembly_gap</option>
-      <option value="attenuator">attenuator</option>
-      <option value="enhancer">enhancer</option>
-      <option value="exon">exon</option>
-      <option value="gap">gap</option>
-      <option value="gene">gene</option>
-      <option value="iDNA">iDNA</option>
-      <option value="intron">intron</option>
-      <option value="mRNA">mRNA</option>
-      <option value="mat_peptide">mat_peptide</option>
-      <option value="misc_RNA">misc_RNA</option>
-      <option value="misc_binding">misc_binding</option>
-      <option value="misc_difference">misc_difference</option>
-      <option value="misc_feature">misc_feature</option>
-      <option value="misc_recomb">misc_recomb</option>
-      <option value="misc_signal">misc_signal</option>
-      <option value="misc_structure">misc_structure</option>
-      <option value="mobile_element">mobile_element</option>
-      <option value="modified_base">modified_base</option>
-      <option value="ncRNA">ncRNA</option>
-      <option value="old_sequence">old_sequence</option>
-      <option value="operon">operon</option>
-      <option value="oriT">oriT</option>
-      <option value="polyA_signal">polyA_signal</option>
-      <option value="polyA_site">polyA_site</option>
-      <option value="precursor_RNA">precursor_RNA</option>
-      <option value="prim_transcript">prim_transcript</option>
-      <option value="primer_bind">primer_bind</option>
-      <option value="promoter">promoter</option>
-      <option value="protein_bind">protein_bind</option>
-      <option value="rRNA">rRNA</option>
-      <option value="rep_origin">rep_origin</option>
-      <option value="repeat_region">repeat_region</option>
-      <option value="sig_peptide">sig_peptide</option>
-      <option value="source">source</option>
-      <option value="stem_loop">stem_loop</option>
-      <option value="tRNA">tRNA</option>
-      <option value="terminator">terminator</option>
-      <option value="tmRNA">tmRNA</option>
-      <option value="transit_peptide">transit_peptide</option>
-      <option value="unsure">unsure</option>
-      <option value="variation">variation</option>
-    </param>
- </xml>
-</macros>
b
diff -r a68f32350196 -r bb6332a85aa6 gbk_to_gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff3.py Mon Jun 05 02:43:04 2023 +0000
[
b'@@ -0,0 +1,477 @@\n+#!/usr/bin/env python\n+\n+import argparse\n+import sys\n+\n+from Bio import SeqIO\n+from Bio.SeqRecord import SeqRecord\n+from Bio.SeqFeature import FeatureLocation\n+from CPT_GFFParser import gffSeqFeature, gffWrite\n+\n+bottomFeatTypes = ["exon", "RBS", "CDS"]\n+\n+\n+def makeGffFeat(inFeat, num, recName, identifier):\n+    if inFeat.type == "RBS" or (\n+        inFeat.type == "regulatory"\n+        and "regulatory_class" in inFeat.qualifiers.keys()\n+        and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"\n+    ):\n+        inFeat.type = "Shine_Dalgarno_sequence"\n+    if "codon_start" in inFeat.qualifiers.keys():\n+        shift = int(inFeat.qualifiers["codon_start"][0]) - 1\n+    else:\n+        shift = "."\n+    if identifier in inFeat.qualifiers.keys():\n+        name = inFeat.qualifiers[identifier][0] + "." + inFeat.type\n+        if num > 0:\n+            name += "." + str(num)\n+    else:\n+        name = recName + "." + inFeat.type + "." 
+ str(num)\n+\n+    outFeat = gffSeqFeature(\n+        inFeat.location,\n+        inFeat.type,\n+        "",\n+        inFeat.strand,\n+        name,\n+        inFeat.qualifiers,\n+        None,\n+        None,\n+        None,\n+        shift,\n+        0,\n+        "GbkToGff",\n+    )\n+    outFeat.qualifiers["ID"] = [name]\n+    return outFeat\n+\n+\n+def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):\n+\n+    ofh = sys.stdout\n+    if outFile:\n+        ofh = outFile\n+\n+    outRec = []\n+    failed = 0\n+    for rec in SeqIO.parse(inFile, "genbank"):\n+        recID = rec.name\n+\n+        if len(str(rec.seq)) > 0:\n+            seqs_pending_writes = True\n+            outSeq = str(rec.seq)\n+            seqLen = len(outSeq)\n+\n+        locBucket = {}\n+        outFeats = []\n+        topTypeDict = {}\n+        seekingParent = []\n+        geneNum = 0\n+        autoGeneNum = 0\n+        for feat in rec.features:\n+            if (\n+                identifier not in feat.qualifiers.keys()\n+            ):  # Allow metadata features and other features with no ID (Output warning?) 
- AJC\n+                if feat.type in bottomFeatTypes:\n+                    seekingParent.append(\n+                        [feat, [], []]\n+                    )  # [Feature, all parent candidates, strongest parent candidates]\n+                    continue\n+                elif feat.type not in topTypeDict.keys():\n+                    topTypeDict[feat.type] = 1\n+                else:\n+                    topTypeDict[feat.type] += 1\n+                outFeats.append(\n+                    makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)\n+                )\n+                continue\n+            elif feat.qualifiers[identifier][0] not in locBucket.keys():\n+                locBucket[feat.qualifiers[identifier][0]] = []\n+            locBucket[feat.qualifiers[identifier][0]].append(feat)\n+\n+        for locus in locBucket.keys():\n+            minLoc = locBucket[locus][0].location.start\n+            maxLoc = locBucket[locus][0].location.end\n+            for feat in locBucket[locus]:\n+                minLoc = min(minLoc, feat.location.start)\n+                maxLoc = max(maxLoc, feat.location.end)\n+            for x in seekingParent:\n+                if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:\n+                    x[1].append(locus)\n+                if x[0].location.start == minLoc or x[0].location.end == maxLoc:\n+                    x[2].append(locus)\n+\n+        for x in seekingParent:  # Reformat to [Feature, Locus, Unused/Free]\n+            if len(x[2]) == 1:\n+                finList = ""\n+                if len(x[1]) > 1:\n+                    for loc in x[1]:\n+                        if loc != x[2][0]:\n+                            finList += loc + ", "\n+                    finList = (\n+                        str(x[0].type)\n+                        + " had no locus tag set in .gbk file, automatically derived. 
Other, weaker candidate(s) were "\n+                        + finList[0:-2]\n+ '..b'       tempQuals = {\n+                        identifier: topFeat.qualifiers[identifier],\n+                        "ID": [tempName],\n+                        "Notes": [\n+                            "mRNA feature automatically generated by Gbk to GFF conversion"\n+                        ],\n+                    }\n+                else:\n+                    tempName = outFeats[-1].ID + ".mRNA"\n+                    tempQuals = {\n+                        identifier: topFeat.qualifiers[identifier],\n+                        "ID": [tempName],\n+                        "Notes": [\n+                            "mRNA feature automatically generated by Gbk to GFF conversion"\n+                        ],\n+                    }\n+                midFeat = gffSeqFeature(\n+                    FeatureLocation(minLoc, maxLoc, topFeat.strand),\n+                    "mRNA",\n+                    "",\n+                    topFeat.strand,\n+                    tempName,\n+                    tempQuals,\n+                    None,\n+                    None,\n+                    None,\n+                    ".",\n+                    0,\n+                    "GbkToGff",\n+                )\n+\n+            if (\n+                midFeat\n+            ):  # Again, need a new if statement if we want to handle multiple mid-tier features\n+                outFeats[-1].sub_features.append(\n+                    makeGffFeat(midFeat, 0, recID, identifier)\n+                )\n+                outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n+                for x in bottomFeats:\n+                    typeDict[x.type] += 1\n+                    outFeats[-1].sub_features[-1].sub_features.append(\n+                        makeGffFeat(x, typeDict[x.type], recID, identifier)\n+                    )\n+                    outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[\n+  
                      "Parent"\n+                    ] = [outFeats[-1].sub_features[-1].id]\n+            else:  # No midFeat, append bottom feats directly to top feats\n+                for x in bottomFeats:\n+                    typeDict[x.type] += 1\n+                    outFeats[-1].sub_features.append(\n+                        makeGffFeat(x, typeDict[x.type], recID, identifier)\n+                    )\n+                    outFeats[-1].sub_features[-1].qualifiers["Parent"] = [\n+                        outFeats[-1].id\n+                    ]\n+\n+        outRec.append(\n+            SeqRecord(\n+                rec.seq,\n+                recID,\n+                rec.name,\n+                rec.description,\n+                rec.dbxrefs,\n+                sorted(outFeats, key=lambda x: x.location.start),\n+                rec.annotations,\n+                rec.letter_annotations,\n+            )\n+        )\n+        SeqIO.write([outRec[-1]], fastaFile, "fasta")\n+    gffWrite(outRec, ofh)\n+    exit(failed)  # 0 if all features handled, 1 if unable to handle some\n+\n+\n+if __name__ == "__main__":\n+    parser = argparse.ArgumentParser(\n+        description="Biopython solution to Gbk to GFF conversion"\n+    )\n+\n+    parser.add_argument(\n+        "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"\n+    )\n+    parser.add_argument(\n+        "--makeMRNA",\n+        action="store_true",\n+        required=False,\n+        help="Automatically create mRNA features",\n+    )\n+    parser.add_argument(\n+        "--makeGene",\n+        action="store_true",\n+        required=False,\n+        help="Automatically create missing Gene features",\n+    )\n+    parser.add_argument(\n+        "--identifier",\n+        type=str,\n+        default="locus_tag",\n+        required=False,\n+        help="Qualifier to derive ID property from",\n+    )\n+    parser.add_argument(\n+        "--fastaFile", type=argparse.FileType("w"), help="Fasta output for 
sequences"\n+    )\n+    parser.add_argument(\n+        "--outFile", type=argparse.FileType("w"), help="GFF feature output"\n+    )\n+    args = parser.parse_args()\n+    main(**vars(args))\n'
b
diff -r a68f32350196 -r bb6332a85aa6 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:43:04 2023 +0000
b
@@ -0,0 +1,74 @@
+<macros>
+    <!-- Macro "requirements": the shared requirements block (progressivemauve and
+         bcbiogff packages). The yield placeholder is where content supplied by
+         the expanding tool is inserted. -->
+    <xml name="requirements">
+        <requirements>
+            <!-- NOTE(review): progressivemauve has no version attribute, so the resolved
+                 package version is not pinned; consider pinning once the intended
+                 version is confirmed. -->
+            <requirement type="package">progressivemauve</requirement>
+            <!--<requirement type="package" version="2.7">python</requirement>-->
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <!-- Token holding a single version string. Presumably substituted into the
+         version attribute of the tool wrappers that import this file; confirm
+         against the tool XML before bumping. -->
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <!-- Macro "citation/progressive_mauve": a single DOI citation element, meant to
+         be expanded inside a tool's citations list. -->
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <!-- Macro "citation/gepard": a single DOI citation element, meant to be
+         expanded inside a tool's citations list. -->
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <!-- Command-line token: expands to the single-quoted $xmfa variable, i.e. the
+         dataset selected through the "xmfa" param declared by the xmfa_input
+         macro. Whitespace inside the token is part of the expanded text. -->
+    <token name="@XMFA_INPUT@">
+ '$xmfa'
+ </token>
+    <!-- Macro "xmfa_input": one data param named "xmfa". The accepted format comes
+         from the macro token FORMATS, which defaults to "xmfa" through the
+         token_formats attribute and can be overridden where the macro is
+         expanded. -->
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param name="xmfa"
+               type="data"
+               format="@FORMATS@"
+               label="XMFA MSA"/>
+    </xml>
+    <!-- Command-line token: expands to the single-quoted $sequences variable, i.e.
+         the dataset selected through the "sequences" param declared by the
+         xmfa_fa_input macro. Whitespace inside the token is part of the expanded
+         text. -->
+    <token name="@XMFA_FA_INPUT@">
+ '$sequences'
+ </token>
+    <!-- Macro "xmfa_fa_input": one FASTA data param named "sequences", holding the
+         sequences that belong to the alignment (see the help text for the
+         ordering requirement). -->
+    <xml name="xmfa_fa_input">
+        <param name="sequences"
+               type="data"
+               format="fasta"
+               label="Sequences in alignment"
+               help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <!-- Macro "genome_selector": conditional "reference_genome" that lets the user
+         take the reference FASTA either from the history (param genome_fasta,
+         the default choice) or from the all_fasta data table (param
+         fasta_indexes). -->
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source"
+                   type="select"
+                   label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <!-- Server-side genome picked from the all_fasta data table. -->
+            <when value="cached">
+                <param name="fasta_indexes"
+                       type="select"
+                       label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <!-- FASTA dataset picked from the user's history. -->
+            <when value="history">
+                <param name="genome_fasta"
+                       type="data"
+                       format="fasta"
+                       label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Macro "gff3_input": one GFF3 data param named "gff3_data". -->
+    <xml name="gff3_input">
+        <param name="gff3_data"
+               type="data"
+               format="gff3"
+               label="GFF3 Annotations"/>
+    </xml>
+    <!-- Macro "input/gff3+fasta": convenience bundle that expands the gff3_input
+         macro followed by the genome_selector macro. -->
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <!-- Command-line token: expands to the single-quoted $gff3_data variable, i.e.
+         the dataset selected through the "gff3_data" param declared by the
+         gff3_input macro. Whitespace inside the token is part of the expanded
+         text. -->
+    <token name="@INPUT_GFF@">
+     '$gff3_data'
+ </token>
+    <!-- Command-line token (Cheetah) giving the reference FASTA path for the
+         genome_selector conditional: the data table "path" field when the source
+         is "cached", or the literal file name genomeref.fa when the source is
+         "history". genomeref.fa is the link name created by the
+         GENOME_SELECTOR_PRE token, so that token has to be emitted earlier in the
+         command. The body is the same as that of the GENOME_SELECTOR token. -->
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+ </token>
+    <!-- Command-line token (Cheetah) to run before the reference genome is used:
+         when the genome_selector source is "history", it symlinks the selected
+         history dataset to genomeref.fa, the file name that the INPUT_FASTA and
+         GENOME_SELECTOR tokens emit for that case. Nothing is emitted for the
+         "cached" source. The select value is wrapped in str() so the comparison
+         is written the same way as in those two tokens. -->
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if str($reference_genome.reference_genome_source) == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+ </token>
+    <!-- Command-line token (Cheetah) giving the reference FASTA path for the
+         genome_selector conditional: the data table "path" field when the source
+         is "cached", or the literal file name genomeref.fa when the source is
+         "history". genomeref.fa is the link name created by the
+         GENOME_SELECTOR_PRE token, so that token has to be emitted earlier in the
+         command. The body is the same as that of the INPUT_FASTA token. -->
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+ </token>
+</macros>