Mercurial > repos > cpt > cpt_gbk_to_5col

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BIO_FIX_TOPO.py	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,84 @@
+import Bio.GenBank
+
+
+def record_end(self, content):
+    """Clean up when we've finished the record."""
+    # from Bio import Alphabet
+    # from Bio.Alphabet import IUPAC
+    from Bio.Seq import Seq, UnknownSeq
+
+    # Try and append the version number to the accession for the full id
+    if not self.data.id:
+        assert "accessions" not in self.data.annotations, self.data.annotations[
+            "accessions"
+        ]
+        self.data.id = self.data.name  # Good fall back?
+    elif self.data.id.count(".") == 0:
+        try:
+            self.data.id += ".%i" % self.data.annotations["sequence_version"]
+        except KeyError:
+            pass
+
+    # add the sequence information
+    # first, determine the alphabet
+    # we default to an generic alphabet if we don't have a
+    # seq type or have strange sequence information.
+
+    # seq_alphabet = Alphabet.generic_alphabet
+
+    # now set the sequence
+    sequence = "".join(self._seq_data)
+
+    if (
+        self._expected_size is not None
+        and len(sequence) != 0
+        and self._expected_size != len(sequence)
+    ):
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Expected sequence length %i, found %i (%s)."
+            % (self._expected_size, len(sequence), self.data.id),
+            BiopythonParserWarning,
+        )
+    """
+    if self._seq_type:
+        # mRNA is really also DNA, since it is actually cDNA
+        if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
+            seq_alphabet = IUPAC.ambiguous_dna
+        # are there ever really RNA sequences in GenBank?
+        elif "RNA" in self._seq_type.upper():
+            # Even for data which was from RNA, the sequence string
+            # is usually given as DNA (T not U).  Bug 2408
+            if "T" in sequence and "U" not in sequence:
+                seq_alphabet = IUPAC.ambiguous_dna
+            else:
+                seq_alphabet = IUPAC.ambiguous_rna
+        elif (
+            "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
+        ):  # PRT is used in EMBL-bank for patents
+            seq_alphabet = IUPAC.protein  # or extended protein?
+        # work around ugly GenBank records which have circular or
+        # linear but no indication of sequence type
+        elif self._seq_type in ["circular", "linear", "unspecified"]:
+            pass
+        # we have a bug if we get here
+        else:
+            raise ValueError(
+                "Could not determine alphabet for seq_type %s" % self._seq_type
+            )
+
+        # Also save the chomosome layout
+        if "circular" in self._seq_type.lower():
+            self.data.annotations["topology"] = "circular"
+        elif "linear" in self._seq_type.lower():
+            self.data.annotations["topology"] = "linear"
+    """
+    if not sequence and self.__expected_size:
+        self.data.seq = UnknownSeq(self._expected_size)  # , seq_alphabet)
+    else:
+        self.data.seq = Seq(sequence)  # , seq_alphabet)
+
+
+# Monkey-patch: install the replacement defined above on Biopython's GenBank
+# feature consumer.  Importing this module is all that is needed to apply it.
+Bio.GenBank._FeatureConsumer.record_end = record_end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,115 @@
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+		<![CDATA[
+			cd '$__tool_directory__' && git rev-parse HEAD
+		]]>
+		</version_command>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+        <yield/>
+    </xml>
+</macros>
--- a/cpt_gbk_to_5col/BIO_FIX_TOPO.py	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-import Bio.GenBank
-
-
-def record_end(self, content):
-    """Clean up when we've finished the record.
-    """
-    #from Bio import Alphabet
-    #from Bio.Alphabet import IUPAC
-    from Bio.Seq import Seq, UnknownSeq
-
-    # Try and append the version number to the accession for the full id
-    if not self.data.id:
-        assert "accessions" not in self.data.annotations, self.data.annotations[
-            "accessions"
-        ]
-        self.data.id = self.data.name  # Good fall back?
-    elif self.data.id.count(".") == 0:
-        try:
-            self.data.id += ".%i" % self.data.annotations["sequence_version"]
-        except KeyError:
-            pass
-
-    # add the sequence information
-    # first, determine the alphabet
-    # we default to an generic alphabet if we don't have a
-    # seq type or have strange sequence information.
-
-    #seq_alphabet = Alphabet.generic_alphabet
-
-    # now set the sequence
-    sequence = "".join(self._seq_data)
-
-    if (
-        self._expected_size is not None
-        and len(sequence) != 0
-        and self._expected_size != len(sequence)
-    ):
-        import warnings
-        from Bio import BiopythonParserWarning
-
-        warnings.warn(
-            "Expected sequence length %i, found %i (%s)."
-            % (self._expected_size, len(sequence), self.data.id),
-            BiopythonParserWarning,
-        )
-    """
-    if self._seq_type:
-        # mRNA is really also DNA, since it is actually cDNA
-        if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
-            seq_alphabet = IUPAC.ambiguous_dna
-        # are there ever really RNA sequences in GenBank?
-        elif "RNA" in self._seq_type.upper():
-            # Even for data which was from RNA, the sequence string
-            # is usually given as DNA (T not U).  Bug 2408
-            if "T" in sequence and "U" not in sequence:
-                seq_alphabet = IUPAC.ambiguous_dna
-            else:
-                seq_alphabet = IUPAC.ambiguous_rna
-        elif (
-            "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
-        ):  # PRT is used in EMBL-bank for patents
-            seq_alphabet = IUPAC.protein  # or extended protein?
-        # work around ugly GenBank records which have circular or
-        # linear but no indication of sequence type
-        elif self._seq_type in ["circular", "linear", "unspecified"]:
-            pass
-        # we have a bug if we get here
-        else:
-            raise ValueError(
-                "Could not determine alphabet for seq_type %s" % self._seq_type
-            )
-
-        # Also save the chomosome layout
-        if "circular" in self._seq_type.lower():
-            self.data.annotations["topology"] = "circular"
-        elif "linear" in self._seq_type.lower():
-            self.data.annotations["topology"] = "linear"
-    """
-    if not sequence and self.__expected_size:
-        self.data.seq = UnknownSeq(self._expected_size)#, seq_alphabet)
-    else:
-        self.data.seq = Seq(sequence)#, seq_alphabet)
-
-
-Bio.GenBank._FeatureConsumer.record_end = record_end
--- a/cpt_gbk_to_5col/cpt-macros.xml	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="gff_requirements">
-		<requirements>
-			<requirement type="package" version="2.7">python</requirement>
-			<requirement type="package" version="1.65">biopython</requirement>
-			<requirement type="package" version="2.12.1">requests</requirement>
-			<yield/>
-		</requirements>
-		<version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-	<xml name="citation/mijalisrasche">
-		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-		<citation type="bibtex">@unpublished{galaxyTools,
-		author = {E. Mijalis, H. Rasche},
-		title = {CPT Galaxy Tools},
-		year = {2013-2017},
-		note = {https://github.com/tamu-cpt/galaxy-tools/}
-		}
-		</citation>
-	</xml>
-	<xml name="citations">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-    	<xml name="citations-crr">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Ross},
-				title = {CPT Galaxy Tools},
-				year = {2020-},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020-AJC-solo">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-clm">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="sl-citations-clm">
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-	</xml>
-</macros>
--- a/cpt_gbk_to_5col/gbk_to_five_col.py	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-import BIO_FIX_TOPO  # NOQA
-import argparse
-import logging
-from Bio import SeqIO
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger()
-
-
-# Read in Genbank file and parse features
-# Output features into Five Column format
-
-"""
->Feature SeqID
-Line 1
-    Column 1: Start location (first nucleotide) of a feature
-    Column 2: Stop location (last nucleotide) of a feature
-    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
-Line2:
-    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
-    Column 5: Qualifier value
-
-Repeat for each feature in a seq
-Repeat Line 2 for each qualifier in a feature
-"""
-
-
-def gbk_to_5col(genbank):
-    """Converts genbank to BankIt five column format"""
-    for record in SeqIO.parse(genbank, "genbank"):
-        print(">Feature %s" % record.id)
-        for feature in record.features:
-            if feature.type == "source":
-                continue
-            else:
-                for index, part in enumerate(feature.location.parts):
-                    if part.strand > 0:
-                        start = int(part.start) + 1
-                        end = int(part.end)
-                    else:
-                        start = int(part.end)
-                        end = int(part.start) + 1
-                    if index == 0:
-                        name = feature.type
-                        print("%d\t%d\t%s" % (start, end, name))
-                    else:
-                        print("%d\t%d" % (start, end))
-                for (qualifier, values) in feature.qualifiers.items():
-                    for value in values:
-                        print("\t\t\t%s\t%s" % (qualifier, value))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Convert a Genbank file into five column format"
-    )
-    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")
-
-    args = vars(parser.parse_args())
-    gbk_to_5col(**args)
--- a/cpt_gbk_to_5col/gbk_to_five_col.xml	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-<?xml version="1.0"?>
-<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0">
-    <description></description>
-  <macros>
-    <import>macros.xml</import>
-		<import>cpt-macros.xml</import>
-  </macros>
-  <expand macro="requirements"/>
-  <command detect_errors="aggressive"><![CDATA[
-python $__tool_directory__/gbk_to_five_col.py
-  "$file"
-
-> "$output"
-
-]]></command>
-  <inputs>
-    <param label="GenBank file" name="file" type="data" format="genbank" />
-  </inputs>
-  <outputs>
-    <data format="tabular" name="output">
-    </data>
-  </outputs>
-  <tests>
-      <test>
-          <param name="file" value="complex_feature_locs.gbk" />
-          <output name="output" value="gbkto5col.tsv" />
-      </test>
-  </tests>
-  <help>
-Genbank Format to Five Column Format
-====================================
-
-Output format is:
-
->Feature ID
-Line 1
-- Column 1: Start location (first nucleotide) of a feature
-- Column 2: Stop location (last nucleotide) of a feature
-- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
-
-Line2:
-- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
-- Column 5: Qualifier value
-
-Example Output::
-
-    >Feature contig00077
-    0	22956	source
-    			mol_type	genomic DNA
-    			organism	AU1189
-    11652	11326	CDS
-    11327	11158
-    			note	tapemeasure frameshift chaperone
-    			product	P2 E' tapemeasure frameshift chaperone
-    			gene	gp14
-    			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
-    11900	11599	CDS
-    11600	11408
-    11910	11904	RBS
-
-</help>
-		<expand macro="citations" />
-</tool>
--- a/cpt_gbk_to_5col/macros.xml	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-  <xml name="requirements">
-    <requirements>
-	<requirement type="package" version="3.8.13">python</requirement>
-	<requirement type="package" version="1.79">biopython</requirement>
-	<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
-	<yield/>
-    </requirements>
-  </xml>
-  <xml name="ldap_ref"
-    token_name="dn_ref"
-    token_label="Pick a DN"
-    token_fromfile="ldap_people.loc">
-        <repeat name="repeat_@NAME@" title="@LABEL@">
-          <param name="@NAME@" label="Select a @LABEL@" type="select">
-            <options from_file="@FROMFILE@">
-                <column name="name" index="0"/>
-                <column name="value" index="1"/>
-            </options>
-          </param>
-        </repeat>
-    </xml>
-  <xml name="ldap_ref_single"
-    token_name="dn_ref"
-    token_label="Pick a DN"
-    token_fromfile="ldap_people.loc">
-          <param name="@NAME@" label="Select a @LABEL@" type="select">
-            <options from_file="@FROMFILE@">
-                <column name="name" index="0"/>
-                <column name="value" index="1"/>
-            </options>
-          </param>
-    </xml>
-	<xml name="gbk_feature_type"
-		token_label="Feature type to remove"
-		token_multiple="True"
-		token_optional="False"
-		token_name="positional_2">
-    <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select">
-      <option value="-10_signal">-10_signal</option>
-      <option value="-35_signal">-35_signal</option>
-      <option value="3'UTR">3'UTR</option>
-      <option value="5'UTR">5'UTR</option>
-      <option value="CAAT_signal">CAAT_signal</option>
-      <option selected="true" value="CDS">CDS</option>
-      <option value="C_region">C_region</option>
-      <option value="D-loop">D-loop</option>
-      <option value="D_segment">D_segment</option>
-      <option value="GC_signal">GC_signal</option>
-      <option value="J_segment">J_segment</option>
-      <option value="LTR">LTR</option>
-      <option value="N_region">N_region</option>
-      <option value="RBS">RBS</option>
-      <option value="STS">STS</option>
-      <option value="S_region">S_region</option>
-      <option value="TATA_signal">TATA_signal</option>
-      <option value="V_region">V_region</option>
-      <option value="V_segment">V_segment</option>
-      <option value="all">all</option>
-      <option value="assembly_gap">assembly_gap</option>
-      <option value="attenuator">attenuator</option>
-      <option value="enhancer">enhancer</option>
-      <option value="exon">exon</option>
-      <option value="gap">gap</option>
-      <option value="gene">gene</option>
-      <option value="iDNA">iDNA</option>
-      <option value="intron">intron</option>
-      <option value="mRNA">mRNA</option>
-      <option value="mat_peptide">mat_peptide</option>
-      <option value="misc_RNA">misc_RNA</option>
-      <option value="misc_binding">misc_binding</option>
-      <option value="misc_difference">misc_difference</option>
-      <option value="misc_feature">misc_feature</option>
-      <option value="misc_recomb">misc_recomb</option>
-      <option value="misc_signal">misc_signal</option>
-      <option value="misc_structure">misc_structure</option>
-      <option value="mobile_element">mobile_element</option>
-      <option value="modified_base">modified_base</option>
-      <option value="ncRNA">ncRNA</option>
-      <option value="old_sequence">old_sequence</option>
-      <option value="operon">operon</option>
-      <option value="oriT">oriT</option>
-      <option value="polyA_signal">polyA_signal</option>
-      <option value="polyA_site">polyA_site</option>
-      <option value="precursor_RNA">precursor_RNA</option>
-      <option value="prim_transcript">prim_transcript</option>
-      <option value="primer_bind">primer_bind</option>
-      <option value="promoter">promoter</option>
-      <option value="protein_bind">protein_bind</option>
-      <option value="rRNA">rRNA</option>
-      <option value="rep_origin">rep_origin</option>
-      <option value="repeat_region">repeat_region</option>
-      <option value="sig_peptide">sig_peptide</option>
-      <option value="source">source</option>
-      <option value="stem_loop">stem_loop</option>
-      <option value="tRNA">tRNA</option>
-      <option value="terminator">terminator</option>
-      <option value="tmRNA">tmRNA</option>
-      <option value="transit_peptide">transit_peptide</option>
-      <option value="unsure">unsure</option>
-      <option value="variation">variation</option>
-    </param>
-	</xml>
-</macros>
--- a/cpt_gbk_to_5col/test-data/complex_feature_locs.gbk	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-LOCUS       contig00077              300 bp    DNA     linear       15-MAR-2010
-DEFINITION  '[length=22956]'   '[numreads=4517 from AU1189;454 Data]'.
-ACCESSION
-VERSION
-KEYWORDS    .
-SOURCE      AU1189
-  ORGANISM  AU1189
-            Unclassified.
-REFERENCE   1  (bases 1 to 22956)
-  AUTHORS   Duarte,I.
-  TITLE     contig77
-  JOURNAL   Unpublished
-REFERENCE   2  (bases 1 to 22956)
-  AUTHORS   Duarte,I.
-  TITLE     Direct Submission
-  JOURNAL   Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU,
-            College Station, TX 77840, USA
-FEATURES             Location/Qualifiers
-     source          1..22956
-                     /organism="AU1189"
-                     /mol_type="genomic DNA"
-     CDS             complement(join(11159..11327,11327..11652))
-                     /note="tapemeasure frameshift chaperone"
-                     /product="P2 E' tapemeasure frameshift chaperone"
-                     /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV
-                     SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL
-                     PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ"
-                     /gene="gp14"
-     CDS             complement(join(11409..11600,11600..11900))
-     RBS             complement(11905..11910)
-BASE COUNT     3240 a   7606 c   8254 g   3856 t
-ORIGIN
-        1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac
-       61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat
-      121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag
-      181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag
-      241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc
-//
--- a/cpt_gbk_to_5col/test-data/gbkto5col.tsv	Fri Jun 17 12:45:08 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
->Feature contig00077
-11652	11327	CDS
-11327	11159
-			note	tapemeasure frameshift chaperone
-			product	P2 E' tapemeasure frameshift chaperone
-			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
-			gene	gp14
-11900	11600	CDS
-11600	11409
-11910	11905	RBS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_five_col.py	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import BIO_FIX_TOPO  # NOQA
+import argparse
+import logging
+from Bio import SeqIO
+
+# Configure the root logger at INFO level.  ``log`` is the root logger; it is
+# not referenced again in the visible part of this script.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
+
+# Read in Genbank file and parse features
+# Output features into Five Column format
+
+"""
+>Feature SeqID
+Line 1
+    Column 1: Start location (first nucleotide) of a feature
+    Column 2: Stop location (last nucleotide) of a feature
+    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+Line2:
+    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+    Column 5: Qualifier value
+
+Repeat for each feature in a seq
+Repeat Line 2 for each qualifier in a feature
+"""
+
+
+def gbk_to_5col(genbank):
+    """Converts genbank to BankIt five column format"""
+    for record in SeqIO.parse(genbank, "genbank"):
+        print(">Feature %s" % record.id)
+        for feature in record.features:
+            if feature.type == "source":
+                continue
+            else:
+                for index, part in enumerate(feature.location.parts):
+                    if part.strand > 0:
+                        start = int(part.start) + 1
+                        end = int(part.end)
+                    else:
+                        start = int(part.end)
+                        end = int(part.start) + 1
+                    if index == 0:
+                        name = feature.type
+                        print("%d\t%d\t%s" % (start, end, name))
+                    else:
+                        print("%d\t%d" % (start, end))
+                for (qualifier, values) in feature.qualifiers.items():
+                    for value in values:
+                        print("\t\t\t%s\t%s" % (qualifier, value))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert a Genbank file into five column format"
+    )
+    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")
+
+    args = vars(parser.parse_args())
+    gbk_to_5col(**args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_five_col.xml	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,62 @@
+<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0">
+  <description/>
+  <macros>
+    <import>macros.xml</import>
+    <import>cpt-macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command detect_errors="aggressive"><![CDATA[
+## Quote only the script path: quoting "python <script>" as a single word makes the shell look for a program with that whole name.
+python '$__tool_directory__/gbk_to_five_col.py'
+  "$file"
+> "$output"
+
+]]></command>
+  <inputs>
+    <param label="GenBank file" name="file" type="data" format="genbank"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output">
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="file" value="complex_feature_locs.gbk"/>
+      <output name="output" value="gbkto5col.tsv"/>
+    </test>
+  </tests>
+  <help>
+Genbank Format to Five Column Format
+====================================
+
+Output format is:
+
+&gt;Feature ID
+Line 1
+- Column 1: Start location (first nucleotide) of a feature
+- Column 2: Stop location (last nucleotide) of a feature
+- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+
+Line2:
+- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+- Column 5: Qualifier value
+
+Example Output::
+
+    &gt;Feature contig00077
+    11652	11327	CDS
+    11327	11159
+    			note	tapemeasure frameshift chaperone
+    			product	P2 E' tapemeasure frameshift chaperone
+    			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+    			gene	gp14
+    11900	11600	CDS
+    11600	11409
+    11910	11905	RBS
+
+</help>
+  <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,74 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <!-- gbk_to_five_col.py and BIO_FIX_TOPO.py import Biopython (Bio.SeqIO, Bio.GenBank, Bio.Seq.UnknownSeq), so it has to be declared for the tool that expands this macro. -->
+            <requirement type="package" version="3.8.13">python</requirement>
+            <requirement type="package" version="1.79">biopython</requirement>
+            <requirement type="package">progressivemauve</requirement>
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <token name="@XMFA_INPUT@">
+		'$xmfa'
+	</token>
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
+    </xml>
+    <token name="@XMFA_FA_INPUT@">
+		'$sequences'
+	</token>
+    <xml name="xmfa_fa_input">
+        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source" type="select" label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <when value="cached">
+                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <token name="@INPUT_GFF@">
+	    '$gff3_data'
+	</token>
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if $reference_genome.reference_genome_source == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/complex_feature_locs.gbk	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,38 @@
+LOCUS       contig00077              300 bp    DNA     linear       15-MAR-2010
+DEFINITION  '[length=22956]'   '[numreads=4517 from AU1189;454 Data]'.
+ACCESSION
+VERSION
+KEYWORDS    .
+SOURCE      AU1189
+  ORGANISM  AU1189
+            Unclassified.
+REFERENCE   1  (bases 1 to 22956)
+  AUTHORS   Duarte,I.
+  TITLE     contig77
+  JOURNAL   Unpublished
+REFERENCE   2  (bases 1 to 22956)
+  AUTHORS   Duarte,I.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU,
+            College Station, TX 77840, USA
+FEATURES             Location/Qualifiers
+     source          1..22956
+                     /organism="AU1189"
+                     /mol_type="genomic DNA"
+     CDS             complement(join(11159..11327,11327..11652))
+                     /note="tapemeasure frameshift chaperone"
+                     /product="P2 E' tapemeasure frameshift chaperone"
+                     /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV
+                     SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL
+                     PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ"
+                     /gene="gp14"
+     CDS             complement(join(11409..11600,11600..11900))
+     RBS             complement(11905..11910)
+BASE COUNT     3240 a   7606 c   8254 g   3856 t
+ORIGIN
+        1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac
+       61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat
+      121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag
+      181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag
+      241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc
+//
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gbkto5col.tsv	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,10 @@
+>Feature contig00077
+11652	11327	CDS
+11327	11159
+			note	tapemeasure frameshift chaperone
+			product	P2 E' tapemeasure frameshift chaperone
+			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+			gene	gp14
+11900	11600	CDS
+11600	11409
+11910	11905	RBS