Mercurial > repos > cpt > cpt_gbk_to_5col
changeset 1:1bdd481d5c25 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:42:57 +0000 |
parents | 66143811fe8a |
children | 8f5f2142fc2b |
files | BIO_FIX_TOPO.py cpt-macros.xml cpt_gbk_to_5col/BIO_FIX_TOPO.py cpt_gbk_to_5col/cpt-macros.xml cpt_gbk_to_5col/gbk_to_five_col.py cpt_gbk_to_5col/gbk_to_five_col.xml cpt_gbk_to_5col/macros.xml cpt_gbk_to_5col/test-data/complex_feature_locs.gbk cpt_gbk_to_5col/test-data/gbkto5col.tsv gbk_to_five_col.py gbk_to_five_col.xml macros.xml test-data/complex_feature_locs.gbk test-data/gbkto5col.tsv |
diffstat | 14 files changed, 444 insertions(+), 477 deletions(-) [+] |
line wrap: on
line diff
# BIO_FIX_TOPO.py
import Bio.GenBank


def record_end(self, content):
    """Finish the current GenBank record: settle its id and attach its sequence.

    This function is installed as ``Bio.GenBank._FeatureConsumer.record_end``
    by the assignment at the bottom of this module, so importing the module is
    enough to activate it. Unlike a full alphabet-aware implementation, it does
    not inspect ``self._seq_type`` at all: no alphabet is chosen and no
    ``topology`` annotation is written here.

    Args:
        self: The consumer instance. Reads ``self.data`` (the record being
            built), ``self._seq_data`` (sequence fragments) and
            ``self._expected_size`` (declared length, or ``None``).
        content: Unused; kept so the signature matches the method it replaces.

    Side effects:
        Sets ``self.data.id`` (when empty or lacking a version suffix) and
        ``self.data.seq``. Emits ``BiopythonParserWarning`` when the declared
        length disagrees with the parsed sequence length.
    """
    from Bio.Seq import Seq

    # Try and append the version number to the accession for the full id.
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Fall back to the record name.
    elif "." not in self.data.id:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version recorded; leave the bare accession as the id.
            pass

    # NOTE: alphabet and topology detection based on self._seq_type is
    # intentionally not performed here (it previously existed only as an
    # inert string literal and never executed).

    sequence = "".join(self._seq_data)
    expected_size = self._expected_size

    if expected_size is not None and sequence and expected_size != len(sequence):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    if not sequence and expected_size:
        # The record declares a length but carries no residues. The attribute
        # is ``_expected_size``; the previous ``self.__expected_size`` is not
        # name-mangled inside a module-level function and raised
        # AttributeError on exactly this path.
        try:
            # Only needed on this branch; importing it lazily keeps records
            # that do have sequence data working on Biopython releases that
            # no longer ship UnknownSeq.
            from Bio.Seq import UnknownSeq
        except ImportError:
            self.data.seq = Seq(None, length=expected_size)
        else:
            self.data.seq = UnknownSeq(expected_size)
    else:
        self.data.seq = Seq(sequence)


# Monkey-patch Biopython's GenBank feature consumer with the version above.
Bio.GenBank._FeatureConsumer.record_end = record_end
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:42:57 2023 +0000 @@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. 
Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- a/cpt_gbk_to_5col/BIO_FIX_TOPO.py Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ -import Bio.GenBank - - -def record_end(self, content): - """Clean up when we've finished the record. - """ - #from Bio import Alphabet - #from Bio.Alphabet import IUPAC - from Bio.Seq import Seq, UnknownSeq - - # Try and append the version number to the accession for the full id - if not self.data.id: - assert "accessions" not in self.data.annotations, self.data.annotations[ - "accessions" - ] - self.data.id = self.data.name # Good fall back? - elif self.data.id.count(".") == 0: - try: - self.data.id += ".%i" % self.data.annotations["sequence_version"] - except KeyError: - pass - - # add the sequence information - # first, determine the alphabet - # we default to an generic alphabet if we don't have a - # seq type or have strange sequence information. - - #seq_alphabet = Alphabet.generic_alphabet - - # now set the sequence - sequence = "".join(self._seq_data) - - if ( - self._expected_size is not None - and len(sequence) != 0 - and self._expected_size != len(sequence) - ): - import warnings - from Bio import BiopythonParserWarning - - warnings.warn( - "Expected sequence length %i, found %i (%s)." - % (self._expected_size, len(sequence), self.data.id), - BiopythonParserWarning, - ) - """ - if self._seq_type: - # mRNA is really also DNA, since it is actually cDNA - if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper(): - seq_alphabet = IUPAC.ambiguous_dna - # are there ever really RNA sequences in GenBank? - elif "RNA" in self._seq_type.upper(): - # Even for data which was from RNA, the sequence string - # is usually given as DNA (T not U). 
Bug 2408 - if "T" in sequence and "U" not in sequence: - seq_alphabet = IUPAC.ambiguous_dna - else: - seq_alphabet = IUPAC.ambiguous_rna - elif ( - "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT" - ): # PRT is used in EMBL-bank for patents - seq_alphabet = IUPAC.protein # or extended protein? - # work around ugly GenBank records which have circular or - # linear but no indication of sequence type - elif self._seq_type in ["circular", "linear", "unspecified"]: - pass - # we have a bug if we get here - else: - raise ValueError( - "Could not determine alphabet for seq_type %s" % self._seq_type - ) - - # Also save the chomosome layout - if "circular" in self._seq_type.lower(): - self.data.annotations["topology"] = "circular" - elif "linear" in self._seq_type.lower(): - self.data.annotations["topology"] = "linear" - """ - if not sequence and self.__expected_size: - self.data.seq = UnknownSeq(self._expected_size)#, seq_alphabet) - else: - self.data.seq = Seq(sequence)#, seq_alphabet) - - -Bio.GenBank._FeatureConsumer.record_end = record_end
--- a/cpt_gbk_to_5col/cpt-macros.xml Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. 
Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_gbk_to_5col/gbk_to_five_col.py Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -#!/usr/bin/env python -import BIO_FIX_TOPO # NOQA -import argparse -import logging -from Bio import SeqIO - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger() - - -# Read in Genbank file and parse features -# Output features into Five Column format - -""" ->Feature SeqID -Line 1 - Column 1: Start location (first nucleotide) of a feature - Column 2: Stop location (last nucleotide) of a feature - Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon') -Line2: - Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note') - Column 5: Qualifier value - -Repeat for each feature in a seq -Repeat Line 2 for each qualifier in a feature -""" - - -def gbk_to_5col(genbank): - """Converts genbank to BankIt five column format""" - for record in SeqIO.parse(genbank, "genbank"): - print(">Feature %s" % record.id) - for feature in record.features: - if feature.type == "source": - continue - else: - for index, part in enumerate(feature.location.parts): - if part.strand > 0: - start = int(part.start) + 1 - end = int(part.end) - else: - start = int(part.end) - end = int(part.start) + 1 - if index == 0: - name = feature.type - print("%d\t%d\t%s" % (start, end, name)) - else: - print("%d\t%d" % (start, end)) - for (qualifier, values) in feature.qualifiers.items(): - for value in values: - print("\t\t\t%s\t%s" % (qualifier, value)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert a Genbank file into five column format" - ) - parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file") - - args = vars(parser.parse_args()) - gbk_to_5col(**args)
--- a/cpt_gbk_to_5col/gbk_to_five_col.xml Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0"> - <description></description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/gbk_to_five_col.py - "$file" - -> "$output" - -]]></command> - <inputs> - <param label="GenBank file" name="file" type="data" format="genbank" /> - </inputs> - <outputs> - <data format="tabular" name="output"> - </data> - </outputs> - <tests> - <test> - <param name="file" value="complex_feature_locs.gbk" /> - <output name="output" value="gbkto5col.tsv" /> - </test> - </tests> - <help> -Genbank Format to Five Column Format -==================================== - -Output format is: - ->Feature ID -Line 1 -- Column 1: Start location (first nucleotide) of a feature -- Column 2: Stop location (last nucleotide) of a feature -- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon') - -Line2: -- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note') -- Column 5: Qualifier value - -Example Output:: - - >Feature contig00077 - 0 22956 source - mol_type genomic DNA - organism AU1189 - 11652 11326 CDS - 11327 11158 - note tapemeasure frameshift chaperone - product P2 E' tapemeasure frameshift chaperone - gene gp14 - translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ - 11900 11599 CDS - 11600 11408 - 11910 11904 RBS - -</help> - <expand macro="citations" /> -</tool>
--- a/cpt_gbk_to_5col/macros.xml Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="ldap_ref" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <repeat name="repeat_@NAME@" title="@LABEL@"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </repeat> - </xml> - <xml name="ldap_ref_single" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </xml> - <xml name="gbk_feature_type" - token_label="Feature type to remove" - token_multiple="True" - token_optional="False" - token_name="positional_2"> - <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select"> - <option value="-10_signal">-10_signal</option> - <option value="-35_signal">-35_signal</option> - <option value="3'UTR">3'UTR</option> - <option value="5'UTR">5'UTR</option> - <option value="CAAT_signal">CAAT_signal</option> - <option selected="true" value="CDS">CDS</option> - <option value="C_region">C_region</option> - <option value="D-loop">D-loop</option> - <option value="D_segment">D_segment</option> - <option value="GC_signal">GC_signal</option> - <option value="J_segment">J_segment</option> - <option value="LTR">LTR</option> - <option value="N_region">N_region</option> - <option 
value="RBS">RBS</option> - <option value="STS">STS</option> - <option value="S_region">S_region</option> - <option value="TATA_signal">TATA_signal</option> - <option value="V_region">V_region</option> - <option value="V_segment">V_segment</option> - <option value="all">all</option> - <option value="assembly_gap">assembly_gap</option> - <option value="attenuator">attenuator</option> - <option value="enhancer">enhancer</option> - <option value="exon">exon</option> - <option value="gap">gap</option> - <option value="gene">gene</option> - <option value="iDNA">iDNA</option> - <option value="intron">intron</option> - <option value="mRNA">mRNA</option> - <option value="mat_peptide">mat_peptide</option> - <option value="misc_RNA">misc_RNA</option> - <option value="misc_binding">misc_binding</option> - <option value="misc_difference">misc_difference</option> - <option value="misc_feature">misc_feature</option> - <option value="misc_recomb">misc_recomb</option> - <option value="misc_signal">misc_signal</option> - <option value="misc_structure">misc_structure</option> - <option value="mobile_element">mobile_element</option> - <option value="modified_base">modified_base</option> - <option value="ncRNA">ncRNA</option> - <option value="old_sequence">old_sequence</option> - <option value="operon">operon</option> - <option value="oriT">oriT</option> - <option value="polyA_signal">polyA_signal</option> - <option value="polyA_site">polyA_site</option> - <option value="precursor_RNA">precursor_RNA</option> - <option value="prim_transcript">prim_transcript</option> - <option value="primer_bind">primer_bind</option> - <option value="promoter">promoter</option> - <option value="protein_bind">protein_bind</option> - <option value="rRNA">rRNA</option> - <option value="rep_origin">rep_origin</option> - <option value="repeat_region">repeat_region</option> - <option value="sig_peptide">sig_peptide</option> - <option value="source">source</option> - <option 
value="stem_loop">stem_loop</option> - <option value="tRNA">tRNA</option> - <option value="terminator">terminator</option> - <option value="tmRNA">tmRNA</option> - <option value="transit_peptide">transit_peptide</option> - <option value="unsure">unsure</option> - <option value="variation">variation</option> - </param> - </xml> -</macros>
--- a/cpt_gbk_to_5col/test-data/complex_feature_locs.gbk Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -LOCUS contig00077 300 bp DNA linear 15-MAR-2010 -DEFINITION '[length=22956]' '[numreads=4517 from AU1189;454 Data]'. -ACCESSION -VERSION -KEYWORDS . -SOURCE AU1189 - ORGANISM AU1189 - Unclassified. -REFERENCE 1 (bases 1 to 22956) - AUTHORS Duarte,I. - TITLE contig77 - JOURNAL Unpublished -REFERENCE 2 (bases 1 to 22956) - AUTHORS Duarte,I. - TITLE Direct Submission - JOURNAL Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU, - College Station, TX 77840, USA -FEATURES Location/Qualifiers - source 1..22956 - /organism="AU1189" - /mol_type="genomic DNA" - CDS complement(join(11159..11327,11327..11652)) - /note="tapemeasure frameshift chaperone" - /product="P2 E' tapemeasure frameshift chaperone" - /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV - SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL - PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ" - /gene="gp14" - CDS complement(join(11409..11600,11600..11900)) - RBS complement(11905..11910) -BASE COUNT 3240 a 7606 c 8254 g 3856 t -ORIGIN - 1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac - 61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat - 121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag - 181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag - 241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc -//
--- a/cpt_gbk_to_5col/test-data/gbkto5col.tsv Fri Jun 17 12:45:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->Feature contig00077 -11652 11327 CDS -11327 11159 - note tapemeasure frameshift chaperone - product P2 E' tapemeasure frameshift chaperone - translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ - gene gp14 -11900 11600 CDS -11600 11409 -11910 11905 RBS
#!/usr/bin/env python
import BIO_FIX_TOPO  # NOQA
import argparse
import logging
from Bio import SeqIO

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()


# Read in Genbank file and parse features
# Output features into Five Column format

"""
>Feature SeqID
Line 1
    Column 1: Start location (first nucleotide) of a feature
    Column 2: Stop location (last nucleotide) of a feature
    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
Line2:
    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
    Column 5: Qualifier value

Repeat for each feature in a seq
Repeat Line 2 for each qualifier in a feature
"""


def gbk_to_5col(genbank):
    """Print every record of a GenBank file as a five column feature table.

    For each record a ``>Feature <id>`` header is printed, followed by one
    coordinate line per location part of every feature and one line per
    qualifier value. Features of type ``source`` are skipped.

    Args:
        genbank: File handle (or path) accepted by ``SeqIO.parse`` for the
            "genbank" format.
    """
    for record in SeqIO.parse(genbank, "genbank"):
        print(">Feature %s" % record.id)
        for feature in record.features:
            # The source feature is never reported.
            if feature.type == "source":
                continue

            for position, segment in enumerate(feature.location.parts):
                on_forward_strand = segment.strand > 0
                first_base = int(segment.start) + 1  # report 1-based start
                last_base = int(segment.end)
                # Parts not on the forward strand are written high-to-low.
                if on_forward_strand:
                    span = (first_base, last_base)
                else:
                    span = (last_base, first_base)

                # Only the first part of a feature carries the feature name.
                if position:
                    print("%d\t%d" % span)
                else:
                    print("%d\t%d\t%s" % (span + (feature.type,)))

            for key, entries in feature.qualifiers.items():
                for entry in entries:
                    print("\t\t\t%s\t%s" % (key, entry))


if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Convert a Genbank file into five column format"
    )
    cli.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")

    options = vars(cli.parse_args())
    gbk_to_5col(**options)
<!-- gbk_to_five_col.xml: Galaxy wrapper for gbk_to_five_col.py -->
<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0">
  <description/>
  <macros>
    <import>macros.xml</import>
    <import>cpt-macros.xml</import>
  </macros>
  <expand macro="requirements"/>
  <!-- Only the script path is quoted. Quoting the interpreter together with
       the path turns both into a single shell word, which the shell then
       tries to execute as one (non-existent) program name. -->
  <command detect_errors="aggressive"><![CDATA[
python '$__tool_directory__/gbk_to_five_col.py'
    "$file"

> "$output"

]]></command>
  <inputs>
    <param label="GenBank file" name="file" type="data" format="genbank"/>
  </inputs>
  <outputs>
    <data format="tabular" name="output">
    </data>
  </outputs>
  <tests>
    <test>
      <param name="file" value="complex_feature_locs.gbk"/>
      <output name="output" value="gbkto5col.tsv"/>
    </test>
  </tests>
  <help>
Genbank Format to Five Column Format
====================================

Output format is:

>Feature ID
Line 1
- Column 1: Start location (first nucleotide) of a feature
- Column 2: Stop location (last nucleotide) of a feature
- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')

Line2:
- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
- Column 5: Qualifier value

Coordinates are 1-based. Features on the complement strand are written with
the larger coordinate first. ``source`` features are not included in the output.

Example Output::

    >Feature contig00077
    11652	11327	CDS
    11327	11159
    			note	tapemeasure frameshift chaperone
    			product	P2 E' tapemeasure frameshift chaperone
    			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
    			gene	gp14
    11900	11600	CDS
    11600	11409
    11910	11905	RBS

</help>
  <expand macro="citations"/>
</tool>
<!-- macros.xml: shared macros and tokens -->
<macros>
  <xml name="requirements">
    <requirements>
      <requirement type="package">progressivemauve</requirement>
      <!--<requirement type="package" version="2.7">python</requirement>-->
      <requirement type="package" version="0.6.4">bcbiogff</requirement>
      <!-- gbk_to_five_col.py imports Bio (SeqIO / GenBank), so the interpreter
           and Biopython must be declared explicitly rather than arriving as an
           unpinned transitive dependency. -->
      <requirement type="package" version="3.8.13">python</requirement>
      <requirement type="package" version="1.79">biopython</requirement>
      <yield/>
    </requirements>
  </xml>
  <token name="@WRAPPER_VERSION@">2.4.0</token>
  <xml name="citation/progressive_mauve">
    <citation type="doi">10.1371/journal.pone.0011147</citation>
  </xml>
  <xml name="citation/gepard">
    <citation type="doi">10.1093/bioinformatics/btm039</citation>
  </xml>
  <token name="@XMFA_INPUT@">
    '$xmfa'
  </token>
  <xml name="xmfa_input" token_formats="xmfa">
    <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
  </xml>
  <token name="@XMFA_FA_INPUT@">
    '$sequences'
  </token>
  <xml name="xmfa_fa_input">
    <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
  </xml>
  <xml name="genome_selector">
    <conditional name="reference_genome">
      <param name="reference_genome_source" type="select" label="Reference Genome">
        <option value="history" selected="True">From History</option>
        <option value="cached">Locally Cached</option>
      </param>
      <when value="cached">
        <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
          <options from_data_table="all_fasta"/>
        </param>
      </when>
      <when value="history">
        <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
      </when>
    </conditional>
  </xml>
  <xml name="gff3_input">
    <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
  </xml>
  <xml name="input/gff3+fasta">
    <expand macro="gff3_input"/>
    <expand macro="genome_selector"/>
  </xml>
  <token name="@INPUT_GFF@">
    '$gff3_data'
  </token>
  <token name="@INPUT_FASTA@">
    #if str($reference_genome.reference_genome_source) == 'cached':
      '${reference_genome.fasta_indexes.fields.path}'
    #else if str($reference_genome.reference_genome_source) == 'history':
      genomeref.fa
    #end if
  </token>
  <token name="@GENOME_SELECTOR_PRE@">
    #if $reference_genome.reference_genome_source == 'history':
      ln -s '$reference_genome.genome_fasta' genomeref.fa;
    #end if
  </token>
  <token name="@GENOME_SELECTOR@">
    #if str($reference_genome.reference_genome_source) == 'cached':
      '${reference_genome.fasta_indexes.fields.path}'
    #else if str($reference_genome.reference_genome_source) == 'history':
      genomeref.fa
    #end if
  </token>
</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/complex_feature_locs.gbk Mon Jun 05 02:42:57 2023 +0000 @@ -0,0 +1,38 @@ +LOCUS contig00077 300 bp DNA linear 15-MAR-2010 +DEFINITION '[length=22956]' '[numreads=4517 from AU1189;454 Data]'. +ACCESSION +VERSION +KEYWORDS . +SOURCE AU1189 + ORGANISM AU1189 + Unclassified. +REFERENCE 1 (bases 1 to 22956) + AUTHORS Duarte,I. + TITLE contig77 + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 22956) + AUTHORS Duarte,I. + TITLE Direct Submission + JOURNAL Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU, + College Station, TX 77840, USA +FEATURES Location/Qualifiers + source 1..22956 + /organism="AU1189" + /mol_type="genomic DNA" + CDS complement(join(11159..11327,11327..11652)) + /note="tapemeasure frameshift chaperone" + /product="P2 E' tapemeasure frameshift chaperone" + /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV + SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL + PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ" + /gene="gp14" + CDS complement(join(11409..11600,11600..11900)) + RBS complement(11905..11910) +BASE COUNT 3240 a 7606 c 8254 g 3856 t +ORIGIN + 1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac + 61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat + 121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag + 181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag + 241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gbkto5col.tsv Mon Jun 05 02:42:57 2023 +0000 @@ -0,0 +1,10 @@ +>Feature contig00077 +11652 11327 CDS +11327 11159 + note tapemeasure frameshift chaperone + product P2 E' tapemeasure frameshift chaperone + translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ + gene gp14 +11900 11600 CDS +11600 11409 +11910 11905 RBS