Repository 'sapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjkoehorst/sapp

Changeset 21:bde695b3f97d (2015-02-21)
Previous changeset 20:4e05b3bf3e3e (2015-02-21) Next changeset 26:6a858e304888 (2015-02-25)
Commit message:
Deleted selected files
removed:
fasta2rdf/fastatordf.py
fasta2rdf/fastatordf.xml
fasta2rdf/test-data/NC_017117.fna
gbk2rdf/gbktordf.py
gbk2rdf/gbktordf.xml
gbk2rdf/test-data/CP009049.embl
gbk2rdf/test-data/NC_010067.gbk
protein2rdf/protein_to_ttl.py
protein2rdf/protein_to_ttl.xml
protein2rdf/test-data/NC_017117.faa
b
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/fastatordf.py
--- a/fasta2rdf/fastatordf.py Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3.4
-# Author: Jasper Jan Koehorst
-# Date created: Jan 22 2015
-# Function: generation of a RDF file from a genome fasta file
-
-
-# from io import StringIO
-from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
-# import rdflib
-from rdflib.store import Store
-import sys
-
-store = plugin.get('IOMemory', Store)()
-
-global URI
-URI = "http://csb.wur.nl/genome/"
-global seeAlso
-seeAlso = "rdfs:seeAlso"
-global coreURI
-coreURI = Namespace(URI)
-global genomeGraph
-store = plugin.get('IOMemory', Store)()
-genomeGraph = Graph(store,URIRef(URI))
-genomeGraph.bind("ssb",coreURI)
-
-def delete_galaxy():
- for index, path in enumerate(sys.path):
- if "galaxy-dist/" in path:
- sys.path[index] = ''
-
-def createClass(uri):
- genomeGraph.add((uri,RDF.type,OWL.Class))
- genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
- return uri
-
-def fasta_parser(input_file):
- createClass(coreURI["Genome"])            #Genome class
- createClass(coreURI["Type"])                #Type class (Chr,Pls,Scaffold)
-
- genomeDict = {}
-
- sequence = ""
- genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
- if genomeID == 'None':
- genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
-
- genomeURI = coreURI[genomeID]
- for index, element in enumerate(sys.argv):
- if '-organism' == element:
- genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
- if '-ncbi_taxid' == element:
- genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
- if '-idtag' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
- if '-ids' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
-
- genomeDict[genomeID] = {}
-
- #Generating genome dictionary
- data = open(input_file).readlines()
- fastadict = {}
- key = ""
- for index, line in enumerate(data):
- if ">" == line[0]:
- key = line.strip(">").strip()
- fastadict[key] = ""
- else:
- fastadict[key] += line.strip()
-
- genomeClass = createClass(coreURI["Genome"])
- typeClass = createClass(coreURI["DnaObject"])
- for index, genome in enumerate(fastadict):
- typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
- sequence = fastadict[genome]
- genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
- genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
- genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
- genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((genomeURI, RDF.type,genomeClass))
- genomeGraph.add((typeURI, RDF.type,typeClass))
-
-def save():
- data = genomeGraph.serialize(format='turtle')
- open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
-
-def main():
- input_file = sys.argv[sys.argv.index("-input")+1]
- fasta_parser(input_file)
- save()
-
-if __name__ == '__main__':
- #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
- delete_galaxy()
- main()
-
b
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/fastatordf.xml
--- a/fasta2rdf/fastatordf.xml Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,38 +0,0 @@
-<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
-    <requirements>
-        <requirement type='package' version="3.4">python</requirement>
-        <requirement type='package' version="1.0">rdflib</requirement>
-    </requirements>
- <description></description>
- <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
- #for $index, $id in enumerate( $ids ) 
- '-ids' '$id.id_tag'
- #end for
- '-id_alternative' '$input.name'
- </command>
- <inputs>
- <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
- <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/>
- <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
- <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/>
- <repeat name="ids" title="Identification tags">     
- <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
- </repeat>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="genomeTTL: ${input.name}" />
- </outputs>
-    
-    <tests>
-        <test>
-            <param name="input" value="test-data/NC_017117.fna"/>
-            <output name="$output" file="NC_017117.rdf"/>
-            <output name="$ncbi_taxid" value="634455"/>
-            <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/>
-            <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/>
-        </test>
-    </tests>
-
-<help> Genome FASTA file to RDF</help>
-</tool>
b
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/test-data/NC_017117.fna
--- a/fasta2rdf/test-data/NC_017117.fna Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,2736 +0,0 @@\n->gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n-CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n-TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n-GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n-CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n-GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n-TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n-TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n-GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n-TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n-AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n-CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n-GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n-TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n-GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n-TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n-CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n-AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n-AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n-CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n-GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n-GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n-GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n-CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n-CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n-TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n-AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTGACCTGCAGAGCTTCCGGCGCGGGCGC\n-GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n-TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n-CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n-GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n-ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n-GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n-TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n-CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n-GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n-CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n-CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n-ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n-AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n-AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n-GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n-GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n-CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n-GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n-ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n-GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n-GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n-GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n-TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n-CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n-GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n-GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n-TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n-GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n-AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n-ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n-GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n-AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n-GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n-TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n-CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n-GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n-ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n-TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n-TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n-TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n-GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n-TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n-TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n-GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n-TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n-ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n-CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n-GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n-GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n-TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n-ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n-AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n-AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n-AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTTCCCATTTTTTGGGGAGACAG\n-ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n-GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n-GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n-GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n-ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n-CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n-AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n-GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n-ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n-TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n-GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n-AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n-AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n-GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n-ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n-GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n-GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n-GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n-AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n-GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n-TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n-CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n-ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n-TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n-ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n-TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n-CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n-GCACGTCCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n-ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n'
b
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/gbktordf.py
--- a/gbk2rdf/gbktordf.py Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,360 +0,0 @@\n-#!/usr/bin/env python3.4\n-# Author: Jasper Jan Koehorst\n-# Date created: Feb 21 2015\n-# Function: generation of a RDF file from Genbank/EMBL\n-\n-import warnings\n-warnings.filterwarnings("ignore")\n-\n-def delete_galaxy():\n-\timport sys\n-\tfor index, path in enumerate(sys.path):\n-\t\tif "galaxy-dist/" in path:\n-\t\t\tsys.path[index] = \'\'\n-\n-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.\n-delete_galaxy()\n-\n-from Bio import SeqIO\n-# Import RDFLib\'s default Graph implementation.\n-import os, sys\n-from Bio.Seq import Seq\n-\n-from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin\n-from rdflib.store import Store\n-import hashlib\n-store = plugin.get(\'IOMemory\', Store)()\n-\n-global URI\n-URI = "http://csb.wur.nl/genome/"\n-global seeAlso\n-seeAlso = "rdfs:seeAlso"\n-global coreURI\n-coreURI = Namespace(URI)\n-\n-global SubClassOfDict\n-SubClassOfDict = {}\n-global SubClassOfDictRna\n-SubClassOfDictRna = {}\n-\n-def createClass(uri, root=True):\n-\tgenomeGraph.add((uri,RDF.type,OWL.Class))\n-\tif root:\n-\t\tgenomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))\n-\treturn uri\n-\n-def tmp():\n-\timport time\n-\tglobal tmpFolder\n-\ttmpFolder = "/tmp/"+str(time.time())+"/"\n-\tos.mkdir(tmpFolder)\n-\n-def cleantmp():\n-\tos.system("ls "+tmpFolder)\n-\tos.system("rm -rf "+tmpFolder)\n-\n-def crawler():\n-\t#From input folder it looks for GBK file (gz files are in progress)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\tgbk_parser(input_file)\n-\n-def gbk_parser():\n-\tprevObjStart = -1\n-\tprevObjStop = -1\t\n-\tstore = plugin.get(\'IOMemory\', Store)()\n-\tglobal genomeGraph\n-\tgenomeGraph = Graph(store,URIRef(URI))\n-\tgenomeGraph.bind("ssb",coreURI)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\n-\t#CLASS definitions\n-\tgenomeClass = createClass(coreURI["Genome"], root=True)\n-\ttypeClass = createClass(coreURI["DnaObject"], root=True)\n-\tcreateClass(coreURI["Protein"], root=True)\n-\tpubmedClass = createClass(coreURI["Pubmed"], root=True)\n-\tmiscClass = createClass(coreURI["MiscFeature"], root=False)\n-\tcreateClass(coreURI["Feature"], root=True)\n-\tSubClassOfDict["MiscFeature"] = 1\n-\tSubClassOfDictRna["Trna"] = 1\n-\tSubClassOfDictRna["Rrna"] = 1\n-\tSubClassOfDictRna["Tmrna"] = 1\n-\tSubClassOfDictRna["Ncrna"] = 1\n-\n-# \tcodon = "11" #Default initialization if no CDS are present\n-\t##################\n-\tweird_chars = list(\'\'\',./?<>:;"\'|\\}]{[+=_-)(*&^%$#@!\xc2\xb1\xc2\xa7~` \'\'\')\n-\tscaf_value = 0\n-\t#Which files are already done\n-\t########\n-\tformatGBK = sys.argv[sys.argv.index("-format")+1]\n-\tfor record in SeqIO.parse(input_file, formatGBK):\n-\t\t#Read first feature for genome name and information...\n-\t\t#Ignore the empty GBK file due to the lack of features?\n-\n-\t\tfor index, feature in enumerate(record.features):\n-\t\t\tif index == 0:\n-\t\t\t\tif "-identifier" in sys.argv:\n-\t\t\t\t\tgenome = sys.argv[sys.argv.index("-identifier")+1]\n-\t\t\t\telse:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgenome = feature.qualifiers["organism"][0].replace(" ","_")\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\t#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS\n-\t\t\t\t\t\tgenome = "XNoneX"\n-\t\t\t\tfor char in weird_chars:\n-\t\t\t\t\tgenome = genome.replace(char,"_")\n-\n-\t\t\t\ttry:\n-\t\t\t\t\tgi = record.annotations["gi"]\n-\t\t\t\t\ttyp = str(gi)\n-\t\t\t\texcept:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgi = record.annotations["accessions"][0]\n-\t\t\t\t\t\ttyp = str(gi)\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\tscaf_value += 1\n-\t\t\t\t\t\ttyp = "scaffold_"+str(scaf_value)\n-\t\t\t\tgenomeURI = coreURI[genome]\n-\t\t\t\tgbkURI = coreURI[genome + "/" + typ]\n-\t\t\t\t#To contig connection to connect all data to it\n-\t\t\t\tgenomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))\n-\n-\t\t\t\t#General genome features also stored in the class...\n-\t\t\t\tif "genome" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))\n-\t\t\t\tif "strain" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))\n-\t\t\t\tif "taxonomy" in record.annotations:\n-\t\t\t\t\tfo'..b'a" and feature_type.lower() != "ncrna":\n-\t\t\tSubClassOfDict[feature_type.lower().title()] = 1\n-\tfor key in feature.qualifiers:\n-\t\tvalues = feature.qualifiers[key]\n-\t\tif key == "translation":\n-\t\t\tpass\n-\t\telif type(values) == list:\n-\t\t\tfor v in values:\n-\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\telse:\n-\t\t\tint_add(generalURI,coreURI[key.lower()],values)\n-\tif feature.type == "CDS":\n-\t\ttry:\n-\t\t\t#Feature is normally submitted to this function\n-\t\t\t#IF a subfeature is submitted it is submitted as a feature\n-\t\t\t#And subfeature variable will contain the superfeature\n-\t\t\tif superfeature:\n-\t\t\t\tcodon = superfeature.qualifiers["transl_table"][0]\n-\t\texcept:\n-\t\t\t#Default codon table 11\n-\t\t\tcodon = "11"\n-\t\t#Protein linkage\n-\t\ttranslation = ""\n-\t\ttry:\n-\t\t\ttranslation = feature.qualifiers["translation"][0].strip("*")\n-\t\texcept KeyError:\n-\t\t\t#When protein sequence is not given...\n-\t\t\tif len(feature.location.parts) > 1:\n-\t\t\t\t#Exon boundaries?\n-\t\t\t\tseq = \'\'\n-\t\t\t\tfor loc in feature.location:\n-\t\t\t\t\tseq += record.seq[loc]\n-\t\t\t\tif int(feature.location.strand) == -1:\n-\t\t\t\t\tseq = Seq(seq).complement()\n-\t\t\t\telse:\n-\t\t\t\t\tseq = Seq(seq)\n-\t\t\t\ttranslation = str(seq.translate(feature.qualifiers["transl_table"][0]))\n-\t\t\telif int(feature.location.strand) == -1:\n-\t\t\t\tif str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:\n-\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\telif int(feature.location.strand) == +1:\n-\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\t\n-\t\t\tif translation:\n-\t\t\t\ttranslation = list(translation)\n-\t\t\t\ttranslation[0] = "M"\n-\t\t\t\ttranslation = \'\'.join(translation).strip("*")\n-\t\t\t\tif "*" in translation:\n-\t\t\t\t\tpass\t\t\n-\n-\t\ttranslation = translation.encode(\'utf-8\')\n-\t\tmd5_protein = hashlib.md5(translation).hexdigest()\n-\t\tproteinURI = coreURI["protein/"+md5_protein]\n-\t\tgenomeGraph.add((generalURI,coreURI["protein"],proteinURI))\n-\t\tfor key in feature.qualifiers:\n-\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\tif key == "translation":\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,RDF.type,proteinClass))\n-\t\t\t\telse:\n-\t\t\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\n-def int_add(subject, predicate, obj):\n-\ttry:\n-\t\tobject_float = float(obj.replace(\'"\',\'\'))\n-\t\tobject_int = int(obj.replace(\'"\',\'\'))\n-\t\tif object_int == object_float:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_int)))\n-\t\telse:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_float)))\n-\texcept:\n-\t\tgenomeGraph.add((subject,predicate,Literal(obj.replace(\'"\',\'\'))))\n-\t\t\t\t\n-def save():\n-\tdata = genomeGraph.serialize(format=\'turtle\')\n-\topen(sys.argv[sys.argv.index("-output")+1],"wb").write(data)\n-\n-def subClassOfBuilder():\n-\tfor subclass in SubClassOfDict:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))\n-\n-def subClassOfBuilderRna():\n-\tfor subclass in SubClassOfDictRna:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))\n-\n-def main():\n-\ttmp()\n-\tgbk_parser()\n-\tsubClassOfBuilder()\n-\tsubClassOfBuilderRna()\n-\tsave()\n-\tcleantmp()\n-\n-if __name__ == "__main__":\n-\tmain()\n\\ No newline at end of file\n'
b
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/gbktordf.xml
--- a/gbk2rdf/gbktordf.xml Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,38 +0,0 @@
-<tool id="SAPP_genbank_to_ttl" name="EMBL/GBK to RDF" version="0.1">
- <requirements>
-     <requirement type='package' version="3.4">python</requirement>
-     <requirement type='package' version="1.0">rdflib</requirement>
- </requirements>
- <description>Genbank to RDF conversion</description>
- <command interpreter="python3.4">gbktordf.py '-input' '$input' -output '$output' -sourcedb "$format" -format "$format"</command>
- <inputs>
- <param name="input" type="data" format="gbk,gb,genbank,embl" label="Genbank file"/>
- <param name="format" type="select" label="EMBL/GBK">
- <option value="genbank" selected="true"> Genbank</option>
- <option value="embl"> EMBL </option>
- </param>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="GBKttl: ${input.name}" />
- </outputs>
-
- <tests>
-     <test>
-       <param name="input" value="test-data/NC_010067.gbk"/>
-       <output name="$output" file="NC_010067.rdf"/>
-       <output name="$format" value="genbank"/>
-       <output name="$sourcedb" value="genbank"/>
-     </test>
-     <test>
-       <param name="input" value="test-data/CP009049.embl"/>
-       <output name="$output" file="CP009049.rdf"/>
-       <output name="$format" value="embl"/>
-       <output name="$sourcedb" value="embl"/>
-     </test>
-  </tests>
-  
- <help>
- Genbank or EMBL to RDF conversion
- </help>
-</tool>
b
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/test-data/CP009049.embl
--- a/gbk2rdf/test-data/CP009049.embl Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,157312 +0,0 @@\n-ID   CP009049; SV 1; circular; genomic DNA; STD; PRO; 4599018 BP.\n-XX\n-AC   CP009049;\n-XX\n-PR   Project:PRJNA255737;\n-XX\n-DT   13-FEB-2015 (Rel. 123, Created)\n-DT   13-FEB-2015 (Rel. 123, Last updated, Version 1)\n-XX\n-DE   Salmonella enterica subsp. enterica serovar Paratyphi A strain CMCC 50973,\n-DE   complete genome.\n-XX\n-KW   .\n-XX\n-OS   Salmonella enterica subsp. enterica serovar Paratyphi A\n-OC   Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n-OC   Enterobacteriaceae; Salmonella.\n-XX\n-RN   [1]\n-RP   1-4599018\n-RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT   "Whole Genome Sequences of two Salmonella paratyphi A strains";\n-RL   Unpublished.\n-XX\n-RN   [2]\n-RP   1-4599018\n-RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT   ;\n-RL   Submitted (24-JUL-2014) to the INSDC.\n-RL   State Key Laboratory of Pathogen and Biosecurity, Beijing Institute of\n-RL   Biotechnology, 20 Dongdajie, Fengtai District, Beijing, Beijing 100071,\n-RL   China\n-XX\n-DR   MD5; e41a6215bf412b701febd8d4b182ec0c.\n-DR   BioSample; SAMN02909989.\n-XX\n-CC   Source DNA/bacteria are available from National Center for  Medical\n-CC   Culture Collection (CMCC) in China.\n-CC   Annotation was added by the NCBI Prokaryotic Genome Annotation\n-CC   Pipeline (released 2013). Information about the Pipeline can be\n-CC   found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/\n-CC   ##Genome-Assembly-Data-START##\n-CC   Assembly Method       :: SOAPdenovo v. 2011.04\n-CC   Assembly Name         :: CMCC(B) 50973\n-CC   Genome Coverage       :: 133x\n-CC   Sequencing Technology :: Illumina\n-CC   ##Genome-Assembly-Data-END##\n-CC   ##Genome-Annotation-Data-START##\n-CC   Annotation Provider          :: NCBI\n-CC   Annotation Date              :: 07/25/2014 13:43:31\n-CC   Annotation Pipeline          :: NCBI Prokaryotic Genome Annotation\n-CC                                   Pipeline\n-CC   Annotation Method            :: Best-placed reference protein set;\n-CC                                   GeneMarkS+\n-CC   Annotation Software revision :: 2.6 (rev. 440435)\n-CC   Features Annotated           :: Gene; CDS; rRNA; tRNA; ncRNA;\n-CC                                   repeat_region\n-CC   Genes                        :: 4,309\n-CC   CDS                          :: 4,016\n-CC   Pseudo Genes                 :: 166\n-CC   CRISPR Arrays                :: 2\n-CC   rRNAs                        :: 20 ( 5S, 16S, 23S )\n-CC   tRNAs                        :: 100\n-CC   ncRNA                        :: 7\n-CC   Frameshifted Genes           :: 106\n-CC   ##Genome-Annotation-Data-END##\n-XX\n-FH   Key             Location/Qualifiers\n-FH\n-FT   source          1..4599018\n-FT                   /organism="Salmonella enterica subsp. enterica serovar\n-FT                   Paratyphi A"\n-FT                   /host="Homo sapiens"\n-FT                   /sub_species="enterica"\n-FT                   /strain="CMCC 50973"\n-FT                   /mol_type="genomic DNA"\n-FT                   /country="China:Jiangsu"\n-FT                   /lat_lon="32.04 N 118.78 E"\n-FT                   /collection_date="2003-06-01"\n-FT                   /serovar="Paratyphi A"\n-FT                   /db_xref="taxon:54388"\n-FT                   /culture_collection="CMCC:50973"\n-FT   gene            complement(129..713)\n-FT                   /gene="mobA"\n-FT                   /locus_tag="IT63_00010"\n-FT   CDS             complement(129..713)\n-FT                   /codon_start=1\n-FT                   /transl_table=11\n-FT                   /gene="mobA"\n-FT                   /locus_tag="IT63_00010"\n-FT                   /product="molybdopterin-guanine dinucleotide biosynthesis\n-FT                   protein MobA"\n-FT                   /note="in Escherichia coli MobA links a guanosine\n-FT                   5\'-phosphate to molydopterin to form molybdopterin guanine\n-FT                   dinucleotide during molybdenum cofactor biosynthesis;\n-FT                   Derived by automated c'..b'cgag cgaacgggga ggagcccaga gcctgaatca gcatgtgtgt   4596180\n-     tagtggaagc gtctggaaag gcgcgcgata cagggtgaca gccccgtaca caaaagcgca   4596240\n-     tgtgctgtga gctcgatgag tagggcggga cacgtggtat cctgtctgaa tatgggggga   4596300\n-     ccatcctcca aggctaaata ctaattttgc tctttaaaaa tctggatcaa gctgaaaatt   4596360\n-     gaaacacaga acaacgaaag ttgttcgtga gtctctcaaa ttttcgcaac acgatgatga   4596420\n-     atcgtaagaa acatcttcgg gttgtgaggt taagcgacta agcgtacacg gtggatgccc   4596480\n-     tggcagtcag aggcgatgaa ggacgtgcta atctgcgata agcgccggta aggtgatatg   4596540\n-     aaccgttata accggcgatt tccgaatggg gaaacccagt gtgattcgtc acactatcat   4596600\n-     taactgaatc cataggttaa tgaggcgaac cgggggaact gaaacatcta agtaccccga   4596660\n-     ggaaaagaaa tcaaccgaga ttcccccagt agcggcgagc gaacggggag gagcccagag   4596720\n-     cctgaatcag catgtgtgtt agtggaagcg tctggaaagg cgcgcgatac agggtgacag   4596780\n-     ccccgtacac aaaagcgcat gtgctgtgag ctcgatgagt agggcgggac acgtggtatc   4596840\n-     ctgtctgaat atggggggac catcctccaa ggctaaatac tcctgactga ccgatagtga   4596900\n-     accagtaccg tgagggaaag gcgaaaagaa ccccggcgag gggagtgaaa aagaacctga   4596960\n-     aaccgtgtac gtacaagcag tgggagcaca ggtttacctg tgtgactgcg taccttttgt   4597020\n-     ataatgggtc agcgacttat attctgtagc aaggttaacc gtatagggga gccggaggga   4597080\n-     aaccgagtct taaccgggcg ttaagttgca gggtatagac ccgaaacccg gtgatctagc   4597140\n-     catgggcagg ttgaaggttg ggtaacacta actggaggac cgaaccgact aatgttgaaa   4597200\n-     aattagcgga tgacctgtgg ctgggggtga aaggccaatc aaaccgggag atagctggtt   4597260\n-     ctccccgaaa gctatttagg tagcgcctcg tgaattcatc tccgggggta gagcactgtt   4597320\n-     tcggctaggg ggccatcccg gcttaccaac ccgatgcaaa ctgcgaatac cggagaatgt   4597380\n-     tatcacggga gacacacggc gggtgctaac gtccgtcgtg aagagggaaa caacccagac   4597440\n-     cgccagctaa ggtcccaaag tcatggttaa gtgggaaacg atgtgggaag gcccagacag   4597500\n-     ccaggatgtt ggcttagaag cagccatcat ttaaagaaag cgtaatagct cactggtcga   4597560\n-     gtcggcctgc gcggaagatg taacggggct aaaccatgca ccgaagctgc ggcagcgaca   4597620\n-     ctcaggtgtt gttgggtagg ggagcgttct gtaagcctgt gaaggtggcc tgtgagggtt   4597680\n-     gctggaggta tcagaagtgc gaatgctgac ataagtaacg ataaagcggg tgaaaagccc   4597740\n-     gctcgccgga agaccaaggg ttcctgtcca acgttaatcg gggcagggtg agtcgacccc   4597800\n-     taaggcgagg ccgaaaggcg tagtcgatgg gaaacgggtt aatattcccg tacttggtgt   4597860\n-     tactgcgaag ggggggacgg agaaggctat gttggccggg cgacggttgt cccggtttaa   4597920\n-     gcgtgtaggt gtgtgttcca ggtaaatccg gttcacttta acactgaggc gtgacgacga   4597980\n-     ggcactacgg tgctgaagca acaaatgccc tgcttccagg aaaagcctct aagcatcagg   4598040\n-     taacatcaaa tcgtacccca aaccgacaca ggtggtcagg tagagaatac caaggcgctt   4598100\n-     gagagaactc gggtgaagga actaggcaaa atggtgccgt aacttcggga gaaggcacgc   4598160\n-     tgacacgtag gtgaagtgat ttactcatgg agctgaagtc agtcgaagat accagctggc   4598220\n-     tgcaactgtt tattaaaaac acagcactgt gcaaacacga aagtggacgt atacggtgtg   4598280\n-     acgcctgccc ggtgccggaa ggttaattga tggggtcagc gcaagcgaag ctcctgatcg   4598340\n-     aagccccggt aaacggcggc cgtaactata acggtcctaa ggtagcgaaa ttccttgtcg   4598400\n-     ggtaagttcc gacctgcacg aatggcgtaa tgatggccag gctgtctcca cccgagactc   4598460\n-     agtgaaattg aactcgctgt gaagatgcag tgtacccgcg gcaagacgga aagaccccgt   4598520\n-     gaacctttac tatagcttga cactgaacat tgagccttga tgtgtaggat aggtgggagg   4598580\n-     ctttgaagtg tggacgccag tctgcatgga gccgaccttg aaataccacc ctttaatgtt   4598640\n-     tgatgttcta acgtggaccc gttacccggg ttgcggacag tgtctggtgg gtagtttgac   4598700\n-     tggggcggtc tcctcctaaa gagtaacgga ggagcacgaa ggttggctaa tcctggtcgg   4598760\n-     acatcaggag gttagtgcaa tggcataagc cagcttgact gcgagcgtga cggcgcgagc   4598820\n-     aggtgcgaaa gcaggtcata gtgatccggt ggttctgaat ggaagggcca tcgctcaacg   4598880\n-     gataaaaggt actccgggga taacaggctg ataccgccca agagttcata tcgacggcgg   4598940\n-     tgtttggcac ctcgatgtcg gctcatccca tcccggggct gaagtaggtc ccaagggtat   4599000\n-     ggctgttcgc catttaaa                                                 4599018\n-//\n'
b
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/test-data/NC_010067.gbk
--- a/gbk2rdf/test-data/NC_010067.gbk Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b"@@ -1,259779 +0,0 @@\n-LOCUS       NC_010067            4600800 bp    DNA     circular CON 20-AUG-2013\n-DEFINITION  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980 chromosome, complete genome.\n-ACCESSION   NC_010067\n-VERSION     NC_010067.1  GI:161501984\n-DBLINK      Project: 58191\n-            BioProject: PRJNA58191\n-KEYWORDS    .\n-SOURCE      Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980\n-  ORGANISM  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980\n-            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n-            Enterobacteriaceae; Salmonella.\n-REFERENCE   1  (bases 1 to 4600800)\n-  CONSRTM   NCBI Genome Project\n-  TITLE     Direct Submission\n-  JOURNAL   Submitted (03-DEC-2007) National Center for Biotechnology\n-            Information, NIH, Bethesda, MD 20894, USA\n-REFERENCE   2  (bases 1 to 4600800)\n-  AUTHORS   McClelland,M., Sanderson,E.K., Porwollik,S., Spieth,J.,\n-            Clifton,W.S., Fulton,R., Chunyan,W., Wollam,A., Shah,N., Pepin,K.,\n-            Bhonagiri,V., Nash,W., Johnson,M., Thiruvilangam,P. and Wilson,R.\n-  CONSRTM   The Salmonella enterica serovar Arizonae Genome Sequencing Project\n-  TITLE     Direct Submission\n-  JOURNAL   Submitted (02-NOV-2007) Genetics, Genome Sequencing Center, 4444\n-            Forest Park Parkway, St. Louis, MO 63108, USA\n-COMMENT     PROVISIONAL REFSEQ: This record has not yet been subject to final\n-            NCBI review. The reference sequence was derived from CP000880.\n-            Salmonella enterica subspecies IIIa (Arizonae) serovar\n-            62:z4,z23:--Most bacteria in the species S. enterica belong to one\n-            of seven subspecies; all but subspecies I normally grow only in\n-            cold-blooded animals. Subspecies IIIa (S. Arizonae) is naturally\n-            found in reptiles, but also causes outbreaks of salmonellosis in\n-            turkeys and sheep and can occasionally produce both gastroenteritis\n-            and serious disseminated disease in humans. Many human infections\n-            can be traced to contact with reptiles or ingestion of various\n-            reptile products, particularly from rattlesnakes. Fewer than ten\n-            cases in humans are typically reported in the US each year.\n-            \n-            The strain of S. Arizonae (62:z4,z23:-) being sequenced is\n-            CDC346-86; it was named RSK2980 by R.K. Selander and is strain\n-            SARC5 of the Salmonella Reference C set. This serovar is of\n-            interest because of its taxonomic position. It appears to be the\n-            most divergent subspecies among the S. enterica. It can be obtained\n-            from the American Type Culture Collection as ATCC BAA-731, or the\n-            Salmonella Genetic Stock Centre as SGSC4693. The genome was\n-            sequenced to 8X coverage, using plasmid and fosmid libraries and\n-            was finished to an error rate of less than 1 per 10,000 bases.\n-            Automated annotation was performed and manual annotation will\n-            continue in the labs of Michael McClelland and Kenneth Sanderson.\n-            The National Institute of Allergy and Infectious Diseases (NIAID),\n-            National Institutes of Health (NIH) has funded this project.\n-            \n-            Coding sequences below are predicted using GeneMark v3.3 and\n-            Glimmer2  v2.13.Intergenic regions not spanned by GeneMark and\n-            Glimmer2 were blasted against NCBI's non-redundant (NR) database\n-            and predictions generated based on protein alignments. RNA genes\n-            were determined  using tRNAscan-SE 1.23 or Rfam v8.0. This sequence\n-            was finished as follows unless otherwise noted: all regions were\n-            double stranded, sequenced with an alternate chemistries or covered\n-            by high quality data(i.e., phred quality >=30);an attempt was made\n-      "..b'1 acccgtcatc gtatcgtcct tgccgcaacg cttgcggaat ttcttacaca acttaatcct\n-  4597741 cttctgtaat cgtttgccct gacaggtgtg agagatctct tacaaggtct gtaggagatc\n-  4597801 gccaggatat cagagaatac ttagctacga ctttctcctg taaatatata taaatcaatc\n-  4597861 tattaaaata ttatttcgca ctttcatata caaatttact taaggtatcg tctgtaagcg\n-  4597921 tcttgtaaga caaggtgaaa caggcgattc tatattcatc gacagggagt cgtacaacga\n-  4597981 agcgaacgtc aggaagatgg cgcttctgca ggacacgcca ggagggcgtt acatggaaag\n-  4598041 gcttcaggat gaggcaaagt ggaaagcgca ggatgcgtta aaggacacct ccaggacgga\n-  4598101 gaacgagagc cgattaggat ggtcggcggg tctggatgac cagggacgct tcgggatgaa\n-  4598161 gctatcacat cggggcgatg tgcgcaggat gcaaacgttc aggatgagca ggccgcaggg\n-  4598221 tcacaggaaa agttgtcacg gatgagcagg gagcatgaaa agtagctgga atgctgcgaa\n-  4598281 acgaaccggg agcactgttt atacagtgct cccttttttt gttattcttc gcgccagatt\n-  4598341 tccattattg aggttcttaa catgacgact catgaccgtg tgcgtcagca gttacatgcg\n-  4598401 cttgaaacgc tgctgcgtga gcatcatcac tggcggctgg atgcgccgca ggcgcacctg\n-  4598461 tttaccagca cgcagccgtt ttgtatggat accatggaac cgctggaatg gctgcaatgg\n-  4598521 gtattgatcc cgcgtatgca taccctgctt gataatgcgc agccgttacc tgaggcgttt\n-  4598581 gccgtcgccc cttattatga aatggcgctg acggcggatt atccgcagcg ggaagcgatc\n-  4598641 ctgacggttt tgcaggatct ggatgcgcta tttacccgcg ataaatcctg atgctggaga\n-  4598701 tcctctatca ggacgcgtgg ctggttgccg ttaataaacc tgcaggctgg cttgttcacc\n-  4598761 ggagctggct ggatcgcgac gaaaaagttg tggtcatgca aacggtgcgc gaccaaatcg\n-  4598821 gccagcatgt ttttaccgcc caccgtctcg acagacccac atcgggcgta ctactgatgg\n-  4598881 ggctgtccag cgaagcggga cgccgcctgg cgcagcagtt cgagcagcac catatccgta\n-  4598941 aacgttacca tgccatagtg cgcggctggc tgatggatga tgcgctactg gattatcctc\n-  4599001 tgctggaaga gcgcgataaa attgccgata agttcgcgcg tgaggataaa gcgccccagc\n-  4599061 cagccgtaac gcagtatcgc gggctggcga cggtcgaaat ggcagtgccg accgggcgtt\n-  4599121 atcccactac gcgttatggc ctggttgagc tggaaccgaa aacggggcgc aaacaccagc\n-  4599181 tccgccgtca tctggcgcat ctacgccatc ctatcatcgg cgacagtaaa cacggtgatt\n-  4599241 tgcggcaaaa ccgtagcgcg gcggaacatt ttgcttgtcg tcgcctgatg cttcatgcca\n-  4599301 gtcggcttga actgacgcat cccttcaccg gacagccatt aattattcag gccggactgg\n-  4599361 atgaaacctg gatgcaggcg ctaacacagt ttggctggcg gggacttctc cctgataatg\n-  4599421 aaagggttga gtttacgacg gcgtcccggc aggatgagtc ttatcagaca taattcaggg\n-  4599481 agatacgcat aatggcggaa attggtattt ttgtcggtac gatgtatggc aactcactgt\n-  4599541 tggtggcgga ggaagcggaa gcgatcctgg ccagacaggg ccatagcgcg actgtgtttg\n-  4599601 aagatcctga actgtccgac tggcggcaat atcaggacaa ggtggcattg gttgtcacct\n-  4599661 caacgaccgg acagggcgat ctaccggata gtattgcgcc gctctttcac ggtattaaag\n-  4599721 atacgttagg ttttcaacca aacctgcgtt acggggtgat tgcgttaggt gatagcagct\n-  4599781 accccaattt ctgtaatggc ggcaagcagt ttgatgccct gttgcaggag caaagcgcgc\n-  4599841 aacgggtggg ggaaatgtta ctcattgacg ccagcgaaca tccggagccg gagagccaat\n-  4599901 ccaatccctg ggtagaaaac tggggaacct tactttcctg aggtaaatcc ctccccctac\n-  4599961 cgggagggta ccttttcgtt tgattgcatt gccagtaagc aaaataacga cctgtatgta\n-  4600021 gtttaaagaa actgaatcgt gttagctttg tgcatatgcc tgcaaaagca gcagtttttt\n-  4600081 acgggcgttt tcatgtaatc aagcgacctg tttcacattc ttctcttttt attcctcctg\n-  4600141 cgtcgacgcc tgacgccttc tgatttcatt tccgtgaagt ggcttccact gtcctgggct\n-  4600201 tttgccacaa acaggcgtaa ttcattgcca aaatactgtg ttgttgcacg gtgagtgtgc\n-  4600261 gtgacgcgct ttttatactt ctcctgccag tgaataaaag aatgcagcat gcaaagcaaa\n-  4600321 cgacctaata aaagctgcaa caaggaaacg ttatctctga ttccctaccg gttgtgcagt\n-  4600381 tcagagtgag cgtagctaac gcgaaatttc aggagtgcaa caatgagttc attaagtcac\n-  4600441 gcggcgagta gtgcggagaa tcgcacgaac gcccgctact ggatagtggt gatgctgttt\n-  4600501 atcgtcacat cctttaacta tggcgatcgc gccacattgt ccattgccgg ctcagaaatg\n-  4600561 gccaaagata ttggtcttga cccggtaggc atgggctacg ttttctctgc gttttcatgg\n-  4600621 gcctatgtta tcggacagat ccctggcggc tggctgctgg accgctttgg ttccaaacgc\n-  4600681 gtctatttct ggtctatttt catctggtcg gtcttcaccc tgttgcaggg ttttgtcgat\n-  4600741 atttttagcg gtttcggcat tgttgtcgcc ctctttacgc ttcgtttcct ggtcggtctg\n-//\n'
b
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/protein_to_ttl.py
--- a/protein2rdf/protein_to_ttl.py Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,130 +0,0 @@
-def delete_galaxy():
- import sys
- for index, path in enumerate(sys.path):
- if "galaxy-dist/" in path:
- sys.path[index] = ''
-
-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
-delete_galaxy()
-
-# from io import StringIO
-from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
-# import rdflib
-from rdflib.store import Store
-import sys
-import hashlib
-
-store = plugin.get('IOMemory', Store)()
-
-global URI
-URI = "http://csb.wur.nl/genome/"
-global seeAlso
-seeAlso = "rdfs:seeAlso"
-global coreURI
-coreURI = Namespace(URI)
-
-
-def createClass(uri):
- genomeGraph.add((uri,RDF.type,OWL.Class))
- genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
- return uri
-
-def fasta_parser(input_file):
- createClass(coreURI["Protein"])
-
- genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
- if genome == '':
- genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
-
- genomeURI = coreURI[genome]
- for index, element in enumerate(sys.argv):
- if '-organism' == element:
- genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
- if '-ncbi_taxid' == element:
- genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
- if '-idtag' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
- if '-diagnosis' == element:
- genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1])))
- if '-country' == element:
- genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1])))
- if '-location' == element:
- genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1])))
- if '-date' == element:
- genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1])))
- if '-ids' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
-
-
-
- data = (open(input_file).readlines())
- fastadict = {}
- sequence = ""
- key = ""
- for index, line in enumerate(data):
- if ">" == line[0]:
- if sequence:
- fastadict[key] = sequence
- key = line
- sequence = ""
- fastadict[key] = ""
- else:
- sequence += line.strip()
- fastadict[key] = sequence
-
- #Create a class, to be the same as all the other genome conversions...
- #TODO: Proteins are part of cds, cds are part of dnaobject
- #If CDS is not there... how then?
- classURI = coreURI[genome + "/" + "protein_fasta"]
- proteinClass = createClass(coreURI["Protein"])
- genomeClass = createClass(coreURI["Genome"])
- typeClass = createClass(coreURI["DnaObject"])
- cdsClass = createClass(coreURI["Cds"])
- #A theoretical begin, end is created to have a workable GBK generation
- begin = 0
- end = 0
- genomeGraph.add((genomeURI, RDF.type, genomeClass))
- genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI))
- genomeGraph.add((classURI, RDF.type, typeClass))
-
- for protein in fastadict:
- sequence = fastadict[protein]
- sequence = sequence.encode('utf-8')
- end = begin + len(sequence)
- md5_protein = hashlib.md5(sequence).hexdigest()
- proteinURI = coreURI["protein/"+md5_protein]
-
- cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)]
- genomeGraph.add((classURI, coreURI["feature"] , cdsURI))
- genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin)))
- genomeGraph.add((cdsURI, coreURI["end"] , Literal(end)))
- genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI))
- genomeGraph.add((cdsURI, RDF.type, cdsClass))
-
-
-
- genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
- genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence)))
- genomeGraph.add((proteinURI,RDF.type,proteinClass))
- genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((proteinURI, RDF.type, proteinClass))
- begin = end
-
-def save():
- data = genomeGraph.serialize(format='turtle')
- open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
-
-def main():
- store = plugin.get('IOMemory', Store)()
- global genomeGraph
- genomeGraph = Graph(store,URIRef(URI))
- genomeGraph.bind("ssb",coreURI)
- input_file = sys.argv[sys.argv.index("-input")+1]
- fasta_parser(input_file)
- save()
-
-if __name__ == '__main__':
- main()
-
b
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/protein_to_ttl.xml
--- a/protein2rdf/protein_to_ttl.xml Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,42 +0,0 @@
-<tool id="SAPP_protein_rdf" name="Protein FASTA to RDF" version="0.1">
-    <requirements>
-        <requirement type='package' version="3.4">python</requirement>
-        <requirement type='package' version="1.0">rdflib</requirement>
-    </requirements>
- <description></description>
- <command interpreter="python3.4">protein_to_ttl.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' '-diagnosis' '$diagnosis' '-country' '$country' '-location' '$location' '-date' '$date' -sourcedb SAPP 
- #for $index, $id in enumerate( $ids ) 
- '-ids' '$id.id_tag'
- #end for
- '-id_alternative' '$input.name'
- </command>
- <inputs>
- <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
- <param size="60" name="organism" type="text" format="text" label="organism name"/>
- <param size="60" name="diagnosis" type="text" format="text" label="Diagnosis of host if applicable"/>
- <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
- <param size="60" name="country" type="text" format="text" label="Country of sample"/>
- <param size="60" name="location" type="text" format="text" label="Location of sample e.g., river, city, hospital"/>
- <param size="60" name="date" type="text" format="text" label="Sample date"/>
- <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!"/>
- <repeat name="ids" title="Identification tags">     
- <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
- </repeat>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="proteinTTL: ${input.name}" />
- </outputs>
-    <tests>
-        <test>
-            <param name="input" value="test-data/NC_017117.faa"/>
-            <output name="$output" file="NC_017117.rdf"/>
-            <output name="$ncbi_taxid" value="634455"/>
-            <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/>
-            <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/>
-        </test>
-    </tests>
- <help>
- RDF creation from a multi protein fasta file
- </help>
-</tool>
b
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/test-data/NC_017117.faa
--- a/protein2rdf/test-data/NC_017117.faa Sat Feb 21 11:26:55 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,993 +0,0 @@\n->gi|384055706|ref|YP_005485330.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MKSDRFTDAQIMGVIRQAEGGVPVPDLCREHGISNATFYRWRAKYGGMDASMISQMKALEEENRRLKRMY\n-ADLSMQTDILKEALGKK\n->gi|384055707|ref|YP_005485331.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MAGHHVEAMIARAHAQKRFMDDAGWRYVVELYGRYQSLLREQNAADFGDLLMWPTLAMLHNDAYRYRWSR\n-RFTAVMADEFQDVNRAQFLWLKMISEVSAEFFAVGDDSQSIL\n->gi|384055708|ref|YP_005485332.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MVVGRNDCAKGRQMKDTVIGVDLAKNIFQVHGASRAGEVMFRKKLRRQQFMQFMATQPPALVVLEACGSA\n-HYWARELAGAGHEVRLIAPQYVKPFVKRQKNDAADAEAIVIAARQPEMRFVEPRTEAQQARGVLFRARQR\n-LVHQRTELVNALRAVLYEFGLVVPQGIAHIRHIEAMLDEAVLPEAVKQECLDLLRQISEQSVRIDVRTKK\n-IRMLAQESENTCRLQSMPGVGPLTALAIEAFAPDLQSFRRGRDFAAWLGLVPRQFSSGGKERLGKISKAG\n-QADIRRLLIMGAMTQVNWASRKAPAPGSWLARMLARKPRMLVAIALANRMARAIWAMATKQEDYRDPALS\n-VAA\n->gi|384055709|ref|YP_005485333.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MEQIIRIGMDTSKSVFQLHGVNAKEQPVLRRKLSRREMVKFFEKLPPIEIAIEACGASHYWGRVLSCLGH\n-TVKLIAPQLVKPYVKRGKNDAADAEALCEAMSRPTMRFVPLKSEEEQAALMLIGMRARLIRNRTQLANTI\n-RGYAAEFGITAPKGMCRIEALLDRIAADESLPTLTRELFALHAKEYAELQGEIEQLEGKVMAWHRANECS\n-QRLAKIPGVGPIGAALLMMKTPDPHLFKSGRAFAAWIGLTPRDHSTGGKTRLGRITRAGDEVLRSTLVVG\n-ATAVVSHARRTNGKNASSWLRELLERKKPKLAAVALANKIARIAWKLMVSGEHYKRLLQQPGAAAV\n->gi|384055710|ref|YP_005485334.1| DNA resolvase [Acetobacter pasteurianus IFO 3283-22]\n-MVPPKPGKTPVGGRLIGYARVSTDDQGTDAQLNELRDAGCTMIFEKHASGADRNRPVLIRLLRDMNAGDT\n-LVVVRLDRLARSVSHLLAVIEQLDYAGAHFRSLDDPIDTTTPQGMFSLQVLGAVAQLDADFFCDGVDGSQ\n-RHRDVPR\n->gi|384055711|ref|YP_005485335.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MLTSRIHRRKPMGKPMSKATARANAAKSSIRAHVEHVFAHQKNRFNLFIRTIGLARAEAKLTLCNLAYNF\n-NRLIFHERLETAG\n->gi|384055712|ref|YP_005485336.1| D-mannonate oxidoreductase [Acetobacter pasteurianus IFO 3283-22]\n-MNLNRNAISHVPDTVYTPRYDPALLRPGIVHLGCGNFHRGHQVVATQAAIDAEGRDGLRWGIVSATMRRP\n-DLATVLQSQDNLYTLLTREPANTVASVMAAITEAVYAGDDNANLAARIADPATAIVTLTVTASGYYLSAD\n-GRLDPTFEAIQADLTAITPRTAPGIIAAGLAQVRQRGGVPPVILCCDNVNSNGATLRQAVIDLAALKGDD\n-LLAAWIETNVQFPDTMVDRIVPTATPDDIADACRLLGGIEDRAPISAEPWFQWVIGEFDGPRPRWVAHPG\n-TKFVSDVGVFERAKLQMLNGTHMLLAYVGALANLNTVSEAASDDALGRIAARFMRNEQTADVSLDTDELD\n-RYTVDLMQRFRNPGIVHEVTRIGRNGSAKMASRIVQPMRSNIEAGRPVDGAVLLIASWIRWFALHEQDEF\n-DIALTDPRAETLRGLCADARDDHKAQAEAFLAMEEVFGAPLPDHGKQVEAIASMLRRLTEESVPELLRTI\n-AH\n->gi|384055713|ref|YP_005485337.1| phosphatase/phosphohexomutase [Acetobacter pasteurianus IFO 3283-22]\n-MTDTVFPAHLLKHKQEPVHGVVFDMDGLLLDSESLAMEALVFAARDLNYDIPMSFCRTMIGVPADGCRTM\n-VRKTYGQDFPLERFFELQEVHLRNFVDTGKLALKKGVLPLLDLLDTYKIPRAIATSSSRVRTDHHLKLVN\n-LFHRFNAIVTRDDVSKGKPDPEPYLTAAKKIGVNPAHALALEDSHSGARAAHAAGIRVIVVPDLLEATDE\n-IRGKALAIVQDLSIVEAYLKHAITGQA\n->gi|384055714|ref|YP_005485338.1| hypothetical protein APA22_40090 [Acetobacter pasteurianus IFO 3283-22]\n-MRRDMDLVRQLLLKLEGIEKGPHDVLLIGGNSEEVAVDGRTSDEIYFHLTKIEEAGFLERVGGGAMTAVT\n-FRALSWKGQEFLDTIRDDSIWKKTKEKAGSASFDILAAVAKAVIKDRIKSLTGLDIG\n->gi|384055715|ref|YP_005485339.1| hypothetical protein APA22_40100 [Acetobacter pasteurianus IFO 3283-22]\n-MRPLGSGLSVRTYGCSEADDQENDGWAKKDTGEIVALYEMSSPVMPSGLVSISRWKIKGCYPKSGLSRAM\n-LCPTKIPQSASNIALLIGSDWSFIEENVFCNHIEWQTCLPVFVMNLDHPA\n->gi|384055716|ref|YP_005485340.1| DNA helicase superfamily I [Acetobacter pasteurianus IFO 3283-22]\n-MSSKPSHHSVLSYWHSALLDDAQMKISFSRDNLVALDEEGFEKGKLPPDKTQALRKMHPASRDLAPDDSI\n-IAMAGIRILLGQVSHSTEHSKQPALFCMAMLVNVSPEGTIQPLKDAPPWINRELLEPSDGDVLIGDLATM\n-DTWLQLNPFEGGSLGKTLEWAEKLWNAVTGEDGLPDGYELWERVALQPAEASIGMIATLHQRRFYDTVLA\n-DTGLVTPLLARYIDGGPEPAVVDESQKWAAAGRARGTMTFAYGMSSSQSEAMTAFCSVKDGDILAVNGPP\n-GTGKTTLLQGIVATELVTRALEGGDPAVIVGTSTNNQAVTNIIDAMKKAMASKDSRPWARRWIEGADALG\n-LYFPSGEKEKEALKAGYLIASPGRGLGTMEWKGFPERERDTVDAWASRDAWINGYYGSFYPGVTPPLRKE\n-HLSGHGPQGARHDISLVEDGIAKIRARMKVLVETGRVCAGEARKLNQLYVASGYGTYPDITKAIAQREAL\n-LQERRPREDALKSDLKEKEAAAAVPRARINEENRKTRDLLKQRDDAVHAAGQKVEEVGAHAVALIAALPG\n-GGFFSNLMSGRNWANVERLVAEGRQGSFFRSLMQAQVKSKREWMDAINEMTASAERELATVRESREETRQ\n-ARDTLIQKLEREVAAADLVSKTARAEYDHYVGGSYVLAGRELEKLVTLKHQILQQLQDCCTAIETVLAPS\n-DWAAMFDMPEEKLPWRQSNWTGRLDVIEDFLDR'..b'DEVAPAV\n-RHLISQIQTTIA\n->gi|384055875|ref|YP_005485499.1| multidrug resistance transporter EmrB/QacA [Acetobacter pasteurianus IFO 3283-22]\n-MGTSMTSSRVTNPLFVLLAASTGCALTVLDTNVVAIILPTIAREFRASFADIEWVISTYVLCFASLLLPA\n-GAIADRYGRRRIYLIGITTFALTSLFCGAAPSATALYLARALQGVSAAFLLAPALAIIGHTFHNPDERNR\n-AWAIWGSIMGLTMVLAPIIGGIIAYALGWRWAFYINIPICVLLAGAVFILVKESRDTDARRLDPVGIIFF\n-AAFMFGLTWGMINGQASGWTSWNALNGFIGGSISLGIFIASERAQSRPMLDLGLFSNPRFLGAVWAMFAY\n-AASAQVMASMLPLFLQNGLGRSALQAGFAMLPFALAMLIFPHIGRLLERHISSSGILAGGLSCVAIGNGI\n-TAWGAYVGSWIIVMAGMVVIGSGGGLLNGETQKAIMSVVPKERSGMASGISTTSRFSGILLGFAMLSGIL\n-ATMVRKWVAAFGCGTGCHHPSDFADAIVAGDLPSAISGLEGSNQEIAIQHAHHAFSYGFAVALLVASIFA\n-LGSSITVFTLMQSKMKQNIT\n->gi|384055876|ref|YP_005485500.1| transposase, partial [Acetobacter pasteurianus IFO 3283-22]\n-MLAYAVMASVRYQANSLKPKKTQLRTRQSLSAGPFRRSGASS\n->gi|384055877|ref|YP_005485501.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVILVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGQQADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055878|ref|YP_005485502.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVIVVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGHKADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055879|ref|YP_005485503.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MLQFSYMSEEADAIAAEIGRRAASGCAWHDIAVIYRQNRLSRAIEEALIQARVPYEIVGDVGFYQRVAVK\n-DALALLSLAARPDDRQSDEAFRADFSHLRQFRVIL\n->gi|384055880|ref|YP_005485504.1| DNA helicase RecD/TraA [Acetobacter pasteurianus IFO 3283-22]\n-MTSAVVGEQCQTEALAGLVERVTFHNAENGFCVLRVKVRGQRDLVTVVGHAAMISAGEFVQMSGRWFNDH\n-THGLQFKAEFLKASPPTTVEGIERYLGSGMIRGIGPVYAKKLVKAFGEAVFDLIEQEPHRLREVTGIGPK\n-RAERIVGGWADQKVIREIMLFLHSNGVGTSRAVRIFKTYGQDAVRLISENPYRLAKDIRGIGFKTADQIA\n-RKMGIAPDAMIRVRAGISYALGEAMDEGHCGLPVGELLTSTAELLEVAAPLIETALALELEAGDVVADSV\n-GETSCIFLAGLYRAEQSIAERLRACAVGRPPWPEIDAEKAMTWVEGKTGLAMAPSQQEAVRLALRSKVLV\n-ITGGPGVGKTTLVNAILKIVTAKGTDVQLCAPTGRAAKRLSESTGLEGKTIHRLLETDPGNGSFKRDDTN\n-PLTCDLLVVDEASMVDVLLMRSLLRALPDSASLLIVGDVDQLPSVGPGQVLADIIGSDAVPVVRLTEVFR\n-QAAQSRIITNAHRINEGKMPELSAEEGSDFYFVEAAEPEVGLRKLLAVVKDRIPARFGLDPVRDVQVLCP\n-MNRGGLGARSLNIELQQALNPAGDVKVERFGWTYGPGDKVMQIANDYDRDVFNGDLGVIDKIDVEEGELT\n-VLFDGREVVYGFGELDELVLAYATTIHKSQGSEYPVVVIPLVTQHYTMLARNLLYTGVTRGRKLVVLVGQ\n-KKALAIAVRNQGGRLRWSKLRDWLVGTSGTGHLSRLKKP\n->gi|384055881|ref|YP_005485505.1| phage integrase [Acetobacter pasteurianus IFO 3283-22]\n-MVESQVSHIQPEYKFHINLDEYDRRATLSADELKVVRRWKEENLVITKRQAPRLHKPLTDILYRSNLDRA\n-NSHRALKYLLLTVAHQEKPYWGWSEDLWVEIINNSPVLKKTGMVPQLIAVAYLLCGFRSVYKIQRNVATA\n-VVARLVFGAEIVDTECERLFSALTRVGFVCQTVRPLVPSVFAAVALQGENPKLESFDRKILEHTRECYTG\n-NHIAKRIGILSNGLAAMGLTSKVIHFRAYPPRHGTETDNINPEWMTWCRRWLETTTLREGSRRAVYNTLT\n-RIGIWLGREHPEVTGPEQWTVSVCADYLAAVDRLRVGDWGGSTFDYRLIPTVGQPLQAPTKVAYYQVMRR\n-FLSDIQSWEWARLRCNPRYHLSTPKNIAKYLGVNPRTIDDASWLKLTWASLNIEPDDLSPDCFYPFALLQ\n-AIAVVWTHAGLRSNEIARLRVGCTREQSEDVVDQSGNVVPAGQVCWLDVPEGKTSVAYTKPVGHAVHKYI\n-TAWMKKRASPRKHLDRRTGEHVHFLFQLRNRPIAKEVLNQTVIPLLCKKAGIPIEDSKGRITSHRGRASA\n-VSMLASVPQGMTIFDLAKWCGHTSVQSTMSYVRSKPTQLASAFAKADQAARMIEIVIDNEVIAAGATKDG\n-APWKYYDLGDSYCSNAFWSTCPHRMACARCYFNIPKPSAKGVVLAAQQAANRLLEEVWLSPEERDAVSGD\n-VEALEGMLNKLRDKPALDGRTPGEISATCGSQVSSPFTESE\n->gi|384055882|ref|YP_005485506.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MELGITPGQDADITQAEPLLENIEPDAFLADKAYDADRLIDRLIQRGITPVIPPKRNRTTRRVIPP\n'