Previous changeset 20:4e05b3bf3e3e (2015-02-21) Next changeset 26:6a858e304888 (2015-02-25) |
Commit message:
Deleted selected files |
removed:
fasta2rdf/fastatordf.py fasta2rdf/fastatordf.xml fasta2rdf/test-data/NC_017117.fna gbk2rdf/gbktordf.py gbk2rdf/gbktordf.xml gbk2rdf/test-data/CP009049.embl gbk2rdf/test-data/NC_010067.gbk protein2rdf/protein_to_ttl.py protein2rdf/protein_to_ttl.xml protein2rdf/test-data/NC_017117.faa |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/fastatordf.py --- a/fasta2rdf/fastatordf.py Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,97 +0,0 @@ -#!/usr/bin/env python3.4 -# Author: Jasper Jan Koehorst -# Date created: Jan 22 2015 -# Function: generation of a RDF file from a genome fasta file - - -# from io import StringIO -from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin -# import rdflib -from rdflib.store import Store -import sys - -store = plugin.get('IOMemory', Store)() - -global URI -URI = "http://csb.wur.nl/genome/" -global seeAlso -seeAlso = "rdfs:seeAlso" -global coreURI -coreURI = Namespace(URI) -global genomeGraph -store = plugin.get('IOMemory', Store)() -genomeGraph = Graph(store,URIRef(URI)) -genomeGraph.bind("ssb",coreURI) - -def delete_galaxy(): - for index, path in enumerate(sys.path): - if "galaxy-dist/" in path: - sys.path[index] = '' - -def createClass(uri): - genomeGraph.add((uri,RDF.type,OWL.Class)) - genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) - return uri - -def fasta_parser(input_file): - createClass(coreURI["Genome"]) #Genome class - createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) - - genomeDict = {} - - sequence = "" - genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") - if genomeID == 'None': - genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") - - genomeURI = coreURI[genomeID] - for index, element in enumerate(sys.argv): - if '-organism' == element: - genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) - if '-ncbi_taxid' == element: - genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) - if '-idtag' == element: - genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) - if '-ids' == element: - genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) - - genomeDict[genomeID] = {} - - #Generating genome dictionary - data = open(input_file).readlines() - fastadict = {} - key = "" - for index, line in enumerate(data): - if ">" == line[0]: - key = line.strip(">").strip() - fastadict[key] = "" - else: - fastadict[key] += line.strip() - - genomeClass = createClass(coreURI["Genome"]) - typeClass = createClass(coreURI["DnaObject"]) - for index, genome in enumerate(fastadict): - typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] - sequence = fastadict[genome] - genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) - genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) - genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) - genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((genomeURI, RDF.type,genomeClass)) - genomeGraph.add((typeURI, RDF.type,typeClass)) - -def save(): - data = genomeGraph.serialize(format='turtle') - open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) - -def main(): - input_file = sys.argv[sys.argv.index("-input")+1] - fasta_parser(input_file) - save() - -if __name__ == '__main__': - #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. - delete_galaxy() - main() - |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/fastatordf.xml --- a/fasta2rdf/fastatordf.xml Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,38 +0,0 @@ -<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1"> - <requirements> - <requirement type='package' version="3.4">python</requirement> - <requirement type='package' version="1.0">rdflib</requirement> - </requirements> - <description></description> - <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP - #for $index, $id in enumerate( $ids ) - '-ids' '$id.id_tag' - #end for - '-id_alternative' '$input.name' - </command> - <inputs> - <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/> - <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/> - <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/> - <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/> - <repeat name="ids" title="Identification tags"> - <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/> - </repeat> - </inputs> - - <outputs> - <data format="rdf" name="output" label="genomeTTL: ${input.name}" /> - </outputs> - - <tests> - <test> - <param name="input" value="test-data/NC_017117.fna"/> - <output name="$output" file="NC_017117.rdf"/> - <output name="$ncbi_taxid" value="634455"/> - <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/> - <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/> - </test> - </tests> - -<help> Genome FASTA file to RDF</help> -</tool> |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d fasta2rdf/test-data/NC_017117.fna --- a/fasta2rdf/test-data/NC_017117.fna Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,2736 +0,0 @@\n->gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n-CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n-TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n-GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n-CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n-GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n-TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n-TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n-GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n-TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n-AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n-CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n-GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n-TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n-GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n-TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n-CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n-AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n-AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n-CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n-GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n-GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n-GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n-CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n-CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n-TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n-AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTGACCTGCAGAGCTTCCGGCGCGGGCGC\n-GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n-TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n-CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n-GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n-ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n-GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n-TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n-CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n-GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n-CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n-CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n-ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n-AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n-AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n-GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n-GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n-CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n-GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n-ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n-GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n-GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n-GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n-TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n-CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n-GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n-GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n-TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n-GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n-AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n-ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n-GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n-AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n-GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n-TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n-CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n-GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n-ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n-TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n-TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n-TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n-GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n-TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n-TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n-GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n-TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n-ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n-CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n-GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n-GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n-TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n-ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n-AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n-AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n-AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTTCCCATTTTTTGGGGAGACAG\n-ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n-GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n-GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n-GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n-ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n-CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n-AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n-GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n-ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n-TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n-GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n-AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n-AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n-GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n-ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n-GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n-GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n-GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n-AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n-GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n-TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n-CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n-ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n-TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n-ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n-TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n-CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n-GCACGTCCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n-ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n' |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/gbktordf.py --- a/gbk2rdf/gbktordf.py Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,360 +0,0 @@\n-#!/usr/bin/env python3.4\n-# Author: Jasper Jan Koehorst\n-# Date created: Feb 21 2015\n-# Function: generation of a RDF file from Genbank/EMBL\n-\n-import warnings\n-warnings.filterwarnings("ignore")\n-\n-def delete_galaxy():\n-\timport sys\n-\tfor index, path in enumerate(sys.path):\n-\t\tif "galaxy-dist/" in path:\n-\t\t\tsys.path[index] = \'\'\n-\n-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.\n-delete_galaxy()\n-\n-from Bio import SeqIO\n-# Import RDFLib\'s default Graph implementation.\n-import os, sys\n-from Bio.Seq import Seq\n-\n-from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin\n-from rdflib.store import Store\n-import hashlib\n-store = plugin.get(\'IOMemory\', Store)()\n-\n-global URI\n-URI = "http://csb.wur.nl/genome/"\n-global seeAlso\n-seeAlso = "rdfs:seeAlso"\n-global coreURI\n-coreURI = Namespace(URI)\n-\n-global SubClassOfDict\n-SubClassOfDict = {}\n-global SubClassOfDictRna\n-SubClassOfDictRna = {}\n-\n-def createClass(uri, root=True):\n-\tgenomeGraph.add((uri,RDF.type,OWL.Class))\n-\tif root:\n-\t\tgenomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))\n-\treturn uri\n-\n-def tmp():\n-\timport time\n-\tglobal tmpFolder\n-\ttmpFolder = "/tmp/"+str(time.time())+"/"\n-\tos.mkdir(tmpFolder)\n-\n-def cleantmp():\n-\tos.system("ls "+tmpFolder)\n-\tos.system("rm -rf "+tmpFolder)\n-\n-def crawler():\n-\t#From input folder it looks for GBK file (gz files are in progress)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\tgbk_parser(input_file)\n-\n-def gbk_parser():\n-\tprevObjStart = -1\n-\tprevObjStop = -1\t\n-\tstore = plugin.get(\'IOMemory\', Store)()\n-\tglobal genomeGraph\n-\tgenomeGraph = Graph(store,URIRef(URI))\n-\tgenomeGraph.bind("ssb",coreURI)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\n-\t#CLASS definitions\n-\tgenomeClass = createClass(coreURI["Genome"], root=True)\n-\ttypeClass = createClass(coreURI["DnaObject"], root=True)\n-\tcreateClass(coreURI["Protein"], root=True)\n-\tpubmedClass = createClass(coreURI["Pubmed"], root=True)\n-\tmiscClass = createClass(coreURI["MiscFeature"], root=False)\n-\tcreateClass(coreURI["Feature"], root=True)\n-\tSubClassOfDict["MiscFeature"] = 1\n-\tSubClassOfDictRna["Trna"] = 1\n-\tSubClassOfDictRna["Rrna"] = 1\n-\tSubClassOfDictRna["Tmrna"] = 1\n-\tSubClassOfDictRna["Ncrna"] = 1\n-\n-# \tcodon = "11" #Default initialization if no CDS are present\n-\t##################\n-\tweird_chars = list(\'\'\',./?<>:;"\'|\\}]{[+=_-)(*&^%$#@!\xc2\xb1\xc2\xa7~` \'\'\')\n-\tscaf_value = 0\n-\t#Which files are already done\n-\t########\n-\tformatGBK = sys.argv[sys.argv.index("-format")+1]\n-\tfor record in SeqIO.parse(input_file, formatGBK):\n-\t\t#Read first feature for genome name and information...\n-\t\t#Ignore the empty GBK file due to the lack of features?\n-\n-\t\tfor index, feature in enumerate(record.features):\n-\t\t\tif index == 0:\n-\t\t\t\tif "-identifier" in sys.argv:\n-\t\t\t\t\tgenome = sys.argv[sys.argv.index("-identifier")+1]\n-\t\t\t\telse:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgenome = feature.qualifiers["organism"][0].replace(" ","_")\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\t#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS\n-\t\t\t\t\t\tgenome = "XNoneX"\n-\t\t\t\tfor char in weird_chars:\n-\t\t\t\t\tgenome = genome.replace(char,"_")\n-\n-\t\t\t\ttry:\n-\t\t\t\t\tgi = record.annotations["gi"]\n-\t\t\t\t\ttyp = str(gi)\n-\t\t\t\texcept:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgi = record.annotations["accessions"][0]\n-\t\t\t\t\t\ttyp = str(gi)\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\tscaf_value += 1\n-\t\t\t\t\t\ttyp = "scaffold_"+str(scaf_value)\n-\t\t\t\tgenomeURI = coreURI[genome]\n-\t\t\t\tgbkURI = coreURI[genome + "/" + typ]\n-\t\t\t\t#To contig connection to connect all data to it\n-\t\t\t\tgenomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))\n-\n-\t\t\t\t#General genome features also stored in the class...\n-\t\t\t\tif "genome" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))\n-\t\t\t\tif "strain" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))\n-\t\t\t\tif "taxonomy" in record.annotations:\n-\t\t\t\t\tfo'..b'a" and feature_type.lower() != "ncrna":\n-\t\t\tSubClassOfDict[feature_type.lower().title()] = 1\n-\tfor key in feature.qualifiers:\n-\t\tvalues = feature.qualifiers[key]\n-\t\tif key == "translation":\n-\t\t\tpass\n-\t\telif type(values) == list:\n-\t\t\tfor v in values:\n-\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\telse:\n-\t\t\tint_add(generalURI,coreURI[key.lower()],values)\n-\tif feature.type == "CDS":\n-\t\ttry:\n-\t\t\t#Feature is normally submitted to this function\n-\t\t\t#IF a subfeature is submitted it is submitted as a feature\n-\t\t\t#And subfeature variable will contain the superfeature\n-\t\t\tif superfeature:\n-\t\t\t\tcodon = superfeature.qualifiers["transl_table"][0]\n-\t\texcept:\n-\t\t\t#Default codon table 11\n-\t\t\tcodon = "11"\n-\t\t#Protein linkage\n-\t\ttranslation = ""\n-\t\ttry:\n-\t\t\ttranslation = feature.qualifiers["translation"][0].strip("*")\n-\t\texcept KeyError:\n-\t\t\t#When protein sequence is not given...\n-\t\t\tif len(feature.location.parts) > 1:\n-\t\t\t\t#Exon boundaries?\n-\t\t\t\tseq = \'\'\n-\t\t\t\tfor loc in feature.location:\n-\t\t\t\t\tseq += record.seq[loc]\n-\t\t\t\tif int(feature.location.strand) == -1:\n-\t\t\t\t\tseq = Seq(seq).complement()\n-\t\t\t\telse:\n-\t\t\t\t\tseq = Seq(seq)\n-\t\t\t\ttranslation = str(seq.translate(feature.qualifiers["transl_table"][0]))\n-\t\t\telif int(feature.location.strand) == -1:\n-\t\t\t\tif str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:\n-\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\telif int(feature.location.strand) == +1:\n-\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\t\n-\t\t\tif translation:\n-\t\t\t\ttranslation = list(translation)\n-\t\t\t\ttranslation[0] = "M"\n-\t\t\t\ttranslation = \'\'.join(translation).strip("*")\n-\t\t\t\tif "*" in translation:\n-\t\t\t\t\tpass\t\t\n-\n-\t\ttranslation = translation.encode(\'utf-8\')\n-\t\tmd5_protein = hashlib.md5(translation).hexdigest()\n-\t\tproteinURI = coreURI["protein/"+md5_protein]\n-\t\tgenomeGraph.add((generalURI,coreURI["protein"],proteinURI))\n-\t\tfor key in feature.qualifiers:\n-\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\tif key == "translation":\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,RDF.type,proteinClass))\n-\t\t\t\telse:\n-\t\t\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\n-def int_add(subject, predicate, obj):\n-\ttry:\n-\t\tobject_float = float(obj.replace(\'"\',\'\'))\n-\t\tobject_int = int(obj.replace(\'"\',\'\'))\n-\t\tif object_int == object_float:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_int)))\n-\t\telse:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_float)))\n-\texcept:\n-\t\tgenomeGraph.add((subject,predicate,Literal(obj.replace(\'"\',\'\'))))\n-\t\t\t\t\n-def save():\n-\tdata = genomeGraph.serialize(format=\'turtle\')\n-\topen(sys.argv[sys.argv.index("-output")+1],"wb").write(data)\n-\n-def subClassOfBuilder():\n-\tfor subclass in SubClassOfDict:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))\n-\n-def subClassOfBuilderRna():\n-\tfor subclass in SubClassOfDictRna:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))\n-\n-def main():\n-\ttmp()\n-\tgbk_parser()\n-\tsubClassOfBuilder()\n-\tsubClassOfBuilderRna()\n-\tsave()\n-\tcleantmp()\n-\n-if __name__ == "__main__":\n-\tmain()\n\\ No newline at end of file\n' |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/gbktordf.xml --- a/gbk2rdf/gbktordf.xml Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,38 +0,0 @@ -<tool id="SAPP_genbank_to_ttl" name="EMBL/GBK to RDF" version="0.1"> - <requirements> - <requirement type='package' version="3.4">python</requirement> - <requirement type='package' version="1.0">rdflib</requirement> - </requirements> - <description>Genbank to RDF conversion</description> - <command interpreter="python3.4">gbktordf.py '-input' '$input' -output '$output' -sourcedb "$format" -format "$format"</command> - <inputs> - <param name="input" type="data" format="gbk,gb,genbank,embl" label="Genbank file"/> - <param name="format" type="select" label="EMBL/GBK"> - <option value="genbank" selected="true"> Genbank</option> - <option value="embl"> EMBL </option> - </param> - </inputs> - - <outputs> - <data format="rdf" name="output" label="GBKttl: ${input.name}" /> - </outputs> - - <tests> - <test> - <param name="input" value="test-data/NC_010067.gbk"/> - <output name="$output" file="NC_010067.rdf"/> - <output name="$format" value="genbank"/> - <output name="$sourcedb" value="genbank"/> - </test> - <test> - <param name="input" value="test-data/CP009049.embl"/> - <output name="$output" file="CP009049.rdf"/> - <output name="$format" value="embl"/> - <output name="$sourcedb" value="embl"/> - </test> - </tests> - - <help> - Genbank or EMBL to RDF conversion - </help> -</tool> |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/test-data/CP009049.embl --- a/gbk2rdf/test-data/CP009049.embl Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,157312 +0,0 @@\n-ID CP009049; SV 1; circular; genomic DNA; STD; PRO; 4599018 BP.\n-XX\n-AC CP009049;\n-XX\n-PR Project:PRJNA255737;\n-XX\n-DT 13-FEB-2015 (Rel. 123, Created)\n-DT 13-FEB-2015 (Rel. 123, Last updated, Version 1)\n-XX\n-DE Salmonella enterica subsp. enterica serovar Paratyphi A strain CMCC 50973,\n-DE complete genome.\n-XX\n-KW .\n-XX\n-OS Salmonella enterica subsp. enterica serovar Paratyphi A\n-OC Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n-OC Enterobacteriaceae; Salmonella.\n-XX\n-RN [1]\n-RP 1-4599018\n-RA Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT "Whole Genome Sequences of two Salmonella paratyphi A strains";\n-RL Unpublished.\n-XX\n-RN [2]\n-RP 1-4599018\n-RA Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT ;\n-RL Submitted (24-JUL-2014) to the INSDC.\n-RL State Key Laboratory of Pathogen and Biosecurity, Beijing Institute of\n-RL Biotechnology, 20 Dongdajie, Fengtai District, Beijing, Beijing 100071,\n-RL China\n-XX\n-DR MD5; e41a6215bf412b701febd8d4b182ec0c.\n-DR BioSample; SAMN02909989.\n-XX\n-CC Source DNA/bacteria are available from National Center for Medical\n-CC Culture Collection (CMCC) in China.\n-CC Annotation was added by the NCBI Prokaryotic Genome Annotation\n-CC Pipeline (released 2013). Information about the Pipeline can be\n-CC found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/\n-CC ##Genome-Assembly-Data-START##\n-CC Assembly Method :: SOAPdenovo v. 2011.04\n-CC Assembly Name :: CMCC(B) 50973\n-CC Genome Coverage :: 133x\n-CC Sequencing Technology :: Illumina\n-CC ##Genome-Assembly-Data-END##\n-CC ##Genome-Annotation-Data-START##\n-CC Annotation Provider :: NCBI\n-CC Annotation Date :: 07/25/2014 13:43:31\n-CC Annotation Pipeline :: NCBI Prokaryotic Genome Annotation\n-CC Pipeline\n-CC Annotation Method :: Best-placed reference protein set;\n-CC GeneMarkS+\n-CC Annotation Software revision :: 2.6 (rev. 440435)\n-CC Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA;\n-CC repeat_region\n-CC Genes :: 4,309\n-CC CDS :: 4,016\n-CC Pseudo Genes :: 166\n-CC CRISPR Arrays :: 2\n-CC rRNAs :: 20 ( 5S, 16S, 23S )\n-CC tRNAs :: 100\n-CC ncRNA :: 7\n-CC Frameshifted Genes :: 106\n-CC ##Genome-Annotation-Data-END##\n-XX\n-FH Key Location/Qualifiers\n-FH\n-FT source 1..4599018\n-FT /organism="Salmonella enterica subsp. enterica serovar\n-FT Paratyphi A"\n-FT /host="Homo sapiens"\n-FT /sub_species="enterica"\n-FT /strain="CMCC 50973"\n-FT /mol_type="genomic DNA"\n-FT /country="China:Jiangsu"\n-FT /lat_lon="32.04 N 118.78 E"\n-FT /collection_date="2003-06-01"\n-FT /serovar="Paratyphi A"\n-FT /db_xref="taxon:54388"\n-FT /culture_collection="CMCC:50973"\n-FT gene complement(129..713)\n-FT /gene="mobA"\n-FT /locus_tag="IT63_00010"\n-FT CDS complement(129..713)\n-FT /codon_start=1\n-FT /transl_table=11\n-FT /gene="mobA"\n-FT /locus_tag="IT63_00010"\n-FT /product="molybdopterin-guanine dinucleotide biosynthesis\n-FT protein MobA"\n-FT /note="in Escherichia coli MobA links a guanosine\n-FT 5\'-phosphate to molydopterin to form molybdopterin guanine\n-FT dinucleotide during molybdenum cofactor biosynthesis;\n-FT Derived by automated c'..b'cgag cgaacgggga ggagcccaga gcctgaatca gcatgtgtgt 4596180\n- tagtggaagc gtctggaaag gcgcgcgata cagggtgaca gccccgtaca caaaagcgca 4596240\n- tgtgctgtga gctcgatgag tagggcggga cacgtggtat cctgtctgaa tatgggggga 4596300\n- ccatcctcca aggctaaata ctaattttgc tctttaaaaa tctggatcaa gctgaaaatt 4596360\n- gaaacacaga acaacgaaag ttgttcgtga gtctctcaaa ttttcgcaac acgatgatga 4596420\n- atcgtaagaa acatcttcgg gttgtgaggt taagcgacta agcgtacacg gtggatgccc 4596480\n- tggcagtcag aggcgatgaa ggacgtgcta atctgcgata agcgccggta aggtgatatg 4596540\n- aaccgttata accggcgatt tccgaatggg gaaacccagt gtgattcgtc acactatcat 4596600\n- taactgaatc cataggttaa tgaggcgaac cgggggaact gaaacatcta agtaccccga 4596660\n- ggaaaagaaa tcaaccgaga ttcccccagt agcggcgagc gaacggggag gagcccagag 4596720\n- cctgaatcag catgtgtgtt agtggaagcg tctggaaagg cgcgcgatac agggtgacag 4596780\n- ccccgtacac aaaagcgcat gtgctgtgag ctcgatgagt agggcgggac acgtggtatc 4596840\n- ctgtctgaat atggggggac catcctccaa ggctaaatac tcctgactga ccgatagtga 4596900\n- accagtaccg tgagggaaag gcgaaaagaa ccccggcgag gggagtgaaa aagaacctga 4596960\n- aaccgtgtac gtacaagcag tgggagcaca ggtttacctg tgtgactgcg taccttttgt 4597020\n- ataatgggtc agcgacttat attctgtagc aaggttaacc gtatagggga gccggaggga 4597080\n- aaccgagtct taaccgggcg ttaagttgca gggtatagac ccgaaacccg gtgatctagc 4597140\n- catgggcagg ttgaaggttg ggtaacacta actggaggac cgaaccgact aatgttgaaa 4597200\n- aattagcgga tgacctgtgg ctgggggtga aaggccaatc aaaccgggag atagctggtt 4597260\n- ctccccgaaa gctatttagg tagcgcctcg tgaattcatc tccgggggta gagcactgtt 4597320\n- tcggctaggg ggccatcccg gcttaccaac ccgatgcaaa ctgcgaatac cggagaatgt 4597380\n- tatcacggga gacacacggc gggtgctaac gtccgtcgtg aagagggaaa caacccagac 4597440\n- cgccagctaa ggtcccaaag tcatggttaa gtgggaaacg atgtgggaag gcccagacag 4597500\n- ccaggatgtt ggcttagaag cagccatcat ttaaagaaag cgtaatagct cactggtcga 4597560\n- gtcggcctgc gcggaagatg taacggggct aaaccatgca ccgaagctgc ggcagcgaca 4597620\n- ctcaggtgtt gttgggtagg ggagcgttct gtaagcctgt gaaggtggcc tgtgagggtt 4597680\n- gctggaggta tcagaagtgc gaatgctgac ataagtaacg ataaagcggg tgaaaagccc 4597740\n- gctcgccgga agaccaaggg ttcctgtcca acgttaatcg gggcagggtg agtcgacccc 4597800\n- taaggcgagg ccgaaaggcg tagtcgatgg gaaacgggtt aatattcccg tacttggtgt 4597860\n- tactgcgaag ggggggacgg agaaggctat gttggccggg cgacggttgt cccggtttaa 4597920\n- gcgtgtaggt gtgtgttcca ggtaaatccg gttcacttta acactgaggc gtgacgacga 4597980\n- ggcactacgg tgctgaagca acaaatgccc tgcttccagg aaaagcctct aagcatcagg 4598040\n- taacatcaaa tcgtacccca aaccgacaca ggtggtcagg tagagaatac caaggcgctt 4598100\n- gagagaactc gggtgaagga actaggcaaa atggtgccgt aacttcggga gaaggcacgc 4598160\n- tgacacgtag gtgaagtgat ttactcatgg agctgaagtc agtcgaagat accagctggc 4598220\n- tgcaactgtt tattaaaaac acagcactgt gcaaacacga aagtggacgt atacggtgtg 4598280\n- acgcctgccc ggtgccggaa ggttaattga tggggtcagc gcaagcgaag ctcctgatcg 4598340\n- aagccccggt aaacggcggc cgtaactata acggtcctaa ggtagcgaaa ttccttgtcg 4598400\n- ggtaagttcc gacctgcacg aatggcgtaa tgatggccag gctgtctcca cccgagactc 4598460\n- agtgaaattg aactcgctgt gaagatgcag tgtacccgcg gcaagacgga aagaccccgt 4598520\n- gaacctttac tatagcttga cactgaacat tgagccttga tgtgtaggat aggtgggagg 4598580\n- ctttgaagtg tggacgccag tctgcatgga gccgaccttg aaataccacc ctttaatgtt 4598640\n- tgatgttcta acgtggaccc gttacccggg ttgcggacag tgtctggtgg gtagtttgac 4598700\n- tggggcggtc tcctcctaaa gagtaacgga ggagcacgaa ggttggctaa tcctggtcgg 4598760\n- acatcaggag gttagtgcaa tggcataagc cagcttgact gcgagcgtga cggcgcgagc 4598820\n- aggtgcgaaa gcaggtcata gtgatccggt ggttctgaat ggaagggcca tcgctcaacg 4598880\n- gataaaaggt actccgggga taacaggctg ataccgccca agagttcata tcgacggcgg 4598940\n- tgtttggcac ctcgatgtcg gctcatccca tcccggggct gaagtaggtc ccaagggtat 4599000\n- ggctgttcgc catttaaa 4599018\n-//\n' |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d gbk2rdf/test-data/NC_010067.gbk --- a/gbk2rdf/test-data/NC_010067.gbk Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b"@@ -1,259779 +0,0 @@\n-LOCUS NC_010067 4600800 bp DNA circular CON 20-AUG-2013\n-DEFINITION Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n- RSK2980 chromosome, complete genome.\n-ACCESSION NC_010067\n-VERSION NC_010067.1 GI:161501984\n-DBLINK Project: 58191\n- BioProject: PRJNA58191\n-KEYWORDS .\n-SOURCE Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n- RSK2980\n- ORGANISM Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n- RSK2980\n- Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n- Enterobacteriaceae; Salmonella.\n-REFERENCE 1 (bases 1 to 4600800)\n- CONSRTM NCBI Genome Project\n- TITLE Direct Submission\n- JOURNAL Submitted (03-DEC-2007) National Center for Biotechnology\n- Information, NIH, Bethesda, MD 20894, USA\n-REFERENCE 2 (bases 1 to 4600800)\n- AUTHORS McClelland,M., Sanderson,E.K., Porwollik,S., Spieth,J.,\n- Clifton,W.S., Fulton,R., Chunyan,W., Wollam,A., Shah,N., Pepin,K.,\n- Bhonagiri,V., Nash,W., Johnson,M., Thiruvilangam,P. and Wilson,R.\n- CONSRTM The Salmonella enterica serovar Arizonae Genome Sequencing Project\n- TITLE Direct Submission\n- JOURNAL Submitted (02-NOV-2007) Genetics, Genome Sequencing Center, 4444\n- Forest Park Parkway, St. Louis, MO 63108, USA\n-COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final\n- NCBI review. The reference sequence was derived from CP000880.\n- Salmonella enterica subspecies IIIa (Arizonae) serovar\n- 62:z4,z23:--Most bacteria in the species S. enterica belong to one\n- of seven subspecies; all but subspecies I normally grow only in\n- cold-blooded animals. Subspecies IIIa (S. Arizonae) is naturally\n- found in reptiles, but also causes outbreaks of salmonellosis in\n- turkeys and sheep and can occasionally produce both gastroenteritis\n- and serious disseminated disease in humans. Many human infections\n- can be traced to contact with reptiles or ingestion of various\n- reptile products, particularly from rattlesnakes. Fewer than ten\n- cases in humans are typically reported in the US each year.\n- \n- The strain of S. Arizonae (62:z4,z23:-) being sequenced is\n- CDC346-86; it was named RSK2980 by R.K. Selander and is strain\n- SARC5 of the Salmonella Reference C set. This serovar is of\n- interest because of its taxonomic position. It appears to be the\n- most divergent subspecies among the S. enterica. It can be obtained\n- from the American Type Culture Collection as ATCC BAA-731, or the\n- Salmonella Genetic Stock Centre as SGSC4693. The genome was\n- sequenced to 8X coverage, using plasmid and fosmid libraries and\n- was finished to an error rate of less than 1 per 10,000 bases.\n- Automated annotation was performed and manual annotation will\n- continue in the labs of Michael McClelland and Kenneth Sanderson.\n- The National Institute of Allergy and Infectious Diseases (NIAID),\n- National Institutes of Health (NIH) has funded this project.\n- \n- Coding sequences below are predicted using GeneMark v3.3 and\n- Glimmer2 v2.13.Intergenic regions not spanned by GeneMark and\n- Glimmer2 were blasted against NCBI's non-redundant (NR) database\n- and predictions generated based on protein alignments. RNA genes\n- were determined using tRNAscan-SE 1.23 or Rfam v8.0. This sequence\n- was finished as follows unless otherwise noted: all regions were\n- double stranded, sequenced with an alternate chemistries or covered\n- by high quality data(i.e., phred quality >=30);an attempt was made\n- "..b'1 acccgtcatc gtatcgtcct tgccgcaacg cttgcggaat ttcttacaca acttaatcct\n- 4597741 cttctgtaat cgtttgccct gacaggtgtg agagatctct tacaaggtct gtaggagatc\n- 4597801 gccaggatat cagagaatac ttagctacga ctttctcctg taaatatata taaatcaatc\n- 4597861 tattaaaata ttatttcgca ctttcatata caaatttact taaggtatcg tctgtaagcg\n- 4597921 tcttgtaaga caaggtgaaa caggcgattc tatattcatc gacagggagt cgtacaacga\n- 4597981 agcgaacgtc aggaagatgg cgcttctgca ggacacgcca ggagggcgtt acatggaaag\n- 4598041 gcttcaggat gaggcaaagt ggaaagcgca ggatgcgtta aaggacacct ccaggacgga\n- 4598101 gaacgagagc cgattaggat ggtcggcggg tctggatgac cagggacgct tcgggatgaa\n- 4598161 gctatcacat cggggcgatg tgcgcaggat gcaaacgttc aggatgagca ggccgcaggg\n- 4598221 tcacaggaaa agttgtcacg gatgagcagg gagcatgaaa agtagctgga atgctgcgaa\n- 4598281 acgaaccggg agcactgttt atacagtgct cccttttttt gttattcttc gcgccagatt\n- 4598341 tccattattg aggttcttaa catgacgact catgaccgtg tgcgtcagca gttacatgcg\n- 4598401 cttgaaacgc tgctgcgtga gcatcatcac tggcggctgg atgcgccgca ggcgcacctg\n- 4598461 tttaccagca cgcagccgtt ttgtatggat accatggaac cgctggaatg gctgcaatgg\n- 4598521 gtattgatcc cgcgtatgca taccctgctt gataatgcgc agccgttacc tgaggcgttt\n- 4598581 gccgtcgccc cttattatga aatggcgctg acggcggatt atccgcagcg ggaagcgatc\n- 4598641 ctgacggttt tgcaggatct ggatgcgcta tttacccgcg ataaatcctg atgctggaga\n- 4598701 tcctctatca ggacgcgtgg ctggttgccg ttaataaacc tgcaggctgg cttgttcacc\n- 4598761 ggagctggct ggatcgcgac gaaaaagttg tggtcatgca aacggtgcgc gaccaaatcg\n- 4598821 gccagcatgt ttttaccgcc caccgtctcg acagacccac atcgggcgta ctactgatgg\n- 4598881 ggctgtccag cgaagcggga cgccgcctgg cgcagcagtt cgagcagcac catatccgta\n- 4598941 aacgttacca tgccatagtg cgcggctggc tgatggatga tgcgctactg gattatcctc\n- 4599001 tgctggaaga gcgcgataaa attgccgata agttcgcgcg tgaggataaa gcgccccagc\n- 4599061 cagccgtaac gcagtatcgc gggctggcga cggtcgaaat ggcagtgccg accgggcgtt\n- 4599121 atcccactac gcgttatggc ctggttgagc tggaaccgaa aacggggcgc aaacaccagc\n- 4599181 tccgccgtca tctggcgcat ctacgccatc ctatcatcgg cgacagtaaa cacggtgatt\n- 4599241 tgcggcaaaa ccgtagcgcg gcggaacatt ttgcttgtcg tcgcctgatg cttcatgcca\n- 4599301 gtcggcttga actgacgcat cccttcaccg gacagccatt aattattcag gccggactgg\n- 4599361 atgaaacctg gatgcaggcg ctaacacagt ttggctggcg gggacttctc cctgataatg\n- 4599421 aaagggttga gtttacgacg gcgtcccggc aggatgagtc ttatcagaca taattcaggg\n- 4599481 agatacgcat aatggcggaa attggtattt ttgtcggtac gatgtatggc aactcactgt\n- 4599541 tggtggcgga ggaagcggaa gcgatcctgg ccagacaggg ccatagcgcg actgtgtttg\n- 4599601 aagatcctga actgtccgac tggcggcaat atcaggacaa ggtggcattg gttgtcacct\n- 4599661 caacgaccgg acagggcgat ctaccggata gtattgcgcc gctctttcac ggtattaaag\n- 4599721 atacgttagg ttttcaacca aacctgcgtt acggggtgat tgcgttaggt gatagcagct\n- 4599781 accccaattt ctgtaatggc ggcaagcagt ttgatgccct gttgcaggag caaagcgcgc\n- 4599841 aacgggtggg ggaaatgtta ctcattgacg ccagcgaaca tccggagccg gagagccaat\n- 4599901 ccaatccctg ggtagaaaac tggggaacct tactttcctg aggtaaatcc ctccccctac\n- 4599961 cgggagggta ccttttcgtt tgattgcatt gccagtaagc aaaataacga cctgtatgta\n- 4600021 gtttaaagaa actgaatcgt gttagctttg tgcatatgcc tgcaaaagca gcagtttttt\n- 4600081 acgggcgttt tcatgtaatc aagcgacctg tttcacattc ttctcttttt attcctcctg\n- 4600141 cgtcgacgcc tgacgccttc tgatttcatt tccgtgaagt ggcttccact gtcctgggct\n- 4600201 tttgccacaa acaggcgtaa ttcattgcca aaatactgtg ttgttgcacg gtgagtgtgc\n- 4600261 gtgacgcgct ttttatactt ctcctgccag tgaataaaag aatgcagcat gcaaagcaaa\n- 4600321 cgacctaata aaagctgcaa caaggaaacg ttatctctga ttccctaccg gttgtgcagt\n- 4600381 tcagagtgag cgtagctaac gcgaaatttc aggagtgcaa caatgagttc attaagtcac\n- 4600441 gcggcgagta gtgcggagaa tcgcacgaac gcccgctact ggatagtggt gatgctgttt\n- 4600501 atcgtcacat cctttaacta tggcgatcgc gccacattgt ccattgccgg ctcagaaatg\n- 4600561 gccaaagata ttggtcttga cccggtaggc atgggctacg ttttctctgc gttttcatgg\n- 4600621 gcctatgtta tcggacagat ccctggcggc tggctgctgg accgctttgg ttccaaacgc\n- 4600681 gtctatttct ggtctatttt catctggtcg gtcttcaccc tgttgcaggg ttttgtcgat\n- 4600741 atttttagcg gtttcggcat tgttgtcgcc ctctttacgc ttcgtttcct ggtcggtctg\n-//\n' |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/protein_to_ttl.py --- a/protein2rdf/protein_to_ttl.py Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,130 +0,0 @@ -def delete_galaxy(): - import sys - for index, path in enumerate(sys.path): - if "galaxy-dist/" in path: - sys.path[index] = '' - -#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. -delete_galaxy() - -# from io import StringIO -from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin -# import rdflib -from rdflib.store import Store -import sys -import hashlib - -store = plugin.get('IOMemory', Store)() - -global URI -URI = "http://csb.wur.nl/genome/" -global seeAlso -seeAlso = "rdfs:seeAlso" -global coreURI -coreURI = Namespace(URI) - - -def createClass(uri): - genomeGraph.add((uri,RDF.type,OWL.Class)) - genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) - return uri - -def fasta_parser(input_file): - createClass(coreURI["Protein"]) - - genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") - if genome == '': - genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") - - genomeURI = coreURI[genome] - for index, element in enumerate(sys.argv): - if '-organism' == element: - genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) - if '-ncbi_taxid' == element: - genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) - if '-idtag' == element: - genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) - if '-diagnosis' == element: - genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1]))) - if '-country' == element: - genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1]))) - if '-location' == element: - genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1]))) - if '-date' == element: - genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1]))) - if '-ids' == element: - genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) - - - - data = (open(input_file).readlines()) - fastadict = {} - sequence = "" - key = "" - for index, line in enumerate(data): - if ">" == line[0]: - if sequence: - fastadict[key] = sequence - key = line - sequence = "" - fastadict[key] = "" - else: - sequence += line.strip() - fastadict[key] = sequence - - #Create a class, to be the same as all the other genome conversions... - #TODO: Proteins are part of cds, cds are part of dnaobject - #If CDS is not there... how then? - classURI = coreURI[genome + "/" + "protein_fasta"] - proteinClass = createClass(coreURI["Protein"]) - genomeClass = createClass(coreURI["Genome"]) - typeClass = createClass(coreURI["DnaObject"]) - cdsClass = createClass(coreURI["Cds"]) - #A theoretical begin, end is created to have a workable GBK generation - begin = 0 - end = 0 - genomeGraph.add((genomeURI, RDF.type, genomeClass)) - genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI)) - genomeGraph.add((classURI, RDF.type, typeClass)) - - for protein in fastadict: - sequence = fastadict[protein] - sequence = sequence.encode('utf-8') - end = begin + len(sequence) - md5_protein = hashlib.md5(sequence).hexdigest() - proteinURI = coreURI["protein/"+md5_protein] - - cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)] - genomeGraph.add((classURI, coreURI["feature"] , cdsURI)) - genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin))) - genomeGraph.add((cdsURI, coreURI["end"] , Literal(end))) - genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI)) - genomeGraph.add((cdsURI, RDF.type, cdsClass)) - - - - genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) - genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence))) - genomeGraph.add((proteinURI,RDF.type,proteinClass)) - genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((proteinURI, RDF.type, proteinClass)) - begin = end - -def save(): - data = genomeGraph.serialize(format='turtle') - open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) - -def main(): - store = plugin.get('IOMemory', Store)() - global genomeGraph - genomeGraph = Graph(store,URIRef(URI)) - genomeGraph.bind("ssb",coreURI) - input_file = sys.argv[sys.argv.index("-input")+1] - fasta_parser(input_file) - save() - -if __name__ == '__main__': - main() - |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/protein_to_ttl.xml --- a/protein2rdf/protein_to_ttl.xml Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,42 +0,0 @@ -<tool id="SAPP_protein_rdf" name="Protein FASTA to RDF" version="0.1"> - <requirements> - <requirement type='package' version="3.4">python</requirement> - <requirement type='package' version="1.0">rdflib</requirement> - </requirements> - <description></description> - <command interpreter="python3.4">protein_to_ttl.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' '-diagnosis' '$diagnosis' '-country' '$country' '-location' '$location' '-date' '$date' -sourcedb SAPP - #for $index, $id in enumerate( $ids ) - '-ids' '$id.id_tag' - #end for - '-id_alternative' '$input.name' - </command> - <inputs> - <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/> - <param size="60" name="organism" type="text" format="text" label="organism name"/> - <param size="60" name="diagnosis" type="text" format="text" label="Diagnosis of host if applicable"/> - <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/> - <param size="60" name="country" type="text" format="text" label="Country of sample"/> - <param size="60" name="location" type="text" format="text" label="Location of sample e.g., river, city, hospital"/> - <param size="60" name="date" type="text" format="text" label="Sample date"/> - <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!"/> - <repeat name="ids" title="Identification tags"> - <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/> - </repeat> - </inputs> - - <outputs> - <data format="rdf" name="output" label="proteinTTL: ${input.name}" /> - </outputs> - <tests> - <test> - <param name="input" value="test-data/NC_017117.faa"/> - <output name="$output" file="NC_017117.rdf"/> - <output name="$ncbi_taxid" value="634455"/> - <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/> - <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/> - </test> - </tests> - <help> - RDF creation from a multi protein fasta file - </help> -</tool> |
b |
diff -r 4e05b3bf3e3e -r bde695b3f97d protein2rdf/test-data/NC_017117.faa --- a/protein2rdf/test-data/NC_017117.faa Sat Feb 21 11:26:55 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,993 +0,0 @@\n->gi|384055706|ref|YP_005485330.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MKSDRFTDAQIMGVIRQAEGGVPVPDLCREHGISNATFYRWRAKYGGMDASMISQMKALEEENRRLKRMY\n-ADLSMQTDILKEALGKK\n->gi|384055707|ref|YP_005485331.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MAGHHVEAMIARAHAQKRFMDDAGWRYVVELYGRYQSLLREQNAADFGDLLMWPTLAMLHNDAYRYRWSR\n-RFTAVMADEFQDVNRAQFLWLKMISEVSAEFFAVGDDSQSIL\n->gi|384055708|ref|YP_005485332.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MVVGRNDCAKGRQMKDTVIGVDLAKNIFQVHGASRAGEVMFRKKLRRQQFMQFMATQPPALVVLEACGSA\n-HYWARELAGAGHEVRLIAPQYVKPFVKRQKNDAADAEAIVIAARQPEMRFVEPRTEAQQARGVLFRARQR\n-LVHQRTELVNALRAVLYEFGLVVPQGIAHIRHIEAMLDEAVLPEAVKQECLDLLRQISEQSVRIDVRTKK\n-IRMLAQESENTCRLQSMPGVGPLTALAIEAFAPDLQSFRRGRDFAAWLGLVPRQFSSGGKERLGKISKAG\n-QADIRRLLIMGAMTQVNWASRKAPAPGSWLARMLARKPRMLVAIALANRMARAIWAMATKQEDYRDPALS\n-VAA\n->gi|384055709|ref|YP_005485333.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MEQIIRIGMDTSKSVFQLHGVNAKEQPVLRRKLSRREMVKFFEKLPPIEIAIEACGASHYWGRVLSCLGH\n-TVKLIAPQLVKPYVKRGKNDAADAEALCEAMSRPTMRFVPLKSEEEQAALMLIGMRARLIRNRTQLANTI\n-RGYAAEFGITAPKGMCRIEALLDRIAADESLPTLTRELFALHAKEYAELQGEIEQLEGKVMAWHRANECS\n-QRLAKIPGVGPIGAALLMMKTPDPHLFKSGRAFAAWIGLTPRDHSTGGKTRLGRITRAGDEVLRSTLVVG\n-ATAVVSHARRTNGKNASSWLRELLERKKPKLAAVALANKIARIAWKLMVSGEHYKRLLQQPGAAAV\n->gi|384055710|ref|YP_005485334.1| DNA resolvase [Acetobacter pasteurianus IFO 3283-22]\n-MVPPKPGKTPVGGRLIGYARVSTDDQGTDAQLNELRDAGCTMIFEKHASGADRNRPVLIRLLRDMNAGDT\n-LVVVRLDRLARSVSHLLAVIEQLDYAGAHFRSLDDPIDTTTPQGMFSLQVLGAVAQLDADFFCDGVDGSQ\n-RHRDVPR\n->gi|384055711|ref|YP_005485335.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MLTSRIHRRKPMGKPMSKATARANAAKSSIRAHVEHVFAHQKNRFNLFIRTIGLARAEAKLTLCNLAYNF\n-NRLIFHERLETAG\n->gi|384055712|ref|YP_005485336.1| D-mannonate oxidoreductase [Acetobacter pasteurianus IFO 3283-22]\n-MNLNRNAISHVPDTVYTPRYDPALLRPGIVHLGCGNFHRGHQVVATQAAIDAEGRDGLRWGIVSATMRRP\n-DLATVLQSQDNLYTLLTREPANTVASVMAAITEAVYAGDDNANLAARIADPATAIVTLTVTASGYYLSAD\n-GRLDPTFEAIQADLTAITPRTAPGIIAAGLAQVRQRGGVPPVILCCDNVNSNGATLRQAVIDLAALKGDD\n-LLAAWIETNVQFPDTMVDRIVPTATPDDIADACRLLGGIEDRAPISAEPWFQWVIGEFDGPRPRWVAHPG\n-TKFVSDVGVFERAKLQMLNGTHMLLAYVGALANLNTVSEAASDDALGRIAARFMRNEQTADVSLDTDELD\n-RYTVDLMQRFRNPGIVHEVTRIGRNGSAKMASRIVQPMRSNIEAGRPVDGAVLLIASWIRWFALHEQDEF\n-DIALTDPRAETLRGLCADARDDHKAQAEAFLAMEEVFGAPLPDHGKQVEAIASMLRRLTEESVPELLRTI\n-AH\n->gi|384055713|ref|YP_005485337.1| phosphatase/phosphohexomutase [Acetobacter pasteurianus IFO 3283-22]\n-MTDTVFPAHLLKHKQEPVHGVVFDMDGLLLDSESLAMEALVFAARDLNYDIPMSFCRTMIGVPADGCRTM\n-VRKTYGQDFPLERFFELQEVHLRNFVDTGKLALKKGVLPLLDLLDTYKIPRAIATSSSRVRTDHHLKLVN\n-LFHRFNAIVTRDDVSKGKPDPEPYLTAAKKIGVNPAHALALEDSHSGARAAHAAGIRVIVVPDLLEATDE\n-IRGKALAIVQDLSIVEAYLKHAITGQA\n->gi|384055714|ref|YP_005485338.1| hypothetical protein APA22_40090 [Acetobacter pasteurianus IFO 3283-22]\n-MRRDMDLVRQLLLKLEGIEKGPHDVLLIGGNSEEVAVDGRTSDEIYFHLTKIEEAGFLERVGGGAMTAVT\n-FRALSWKGQEFLDTIRDDSIWKKTKEKAGSASFDILAAVAKAVIKDRIKSLTGLDIG\n->gi|384055715|ref|YP_005485339.1| hypothetical protein APA22_40100 [Acetobacter pasteurianus IFO 3283-22]\n-MRPLGSGLSVRTYGCSEADDQENDGWAKKDTGEIVALYEMSSPVMPSGLVSISRWKIKGCYPKSGLSRAM\n-LCPTKIPQSASNIALLIGSDWSFIEENVFCNHIEWQTCLPVFVMNLDHPA\n->gi|384055716|ref|YP_005485340.1| DNA helicase superfamily I [Acetobacter pasteurianus IFO 3283-22]\n-MSSKPSHHSVLSYWHSALLDDAQMKISFSRDNLVALDEEGFEKGKLPPDKTQALRKMHPASRDLAPDDSI\n-IAMAGIRILLGQVSHSTEHSKQPALFCMAMLVNVSPEGTIQPLKDAPPWINRELLEPSDGDVLIGDLATM\n-DTWLQLNPFEGGSLGKTLEWAEKLWNAVTGEDGLPDGYELWERVALQPAEASIGMIATLHQRRFYDTVLA\n-DTGLVTPLLARYIDGGPEPAVVDESQKWAAAGRARGTMTFAYGMSSSQSEAMTAFCSVKDGDILAVNGPP\n-GTGKTTLLQGIVATELVTRALEGGDPAVIVGTSTNNQAVTNIIDAMKKAMASKDSRPWARRWIEGADALG\n-LYFPSGEKEKEALKAGYLIASPGRGLGTMEWKGFPERERDTVDAWASRDAWINGYYGSFYPGVTPPLRKE\n-HLSGHGPQGARHDISLVEDGIAKIRARMKVLVETGRVCAGEARKLNQLYVASGYGTYPDITKAIAQREAL\n-LQERRPREDALKSDLKEKEAAAAVPRARINEENRKTRDLLKQRDDAVHAAGQKVEEVGAHAVALIAALPG\n-GGFFSNLMSGRNWANVERLVAEGRQGSFFRSLMQAQVKSKREWMDAINEMTASAERELATVRESREETRQ\n-ARDTLIQKLEREVAAADLVSKTARAEYDHYVGGSYVLAGRELEKLVTLKHQILQQLQDCCTAIETVLAPS\n-DWAAMFDMPEEKLPWRQSNWTGRLDVIEDFLDR'..b'DEVAPAV\n-RHLISQIQTTIA\n->gi|384055875|ref|YP_005485499.1| multidrug resistance transporter EmrB/QacA [Acetobacter pasteurianus IFO 3283-22]\n-MGTSMTSSRVTNPLFVLLAASTGCALTVLDTNVVAIILPTIAREFRASFADIEWVISTYVLCFASLLLPA\n-GAIADRYGRRRIYLIGITTFALTSLFCGAAPSATALYLARALQGVSAAFLLAPALAIIGHTFHNPDERNR\n-AWAIWGSIMGLTMVLAPIIGGIIAYALGWRWAFYINIPICVLLAGAVFILVKESRDTDARRLDPVGIIFF\n-AAFMFGLTWGMINGQASGWTSWNALNGFIGGSISLGIFIASERAQSRPMLDLGLFSNPRFLGAVWAMFAY\n-AASAQVMASMLPLFLQNGLGRSALQAGFAMLPFALAMLIFPHIGRLLERHISSSGILAGGLSCVAIGNGI\n-TAWGAYVGSWIIVMAGMVVIGSGGGLLNGETQKAIMSVVPKERSGMASGISTTSRFSGILLGFAMLSGIL\n-ATMVRKWVAAFGCGTGCHHPSDFADAIVAGDLPSAISGLEGSNQEIAIQHAHHAFSYGFAVALLVASIFA\n-LGSSITVFTLMQSKMKQNIT\n->gi|384055876|ref|YP_005485500.1| transposase, partial [Acetobacter pasteurianus IFO 3283-22]\n-MLAYAVMASVRYQANSLKPKKTQLRTRQSLSAGPFRRSGASS\n->gi|384055877|ref|YP_005485501.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVILVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGQQADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055878|ref|YP_005485502.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVIVVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGHKADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055879|ref|YP_005485503.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MLQFSYMSEEADAIAAEIGRRAASGCAWHDIAVIYRQNRLSRAIEEALIQARVPYEIVGDVGFYQRVAVK\n-DALALLSLAARPDDRQSDEAFRADFSHLRQFRVIL\n->gi|384055880|ref|YP_005485504.1| DNA helicase RecD/TraA [Acetobacter pasteurianus IFO 3283-22]\n-MTSAVVGEQCQTEALAGLVERVTFHNAENGFCVLRVKVRGQRDLVTVVGHAAMISAGEFVQMSGRWFNDH\n-THGLQFKAEFLKASPPTTVEGIERYLGSGMIRGIGPVYAKKLVKAFGEAVFDLIEQEPHRLREVTGIGPK\n-RAERIVGGWADQKVIREIMLFLHSNGVGTSRAVRIFKTYGQDAVRLISENPYRLAKDIRGIGFKTADQIA\n-RKMGIAPDAMIRVRAGISYALGEAMDEGHCGLPVGELLTSTAELLEVAAPLIETALALELEAGDVVADSV\n-GETSCIFLAGLYRAEQSIAERLRACAVGRPPWPEIDAEKAMTWVEGKTGLAMAPSQQEAVRLALRSKVLV\n-ITGGPGVGKTTLVNAILKIVTAKGTDVQLCAPTGRAAKRLSESTGLEGKTIHRLLETDPGNGSFKRDDTN\n-PLTCDLLVVDEASMVDVLLMRSLLRALPDSASLLIVGDVDQLPSVGPGQVLADIIGSDAVPVVRLTEVFR\n-QAAQSRIITNAHRINEGKMPELSAEEGSDFYFVEAAEPEVGLRKLLAVVKDRIPARFGLDPVRDVQVLCP\n-MNRGGLGARSLNIELQQALNPAGDVKVERFGWTYGPGDKVMQIANDYDRDVFNGDLGVIDKIDVEEGELT\n-VLFDGREVVYGFGELDELVLAYATTIHKSQGSEYPVVVIPLVTQHYTMLARNLLYTGVTRGRKLVVLVGQ\n-KKALAIAVRNQGGRLRWSKLRDWLVGTSGTGHLSRLKKP\n->gi|384055881|ref|YP_005485505.1| phage integrase [Acetobacter pasteurianus IFO 3283-22]\n-MVESQVSHIQPEYKFHINLDEYDRRATLSADELKVVRRWKEENLVITKRQAPRLHKPLTDILYRSNLDRA\n-NSHRALKYLLLTVAHQEKPYWGWSEDLWVEIINNSPVLKKTGMVPQLIAVAYLLCGFRSVYKIQRNVATA\n-VVARLVFGAEIVDTECERLFSALTRVGFVCQTVRPLVPSVFAAVALQGENPKLESFDRKILEHTRECYTG\n-NHIAKRIGILSNGLAAMGLTSKVIHFRAYPPRHGTETDNINPEWMTWCRRWLETTTLREGSRRAVYNTLT\n-RIGIWLGREHPEVTGPEQWTVSVCADYLAAVDRLRVGDWGGSTFDYRLIPTVGQPLQAPTKVAYYQVMRR\n-FLSDIQSWEWARLRCNPRYHLSTPKNIAKYLGVNPRTIDDASWLKLTWASLNIEPDDLSPDCFYPFALLQ\n-AIAVVWTHAGLRSNEIARLRVGCTREQSEDVVDQSGNVVPAGQVCWLDVPEGKTSVAYTKPVGHAVHKYI\n-TAWMKKRASPRKHLDRRTGEHVHFLFQLRNRPIAKEVLNQTVIPLLCKKAGIPIEDSKGRITSHRGRASA\n-VSMLASVPQGMTIFDLAKWCGHTSVQSTMSYVRSKPTQLASAFAKADQAARMIEIVIDNEVIAAGATKDG\n-APWKYYDLGDSYCSNAFWSTCPHRMACARCYFNIPKPSAKGVVLAAQQAANRLLEEVWLSPEERDAVSGD\n-VEALEGMLNKLRDKPALDGRTPGEISATCGSQVSSPFTESE\n->gi|384055882|ref|YP_005485506.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MELGITPGQDADITQAEPLLENIEPDAFLADKAYDADRLIDRLIQRGITPVIPPKRNRTTRRVIPP\n' |