Repository 'sapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjkoehorst/sapp

Changeset 16:74b8ba5e2d5b (2015-02-21)
Previous changeset 15:10cad758ed0f (2015-02-21) Next changeset 17:2561c51e6605 (2015-02-21)
Commit message:
aragorn addition
added:
conversion/fasta2rdf/fastatordf.py
conversion/fasta2rdf/fastatordf.xml
conversion/fasta2rdf/test-data/NC_017117.fna
conversion/gbk2rdf/gbktordf.py
conversion/gbk2rdf/gbktordf.xml
conversion/gbk2rdf/test-data/CP009049.embl
conversion/gbk2rdf/test-data/NC_010067.gbk
conversion/protein2rdf/protein_to_ttl.py
conversion/protein2rdf/protein_to_ttl.xml
conversion/protein2rdf/test-data/NC_017117.faa
genetic_elements/aragorn.py
genetic_elements/aragorn.xml
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/fasta2rdf/fastatordf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/fasta2rdf/fastatordf.py Sat Feb 21 17:17:06 2015 +0100
[
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3.4
+# Author: Jasper Jan Koehorst
+# Date created: Jan 22 2015
+# Function: generation of a RDF file from a genome fasta file
+
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+
# Base IRI under which every resource and vocabulary term of this tool is minted.
URI = "http://csb.wur.nl/genome/"
# Prefixed name kept as a module-level constant (not referenced in the visible
# part of this script).
seeAlso = "rdfs:seeAlso"
# Namespace helper: coreURI["x"] yields the URIRef for URI + "x".
coreURI = Namespace(URI)

# A single in-memory store backs the one module-wide graph that the functions
# below fill and that save() serializes.  Plain assignments are enough here:
# a ``global`` statement at module level has no effect.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)
+
def delete_galaxy(marker="galaxy-dist/"):
    """Blank out Galaxy's own entries in ``sys.path``.

    Every ``sys.path`` entry containing *marker* is overwritten in place with
    the empty string.  Entries are replaced rather than removed, so the length
    and order of ``sys.path`` (and the list object itself) stay unchanged.

    marker -- substring identifying the entries to blank out; defaults to the
              Galaxy distribution directory name this script was written for.
    """
    sys.path[:] = ['' if marker in entry else entry for entry in sys.path]
+
def createClass(uri):
    """Declare *uri* as a root OWL class in the module-wide graph.

    Adds ``uri rdf:type owl:Class`` and ``uri rdfs:subClassOf owl:Thing`` to
    ``genomeGraph`` and returns *uri* so the call can be used inline.
    """
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in declarations:
        genomeGraph.add((uri, predicate, target))
    return uri
+
def _read_fasta(input_file):
    """Return the records of a FASTA file as an ordered header -> sequence map.

    Headers are stored without the leading ``>`` and surrounding whitespace;
    the sequence lines of a record are stripped and concatenated.  Records keep
    the order in which their header first appears in the file.  A header that
    occurs again starts over and replaces the sequence collected so far.

    Raises ValueError when sequence data precedes the first header.
    """
    # Plain dicts are unordered on the interpreter this script targets
    # (python3.4), which would make the record numbering arbitrary.
    from collections import OrderedDict

    chunks = OrderedDict()
    header = None
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                header = line.strip(">").strip()
                chunks[header] = []
            elif line.strip():
                if header is None:
                    raise ValueError(
                        "sequence data found before the first FASTA header in "
                        + str(input_file))
                chunks[header].append(line.strip())
    # Join once per record instead of growing a string line by line.
    return OrderedDict(
        (name, "".join(parts)) for name, parts in chunks.items())


def fasta_parser(input_file):
    """Describe a genome FASTA file as RDF triples in ``genomeGraph``.

    The genome URI is derived from the value after ``-idtag`` (spaces become
    underscores); when that value is the string ``None`` the value after
    ``-id_alternative`` is used instead (spaces and dots become underscores).
    The values following ``-organism``, ``-ncbi_taxid``, ``-idtag`` and every
    ``-ids`` on the command line are attached to the genome as literals.
    Each FASTA record becomes ``<genomeID>/dnaobject_<n>`` (numbered in file
    order) carrying its sequence, its header and the ``-sourcedb`` value.

    input_file -- path of the FASTA file to read.

    Raises ValueError when a required flag is missing from ``sys.argv`` or
    when the file holds sequence data before its first header.
    """
    def argument(flag):
        # Value that follows *flag* on the command line.
        return sys.argv[sys.argv.index(flag) + 1]

    genomeClass = createClass(coreURI["Genome"])    # Genome class
    createClass(coreURI["Type"])                    # Type class (Chr,Pls,Scaffold)
    typeClass = createClass(coreURI["DnaObject"])

    genomeID = argument('-idtag').replace(" ", "_")
    if genomeID == 'None':
        genomeID = argument('-id_alternative').replace(" ", "_").replace(".", "_")
    genomeURI = coreURI[genomeID]

    # Command-line flag -> predicate name.  "-ids" may be given several times
    # and shares the id_tag predicate with "-idtag".
    predicates = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-ids': "id_tag",
    }
    for index, element in enumerate(sys.argv):
        if element in predicates:
            genomeGraph.add((genomeURI, coreURI[predicates[element]],
                             Literal(sys.argv[index + 1])))

    records = _read_fasta(input_file)
    if not records:
        # Without any record the genome gets no type, sourcedb or dnaobject
        # triples, and "-sourcedb" is not required.
        return

    sourcedb = Literal(argument("-sourcedb"))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    for index, (header, sequence) in enumerate(records.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, RDF.type, typeClass))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
+
def save():
    """Serialize ``genomeGraph`` as Turtle into the file named after ``-output``.

    Raises ValueError when ``-output`` is missing from the command line.
    """
    output_path = sys.argv[sys.argv.index("-output") + 1]
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib release serialize() yields bytes or str; the
    # file is opened in binary mode, so normalise to UTF-8 bytes first.
    if isinstance(data, str):
        data = data.encode("utf-8")
    # The context manager closes (and flushes) the handle even if write() fails.
    with open(output_path, "wb") as handle:
        handle.write(data)
+
def main():
    """Convert the FASTA file named after ``-input`` and write the result."""
    flag_position = sys.argv.index("-input")
    fasta_parser(sys.argv[flag_position + 1])
    save()
+
if __name__ == '__main__':
    # Galaxy ships its own copies of some modules rdflib depends on, which
    # disturbs rdflib's imports; blank out Galaxy's sys.path entries first.
    # NOTE(review): rdflib itself is already imported at the top of the file,
    # so this presumably only protects imports rdflib performs later - confirm.
    delete_galaxy()
    main()
+
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/fasta2rdf/fastatordf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/fasta2rdf/fastatordf.xml Sat Feb 21 17:17:06 2015 +0100
b
@@ -0,0 +1,38 @@
<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
    <!-- Galaxy wrapper around fastatordf.py: turns a genome FASTA file into RDF. -->
    <requirements>
        <requirement type='package' version="3.4">python</requirement>
        <requirement type='package' version="1.0">rdflib</requirement>
    </requirements>
    <description></description>
    <!-- One '-ids' argument is emitted per entry of the "ids" repeat block;
         '-id_alternative' passes the dataset name, which the script uses when
         the identification tag is the string None. -->
    <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
    #for $index, $id in enumerate( $ids ) 
    '-ids' '$id.id_tag'
    #end for
    '-id_alternative' '$input.name'
    </command>
    <inputs>
        <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
        <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/>
        <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
        <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/>
        <repeat name="ids" title="Identification tags">
            <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
        </repeat>
    </inputs>

    <outputs>
        <data format="rdf" name="output" label="genomeTTL: ${input.name}" />
    </outputs>

    <tests>
        <test>
            <!-- Test inputs are declared with <param> using the names from <inputs>;
                 data files are given relative to the test-data directory. -->
            <param name="input" value="NC_017117.fna"/>
            <param name="organism" value="Acetobacter pasteurianus IFO 3283-22"/>
            <param name="ncbi_taxid" value="634455"/>
            <param name="identification_tag" value="Acetobacter pasteurianus IFO 3283-22"/>
            <!-- NOTE(review): NC_017117.rdf is not among the files added by this
                 changeset - confirm it exists in test-data. -->
            <output name="output" file="NC_017117.rdf"/>
        </test>
    </tests>

    <help> Genome FASTA file to RDF</help>
</tool>
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/fasta2rdf/test-data/NC_017117.fna
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/fasta2rdf/test-data/NC_017117.fna Sat Feb 21 17:17:06 2015 +0100
b
b'@@ -0,0 +1,2736 @@\n+>gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n+CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n+TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n+GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n+CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n+GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n+TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n+TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n+GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n+TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n+AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n+CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n+GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n+TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n+GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n+TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n+CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n+AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n+AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n+CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n+GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n+GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n+GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n+CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n+CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n+TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n+AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTG
ACCTGCAGAGCTTCCGGCGCGGGCGC\n+GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n+TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n+CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n+GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n+ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n+GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n+TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n+CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n+GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n+CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n+CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n+ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n+AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n+AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n+GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n+GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n+CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n+GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n+ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n+GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n+GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n+GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n+TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n+CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n+GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n+GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n+TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n+
GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n+AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n+ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n+GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n+AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n+GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n+TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n+CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n+GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n+ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n+TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n+TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n+TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n+GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n+TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n+TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n+GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n+TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n+ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n+CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n+GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n+GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n+TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n+ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n+AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n+AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n+AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTT
CCCATTTTTTGGGGAGACAG\n+ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n+GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n+GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n+GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n+ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n+CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n+AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n+GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n+ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n+TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n+GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n+AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n+AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n+GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n+ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n+GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n+GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n+GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n+AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n+GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n+TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n+CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n+ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n+TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n+ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n+TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n+CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n+GCACGT
CCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n+ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n'
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/gbk2rdf/gbktordf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/gbk2rdf/gbktordf.py Sat Feb 21 17:17:06 2015 +0100
[
b'@@ -0,0 +1,360 @@\n+#!/usr/bin/env python3.4\n+# Author: Jasper Jan Koehorst\n+# Date created: Feb 21 2015\n+# Function: generation of a RDF file from Genbank/EMBL\n+\n+import warnings\n+warnings.filterwarnings("ignore")\n+\n+def delete_galaxy():\n+\timport sys\n+\tfor index, path in enumerate(sys.path):\n+\t\tif "galaxy-dist/" in path:\n+\t\t\tsys.path[index] = \'\'\n+\n+#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.\n+delete_galaxy()\n+\n+from Bio import SeqIO\n+# Import RDFLib\'s default Graph implementation.\n+import os, sys\n+from Bio.Seq import Seq\n+\n+from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin\n+from rdflib.store import Store\n+import hashlib\n+store = plugin.get(\'IOMemory\', Store)()\n+\n+global URI\n+URI = "http://csb.wur.nl/genome/"\n+global seeAlso\n+seeAlso = "rdfs:seeAlso"\n+global coreURI\n+coreURI = Namespace(URI)\n+\n+global SubClassOfDict\n+SubClassOfDict = {}\n+global SubClassOfDictRna\n+SubClassOfDictRna = {}\n+\n+def createClass(uri, root=True):\n+\tgenomeGraph.add((uri,RDF.type,OWL.Class))\n+\tif root:\n+\t\tgenomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))\n+\treturn uri\n+\n+def tmp():\n+\timport time\n+\tglobal tmpFolder\n+\ttmpFolder = "/tmp/"+str(time.time())+"/"\n+\tos.mkdir(tmpFolder)\n+\n+def cleantmp():\n+\tos.system("ls "+tmpFolder)\n+\tos.system("rm -rf "+tmpFolder)\n+\n+def crawler():\n+\t#From input folder it looks for GBK file (gz files are in progress)\n+\tinput_file = sys.argv[sys.argv.index("-input")+1]\n+\tgbk_parser(input_file)\n+\n+def gbk_parser():\n+\tprevObjStart = -1\n+\tprevObjStop = -1\t\n+\tstore = plugin.get(\'IOMemory\', Store)()\n+\tglobal genomeGraph\n+\tgenomeGraph = Graph(store,URIRef(URI))\n+\tgenomeGraph.bind("ssb",coreURI)\n+\tinput_file = sys.argv[sys.argv.index("-input")+1]\n+\n+\t#CLASS definitions\n+\tgenomeClass = createClass(coreURI["Genome"], 
root=True)\n+\ttypeClass = createClass(coreURI["DnaObject"], root=True)\n+\tcreateClass(coreURI["Protein"], root=True)\n+\tpubmedClass = createClass(coreURI["Pubmed"], root=True)\n+\tmiscClass = createClass(coreURI["MiscFeature"], root=False)\n+\tcreateClass(coreURI["Feature"], root=True)\n+\tSubClassOfDict["MiscFeature"] = 1\n+\tSubClassOfDictRna["Trna"] = 1\n+\tSubClassOfDictRna["Rrna"] = 1\n+\tSubClassOfDictRna["Tmrna"] = 1\n+\tSubClassOfDictRna["Ncrna"] = 1\n+\n+# \tcodon = "11" #Default initialization if no CDS are present\n+\t##################\n+\tweird_chars = list(\'\'\',./?<>:;"\'|\\}]{[+=_-)(*&^%$#@!\xc2\xb1\xc2\xa7~` \'\'\')\n+\tscaf_value = 0\n+\t#Which files are already done\n+\t########\n+\tformatGBK = sys.argv[sys.argv.index("-format")+1]\n+\tfor record in SeqIO.parse(input_file, formatGBK):\n+\t\t#Read first feature for genome name and information...\n+\t\t#Ignore the empty GBK file due to the lack of features?\n+\n+\t\tfor index, feature in enumerate(record.features):\n+\t\t\tif index == 0:\n+\t\t\t\tif "-identifier" in sys.argv:\n+\t\t\t\t\tgenome = sys.argv[sys.argv.index("-identifier")+1]\n+\t\t\t\telse:\n+\t\t\t\t\ttry:\n+\t\t\t\t\t\tgenome = feature.qualifiers["organism"][0].replace(" ","_")\n+\t\t\t\t\texcept:\n+\t\t\t\t\t\t#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS\n+\t\t\t\t\t\tgenome = "XNoneX"\n+\t\t\t\tfor char in weird_chars:\n+\t\t\t\t\tgenome = genome.replace(char,"_")\n+\n+\t\t\t\ttry:\n+\t\t\t\t\tgi = record.annotations["gi"]\n+\t\t\t\t\ttyp = str(gi)\n+\t\t\t\texcept:\n+\t\t\t\t\ttry:\n+\t\t\t\t\t\tgi = record.annotations["accessions"][0]\n+\t\t\t\t\t\ttyp = str(gi)\n+\t\t\t\t\texcept:\n+\t\t\t\t\t\tscaf_value += 1\n+\t\t\t\t\t\ttyp = "scaffold_"+str(scaf_value)\n+\t\t\t\tgenomeURI = coreURI[genome]\n+\t\t\t\tgbkURI = coreURI[genome + "/" + typ]\n+\t\t\t\t#To contig connection to connect all data to it\n+\t\t\t\tgenomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))\n+\n+\t\t\t\t#General genome features 
also stored in the class...\n+\t\t\t\tif "genome" in feature.qualifiers:\n+\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))\n+\t\t\t\tif "strain" in feature.qualifiers:\n+\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))\n+\t\t\t\tif "taxonomy" in record.annotations:\n+\t\t\t\t\tfo'..b'a" and feature_type.lower() != "ncrna":\n+\t\t\tSubClassOfDict[feature_type.lower().title()] = 1\n+\tfor key in feature.qualifiers:\n+\t\tvalues = feature.qualifiers[key]\n+\t\tif key == "translation":\n+\t\t\tpass\n+\t\telif type(values) == list:\n+\t\t\tfor v in values:\n+\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n+\t\telse:\n+\t\t\tint_add(generalURI,coreURI[key.lower()],values)\n+\tif feature.type == "CDS":\n+\t\ttry:\n+\t\t\t#Feature is normally submitted to this function\n+\t\t\t#IF a subfeature is submitted it is submitted as a feature\n+\t\t\t#And subfeature variable will contain the superfeature\n+\t\t\tif superfeature:\n+\t\t\t\tcodon = superfeature.qualifiers["transl_table"][0]\n+\t\texcept:\n+\t\t\t#Default codon table 11\n+\t\t\tcodon = "11"\n+\t\t#Protein linkage\n+\t\ttranslation = ""\n+\t\ttry:\n+\t\t\ttranslation = feature.qualifiers["translation"][0].strip("*")\n+\t\texcept KeyError:\n+\t\t\t#When protein sequence is not given...\n+\t\t\tif len(feature.location.parts) > 1:\n+\t\t\t\t#Exon boundaries?\n+\t\t\t\tseq = \'\'\n+\t\t\t\tfor loc in feature.location:\n+\t\t\t\t\tseq += record.seq[loc]\n+\t\t\t\tif int(feature.location.strand) == -1:\n+\t\t\t\t\tseq = Seq(seq).complement()\n+\t\t\t\telse:\n+\t\t\t\t\tseq = Seq(seq)\n+\t\t\t\ttranslation = str(seq.translate(feature.qualifiers["transl_table"][0]))\n+\t\t\telif int(feature.location.strand) == -1:\n+\t\t\t\tif str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:\n+\t\t\t\t\tif 
len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n+\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))\n+\t\t\t\t\telse:\n+\t\t\t\t\t\ttranslation = \'\'\n+\t\t\telif int(feature.location.strand) == +1:\n+\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n+\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))\n+\t\t\t\t\telse:\n+\t\t\t\t\t\ttranslation = \'\'\n+\t\t\t\n+\t\t\tif translation:\n+\t\t\t\ttranslation = list(translation)\n+\t\t\t\ttranslation[0] = "M"\n+\t\t\t\ttranslation = \'\'.join(translation).strip("*")\n+\t\t\t\tif "*" in translation:\n+\t\t\t\t\tpass\t\t\n+\n+\t\ttranslation = translation.encode(\'utf-8\')\n+\t\tmd5_protein = hashlib.md5(translation).hexdigest()\n+\t\tproteinURI = coreURI["protein/"+md5_protein]\n+\t\tgenomeGraph.add((generalURI,coreURI["protein"],proteinURI))\n+\t\tfor key in feature.qualifiers:\n+\t\t\tfor v in feature.qualifiers[key]:\n+\t\t\t\tif key == "translation":\n+\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))\n+\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))\n+\t\t\t\t\tgenomeGraph.add((proteinURI,RDF.type,proteinClass))\n+\t\t\t\telse:\n+\t\t\t\t\tfor v in feature.qualifiers[key]:\n+\t\t\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n+\t\n+def int_add(subject, predicate, obj):\n+\ttry:\n+\t\tobject_float = float(obj.replace(\'"\',\'\'))\n+\t\tobject_int = int(obj.replace(\'"\',\'\'))\n+\t\tif object_int == object_float:\n+\t\t\tgenomeGraph.add((subject,predicate,Literal(object_int)))\n+\t\telse:\n+\t\t\tgenomeGraph.add((subject,predicate,Literal(object_float)))\n+\texcept:\n+\t\tgenomeGraph.add((subject,predicate,Literal(obj.replace(\'"\',\'\'))))\n+\t\t\t\t\n+def save():\n+\tdata = 
genomeGraph.serialize(format=\'turtle\')\n+\topen(sys.argv[sys.argv.index("-output")+1],"wb").write(data)\n+\n+def subClassOfBuilder():\n+\tfor subclass in SubClassOfDict:\n+\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n+\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))\n+\n+def subClassOfBuilderRna():\n+\tfor subclass in SubClassOfDictRna:\n+\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n+\t\tgenomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))\n+\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))\n+\t\tgenomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))\n+\n+def main():\n+\ttmp()\n+\tgbk_parser()\n+\tsubClassOfBuilder()\n+\tsubClassOfBuilderRna()\n+\tsave()\n+\tcleantmp()\n+\n+if __name__ == "__main__":\n+\tmain()\n\\ No newline at end of file\n'
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/gbk2rdf/gbktordf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/gbk2rdf/gbktordf.xml Sat Feb 21 17:17:06 2015 +0100
b
@@ -0,0 +1,38 @@
<tool id="SAPP_genbank_to_ttl" name="EMBL/GBK to RDF" version="0.1">
    <!-- Galaxy wrapper around gbktordf.py: turns a Genbank or EMBL file into RDF. -->
    <requirements>
        <requirement type='package' version="3.4">python</requirement>
        <requirement type='package' version="1.0">rdflib</requirement>
    </requirements>
    <description>Genbank to RDF conversion</description>
    <!-- The selected format is passed twice: as the parser format and as the source database. -->
    <command interpreter="python3.4">gbktordf.py '-input' '$input' -output '$output' -sourcedb "$format" -format "$format"</command>
    <inputs>
        <param name="input" type="data" format="gbk,gb,genbank,embl" label="Genbank file"/>
        <param name="format" type="select" label="EMBL/GBK">
            <option value="genbank" selected="true"> Genbank</option>
            <option value="embl"> EMBL </option>
        </param>
    </inputs>

    <outputs>
        <data format="rdf" name="output" label="GBKttl: ${input.name}" />
    </outputs>

    <tests>
        <!-- Test inputs are declared with <param> using the names from <inputs>;
             data files are given relative to the test-data directory.  There is
             no "sourcedb" input: the command line derives it from "format". -->
        <test>
            <param name="input" value="NC_010067.gbk"/>
            <param name="format" value="genbank"/>
            <!-- NOTE(review): NC_010067.rdf is not among the files added by this
                 changeset - confirm it exists in test-data. -->
            <output name="output" file="NC_010067.rdf"/>
        </test>
        <test>
            <param name="input" value="CP009049.embl"/>
            <param name="format" value="embl"/>
            <!-- NOTE(review): CP009049.rdf is not among the files added by this
                 changeset - confirm it exists in test-data. -->
            <output name="output" file="CP009049.rdf"/>
        </test>
    </tests>

    <help>
    Genbank or EMBL to RDF conversion
    </help>
</tool>
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/gbk2rdf/test-data/CP009049.embl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/gbk2rdf/test-data/CP009049.embl Sat Feb 21 17:17:06 2015 +0100
[
b'@@ -0,0 +1,157312 @@\n+ID   CP009049; SV 1; circular; genomic DNA; STD; PRO; 4599018 BP.\n+XX\n+AC   CP009049;\n+XX\n+PR   Project:PRJNA255737;\n+XX\n+DT   13-FEB-2015 (Rel. 123, Created)\n+DT   13-FEB-2015 (Rel. 123, Last updated, Version 1)\n+XX\n+DE   Salmonella enterica subsp. enterica serovar Paratyphi A strain CMCC 50973,\n+DE   complete genome.\n+XX\n+KW   .\n+XX\n+OS   Salmonella enterica subsp. enterica serovar Paratyphi A\n+OC   Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n+OC   Enterobacteriaceae; Salmonella.\n+XX\n+RN   [1]\n+RP   1-4599018\n+RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n+RT   "Whole Genome Sequences of two Salmonella paratyphi A strains";\n+RL   Unpublished.\n+XX\n+RN   [2]\n+RP   1-4599018\n+RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n+RT   ;\n+RL   Submitted (24-JUL-2014) to the INSDC.\n+RL   State Key Laboratory of Pathogen and Biosecurity, Beijing Institute of\n+RL   Biotechnology, 20 Dongdajie, Fengtai District, Beijing, Beijing 100071,\n+RL   China\n+XX\n+DR   MD5; e41a6215bf412b701febd8d4b182ec0c.\n+DR   BioSample; SAMN02909989.\n+XX\n+CC   Source DNA/bacteria are available from National Center for  Medical\n+CC   Culture Collection (CMCC) in China.\n+CC   Annotation was added by the NCBI Prokaryotic Genome Annotation\n+CC   Pipeline (released 2013). Information about the Pipeline can be\n+CC   found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/\n+CC   ##Genome-Assembly-Data-START##\n+CC   Assembly Method       :: SOAPdenovo v. 
2011.04\n+CC   Assembly Name         :: CMCC(B) 50973\n+CC   Genome Coverage       :: 133x\n+CC   Sequencing Technology :: Illumina\n+CC   ##Genome-Assembly-Data-END##\n+CC   ##Genome-Annotation-Data-START##\n+CC   Annotation Provider          :: NCBI\n+CC   Annotation Date              :: 07/25/2014 13:43:31\n+CC   Annotation Pipeline          :: NCBI Prokaryotic Genome Annotation\n+CC                                   Pipeline\n+CC   Annotation Method            :: Best-placed reference protein set;\n+CC                                   GeneMarkS+\n+CC   Annotation Software revision :: 2.6 (rev. 440435)\n+CC   Features Annotated           :: Gene; CDS; rRNA; tRNA; ncRNA;\n+CC                                   repeat_region\n+CC   Genes                        :: 4,309\n+CC   CDS                          :: 4,016\n+CC   Pseudo Genes                 :: 166\n+CC   CRISPR Arrays                :: 2\n+CC   rRNAs                        :: 20 ( 5S, 16S, 23S )\n+CC   tRNAs                        :: 100\n+CC   ncRNA                        :: 7\n+CC   Frameshifted Genes           :: 106\n+CC   ##Genome-Annotation-Data-END##\n+XX\n+FH   Key             Location/Qualifiers\n+FH\n+FT   source          1..4599018\n+FT                   /organism="Salmonella enterica subsp. 
enterica serovar\n+FT                   Paratyphi A"\n+FT                   /host="Homo sapiens"\n+FT                   /sub_species="enterica"\n+FT                   /strain="CMCC 50973"\n+FT                   /mol_type="genomic DNA"\n+FT                   /country="China:Jiangsu"\n+FT                   /lat_lon="32.04 N 118.78 E"\n+FT                   /collection_date="2003-06-01"\n+FT                   /serovar="Paratyphi A"\n+FT                   /db_xref="taxon:54388"\n+FT                   /culture_collection="CMCC:50973"\n+FT   gene            complement(129..713)\n+FT                   /gene="mobA"\n+FT                   /locus_tag="IT63_00010"\n+FT   CDS             complement(129..713)\n+FT                   /codon_start=1\n+FT                   /transl_table=11\n+FT                   /gene="mobA"\n+FT                   /locus_tag="IT63_00010"\n+FT                   /product="molybdopterin-guanine dinucleotide biosynthesis\n+FT                   protein MobA"\n+FT                   /note="in Escherichia coli MobA links a guanosine\n+FT                   5\'-phosphate to molydopterin to form molybdopterin guanine\n+FT                   dinucleotide during molybdenum cofactor biosynthesis;\n+FT                   Derived by automated c'..b'cgag cgaacgggga ggagcccaga gcctgaatca gcatgtgtgt   4596180\n+     tagtggaagc gtctggaaag gcgcgcgata cagggtgaca gccccgtaca caaaagcgca   4596240\n+     tgtgctgtga gctcgatgag tagggcggga cacgtggtat cctgtctgaa tatgggggga   4596300\n+     ccatcctcca aggctaaata ctaattttgc tctttaaaaa tctggatcaa gctgaaaatt   4596360\n+     gaaacacaga acaacgaaag ttgttcgtga gtctctcaaa ttttcgcaac acgatgatga   4596420\n+     atcgtaagaa acatcttcgg gttgtgaggt taagcgacta agcgtacacg gtggatgccc   4596480\n+     tggcagtcag aggcgatgaa ggacgtgcta atctgcgata agcgccggta aggtgatatg   4596540\n+     aaccgttata accggcgatt tccgaatggg gaaacccagt gtgattcgtc acactatcat   4596600\n+     taactgaatc cataggttaa tgaggcgaac cgggggaact gaaacatcta agtaccccga   4596660\n+     
ggaaaagaaa tcaaccgaga ttcccccagt agcggcgagc gaacggggag gagcccagag   4596720\n+     cctgaatcag catgtgtgtt agtggaagcg tctggaaagg cgcgcgatac agggtgacag   4596780\n+     ccccgtacac aaaagcgcat gtgctgtgag ctcgatgagt agggcgggac acgtggtatc   4596840\n+     ctgtctgaat atggggggac catcctccaa ggctaaatac tcctgactga ccgatagtga   4596900\n+     accagtaccg tgagggaaag gcgaaaagaa ccccggcgag gggagtgaaa aagaacctga   4596960\n+     aaccgtgtac gtacaagcag tgggagcaca ggtttacctg tgtgactgcg taccttttgt   4597020\n+     ataatgggtc agcgacttat attctgtagc aaggttaacc gtatagggga gccggaggga   4597080\n+     aaccgagtct taaccgggcg ttaagttgca gggtatagac ccgaaacccg gtgatctagc   4597140\n+     catgggcagg ttgaaggttg ggtaacacta actggaggac cgaaccgact aatgttgaaa   4597200\n+     aattagcgga tgacctgtgg ctgggggtga aaggccaatc aaaccgggag atagctggtt   4597260\n+     ctccccgaaa gctatttagg tagcgcctcg tgaattcatc tccgggggta gagcactgtt   4597320\n+     tcggctaggg ggccatcccg gcttaccaac ccgatgcaaa ctgcgaatac cggagaatgt   4597380\n+     tatcacggga gacacacggc gggtgctaac gtccgtcgtg aagagggaaa caacccagac   4597440\n+     cgccagctaa ggtcccaaag tcatggttaa gtgggaaacg atgtgggaag gcccagacag   4597500\n+     ccaggatgtt ggcttagaag cagccatcat ttaaagaaag cgtaatagct cactggtcga   4597560\n+     gtcggcctgc gcggaagatg taacggggct aaaccatgca ccgaagctgc ggcagcgaca   4597620\n+     ctcaggtgtt gttgggtagg ggagcgttct gtaagcctgt gaaggtggcc tgtgagggtt   4597680\n+     gctggaggta tcagaagtgc gaatgctgac ataagtaacg ataaagcggg tgaaaagccc   4597740\n+     gctcgccgga agaccaaggg ttcctgtcca acgttaatcg gggcagggtg agtcgacccc   4597800\n+     taaggcgagg ccgaaaggcg tagtcgatgg gaaacgggtt aatattcccg tacttggtgt   4597860\n+     tactgcgaag ggggggacgg agaaggctat gttggccggg cgacggttgt cccggtttaa   4597920\n+     gcgtgtaggt gtgtgttcca ggtaaatccg gttcacttta acactgaggc gtgacgacga   4597980\n+     ggcactacgg tgctgaagca acaaatgccc tgcttccagg aaaagcctct aagcatcagg   4598040\n+     taacatcaaa tcgtacccca aaccgacaca ggtggtcagg tagagaatac caaggcgctt   4598100\n+     
gagagaactc gggtgaagga actaggcaaa atggtgccgt aacttcggga gaaggcacgc   4598160\n+     tgacacgtag gtgaagtgat ttactcatgg agctgaagtc agtcgaagat accagctggc   4598220\n+     tgcaactgtt tattaaaaac acagcactgt gcaaacacga aagtggacgt atacggtgtg   4598280\n+     acgcctgccc ggtgccggaa ggttaattga tggggtcagc gcaagcgaag ctcctgatcg   4598340\n+     aagccccggt aaacggcggc cgtaactata acggtcctaa ggtagcgaaa ttccttgtcg   4598400\n+     ggtaagttcc gacctgcacg aatggcgtaa tgatggccag gctgtctcca cccgagactc   4598460\n+     agtgaaattg aactcgctgt gaagatgcag tgtacccgcg gcaagacgga aagaccccgt   4598520\n+     gaacctttac tatagcttga cactgaacat tgagccttga tgtgtaggat aggtgggagg   4598580\n+     ctttgaagtg tggacgccag tctgcatgga gccgaccttg aaataccacc ctttaatgtt   4598640\n+     tgatgttcta acgtggaccc gttacccggg ttgcggacag tgtctggtgg gtagtttgac   4598700\n+     tggggcggtc tcctcctaaa gagtaacgga ggagcacgaa ggttggctaa tcctggtcgg   4598760\n+     acatcaggag gttagtgcaa tggcataagc cagcttgact gcgagcgtga cggcgcgagc   4598820\n+     aggtgcgaaa gcaggtcata gtgatccggt ggttctgaat ggaagggcca tcgctcaacg   4598880\n+     gataaaaggt actccgggga taacaggctg ataccgccca agagttcata tcgacggcgg   4598940\n+     tgtttggcac ctcgatgtcg gctcatccca tcccggggct gaagtaggtc ccaagggtat   4599000\n+     ggctgttcgc catttaaa                                                 4599018\n+//\n'
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/gbk2rdf/test-data/NC_010067.gbk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/gbk2rdf/test-data/NC_010067.gbk Sat Feb 21 17:17:06 2015 +0100
b
b"@@ -0,0 +1,259779 @@\n+LOCUS       NC_010067            4600800 bp    DNA     circular CON 20-AUG-2013\n+DEFINITION  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n+            RSK2980 chromosome, complete genome.\n+ACCESSION   NC_010067\n+VERSION     NC_010067.1  GI:161501984\n+DBLINK      Project: 58191\n+            BioProject: PRJNA58191\n+KEYWORDS    .\n+SOURCE      Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n+            RSK2980\n+  ORGANISM  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n+            RSK2980\n+            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n+            Enterobacteriaceae; Salmonella.\n+REFERENCE   1  (bases 1 to 4600800)\n+  CONSRTM   NCBI Genome Project\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (03-DEC-2007) National Center for Biotechnology\n+            Information, NIH, Bethesda, MD 20894, USA\n+REFERENCE   2  (bases 1 to 4600800)\n+  AUTHORS   McClelland,M., Sanderson,E.K., Porwollik,S., Spieth,J.,\n+            Clifton,W.S., Fulton,R., Chunyan,W., Wollam,A., Shah,N., Pepin,K.,\n+            Bhonagiri,V., Nash,W., Johnson,M., Thiruvilangam,P. and Wilson,R.\n+  CONSRTM   The Salmonella enterica serovar Arizonae Genome Sequencing Project\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (02-NOV-2007) Genetics, Genome Sequencing Center, 4444\n+            Forest Park Parkway, St. Louis, MO 63108, USA\n+COMMENT     PROVISIONAL REFSEQ: This record has not yet been subject to final\n+            NCBI review. The reference sequence was derived from CP000880.\n+            Salmonella enterica subspecies IIIa (Arizonae) serovar\n+            62:z4,z23:--Most bacteria in the species S. enterica belong to one\n+            of seven subspecies; all but subspecies I normally grow only in\n+            cold-blooded animals. Subspecies IIIa (S. 
Arizonae) is naturally\n+            found in reptiles, but also causes outbreaks of salmonellosis in\n+            turkeys and sheep and can occasionally produce both gastroenteritis\n+            and serious disseminated disease in humans. Many human infections\n+            can be traced to contact with reptiles or ingestion of various\n+            reptile products, particularly from rattlesnakes. Fewer than ten\n+            cases in humans are typically reported in the US each year.\n+            \n+            The strain of S. Arizonae (62:z4,z23:-) being sequenced is\n+            CDC346-86; it was named RSK2980 by R.K. Selander and is strain\n+            SARC5 of the Salmonella Reference C set. This serovar is of\n+            interest because of its taxonomic position. It appears to be the\n+            most divergent subspecies among the S. enterica. It can be obtained\n+            from the American Type Culture Collection as ATCC BAA-731, or the\n+            Salmonella Genetic Stock Centre as SGSC4693. The genome was\n+            sequenced to 8X coverage, using plasmid and fosmid libraries and\n+            was finished to an error rate of less than 1 per 10,000 bases.\n+            Automated annotation was performed and manual annotation will\n+            continue in the labs of Michael McClelland and Kenneth Sanderson.\n+            The National Institute of Allergy and Infectious Diseases (NIAID),\n+            National Institutes of Health (NIH) has funded this project.\n+            \n+            Coding sequences below are predicted using GeneMark v3.3 and\n+            Glimmer2  v2.13.Intergenic regions not spanned by GeneMark and\n+            Glimmer2 were blasted against NCBI's non-redundant (NR) database\n+            and predictions generated based on protein alignments. RNA genes\n+            were determined  using tRNAscan-SE 1.23 or Rfam v8.0. 
This sequence\n+            was finished as follows unless otherwise noted: all regions were\n+            double stranded, sequenced with an alternate chemistries or covered\n+            by high quality data(i.e., phred quality >=30);an attempt was made\n+      "..b'1 acccgtcatc gtatcgtcct tgccgcaacg cttgcggaat ttcttacaca acttaatcct\n+  4597741 cttctgtaat cgtttgccct gacaggtgtg agagatctct tacaaggtct gtaggagatc\n+  4597801 gccaggatat cagagaatac ttagctacga ctttctcctg taaatatata taaatcaatc\n+  4597861 tattaaaata ttatttcgca ctttcatata caaatttact taaggtatcg tctgtaagcg\n+  4597921 tcttgtaaga caaggtgaaa caggcgattc tatattcatc gacagggagt cgtacaacga\n+  4597981 agcgaacgtc aggaagatgg cgcttctgca ggacacgcca ggagggcgtt acatggaaag\n+  4598041 gcttcaggat gaggcaaagt ggaaagcgca ggatgcgtta aaggacacct ccaggacgga\n+  4598101 gaacgagagc cgattaggat ggtcggcggg tctggatgac cagggacgct tcgggatgaa\n+  4598161 gctatcacat cggggcgatg tgcgcaggat gcaaacgttc aggatgagca ggccgcaggg\n+  4598221 tcacaggaaa agttgtcacg gatgagcagg gagcatgaaa agtagctgga atgctgcgaa\n+  4598281 acgaaccggg agcactgttt atacagtgct cccttttttt gttattcttc gcgccagatt\n+  4598341 tccattattg aggttcttaa catgacgact catgaccgtg tgcgtcagca gttacatgcg\n+  4598401 cttgaaacgc tgctgcgtga gcatcatcac tggcggctgg atgcgccgca ggcgcacctg\n+  4598461 tttaccagca cgcagccgtt ttgtatggat accatggaac cgctggaatg gctgcaatgg\n+  4598521 gtattgatcc cgcgtatgca taccctgctt gataatgcgc agccgttacc tgaggcgttt\n+  4598581 gccgtcgccc cttattatga aatggcgctg acggcggatt atccgcagcg ggaagcgatc\n+  4598641 ctgacggttt tgcaggatct ggatgcgcta tttacccgcg ataaatcctg atgctggaga\n+  4598701 tcctctatca ggacgcgtgg ctggttgccg ttaataaacc tgcaggctgg cttgttcacc\n+  4598761 ggagctggct ggatcgcgac gaaaaagttg tggtcatgca aacggtgcgc gaccaaatcg\n+  4598821 gccagcatgt ttttaccgcc caccgtctcg acagacccac atcgggcgta ctactgatgg\n+  4598881 ggctgtccag cgaagcggga cgccgcctgg cgcagcagtt cgagcagcac catatccgta\n+  4598941 aacgttacca tgccatagtg cgcggctggc tgatggatga tgcgctactg gattatcctc\n+  4599001 tgctggaaga 
gcgcgataaa attgccgata agttcgcgcg tgaggataaa gcgccccagc\n+  4599061 cagccgtaac gcagtatcgc gggctggcga cggtcgaaat ggcagtgccg accgggcgtt\n+  4599121 atcccactac gcgttatggc ctggttgagc tggaaccgaa aacggggcgc aaacaccagc\n+  4599181 tccgccgtca tctggcgcat ctacgccatc ctatcatcgg cgacagtaaa cacggtgatt\n+  4599241 tgcggcaaaa ccgtagcgcg gcggaacatt ttgcttgtcg tcgcctgatg cttcatgcca\n+  4599301 gtcggcttga actgacgcat cccttcaccg gacagccatt aattattcag gccggactgg\n+  4599361 atgaaacctg gatgcaggcg ctaacacagt ttggctggcg gggacttctc cctgataatg\n+  4599421 aaagggttga gtttacgacg gcgtcccggc aggatgagtc ttatcagaca taattcaggg\n+  4599481 agatacgcat aatggcggaa attggtattt ttgtcggtac gatgtatggc aactcactgt\n+  4599541 tggtggcgga ggaagcggaa gcgatcctgg ccagacaggg ccatagcgcg actgtgtttg\n+  4599601 aagatcctga actgtccgac tggcggcaat atcaggacaa ggtggcattg gttgtcacct\n+  4599661 caacgaccgg acagggcgat ctaccggata gtattgcgcc gctctttcac ggtattaaag\n+  4599721 atacgttagg ttttcaacca aacctgcgtt acggggtgat tgcgttaggt gatagcagct\n+  4599781 accccaattt ctgtaatggc ggcaagcagt ttgatgccct gttgcaggag caaagcgcgc\n+  4599841 aacgggtggg ggaaatgtta ctcattgacg ccagcgaaca tccggagccg gagagccaat\n+  4599901 ccaatccctg ggtagaaaac tggggaacct tactttcctg aggtaaatcc ctccccctac\n+  4599961 cgggagggta ccttttcgtt tgattgcatt gccagtaagc aaaataacga cctgtatgta\n+  4600021 gtttaaagaa actgaatcgt gttagctttg tgcatatgcc tgcaaaagca gcagtttttt\n+  4600081 acgggcgttt tcatgtaatc aagcgacctg tttcacattc ttctcttttt attcctcctg\n+  4600141 cgtcgacgcc tgacgccttc tgatttcatt tccgtgaagt ggcttccact gtcctgggct\n+  4600201 tttgccacaa acaggcgtaa ttcattgcca aaatactgtg ttgttgcacg gtgagtgtgc\n+  4600261 gtgacgcgct ttttatactt ctcctgccag tgaataaaag aatgcagcat gcaaagcaaa\n+  4600321 cgacctaata aaagctgcaa caaggaaacg ttatctctga ttccctaccg gttgtgcagt\n+  4600381 tcagagtgag cgtagctaac gcgaaatttc aggagtgcaa caatgagttc attaagtcac\n+  4600441 gcggcgagta gtgcggagaa tcgcacgaac gcccgctact ggatagtggt gatgctgttt\n+  4600501 atcgtcacat cctttaacta tggcgatcgc gccacattgt ccattgccgg 
ctcagaaatg\n+  4600561 gccaaagata ttggtcttga cccggtaggc atgggctacg ttttctctgc gttttcatgg\n+  4600621 gcctatgtta tcggacagat ccctggcggc tggctgctgg accgctttgg ttccaaacgc\n+  4600681 gtctatttct ggtctatttt catctggtcg gtcttcaccc tgttgcaggg ttttgtcgat\n+  4600741 atttttagcg gtttcggcat tgttgtcgcc ctctttacgc ttcgtttcct ggtcggtctg\n+//\n'
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/protein2rdf/protein_to_ttl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/protein2rdf/protein_to_ttl.py Sat Feb 21 17:17:06 2015 +0100
[
@@ -0,0 +1,130 @@
+def delete_galaxy():
+ import sys
+ for index, path in enumerate(sys.path):
+ if "galaxy-dist/" in path:
+ sys.path[index] = ''
+
+# Some modules that rdflib requires also exist inside Galaxy, which breaks the
+# rdflib import below, so Galaxy's sys.path entries are blanked out first.
+delete_galaxy()
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+import hashlib
+
+# NOTE(review): this module-level store is not used by the functions below;
+# main() builds its own store for the graph.
+store = plugin.get('IOMemory', Store)()
+
+# The `global` statements here are at module scope, where they have no effect;
+# the assignments alone create the module-level names.
+global URI
+# Base URI under which every genome, feature and protein node is minted.
+URI = "http://csb.wur.nl/genome/"
+global seeAlso
+# NOTE(review): seeAlso is defined but not referenced elsewhere in this script.
+seeAlso = "rdfs:seeAlso"
+global coreURI
+# Namespace helper: coreURI["x"] yields the URI reference URI + "x".
+coreURI = Namespace(URI)
+
+
+def createClass(uri):
+ genomeGraph.add((uri,RDF.type,OWL.Class))
+ genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
+ return uri
+
+def fasta_parser(input_file):
+ createClass(coreURI["Protein"])
+
+ genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
+ if genome == '':
+ genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
+
+ genomeURI = coreURI[genome]
+ for index, element in enumerate(sys.argv):
+ if '-organism' == element:
+ genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
+ if '-ncbi_taxid' == element:
+ genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
+ if '-idtag' == element:
+ genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
+ if '-diagnosis' == element:
+ genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1])))
+ if '-country' == element:
+ genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1])))
+ if '-location' == element:
+ genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1])))
+ if '-date' == element:
+ genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1])))
+ if '-ids' == element:
+ genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
+
+
+
+ data = (open(input_file).readlines())
+ fastadict = {}
+ sequence = ""
+ key = ""
+ for index, line in enumerate(data):
+ if ">" == line[0]:
+ if sequence:
+ fastadict[key] = sequence
+ key = line
+ sequence = ""
+ fastadict[key] = ""
+ else:
+ sequence += line.strip()
+ fastadict[key] = sequence
+
+ #Create a class, to be the same as all the other genome conversions...
+ #TODO: Proteins are part of cds, cds are part of dnaobject
+ #If CDS is not there... how then?
+ classURI = coreURI[genome + "/" + "protein_fasta"]
+ proteinClass = createClass(coreURI["Protein"])
+ genomeClass = createClass(coreURI["Genome"])
+ typeClass = createClass(coreURI["DnaObject"])
+ cdsClass = createClass(coreURI["Cds"])
+ #A theoretical begin, end is created to have a workable GBK generation
+ begin = 0
+ end = 0
+ genomeGraph.add((genomeURI, RDF.type, genomeClass))
+ genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+ genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI))
+ genomeGraph.add((classURI, RDF.type, typeClass))
+
+ for protein in fastadict:
+ sequence = fastadict[protein]
+ sequence = sequence.encode('utf-8')
+ end = begin + len(sequence)
+ md5_protein = hashlib.md5(sequence).hexdigest()
+ proteinURI = coreURI["protein/"+md5_protein]
+
+ cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)]
+ genomeGraph.add((classURI, coreURI["feature"] , cdsURI))
+ genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin)))
+ genomeGraph.add((cdsURI, coreURI["end"] , Literal(end)))
+ genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+ genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI))
+ genomeGraph.add((cdsURI, RDF.type, cdsClass))
+
+
+
+ genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
+ genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence)))
+ genomeGraph.add((proteinURI,RDF.type,proteinClass))
+ genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+ genomeGraph.add((proteinURI, RDF.type, proteinClass))
+ begin = end
+
+def save():
+ data = genomeGraph.serialize(format='turtle')
+ open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
+
+def main():
+ store = plugin.get('IOMemory', Store)()
+ global genomeGraph
+ genomeGraph = Graph(store,URIRef(URI))
+ genomeGraph.bind("ssb",coreURI)
+ input_file = sys.argv[sys.argv.index("-input")+1]
+ fasta_parser(input_file)
+ save()
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == '__main__':
+ main()
+
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/protein2rdf/protein_to_ttl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/protein2rdf/protein_to_ttl.xml Sat Feb 21 17:17:06 2015 +0100
b
@@ -0,0 +1,42 @@
+<tool id="SAPP_protein_rdf" name="Protein FASTA to RDF" version="0.1">
+    <requirements>
+        <requirement type='package' version="3.4">python</requirement>
+        <requirement type='package' version="1.0">rdflib</requirement>
+    </requirements>
+ <description></description>
+ <command interpreter="python3.4">protein_to_ttl.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' '-diagnosis' '$diagnosis' '-country' '$country' '-location' '$location' '-date' '$date' -sourcedb SAPP 
+ #for $index, $id in enumerate( $ids ) 
+ '-ids' '$id.id_tag'
+ #end for
+ '-id_alternative' '$input.name'
+ </command>
+ <inputs>
+ <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
+ <param size="60" name="organism" type="text" format="text" label="organism name"/>
+ <param size="60" name="diagnosis" type="text" format="text" label="Diagnosis of host if applicable"/>
+ <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
+ <param size="60" name="country" type="text" format="text" label="Country of sample"/>
+ <param size="60" name="location" type="text" format="text" label="Location of sample e.g., river, city, hospital"/>
+ <param size="60" name="date" type="text" format="text" label="Sample date"/>
+ <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!"/>
+ <repeat name="ids" title="Identification tags">     
+ <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
+ </repeat>
+ </inputs>
+
+ <outputs>
+ <data format="rdf" name="output" label="proteinTTL: ${input.name}" />
+ </outputs>
+    <tests>
+        <test>
+            <!-- Inputs are <param> elements named after the tool parameters;
+                 data files are looked up in the tool's test-data directory. -->
+            <param name="input" value="NC_017117.faa"/>
+            <param name="ncbi_taxid" value="634455"/>
+            <param name="identification_tag" value="Acetobacter pasteurianus IFO 3283-22"/>
+            <param name="organism" value="Acetobacter pasteurianus IFO 3283-22"/>
+            <output name="output" file="NC_017117.rdf"/>
+        </test>
+    </tests>
+ <help>
+ RDF creation from a multi protein fasta file
+ </help>
+</tool>
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b conversion/protein2rdf/test-data/NC_017117.faa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/protein2rdf/test-data/NC_017117.faa Sat Feb 21 17:17:06 2015 +0100
[
b'@@ -0,0 +1,993 @@\n+>gi|384055706|ref|YP_005485330.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MKSDRFTDAQIMGVIRQAEGGVPVPDLCREHGISNATFYRWRAKYGGMDASMISQMKALEEENRRLKRMY\n+ADLSMQTDILKEALGKK\n+>gi|384055707|ref|YP_005485331.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n+MAGHHVEAMIARAHAQKRFMDDAGWRYVVELYGRYQSLLREQNAADFGDLLMWPTLAMLHNDAYRYRWSR\n+RFTAVMADEFQDVNRAQFLWLKMISEVSAEFFAVGDDSQSIL\n+>gi|384055708|ref|YP_005485332.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MVVGRNDCAKGRQMKDTVIGVDLAKNIFQVHGASRAGEVMFRKKLRRQQFMQFMATQPPALVVLEACGSA\n+HYWARELAGAGHEVRLIAPQYVKPFVKRQKNDAADAEAIVIAARQPEMRFVEPRTEAQQARGVLFRARQR\n+LVHQRTELVNALRAVLYEFGLVVPQGIAHIRHIEAMLDEAVLPEAVKQECLDLLRQISEQSVRIDVRTKK\n+IRMLAQESENTCRLQSMPGVGPLTALAIEAFAPDLQSFRRGRDFAAWLGLVPRQFSSGGKERLGKISKAG\n+QADIRRLLIMGAMTQVNWASRKAPAPGSWLARMLARKPRMLVAIALANRMARAIWAMATKQEDYRDPALS\n+VAA\n+>gi|384055709|ref|YP_005485333.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MEQIIRIGMDTSKSVFQLHGVNAKEQPVLRRKLSRREMVKFFEKLPPIEIAIEACGASHYWGRVLSCLGH\n+TVKLIAPQLVKPYVKRGKNDAADAEALCEAMSRPTMRFVPLKSEEEQAALMLIGMRARLIRNRTQLANTI\n+RGYAAEFGITAPKGMCRIEALLDRIAADESLPTLTRELFALHAKEYAELQGEIEQLEGKVMAWHRANECS\n+QRLAKIPGVGPIGAALLMMKTPDPHLFKSGRAFAAWIGLTPRDHSTGGKTRLGRITRAGDEVLRSTLVVG\n+ATAVVSHARRTNGKNASSWLRELLERKKPKLAAVALANKIARIAWKLMVSGEHYKRLLQQPGAAAV\n+>gi|384055710|ref|YP_005485334.1| DNA resolvase [Acetobacter pasteurianus IFO 3283-22]\n+MVPPKPGKTPVGGRLIGYARVSTDDQGTDAQLNELRDAGCTMIFEKHASGADRNRPVLIRLLRDMNAGDT\n+LVVVRLDRLARSVSHLLAVIEQLDYAGAHFRSLDDPIDTTTPQGMFSLQVLGAVAQLDADFFCDGVDGSQ\n+RHRDVPR\n+>gi|384055711|ref|YP_005485335.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MLTSRIHRRKPMGKPMSKATARANAAKSSIRAHVEHVFAHQKNRFNLFIRTIGLARAEAKLTLCNLAYNF\n+NRLIFHERLETAG\n+>gi|384055712|ref|YP_005485336.1| D-mannonate oxidoreductase [Acetobacter pasteurianus IFO 
3283-22]\n+MNLNRNAISHVPDTVYTPRYDPALLRPGIVHLGCGNFHRGHQVVATQAAIDAEGRDGLRWGIVSATMRRP\n+DLATVLQSQDNLYTLLTREPANTVASVMAAITEAVYAGDDNANLAARIADPATAIVTLTVTASGYYLSAD\n+GRLDPTFEAIQADLTAITPRTAPGIIAAGLAQVRQRGGVPPVILCCDNVNSNGATLRQAVIDLAALKGDD\n+LLAAWIETNVQFPDTMVDRIVPTATPDDIADACRLLGGIEDRAPISAEPWFQWVIGEFDGPRPRWVAHPG\n+TKFVSDVGVFERAKLQMLNGTHMLLAYVGALANLNTVSEAASDDALGRIAARFMRNEQTADVSLDTDELD\n+RYTVDLMQRFRNPGIVHEVTRIGRNGSAKMASRIVQPMRSNIEAGRPVDGAVLLIASWIRWFALHEQDEF\n+DIALTDPRAETLRGLCADARDDHKAQAEAFLAMEEVFGAPLPDHGKQVEAIASMLRRLTEESVPELLRTI\n+AH\n+>gi|384055713|ref|YP_005485337.1| phosphatase/phosphohexomutase [Acetobacter pasteurianus IFO 3283-22]\n+MTDTVFPAHLLKHKQEPVHGVVFDMDGLLLDSESLAMEALVFAARDLNYDIPMSFCRTMIGVPADGCRTM\n+VRKTYGQDFPLERFFELQEVHLRNFVDTGKLALKKGVLPLLDLLDTYKIPRAIATSSSRVRTDHHLKLVN\n+LFHRFNAIVTRDDVSKGKPDPEPYLTAAKKIGVNPAHALALEDSHSGARAAHAAGIRVIVVPDLLEATDE\n+IRGKALAIVQDLSIVEAYLKHAITGQA\n+>gi|384055714|ref|YP_005485338.1| hypothetical protein APA22_40090 [Acetobacter pasteurianus IFO 3283-22]\n+MRRDMDLVRQLLLKLEGIEKGPHDVLLIGGNSEEVAVDGRTSDEIYFHLTKIEEAGFLERVGGGAMTAVT\n+FRALSWKGQEFLDTIRDDSIWKKTKEKAGSASFDILAAVAKAVIKDRIKSLTGLDIG\n+>gi|384055715|ref|YP_005485339.1| hypothetical protein APA22_40100 [Acetobacter pasteurianus IFO 3283-22]\n+MRPLGSGLSVRTYGCSEADDQENDGWAKKDTGEIVALYEMSSPVMPSGLVSISRWKIKGCYPKSGLSRAM\n+LCPTKIPQSASNIALLIGSDWSFIEENVFCNHIEWQTCLPVFVMNLDHPA\n+>gi|384055716|ref|YP_005485340.1| DNA helicase superfamily I [Acetobacter pasteurianus IFO 
3283-22]\n+MSSKPSHHSVLSYWHSALLDDAQMKISFSRDNLVALDEEGFEKGKLPPDKTQALRKMHPASRDLAPDDSI\n+IAMAGIRILLGQVSHSTEHSKQPALFCMAMLVNVSPEGTIQPLKDAPPWINRELLEPSDGDVLIGDLATM\n+DTWLQLNPFEGGSLGKTLEWAEKLWNAVTGEDGLPDGYELWERVALQPAEASIGMIATLHQRRFYDTVLA\n+DTGLVTPLLARYIDGGPEPAVVDESQKWAAAGRARGTMTFAYGMSSSQSEAMTAFCSVKDGDILAVNGPP\n+GTGKTTLLQGIVATELVTRALEGGDPAVIVGTSTNNQAVTNIIDAMKKAMASKDSRPWARRWIEGADALG\n+LYFPSGEKEKEALKAGYLIASPGRGLGTMEWKGFPERERDTVDAWASRDAWINGYYGSFYPGVTPPLRKE\n+HLSGHGPQGARHDISLVEDGIAKIRARMKVLVETGRVCAGEARKLNQLYVASGYGTYPDITKAIAQREAL\n+LQERRPREDALKSDLKEKEAAAAVPRARINEENRKTRDLLKQRDDAVHAAGQKVEEVGAHAVALIAALPG\n+GGFFSNLMSGRNWANVERLVAEGRQGSFFRSLMQAQVKSKREWMDAINEMTASAERELATVRESREETRQ\n+ARDTLIQKLEREVAAADLVSKTARAEYDHYVGGSYVLAGRELEKLVTLKHQILQQLQDCCTAIETVLAPS\n+DWAAMFDMPEEKLPWRQSNWTGRLDVIEDFLDR'..b'DEVAPAV\n+RHLISQIQTTIA\n+>gi|384055875|ref|YP_005485499.1| multidrug resistance transporter EmrB/QacA [Acetobacter pasteurianus IFO 3283-22]\n+MGTSMTSSRVTNPLFVLLAASTGCALTVLDTNVVAIILPTIAREFRASFADIEWVISTYVLCFASLLLPA\n+GAIADRYGRRRIYLIGITTFALTSLFCGAAPSATALYLARALQGVSAAFLLAPALAIIGHTFHNPDERNR\n+AWAIWGSIMGLTMVLAPIIGGIIAYALGWRWAFYINIPICVLLAGAVFILVKESRDTDARRLDPVGIIFF\n+AAFMFGLTWGMINGQASGWTSWNALNGFIGGSISLGIFIASERAQSRPMLDLGLFSNPRFLGAVWAMFAY\n+AASAQVMASMLPLFLQNGLGRSALQAGFAMLPFALAMLIFPHIGRLLERHISSSGILAGGLSCVAIGNGI\n+TAWGAYVGSWIIVMAGMVVIGSGGGLLNGETQKAIMSVVPKERSGMASGISTTSRFSGILLGFAMLSGIL\n+ATMVRKWVAAFGCGTGCHHPSDFADAIVAGDLPSAISGLEGSNQEIAIQHAHHAFSYGFAVALLVASIFA\n+LGSSITVFTLMQSKMKQNIT\n+>gi|384055876|ref|YP_005485500.1| transposase, partial [Acetobacter pasteurianus IFO 3283-22]\n+MLAYAVMASVRYQANSLKPKKTQLRTRQSLSAGPFRRSGASS\n+>gi|384055877|ref|YP_005485501.1| transposase [Acetobacter pasteurianus IFO 
3283-22]\n+MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVILVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n+LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGQQADRYCRIIAD\n+HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n+ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n+SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n+CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n+KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n+>gi|384055878|ref|YP_005485502.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVIVVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n+LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGHKADRYCRIIAD\n+HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n+ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n+SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n+CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n+KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n+>gi|384055879|ref|YP_005485503.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n+MLQFSYMSEEADAIAAEIGRRAASGCAWHDIAVIYRQNRLSRAIEEALIQARVPYEIVGDVGFYQRVAVK\n+DALALLSLAARPDDRQSDEAFRADFSHLRQFRVIL\n+>gi|384055880|ref|YP_005485504.1| DNA helicase RecD/TraA [Acetobacter pasteurianus IFO 
3283-22]\n+MTSAVVGEQCQTEALAGLVERVTFHNAENGFCVLRVKVRGQRDLVTVVGHAAMISAGEFVQMSGRWFNDH\n+THGLQFKAEFLKASPPTTVEGIERYLGSGMIRGIGPVYAKKLVKAFGEAVFDLIEQEPHRLREVTGIGPK\n+RAERIVGGWADQKVIREIMLFLHSNGVGTSRAVRIFKTYGQDAVRLISENPYRLAKDIRGIGFKTADQIA\n+RKMGIAPDAMIRVRAGISYALGEAMDEGHCGLPVGELLTSTAELLEVAAPLIETALALELEAGDVVADSV\n+GETSCIFLAGLYRAEQSIAERLRACAVGRPPWPEIDAEKAMTWVEGKTGLAMAPSQQEAVRLALRSKVLV\n+ITGGPGVGKTTLVNAILKIVTAKGTDVQLCAPTGRAAKRLSESTGLEGKTIHRLLETDPGNGSFKRDDTN\n+PLTCDLLVVDEASMVDVLLMRSLLRALPDSASLLIVGDVDQLPSVGPGQVLADIIGSDAVPVVRLTEVFR\n+QAAQSRIITNAHRINEGKMPELSAEEGSDFYFVEAAEPEVGLRKLLAVVKDRIPARFGLDPVRDVQVLCP\n+MNRGGLGARSLNIELQQALNPAGDVKVERFGWTYGPGDKVMQIANDYDRDVFNGDLGVIDKIDVEEGELT\n+VLFDGREVVYGFGELDELVLAYATTIHKSQGSEYPVVVIPLVTQHYTMLARNLLYTGVTRGRKLVVLVGQ\n+KKALAIAVRNQGGRLRWSKLRDWLVGTSGTGHLSRLKKP\n+>gi|384055881|ref|YP_005485505.1| phage integrase [Acetobacter pasteurianus IFO 3283-22]\n+MVESQVSHIQPEYKFHINLDEYDRRATLSADELKVVRRWKEENLVITKRQAPRLHKPLTDILYRSNLDRA\n+NSHRALKYLLLTVAHQEKPYWGWSEDLWVEIINNSPVLKKTGMVPQLIAVAYLLCGFRSVYKIQRNVATA\n+VVARLVFGAEIVDTECERLFSALTRVGFVCQTVRPLVPSVFAAVALQGENPKLESFDRKILEHTRECYTG\n+NHIAKRIGILSNGLAAMGLTSKVIHFRAYPPRHGTETDNINPEWMTWCRRWLETTTLREGSRRAVYNTLT\n+RIGIWLGREHPEVTGPEQWTVSVCADYLAAVDRLRVGDWGGSTFDYRLIPTVGQPLQAPTKVAYYQVMRR\n+FLSDIQSWEWARLRCNPRYHLSTPKNIAKYLGVNPRTIDDASWLKLTWASLNIEPDDLSPDCFYPFALLQ\n+AIAVVWTHAGLRSNEIARLRVGCTREQSEDVVDQSGNVVPAGQVCWLDVPEGKTSVAYTKPVGHAVHKYI\n+TAWMKKRASPRKHLDRRTGEHVHFLFQLRNRPIAKEVLNQTVIPLLCKKAGIPIEDSKGRITSHRGRASA\n+VSMLASVPQGMTIFDLAKWCGHTSVQSTMSYVRSKPTQLASAFAKADQAARMIEIVIDNEVIAAGATKDG\n+APWKYYDLGDSYCSNAFWSTCPHRMACARCYFNIPKPSAKGVVLAAQQAANRLLEEVWLSPEERDAVSGD\n+VEALEGMLNKLRDKPALDGRTPGEISATCGSQVSSPFTESE\n+>gi|384055882|ref|YP_005485506.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MELGITPGQDADITQAEPLLENIEPDAFLADKAYDADRLIDRLIQRGITPVIPPKRNRTTRRVIPP\n'
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b genetic_elements/aragorn.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genetic_elements/aragorn.py Sat Feb 21 17:17:06 2015 +0100
[
@@ -0,0 +1,125 @@
+def delete_galaxy():
+ import sys
+ for index, path in enumerate(sys.path):
+ if "galaxy-dist/" in path:
+ sys.path[index] = ''
+
+# Some modules that rdflib requires also exist inside Galaxy, which breaks the
+# rdflib import below. Blanking Galaxy's sys.path entries is not an elegant
+# solution, but it works for now.
+delete_galaxy()
+
+from rdflib import Graph, URIRef, Literal,Namespace, XSD, BNode,RDF,RDFS,OWL, ConjunctiveGraph, plugin
+
+# Import RDFLib's default Graph implementation.
+from rdflib.graph import Graph
+
+import sys, os
+
+import rdflib
+import subprocess
+import hashlib
+# The `global` statements here are at module scope, where they have no effect;
+# the assignments alone create the module-level names.
+global URI
+global SubClassOfDict
+# Class names (e.g. "Trna") recorded by aragorn() and read by subClassOfBuilder().
+SubClassOfDict = {}
+
+# Base URI under which every node is minted.
+URI = "http://csb.wur.nl/genome/"
+global seeAlso
+# NOTE(review): seeAlso is defined but not referenced elsewhere in this script.
+seeAlso = "rdfs:seeAlso"
+global coreURI
+# Namespace helper: coreURI["x"] yields the URI reference URI + "x".
+coreURI = Namespace(URI)
+
+def createClass(uri):
+ #genomeGraph.add((uri,RDF.type,OWL.Class))
+ #genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
+ #genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+ #genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
+ #genomeGraph.add((uri,RDFS.subClassOf,coreURI["Rna"]))
+ return uri
+
+def tmp():
+ import time
+ global tmpFolder
+ tmpFolder = "/tmp/"+str(time.time())+"/"
+ os.mkdir(tmpFolder)
+
+def query():
+ global genomeGraph
+ genomeGraph = Graph()
+ filename = sys.argv[1]
+ genomeGraph.parse(filename, format="turtle")
+ qres = genomeGraph.query('select ?class ?sequence where {?class a ssb:DnaObject . ?class ssb:sequence ?sequence .}')
+ sequences = []
+ for row in qres:
+ print ("Header:",row[0])
+ sequences += [[">"+str(row[0]),str(row[1].strip())]] #.replace("/","-").replace("","")
+
+ return sequences
+
+def aragorn(sequences):
+ for sequence in sequences:
+ #Call aragorn for each contig, for ease of parsing
+ open(tmpFolder+"tmp.seq","w").write('\n'.join(sequence))
+ folder = os.path.realpath(__file__).rsplit("/",2)[0]+"/"
+ cmd = folder+"/tools/aragorn1.2.36/aragorn -fasta "+tmpFolder+"tmp.seq "+' '.join(sys.argv[3:-2])+" > "+tmpFolder+"aragorn.output"
+ print (cmd)
+ os.system(cmd)
+ aragorn = open(tmpFolder+"aragorn.output").readlines()
+#  string = ''.join(aragorn)
+
+ contig = sequence[0].strip(">").replace("http://csb.wur.nl/genome/","")
+ dnaobjectURI = coreURI[contig]
+ #print (contig)
+ for line in aragorn:
+ if ">" in line:
+ print (line.split())
+ try:
+ trna, pos = line.split()[1:]
+ except:
+ try:
+ trna, pos = line.split()
+ except:
+ if "(Permuted)" in line:
+ trna, permute, pos = line.split()[1:]
+
+ if "tRNA-" in line:
+ trna, codon = (trna.strip(">)").split("(",1))
+ else:
+ trna = trna.strip(">").strip() #Actually a tmRNA...
+ codon = ''
+ trnaClass = createClass(coreURI[trna.split("-")[0].title()]) #trna or tmrna
+ SubClassOfDict[trna.split("-")[0].title()] = 1
+ if "c" in pos[0]: #complementary
+ stop, start = pos.split("[")[1].split("]")[0].split(",")
+ else:
+ start, stop = pos.split("[")[1].split("]")[0].split(",")
+ trnaURI = coreURI[contig+"/trna-aragorn_1_2_36-"+trna.lower() +"/"+ start +"_"+ stop]
+ genomeGraph.add((dnaobjectURI, coreURI["feature"] , trnaURI))
+ genomeGraph.add((trnaURI, RDF.type,trnaClass))
+ genomeGraph.add((trnaURI, coreURI["begin"] , Literal(start,datatype=XSD.integer)))
+ genomeGraph.add((trnaURI, coreURI["end"] , Literal(stop,datatype=XSD.integer)))
+ genomeGraph.add((trnaURI, coreURI["trna_type"] , Literal(trna)))
+ genomeGraph.add((trnaURI, coreURI["trna_anti"] , Literal(codon)))
+ genomeGraph.add((trnaURI, coreURI["tool"] , Literal("aragorn")))
+ genomeGraph.add((trnaURI, coreURI["version"] , Literal("1.2.36")))
+ genomeGraph.add((trnaURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+
+def subClassOfBuilder():
+ for subclass in SubClassOfDict:
+ genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+ genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
+ genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
+ genomeGraph.add((coreURI["Rna"], RDF.type,OWL.Class))
+
+def save():
+ #Create the subclass off instances
+ #subClassOfBuilder()
+ ## Saves the file
+ data = genomeGraph.serialize(format='turtle')
+ open(sys.argv[2],"wb").write(data)
+
+def main():
+ tmp()
+ sequences = query()
+ aragorn(sequences)
+ save()
+
+main()
b
diff -r 10cad758ed0f -r 74b8ba5e2d5b genetic_elements/aragorn.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genetic_elements/aragorn.xml Sat Feb 21 17:17:06 2015 +0100
b
@@ -0,0 +1,77 @@
+<tool id="SAPP_aragorn_trna" name="tRNA and tmRNA" version="0.3">
+    <requirements>
+        <requirement type='package' version="3.4">python</requirement>
+        <requirement type='package' version="1.0">rdflib</requirement>
+        <requirement type="package" version="1.2.36">aragorn</requirement>
+    </requirements>
+    <description>Aragorn</description>
+    <command interpreter="python3.4">aragorn.py '$input' '$output' '-gc$genbank_gencode' '$tmRNA' '$tRNA' '$topology' '-fon' '-sourcedb' 'SAPP'
+    </command>
+    <inputs>
+        <param name="input" type="data" format="rdf" label="RDF Genome"/>
+
+        <!-- Genetic code table number; the command template passes it to aragorn.py as '-gc<value>'.
+             The default option is marked with the 'selected' attribute. -->
+        <param name="genbank_gencode" type="select" label="Genetic code">
+            <option value="1" selected="true">1. Standard</option>
+            <option value="2">2. Vertebrate Mitochondrial</option>
+            <option value="3">3. Yeast Mitochondrial</option>
+            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+            <option value="5">5. Invertebrate Mitochondrial</option>
+            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+            <option value="9">9. Echinoderm Mitochondrial</option>
+            <option value="10">10. Euplotid Nuclear</option>
+            <option value="11">11. Bacteria and Archaea</option>
+            <option value="12">12. Alternative Yeast Nuclear</option>
+            <option value="13">13. Ascidian Mitochondrial</option>
+            <option value="14">14. Flatworm Mitochondrial</option>
+            <option value="15">15. Blepharisma Macronuclear</option>
+            <option value="16">16. Chlorophycean Mitochondrial</option>
+            <option value="21">21. Trematode Mitochondrial</option>
+            <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+            <option value="23">23. Thraustochytrium Mitochondrial</option>
+            <option value="24">24. Pterobranchia mitochondrial</option>
+        </param>
+        <param name="topology" type="select" label="Topology">
+            <option value="-c">Assume that each sequence has a circular topology</option>
+            <option value="-l">Assume that each sequence has a linear topology</option>
+        </param>
+        <param name='tmRNA' type='boolean' label='Search for tmRNA genes (-m)' truevalue='-m' falsevalue='' checked="true" help='' />
+        <param name='tRNA' type='boolean' label='Search for tRNA genes (-t)' truevalue='-t' falsevalue='' checked="true" help='' />
+    </inputs>
+    <outputs>
+        <data format="rdf" name="output" label="Aragorn: ${input.name}"></data>
+    </outputs>
+    <help>
+
+**What it does**
+
+Aragorn_ predicts tRNA (and tmRNA) in nucleotide sequences.
+
+.. _Aragorn: http://mbio-serv2.mbioekol.lu.se/ARAGORN/
+
+-----
+
+It requires an RDF genome file
+
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @article{Laslett2004,
+abstract = {A computer program, ARAGORN, identifies tRNA and tmRNA genes. The program employs heuristic algorithms to predict tRNA secondary structure, based on homology with recognized tRNA consensus sequences and ability to form a base-paired cloverleaf. tmRNA genes are identified using a modified version of the BRUCE program. ARAGORN achieves a detection sensitivity of 99\% from a set of 1290 eubacterial, eukaryotic and archaeal tRNA genes and detects all complete tmRNA sequences in the tmRNA database, improving on the performance of the BRUCE program. Recently discovered tmRNA genes in the chloroplasts of two species from the 'green' algae lineage are detected. The output of the program reports the proposed tRNA secondary structure and, for tmRNA genes, the secondary structure of the tRNA domain, the tmRNA gene sequence, the tag peptide and a list of organisms with matching tmRNA peptide tags.},
+author = {Laslett, Dean and Canback, Bjorn},
+doi = {10.1093/nar/gkh152},
+file = {:Users/koeho006/Library/Application Support/Mendeley Desktop/Downloaded/Laslett, Canback - 2004 - ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences.pdf:pdf},
+isbn = {1362-4962 (Electronic)$\backslash$n1362-4962 (Linking)},
+issn = {03051048},
+journal = {Nucleic Acids Research},
+mendeley-groups = {VAPP Application note},
+pages = {11--16},
+pmid = {14704338},
+title = {{ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences}},
+volume = {32},
+year = {2004}
+}
+</citation>
+</citations>
+
+</tool>
+