Repository 'sapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjkoehorst/sapp

Changeset 31:957156367442 (2016-06-29)
Previous changeset 30:0a947cb25a3d (2016-06-29) Next changeset 32:7b519ee3ea3b (2016-06-29)
Commit message:
Uploaded
added:
._sappDocker
sappDocker/._.DS_Store
sappDocker/._annotation.xml
sappDocker/._aragorn.xml
sappDocker/._circos.xml
sappDocker/._crt.xml
sappDocker/._enzdp.xml
sappDocker/._fasta2rdf.xml
sappDocker/._gbk2rdf.xml
sappDocker/._genecaller.xml
sappDocker/._genomeInformation.xml
sappDocker/._interproscan.xml
sappDocker/._ipath.xml
sappDocker/._loader.xml
sappDocker/._locustagger.xml
sappDocker/._matrix.xml
sappDocker/._merger.xml
sappDocker/._pathwayAnalysis.xml
sappDocker/._phylogeny.xml
sappDocker/._priam.xml
sappDocker/._rdf2embl.xml
sappDocker/._rnammer.xml
sappDocker/._signalp.xml
sappDocker/._sparql.xml
sappDocker/._swisscog.xml
sappDocker/._tmhmm.xml
sappDocker/annotation.xml
sappDocker/aragorn.xml
sappDocker/circos.xml
sappDocker/crt.xml
sappDocker/enzdp.xml
sappDocker/fasta2rdf.xml
sappDocker/gbk2rdf.xml
sappDocker/genecaller.xml
sappDocker/genomeInformation.xml
sappDocker/interproscan.xml
sappDocker/ipath.xml
sappDocker/loader.xml
sappDocker/locustagger.xml
sappDocker/matrix.xml
sappDocker/merger.xml
sappDocker/pathwayAnalysis.xml
sappDocker/phylogeny.xml
sappDocker/priam.xml
sappDocker/rdf2embl.xml
sappDocker/rnammer.xml
sappDocker/signalp.xml
sappDocker/sparql.xml
sappDocker/swisscog.xml
sappDocker/tmhmm.xml
removed:
.project
.pydevproject
conversion/fasta2rdf/fastatordf.py
conversion/fasta2rdf/fastatordf.xml
conversion/fasta2rdf/test-data/NC_017117.fna
conversion/gbk2rdf/gbktordf.py
conversion/gbk2rdf/gbktordf.xml
conversion/gbk2rdf/repository_dependencies.xml
conversion/gbk2rdf/test-data/CP009049.embl
conversion/gbk2rdf/test-data/NC_010067.gbk
conversion/gbk2rdf/tool_dependencies.xml
conversion/protein2rdf/protein_to_ttl.py
conversion/protein2rdf/protein_to_ttl.xml
conversion/protein2rdf/test-data/NC_017117.faa
genetic_elements/aragorn/aragorn.py
genetic_elements/aragorn/aragorn.xml
genetic_elements/aragorn/test-data/NC_017117.rdf
genetic_elements/aragorn/tool_dependencies.xml
sappDocker.zip
b
diff -r 0a947cb25a3d -r 957156367442 ._sappDocker
b
Binary file ._sappDocker has changed
b
diff -r 0a947cb25a3d -r 957156367442 .project
--- a/.project Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
- <name>sapp</name>
- <comment></comment>
- <projects>
- </projects>
- <buildSpec>
- <buildCommand>
- <name>org.python.pydev.PyDevBuilder</name>
- <arguments>
- </arguments>
- </buildCommand>
- </buildSpec>
- <natures>
- <nature>org.python.pydev.pythonNature</nature>
- </natures>
-</projectDescription>
b
diff -r 0a947cb25a3d -r 957156367442 .pydevproject
--- a/.pydevproject Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,5 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?eclipse-pydev version="1.0"?><pydev_project>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
-</pydev_project>
b
diff -r 0a947cb25a3d -r 957156367442 conversion/fasta2rdf/fastatordf.py
--- a/conversion/fasta2rdf/fastatordf.py Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3.4
-# Author: Jasper Jan Koehorst
-# Date created: Jan 22 2015
-# Function: generation of a RDF file from a genome fasta file
-
-
-# from io import StringIO
-from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
-# import rdflib
-from rdflib.store import Store
-import sys
-
-store = plugin.get('IOMemory', Store)()
-
-global URI
-URI = "http://csb.wur.nl/genome/"
-global seeAlso
-seeAlso = "rdfs:seeAlso"
-global coreURI
-coreURI = Namespace(URI)
-global genomeGraph
-store = plugin.get('IOMemory', Store)()
-genomeGraph = Graph(store,URIRef(URI))
-genomeGraph.bind("ssb",coreURI)
-
-def delete_galaxy():
- for index, path in enumerate(sys.path):
- if "galaxy-dist/" in path:
- sys.path[index] = ''
-
-def createClass(uri):
- genomeGraph.add((uri,RDF.type,OWL.Class))
- genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
- return uri
-
-def fasta_parser(input_file):
- createClass(coreURI["Genome"])            #Genome class
- createClass(coreURI["Type"])                #Type class (Chr,Pls,Scaffold)
-
- genomeDict = {}
-
- sequence = ""
- genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
- if genomeID == 'None':
- genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
-
- genomeURI = coreURI[genomeID]
- for index, element in enumerate(sys.argv):
- if '-organism' == element:
- genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
- if '-ncbi_taxid' == element:
- genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
- if '-idtag' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
- if '-ids' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
-
- genomeDict[genomeID] = {}
-
- #Generating genome dictionary
- data = open(input_file).readlines()
- fastadict = {}
- key = ""
- for index, line in enumerate(data):
- if ">" == line[0]:
- key = line.strip(">").strip()
- fastadict[key] = ""
- else:
- fastadict[key] += line.strip()
-
- genomeClass = createClass(coreURI["Genome"])
- typeClass = createClass(coreURI["DnaObject"])
- for index, genome in enumerate(fastadict):
- typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
- sequence = fastadict[genome]
- genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
- genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
- genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
- genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((genomeURI, RDF.type,genomeClass))
- genomeGraph.add((typeURI, RDF.type,typeClass))
-
-def save():
- data = genomeGraph.serialize(format='turtle')
- open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
-
-def main():
- input_file = sys.argv[sys.argv.index("-input")+1]
- fasta_parser(input_file)
- save()
-
-if __name__ == '__main__':
- #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
- delete_galaxy()
- main()
-
b
diff -r 0a947cb25a3d -r 957156367442 conversion/fasta2rdf/fastatordf.xml
--- a/conversion/fasta2rdf/fastatordf.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,38 +0,0 @@
-<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
-    <requirements>
-        <requirement type='package' version="3.4">python</requirement>
-        <requirement type='package' version="1.0">rdflib</requirement>
-    </requirements>
- <description></description>
- <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
- #for $index, $id in enumerate( $ids ) 
- '-ids' '$id.id_tag'
- #end for
- '-id_alternative' '$input.name'
- </command>
- <inputs>
- <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
- <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/>
- <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
- <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/>
- <repeat name="ids" title="Identification tags">     
- <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
- </repeat>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="genomeTTL: ${input.name}" />
- </outputs>
-    
-    <tests>
-        <test>
-            <param name="input" value="test-data/NC_017117.fna"/>
-            <output name="$output" file="NC_017117.rdf"/>
-            <output name="$ncbi_taxid" value="634455"/>
-            <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/>
-            <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/>
-        </test>
-    </tests>
-
-<help> Genome FASTA file to RDF</help>
-</tool>
b
diff -r 0a947cb25a3d -r 957156367442 conversion/fasta2rdf/test-data/NC_017117.fna
--- a/conversion/fasta2rdf/test-data/NC_017117.fna Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,2736 +0,0 @@\n->gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n-CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n-TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n-GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n-CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n-GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n-TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n-TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n-GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n-TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n-AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n-CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n-GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n-TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n-GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n-TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n-CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n-AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n-AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n-CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n-GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n-GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n-GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n-CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n-CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n-TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n-AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTG
ACCTGCAGAGCTTCCGGCGCGGGCGC\n-GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n-TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n-CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n-GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n-ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n-GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n-TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n-CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n-GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n-CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n-CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n-ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n-AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n-AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n-GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n-GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n-CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n-GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n-ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n-GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n-GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n-GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n-TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n-CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n-GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n-GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n-TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n-
GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n-AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n-ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n-GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n-AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n-GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n-TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n-CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n-GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n-ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n-TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n-TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n-TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n-GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n-TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n-TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n-GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n-TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n-ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n-CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n-GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n-GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n-TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n-ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n-AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n-AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n-AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTT
CCCATTTTTTGGGGAGACAG\n-ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n-GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n-GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n-GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n-ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n-CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n-AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n-GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n-ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n-TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n-GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n-AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n-AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n-GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n-ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n-GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n-GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n-GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n-AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n-GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n-TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n-CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n-ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n-TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n-ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n-TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n-CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n-GCACGT
CCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n-ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n'
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/gbktordf.py
--- a/conversion/gbk2rdf/gbktordf.py Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,360 +0,0 @@\n-#!/usr/bin/env python3.4\n-# Author: Jasper Jan Koehorst\n-# Date created: Feb 21 2015\n-# Function: generation of a RDF file from Genbank/EMBL\n-\n-import warnings\n-warnings.filterwarnings("ignore")\n-\n-def delete_galaxy():\n-\timport sys\n-\tfor index, path in enumerate(sys.path):\n-\t\tif "galaxy-dist/" in path:\n-\t\t\tsys.path[index] = \'\'\n-\n-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.\n-delete_galaxy()\n-\n-from Bio import SeqIO\n-# Import RDFLib\'s default Graph implementation.\n-import os, sys\n-from Bio.Seq import Seq\n-\n-from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin\n-from rdflib.store import Store\n-import hashlib\n-store = plugin.get(\'IOMemory\', Store)()\n-\n-global URI\n-URI = "http://csb.wur.nl/genome/"\n-global seeAlso\n-seeAlso = "rdfs:seeAlso"\n-global coreURI\n-coreURI = Namespace(URI)\n-\n-global SubClassOfDict\n-SubClassOfDict = {}\n-global SubClassOfDictRna\n-SubClassOfDictRna = {}\n-\n-def createClass(uri, root=True):\n-\tgenomeGraph.add((uri,RDF.type,OWL.Class))\n-\tif root:\n-\t\tgenomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))\n-\treturn uri\n-\n-def tmp():\n-\timport time\n-\tglobal tmpFolder\n-\ttmpFolder = "/tmp/"+str(time.time())+"/"\n-\tos.mkdir(tmpFolder)\n-\n-def cleantmp():\n-\tos.system("ls "+tmpFolder)\n-\tos.system("rm -rf "+tmpFolder)\n-\n-def crawler():\n-\t#From input folder it looks for GBK file (gz files are in progress)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\tgbk_parser(input_file)\n-\n-def gbk_parser():\n-\tprevObjStart = -1\n-\tprevObjStop = -1\t\n-\tstore = plugin.get(\'IOMemory\', Store)()\n-\tglobal genomeGraph\n-\tgenomeGraph = Graph(store,URIRef(URI))\n-\tgenomeGraph.bind("ssb",coreURI)\n-\tinput_file = sys.argv[sys.argv.index("-input")+1]\n-\n-\t#CLASS definitions\n-\tgenomeClass = createClass(coreURI["Genome"], 
root=True)\n-\ttypeClass = createClass(coreURI["DnaObject"], root=True)\n-\tcreateClass(coreURI["Protein"], root=True)\n-\tpubmedClass = createClass(coreURI["Pubmed"], root=True)\n-\tmiscClass = createClass(coreURI["MiscFeature"], root=False)\n-\tcreateClass(coreURI["Feature"], root=True)\n-\tSubClassOfDict["MiscFeature"] = 1\n-\tSubClassOfDictRna["Trna"] = 1\n-\tSubClassOfDictRna["Rrna"] = 1\n-\tSubClassOfDictRna["Tmrna"] = 1\n-\tSubClassOfDictRna["Ncrna"] = 1\n-\n-# \tcodon = "11" #Default initialization if no CDS are present\n-\t##################\n-\tweird_chars = list(\'\'\',./?<>:;"\'|\\}]{[+=_-)(*&^%$#@!\xc2\xb1\xc2\xa7~` \'\'\')\n-\tscaf_value = 0\n-\t#Which files are already done\n-\t########\n-\tformatGBK = sys.argv[sys.argv.index("-format")+1]\n-\tfor record in SeqIO.parse(input_file, formatGBK):\n-\t\t#Read first feature for genome name and information...\n-\t\t#Ignore the empty GBK file due to the lack of features?\n-\n-\t\tfor index, feature in enumerate(record.features):\n-\t\t\tif index == 0:\n-\t\t\t\tif "-identifier" in sys.argv:\n-\t\t\t\t\tgenome = sys.argv[sys.argv.index("-identifier")+1]\n-\t\t\t\telse:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgenome = feature.qualifiers["organism"][0].replace(" ","_")\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\t#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS\n-\t\t\t\t\t\tgenome = "XNoneX"\n-\t\t\t\tfor char in weird_chars:\n-\t\t\t\t\tgenome = genome.replace(char,"_")\n-\n-\t\t\t\ttry:\n-\t\t\t\t\tgi = record.annotations["gi"]\n-\t\t\t\t\ttyp = str(gi)\n-\t\t\t\texcept:\n-\t\t\t\t\ttry:\n-\t\t\t\t\t\tgi = record.annotations["accessions"][0]\n-\t\t\t\t\t\ttyp = str(gi)\n-\t\t\t\t\texcept:\n-\t\t\t\t\t\tscaf_value += 1\n-\t\t\t\t\t\ttyp = "scaffold_"+str(scaf_value)\n-\t\t\t\tgenomeURI = coreURI[genome]\n-\t\t\t\tgbkURI = coreURI[genome + "/" + typ]\n-\t\t\t\t#To contig connection to connect all data to it\n-\t\t\t\tgenomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))\n-\n-\t\t\t\t#General genome features 
also stored in the class...\n-\t\t\t\tif "genome" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))\n-\t\t\t\tif "strain" in feature.qualifiers:\n-\t\t\t\t\tgenomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))\n-\t\t\t\tif "taxonomy" in record.annotations:\n-\t\t\t\t\tfo'..b'a" and feature_type.lower() != "ncrna":\n-\t\t\tSubClassOfDict[feature_type.lower().title()] = 1\n-\tfor key in feature.qualifiers:\n-\t\tvalues = feature.qualifiers[key]\n-\t\tif key == "translation":\n-\t\t\tpass\n-\t\telif type(values) == list:\n-\t\t\tfor v in values:\n-\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\telse:\n-\t\t\tint_add(generalURI,coreURI[key.lower()],values)\n-\tif feature.type == "CDS":\n-\t\ttry:\n-\t\t\t#Feature is normally submitted to this function\n-\t\t\t#IF a subfeature is submitted it is submitted as a feature\n-\t\t\t#And subfeature variable will contain the superfeature\n-\t\t\tif superfeature:\n-\t\t\t\tcodon = superfeature.qualifiers["transl_table"][0]\n-\t\texcept:\n-\t\t\t#Default codon table 11\n-\t\t\tcodon = "11"\n-\t\t#Protein linkage\n-\t\ttranslation = ""\n-\t\ttry:\n-\t\t\ttranslation = feature.qualifiers["translation"][0].strip("*")\n-\t\texcept KeyError:\n-\t\t\t#When protein sequence is not given...\n-\t\t\tif len(feature.location.parts) > 1:\n-\t\t\t\t#Exon boundaries?\n-\t\t\t\tseq = \'\'\n-\t\t\t\tfor loc in feature.location:\n-\t\t\t\t\tseq += record.seq[loc]\n-\t\t\t\tif int(feature.location.strand) == -1:\n-\t\t\t\t\tseq = Seq(seq).complement()\n-\t\t\t\telse:\n-\t\t\t\t\tseq = Seq(seq)\n-\t\t\t\ttranslation = str(seq.translate(feature.qualifiers["transl_table"][0]))\n-\t\t\telif int(feature.location.strand) == -1:\n-\t\t\t\tif str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:\n-\t\t\t\t\tif 
len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\telif int(feature.location.strand) == +1:\n-\t\t\t\t\tif len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:\n-\t\t\t\t\t\ttranslation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))\n-\t\t\t\t\telse:\n-\t\t\t\t\t\ttranslation = \'\'\n-\t\t\t\n-\t\t\tif translation:\n-\t\t\t\ttranslation = list(translation)\n-\t\t\t\ttranslation[0] = "M"\n-\t\t\t\ttranslation = \'\'.join(translation).strip("*")\n-\t\t\t\tif "*" in translation:\n-\t\t\t\t\tpass\t\t\n-\n-\t\ttranslation = translation.encode(\'utf-8\')\n-\t\tmd5_protein = hashlib.md5(translation).hexdigest()\n-\t\tproteinURI = coreURI["protein/"+md5_protein]\n-\t\tgenomeGraph.add((generalURI,coreURI["protein"],proteinURI))\n-\t\tfor key in feature.qualifiers:\n-\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\tif key == "translation":\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))\n-\t\t\t\t\tgenomeGraph.add((proteinURI,RDF.type,proteinClass))\n-\t\t\t\telse:\n-\t\t\t\t\tfor v in feature.qualifiers[key]:\n-\t\t\t\t\t\tint_add(generalURI,coreURI[key.lower()],v)\n-\t\n-def int_add(subject, predicate, obj):\n-\ttry:\n-\t\tobject_float = float(obj.replace(\'"\',\'\'))\n-\t\tobject_int = int(obj.replace(\'"\',\'\'))\n-\t\tif object_int == object_float:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_int)))\n-\t\telse:\n-\t\t\tgenomeGraph.add((subject,predicate,Literal(object_float)))\n-\texcept:\n-\t\tgenomeGraph.add((subject,predicate,Literal(obj.replace(\'"\',\'\'))))\n-\t\t\t\t\n-def save():\n-\tdata = 
genomeGraph.serialize(format=\'turtle\')\n-\topen(sys.argv[sys.argv.index("-output")+1],"wb").write(data)\n-\n-def subClassOfBuilder():\n-\tfor subclass in SubClassOfDict:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))\n-\n-def subClassOfBuilderRna():\n-\tfor subclass in SubClassOfDictRna:\n-\t\tgenomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))\n-\t\tgenomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))\n-\t\tgenomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))\n-\n-def main():\n-\ttmp()\n-\tgbk_parser()\n-\tsubClassOfBuilder()\n-\tsubClassOfBuilderRna()\n-\tsave()\n-\tcleantmp()\n-\n-if __name__ == "__main__":\n-\tmain()\n\\ No newline at end of file\n'
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/gbktordf.xml
--- a/conversion/gbk2rdf/gbktordf.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,39 +0,0 @@
-<tool id="SAPP_genbank_to_ttl" name="EMBL/GBK to RDF" version="0.1">
- <requirements>
-     <requirement type='package' version="3.4">python</requirement>
-     <requirement type='package' version="1.0">rdflib</requirement>
- <requirement type="package" version="1.65">biopython</requirement>
- </requirements>
- <description>Genbank to RDF conversion</description>
- <command interpreter="python3.4">gbktordf.py '-input' '$input' -output '$output' -sourcedb "$format" -format "$format"</command>
- <inputs>
- <param name="input" type="data" format="gbk,gb,genbank,embl" label="Genbank file"/>
- <param name="format" type="select" label="EMBL/GBK">
- <option value="genbank" selected="true"> Genbank</option>
- <option value="embl"> EMBL </option>
- </param>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="GBKttl: ${input.name}" />
- </outputs>
-
- <tests>
-     <test>
-       <param name="input" value="test-data/NC_010067.gbk"/>
-       <output name="$output" file="NC_010067.rdf"/>
-       <output name="$format" value="genbank"/>
-       <output name="$sourcedb" value="genbank"/>
-     </test>
-     <test>
-       <param name="input" value="test-data/CP009049.embl"/>
-       <output name="$output" file="CP009049.rdf"/>
-       <output name="$format" value="embl"/>
-       <output name="$sourcedb" value="embl"/>
-     </test>
-  </tests>
-  
- <help>
- Genbank or EMBL to RDF conversion
- </help>
-</tool>
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/repository_dependencies.xml 
--- a/conversion/gbk2rdf/repository_dependencies.xml  Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,4 +0,0 @@
-<?xml version="1.0"?>
-<repositories description="Requires Biopython as a dependency.">
-  <repository name="package_biopython_1_65" owner="biopython" />
-</repositories>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/test-data/CP009049.embl
--- a/conversion/gbk2rdf/test-data/CP009049.embl Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,157312 +0,0 @@\n-ID   CP009049; SV 1; circular; genomic DNA; STD; PRO; 4599018 BP.\n-XX\n-AC   CP009049;\n-XX\n-PR   Project:PRJNA255737;\n-XX\n-DT   13-FEB-2015 (Rel. 123, Created)\n-DT   13-FEB-2015 (Rel. 123, Last updated, Version 1)\n-XX\n-DE   Salmonella enterica subsp. enterica serovar Paratyphi A strain CMCC 50973,\n-DE   complete genome.\n-XX\n-KW   .\n-XX\n-OS   Salmonella enterica subsp. enterica serovar Paratyphi A\n-OC   Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n-OC   Enterobacteriaceae; Salmonella.\n-XX\n-RN   [1]\n-RP   1-4599018\n-RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT   "Whole Genome Sequences of two Salmonella paratyphi A strains";\n-RL   Unpublished.\n-XX\n-RN   [2]\n-RP   1-4599018\n-RA   Wang B., Liang H., Liu X., Zhu L., Wang H., Zeng M.;\n-RT   ;\n-RL   Submitted (24-JUL-2014) to the INSDC.\n-RL   State Key Laboratory of Pathogen and Biosecurity, Beijing Institute of\n-RL   Biotechnology, 20 Dongdajie, Fengtai District, Beijing, Beijing 100071,\n-RL   China\n-XX\n-DR   MD5; e41a6215bf412b701febd8d4b182ec0c.\n-DR   BioSample; SAMN02909989.\n-XX\n-CC   Source DNA/bacteria are available from National Center for  Medical\n-CC   Culture Collection (CMCC) in China.\n-CC   Annotation was added by the NCBI Prokaryotic Genome Annotation\n-CC   Pipeline (released 2013). Information about the Pipeline can be\n-CC   found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/\n-CC   ##Genome-Assembly-Data-START##\n-CC   Assembly Method       :: SOAPdenovo v. 
2011.04\n-CC   Assembly Name         :: CMCC(B) 50973\n-CC   Genome Coverage       :: 133x\n-CC   Sequencing Technology :: Illumina\n-CC   ##Genome-Assembly-Data-END##\n-CC   ##Genome-Annotation-Data-START##\n-CC   Annotation Provider          :: NCBI\n-CC   Annotation Date              :: 07/25/2014 13:43:31\n-CC   Annotation Pipeline          :: NCBI Prokaryotic Genome Annotation\n-CC                                   Pipeline\n-CC   Annotation Method            :: Best-placed reference protein set;\n-CC                                   GeneMarkS+\n-CC   Annotation Software revision :: 2.6 (rev. 440435)\n-CC   Features Annotated           :: Gene; CDS; rRNA; tRNA; ncRNA;\n-CC                                   repeat_region\n-CC   Genes                        :: 4,309\n-CC   CDS                          :: 4,016\n-CC   Pseudo Genes                 :: 166\n-CC   CRISPR Arrays                :: 2\n-CC   rRNAs                        :: 20 ( 5S, 16S, 23S )\n-CC   tRNAs                        :: 100\n-CC   ncRNA                        :: 7\n-CC   Frameshifted Genes           :: 106\n-CC   ##Genome-Annotation-Data-END##\n-XX\n-FH   Key             Location/Qualifiers\n-FH\n-FT   source          1..4599018\n-FT                   /organism="Salmonella enterica subsp. 
enterica serovar\n-FT                   Paratyphi A"\n-FT                   /host="Homo sapiens"\n-FT                   /sub_species="enterica"\n-FT                   /strain="CMCC 50973"\n-FT                   /mol_type="genomic DNA"\n-FT                   /country="China:Jiangsu"\n-FT                   /lat_lon="32.04 N 118.78 E"\n-FT                   /collection_date="2003-06-01"\n-FT                   /serovar="Paratyphi A"\n-FT                   /db_xref="taxon:54388"\n-FT                   /culture_collection="CMCC:50973"\n-FT   gene            complement(129..713)\n-FT                   /gene="mobA"\n-FT                   /locus_tag="IT63_00010"\n-FT   CDS             complement(129..713)\n-FT                   /codon_start=1\n-FT                   /transl_table=11\n-FT                   /gene="mobA"\n-FT                   /locus_tag="IT63_00010"\n-FT                   /product="molybdopterin-guanine dinucleotide biosynthesis\n-FT                   protein MobA"\n-FT                   /note="in Escherichia coli MobA links a guanosine\n-FT                   5\'-phosphate to molydopterin to form molybdopterin guanine\n-FT                   dinucleotide during molybdenum cofactor biosynthesis;\n-FT                   Derived by automated c'..b'cgag cgaacgggga ggagcccaga gcctgaatca gcatgtgtgt   4596180\n-     tagtggaagc gtctggaaag gcgcgcgata cagggtgaca gccccgtaca caaaagcgca   4596240\n-     tgtgctgtga gctcgatgag tagggcggga cacgtggtat cctgtctgaa tatgggggga   4596300\n-     ccatcctcca aggctaaata ctaattttgc tctttaaaaa tctggatcaa gctgaaaatt   4596360\n-     gaaacacaga acaacgaaag ttgttcgtga gtctctcaaa ttttcgcaac acgatgatga   4596420\n-     atcgtaagaa acatcttcgg gttgtgaggt taagcgacta agcgtacacg gtggatgccc   4596480\n-     tggcagtcag aggcgatgaa ggacgtgcta atctgcgata agcgccggta aggtgatatg   4596540\n-     aaccgttata accggcgatt tccgaatggg gaaacccagt gtgattcgtc acactatcat   4596600\n-     taactgaatc cataggttaa tgaggcgaac cgggggaact gaaacatcta agtaccccga   4596660\n-     
ggaaaagaaa tcaaccgaga ttcccccagt agcggcgagc gaacggggag gagcccagag   4596720\n-     cctgaatcag catgtgtgtt agtggaagcg tctggaaagg cgcgcgatac agggtgacag   4596780\n-     ccccgtacac aaaagcgcat gtgctgtgag ctcgatgagt agggcgggac acgtggtatc   4596840\n-     ctgtctgaat atggggggac catcctccaa ggctaaatac tcctgactga ccgatagtga   4596900\n-     accagtaccg tgagggaaag gcgaaaagaa ccccggcgag gggagtgaaa aagaacctga   4596960\n-     aaccgtgtac gtacaagcag tgggagcaca ggtttacctg tgtgactgcg taccttttgt   4597020\n-     ataatgggtc agcgacttat attctgtagc aaggttaacc gtatagggga gccggaggga   4597080\n-     aaccgagtct taaccgggcg ttaagttgca gggtatagac ccgaaacccg gtgatctagc   4597140\n-     catgggcagg ttgaaggttg ggtaacacta actggaggac cgaaccgact aatgttgaaa   4597200\n-     aattagcgga tgacctgtgg ctgggggtga aaggccaatc aaaccgggag atagctggtt   4597260\n-     ctccccgaaa gctatttagg tagcgcctcg tgaattcatc tccgggggta gagcactgtt   4597320\n-     tcggctaggg ggccatcccg gcttaccaac ccgatgcaaa ctgcgaatac cggagaatgt   4597380\n-     tatcacggga gacacacggc gggtgctaac gtccgtcgtg aagagggaaa caacccagac   4597440\n-     cgccagctaa ggtcccaaag tcatggttaa gtgggaaacg atgtgggaag gcccagacag   4597500\n-     ccaggatgtt ggcttagaag cagccatcat ttaaagaaag cgtaatagct cactggtcga   4597560\n-     gtcggcctgc gcggaagatg taacggggct aaaccatgca ccgaagctgc ggcagcgaca   4597620\n-     ctcaggtgtt gttgggtagg ggagcgttct gtaagcctgt gaaggtggcc tgtgagggtt   4597680\n-     gctggaggta tcagaagtgc gaatgctgac ataagtaacg ataaagcggg tgaaaagccc   4597740\n-     gctcgccgga agaccaaggg ttcctgtcca acgttaatcg gggcagggtg agtcgacccc   4597800\n-     taaggcgagg ccgaaaggcg tagtcgatgg gaaacgggtt aatattcccg tacttggtgt   4597860\n-     tactgcgaag ggggggacgg agaaggctat gttggccggg cgacggttgt cccggtttaa   4597920\n-     gcgtgtaggt gtgtgttcca ggtaaatccg gttcacttta acactgaggc gtgacgacga   4597980\n-     ggcactacgg tgctgaagca acaaatgccc tgcttccagg aaaagcctct aagcatcagg   4598040\n-     taacatcaaa tcgtacccca aaccgacaca ggtggtcagg tagagaatac caaggcgctt   4598100\n-     
gagagaactc gggtgaagga actaggcaaa atggtgccgt aacttcggga gaaggcacgc   4598160\n-     tgacacgtag gtgaagtgat ttactcatgg agctgaagtc agtcgaagat accagctggc   4598220\n-     tgcaactgtt tattaaaaac acagcactgt gcaaacacga aagtggacgt atacggtgtg   4598280\n-     acgcctgccc ggtgccggaa ggttaattga tggggtcagc gcaagcgaag ctcctgatcg   4598340\n-     aagccccggt aaacggcggc cgtaactata acggtcctaa ggtagcgaaa ttccttgtcg   4598400\n-     ggtaagttcc gacctgcacg aatggcgtaa tgatggccag gctgtctcca cccgagactc   4598460\n-     agtgaaattg aactcgctgt gaagatgcag tgtacccgcg gcaagacgga aagaccccgt   4598520\n-     gaacctttac tatagcttga cactgaacat tgagccttga tgtgtaggat aggtgggagg   4598580\n-     ctttgaagtg tggacgccag tctgcatgga gccgaccttg aaataccacc ctttaatgtt   4598640\n-     tgatgttcta acgtggaccc gttacccggg ttgcggacag tgtctggtgg gtagtttgac   4598700\n-     tggggcggtc tcctcctaaa gagtaacgga ggagcacgaa ggttggctaa tcctggtcgg   4598760\n-     acatcaggag gttagtgcaa tggcataagc cagcttgact gcgagcgtga cggcgcgagc   4598820\n-     aggtgcgaaa gcaggtcata gtgatccggt ggttctgaat ggaagggcca tcgctcaacg   4598880\n-     gataaaaggt actccgggga taacaggctg ataccgccca agagttcata tcgacggcgg   4598940\n-     tgtttggcac ctcgatgtcg gctcatccca tcccggggct gaagtaggtc ccaagggtat   4599000\n-     ggctgttcgc catttaaa                                                 4599018\n-//\n'
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/test-data/NC_010067.gbk
--- a/conversion/gbk2rdf/test-data/NC_010067.gbk Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b"@@ -1,259779 +0,0 @@\n-LOCUS       NC_010067            4600800 bp    DNA     circular CON 20-AUG-2013\n-DEFINITION  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980 chromosome, complete genome.\n-ACCESSION   NC_010067\n-VERSION     NC_010067.1  GI:161501984\n-DBLINK      Project: 58191\n-            BioProject: PRJNA58191\n-KEYWORDS    .\n-SOURCE      Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980\n-  ORGANISM  Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str.\n-            RSK2980\n-            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;\n-            Enterobacteriaceae; Salmonella.\n-REFERENCE   1  (bases 1 to 4600800)\n-  CONSRTM   NCBI Genome Project\n-  TITLE     Direct Submission\n-  JOURNAL   Submitted (03-DEC-2007) National Center for Biotechnology\n-            Information, NIH, Bethesda, MD 20894, USA\n-REFERENCE   2  (bases 1 to 4600800)\n-  AUTHORS   McClelland,M., Sanderson,E.K., Porwollik,S., Spieth,J.,\n-            Clifton,W.S., Fulton,R., Chunyan,W., Wollam,A., Shah,N., Pepin,K.,\n-            Bhonagiri,V., Nash,W., Johnson,M., Thiruvilangam,P. and Wilson,R.\n-  CONSRTM   The Salmonella enterica serovar Arizonae Genome Sequencing Project\n-  TITLE     Direct Submission\n-  JOURNAL   Submitted (02-NOV-2007) Genetics, Genome Sequencing Center, 4444\n-            Forest Park Parkway, St. Louis, MO 63108, USA\n-COMMENT     PROVISIONAL REFSEQ: This record has not yet been subject to final\n-            NCBI review. The reference sequence was derived from CP000880.\n-            Salmonella enterica subspecies IIIa (Arizonae) serovar\n-            62:z4,z23:--Most bacteria in the species S. enterica belong to one\n-            of seven subspecies; all but subspecies I normally grow only in\n-            cold-blooded animals. Subspecies IIIa (S. 
Arizonae) is naturally\n-            found in reptiles, but also causes outbreaks of salmonellosis in\n-            turkeys and sheep and can occasionally produce both gastroenteritis\n-            and serious disseminated disease in humans. Many human infections\n-            can be traced to contact with reptiles or ingestion of various\n-            reptile products, particularly from rattlesnakes. Fewer than ten\n-            cases in humans are typically reported in the US each year.\n-            \n-            The strain of S. Arizonae (62:z4,z23:-) being sequenced is\n-            CDC346-86; it was named RSK2980 by R.K. Selander and is strain\n-            SARC5 of the Salmonella Reference C set. This serovar is of\n-            interest because of its taxonomic position. It appears to be the\n-            most divergent subspecies among the S. enterica. It can be obtained\n-            from the American Type Culture Collection as ATCC BAA-731, or the\n-            Salmonella Genetic Stock Centre as SGSC4693. The genome was\n-            sequenced to 8X coverage, using plasmid and fosmid libraries and\n-            was finished to an error rate of less than 1 per 10,000 bases.\n-            Automated annotation was performed and manual annotation will\n-            continue in the labs of Michael McClelland and Kenneth Sanderson.\n-            The National Institute of Allergy and Infectious Diseases (NIAID),\n-            National Institutes of Health (NIH) has funded this project.\n-            \n-            Coding sequences below are predicted using GeneMark v3.3 and\n-            Glimmer2  v2.13.Intergenic regions not spanned by GeneMark and\n-            Glimmer2 were blasted against NCBI's non-redundant (NR) database\n-            and predictions generated based on protein alignments. RNA genes\n-            were determined  using tRNAscan-SE 1.23 or Rfam v8.0. 
This sequence\n-            was finished as follows unless otherwise noted: all regions were\n-            double stranded, sequenced with an alternate chemistries or covered\n-            by high quality data(i.e., phred quality >=30);an attempt was made\n-      "..b'1 acccgtcatc gtatcgtcct tgccgcaacg cttgcggaat ttcttacaca acttaatcct\n-  4597741 cttctgtaat cgtttgccct gacaggtgtg agagatctct tacaaggtct gtaggagatc\n-  4597801 gccaggatat cagagaatac ttagctacga ctttctcctg taaatatata taaatcaatc\n-  4597861 tattaaaata ttatttcgca ctttcatata caaatttact taaggtatcg tctgtaagcg\n-  4597921 tcttgtaaga caaggtgaaa caggcgattc tatattcatc gacagggagt cgtacaacga\n-  4597981 agcgaacgtc aggaagatgg cgcttctgca ggacacgcca ggagggcgtt acatggaaag\n-  4598041 gcttcaggat gaggcaaagt ggaaagcgca ggatgcgtta aaggacacct ccaggacgga\n-  4598101 gaacgagagc cgattaggat ggtcggcggg tctggatgac cagggacgct tcgggatgaa\n-  4598161 gctatcacat cggggcgatg tgcgcaggat gcaaacgttc aggatgagca ggccgcaggg\n-  4598221 tcacaggaaa agttgtcacg gatgagcagg gagcatgaaa agtagctgga atgctgcgaa\n-  4598281 acgaaccggg agcactgttt atacagtgct cccttttttt gttattcttc gcgccagatt\n-  4598341 tccattattg aggttcttaa catgacgact catgaccgtg tgcgtcagca gttacatgcg\n-  4598401 cttgaaacgc tgctgcgtga gcatcatcac tggcggctgg atgcgccgca ggcgcacctg\n-  4598461 tttaccagca cgcagccgtt ttgtatggat accatggaac cgctggaatg gctgcaatgg\n-  4598521 gtattgatcc cgcgtatgca taccctgctt gataatgcgc agccgttacc tgaggcgttt\n-  4598581 gccgtcgccc cttattatga aatggcgctg acggcggatt atccgcagcg ggaagcgatc\n-  4598641 ctgacggttt tgcaggatct ggatgcgcta tttacccgcg ataaatcctg atgctggaga\n-  4598701 tcctctatca ggacgcgtgg ctggttgccg ttaataaacc tgcaggctgg cttgttcacc\n-  4598761 ggagctggct ggatcgcgac gaaaaagttg tggtcatgca aacggtgcgc gaccaaatcg\n-  4598821 gccagcatgt ttttaccgcc caccgtctcg acagacccac atcgggcgta ctactgatgg\n-  4598881 ggctgtccag cgaagcggga cgccgcctgg cgcagcagtt cgagcagcac catatccgta\n-  4598941 aacgttacca tgccatagtg cgcggctggc tgatggatga tgcgctactg gattatcctc\n-  4599001 tgctggaaga 
gcgcgataaa attgccgata agttcgcgcg tgaggataaa gcgccccagc\n-  4599061 cagccgtaac gcagtatcgc gggctggcga cggtcgaaat ggcagtgccg accgggcgtt\n-  4599121 atcccactac gcgttatggc ctggttgagc tggaaccgaa aacggggcgc aaacaccagc\n-  4599181 tccgccgtca tctggcgcat ctacgccatc ctatcatcgg cgacagtaaa cacggtgatt\n-  4599241 tgcggcaaaa ccgtagcgcg gcggaacatt ttgcttgtcg tcgcctgatg cttcatgcca\n-  4599301 gtcggcttga actgacgcat cccttcaccg gacagccatt aattattcag gccggactgg\n-  4599361 atgaaacctg gatgcaggcg ctaacacagt ttggctggcg gggacttctc cctgataatg\n-  4599421 aaagggttga gtttacgacg gcgtcccggc aggatgagtc ttatcagaca taattcaggg\n-  4599481 agatacgcat aatggcggaa attggtattt ttgtcggtac gatgtatggc aactcactgt\n-  4599541 tggtggcgga ggaagcggaa gcgatcctgg ccagacaggg ccatagcgcg actgtgtttg\n-  4599601 aagatcctga actgtccgac tggcggcaat atcaggacaa ggtggcattg gttgtcacct\n-  4599661 caacgaccgg acagggcgat ctaccggata gtattgcgcc gctctttcac ggtattaaag\n-  4599721 atacgttagg ttttcaacca aacctgcgtt acggggtgat tgcgttaggt gatagcagct\n-  4599781 accccaattt ctgtaatggc ggcaagcagt ttgatgccct gttgcaggag caaagcgcgc\n-  4599841 aacgggtggg ggaaatgtta ctcattgacg ccagcgaaca tccggagccg gagagccaat\n-  4599901 ccaatccctg ggtagaaaac tggggaacct tactttcctg aggtaaatcc ctccccctac\n-  4599961 cgggagggta ccttttcgtt tgattgcatt gccagtaagc aaaataacga cctgtatgta\n-  4600021 gtttaaagaa actgaatcgt gttagctttg tgcatatgcc tgcaaaagca gcagtttttt\n-  4600081 acgggcgttt tcatgtaatc aagcgacctg tttcacattc ttctcttttt attcctcctg\n-  4600141 cgtcgacgcc tgacgccttc tgatttcatt tccgtgaagt ggcttccact gtcctgggct\n-  4600201 tttgccacaa acaggcgtaa ttcattgcca aaatactgtg ttgttgcacg gtgagtgtgc\n-  4600261 gtgacgcgct ttttatactt ctcctgccag tgaataaaag aatgcagcat gcaaagcaaa\n-  4600321 cgacctaata aaagctgcaa caaggaaacg ttatctctga ttccctaccg gttgtgcagt\n-  4600381 tcagagtgag cgtagctaac gcgaaatttc aggagtgcaa caatgagttc attaagtcac\n-  4600441 gcggcgagta gtgcggagaa tcgcacgaac gcccgctact ggatagtggt gatgctgttt\n-  4600501 atcgtcacat cctttaacta tggcgatcgc gccacattgt ccattgccgg 
ctcagaaatg\n-  4600561 gccaaagata ttggtcttga cccggtaggc atgggctacg ttttctctgc gttttcatgg\n-  4600621 gcctatgtta tcggacagat ccctggcggc tggctgctgg accgctttgg ttccaaacgc\n-  4600681 gtctatttct ggtctatttt catctggtcg gtcttcaccc tgttgcaggg ttttgtcgat\n-  4600741 atttttagcg gtttcggcat tgttgtcgcc ctctttacgc ttcgtttcct ggtcggtctg\n-//\n'
b
diff -r 0a947cb25a3d -r 957156367442 conversion/gbk2rdf/tool_dependencies.xml
--- a/conversion/gbk2rdf/tool_dependencies.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,105 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-  <package name="zlib" version="1.2.8">
-    <repository name="package_zlib_1_2_8" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="readline" version="6.2">
-    <repository name="package_readline_6_2" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="bzlib" version="1.0.6">
-    <repository name="package_bzlib_1_0" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="xz" version="5.0.5">
-    <repository name="package_xz_5_0_5" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="openssl" version="1.0.1">
-    <repository name="package_openssl_1_0" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="sqlite" version="3.8.3">
-    <repository name="package_sqlite_3_8_3" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="gdbm" version="1.11">
-    <repository name="package_gdbm_1_11" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="ncurses" version="5.9">
-    <repository name="package_ncurses_5_9" owner="iuc" prior_installation_required="True" />
-  </package>
-
-  <package name="python" version="3.4">
-    <install version="1.0">
-      <actions>
-        <action type="download_by_url">https://www.python.org/ftp/python/3.4.1/Python-3.4.1.tgz</action>
-
-        <action type="set_environment_for_install">
-          <repository name="package_zlib_1_2_8" owner="iuc">
-            <package name="zlib" version="1.2.8" />
-          </repository>
-          <repository name="package_readline_6_2" owner="iuc">
-            <package name="readline" version="6.2" />
-          </repository>
-          <repository name="package_bzlib_1_0" owner="iuc">
-            <package name="bzlib" version="1.0.6" />
-          </repository>
-          <repository name="package_xz_5_0_5" owner="iuc">
-            <package name="xz" version="5.0.5" />
-          </repository>
-          <repository name="package_openssl_1_0" owner="iuc">
-            <package name="openssl" version="1.0.1" />
-          </repository>
-          <repository name="package_sqlite_3_8_3" owner="iuc">
-            <package name="sqlite" version="3.8.3" />
-          </repository>
-          <repository name="package_gdbm_1_11" owner="iuc">
-            <package name="gdbm" version="1.11" />
-          </repository>
-          <repository name="package_ncurses_5_9" owner="iuc">
-            <package name="ncurses" version="5.9" />
-          </repository>
-        </action>
-
-
-        <action type="autoconf" />
-
-        <action type="download_file">https://bitbucket.org/pypa/setuptools/get/5.2.tar.bz2</action>
-        <action type="shell_command">tar -xjf 5.2.tar.bz2</action>
-        <action type="change_directory">pypa-setuptools-f493e6c4ffd8</action>
-
-        <action type="shell_command">export PATH=$PATH:$INSTALL_DIR/bin &amp;&amp;
-            export PYTHONHOME=$INSTALL_DIR &amp;&amp;
-            export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR &amp;&amp;
-            export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python3.4/site-packages/ &amp;&amp;
-            export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$INSTALL_DIR/lib/pkgconfig &amp;&amp;
-            $INSTALL_DIR/bin/python3 setup.py install --prefix=$INSTALL_DIR</action>
-        <action type="set_environment">
-          <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
-          <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR</environment_variable>
-          <environment_variable action="set_to" name="PYTHONHOME">$INSTALL_DIR</environment_variable>
-          <environment_variable action="prepend_to" name="PKG_CONFIG_PATH">$INSTALL_DIR/lib/pkgconfig</environment_variable>
-        </action>
-
-
-      </actions>
-    </install>
-
-
-
-
-    <package name="biopython" version="1.61">
-        <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-    
-    <readme>
-      The Python programming language version 3.4. hopefully incombination with python3.4
-
-      http://www.python.org
-    </readme>
-
-  </package>
-</tool_dependency>
b
diff -r 0a947cb25a3d -r 957156367442 conversion/protein2rdf/protein_to_ttl.py
--- a/conversion/protein2rdf/protein_to_ttl.py Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,130 +0,0 @@
-def delete_galaxy():
- import sys
- for index, path in enumerate(sys.path):
- if "galaxy-dist/" in path:
- sys.path[index] = ''
-
-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
-delete_galaxy()
-
-# from io import StringIO
-from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
-# import rdflib
-from rdflib.store import Store
-import sys
-import hashlib
-
-store = plugin.get('IOMemory', Store)()
-
-global URI
-URI = "http://csb.wur.nl/genome/"
-global seeAlso
-seeAlso = "rdfs:seeAlso"
-global coreURI
-coreURI = Namespace(URI)
-
-
-def createClass(uri):
- genomeGraph.add((uri,RDF.type,OWL.Class))
- genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
- return uri
-
-def fasta_parser(input_file):
- createClass(coreURI["Protein"])
-
- genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
- if genome == '':
- genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
-
- genomeURI = coreURI[genome]
- for index, element in enumerate(sys.argv):
- if '-organism' == element:
- genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
- if '-ncbi_taxid' == element:
- genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
- if '-idtag' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
- if '-diagnosis' == element:
- genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1])))
- if '-country' == element:
- genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1])))
- if '-location' == element:
- genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1])))
- if '-date' == element:
- genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1])))
- if '-ids' == element:
- genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
-
-
-
- data = (open(input_file).readlines())
- fastadict = {}
- sequence = ""
- key = ""
- for index, line in enumerate(data):
- if ">" == line[0]:
- if sequence:
- fastadict[key] = sequence
- key = line
- sequence = ""
- fastadict[key] = ""
- else:
- sequence += line.strip()
- fastadict[key] = sequence
-
- #Create a class, to be the same as all the other genome conversions...
- #TODO: Proteins are part of cds, cds are part of dnaobject
- #If CDS is not there... how then?
- classURI = coreURI[genome + "/" + "protein_fasta"]
- proteinClass = createClass(coreURI["Protein"])
- genomeClass = createClass(coreURI["Genome"])
- typeClass = createClass(coreURI["DnaObject"])
- cdsClass = createClass(coreURI["Cds"])
- #A theoretical begin, end is created to have a workable GBK generation
- begin = 0
- end = 0
- genomeGraph.add((genomeURI, RDF.type, genomeClass))
- genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI))
- genomeGraph.add((classURI, RDF.type, typeClass))
-
- for protein in fastadict:
- sequence = fastadict[protein]
- sequence = sequence.encode('utf-8')
- end = begin + len(sequence)
- md5_protein = hashlib.md5(sequence).hexdigest()
- proteinURI = coreURI["protein/"+md5_protein]
-
- cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)]
- genomeGraph.add((classURI, coreURI["feature"] , cdsURI))
- genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin)))
- genomeGraph.add((cdsURI, coreURI["end"] , Literal(end)))
- genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI))
- genomeGraph.add((cdsURI, RDF.type, cdsClass))
-
-
-
- genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
- genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence)))
- genomeGraph.add((proteinURI,RDF.type,proteinClass))
- genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
- genomeGraph.add((proteinURI, RDF.type, proteinClass))
- begin = end
-
-def save():
- data = genomeGraph.serialize(format='turtle')
- open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
-
-def main():
- store = plugin.get('IOMemory', Store)()
- global genomeGraph
- genomeGraph = Graph(store,URIRef(URI))
- genomeGraph.bind("ssb",coreURI)
- input_file = sys.argv[sys.argv.index("-input")+1]
- fasta_parser(input_file)
- save()
-
-if __name__ == '__main__':
- main()
-
b
diff -r 0a947cb25a3d -r 957156367442 conversion/protein2rdf/protein_to_ttl.xml
--- a/conversion/protein2rdf/protein_to_ttl.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,42 +0,0 @@
-<tool id="SAPP_protein_rdf" name="Protein FASTA to RDF" version="0.1">
-    <requirements>
-        <requirement type='package' version="3.4">python</requirement>
-        <requirement type='package' version="1.0">rdflib</requirement>
-    </requirements>
- <description></description>
- <command interpreter="python3.4">protein_to_ttl.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' '-diagnosis' '$diagnosis' '-country' '$country' '-location' '$location' '-date' '$date' -sourcedb SAPP 
- #for $index, $id in enumerate( $ids ) 
- '-ids' '$id.id_tag'
- #end for
- '-id_alternative' '$input.name'
- </command>
- <inputs>
- <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
- <param size="60" name="organism" type="text" format="text" label="organism name"/>
- <param size="60" name="diagnosis" type="text" format="text" label="Diagnosis of host if applicable"/>
- <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
- <param size="60" name="country" type="text" format="text" label="Country of sample"/>
- <param size="60" name="location" type="text" format="text" label="Location of sample e.g., river, city, hospital"/>
- <param size="60" name="date" type="text" format="text" label="Sample date"/>
- <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!"/>
- <repeat name="ids" title="Identification tags">     
- <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
- </repeat>
- </inputs>
-
- <outputs>
- <data format="rdf" name="output" label="proteinTTL: ${input.name}" />
- </outputs>
-    <tests>
-        <test>
-            <param name="input" value="test-data/NC_017117.faa"/>
-            <output name="$output" file="NC_017117.rdf"/>
-            <output name="$ncbi_taxid" value="634455"/>
-            <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/>
-            <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/>
-        </test>
-    </tests>
- <help>
- RDF creation from a multi protein fasta file
- </help>
-</tool>
b
diff -r 0a947cb25a3d -r 957156367442 conversion/protein2rdf/test-data/NC_017117.faa
--- a/conversion/protein2rdf/test-data/NC_017117.faa Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,993 +0,0 @@\n->gi|384055706|ref|YP_005485330.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MKSDRFTDAQIMGVIRQAEGGVPVPDLCREHGISNATFYRWRAKYGGMDASMISQMKALEEENRRLKRMY\n-ADLSMQTDILKEALGKK\n->gi|384055707|ref|YP_005485331.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MAGHHVEAMIARAHAQKRFMDDAGWRYVVELYGRYQSLLREQNAADFGDLLMWPTLAMLHNDAYRYRWSR\n-RFTAVMADEFQDVNRAQFLWLKMISEVSAEFFAVGDDSQSIL\n->gi|384055708|ref|YP_005485332.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MVVGRNDCAKGRQMKDTVIGVDLAKNIFQVHGASRAGEVMFRKKLRRQQFMQFMATQPPALVVLEACGSA\n-HYWARELAGAGHEVRLIAPQYVKPFVKRQKNDAADAEAIVIAARQPEMRFVEPRTEAQQARGVLFRARQR\n-LVHQRTELVNALRAVLYEFGLVVPQGIAHIRHIEAMLDEAVLPEAVKQECLDLLRQISEQSVRIDVRTKK\n-IRMLAQESENTCRLQSMPGVGPLTALAIEAFAPDLQSFRRGRDFAAWLGLVPRQFSSGGKERLGKISKAG\n-QADIRRLLIMGAMTQVNWASRKAPAPGSWLARMLARKPRMLVAIALANRMARAIWAMATKQEDYRDPALS\n-VAA\n->gi|384055709|ref|YP_005485333.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MEQIIRIGMDTSKSVFQLHGVNAKEQPVLRRKLSRREMVKFFEKLPPIEIAIEACGASHYWGRVLSCLGH\n-TVKLIAPQLVKPYVKRGKNDAADAEALCEAMSRPTMRFVPLKSEEEQAALMLIGMRARLIRNRTQLANTI\n-RGYAAEFGITAPKGMCRIEALLDRIAADESLPTLTRELFALHAKEYAELQGEIEQLEGKVMAWHRANECS\n-QRLAKIPGVGPIGAALLMMKTPDPHLFKSGRAFAAWIGLTPRDHSTGGKTRLGRITRAGDEVLRSTLVVG\n-ATAVVSHARRTNGKNASSWLRELLERKKPKLAAVALANKIARIAWKLMVSGEHYKRLLQQPGAAAV\n->gi|384055710|ref|YP_005485334.1| DNA resolvase [Acetobacter pasteurianus IFO 3283-22]\n-MVPPKPGKTPVGGRLIGYARVSTDDQGTDAQLNELRDAGCTMIFEKHASGADRNRPVLIRLLRDMNAGDT\n-LVVVRLDRLARSVSHLLAVIEQLDYAGAHFRSLDDPIDTTTPQGMFSLQVLGAVAQLDADFFCDGVDGSQ\n-RHRDVPR\n->gi|384055711|ref|YP_005485335.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MLTSRIHRRKPMGKPMSKATARANAAKSSIRAHVEHVFAHQKNRFNLFIRTIGLARAEAKLTLCNLAYNF\n-NRLIFHERLETAG\n->gi|384055712|ref|YP_005485336.1| D-mannonate oxidoreductase [Acetobacter pasteurianus IFO 
3283-22]\n-MNLNRNAISHVPDTVYTPRYDPALLRPGIVHLGCGNFHRGHQVVATQAAIDAEGRDGLRWGIVSATMRRP\n-DLATVLQSQDNLYTLLTREPANTVASVMAAITEAVYAGDDNANLAARIADPATAIVTLTVTASGYYLSAD\n-GRLDPTFEAIQADLTAITPRTAPGIIAAGLAQVRQRGGVPPVILCCDNVNSNGATLRQAVIDLAALKGDD\n-LLAAWIETNVQFPDTMVDRIVPTATPDDIADACRLLGGIEDRAPISAEPWFQWVIGEFDGPRPRWVAHPG\n-TKFVSDVGVFERAKLQMLNGTHMLLAYVGALANLNTVSEAASDDALGRIAARFMRNEQTADVSLDTDELD\n-RYTVDLMQRFRNPGIVHEVTRIGRNGSAKMASRIVQPMRSNIEAGRPVDGAVLLIASWIRWFALHEQDEF\n-DIALTDPRAETLRGLCADARDDHKAQAEAFLAMEEVFGAPLPDHGKQVEAIASMLRRLTEESVPELLRTI\n-AH\n->gi|384055713|ref|YP_005485337.1| phosphatase/phosphohexomutase [Acetobacter pasteurianus IFO 3283-22]\n-MTDTVFPAHLLKHKQEPVHGVVFDMDGLLLDSESLAMEALVFAARDLNYDIPMSFCRTMIGVPADGCRTM\n-VRKTYGQDFPLERFFELQEVHLRNFVDTGKLALKKGVLPLLDLLDTYKIPRAIATSSSRVRTDHHLKLVN\n-LFHRFNAIVTRDDVSKGKPDPEPYLTAAKKIGVNPAHALALEDSHSGARAAHAAGIRVIVVPDLLEATDE\n-IRGKALAIVQDLSIVEAYLKHAITGQA\n->gi|384055714|ref|YP_005485338.1| hypothetical protein APA22_40090 [Acetobacter pasteurianus IFO 3283-22]\n-MRRDMDLVRQLLLKLEGIEKGPHDVLLIGGNSEEVAVDGRTSDEIYFHLTKIEEAGFLERVGGGAMTAVT\n-FRALSWKGQEFLDTIRDDSIWKKTKEKAGSASFDILAAVAKAVIKDRIKSLTGLDIG\n->gi|384055715|ref|YP_005485339.1| hypothetical protein APA22_40100 [Acetobacter pasteurianus IFO 3283-22]\n-MRPLGSGLSVRTYGCSEADDQENDGWAKKDTGEIVALYEMSSPVMPSGLVSISRWKIKGCYPKSGLSRAM\n-LCPTKIPQSASNIALLIGSDWSFIEENVFCNHIEWQTCLPVFVMNLDHPA\n->gi|384055716|ref|YP_005485340.1| DNA helicase superfamily I [Acetobacter pasteurianus IFO 
3283-22]\n-MSSKPSHHSVLSYWHSALLDDAQMKISFSRDNLVALDEEGFEKGKLPPDKTQALRKMHPASRDLAPDDSI\n-IAMAGIRILLGQVSHSTEHSKQPALFCMAMLVNVSPEGTIQPLKDAPPWINRELLEPSDGDVLIGDLATM\n-DTWLQLNPFEGGSLGKTLEWAEKLWNAVTGEDGLPDGYELWERVALQPAEASIGMIATLHQRRFYDTVLA\n-DTGLVTPLLARYIDGGPEPAVVDESQKWAAAGRARGTMTFAYGMSSSQSEAMTAFCSVKDGDILAVNGPP\n-GTGKTTLLQGIVATELVTRALEGGDPAVIVGTSTNNQAVTNIIDAMKKAMASKDSRPWARRWIEGADALG\n-LYFPSGEKEKEALKAGYLIASPGRGLGTMEWKGFPERERDTVDAWASRDAWINGYYGSFYPGVTPPLRKE\n-HLSGHGPQGARHDISLVEDGIAKIRARMKVLVETGRVCAGEARKLNQLYVASGYGTYPDITKAIAQREAL\n-LQERRPREDALKSDLKEKEAAAAVPRARINEENRKTRDLLKQRDDAVHAAGQKVEEVGAHAVALIAALPG\n-GGFFSNLMSGRNWANVERLVAEGRQGSFFRSLMQAQVKSKREWMDAINEMTASAERELATVRESREETRQ\n-ARDTLIQKLEREVAAADLVSKTARAEYDHYVGGSYVLAGRELEKLVTLKHQILQQLQDCCTAIETVLAPS\n-DWAAMFDMPEEKLPWRQSNWTGRLDVIEDFLDR'..b'DEVAPAV\n-RHLISQIQTTIA\n->gi|384055875|ref|YP_005485499.1| multidrug resistance transporter EmrB/QacA [Acetobacter pasteurianus IFO 3283-22]\n-MGTSMTSSRVTNPLFVLLAASTGCALTVLDTNVVAIILPTIAREFRASFADIEWVISTYVLCFASLLLPA\n-GAIADRYGRRRIYLIGITTFALTSLFCGAAPSATALYLARALQGVSAAFLLAPALAIIGHTFHNPDERNR\n-AWAIWGSIMGLTMVLAPIIGGIIAYALGWRWAFYINIPICVLLAGAVFILVKESRDTDARRLDPVGIIFF\n-AAFMFGLTWGMINGQASGWTSWNALNGFIGGSISLGIFIASERAQSRPMLDLGLFSNPRFLGAVWAMFAY\n-AASAQVMASMLPLFLQNGLGRSALQAGFAMLPFALAMLIFPHIGRLLERHISSSGILAGGLSCVAIGNGI\n-TAWGAYVGSWIIVMAGMVVIGSGGGLLNGETQKAIMSVVPKERSGMASGISTTSRFSGILLGFAMLSGIL\n-ATMVRKWVAAFGCGTGCHHPSDFADAIVAGDLPSAISGLEGSNQEIAIQHAHHAFSYGFAVALLVASIFA\n-LGSSITVFTLMQSKMKQNIT\n->gi|384055876|ref|YP_005485500.1| transposase, partial [Acetobacter pasteurianus IFO 3283-22]\n-MLAYAVMASVRYQANSLKPKKTQLRTRQSLSAGPFRRSGASS\n->gi|384055877|ref|YP_005485501.1| transposase [Acetobacter pasteurianus IFO 
3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVILVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGQQADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055878|ref|YP_005485502.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVIVVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n-LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGHKADRYCRIIAD\n-HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n-ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n-SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n-CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n-KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n->gi|384055879|ref|YP_005485503.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n-MLQFSYMSEEADAIAAEIGRRAASGCAWHDIAVIYRQNRLSRAIEEALIQARVPYEIVGDVGFYQRVAVK\n-DALALLSLAARPDDRQSDEAFRADFSHLRQFRVIL\n->gi|384055880|ref|YP_005485504.1| DNA helicase RecD/TraA [Acetobacter pasteurianus IFO 
3283-22]\n-MTSAVVGEQCQTEALAGLVERVTFHNAENGFCVLRVKVRGQRDLVTVVGHAAMISAGEFVQMSGRWFNDH\n-THGLQFKAEFLKASPPTTVEGIERYLGSGMIRGIGPVYAKKLVKAFGEAVFDLIEQEPHRLREVTGIGPK\n-RAERIVGGWADQKVIREIMLFLHSNGVGTSRAVRIFKTYGQDAVRLISENPYRLAKDIRGIGFKTADQIA\n-RKMGIAPDAMIRVRAGISYALGEAMDEGHCGLPVGELLTSTAELLEVAAPLIETALALELEAGDVVADSV\n-GETSCIFLAGLYRAEQSIAERLRACAVGRPPWPEIDAEKAMTWVEGKTGLAMAPSQQEAVRLALRSKVLV\n-ITGGPGVGKTTLVNAILKIVTAKGTDVQLCAPTGRAAKRLSESTGLEGKTIHRLLETDPGNGSFKRDDTN\n-PLTCDLLVVDEASMVDVLLMRSLLRALPDSASLLIVGDVDQLPSVGPGQVLADIIGSDAVPVVRLTEVFR\n-QAAQSRIITNAHRINEGKMPELSAEEGSDFYFVEAAEPEVGLRKLLAVVKDRIPARFGLDPVRDVQVLCP\n-MNRGGLGARSLNIELQQALNPAGDVKVERFGWTYGPGDKVMQIANDYDRDVFNGDLGVIDKIDVEEGELT\n-VLFDGREVVYGFGELDELVLAYATTIHKSQGSEYPVVVIPLVTQHYTMLARNLLYTGVTRGRKLVVLVGQ\n-KKALAIAVRNQGGRLRWSKLRDWLVGTSGTGHLSRLKKP\n->gi|384055881|ref|YP_005485505.1| phage integrase [Acetobacter pasteurianus IFO 3283-22]\n-MVESQVSHIQPEYKFHINLDEYDRRATLSADELKVVRRWKEENLVITKRQAPRLHKPLTDILYRSNLDRA\n-NSHRALKYLLLTVAHQEKPYWGWSEDLWVEIINNSPVLKKTGMVPQLIAVAYLLCGFRSVYKIQRNVATA\n-VVARLVFGAEIVDTECERLFSALTRVGFVCQTVRPLVPSVFAAVALQGENPKLESFDRKILEHTRECYTG\n-NHIAKRIGILSNGLAAMGLTSKVIHFRAYPPRHGTETDNINPEWMTWCRRWLETTTLREGSRRAVYNTLT\n-RIGIWLGREHPEVTGPEQWTVSVCADYLAAVDRLRVGDWGGSTFDYRLIPTVGQPLQAPTKVAYYQVMRR\n-FLSDIQSWEWARLRCNPRYHLSTPKNIAKYLGVNPRTIDDASWLKLTWASLNIEPDDLSPDCFYPFALLQ\n-AIAVVWTHAGLRSNEIARLRVGCTREQSEDVVDQSGNVVPAGQVCWLDVPEGKTSVAYTKPVGHAVHKYI\n-TAWMKKRASPRKHLDRRTGEHVHFLFQLRNRPIAKEVLNQTVIPLLCKKAGIPIEDSKGRITSHRGRASA\n-VSMLASVPQGMTIFDLAKWCGHTSVQSTMSYVRSKPTQLASAFAKADQAARMIEIVIDNEVIAAGATKDG\n-APWKYYDLGDSYCSNAFWSTCPHRMACARCYFNIPKPSAKGVVLAAQQAANRLLEEVWLSPEERDAVSGD\n-VEALEGMLNKLRDKPALDGRTPGEISATCGSQVSSPFTESE\n->gi|384055882|ref|YP_005485506.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n-MELGITPGQDADITQAEPLLENIEPDAFLADKAYDADRLIDRLIQRGITPVIPPKRNRTTRRVIPP\n'
b
diff -r 0a947cb25a3d -r 957156367442 genetic_elements/aragorn/aragorn.py
--- a/genetic_elements/aragorn/aragorn.py Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,125 +0,0 @@
-def delete_galaxy():
- import sys
- for index, path in enumerate(sys.path):
- if "galaxy-dist/" in path:
- sys.path[index] = ''
-
-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
-delete_galaxy()
-
-from rdflib import Graph, URIRef, Literal,Namespace, XSD, BNode,RDF,RDFS,OWL, ConjunctiveGraph, plugin
-
-# Import RDFLib's default Graph implementation.
-from rdflib.graph import Graph
-
-import sys, os
-
-import rdflib
-import subprocess
-import hashlib
-global URI
-global SubClassOfDict
-SubClassOfDict = {}
-
-URI = "http://csb.wur.nl/genome/"
-global seeAlso
-seeAlso = "rdfs:seeAlso"
-global coreURI
-coreURI = Namespace(URI)
-
-def createClass(uri):
- #genomeGraph.add((uri,RDF.type,OWL.Class))
- #genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
- #genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
- #genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
- #genomeGraph.add((uri,RDFS.subClassOf,coreURI["Rna"]))
- return uri
-
-def tmp():
- import time
- global tmpFolder
- tmpFolder = "/tmp/"+str(time.time())+"/"
- os.mkdir(tmpFolder)
-
-def query():
- global genomeGraph
- genomeGraph = Graph()
- filename = sys.argv[1]
- genomeGraph.parse(filename, format="turtle")
- qres = genomeGraph.query('select ?class ?sequence where {?class a ssb:DnaObject . ?class ssb:sequence ?sequence .}')
- sequences = []
- for row in qres:
- print ("Header:",row[0])
- sequences += [[">"+str(row[0]),str(row[1].strip())]] #.replace("/","-").replace("","")
-
- return sequences
-
-def aragorn(sequences):
- for sequence in sequences:
- #Call aragorn for each contig, for ease of parsing
- open(tmpFolder+"tmp.seq","w").write('\n'.join(sequence))
- folder = os.path.realpath(__file__).rsplit("/",2)[0]+"/"
- cmd = folder+"/tools/aragorn1.2.36/aragorn -fasta "+tmpFolder+"tmp.seq "+' '.join(sys.argv[3:-2])+" > "+tmpFolder+"aragorn.output"
- print (cmd)
- os.system(cmd)
- aragorn = open(tmpFolder+"aragorn.output").readlines()
-#  string = ''.join(aragorn)
-
- contig = sequence[0].strip(">").replace("http://csb.wur.nl/genome/","")
- dnaobjectURI = coreURI[contig]
- #print (contig)
- for line in aragorn:
- if ">" in line:
- print (line.split())
- try:
- trna, pos = line.split()[1:]
- except:
- try:
- trna, pos = line.split()
- except:
- if "(Permuted)" in line:
- trna, permute, pos = line.split()[1:]
-
- if "tRNA-" in line:
- trna, codon = (trna.strip(">)").split("(",1))
- else:
- trna = trna.strip(">").strip() #Actually a tmRNA...
- codon = ''
- trnaClass = createClass(coreURI[trna.split("-")[0].title()]) #trna or tmrna
- SubClassOfDict[trna.split("-")[0].title()] = 1
- if "c" in pos[0]: #complementary
- stop, start = pos.split("[")[1].split("]")[0].split(",")
- else:
- start, stop = pos.split("[")[1].split("]")[0].split(",")
- trnaURI = coreURI[contig+"/trna-aragorn_1_2_36-"+trna.lower() +"/"+ start +"_"+ stop]
- genomeGraph.add((dnaobjectURI, coreURI["feature"] , trnaURI))
- genomeGraph.add((trnaURI, RDF.type,trnaClass))
- genomeGraph.add((trnaURI, coreURI["begin"] , Literal(start,datatype=XSD.integer)))
- genomeGraph.add((trnaURI, coreURI["end"] , Literal(stop,datatype=XSD.integer)))
- genomeGraph.add((trnaURI, coreURI["trna_type"] , Literal(trna)))
- genomeGraph.add((trnaURI, coreURI["trna_anti"] , Literal(codon)))
- genomeGraph.add((trnaURI, coreURI["tool"] , Literal("aragorn")))
- genomeGraph.add((trnaURI, coreURI["version"] , Literal("1.2.36")))
- genomeGraph.add((trnaURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
-
-def subClassOfBuilder():
- for subclass in SubClassOfDict:
- genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
- genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
- genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
- genomeGraph.add((coreURI["Rna"], RDF.type,OWL.Class))
-
-def save():
- #Create the subclass off instances
- #subClassOfBuilder()
- ## Saves the file
- data = genomeGraph.serialize(format='turtle')
- open(sys.argv[2],"wb").write(data)
-
-def main():
- tmp()
- sequences = query()
- aragorn(sequences)
- save()
-
-main()
b
diff -r 0a947cb25a3d -r 957156367442 genetic_elements/aragorn/aragorn.xml
--- a/genetic_elements/aragorn/aragorn.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,88 +0,0 @@
-<tool id="SAPP_aragorn_trna" name="tRNA and tmRNA" version="0.3">
-    <requirements>
-        <requirement type='package' version="3.4">python</requirement>
-        <requirement type='package' version="1.0">rdflib</requirement>
-        <requirement type="package" version="1.2.36">aragorn</requirement>
-    </requirements>
-    <description>Aragon</description>
-    <command interpreter="python3.4">aragorn.py '$input' '$output' '-gc$genbank_gencode' '$tmRNA' '$tRNA' '$topology' '-fon' '-sourcedb' 'SAPP'
-    </command>
-    <inputs>
-        <param name="input" type="data" format="rdf" label="RDF Genome"/>
-
-        <param name="genbank_gencode" type="select" label="Genetic code">
-            <option value="1" select="True">1. Standard</option>
-            <option value="2">2. Vertebrate Mitochondrial</option>
-            <option value="3">3. Yeast Mitochondrial</option>
-            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
-            <option value="5">5. Invertebrate Mitochondrial</option>
-            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
-            <option value="9">9. Echinoderm Mitochondrial</option>
-            <option value="10">10. Euplotid Nuclear</option>
-            <option value="11">11. Bacteria and Archaea</option>
-            <option value="12">12. Alternative Yeast Nuclear</option>
-            <option value="13">13. Ascidian Mitochondrial</option>
-            <option value="14">14. Flatworm Mitochondrial</option>
-            <option value="15">15. Blepharisma Macronuclear</option>
-            <option value="16">16. Chlorophycean Mitochondrial</option>
-            <option value="21">21. Trematode Mitochondrial</option>
-            <option value="22">22. Scenedesmus obliquus mitochondrial</option>
-            <option value="23">23. Thraustochytrium Mitochondrial</option>
-            <option value="24">24. Pterobranchia mitochondrial</option>
-        </param>
-        <param name="topology" type="select" label="Topology">
-            <option value="-c">Assume that each sequence has a circular topology</option>
-            <option value="-l">Assume that each sequence has a linear topology</option>
-        </param>
-        <param name='tmRNA' type='boolean' label='Search for tmRNA genes (-m)' truevalue='-m' falsevalue='' checked="true" help='' />
-        <param name='tRNA' type='boolean' label='Search for tRNA genes (-t)' truevalue='-t' falsevalue='' checked="true" help='' />
-    </inputs>
-    <outputs>
-        <data format="rdf" name="output" label="Aragorn: ${input.name}"></data>
-    </outputs>
-    <tests>
-        <test>
-            <param name="input" value="test-data/NC_017117.fna"/>
-            <output name="$output" file="NC_017117.rdf"/>
-            <output name="$genbank_gencode" value="11"/>
-            <output name="$tmRNA" value="-m"/>
-            <output name="$tRNA" value="-t"/>
-            <output name="$topology" value="-c"/>
-        </test>
-    </tests>
-
-    <help>
-
-**What it does**
-
-Aragorn_ predicts tRNA (and tmRNA) in nucleotide sequences.
-
-.. _Aragorn: http://mbio-serv2.mbioekol.lu.se/ARAGORN/
-
------
-
-It requires an RDF genome file
-
-    </help>
-    <citations>
-        <citation type="bibtex">
-            @article{Laslett2004,
-abstract = {A computer program, ARAGORN, identifies tRNA and tmRNA genes. The program employs heuristic algorithms to predict tRNA secondary structure, based on homology with recognized tRNA consensus sequences and ability to form a base-paired cloverleaf. tmRNA genes are identified using a modified version of the BRUCE program. ARAGORN achieves a detection sensitivity of 99\% from a set of 1290 eubacterial, eukaryotic and archaeal tRNA genes and detects all complete tmRNA sequences in the tmRNA database, improving on the performance of the BRUCE program. Recently discovered tmRNA genes in the chloroplasts of two species from the 'green' algae lineage are detected. The output of the program reports the proposed tRNA secondary structure and, for tmRNA genes, the secondary structure of the tRNA domain, the tmRNA gene sequence, the tag peptide and a list of organisms with matching tmRNA peptide tags.},
-author = {Laslett, Dean and Canback, Bjorn},
-doi = {10.1093/nar/gkh152},
-file = {:Users/koeho006/Library/Application Support/Mendeley Desktop/Downloaded/Laslett, Canback - 2004 - ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences.pdf:pdf},
-isbn = {1362-4962 (Electronic)$\backslash$n1362-4962 (Linking)},
-issn = {03051048},
-journal = {Nucleic Acids Research},
-mendeley-groups = {VAPP Application note},
-pages = {11--16},
-pmid = {14704338},
-title = {{ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences}},
-volume = {32},
-year = {2004}
-}
-</citation>
-</citations>
-
-</tool>
-
b
diff -r 0a947cb25a3d -r 957156367442 genetic_elements/aragorn/test-data/NC_017117.rdf
--- a/genetic_elements/aragorn/test-data/NC_017117.rdf Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,25 +0,0 @@\n-@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n-@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n-@prefix ssb: <http://csb.wur.nl/genome/> .\n-@prefix xml: <http://www.w3.org/XML/1998/namespace> .\n-@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n-\n-ssb:NC_017117 a ssb:Genome ;\n-    ssb:dnaobject <http://csb.wur.nl/genome/NC_017117/dnaobject_0> ;\n-    ssb:id_tag "NC_017117" ;\n-    ssb:sourcedb "SAPP" .\n-\n-ssb:Type a <http://www.w3.org/2002/07/owl#Class> ;\n-    rdfs:subClassOf <http://www.w3.org/2002/07/owl#Thing> .\n-\n-ssb:DnaObject a <http://www.w3.org/2002/07/owl#Class> ;\n-    rdfs:subClassOf <http://www.w3.org/2002/07/owl#Thing> .\n-\n-ssb:Genome a <http://www.w3.org/2002/07/owl#Class> ;\n-    rdfs:subClassOf <http://www.w3.org/2002/07/owl#Thing> .\n-\n-<http://csb.wur.nl/genome/NC_017117/dnaobject_0> a ssb:DnaObject ;\n-    ssb:header "gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence" ;\n-    ssb:sequence 
"CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACATTGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGCGGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATCCGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTGGCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGCTTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATTTCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACCGCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCCTCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATCAATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCTCTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGTGAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGATCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGCGCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCCTGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCATCGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGATAGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGCAAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAGCGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCCGCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCGGCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCCGGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATTCGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTGCCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATGTCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGGAGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTGACCTGCAGAGCTTCCGGCGCGGGCGCGACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGATATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGCCAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACG
GATGCTGGCCCGCAAGCCCCGTATGCTGGTAGCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGGATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGAGGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTTTAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCACATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACGGACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTCCATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAACACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACCATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCCAGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATCAGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCAGCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGCGCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCACCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGC'..b'GGAACGACGTTGCCAGACTGATCAACCACATCCTCTGATTGCTCACGCGTGCAACCGACCCTAAGCCGGGCGATCTCATTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGATAGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGCATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGCGGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAATAAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACCGCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGCTCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTGCTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATTGATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCCATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTGTATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAATACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGCTCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTTGAATTTTGTACACGCTTCGAAAACCACA
CAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGTTTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTTTCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGCGATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTCTTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTAATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACCCTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAAGATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCAGCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCCTGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTGACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACACAGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTGAGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTGAGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTTCCCATTTTTTGGGGAGACAGACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAAGGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGGGTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTGGTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATGATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGCCATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGGAACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACCGGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATTACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTTTGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCAGATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATCAGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGAAGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCTGAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGCATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGT
CCGCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGGGGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATCGGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGAAAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGATGGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATTTTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATTCAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTATATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCATTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAGATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCATTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGGCAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAAGCACGTCCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCGACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC" ;\n-    ssb:sourcedb "SAPP" .\n-\n'
b
diff -r 0a947cb25a3d -r 957156367442 genetic_elements/aragorn/tool_dependencies.xml
--- a/genetic_elements/aragorn/tool_dependencies.xml Wed Jun 29 01:34:59 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,9 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="aragorn" version="1.2.36">
-        <repository changeset_revision="f09e2902e6ed" name="package_aragorn_1_2_36" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-    <set_environment version="1.0">
-        <environment_variable action="set_to" name="TRNAPRED_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
-    </set_environment>
-</tool_dependency>
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker.zip
b
Binary file sappDocker.zip has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._.DS_Store
b
Binary file sappDocker/._.DS_Store has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._annotation.xml
b
Binary file sappDocker/._annotation.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._aragorn.xml
b
Binary file sappDocker/._aragorn.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._circos.xml
b
Binary file sappDocker/._circos.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._crt.xml
b
Binary file sappDocker/._crt.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._enzdp.xml
b
Binary file sappDocker/._enzdp.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._fasta2rdf.xml
b
Binary file sappDocker/._fasta2rdf.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._gbk2rdf.xml
b
Binary file sappDocker/._gbk2rdf.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._genecaller.xml
b
Binary file sappDocker/._genecaller.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._genomeInformation.xml
b
Binary file sappDocker/._genomeInformation.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._interproscan.xml
b
Binary file sappDocker/._interproscan.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._ipath.xml
b
Binary file sappDocker/._ipath.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._loader.xml
b
Binary file sappDocker/._loader.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._locustagger.xml
b
Binary file sappDocker/._locustagger.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._matrix.xml
b
Binary file sappDocker/._matrix.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._merger.xml
b
Binary file sappDocker/._merger.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._pathwayAnalysis.xml
b
Binary file sappDocker/._pathwayAnalysis.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._phylogeny.xml
b
Binary file sappDocker/._phylogeny.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._priam.xml
b
Binary file sappDocker/._priam.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._rdf2embl.xml
b
Binary file sappDocker/._rdf2embl.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._rnammer.xml
b
Binary file sappDocker/._rnammer.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._signalp.xml
b
Binary file sappDocker/._signalp.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._sparql.xml
b
Binary file sappDocker/._sparql.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._swisscog.xml
b
Binary file sappDocker/._swisscog.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/._tmhmm.xml
b
Binary file sappDocker/._tmhmm.xml has changed
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/annotation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/annotation.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,13 @@
+<tool id="DAnnotation" name="Protein annotation" version="0.1">
+ <description>Protein annotation</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:PROTEINANNOTATION</container>
+ </requirements>
+ <command interpreter="docker">java -jar /proteinannotation/target/proteinAnnotation-0.0.1-SNAPSHOT-jar-with-dependencies.jar '-input' '$input' '-output' '$output' -format TURTLE</command>
+ <inputs>
+ <param format="ttl" label="genome ttl file" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="Annotation: ${input.name}" name="output"/>
+ </outputs>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/aragorn.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/aragorn.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,80 @@
+<tool id="DAragorn" name="tRNA and tmRNA prediction" version="0.3">
+ <description>Aragorn tRNA and tmRNA prediction</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:ARAGORN</container>
+ </requirements>
+ <command interpreter="docker">java -jar /aragorn/target/aragorn-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' '-output' '$output' '-gc' $genbank_gencode
+ '$tmRNA' '$tRNA' '$topology' -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="RDF Genome" name="input" type="data"/>
+ <param label="Genetic code" name="genbank_gencode" type="select">
+ <option select="True" value="1">1. Standard</option>
+ <option value="2">2. Vertebrate Mitochondrial</option>
+ <option value="3">3. Yeast Mitochondrial</option>
+ <option value="4">4. Mold, Protozoan, and Coelenterate
+ Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+ <option value="5">5. Invertebrate Mitochondrial</option>
+ <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear
+ Code</option>
+ <option value="9">9. Echinoderm Mitochondrial</option>
+ <option value="10">10. Euplotid Nuclear</option>
+ <option value="11">11. Bacteria and Archaea</option>
+ <option value="12">12. Alternative Yeast Nuclear</option>
+ <option value="13">13. Ascidian Mitochondrial</option>
+ <option value="14">14. Flatworm Mitochondrial</option>
+ <option value="15">15. Blepharisma Macronuclear</option>
+ <option value="16">16. Chlorophycean Mitochondrial</option>
+ <option value="21">21. Trematode Mitochondrial</option>
+ <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+ <option value="23">23. Thraustochytrium Mitochondrial</option>
+ <option value="24">24. Pterobranchia mitochondrial</option>
+ </param>
+ <param label="Topology" name="topology" type="select">
+ <option value="-c">Assume that each sequence has a circular
+ topology</option>
+ <option value="-l">Assume that each sequence has a linear topology
+ </option>
+ </param>
+ <param checked="true" falsevalue="" help="" label="Search for tmRNA genes (-m)" name="tmRNA" truevalue="-m" type="boolean"/>
+ <param checked="true" falsevalue="" help="" label="Search for tRNA genes (-t)" name="tRNA" truevalue="-t" type="boolean"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="Aragorn: ${input.name}" name="output"/>
+ </outputs>
+ <citations>
+ <citation type="bibtex">@article{Laslett2004,
+ abstract = {A computer program, ARAGORN, identifies tRNA and tmRNA genes. The
+ program employs heuristic algorithms to predict tRNA secondary
+ structure, based on homology with recognized tRNA consensus sequences
+ and ability to form a base-paired cloverleaf. tmRNA genes are
+ identified using a modified version of the BRUCE program. ARAGORN
+ achieves a detection sensitivity of 99\% from a set of 1290
+ eubacterial, eukaryotic and archaeal tRNA genes and detects all
+ complete tmRNA sequences in the tmRNA database, improving on the
+ performance of the BRUCE program. Recently discovered tmRNA genes in
+ the chloroplasts of two species from the 'green' algae lineage are
+ detected. The output of the program reports the proposed tRNA
+ secondary structure and, for tmRNA genes, the secondary structure of
+ the tRNA domain, the tmRNA gene sequence, the tag peptide and a list
+ of organisms with matching tmRNA peptide tags.},
+ author = {Laslett, Dean and Canback, Bjorn},
+ doi = {10.1093/nar/gkh152},
+ file = {:Users/koeho006/Library/Application Support/Mendeley
+ Desktop/Downloaded/Laslett, Canback - 2004 - ARAGORN, a program to
+ detect tRNA genes and tmRNA genes in nucleotide sequences.pdf:pdf},
+ isbn = {1362-4962 (Electronic)$\backslash$n1362-4962 (Linking)},
+ issn = {03051048},
+ journal = {Nucleic Acids Research},
+ mendeley-groups = {VAPP Application note},
+ pages = {11--16},
+ pmid = {14704338},
+ title = {{ARAGORN, a program to detect tRNA genes and tmRNA genes in
+ nucleotide sequences}},
+ volume = {32},
+ year = {2004}
+ }
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/circos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/circos.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,36 @@
+<tool id="DCircos" name="Circle Image Generator" version="0.1">
+ <description>Circos View</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:CIRCOS</container>
+ </requirements>
+ <command interpreter="docker">python3.4 /circos/circos.py '-input' '$input' -output '$output1' '$output2'</command>
+ <inputs>
+ <param format="ttl" label="genome ttl file" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="png" label="CIRCLE: ${input.name}" name="output1"/>
+ <data format="svg" label="CIRCLE: ${input.name}" name="output2"/>
+ </outputs>
+ <help>Visualization of the RDF genome using CIRCOS. It requires an RDF genome, optionally with gene predictions.
+ </help>
+ <citations>
+ <citation type="bibtex">@article{Krzywinski2009,
+abstract = {We created a visualization tool called Circos to facilitate the identification and analysis of similarities and differences arising from comparisons of genomes. Our tool is effective in displaying variation in genome structure and, generally, any other kind of positional relationships between genomic intervals. Such data are routinely produced by sequence alignments, hybridization arrays, genome mapping, and genotyping studies. Circos uses a circular ideogram layout to facilitate the display of relationships between pairs of positions by the use of ribbons, which encode the position, size, and orientation of related genomic elements. Circos is capable of displaying data as scatter, line, and histogram plots, heat maps, tiles, connectors, and text. Bitmap or vector images can be created from GFF-style data inputs and hierarchical configuration files, which can be easily generated by automated tools, making Circos suitable for rapid deployment in data analysis and reporting pipelines.},
+author = {Krzywinski, Martin and Schein, Jacqueline and Birol, Inan\c{c} and Connors, Joseph and Gascoyne, Randy and Horsman, Doug and Jones, Steven J and Marra, Marco A},
+doi = {10.1101/gr.092759.109},
+issn = {1549-5469},
+journal = {Genome research},
+keywords = {Animals,Chromosome Mapping,Chromosomes, Artificial, Bacterial,Chromosomes, Human, Pair 17,Chromosomes, Human, Pair 17: genetics,Chromosomes, Human, Pair 6,Chromosomes, Human, Pair 6: genetics,Contig Mapping,Dogs,Gene Dosage,Gene Dosage: genetics,Genome,Genome: genetics,Genomics,Humans,Lymphoma, Follicular,Lymphoma, Follicular: genetics,Software},
+month = sep,
+number = {9},
+pages = {1639--45},
+pmid = {19541911},
+title = {{Circos: an information aesthetic for comparative genomics.}},
+url = {http://genome.cshlp.org/content/early/2009/06/15/gr.092759.109.abstract},
+volume = {19},
+year = {2009}
+}
+
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/crt.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/crt.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,90 @@
+<tool id="DCRT" name="CRISPR detection" version="0.1">
+ <description></description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:CRT</container>
+ </requirements>
+ <command interpreter="docker">java -jar /crt/target/CRT-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' -output '$output' -format TURTLE
+ </command>
+ <inputs>
+ <param name="input" type="data" format="ttl" label="genome ttl file" />
+ </inputs>
+
+ <outputs>
+ <data format="ttl" name="output" label="CRISPR: ${input.name}" />
+ </outputs>
+ <help>
+ CRISPR prediction using CRT. Requires a converted
+ FASTA/EMBL/GenBank file.
+ </help>
+ <citations>
+ <citation type="bibtex">
+ @article{Bland2007,
+ abstract = {BACKGROUND:
+ Clustered Regularly Interspaced Palindromic Repeats
+ (CRISPRs) are a
+ novel type of direct repeat found in a wide range of
+ bacteria and
+ archaea. CRISPRs are beginning to attract attention
+ because of their
+ proposed mechanism; that is, defending their hosts
+ against invading
+ extrachromosomal elements such as viruses. Existing
+ repeat detection
+ tools do a poor job of identifying CRISPRs due to
+ the presence of
+ unique spacer sequences separating the repeats. In
+ this study, a new
+ tool, CRT, is introduced that rapidly and
+ accurately identifies
+ CRISPRs in large DNA strings, such as genomes
+ and metagenomes.
+ RESULTS: CRT was compared to CRISPR detection tools,
+ Patscan and
+ Pilercr. In terms of correctness, CRT was shown to be
+ very reliable,
+ demonstrating significant improvements over Patscan
+ for measures
+ precision, recall and quality. When compared to Pilercr,
+ CRT showed
+ improved performance for recall and quality. In terms of
+ speed, CRT
+ proved to be a huge improvement over Patscan. Both CRT and
+ Pilercr
+ were comparable in speed, however CRT was faster for genomes
+ containing large numbers of repeats. CONCLUSION: In this paper a new
+ tool was introduced for the automatic detection of CRISPR elements.
+ This tool, CRT, showed some important improvements over current
+ techniques for CRISPR identification. CRT's approach to detecting
+ repetitive sequences is straightforward. It uses a simple sequential
+ scan of a DNA sequence and detects repeats directly without any major
+ conversion or preprocessing of the input. This leads to a program
+ that is easy to describe and understand; yet it is very accurate,
+ fast and memory efficient, being O(n) in space and O(nm/l) in time.},
+ author = {Bland, Charles and Ramsey, Teresa L and Sabree, Fareedah
+ and Lowe, Micheal and Brown, Kyndall and Kyrpides, Nikos C and
+ Hugenholtz, Philip},
+ doi = {10.1186/1471-2105-8-209},
+ file =
+ {:Users/koeho006/Library/Application Support/Mendeley
+ Desktop/Downloaded/Bland et al. - 2007 - CRISPR recognition tool
+ (CRT) a tool for automatic detection of clustered regularly
+ interspaced palindromic repeat.pdf:pdf},
+ isbn = {1471-2105
+ (Electronic)$\backslash$n1471-2105 (Linking)},
+ issn = {14712105},
+ journal = {BMC bioinformatics},
+ mendeley-groups = {VAPP Application
+ note},
+ pages = {209},
+ pmid = {17577412},
+ title = {{CRISPR recognition
+ tool (CRT): a tool for automatic detection of
+ clustered regularly
+ interspaced palindromic repeats.}},
+ volume = {8},
+ year = {2007}
+ }
+ </citation>
+ </citations>
+</tool>
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/enzdp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/enzdp.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,16 @@
+<tool id="DEnzDP" name="EnzDP - Enzyme prediction" version="1.0.0">
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:ENZDP</container>
+ </requirements>
+ <command interpreter="docker">java -jar /enzdp/target/enzdpRDF-0.0.1-SNAPSHOT-jar-with-dependencies.jar -input $input -output $output -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="genome ttl with protein sequences" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="ENZDP: ${input.name}" name="output"/>
+ </outputs>
+ <help/>
+ <citations/>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/fasta2rdf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/fasta2rdf.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,82 @@
+<tool id="DFASTA2RDF" name="FASTA to RDF" version="0.1">
+ <description></description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:FASTA2RDF</container>
+ </requirements>
+ <command interpreter="docker">java -jar /fasta2rdf/target/FASTA2RDF-0.1-jar-with-dependencies.jar
+                '--type' '$source.fastaType' '--ignorestop' '$IgnoreStopCodon'
+ '--input' '$input' '--output' '$output' '-organism' '$organism'
+ '--ncbi_taxid' '$ncbi_taxid'
+ #if len(str($identification_tag))==0
+ '--idtag' ${input.name}
+ #else
+ '--idtag' '$identification_tag'
+ #end if
+ --source SAPP
+
+ #for $index, $id in enumerate( $ids )
+ '--id_alternative' '$id.id_tag'
+ #end for
+ '--id_alternative' '$input.name'
+ '--codon' '$table'
+ </command>
+ <inputs>
+ <param size="60" name="input" type="data" format="fasta" label="Fasta file for conversion" />
+
+                <conditional name="source">
+                        <param name="fastaType" type="select"
+                                label="Select if it is a Genome/Gene/Protein or program wont start!">
+                                <option value="">To be chosen</option>
+                                <option value="genome"> Genome </option>
+                                <option value="gene"> Gene</option>
+                                <option value="protein"> Protein </option>
+                                <validator type="empty_field"
+                                        message="Please select if it is a Genome, Gene or Protein" />
+                        </param>
+                </conditional>
+
+ <param name="table" type="select" label="Codon table">
+ <option value="1"> 1 - UNIVERSAL </option>
+ <option value="2"> 2 - VERTEBRATE_MITOCHONDRIAL </option>
+ <option value="3"> 3 - YEAST_MITOCHONDRIAL </option>
+ <option value="4"> 4 - MOLD_MITOCHONDRIAL </option>
+ <option value="5"> 5 - INVERTEBRATE_MITOCHONDRIAL </option>
+ <option value="6"> 6 - CILIATE_NUCLEAR </option>
+ <option value="9"> 9 - ECHINODERM_MITOCHONDRIAL </option>
+ <option value="10"> 10 - EUPLOTID_NUCLEAR </option>
+ <option value="11" selected="true"> 11 - BACTERIAL </option>
+ <option value="12"> 12 - ALTERNATIVE_YEAST_NUCLEAR </option>
+ <option value="13"> 13 - ASCIDIAN_MITOCHONDRIAL </option>
+ <option value="14"> 14 - FLATWORM_MITOCHONDRIAL </option>
+ <option value="15"> 15 - BLEPHARISMA_MACRONUCLEAR </option>
+ <option value="16"> 16 - CHLOROPHYCEAN_MITOCHONDRIAL </option>
+ <option value="21"> 21 - TREMATODE_MITOCHONDRIAL </option>
+ <option value="23"> 23 - SCENEDESMUS_MITOCHONDRIAL </option>
+ </param>
+ <param size="60" name="organism" type="text" format="text"
+ label="organism name" />
+ <param name='IgnoreStopCodon' type='boolean'
+ label='Ignore if stop codon within protein sequence' truevalue='true'
+ falsevalue='false' checked="false" help='' />
+
+ <param size="60" name="ncbi_taxid" type="integer" value="0"
+ label="NCBI taxonomy ID" optional="False">
+ <validator type="in_range" min="1"
+ message="Minimum taxonomy value is 1" />
+ </param>
+ <param size="60" name="identification_tag" type="text" format="text"
+ label="An identification tag used for RDF storage !Needs to be very unique!"
+ optional="True" />
+ <repeat name="ids" title="Identification tags">
+ <param size="60" name="id_tag" type="text" format="text"
+ label="An identification tag used by other consortiums" />
+ </repeat>
+ </inputs>
+ <outputs>
+ <data format="ttl" name="output" label="FASTA2RDF: ${input.name}" />
+ </outputs>
+ <help>
+ RDF creation from a multi (gene/protein/genome) fasta file
+ </help>
+</tool>
+
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/gbk2rdf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/gbk2rdf.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,24 @@
+<tool id="DGBK2RDF" name="EMBL/GBK to RDF" version="0.1">
+ <description>Converts GenBank/EMBL files to RDF</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:GBK2RDF</container>
+   </requirements>
+ <command>java -jar /genbank2rdf/target/GenBank2RDF-0.0.1-SNAPSHOT-jar-with-dependencies.jar '-input' '$input' -output '$output' -source "$source" -format "$format" -identifier "${input.name}" -codon "$codon" </command>
+ <inputs>
+ <param format="gbk,gb,genbank" label="Genbank file" name="input" type="data"/>
+ <param label="EMBL/GBK" name="format" type="select">
+ <option value="gbk">Genbank</option>
+ <option selected="true" value="embl">EMBL</option>
+ </param>
+ <param label="11/4" name="codon" type="select">
+ <option selected="true" value="11">11</option>
+ <option value="4">4</option>
+ </param>
+ <param label="Source of annotation eg. RAST/NCBI/EBI" name="source" optional="false" type="text"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="GBKttl: ${input.name}" name="output"/>
+ </outputs>
+ <help>Java Genbank or EMBL to RDF conversion
+ </help>
+</tool>
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/genecaller.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/genecaller.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,103 @@
+<tool id="DGenes" name="Gene prediction" version="1.0.0">
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:GENECALLER</container>
+ </requirements>
+ <command interpreter="docker">java -jar /genecaller/target/genecaller-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-runtype' '$runtype' -input $input -output $output -codon $codon -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="ttl genome file" name="input" type="data"/>
+ <param label="codon table selection" name="codon" type="select">
+ <option value="11">The Bacterial, Archaeal and Plant Plastid Code
+ (transl_table=11)
+ </option>
+ <option value="4">The Mold, Protozoan, Coelenterate Mitochondrial
+ and Mycoplasma/Spiroplasma Code (transl_table=4)
+ </option>
+ </param>
+ <param label="single or meta genome" name="runtype" type="select">
+ <option value="single">Single genome analysis</option>
+ <option value="meta">Metagenome analysis</option>
+ </param>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="ORF: ${input.name}" name="output"/>
+ </outputs>
+ <help>Prodigal gene prediction requires an RDF file from either a
+ Genome FASTA or
+ Genbank/EMBL format.
+ </help>
+ <citations>
+ <citation type="bibtex">@article{Hyatt2010,
+ abstract = {BACKGROUND: The
+ quality of automated gene prediction in microbial
+ organisms has
+ improved steadily over the past decade, but there is
+ still room for
+ improvement. Increasing the number of correct
+ identifications, both of
+ genes and of the translation initiation
+ sites for each gene, and
+ reducing the overall number of false
+ positives, are all desirable
+ goals.
+
+ RESULTS: With our years of experience in manually curating
+ genomes for the
+ Joint Genome Institute, we developed a new gene
+ prediction algorithm
+ called Prodigal (PROkaryotic DYnamic programming
+ Gene-finding
+ ALgorithm). With Prodigal, we focused specifically on the
+ three goals
+ of improved gene structure prediction, improved
+ translation
+ initiation site recognition, and reduced false positives.
+ We compared
+ the results of Prodigal to existing gene-finding methods
+ to
+ demonstrate that it met each of these objectives.
+
+ CONCLUSION: We
+ built a fast, lightweight, open source gene prediction program
+ called
+ Prodigal http://compbio.ornl.gov/prodigal/. Prodigal achieved
+ good
+ results compared to existing methods, and we believe it will be
+ a
+ valuable asset to automated microbial annotation pipelines.},
+ author =
+ {Hyatt, Doug and Chen, Gwo-Liang and Locascio, Philip F and
+ Land,
+ Miriam L and Larimer, Frank W and Hauser, Loren J},
+ doi =
+ {10.1186/1471-2105-11-119},
+ file =
+ {:Users/koeho006/Library/Application Support/Mendeley
+ Desktop/Downloaded/Hyatt et al. - 2010 - Prodigal prokaryotic gene
+ recognition and translation initiation site identification.pdf:pdf},
+ issn = {1471-2105},
+ journal = {BMC bioinformatics},
+ keywords =
+ {Algorithms,Databases, Genetic,Genome, Bacterial,Peptide Chain
+ Initiation, Translational,Peptide Chain Initiation, Translational:
+ genetics,Prokaryotic Cells,Software},
+ mendeley-groups = {Dump/VAPP
+ Paper},
+ month = jan,
+ number = {1},
+ pages = {119},
+ pmid = {20211023},
+ title = {{Prodigal: prokaryotic gene recognition and translation
+ initiation site identification.}},
+ url =
+ {http://www.biomedcentral.com/1471-2105/11/119},
+ volume = {11},
+ year =
+ {2010}
+ }
+
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/genomeInformation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/genomeInformation.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,16 @@
+<tool id="DInfo" name="Information overview" version="1.0.0">
+  <description>Information overview</description>
+  <requirements>
+    <container type="docker">jjkoehorst/sappdocker:GENOMEINFORMATION</container>
+  </requirements>
+  <command interpreter="docker">java -jar /genomeinformation/target/GenomeInformation-0.0.1-SNAPSHOT-jar-with-dependencies.jar -input $input -output '$output' -format 'TURTLE' 
+    </command>
+  <inputs>
+    <param format="ttl" label="Genome Database with Interpro" multiple="True" name="input" type="data"/>
+  </inputs>
+  <outputs>
+    <data format="text" label="information.text" name="output"/>
+  </outputs>
+  <help>Genome(s) information overview
+  </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/interproscan.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/interproscan.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,148 @@
+<tool id="DInterproscan" name="Interproscan" version="1.0.0">
+ <description>Interproscan annotation for SAPP</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:INTERPROSCAN</container>
+ </requirements>
+ <command interpreter="docker">java -jar /interproscan/target/interproscanRDF-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' '-format' 'TURTLE'
+ '-applications' '$appl'
+ '-output'
+ '$outfile' -v '$version' '$disable'
+ </command>
+ <inputs>
+ <param format="ttl" label="genome rdf file with orf prediction" name="input" type="data"/>
+ <param display="checkboxes" help="Select your program." label="Applications to run" multiple="True" name="appl" type="select">
+ <option selected="true" value="TIGRFAM">TIGRFAM: protein families
+ based on Hidden Markov Models or HMMs
+ </option>
+ <option selected="false" value="PIRSF">PIRSF: non-overlapping
+ clustering of UniProtKB sequences into a hierarchical order
+ (evolutionary relationships)
+ </option>
+ <option selected="true" value="ProDom">ProDom: set of protein domain
+ families generated from the UniProtKB
+ </option>
+ <option selected="true" value="SMART">SMART: identification and
+ analysis of domain architectures based on Hidden Markov Models or
+ HMMs
+ </option>
+ <option selected="false" value="PrositeProfiles">PROSITE Profiles:
+ protein domains, families and functional sites as well as associated
+ profiles to identify them
+ </option>
+ <option selected="true" value="PrositePatterns">PROSITE Pattern:
+ protein domains, families and functional sites as well as associated
+ patterns to identify them
+ </option>
+ <option selected="false" value="HAMAP">HAMAP: High-quality Automated
+ Annotation of Microbial Proteomes
+ </option>
+ <option selected="true" value="PfamA">PfamA: protein families, each
+ represented by multiple sequence alignments and hidden Markov models
+ </option>
+ <option selected="true" value="PRINTS">PRINTS: group of conserved
+ motifs (fingerprints) used to characterise a protein family
+ </option>
+ <option selected="true" value="SuperFamily">SUPERFAMILY: database of
+ structural and functional annotation
+ </option>
+ <option selected="true" value="Coils">Coils: Prediction of Coiled
+ Coil Regions in Proteins
+ </option>
+ <option selected="true" value="Gene3d">Gene3d: Structural assignment
+ for whole genes and genomes using the CATH domain structure database
+ </option>
+ </param>
+ <param label="Version selection" name="version" type="select">
+ <option value="interproscan-5.17-56.0">interproscan-5.17-56.0</option>
+ </param>
+ <param checked="false" falsevalue="-disableprecalc" help="You need to setup your own lookup server as the EBI version can differ. Look at interproscan configuration file for more info" label="Perform lookup of InterPro at defined server address" name="disable" truevalue="" type="boolean"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="IPR: ${input.name}" name="outfile"/>
+ </outputs>
+ <help>Interproscan annotation suite. Select your RDF genome with
+ protein annotation.
+ This can be either from a converted GenBank/EMBL
+ file or from a
+ Prodigal prediction.
+ The output will be an RDF file with
+ protein domain annotation from
+ InterPro.
+ </help>
+ <citations>
+ <citation type="bibtex">@article{Mitchell26112014,
+ author = {Mitchell,
+ Alex and Chang, Hsin-Yu and Daugherty, Louise and
+ Fraser, Matthew and
+ Hunter, Sarah and Lopez, Rodrigo and McAnulla,
+ Craig and McMenamin,
+ Conor and Nuka, Gift and Pesseat, Sebastien and
+ Sangrador-Vegas, Amaia
+ and Scheremetjew, Maxim and Rato, Claudia and
+ Yong, Siew-Yit and
+ Bateman, Alex and Punta, Marco and Attwood, Teresa
+ K. and Sigrist,
+ Christian J.A. and Redaschi, Nicole and Rivoire,
+ Catherine and
+ Xenarios, Ioannis and Kahn, Daniel and Guyot, Dominique
+ and Bork, Peer
+ and Letunic, Ivica and Gough, Julian and Oates, Matt
+ and Haft, Daniel
+ and Huang, Hongzhan and Natale, Darren A. and Wu,
+ Cathy H. and Orengo,
+ Christine and Sillitoe, Ian and Mi, Huaiyu and
+ Thomas, Paul D. and
+ Finn, Robert D.},
+ title = {The InterPro protein families database: the
+ classification
+ resource after 15 years},
+ year = {2014},
+ doi =
+ {10.1093/nar/gku1243},
+ abstract ={The InterPro database
+ (http://www.ebi.ac.uk/interpro/) is a freely
+ available resource that
+ can be used to classify sequences into
+ protein families and to predict
+ the presence of important domains and
+ sites. Central to the InterPro
+ database are predictive models, known
+ as signatures, from a range of
+ different protein family databases
+ that have different biological
+ focuses and use different
+ methodological approaches to classify
+ protein families and domains.
+ InterPro integrates these signatures,
+ capitalizing on the respective
+ strengths of the individual databases,
+ to produce a powerful protein
+ classification resource. Here, we report
+ on the status of InterPro as
+ it enters its 15th year of operation, and
+ give an overview of new
+ developments with the database and its
+ associated Web interfaces and
+ software. In particular, the new domain
+ architecture search tool is
+ described and the process of mapping of
+ Gene Ontology terms to
+ InterPro is outlined. We also discuss the
+ challenges faced by the
+ resource given the explosive growth in
+ sequence data in recent years.
+ InterPro (version 48.0) contains 36 766
+ member database signatures
+ integrated into 26 238 InterPro entries, an
+ increase of over 3993
+ entries (5081 signatures), since 2012.},
+ URL =
+ {http://nar.oxfordjournals.org/content/early/2014/11/26/nar.gku1243.abstract},
+ eprint =
+ {http://nar.oxfordjournals.org/content/early/2014/11/26/nar.gku1243.full.pdf+html},
+ journal = {Nucleic Acids Research}
+ }
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/ipath.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/ipath.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,39 @@
+<tool id="DIPath" name="iPath" version="2.01">
+ <description>iPath Generator</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:IPATH</container>
+ </requirements>
+ <command interpreter="docker">java -jar /ipath/target/iPath-0.0.1-SNAPSHOT-jar-with-dependencies.jar 
+ #if $conditional.source_select==&quot;single&quot;
+ -group1 '$conditional.input'
+ #else
+ -group1 $conditional.input1
+ -group2 $conditional.input2
+ #end if
+ -o &quot;$outfile&quot; -format TURTLE
+ </command>
+ <inputs>
+ <param label="Priam E-Value cutoff" name="Evalue" optional="False" size="60" type="float" value="1E-1"/>
+ <conditional name="conditional">
+ <param label="Analysis method" name="source_select" type="select">
+ <option value="single">Single genome</option>
+ <option value="group">Group comparison</option>
+ </param>
+ <when value="single">
+ <param format="ttl" label="RDF Genome file" name="input" type="data"/>
+ </when>
+ <when value="group">
+ <param format="rdf" label="RDF Genome file for group 1" multiple="True" name="input1" type="data"/>
+ <param format="rdf" label="RDF Genome file for group 2" multiple="True" name="input2" type="data"/>
+ </when>
+ </conditional>
+ </inputs>
+ <outputs>
+ <data format="tsv" label="iPath comparison" name="outfile"/>
+ </outputs>
+ <help>GROUP 1: #FF0000
+GROUP 2: #0000FF
+BOTH: #00FF00
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/loader.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/loader.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,17 @@
+<tool id="DLoader" name="RDF remote Loader" version="0.1">
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:LOADER</container>
+ </requirements>
+ <command interpreter="docker">java -jar /loader/target/Loader-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ -input '$input' -endpoint &quot;$endpoint&quot; -format &quot;application/x-turtle&quot;</command>
+ <inputs>
+ <param format="ttl" label="Annotated RDF file" multiple="true" name="input" type="data"/>
+ <param label="SPARQL endpoint" name="endpoint" optional="false" type="text"/>
+ </inputs>
+ <outputs>
+ <data format="xml" label="Loader: $endpoint" name="output"/>
+ </outputs>
+ <help>RDF Loader into a remote SPARQL end point
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/locustagger.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/locustagger.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,18 @@
+<tool id="DLocus" name="Locus tags inference from GBK import" version="0.1">
+ <description>Locus tagger inference from original annotation</description>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:LOCUSTAGGER</container>
+ </requirements>
+ <command interpreter="docker">java -jar /locustagger/target/LocusTagger-0.1-jar-with-dependencies.jar
+ '-input' '$input' -format 'TURTLE' -output '$output' -prefix '$prefix'</command>
+ <inputs>
+ <param format="ttl" label="RDF file" name="input" type="data"/>
+ <param format="text" label="PREFIX identifier" name="prefix" size="60" type="text"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="LocusTagger: ${input.name}" name="output"/>
+ </outputs>
+ <help>Locus tag inference from an original GenBank/EMBL file that was converted
+ to RDF
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/matrix.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/matrix.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,63 @@
+<tool id="DMatrix" name="MATSPARQL" version="1.0.1">
+  <description/>
+  <requirements>
+    <container type="docker">jjkoehorst/sappdocker:MATRIX</container>
+  </requirements>
+  <command interpreter="docker">java -jar /sparql/target/sparqljava-0.0.1-SNAPSHOT-jar-with-dependencies.jar '$separate' '-rdf' '$input' '-format' 'TURTLE' '-query' '$query' '-output' '$output' &amp;&amp; Rscript $__tool_directory__/matrix.R '$output' '$output' </command>
+  <inputs>
+    <param format="ttl" label="Genome Database" multiple="True" name="input" type="data"/>
+    <param area="True" label="SPARQL query" name="query" type="text" value="YOUR QUERY HERE"/>
+    <param checked="False" falsevalue="" help="Use this option if you run into memory or performance problems. Each genome will be queried independently of each other and therefore advanced comparison SPARQL queries will not work" label="Treat genomes separately" name="separate" truevalue="-separate" type="boolean"/>
+  </inputs>
+  <outputs>
+    <data format="tsv" label="matrix.tsv" name="output"/>
+  </outputs>
+  <help>Creates a matrix from a SPARQL query. The query should return three columns: X and Y for the coordinates and Z for the value.
+
+    A header for a SPARQL query would look like SELECT ?genome ?protein ?value or SELECT ?genome ?domain (COUNT(?domain) AS ?domainC)
+
+-----------------------------
+Genome Interpro Matrix
+-----------------------------
+The following query results in a matrix of genomes by Pfam accessions ::
+
+    PREFIX biopax:&lt;http://www.biopax.org/release/bp-level3.owl#&gt;
+    PREFIX ssb:&lt;http://csb.wur.nl/genome/&gt;
+    SELECT DISTINCT ?genome ?id (COUNT(?id) AS ?value)
+    WHERE { 
+      ?genome a ssb:Genome .
+      ?genome ssb:dnaobject ?dna .
+      ?dna ssb:feature ?feature .
+      ?feature ssb:tool ?tool .
+      ?feature ssb:protein ?protein .
+      ?protein ssb:feature ?domain .
+      ?domain ssb:signature ?signature .
+      ?signature biopax:xref ?xref .
+      ?xref biopax:db 'pfam' .
+      ?xref biopax:id ?id .
+      } GROUP BY ?genome ?id
+
+-------------------
+Enzyme based matrix
+-------------------
+
+The following query results in a matrix of genomes by EC numbers ::
+
+   PREFIX ssb:&lt;http://csb.wur.nl/genome/&gt;
+   SELECT  ?genome ?ec (COUNT(?ec) AS ?ecCount)
+   WHERE {
+        ?gene a ssb:Cds .
+        ?gene ssb:locus_tag ?locus .
+        ?gene ssb:source ?source .
+        ?gene ssb:protein ?protein .
+        ?protein ssb:feature ?feature .
+        {
+            ?feature ssb:kegg ?ec .
+        } UNION {
+            ?feature ssb:ec_number ?ec .
+        }
+   } GROUP BY ?genome ?ec
+
+**If you have specific questions about the database that you created, feel free to contact us.**
+  </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/merger.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/merger.xml Wed Jun 29 01:36:58 2016 -0400
[
@@ -0,0 +1,17 @@
+<tool id="DMerger" name="Merger of RDF" version="1.0.0">
+  <description/>
+  <requirements>
+    <container type="docker">jjkoehorst/sappdocker:MERGER</container>
+  </requirements>
+  <command interpreter="docker">java -jar /merger/target/rdfMerge-0.0.1-SNAPSHOT-jar-with-dependencies.jar -input $input  -output '$output' -format TURTLE
+    </command>
+  <inputs>
+    <param format="ttl" label="RDF Files" multiple="True" name="input" type="data"/>
+  </inputs>
+  <outputs>
+    <data format="ttl" label="MERGER: {$input[-1].name}" name="output"/>
+  </outputs>
+  <help>Merges multiple genome TURTLE files into a single file.
+    Handy when multiple analyses are run in parallel and the results need to be merged into a single TURTLE RDF file.
+  </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/pathwayAnalysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/pathwayAnalysis.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,12 @@
+<tool id="DPathway" name="Pathway analysis" version="0.1">
+ <description/>
+ <command interpreter="docker">java -jar /pathwayanalysis/target/pathwayAnalysis-0.0.1-SNAPSHOT-jar-with-dependencies.jar -input $input -output $output -format TURTLE</command>
+ <inputs>
+ <param format="ttl" label="Annotated RDF file" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="tsv" label="PathwayAnalysis: ${input.name}" name="output"/>
+ </outputs>
+ <help>Pathway overview information for MetaCyc and KEGG
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/phylogeny.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/phylogeny.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,16 @@
+<tool id="DPhylogeny" name="Phylogeny analysis" version="0.1">
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:PHYLOGENY</container>
+ </requirements>
+ <command interpreter="java -jar">/phylogeny/target/phylogeny-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ -input $input -output $output -format TURTLE</command>
+ <inputs>
+ <param format="ttl" label="Annotated RDF file" multiple="true" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="newick" label="Phylogeny" name="output"/>
+ </outputs>
+ <help>Phylogeny analysis based on protein signatures
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/priam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/priam.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,55 @@
+<tool id="DPriam" name="PRIAM EC detection" version="1.0.0">
+ <!-- Galaxy wrapper: declares the jjkoehorst/sappdocker:PRIAM container and runs the priam jar with -input/-output and -format TURTLE. -->
+ <!-- Takes a single TTL dataset and produces a TTL dataset labelled "PRIAM: <input name>". -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:PRIAM</container>
+ </requirements>
+ <command interpreter="docker">java -jar /priam/target/priam-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' -format TURTLE -output '$output'
+ </command>
+ <inputs>
+ <param format="ttl" label="ttl genome file" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="PRIAM: ${input.name}" name="output"/>
+ </outputs>
+ <help>EC detection using PRIAM. An RDF file with protein prediction is
+ required. Either from Genbank/EMBL or from Prodigal gene prediction
+ module.
+ </help>
+ <!-- BibTeX entry shown to users as the citation for this tool. -->
+ <citations>
+ <citation type="bibtex">@article{Claudel-Renard2003,
+ abstract = {The
+ advent of fully sequenced genomes opens the ground for the
+ reconstruction of metabolic pathways on the basis of the
+ identification of enzyme-coding genes. Here we describe PRIAM, a
+ method for automated enzyme detection in a fully sequenced genome,
+ based on the classification of enzymes in the ENZYME database. PRIAM
+ relies on sets of position-specific scoring matrices ( profiles')
+ automatically tailored for each ENZYME entry. Automatically generated
+ logical rules define which of these profiles is required in order to
+ infer the presence of the corresponding enzyme in an organism. As an
+ example, PRIAM was applied to identify potential metabolic pathways
+ from the complete genome of the nitrogen-fixing bacterium
+ Sinorhizobium meliloti. The results of this automated method were
+ compared with the original genome annotation and visualised on KEGG
+ graphs in order to facilitate the interpretation of metabolic
+ pathways and to highlight potentially missing enzymes.},
+ author =
+ {Claudel-Renard, C.},
+ doi = {10.1093/nar/gkg847},
+ issn = {1362-4962},
+ journal = {Nucleic Acids Research},
+ month = nov,
+ number = {22},
+ pages =
+ {6633--6639},
+ title = {{Enzyme-specific profiles for genome
+ annotation: PRIAM}},
+ url =
+ {http://nar.oxfordjournals.org/content/31/22/6633.abstract?etoc},
+ volume = {31},
+ year = {2003}
+ }
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/rdf2embl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/rdf2embl.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,70 @@
+<tool id="DRDF2EMBL" name="EMBL/GenBank creation" version="0.1">
+ <!-- Galaxy wrapper: converts one TTL/RDF dataset to a flat-file record via the rdf2embl jar, forwarding the record metadata entered in the form as command-line options. -->
+ <!-- The pathwaytools and gapprotein selects carry either an empty value or a bare flag, so they are expanded unquoted; quoting them would pass an empty-string argument when "No" is selected. -->
+ <!-- NOTE(review): the output dataset is always typed "embl" even when the writer select is set to "genbank" - confirm whether that is intended. -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:RDF2EMBL</container>
+ </requirements>
+ <command interpreter="docker">java -jar /rdf2embl/target/rdf2embl-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' -output '$output' -format 'TURTLE' '-organism'
+ '$organism' '-strain' '$strain' '-substrain' '$substrain' '-keywords'
+ '$keywords' '-taxon' '$taxon' -codon '$codon'
+ -locus '$prefix' '-title'
+ '$title' '-authors' '$authors' '-consortium' '$consortium' '-journal'
+ '$journal' '-dataclass' '$dataclass' '-writer' '$writer' '-projectid'
+ '$projectid' $pathwaytools '-note' '$note' '-scaffold' '$scaffold'
+ $gapprotein
+ </command>
+ <inputs>
+ <param format="ttl" label="TTL / RDF file" multiple="False" name="input" type="data"/>
+ <param label="Organism name" name="organism" optional="false" type="text"/>
+ <param label="Strain name" name="strain" optional="false" type="text"/>
+ <param label="Substrain name" name="substrain" optional="false" type="text"/>
+ <param label="Keywords" name="keywords" optional="false" type="text"/>
+ <param label="Project identifier" name="projectid" optional="false" type="text"/>
+ <param label="Taxon number" name="taxon" optional="false" type="text"/>
+ <param label="Locus prefix (FZH_)" name="prefix" optional="false" type="text"/>
+ <param label="Scaffold prefix (SCAF_)" name="scaffold" optional="false" type="text"/>
+ <param label="Journal" name="journal" optional="false" type="text" value="journal vol:pp-pp(year)"/>
+ <param label="Authors" name="authors" optional="false" type="text"/>
+ <param label="Title" name="title" optional="false" type="text"/>
+ <param label="Consortium" name="consortium" optional="false" type="text"/>
+ <!-- NOTE(review): $shakey and $header in the default note are not wrapper parameters; presumably the jar substitutes them - verify. -->
+ <param area="True" label="Note for each record" name="note" optional="false" size="10" type="text" value="Annotation was performed using the Semantic Annotation Platform for Prokaryotes (SAPP) and the sha384 key is $shakey and the FASTA header name is: $header"/>
+ <param label="codon table selection" name="codon" type="select">
+ <option value="11">The Bacterial, Archaeal and Plant Plastid Code
+ (transl_table=11)
+ </option>
+ <option value="4">The Mold, Protozoan, Coelenterate Mitochondrial
+ and Mycoplasma/Spiroplasma Code (transl_table=4)
+ </option>
+ </param>
+ <param label="Output format" name="writer" type="select">
+ <option selected="true" value="embl">EMBL format</option>
+ <option value="genbank">Genbank format</option>
+ </param>
+ <param label="Data class selection" name="dataclass" type="select">
+ <option value="PAT">Patent</option>
+ <option value="EST">Expressed Sequence Tag</option>
+ <option value="GSS">Genome Survey Sequence</option>
+ <option value="HTC">High Throughput CDNA sequencing</option>
+ <option value="HTG">High Throughput Genome sequencing</option>
+ <option value="MGA">Mass Genome Annotation</option>
+ <option selected="true" value="WGS">Whole Genome Shotgun</option>
+ <option value="TSA">Transcriptome Shotgun Assembly</option>
+ <option value="STS">Sequence Tagged Site</option>
+ <option value="STD">Standard (all entries not classified as above)
+ </option>
+ </param>
+ <param label="Remove gap spanning proteins (contains more than 50% of XXX in sequence)" name="gapprotein" type="select">
+ <option selected="true" value="">No</option>
+ <option selected="false" value="-gapprotein">Yes</option>
+ </param>
+ <param label="Pathway tools compatible? WARNING: Each contig needs to be loaded individually into Pathway tools (V19.0)" name="pathwaytools" type="select">
+ <option selected="true" value="">No</option>
+ <option selected="false" value="-pathwaytools">Yes</option>
+ </param>
+ </inputs>
+ <outputs>
+ <data format="embl" label="EMBL: ${input.name}" name="output"/>
+ </outputs>
+ <help>RDF to EMBL conversion. Locus tags are automatically generated unless locus tags have been inferred or generated through the locus module.
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/rnammer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/rnammer.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,87 @@
+<tool id="DRnammer" name="rRNA detection" version="1.0.0">
+ <!-- Galaxy wrapper: declares the jjkoehorst/sappdocker:RNAMMER container and runs the rnammer jar with -input/-output and -format TURTLE. -->
+ <!-- Takes a single genome TTL dataset and produces a TTL dataset labelled "RNA: <input name>". -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:RNAMMER</container>
+ </requirements>
+ <command interpreter="docker">java -jar /rnammer/target/rnammer-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-input' '$input' -output '$output' -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="genome ttl file" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="RNA: ${input.name}" name="output"/>
+ </outputs>
+ <help>Be aware that this can only be used for academic users; other
+ users are
+ requested to contact CBS Software Package Manager at
+ software@cbs.dtu.dk.
+ We are investigating alternative prediction
+ applications, please contact
+ us if you are aware of such method.
+ </help>
+ <!-- BibTeX entry shown to users as the citation for this tool. -->
+ <!-- NOTE(review): the entry still carries reference-manager fields (file, mendeley-groups) with a local path; consider whether they belong in a published wrapper. -->
+ <citations>
+ <citation type="bibtex">@article{Lagesen2007,
+ abstract = {The
+ publication of a complete genome sequence is usually
+ accompanied by
+ annotations of its genes. In contrast to protein
+ coding genes, genes
+ for ribosomal RNA (rRNA) are often poorly or
+ inconsistently annotated.
+ This makes comparative studies based on
+ rRNA genes difficult. We have
+ therefore created computational
+ predictors for the major rRNA species
+ from all kingdoms of life and
+ compiled them into a program called
+ RNAmmer. The program uses hidden
+ Markov models trained on data from
+ the 5S ribosomal RNA database and
+ the European ribosomal RNA database
+ project. A pre-screening step
+ makes the method fast with little loss
+ of sensitivity, enabling the
+ analysis of a complete bacterial genome
+ in less than a minute.
+ Results from running RNAmmer on a large set of
+ genomes indicate that
+ the location of rRNAs can be predicted with a
+ very high level of
+ accuracy. Novel, unannotated rRNAs are also
+ predicted in many
+ genomes. The software as well as the genome analysis
+ results are
+ available at the CBS web server.},
+ author = {Lagesen, Karin
+ and Hallin, Peter and R\o dland, Einar Andreas and
+ Staerfeldt,
+ Hans-Henrik and Rognes, Torbj\o rn and Ussery, David W},
+ doi =
+ {10.1093/nar/gkm160},
+ file = {:Users/koeho006/Library/Application
+ Support/Mendeley
+ Desktop/Downloaded/Lagesen et al. - 2007 - RNAmmer
+ consistent and
+ rapid annotation of ribosomal RNA genes.pdf:pdf},
+ issn =
+ {1362-4962},
+ journal = {Nucleic acids research},
+ keywords =
+ {Computational Biology,Computational Biology: methods,Genes,
+ rRNA,Genome, Bacterial,Genomics,Genomics: methods,Markov
+ Chains,Software},
+ mendeley-groups = {Dump/VAPP Paper,VAPP Application
+ note},
+ month = jan,
+ number = {9},
+ pages = {3100--8},
+ pmid = {17452365},
+ title = {{RNAmmer: consistent and rapid annotation of ribosomal RNA
+ genes.}},
+ volume = {35},
+ year = {2007}
+ }
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/signalp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/signalp.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,59 @@
+<tool id="DSignalp" name="Signal peptide detection" version="1.0.0">
+ <!-- Galaxy wrapper: declares the jjkoehorst/sappdocker:SIGNALP container and runs the signalp jar on one TTL dataset. -->
+ <!-- The runtype select (gram+, gram-, euk) is forwarded as -signaltype; the result is a TTL dataset labelled "signalP: <input name>". -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:SIGNALP</container>
+ </requirements>
+ <command interpreter="docker">java -jar /signalp/target/signalp-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ '-signaltype' '$runtype' -input '$input' -output '$output' -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="ttl genome file" name="input" type="data"/>
+ <param label="Gram+/- or Eukaryotes" name="runtype" type="select">
+ <option value="gram+">Gram+ Bacteria</option>
+ <option value="gram-">Gram- Bacteria</option>
+ <option value="euk">Eukaryotes</option>
+ </param>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="signalP: ${input.name}" name="output"/>
+ </outputs>
+ <help>Be aware that this can only be used for academic users; other
+ users are
+ requested to contact CBS Software Package Manager at
+ software@cbs.dtu.dk.
+ We are investigating alternative prediction
+ applications, please contact
+ us if you are aware of such method.
+ </help>
+ <!-- BibTeX entry shown to users as the citation for this tool. -->
+ <citations>
+ <citation type="bibtex">@article{Petersen2011,
+ author = {Petersen,
+ Thomas Nordahl and Brunak, S\o ren and von Heijne,
+ Gunnar and Nielsen,
+ Henrik},
+ doi = {10.1038/nmeth.1701},
+ issn = {1548-7105},
+ journal =
+ {Nature methods},
+ keywords = {Algorithms,Cell Membrane,Cell Membrane:
+ metabolism,Computational
+ Biology,Protein Sorting Signals,Software},
+ mendeley-groups = {Dump/VAPP Paper},
+ month = jan,
+ number = {10},
+ pages =
+ {785--6},
+ pmid = {21959131},
+ publisher = {Nature Publishing Group},
+ title = {{SignalP 4.0: discriminating signal peptides from
+ transmembrane
+ regions.}},
+ url =
+ {http://www.ncbi.nlm.nih.gov/pubmed/21959131},
+ volume = {8},
+ year =
+ {2011}
+ }
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/sparql.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/sparql.xml Wed Jun 29 01:36:58 2016 -0400
[
@@ -0,0 +1,60 @@
+<tool id="DQuery" name="SAPPARQL" version="1.0.0">
+  <!-- Galaxy wrapper: runs a user-supplied SPARQL query over one or more TTL datasets via the sparqljava jar and writes a TSV dataset. -->
+  <!-- The "separate" boolean expands to either nothing or the bare -separate flag, so it is appended unquoted at the end of the command. -->
+  <!-- NOTE(review): the query sanitizer admits all of string.punctuation, including the single quote, while the command wraps $query in single quotes; a query containing ' can break out of the shell quoting. Confirm whether the query should be passed through a file instead. -->
+  <description/>
+  <command interpreter="docker">java -jar /sparql/target/sparqljava-0.0.1-SNAPSHOT-jar-with-dependencies.jar '-rdf' '$input' '-format' 'TURTLE' '-query' '$query' '-output' '$output' $separate</command>
+  <inputs>
+    <param format="ttl" label="Genome Database" multiple="True" name="input" type="data"/>
+    <param area="True" label="SPARQL query" name="query" type="text" value="YOUR QUERY HERE">
+      <sanitizer>
+        <valid initial="string.ascii_letters + string.punctuation + string.whitespace + string.digits"/>
+      </sanitizer>
+    </param>
+    <param checked="False" falsevalue="" help="Use this option if you run into memory or performance problems. Each genome will be queried independently of each other and therefore advanced comparison SPARQL queries will not work." label="Treat genomes separately" name="separate" truevalue="-separate" type="boolean"/>
+  </inputs>
+  <outputs>
+    <data format="tsv" label="query.tsv" name="output"/>
+  </outputs>
+  <help>
+===============================
+Frequently Asked Queries (FAQs)
+===============================
+
+----------------------
+Obtain COG information
+----------------------
+
+Retrieving COG information from a single or multiple genomes at the same time. ::
+
+   PREFIX ssb:&lt;http://csb.wur.nl/genome/&gt;
+   SELECT ?genome ?label ?letter ?description
+   WHERE {
+      ?genome a ssb:Genome .
+      ?genome ssb:dnaobject ?dna .
+      ?dna ssb:feature ?gene .
+      ?gene ssb:protein ?protein .
+      ?protein ssb:feature ?feature .
+      ?feature a ssb:Blast .
+      ?feature ssb:evalue ?evalue .
+      ?feature ssb:cog ?cog .
+      ?cog ssb:cogid ?id .
+      ?id ssb:letter ?letter .
+      ?id ssb:description ?description .
+      ?id ssb:label ?label .
+   }
+   LIMIT 25
+
+-------------------------------
+Obtain GC content + Genome size
+-------------------------------
+
+Computing the GC fraction and the total sequence length for each genome. ::
+
+  PREFIX ssb:&lt;http://csb.wur.nl/genome/&gt;
+  SELECT ?genome (?GC/?ATGC AS ?GContent) ?ATGC
+  WHERE {
+    ?genome a ssb:Genome .
+    ?genome ssb:dnaobject ?dna .
+    ?dna ssb:sequence ?sequence .
+    BIND(STRLEN(REPLACE(?sequence,&quot;[AT]&quot;,&quot;&quot;,&quot;i&quot;)) AS ?GC)
+    BIND(STRLEN(?sequence) AS ?ATGC)
+  }
+
+  </help>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/swisscog.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/swisscog.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,17 @@
+<tool id="DSwissCog" name="Swissprot COG annotation" version="1.0.0">
+ <!-- Galaxy wrapper: declares the jjkoehorst/sappdocker:SWISSCOG container and runs the SwissCog jar on one TTL dataset. -->
+ <!-- Produces a TTL dataset labelled "SWISSCOG: <input name>". Help and citations are currently empty. -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:SWISSCOG</container>
+ </requirements>
+ <command interpreter="docker">java -jar /swisscog/target/SwissCog-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ -input '$input' -output '$output' -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="genome ttl with orf prediction" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="SWISSCOG: ${input.name}" name="output"/>
+ </outputs>
+ <help/>
+ <citations/>
+</tool>
\ No newline at end of file
b
diff -r 0a947cb25a3d -r 957156367442 sappDocker/tmhmm.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sappDocker/tmhmm.xml Wed Jun 29 01:36:58 2016 -0400
b
@@ -0,0 +1,92 @@
+<tool id="DTmhmm" name="Transmembrane detection" version="1.0.0">
+ <!-- Galaxy wrapper: declares the jjkoehorst/sappdocker:TMHMM container and runs the tmhmm jar on one TTL dataset. -->
+ <!-- Produces a TTL dataset labelled "TMHMM: <input name>". -->
+ <description/>
+ <requirements>
+ <container type="docker">jjkoehorst/sappdocker:TMHMM</container>
+ </requirements>
+ <command interpreter="docker">java -jar /tmhmm/target/tmhmm-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+ -input '$input' -output '$output' -format TURTLE
+ </command>
+ <inputs>
+ <param format="ttl" label="genome ttl with orf prediction" name="input" type="data"/>
+ </inputs>
+ <outputs>
+ <data format="ttl" label="TMHMM: ${input.name}" name="output"/>
+ </outputs>
+ <help>Be aware that this can only be used for academic users; other
+ users are
+ requested to contact CBS Software Package Manager at
+ software@cbs.dtu.dk.
+ We are investigating alternative prediction
+ applications, please contact
+ us if you are aware of such method.
+ </help>
+ <!-- BibTeX entry shown to users as the citation for this tool. -->
+ <citations>
+ <citation type="bibtex">@article{Krogh2001,
+ abstract = {We describe and
+ validate a new membrane protein topology
+ prediction method, TMHMM,
+ based on a hidden Markov model. We present
+ a detailed analysis of
+ TMHMM's performance, and show that it
+ correctly predicts 97-98 \% of
+ the transmembrane helices.
+ Additionally, TMHMM can discriminate
+ between soluble and membrane
+ proteins with both specificity and
+ sensitivity better than 99 \%,
+ although the accuracy drops when signal
+ peptides are present. This
+ high degree of accuracy allowed us to
+ predict reliably integral
+ membrane proteins in a large collection of
+ genomes. Based on these
+ predictions, we estimate that 20-30 \% of all
+ genes in most genomes
+ encode membrane proteins, which is in agreement
+ with previous
+ estimates. We further discovered that proteins with
+ N(in)-C(in)
+ topologies are strongly preferred in all examined
+ organisms, except
+ Caenorhabditis elegans, where the large number of
+ 7TM receptors
+ increases the counts for N(out)-C(in) topologies. We
+ discuss the
+ possible relevance of this finding for our understanding
+ of membrane
+ protein assembly mechanisms. A TMHMM prediction service is
+ available
+ at http://www.cbs.dtu.dk/services/TMHMM/.},
+ author = {Krogh,
+ A and Larsson, B and von Heijne, G and Sonnhammer, E L},
+ doi =
+ {10.1006/jmbi.2000.4315},
+ issn = {0022-2836},
+ journal = {Journal of
+ molecular biology},
+ keywords = {Animals,Bacterial Proteins,Bacterial
+ Proteins:
+ chemistry,Computational Biology,Computational Biology:
+ methods,Databases as Topic,Fungal Proteins,Fungal Proteins:
+ chemistry,Genome,Internet,Markov Chains,Membrane Proteins,Membrane
+ Proteins: chemistry,Plant Proteins,Plant Proteins:
+ chemistry,Porins,Porins: chemistry,Protein Sorting Signals,Protein
+ Structure, Secondary,Reproducibility of Results,Research
+ Design,Sensitivity and Specificity,Software,Solubility},
+ month = jan,
+ number = {3},
+ pages = {567--80},
+ pmid = {11152613},
+ title = {{Predicting
+ transmembrane protein topology with a hidden Markov
+ model: application
+ to complete genomes.}},
+ url =
+ {http://www.sciencedirect.com/science/article/pii/S0022283600943158},
+ volume = {305},
+ year = {2001}
+ }
+
+ </citation>
+ </citations>
+</tool>
\ No newline at end of file