Repository 'sapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjkoehorst/sapp

Changeset 7:c79025539d9b (2015-02-21)
Previous changeset 5:e159dbecdad6 (2015-02-21) Next changeset 8:e51957423315 (2015-02-21)
Commit message:
FASTA to RDF
modified:
gbk2rdf/gbktordf.py
added:
fasta2rdf/fastatordf.py
fasta2rdf/fastatordf.xml
fasta2rdf/test-data/.DS_Store
fasta2rdf/test-data/NC_017117.fna
b
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/fastatordf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2rdf/fastatordf.py Sat Feb 21 15:23:15 2015 +0100
[
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3.4
+# Author: Jasper Jan Koehorst
+# Date created: Jan 22 2015
+# Function: generation of a RDF file from a genome fasta file
+
+def delete_galaxy():  # Blank out every sys.path entry whose text contains "galaxy-dist/".
+ import sys
+ for index, path in enumerate(sys.path):
+ if "galaxy-dist/" in path:
+ sys.path[index] = ''  # entry is overwritten in place (not removed), so the list length is unchanged; NOTE(review): an empty sys.path entry is normally resolved as the current directory - confirm that is acceptable
+
+#Some modules required by rdflib also exist inside Galaxy, which breaks the rdflib import; Galaxy paths are therefore blanked first.
+delete_galaxy()  # runs at import time, before the rdflib imports that follow
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+
+store = plugin.get('IOMemory', Store)()  # module-level store instance; main() builds its own local store instead of using this one
+
+global URI  # 'global' at module level has no effect; the name is module-global either way
+URI = "http://csb.wur.nl/genome/"  # base URI for everything this script emits
+global seeAlso
+seeAlso = "rdfs:seeAlso"  # not referenced anywhere in the visible part of this script
+global coreURI
+coreURI = Namespace(URI)  # coreURI["name"] is how every class, predicate and resource URI below is built
+
+def createClass(uri):  # Declare uri as an owl:Class that is a subclass of owl:Thing; returns uri unchanged.
+ genomeGraph.add((uri,RDF.type,OWL.Class))  # genomeGraph is the module global assigned in main()
+ genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
+ return uri
+
+def fasta_parser(input_file):  # Read the FASTA file at input_file and add genome / DNA-object triples to the global genomeGraph; all options are read straight from sys.argv.
+ createClass(coreURI["Genome"])            #Genome class
+ createClass(coreURI["Type"])                #Type class (Chr,Pls,Scaffold)
+
+ genomeDict = {}  # only used by the commented-out scaffold logic further down
+
+ #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds
+#  regex = re.compile('\[type=(.*?)\]')
+ sequence = ""
+ genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")  # value following -idtag with spaces turned into underscores; list.index raises ValueError if -idtag is missing
+ if genomeID == 'None':  # the literal string 'None' triggers the fallback to -id_alternative
+ genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
+
+ genomeURI = coreURI[genomeID]
+ for index, element in enumerate(sys.argv):  # scan every argument; the value of a flag is the argument right after it
+ if '-organism' == element:
+ genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
+ if '-ncbi_taxid' == element:
+ genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
+ if '-idtag' == element:
+ genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))  # stores the raw -idtag value, not the underscore-normalised genomeID
+ if '-ids' == element:
+ genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))  # -ids values share the id_tag predicate with -idtag
+
+ genomeDict[genomeID] = {}
+ # typDict = {"plasmid":0,"scaffold":0,"chromosome":0}
+
+ #Generating genome dictionary
+ data = open(input_file).readlines()  # whole file is read into memory; the file object is never explicitly closed
+ fastadict = {}  # header text -> concatenated sequence
+ key = ""
+ for index, line in enumerate(data):
+ if ">" == line[0]:
+ key = line.strip(">").strip()  # header without the leading '>' and surrounding whitespace
+ fastadict[key] = ""  # a repeated header resets whatever sequence was collected for it
+ else:
+ fastadict[key] += line.strip()  # NOTE(review): a sequence line before any header would raise KeyError, because key "" is never inserted
+
+ # for line in fastadict:
+ # typ = regex.findall(line)
+ # value = 0
+ #If something is found
+ # if len(typ) > 0:
+ # typ = typ[0]
+ #If something is not found
+ # elif typ == []:
+ # typ = "scaffold"
+ #If something is found but does not contain a value
+ # elif "_" in typ:
+ # value = typ.split("_")[-1]
+ # try:
+ # value = int(value)
+ # except:
+ # value = 1
+ #Not an integer
+
+ #If a value is not given it is automatically assigned as the first one
+ #If a value is given...
+ # if value > -1:
+ #If a second scaffold of a chromosome_1 is found
+ # if typ in genomeDict[genome]:
+ #Retrieve how many
+ # value = len(genomeDict[genome][typ]) + 1
+ # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]}
+ # else:
+ # genomeDict[genome][typ] = {}
+ # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]}
+
+ #Genome dictionary to TTL
+ genomeClass = createClass(coreURI["Genome"])  # same class as the first createClass call above; repeated here for its return value
+ typeClass = createClass(coreURI["DnaObject"])
+ for index, genome in enumerate(fastadict):  # 'genome' is a FASTA header string (a fastadict key), not the genome ID
+ # for typ in genomeDict[genome]:
+ # for scaf in genomeDict[genome][typ]:
+ # for con in genomeDict[genome][typ][scaf]:
+ #A note is required here...
+ #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig.
+ #Previously it was
+ # Genome > Class > Scaffold > Contig
+ #Now it will be
+ # Genome > Class/Scaffold/Contig
+ #typeURI = coreURI[genome + "/" + typ]
+ #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf]
+ #Was contigURI
+ typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con]
+ # sequence = genomeDict[genome][typ][scaf][con]
+ sequence = fastadict[genome]
+ genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))  # links the genome to this DNA object
+ genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))  # -sourcedb is looked up in sys.argv again on every use
+ genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
+ genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
+ genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+ genomeGraph.add((genomeURI, RDF.type,genomeClass))
+ genomeGraph.add((typeURI, RDF.type,typeClass))
+
+def save():  # Serialize the global genomeGraph as Turtle and write it to the path that follows -output in sys.argv.
+ data = genomeGraph.serialize(format='turtle')
+ open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)  # binary mode, so data is presumably bytes - TODO confirm for the rdflib version in use; the file object is not explicitly closed
+
+def main():  # Create the global graph, parse the FASTA file named after -input, then write the result via save().
+ store = plugin.get('IOMemory', Store)()  # local store; shadows the module-level 'store'
+ global genomeGraph  # createClass, fasta_parser and save all read this global
+ genomeGraph = Graph(store,URIRef(URI))
+ genomeGraph.bind("ssb",coreURI)  # registers the prefix "ssb" for the coreURI namespace
+ input_file = sys.argv[sys.argv.index("-input")+1]  # list.index raises ValueError if -input is missing
+ fasta_parser(input_file)
+ save()
+
+if __name__ == '__main__':  # run main() only when the file is executed as a script
+ main()
+
b
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/fastatordf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2rdf/fastatordf.xml Sat Feb 21 15:23:15 2015 +0100
b
@@ -0,0 +1,34 @@
+<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
+ <description></description>
+ <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
+ #for $index, $id in enumerate( $ids ) 
+ '-ids' '$id.id_tag'
+ #end for
+ '-id_alternative' '$input.name'
+ </command>
+ <inputs>
+ <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
+ <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/>
+ <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
+ <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/>
+ <repeat name="ids" title="Identification tags">     
+ <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
+ </repeat>
+ </inputs>
+
+ <outputs>
+ <data format="rdf" name="output" label="genomeTTL: ${input.name}" />
+ </outputs>
+    
+    <tests>
+        <test>
+            <param name="input" value="test-data/NC_017117.fna"/>
+            <output name="$output" file="NC_017117.rdf"/>
+            <output name="$ncbi_taxid" value="634455"/>
+            <output name="$idtag" value="Acetobacter pasteurianus IFO 3283-22"/>
+            <output name="$organism" value="Acetobacter pasteurianus IFO 3283-22"/>
+        </test>
+    </tests>
+
+<help> Genome FASTA file to RDF</help>
+</tool>
b
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/test-data/.DS_Store
b
Binary file fasta2rdf/test-data/.DS_Store has changed
b
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/test-data/NC_017117.fna
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2rdf/test-data/NC_017117.fna Sat Feb 21 15:23:15 2015 +0100
b
b'@@ -0,0 +1,2736 @@\n+>gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n+CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n+TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n+GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n+CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n+GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n+TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n+TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n+GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n+TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n+AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n+CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n+GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n+TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n+GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n+TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n+CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n+AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n+AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n+CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n+GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n+GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n+GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n+CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n+CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n+TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n+AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTG
ACCTGCAGAGCTTCCGGCGCGGGCGC\n+GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n+TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n+CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n+GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n+ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n+GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n+TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n+CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n+GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n+CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n+CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n+ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n+AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n+AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n+GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n+GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n+CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n+GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n+ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n+GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n+GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n+GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n+TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n+CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n+GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n+GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n+TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n+
GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n+AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n+ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n+GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n+AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n+GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n+TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n+CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n+GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n+ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n+TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n+TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n+TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n+GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n+TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n+TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n+GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n+TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n+ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n+CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n+GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n+GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n+TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n+ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n+AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n+AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n+AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTT
CCCATTTTTTGGGGAGACAG\n+ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n+GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n+GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n+GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n+ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n+CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n+AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n+GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n+ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n+TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n+GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n+AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n+AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n+GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n+ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n+GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n+GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n+GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n+AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n+GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n+TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n+CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n+ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n+TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n+ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n+TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n+CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n+GCACGT
CCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n+ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n'
b
diff -r e159dbecdad6 -r c79025539d9b gbk2rdf/gbktordf.py
--- a/gbk2rdf/gbktordf.py Sat Feb 21 07:56:16 2015 -0500
+++ b/gbk2rdf/gbktordf.py Sat Feb 21 15:23:15 2015 +0100
[
@@ -108,8 +108,12 @@
  gi = record.annotations["gi"]
  typ = str(gi)
  except:
- scaf_value += 1
- typ = "scaffold_"+str(scaf_value)
+ try:
+ gi = record.annotations["accessions"][0]
+ typ = str(gi)
+ except:
+ scaf_value += 1
+ typ = "scaffold_"+str(scaf_value)
  genomeURI = coreURI[genome]
  gbkURI = coreURI[genome + "/" + typ]
  #To contig connection to connect all data to it
@@ -148,8 +152,7 @@
  int_add(gbkURI,coreURI[annot.lower()],str(a))
  else:
  int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
-
-
+
  #####END of RECORD####
  if len(sequence) > 0:
  genomeGraph.add((gbkURI, coreURI["sequence"] ,  Literal(sequence)))
@@ -167,13 +170,6 @@
 
  if strand == 'None':
  strand = 0
-
-#  if feature_type == "gene":
-#  gene = feature
- #Store gene in next feature....
-#  gene_location_start = end = str(gene.location.end).replace(">","").replace("<","")
-#  gene_location_stop = str(gene.location.start).replace(">","").replace("<","")
-#  gene_qualifiers = gene.qualifiers
  else:
  if feature.type == "misc_feature": #Store as part of previous cds or something...
  if strand == "-1":
@@ -181,8 +177,6 @@
  else:
  miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
 
- # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI))
-
  # TODO: Check if biopython has an overlap function...
  if int(prevObjStart) <= int(start):
  if int(end) <= int(prevObjStop):
@@ -201,15 +195,12 @@
  prevObjStart = start
  prevObjStop = end
 
-
  if strand == "-1":
  typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
  else:
  typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
 
-#  cds_sequence = str(feature.extract(sequence))
- #Contig specific connection
-
+ #Contig specific connection
  genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
  ############################
 
@@ -228,6 +219,7 @@
  genomeGraph.add((typeURI, coreURI["feature"] , subURI))
  store_general_information(subURI,subfeature,record,feature)
 
+
 def store_general_information(generalURI,feature,record,superfeature=""):
  proteinClass = createClass(coreURI["Protein"], root=True)
  sequence = str(record.seq)
@@ -277,8 +269,6 @@
  #And subfeature variable will contain the superfeature
  if superfeature:
  codon = superfeature.qualifiers["transl_table"][0]
-#  else:
-#  codon = subfeature.qualifiers["transl_table"][0]
  except:
  #Default codon table 11
  codon = "11"
@@ -356,7 +346,6 @@
  genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
  genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
  genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
- genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
  genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
 
 def main():