annotate fasta2rdf/fastatordf.py @ 9:3f4f1cd22a6a

FASTA 2 RDF code cleanup
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 15:38:26 +0100
parents ec73c34af97b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python3.4
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
2 # Author: Jasper Jan Koehorst
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
3 # Date created: Jan 22 2015
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
4 # Function: generation of an RDF file from a genome FASTA file
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
5
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
6
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
7 # from io import StringIO
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
9 # import rdflib
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
10 from rdflib.store import Store
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
11 import sys
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
12
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
# Base namespace under which every resource created by this script is minted.
URI = "http://csb.wur.nl/genome/"
# Kept as a module-level name; it is not referenced by the code visible in this file.
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# One in-memory store backs the single graph that the functions below fill
# and that save() serialises.
# NOTE(review): the store plugin name 'IOMemory' may differ between rdflib
# releases - confirm against the installed version.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)
3f4f1cd22a6a FASTA 2 RDF code cleanup
jjkoehorst <jasperkoehorst@gmail.com>
parents: 6
diff changeset
25
3f4f1cd22a6a FASTA 2 RDF code cleanup
jjkoehorst <jasperkoehorst@gmail.com>
parents: 6
diff changeset
def delete_galaxy():
    """Remove every Galaxy distribution directory from ``sys.path``.

    Any entry containing ``"galaxy-dist/"`` is dropped so that modules found
    there are no longer importable through those entries.

    The entries are removed rather than overwritten with ``''``: an empty
    string in ``sys.path`` stands for the current working directory, so
    blanking an entry would silently add the working directory to the
    import search path instead of deleting anything.
    """
    # Slice assignment mutates the existing list object, so every holder of a
    # reference to sys.path observes the cleaned-up search path.
    sys.path[:] = [entry for entry in sys.path if "galaxy-dist/" not in entry]
6
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
30
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def createClass(uri):
    """Declare ``uri`` as an OWL class in the module-level genome graph.

    Two triples are added: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``.

    Returns the same ``uri`` that was passed in, so the call can be used
    inline in an assignment.
    """
    class_statements = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in class_statements:
        genomeGraph.add((uri, predicate, target))
    return uri
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
35
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def _argument_after(flag):
    """Return the command-line value that directly follows ``flag`` in sys.argv."""
    return sys.argv[sys.argv.index(flag) + 1]


def _read_fasta(input_file):
    """Read a FASTA file into an ordered mapping of header -> sequence string."""
    # Local import keeps this block self-contained.  An OrderedDict guarantees
    # file order on every Python 3 version, which keeps the dnaobject_<n>
    # numbering derived from it reproducible.
    from collections import OrderedDict

    chunks = OrderedDict()
    header = None
    with open(input_file) as handle:
        for line in handle:
            text = line.strip()
            if not text:
                # Blank lines carry no sequence data.
                continue
            if line[0] == ">":
                header = line.lstrip(">").strip()
                # A repeated header restarts its sequence, as before.
                chunks[header] = []
            elif header is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in "
                    + str(input_file)
                )
            else:
                chunks[header].append(text)
    # Join once per record instead of growing a string line by line.
    return OrderedDict((name, "".join(parts)) for name, parts in chunks.items())


def fasta_parser(input_file):
    """Fill the module-level genome graph from a FASTA file and sys.argv.

    Command-line flags read from ``sys.argv``:
      -idtag           genome identifier (spaces become underscores)
      -id_alternative  used instead when -idtag is the string 'None'
                       (spaces and dots become underscores)
      -organism, -ncbi_taxid, -idtag, -ids
                       each occurrence is stored as a literal on the genome
      -sourcedb        stored on the genome and on every sequence record

    Every FASTA record becomes a ``<genomeID>/dnaobject_<n>`` resource
    (numbered in file order) carrying its header, sequence and source
    database, and is linked from the genome resource.

    Parameters:
      input_file: path of the FASTA file to read.

    Raises:
      ValueError: a required flag is missing from sys.argv, or sequence data
                  appears before the first '>' header line.
      IndexError: a flag is the last command-line argument and has no value.
    """
    genomeClass = createClass(coreURI["Genome"])
    # The "Type" class is not used for typing below; it is still declared so
    # the generated graph keeps the same class declarations as before.
    createClass(coreURI["Type"])

    genomeID = _argument_after("-idtag").replace(" ", "_")
    if genomeID == 'None':
        genomeID = _argument_after("-id_alternative").replace(" ", "_").replace(".", "_")
    genomeURI = coreURI[genomeID]

    # NOTE(review): '-ids' is stored under the same 'id_tag' predicate as
    # '-idtag' - confirm that this is intended.
    flag_to_predicate = {
        "-organism": "organism",
        "-ncbi_taxid": "taxonomy",
        "-idtag": "id_tag",
        "-ids": "id_tag",
    }
    for index, element in enumerate(sys.argv):
        predicate = flag_to_predicate.get(element)
        if predicate is not None:
            genomeGraph.add((genomeURI, coreURI[predicate], Literal(sys.argv[index + 1])))

    records = _read_fasta(input_file)
    typeClass = createClass(coreURI["DnaObject"])
    if not records:
        # No sequences: the genome-level triples below were only ever added
        # alongside a record, so nothing more is written.
        return

    # Genome-level facts are identical for every record; add them once.
    source_db = Literal(_argument_after("-sourcedb"))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], source_db))

    for index, (header, sequence) in enumerate(records.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, RDF.type, typeClass))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], source_db))
6
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
83
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def save():
    """Serialise the module-level genome graph as Turtle to the -output path.

    The destination is the command-line value that follows ``-output`` in
    ``sys.argv``.

    Raises:
      ValueError: ``-output`` is not present in sys.argv.
      IndexError: ``-output`` is the last argument and has no value.
    """
    output_path = sys.argv[sys.argv.index("-output") + 1]
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib version serialize() hands back bytes or text;
    # normalise to bytes so the binary write works in both cases.
    if isinstance(data, str):
        data = data.encode("utf-8")
    # The context manager guarantees the file is flushed and closed.
    with open(output_path, "wb") as handle:
        handle.write(data)
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
87
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def main():
    """Run the conversion: parse the FASTA file given after -input, then save.

    The input path is the command-line value that follows ``-input`` in
    ``sys.argv``; the graph built from it is written out by ``save()``.
    """
    arguments = sys.argv
    input_position = arguments.index("-input") + 1
    fasta_parser(arguments[input_position])
    save()
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
92
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
if __name__ == '__main__':
    # Galaxy provides some of the modules that rdflib itself requires, which
    # interferes with the RDF import machinery; neutralise Galaxy's sys.path
    # entries before doing any work.
    delete_galaxy()
    main()
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
97