annotate conversion/protein2rdf/protein_to_ttl.py @ 16:74b8ba5e2d5b

aragorn addition
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 17:17:06 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def delete_galaxy():
    """Remove Galaxy's bundled library directories from ``sys.path``.

    Every search-path entry containing ``"galaxy-dist/"`` is dropped. The
    list is edited in place, so existing references to ``sys.path`` observe
    the change.

    Entries are removed rather than blanked out: an empty string in
    ``sys.path`` stands for the current working directory, so overwriting an
    entry with ``''`` would silently add the working directory to the import
    search path once per Galaxy entry.
    """
    import sys
    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]


# Some modules required by rdflib are also shipped with Galaxy, and those
# copies break the rdflib import, so the Galaxy paths are dropped before
# rdflib is imported below.
delete_galaxy()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
9
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
10 # from io import StringIO
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
11 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
12 # import rdflib
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
13 from rdflib.store import Store
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
14 import sys
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
15 import hashlib
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
16
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
# In-memory rdflib store created at import time.
store = plugin.get('IOMemory', Store)()

# Names assigned at module level are already global, so no ``global``
# declarations are needed here.

# Base URI of the genome namespace.
URI = "http://csb.wur.nl/genome/"
# Prefixed name of the rdfs:seeAlso predicate, kept as a plain string.
seeAlso = "rdfs:seeAlso"
# Namespace from which the resource and predicate URIs are built.
coreURI = Namespace(URI)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
25
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
26
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def createClass(uri):
    """Declare *uri* as an OWL class in the module-level ``genomeGraph``.

    Two triples are added: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``.

    :param uri: identifier of the class to declare.
    :returns: the same *uri*, so the call can be used inline in an assignment.
    """
    class_statements = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in class_statements:
        genomeGraph.add((uri, predicate, target))
    return uri
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
31
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
# Command-line flags whose value is copied onto the genome resource, mapped to
# the name of the predicate (inside the core namespace) that stores the value.
# Both '-idtag' and '-ids' are stored under "id_tag".
_GENOME_METADATA_FLAGS = {
    '-organism': "organism",
    '-ncbi_taxid': "taxonomy",
    '-idtag': "id_tag",
    '-diagnosis': "diagnosis",
    '-country': "country",
    '-location': "location",
    '-date': "date",
    '-ids': "id_tag",
}


def _read_fasta(input_file):
    """Return a dict mapping each FASTA header line to its joined sequence.

    Keys are the raw header lines (leading ``>`` and line ending included).
    Sequence lines are stripped and concatenated. Sequence data found before
    the first header is stored under the empty-string key. A file without any
    header or sequence data yields an empty dict.
    """
    records = {}
    header = ""
    chunks = []
    # The context manager guarantees the handle is closed after parsing.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks:
                    records[header] = "".join(chunks)
                header = line
                chunks = []
                # Register the header right away so that a record without
                # any sequence lines is still present (with an empty string).
                records[header] = ""
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # Flush the last record; skip the flush when nothing was read at all so an
    # empty file does not produce a phantom record.
    if header or chunks:
        records[header] = "".join(chunks)
    return records


def fasta_parser(input_file):
    """Convert a protein FASTA file into triples in the module-level graph.

    The genome identifier is read from ``sys.argv``: the value after
    ``-idtag`` (spaces replaced by underscores) or, when that is empty, the
    value after ``-id_alternative`` (spaces and dots replaced by underscores).
    The value following each flag listed in ``_GENOME_METADATA_FLAGS`` is
    attached to the genome resource as a literal, and the value after
    ``-sourcedb`` is attached to the genome, every CDS and every protein.

    Each FASTA record becomes a CDS feature with consecutive, theoretical
    begin/end coordinates, linked to a protein resource whose URI is derived
    from the MD5 digest of the sequence.

    :param input_file: path of the FASTA file to read.
    :raises ValueError: if ``-idtag`` or ``-sourcedb`` is absent from
        ``sys.argv`` (or ``-id_alternative`` when the id tag is empty).
    :raises IndexError: if a recognised flag is the last command-line item.
    """
    argv = sys.argv

    genome = argv[argv.index('-idtag') + 1].replace(" ", "_")
    if genome == '':
        genome = argv[argv.index('-id_alternative') + 1].replace(" ", "_").replace(".", "_")
    genomeURI = coreURI[genome]

    # Copy the optional descriptive flags onto the genome resource.
    for index, element in enumerate(argv):
        predicate = _GENOME_METADATA_FLAGS.get(element)
        if predicate is not None:
            genomeGraph.add((genomeURI, coreURI[predicate], Literal(argv[index + 1])))

    records = _read_fasta(input_file)

    # Loop-invariant: the same source database literal is attached everywhere.
    sourcedb = Literal(argv[argv.index("-sourcedb") + 1])

    # Create a class, to be the same as all the other genome conversions...
    # TODO: Proteins are part of cds, cds are part of dnaobject
    # If CDS is not there... how then?
    classURI = coreURI[genome + "/" + "protein_fasta"]
    proteinClass = createClass(coreURI["Protein"])
    genomeClass = createClass(coreURI["Genome"])
    typeClass = createClass(coreURI["DnaObject"])
    cdsClass = createClass(coreURI["Cds"])

    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
    genomeGraph.add((classURI, RDF.type, typeClass))

    # A theoretical begin, end is created to have a workable GBK generation
    begin = 0
    for sequence in records.values():
        # Only the digest and the length use the encoded bytes; the literal
        # stored in the graph stays text so it is not written as a bytes value.
        encoded = sequence.encode('utf-8')
        end = begin + len(encoded)
        md5_protein = hashlib.md5(encoded).hexdigest()
        proteinURI = coreURI["protein/" + md5_protein]

        cdsURI = coreURI[genome + "/protein_fasta/" + str(begin) + "_" + str(end)]
        genomeGraph.add((classURI, coreURI["feature"], cdsURI))
        genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
        genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
        genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
        genomeGraph.add((cdsURI, RDF.type, cdsClass))

        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((proteinURI, RDF.type, proteinClass))
        genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
        begin = end
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
114
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def save():
    """Write the module-level ``genomeGraph`` as Turtle to the output file.

    The destination path is the command-line item following ``-output`` in
    ``sys.argv``.

    :raises ValueError: if ``-output`` is absent from ``sys.argv``.
    """
    data = genomeGraph.serialize(format='turtle')
    # Newer rdflib releases (6.0 onwards) return text from serialize() while
    # older ones return bytes; normalise so the binary write works with both.
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    # The context manager closes (and flushes) the file even if write() fails.
    with open(sys.argv[sys.argv.index("-output") + 1], "wb") as handle:
        handle.write(data)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
118
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def main():
    """Build the genome graph from the ``-input`` FASTA file and save it.

    A fresh in-memory store backs the graph, which is published as the
    module-level ``genomeGraph`` used by the other functions. The core
    namespace is bound to the ``ssb`` prefix before the FASTA file named
    after ``-input`` in ``sys.argv`` is converted and the result is written
    out.
    """
    global genomeGraph
    memory_store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(memory_store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    fasta_parser(sys.argv[sys.argv.index("-input") + 1])
    save()


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
130