comparison conversion/protein2rdf/protein_to_ttl.py @ 16:74b8ba5e2d5b

aragorn addition
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 17:17:06 +0100
parents
children
comparison
equal deleted inserted replaced
15:10cad758ed0f 16:74b8ba5e2d5b
def delete_galaxy():
    """Blank out every ``sys.path`` entry that contains ``"galaxy-dist/"``.

    Matching entries are overwritten with ``''`` in place, so the list keeps
    its length and the position of every other entry. Entries that do not
    support a substring test against text (for example ``None`` or a path
    object) are left untouched instead of aborting the scan.
    """
    # Imported locally because this function is called before the
    # module-level ``import sys`` has run.
    import sys
    for index, path in enumerate(sys.path):
        try:
            is_galaxy = "galaxy-dist/" in path
        except TypeError:
            # Not a text entry, so it cannot be a Galaxy path.
            continue
        if is_galaxy:
            sys.path[index] = ''


# Some modules required by rdflib are also shipped with Galaxy, which messes
# up the rdflib import; the Galaxy entries are therefore blanked first.
delete_galaxy()
9
10 # from io import StringIO
11 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
12 # import rdflib
13 from rdflib.store import Store
14 import sys
15 import hashlib
16
# In-memory rdflib store created at import time.
# NOTE(review): main() builds its own store, so this module-level instance
# is not referenced by the code visible here; kept for importers.
store = plugin.get('IOMemory', Store)()

# Plain assignments: a ``global`` statement has no effect at module scope.
# Base URI used for the graph identifier and for every coreURI[...] term.
URI = "http://csb.wur.nl/genome/"
# Prefixed name of the rdfs:seeAlso property.
seeAlso = "rdfs:seeAlso"
# Namespace helper: coreURI["x"] yields the URI reference URI + "x".
coreURI = Namespace(URI)
25
26
def createClass(uri):
    """Declare ``uri`` as an OWL class in the global ``genomeGraph``.

    Two triples are added: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``. The same ``uri`` is returned so the
    call can be used inline wherever the class reference is needed.
    """
    class_statements = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in class_statements:
        genomeGraph.add((uri, predicate, target))
    return uri
31
def _read_fasta(input_file):
    """Read a FASTA file into a ``{header line: sequence}`` dict.

    Header lines (those starting with ``>``) are used verbatim as keys,
    including the ``>`` and the line ending. The lines that follow, up to the
    next header, are stripped and concatenated into the sequence. A header
    that occurs more than once keeps only its last sequence. Sequence lines
    that appear before any header are stored under the empty key.
    """
    records = {}
    header = ""
    chunks = []
    # The context manager closes the file even if reading fails.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks:
                    records[header] = "".join(chunks)
                header = line
                chunks = []
                # Register the header right away so that records without
                # any sequence lines are still kept.
                records[header] = ""
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # Flush the last record. Nothing is stored when the file held neither a
    # header nor sequence data, so an empty file yields no phantom protein.
    if header or chunks:
        records[header] = "".join(chunks)
    return records


def fasta_parser(input_file):
    """Convert a protein FASTA file into triples on the global ``genomeGraph``.

    The genome identifier is taken from the ``-idtag`` command-line value, or
    from ``-id_alternative`` when ``-idtag`` is empty. Recognised metadata
    flags found in ``sys.argv`` are attached to the genome resource. Every
    FASTA record becomes a Cds feature with a synthetic begin/end position,
    linked to a Protein resource whose URI is built from the MD5 of its
    sequence.

    Args:
        input_file: path of the protein FASTA file to read.

    Raises:
        ValueError: if ``-idtag`` or ``-sourcedb`` is absent from
            ``sys.argv``, or ``-id_alternative`` is absent when it is needed.
        IndexError: if a flag is the last command-line argument and has no
            value after it.
    """
    argv = sys.argv

    genome = argv[argv.index('-idtag') + 1].replace(" ", "_")
    if genome == '':
        genome = argv[argv.index('-id_alternative') + 1].replace(" ", "_").replace(".", "_")

    genomeURI = coreURI[genome]

    # Command-line flag -> property name set on the genome resource.
    # Both "-idtag" and "-ids" are stored under "id_tag".
    metadata_flags = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-diagnosis': "diagnosis",
        '-country': "country",
        '-location': "location",
        '-date': "date",
        '-ids': "id_tag",
    }
    for index, element in enumerate(argv):
        property_name = metadata_flags.get(element)
        if property_name is not None:
            genomeGraph.add((genomeURI, coreURI[property_name], Literal(argv[index + 1])))

    fastadict = _read_fasta(input_file)

    # Create a class, to be the same as all the other genome conversions...
    # TODO: Proteins are part of cds, cds are part of dnaobject
    # If CDS is not there... how then?
    classURI = coreURI[genome + "/" + "protein_fasta"]
    proteinClass = createClass(coreURI["Protein"])
    genomeClass = createClass(coreURI["Genome"])
    typeClass = createClass(coreURI["DnaObject"])
    cdsClass = createClass(coreURI["Cds"])

    # Looked up once; the same literal is attached to the genome, to every
    # Cds feature and to every protein.
    sourcedb = Literal(argv[argv.index("-sourcedb") + 1])

    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
    genomeGraph.add((classURI, RDF.type, typeClass))

    # A theoretical begin, end is created to have a workable GBK generation:
    # records are laid end to end, each one starting where the previous ended.
    begin = 0
    for sequence in fastadict.values():
        # Bytes are needed only for hashing and for the length; the literal
        # stored in the graph is built from the text itself.
        encoded = sequence.encode('utf-8')
        end = begin + len(encoded)
        md5_protein = hashlib.md5(encoded).hexdigest()
        proteinURI = coreURI["protein/" + md5_protein]

        cdsURI = coreURI[genome + "/protein_fasta/" + str(begin) + "_" + str(end)]
        genomeGraph.add((classURI, coreURI["feature"], cdsURI))
        genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
        genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
        genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
        genomeGraph.add((cdsURI, RDF.type, cdsClass))

        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((proteinURI, RDF.type, proteinClass))
        genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
        begin = end
114
def save():
    """Serialize the global ``genomeGraph`` as Turtle to the ``-output`` path.

    The destination is the command-line value that follows ``-output``.

    Raises:
        ValueError: if ``-output`` is absent from ``sys.argv``.
    """
    data = genomeGraph.serialize(format='turtle')
    # The file is opened in binary mode, and depending on the rdflib version
    # serialize() returns text rather than bytes, so normalise to bytes first.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    output_path = sys.argv[sys.argv.index("-output") + 1]
    # The context manager guarantees the handle is flushed and closed.
    with open(output_path, "wb") as handle:
        handle.write(data)
118
def main():
    """Build the global graph, convert the ``-input`` FASTA file and save it.

    A fresh in-memory store backs ``genomeGraph``, which is published as a
    module global because the other functions in this file read it directly.
    """
    global genomeGraph
    memory_store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(memory_store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    # The FASTA path is the argument that follows "-input".
    input_position = sys.argv.index("-input") + 1
    fasta_parser(sys.argv[input_position])
    save()
127
# Entry point: run the conversion only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
130