16
|
def delete_galaxy():
    """Blank out every ``sys.path`` entry that points into a Galaxy checkout.

    Each entry containing the substring ``"galaxy-dist/"`` is replaced, in
    place, by the empty string.  All other entries are left untouched, and
    the list keeps its identity, length and order.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import sys`` having been executed already.
    import sys

    sys.path[:] = [
        '' if "galaxy-dist/" in entry else entry
        for entry in sys.path
    ]
|
|
6
|
|
# Galaxy ships its own copies of some modules that rdflib depends on.  When
# those copies are picked up from sys.path they break the rdflib import, so
# the Galaxy entries have to be neutralised *before* rdflib is imported.
delete_galaxy()

import hashlib
import sys

from rdflib import (
    Graph,
    Literal,
    Namespace,
    OWL,
    RDF,
    RDFS,
    URIRef,
    plugin,
)
from rdflib.store import Store
|
|
16
|
|
# rdflib store instance obtained from the 'IOMemory' store plugin.
store = plugin.get('IOMemory', Store)()

# Base IRI under which every resource of this conversion is minted, and the
# rdflib namespace built on top of it.
URI = "http://csb.wur.nl/genome/"
coreURI = Namespace(URI)

# Prefixed name of the rdfs:seeAlso property, kept as a plain string.
seeAlso = "rdfs:seeAlso"
|
|
25
|
|
26
|
|
def createClass(uri):
    """Declare *uri* as an OWL class in the module-level ``genomeGraph``.

    Two triples are added to the graph:

    * ``uri rdf:type owl:Class``
    * ``uri rdfs:subClassOf owl:Thing``

    Returns:
        The same *uri* that was passed in, so the call can be used inline
        on the right-hand side of an assignment.
    """
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in declarations:
        genomeGraph.add((uri, predicate, target))
    return uri
|
|
31
|
|
# Command-line flag -> local name of the predicate (under ``coreURI``) that
# stores the flag's value on the genome resource.  Note that both ``-idtag``
# and ``-ids`` feed the same ``id_tag`` predicate.
_METADATA_PREDICATES = {
    '-organism': "organism",
    '-ncbi_taxid': "taxonomy",
    '-idtag': "id_tag",
    '-diagnosis': "diagnosis",
    '-country': "country",
    '-location': "location",
    '-date': "date",
    '-ids': "id_tag",
}


def _add_genome_metadata(genomeURI):
    """Attach the value following each known metadata flag in ``sys.argv`` to *genomeURI*."""
    for index, element in enumerate(sys.argv):
        predicate = _METADATA_PREDICATES.get(element)
        if predicate is not None:
            genomeGraph.add((genomeURI, coreURI[predicate], Literal(sys.argv[index + 1])))


def _read_fasta_records(input_file):
    """Read *input_file* and return a ``{header line: sequence}`` dict.

    The header key is the raw ``>`` line (including its line ending); the
    sequence is the concatenation of the stripped lines that follow it.
    """
    records = {}
    header = ""
    chunks = []
    # The context manager guarantees the handle is closed, also on errors.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks:
                    records[header] = "".join(chunks)
                header = line
                chunks = []
                # Register the header immediately so that a record without
                # any sequence lines is still kept, with an empty sequence.
                records[header] = ""
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # Flush the last record.  When no header was seen and no residues were
    # collected (empty or whitespace-only file) there is nothing to store;
    # storing it would create a bogus zero-length protein.
    if header or chunks:
        records[header] = "".join(chunks)
    return records


def fasta_parser(input_file):
    """Convert a protein FASTA file into triples in the module-level ``genomeGraph``.

    Command-line arguments read from ``sys.argv``:

    * ``-idtag`` (required): genome identifier; spaces become underscores.
      When the value is empty, ``-id_alternative`` is used instead (spaces
      and dots become underscores).
    * ``-sourcedb`` (required): recorded on the genome, on every CDS and on
      every protein.
    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-diagnosis``,
      ``-country``, ``-location``, ``-date``, ``-ids`` (optional): stored as
      literals on the genome resource.

    For every FASTA record a CDS resource and a protein resource are created.
    The protein IRI is derived from the MD5 digest of the UTF-8 encoded
    sequence.  CDS coordinates are synthetic: records are laid out back to
    back, each one starting where the previous one ended.

    Args:
        input_file: Path of the FASTA file to read.

    Raises:
        ValueError: If a required flag is missing from ``sys.argv``.
        IndexError: If a flag is the last element of ``sys.argv`` and so has
            no value following it.
    """
    genome = sys.argv[sys.argv.index('-idtag') + 1].replace(" ", "_")
    if genome == '':
        genome = sys.argv[sys.argv.index('-id_alternative') + 1].replace(" ", "_").replace(".", "_")

    genomeURI = coreURI[genome]
    _add_genome_metadata(genomeURI)

    fastadict = _read_fasta_records(input_file)

    #Create a class, to be the same as all the other genome conversions...
    #TODO: Proteins are part of cds, cds are part of dnaobject
    #If CDS is not there... how then?
    classURI = coreURI[genome + "/" + "protein_fasta"]
    proteinClass = createClass(coreURI["Protein"])
    genomeClass = createClass(coreURI["Genome"])
    typeClass = createClass(coreURI["DnaObject"])
    cdsClass = createClass(coreURI["Cds"])

    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    # Looked up once; the same literal is attached to genome, CDS and protein.
    sourcedb = Literal(sys.argv[sys.argv.index("-sourcedb") + 1])
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
    genomeGraph.add((classURI, RDF.type, typeClass))

    #A theoretical begin, end is created to have a workable GBK generation
    begin = 0
    for sequence in fastadict.values():
        sequence = sequence.encode('utf-8')
        end = begin + len(sequence)
        md5_protein = hashlib.md5(sequence).hexdigest()
        proteinURI = coreURI["protein/" + md5_protein]

        cdsURI = coreURI[genome + "/protein_fasta/" + str(begin) + "_" + str(end)]
        genomeGraph.add((classURI, coreURI["feature"], cdsURI))
        genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
        genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
        genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
        genomeGraph.add((cdsURI, RDF.type, cdsClass))

        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((proteinURI, RDF.type, proteinClass))
        genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
        # The next record starts where this one ended.
        begin = end
|
|
114
|
|
def save():
    """Write the module-level ``genomeGraph`` as Turtle to the ``-output`` path.

    The destination path is the ``sys.argv`` element following ``-output``.

    Raises:
        ValueError: If ``-output`` is not present in ``sys.argv``.
        IndexError: If ``-output`` is the last element of ``sys.argv``.
    """
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib version, serialize() hands back either bytes or
    # text.  The file is opened in binary mode, so normalise to bytes first.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    output_path = sys.argv[sys.argv.index("-output") + 1]
    # The context manager closes (and therefore flushes) the file
    # deterministically instead of relying on garbage collection.
    with open(output_path, "wb") as handle:
        handle.write(data)
|
|
118
|
|
def main():
    """Script entry point: build the graph, convert the FASTA input, write the result.

    A fresh store from the 'IOMemory' plugin backs a new ``Graph`` identified
    by ``URI``; the graph is published as the module-level ``genomeGraph``
    and ``coreURI`` is bound to the ``ssb`` prefix.  The file named after
    ``-input`` in ``sys.argv`` is then parsed and the graph saved.
    """
    global genomeGraph
    genomeGraph = Graph(plugin.get('IOMemory', Store)(), URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    fasta_parser(sys.argv[sys.argv.index("-input") + 1])
    save()


if __name__ == '__main__':
    main()
|
|
130
|