diff conversion/protein2rdf/protein_to_ttl.py @ 16:74b8ba5e2d5b

aragorn addition
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 17:17:06 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/protein2rdf/protein_to_ttl.py	Sat Feb 21 17:17:06 2015 +0100
@@ -0,0 +1,130 @@
+def delete_galaxy():
+	"""Remove every galaxy-dist directory from sys.path so imports cannot resolve into Galaxy."""
+	import sys  # imported locally: this function is called before the module-level imports
+	# Drop the entries rather than blanking them: '' in sys.path means the current directory.
+	sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]
+
+# Some modules required by rdflib also ship with Galaxy, which breaks the rdflib import, so drop the Galaxy paths before importing it.
+delete_galaxy()
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+import hashlib
+
+store = plugin.get('IOMemory', Store)()  # in-memory store built at import time; main() creates its own for the graph
+
+# Base IRI under which every genome, feature and protein resource is minted.
+URI = "http://csb.wur.nl/genome/"
+# Prefixed name of rdfs:seeAlso; not read anywhere else in this module.
+seeAlso = "rdfs:seeAlso"
+# Namespace over URI, so coreURI["x"] is the resource named URI + "x".
+coreURI = Namespace(URI)
+
+
+def createClass(uri):  # declare uri in genomeGraph as an owl:Class below owl:Thing; returns uri
+	for predicate, target in ((RDF.type, OWL.Class), (RDFS.subClassOf, OWL.Thing)):
+		genomeGraph.add((uri, predicate, target))
+	return uri
+
+def _read_fasta(input_file):
+	"""Return the (header, sequence) records of a FASTA file in file order; text before the first ">" is skipped."""
+	records = []
+	header = None
+	chunks = []
+	with open(input_file) as handle:
+		for line in handle:
+			if line.startswith(">"):
+				if header is not None:
+					records.append((header, "".join(chunks)))
+				header = line
+				chunks = []
+			elif header is not None:
+				chunks.append(line.strip())
+	if header is not None:
+		records.append((header, "".join(chunks)))
+	return records
+
+def fasta_parser(input_file):
+	"""Add the genome described on the command line, and its proteins, to genomeGraph.
+
+	input_file -- path of the protein FASTA file to convert.
+
+	Reads sys.argv: -idtag names the genome (-id_alternative is used when its value
+	is empty), -sourcedb is attached to every resource, and the optional flags in
+	metadata become literal properties of the genome.
+
+	Raises ValueError when -idtag or -sourcedb is absent and IndexError when a
+	flag is the last argument and so has no value.
+	"""
+	argv = sys.argv
+	genome = argv[argv.index('-idtag')+1].replace(" ","_")
+	if genome == '':
+		genome = argv[argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
+	sourcedb = Literal(argv[argv.index("-sourcedb")+1])
+	genomeURI = coreURI[genome]
+
+	# Command-line flag -> property name; -idtag and -ids are both stored as id_tag.
+	metadata = {
+		'-organism': "organism", '-ncbi_taxid': "taxonomy", '-idtag': "id_tag",
+		'-diagnosis': "diagnosis", '-country': "country", '-location': "location",
+		'-date': "date", '-ids': "id_tag",
+	}
+	for index, element in enumerate(argv):
+		if element in metadata:
+			genomeGraph.add((genomeURI, coreURI[metadata[element]], Literal(argv[index+1])))
+
+	#Create a class, to be the same as all the other genome conversions...
+	#TODO: Proteins are part of cds, cds are part of dnaobject
+	#If CDS is not there... how then?
+	classURI = coreURI[genome + "/" + "protein_fasta"]
+	proteinClass = createClass(coreURI["Protein"])
+	genomeClass = createClass(coreURI["Genome"])
+	typeClass = createClass(coreURI["DnaObject"])
+	cdsClass = createClass(coreURI["Cds"])
+	genomeGraph.add((genomeURI, RDF.type, genomeClass))
+	genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
+	genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
+	genomeGraph.add((classURI, RDF.type, typeClass))
+
+	# The FASTA file carries no coordinates: each protein gets a theoretical begin/end,
+	# laid end to end in file order, to have a workable GBK generation.
+	begin = 0
+	for _header, sequence in _read_fasta(input_file):
+		end = begin + len(sequence)
+		# md5 operates on bytes; the graph literal keeps the text form of the sequence.
+		md5_protein = hashlib.md5(sequence.encode('utf-8')).hexdigest()
+		proteinURI = coreURI["protein/"+md5_protein]
+		cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)]
+
+		genomeGraph.add((classURI, coreURI["feature"], cdsURI))
+		genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
+		genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
+		genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
+		genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
+		genomeGraph.add((cdsURI, RDF.type, cdsClass))
+
+		genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
+		genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
+		genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
+		genomeGraph.add((proteinURI, RDF.type, proteinClass))
+		begin = end
+
+def save():
+	"""Write genomeGraph as Turtle to the path that follows -output in sys.argv (ValueError if the flag is absent)."""
+	genomeGraph.serialize(destination=sys.argv[sys.argv.index("-output")+1], format='turtle')  # rdflib opens and closes the file, whatever type serialize() would return
+
+def main():
+	"""Create the global genomeGraph, fill it from the -input FASTA file and write it out."""
+	global genomeGraph
+	memory_store = plugin.get('IOMemory', Store)()
+	genomeGraph = Graph(memory_store, URIRef(URI))
+	genomeGraph.bind("ssb", coreURI)
+	fasta_parser(sys.argv[sys.argv.index("-input") + 1])
+	save()
+
+if __name__ == '__main__':  # run the conversion only when this file is executed as a script
+	main()
+