diff gbk2rdf/gbktordf.py @ 3:db04e12b8779

Uploaded
author jjkoehorst
date Sat, 21 Feb 2015 07:28:39 -0500
parents
children ec73c34af97b
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk2rdf/gbktordf.py	Sat Feb 21 07:28:39 2015 -0500
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3.4
+# Author: Jasper Jan Koehorst
+# Date created: Feb 21 2015
+# Function: generation of a RDF file from Genbank/EMBL
+
+import warnings
+warnings.filterwarnings("ignore")
+
+def delete_galaxy():
+	import sys
+	for index, path in enumerate(sys.path):
+		if "galaxy-dist/" in path:
+			sys.path[index] = ''
+
+#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
+delete_galaxy()
+
+from Bio import SeqIO
+# Import RDFLib's default Graph implementation.
+import os, sys
+from Bio.Seq import Seq
+
+from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
+from rdflib.store import Store
+import hashlib
+store = plugin.get('IOMemory', Store)()
+
+global URI
+URI = "http://csb.wur.nl/genome/"
+global seeAlso
+seeAlso = "rdfs:seeAlso"
+global coreURI
+coreURI = Namespace(URI)
+
+global SubClassOfDict
+SubClassOfDict = {}
+global SubClassOfDictRna
+SubClassOfDictRna = {}
+
+def createClass(uri, root=True):
+	genomeGraph.add((uri,RDF.type,OWL.Class))
+	if root:
+		genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
+	return uri
+
+def tmp():
+	import time
+	global tmpFolder
+	tmpFolder = "/tmp/"+str(time.time())+"/"
+	os.mkdir(tmpFolder)
+
+def cleantmp():
+	os.system("ls "+tmpFolder)
+	os.system("rm -rf "+tmpFolder)
+
+def crawler():
+	#From input folder it looks for GBK file (gz files are in progress)
+	input_file = sys.argv[sys.argv.index("-input")+1]
+	gbk_parser(input_file)
+
+def gbk_parser():
+	prevObjStart = -1
+	prevObjStop = -1	
+	store = plugin.get('IOMemory', Store)()
+	global genomeGraph
+	genomeGraph = Graph(store,URIRef(URI))
+	genomeGraph.bind("ssb",coreURI)
+	input_file = sys.argv[sys.argv.index("-input")+1]
+
+	#CLASS definitions
+	genomeClass = createClass(coreURI["Genome"], root=True)
+	typeClass = createClass(coreURI["DnaObject"], root=True)
+	createClass(coreURI["Protein"], root=True)
+	pubmedClass = createClass(coreURI["Pubmed"], root=True)
+	miscClass = createClass(coreURI["MiscFeature"], root=False)
+	createClass(coreURI["Feature"], root=True)
+	SubClassOfDict["MiscFeature"] = 1
+	SubClassOfDictRna["Trna"] = 1
+	SubClassOfDictRna["Rrna"] = 1
+	SubClassOfDictRna["Tmrna"] = 1
+	SubClassOfDictRna["Ncrna"] = 1
+
+# 	codon = "11" #Default initialization if no CDS are present
+	##################
+	weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
+	scaf_value = 0
+	#Which files are already done
+	########
+	formatGBK = sys.argv[sys.argv.index("-format")+1]
+	for record in SeqIO.parse(input_file, formatGBK):
+		#Read first feature for genome name and information...
+		#Ignore the empty GBK file due to the lack of features?
+
+		for index, feature in enumerate(record.features):
+			if index == 0:
+				if "-identifier" in sys.argv:
+					genome = sys.argv[sys.argv.index("-identifier")+1]
+				else:
+					try:
+						genome = feature.qualifiers["organism"][0].replace(" ","_")
+					except:
+						#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
+						genome = "XNoneX"
+				for char in weird_chars:
+					genome = genome.replace(char,"_")
+
+				try:
+					gi = record.annotations["gi"]
+					typ = str(gi)
+				except:
+					scaf_value += 1
+					typ = "scaffold_"+str(scaf_value)
+				genomeURI = coreURI[genome]
+				gbkURI = coreURI[genome + "/" + typ]
+				#To contig connection to connect all data to it
+				genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
+
+				#General genome features also stored in the class...
+				if "genome" in feature.qualifiers:
+					genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
+				if "strain" in feature.qualifiers:
+					genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
+				if "taxonomy" in record.annotations:
+					for taxon in record.annotations["taxonomy"]:
+						genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
+					record.annotations["taxonomy"] = []
+				#Genome sequence#
+				sequence = str(record.seq)
+				#Verify if sequence was not empty and is now full of X or N
+				filtered_sequence = sequence.replace("X","").replace("N","")
+				if len(filtered_sequence) == 0:
+					sequence = ""
+				#Record parsing#
+				for annot in record.annotations:
+					if type(record.annotations[annot]) == list:
+						if annot == "references":
+							for references in record.annotations[annot]:
+								if references.pubmed_id != "":
+									pubmed = references.pubmed_id
+									genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
+									obj_dict = references.__dict__
+									for key in obj_dict:
+										genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
+									genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
+									
+						else:
+							for a in record.annotations[annot]:
+								int_add(gbkURI,coreURI[annot.lower()],str(a))
+					else:
+						int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
+
+
+				#####END of RECORD####
+				if len(sequence) > 0:
+					genomeGraph.add((gbkURI, coreURI["sequence"] ,  Literal(sequence)))
+				genomeGraph.add((genomeURI, RDF.type,genomeClass))
+				genomeGraph.add((gbkURI, RDF.type,typeClass))
+				for key in feature.qualifiers:
+					genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
+				#break
+			else: #The rest of the GBK file
+				feature_type = feature.type
+				end = str(feature.location.end).replace(">","").replace("<","")
+				start = str(feature.location.start).replace(">","").replace("<","")
+				
+				strand = str(feature.location.strand)
+
+				if strand == 'None':
+					strand = 0
+
+# 				if feature_type == "gene":
+# 					gene = feature
+					#Store gene in next feature....
+# 					gene_location_start = end = str(gene.location.end).replace(">","").replace("<","")
+# 					gene_location_stop = str(gene.location.start).replace(">","").replace("<","")
+# 					gene_qualifiers = gene.qualifiers	
+				else:
+					if feature.type == "misc_feature": #Store as part of previous cds or something...
+						if strand == "-1":
+							miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
+						else:
+							miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
+						
+						# genomeGraph.add((generalURI,coreURI["subFeature"],miscURI))
+
+						# TODO: Check if biopython has an overlap function...
+						if int(prevObjStart) <= int(start):
+							if int(end) <= int(prevObjStop):
+								pass
+# 								genomeGraph.add((typeURI,coreURI["feature"],miscURI))
+# 								genomeGraph.add((miscURI,RDF.type,miscClass))
+							else:
+								genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
+								genomeGraph.add((miscURI,RDF.type,miscClass))
+						else:
+							genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
+							genomeGraph.add((miscURI,RDF.type,miscClass))
+
+						store_general_information(miscURI,feature,record)
+					else:
+						prevObjStart = start
+						prevObjStop = end
+						
+						
+						if strand == "-1":
+							typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
+						else:
+							typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
+
+# 						cds_sequence = str(feature.extract(sequence))
+						#Contig specific connection
+						
+						genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
+						############################
+
+						store_general_information(typeURI,feature,record)
+
+						for subfeature in feature.sub_features:
+							strand = str(subfeature.location.strand)
+							subfeature_type = subfeature.type
+							end = str(subfeature.location.end).replace(">","").replace("<","")
+							start = str(subfeature.location.start).replace(">","").replace("<","")
+
+							if strand == "-1":
+								subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
+							else:
+								subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
+							genomeGraph.add((typeURI, coreURI["feature"] , subURI))
+							store_general_information(subURI,subfeature,record,feature)
+
+def store_general_information(generalURI,feature,record,superfeature=""):
+	proteinClass = createClass(coreURI["Protein"], root=True)
+	sequence = str(record.seq)
+	cds_sequence = str(feature.extract(sequence))
+	#Fixes the 0 count instead of 1-count in biopython vs humans
+	feature_type = feature.type
+	end = str(feature.location.end).replace(">","").replace("<","")
+	start = str(feature.location.start).replace(">","").replace("<","")
+	strand = str(feature.location.strand)	
+	if strand == "None":
+		strand = 0
+
+	genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+
+	if strand == "-1":
+		genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
+		genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
+	else:
+		genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
+		genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))	
+	genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
+	if feature.type != "misc_feature":
+		try:
+			genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
+		except: #When protein sequence is not given for whatever reason
+			print ("wrong?")
+
+	if feature.type == "misc_feature":
+		pass
+	else:
+		genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
+		if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
+			SubClassOfDict[feature_type.lower().title()] = 1
+	for key in feature.qualifiers:
+		values = feature.qualifiers[key]
+		if key == "translation":
+			pass
+		elif type(values) == list:
+			for v in values:
+				int_add(generalURI,coreURI[key.lower()],v)
+		else:
+			int_add(generalURI,coreURI[key.lower()],values)
+	if feature.type == "CDS":
+		try:
+			#Feature is normally submitted to this function
+			#IF a subfeature is submitted it is submitted as a feature
+			#And subfeature variable will contain the superfeature
+			if superfeature:
+				codon = superfeature.qualifiers["transl_table"][0]
+# 			else:
+# 				codon = subfeature.qualifiers["transl_table"][0]
+		except:
+			#Default codon table 11
+			codon = "11"
+		#Protein linkage
+		translation = ""
+		try:
+			translation = feature.qualifiers["translation"][0].strip("*")
+		except KeyError:
+			#When protein sequence is not given...
+			if len(feature.location.parts) > 1:
+				#Exon boundaries?
+				seq = ''
+				for loc in feature.location:
+					seq += record.seq[loc]
+				if int(feature.location.strand) == -1:
+					seq = Seq(seq).complement()
+				else:
+					seq = Seq(seq)
+				translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
+			elif int(feature.location.strand) == -1:
+				if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
+					if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
+						translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
+					else:
+						translation = ''
+			elif int(feature.location.strand) == +1:
+					if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
+						translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
+					else:
+						translation = ''
+			
+			if translation:
+				translation = list(translation)
+				translation[0] = "M"
+				translation = ''.join(translation).strip("*")
+				if "*" in translation:
+					pass		
+
+		translation = translation.encode('utf-8')
+		md5_protein = hashlib.md5(translation).hexdigest()
+		proteinURI = coreURI["protein/"+md5_protein]
+		genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
+		for key in feature.qualifiers:
+			for v in feature.qualifiers[key]:
+				if key == "translation":
+					genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
+					genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
+					genomeGraph.add((proteinURI,RDF.type,proteinClass))
+				else:
+					for v in feature.qualifiers[key]:
+						int_add(generalURI,coreURI[key.lower()],v)
+	
+def int_add(subject, predicate, obj):
+	try:
+		object_float = float(obj.replace('"',''))
+		object_int = int(obj.replace('"',''))
+		if object_int == object_float:
+			genomeGraph.add((subject,predicate,Literal(object_int)))
+		else:
+			genomeGraph.add((subject,predicate,Literal(object_float)))
+	except:
+		genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
+				
+def save():
+	data = genomeGraph.serialize(format='turtle')
+	open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
+
+def subClassOfBuilder():
+	for subclass in SubClassOfDict:
+		genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+		genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
+
+def subClassOfBuilderRna():
+	for subclass in SubClassOfDictRna:
+		genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+		genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
+		genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
+		genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
+		genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
+
+def main():
+	tmp()
+	gbk_parser()
+	subClassOfBuilder()
+	subClassOfBuilderRna()
+	save()
+	cleantmp()
+
+if __name__ == "__main__":
+	main()
\ No newline at end of file