changeset 10:3378d12591ea

FASTA 2 RDF code cleanup
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 15:40:23 +0100
parents e51957423315 (current diff) 3f4f1cd22a6a (diff)
children a712b378e090
files
diffstat 1 files changed, 20 insertions(+), 68 deletions(-) [+]
line wrap: on
line diff
--- a/fasta2rdf/fastatordf.py	Sat Feb 21 09:23:47 2015 -0500
+++ b/fasta2rdf/fastatordf.py	Sat Feb 21 15:40:23 2015 +0100
@@ -3,14 +3,6 @@
 # Date created: Jan 22 2015
 # Function: generation of a RDF file from a genome fasta file
 
-def delete_galaxy():
-	import sys
-	for index, path in enumerate(sys.path):
-		if "galaxy-dist/" in path:
-			sys.path[index] = ''
-
-#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
-delete_galaxy()
 
 # from io import StringIO
 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
@@ -26,6 +18,15 @@
 seeAlso = "rdfs:seeAlso"
 global coreURI
 coreURI = Namespace(URI)
+global genomeGraph
+store = plugin.get('IOMemory', Store)()
+genomeGraph = Graph(store,URIRef(URI))
+genomeGraph.bind("ssb",coreURI)
+
+def delete_galaxy():	
+	for index, path in enumerate(sys.path):
+		if "galaxy-dist/" in path:
+			sys.path[index] = ''
 
 def createClass(uri):
 	genomeGraph.add((uri,RDF.type,OWL.Class))
@@ -38,8 +39,6 @@
 
 	genomeDict = {}
 	
-	#requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds
-# 	regex = re.compile('\[type=(.*?)\]')
 	sequence = ""
 	genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
 	if genomeID == 'None':
@@ -57,7 +56,6 @@
 			genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
 
 	genomeDict[genomeID] = {}
-	# typDict = {"plasmid":0,"scaffold":0,"chromosome":0}
 	
 	#Generating genome dictionary
 	data = open(input_file).readlines()
@@ -70,76 +68,30 @@
 		else:
 			fastadict[key] += line.strip()
 
-	# for line in fastadict:
-		# typ = regex.findall(line)
-		# value = 0
-		#If something is found
-		# if len(typ) > 0:
-			# typ = typ[0]
-		#If something is not found
-		# elif typ == []:
-		# typ = "scaffold"
-		#If something is found but does not contain a value
-		# elif "_" in typ:
-			# value = typ.split("_")[-1]
-			# try:
-				# value = int(value)
-			# except:
-				# value = 1
-				#Not a integer
-		
-		#If a value is not given it is automatically assigned as the first one
-		#If a value is given...
-		# if value > -1:
-			#If a second scaffold of a chromosome_1 is found
-		# if typ in genomeDict[genome]:
-			#Retrieve how many					
-			# value = len(genomeDict[genome][typ]) + 1
-			# genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]}
-		# else:
-			# genomeDict[genome][typ] = {}
-			# genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]}
-
-	#Genome dictionary to TTL
 	genomeClass = createClass(coreURI["Genome"])
 	typeClass = createClass(coreURI["DnaObject"])
 	for index, genome in enumerate(fastadict):
-		# for typ in genomeDict[genome]:
-			# for scaf in genomeDict[genome][typ]:
-				# for con in genomeDict[genome][typ][scaf]:
-					#A note is required here...
-					#Due to RDF performances we are reducing the amount of triples needed from a genome to a contig.
-					#Previously it was
-					# Genome > Class > Scaffold > Contig
-					#Now it will be
-					# Genome > Class/Scaffold/Contig
-					#typeURI = coreURI[genome + "/" + typ]
-					#scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf]
-					#Was contigURI
-					typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con]
-					# sequence = genomeDict[genome][typ][scaf][con]
-					sequence = fastadict[genome]
-					genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
-					genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
-					genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
-					genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
-					genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
-					genomeGraph.add((genomeURI, RDF.type,genomeClass))
-					genomeGraph.add((typeURI, RDF.type,typeClass))
+		typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
+		sequence = fastadict[genome]
+		genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
+		genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+		genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
+		genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
+		genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+		genomeGraph.add((genomeURI, RDF.type,genomeClass))
+		genomeGraph.add((typeURI, RDF.type,typeClass))
 
 def save():
 	data = genomeGraph.serialize(format='turtle')
 	open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
 
 def main():
-	store = plugin.get('IOMemory', Store)()
-	global genomeGraph
-	genomeGraph = Graph(store,URIRef(URI))
-	genomeGraph.bind("ssb",coreURI)
 	input_file = sys.argv[sys.argv.index("-input")+1]
 	fasta_parser(input_file)
 	save()
 
 if __name__ == '__main__':
+	#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
+	delete_galaxy()
 	main()