# HG changeset patch # User jjkoehorst # Date 1424529623 -3600 # Node ID 3378d12591ea5ba42d2b9a8c8dbf5a9a1a64af66 # Parent e519574233154f1ef7c3740e570ce631b0daa925# Parent 3f4f1cd22a6ac8cf8430ef616b8ebc3d31e27a45 FASTA 2 RDF code cleanup diff -r e51957423315 -r 3378d12591ea fasta2rdf/fastatordf.py --- a/fasta2rdf/fastatordf.py Sat Feb 21 09:23:47 2015 -0500 +++ b/fasta2rdf/fastatordf.py Sat Feb 21 15:40:23 2015 +0100 @@ -3,14 +3,6 @@ # Date created: Jan 22 2015 # Function: generation of a RDF file from a genome fasta file -def delete_galaxy(): - import sys - for index, path in enumerate(sys.path): - if "galaxy-dist/" in path: - sys.path[index] = '' - -#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. -delete_galaxy() # from io import StringIO from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin @@ -26,6 +18,15 @@ seeAlso = "rdfs:seeAlso" global coreURI coreURI = Namespace(URI) +global genomeGraph +store = plugin.get('IOMemory', Store)() +genomeGraph = Graph(store,URIRef(URI)) +genomeGraph.bind("ssb",coreURI) + +def delete_galaxy(): + for index, path in enumerate(sys.path): + if "galaxy-dist/" in path: + sys.path[index] = '' def createClass(uri): genomeGraph.add((uri,RDF.type,OWL.Class)) @@ -38,8 +39,6 @@ genomeDict = {} - #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds -# regex = re.compile('\[type=(.*?)\]') sequence = "" genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") if genomeID == 'None': @@ -57,7 +56,6 @@ genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) genomeDict[genomeID] = {} - # typDict = {"plasmid":0,"scaffold":0,"chromosome":0} #Generating genome dictionary data = open(input_file).readlines() @@ -70,76 +68,30 @@ else: fastadict[key] += line.strip() - # for line in fastadict: - # typ = regex.findall(line) - # value = 0 - #If something is found - # if len(typ) > 0: - # typ = typ[0] - #If something is not found - # elif typ == []: - # typ = "scaffold" - #If something is found but does not contain a value - # elif "_" in typ: - # value = typ.split("_")[-1] - # try: - # value = int(value) - # except: - # value = 1 - #Not a integer - - #If a value is not given it is automatically assigned as the first one - #If a value is given... - # if value > -1: - #If a second scaffold of a chromosome_1 is found - # if typ in genomeDict[genome]: - #Retrieve how many - # value = len(genomeDict[genome][typ]) + 1 - # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]} - # else: - # genomeDict[genome][typ] = {} - # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]} - - #Genome dictionary to TTL genomeClass = createClass(coreURI["Genome"]) typeClass = createClass(coreURI["DnaObject"]) for index, genome in enumerate(fastadict): - # for typ in genomeDict[genome]: - # for scaf in genomeDict[genome][typ]: - # for con in genomeDict[genome][typ][scaf]: - #A note is required here... - #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. - #Previously it was - # Genome > Class > Scaffold > Contig - #Now it will be - # Genome > Class/Scaffold/Contig - #typeURI = coreURI[genome + "/" + typ] - #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf] - #Was contigURI - typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con] - # sequence = genomeDict[genome][typ][scaf][con] - sequence = fastadict[genome] - genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) - genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) - genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) - genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) - genomeGraph.add((genomeURI, RDF.type,genomeClass)) - genomeGraph.add((typeURI, RDF.type,typeClass)) + typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] + sequence = fastadict[genome] + genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) + genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) + genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) + genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((genomeURI, RDF.type,genomeClass)) + genomeGraph.add((typeURI, RDF.type,typeClass)) def save(): data = genomeGraph.serialize(format='turtle') open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) def main(): - store = plugin.get('IOMemory', Store)() - global genomeGraph - genomeGraph = Graph(store,URIRef(URI)) - genomeGraph.bind("ssb",coreURI) input_file = sys.argv[sys.argv.index("-input")+1] fasta_parser(input_file) save() if __name__ == '__main__': + #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. + delete_galaxy() main()