Mercurial > repos > jjkoehorst > sapp
diff conversion/protein2rdf/protein_to_ttl.py @ 16:74b8ba5e2d5b
aragorn addition
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:17:06 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/conversion/protein2rdf/protein_to_ttl.py Sat Feb 21 17:17:06 2015 +0100 @@ -0,0 +1,130 @@ +def delete_galaxy(): + import sys + for index, path in enumerate(sys.path): + if "galaxy-dist/" in path: + sys.path[index] = '' + +#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. +delete_galaxy() + +# from io import StringIO +from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin +# import rdflib +from rdflib.store import Store +import sys +import hashlib + +store = plugin.get('IOMemory', Store)() + +global URI +URI = "http://csb.wur.nl/genome/" +global seeAlso +seeAlso = "rdfs:seeAlso" +global coreURI +coreURI = Namespace(URI) + + +def createClass(uri): + genomeGraph.add((uri,RDF.type,OWL.Class)) + genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) + return uri + +def fasta_parser(input_file): + createClass(coreURI["Protein"]) + + genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") + if genome == '': + genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") + + genomeURI = coreURI[genome] + for index, element in enumerate(sys.argv): + if '-organism' == element: + genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) + if '-ncbi_taxid' == element: + genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) + if '-idtag' == element: + genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) + if '-diagnosis' == element: + genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1]))) + if '-country' == element: + genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1]))) + if '-location' == element: + genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1]))) + if '-date' == element: + genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1]))) + if '-ids' == element: + genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) + + + + data = (open(input_file).readlines()) + fastadict = {} + sequence = "" + key = "" + for index, line in enumerate(data): + if ">" == line[0]: + if sequence: + fastadict[key] = sequence + key = line + sequence = "" + fastadict[key] = "" + else: + sequence += line.strip() + fastadict[key] = sequence + + #Create a class, to be the same as all the other genome conversions... + #TODO: Proteins are part of cds, cds are part of dnaobject + #If CDS is not there... how then? + classURI = coreURI[genome + "/" + "protein_fasta"] + proteinClass = createClass(coreURI["Protein"]) + genomeClass = createClass(coreURI["Genome"]) + typeClass = createClass(coreURI["DnaObject"]) + cdsClass = createClass(coreURI["Cds"]) + #A theoretical begin, end is created to have a workable GBK generation + begin = 0 + end = 0 + genomeGraph.add((genomeURI, RDF.type, genomeClass)) + genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI)) + genomeGraph.add((classURI, RDF.type, typeClass)) + + for protein in fastadict: + sequence = fastadict[protein] + sequence = sequence.encode('utf-8') + end = begin + len(sequence) + md5_protein = hashlib.md5(sequence).hexdigest() + proteinURI = coreURI["protein/"+md5_protein] + + cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)] + genomeGraph.add((classURI, coreURI["feature"] , cdsURI)) + genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin))) + genomeGraph.add((cdsURI, coreURI["end"] , Literal(end))) + genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI)) + genomeGraph.add((cdsURI, RDF.type, cdsClass)) + + + + genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) + genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence))) + genomeGraph.add((proteinURI,RDF.type,proteinClass)) + genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((proteinURI, RDF.type, proteinClass)) + begin = end + +def save(): + data = genomeGraph.serialize(format='turtle') + open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) + +def main(): + store = plugin.get('IOMemory', Store)() + global genomeGraph + genomeGraph = Graph(store,URIRef(URI)) + genomeGraph.bind("ssb",coreURI) + input_file = sys.argv[sys.argv.index("-input")+1] + fasta_parser(input_file) + save() + +if __name__ == '__main__': + main() +