Mercurial > repos > jjkoehorst > sapp
view conversion/fasta2rdf/fastatordf.py @ 24:9610ddbca991
.
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 19:25:28 +0100 |
parents | 74b8ba5e2d5b |
children |
line wrap: on
line source
#!/usr/bin/env python3.4
# Author: Jasper Jan Koehorst
# Date created: Jan 22 2015
# Function: generation of a RDF file from a genome fasta file
#
# Command-line options read by this script (each is followed by its value):
#   -input           path of the FASTA file to convert
#   -output          path the Turtle serialization is written to
#   -idtag           genome identifier (spaces become underscores)
#   -id_alternative  identifier used instead when -idtag is the text 'None'
#   -sourcedb        source database label attached to genome and sequences
#   -organism, -ncbi_taxid, -ids   optional genome metadata
#
from collections import OrderedDict
from io import StringIO
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, OWL, plugin
# import rdflib
from rdflib.store import Store
import sys

# Module-level state shared by every function below.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)

# Optional command-line flags and the predicate (local name under coreURI)
# each one is stored under on the genome resource.
_METADATA_FLAGS = {
    '-organism': "organism",
    '-ncbi_taxid': "taxonomy",
    '-idtag': "id_tag",
    '-ids': "id_tag",
}


def _argument_after(flag):
    """Return the command-line token that follows the first `flag`.

    Raises ValueError when `flag` is not on the command line (from
    list.index) and IndexError when `flag` is the final token.
    """
    position = sys.argv.index(flag)
    if position + 1 >= len(sys.argv):
        raise IndexError("option %s requires a value" % flag)
    return sys.argv[position + 1]


def delete_galaxy():
    """Blank out every sys.path entry that contains 'galaxy-dist/'.

    The entries are replaced by '' in place rather than removed, so the
    length and order of sys.path are unchanged.
    """
    sys.path[:] = ['' if "galaxy-dist/" in entry else entry
                   for entry in sys.path]


def createClass(uri):
    """Declare `uri` as an owl:Class that is a subclass of owl:Thing.

    Returns `uri` unchanged so the call can be used inline.
    """
    genomeGraph.add((uri, RDF.type, OWL.Class))
    genomeGraph.add((uri, RDFS.subClassOf, OWL.Thing))
    return uri


def _read_fasta(input_file):
    """Parse `input_file` into an ordered mapping of header -> sequence.

    The header is the text after the leading '>' with surrounding
    whitespace removed; sequence lines are stripped and concatenated.
    When a header occurs more than once, the later record replaces the
    earlier one. Blank lines are ignored.

    Raises ValueError when sequence data appears before any header.
    """
    # OrderedDict keeps records in file order; plain dicts only guarantee
    # insertion order from Python 3.7 on, and this script targets 3.4.
    chunks_by_header = OrderedDict()
    current_chunks = None
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                current_chunks = []
                chunks_by_header[line[1:].strip()] = current_chunks
                continue
            residues = line.strip()
            if not residues:
                continue
            if current_chunks is None:
                raise ValueError(
                    "%s: sequence data found before the first '>' header"
                    % input_file)
            current_chunks.append(residues)
    # Join once per record instead of growing a string line by line.
    return OrderedDict((header, "".join(chunks))
                       for header, chunks in chunks_by_header.items())


def fasta_parser(input_file):
    """Add the genome described by `input_file` and sys.argv to genomeGraph.

    The genome resource is named after -idtag (or -id_alternative when
    -idtag is the text 'None'). Each FASTA record becomes a resource
    '<genome id>/dnaobject_<n>', numbered in file order, carrying its
    sequence, header and source database.

    Raises ValueError when a required option is missing or the FASTA file
    has sequence data before its first header, and IndexError when an
    option is given without a value.
    """
    createClass(coreURI["Type"])  # Type class (Chr,Pls,Scaffold)
    genome_class = createClass(coreURI["Genome"])  # Genome class
    dna_class = createClass(coreURI["DnaObject"])

    genomeID = _argument_after('-idtag').replace(" ", "_")
    if genomeID == 'None':
        genomeID = (_argument_after('-id_alternative')
                    .replace(" ", "_").replace(".", "_"))
    genomeURI = coreURI[genomeID]

    # Every occurrence of a metadata flag contributes one triple.
    for position, token in enumerate(sys.argv):
        predicate = _METADATA_FLAGS.get(token)
        if predicate is None:
            continue
        if position + 1 >= len(sys.argv):
            raise IndexError("option %s requires a value" % token)
        genomeGraph.add(
            (genomeURI, coreURI[predicate], Literal(sys.argv[position + 1])))

    records = _read_fasta(input_file)
    if not records:
        # No sequences: nothing genome- or record-specific is added, and
        # -sourcedb is not required.
        return

    # Genome-level triples are identical for every record, so add them once.
    source_db = Literal(_argument_after("-sourcedb"))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], source_db))
    genomeGraph.add((genomeURI, RDF.type, genome_class))

    for index, (header, sequence) in enumerate(records.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], source_db))
        genomeGraph.add((typeURI, RDF.type, dna_class))


def save():
    """Serialize genomeGraph as Turtle into the file named by -output."""
    serialized = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib release, serialize() hands back bytes or str;
    # the file is opened in binary mode, so text is encoded first.
    if isinstance(serialized, str):
        serialized = serialized.encode("utf-8")
    with open(_argument_after("-output"), "wb") as handle:
        handle.write(serialized)


def main():
    """Convert the FASTA file given by -input and write the result."""
    input_file = _argument_after("-input")
    fasta_parser(input_file)
    save()


if __name__ == '__main__':
    # Some modules that are required by rdflib are also in galaxy; this
    # messes up the RDF import function.
    delete_galaxy()
    main()