Mercurial > repos > jjkoehorst > sapp
diff fasta2rdf/fastatordf.py @ 6:ec73c34af97b
FASTA2RDF
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:19:42 +0100 |
parents | |
children | 3f4f1cd22a6a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2rdf/fastatordf.py Sat Feb 21 15:19:42 2015 +0100 @@ -0,0 +1,145 @@ +#!/usr/bin/env python3.4 +# Author: Jasper Jan Koehorst +# Date created: Jan 22 2015 +# Function: generation of a RDF file from a genome fasta file + +def delete_galaxy(): + import sys + for index, path in enumerate(sys.path): + if "galaxy-dist/" in path: + sys.path[index] = '' + +#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. +delete_galaxy() + +# from io import StringIO +from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin +# import rdflib +from rdflib.store import Store +import sys + +store = plugin.get('IOMemory', Store)() + +global URI +URI = "http://csb.wur.nl/genome/" +global seeAlso +seeAlso = "rdfs:seeAlso" +global coreURI +coreURI = Namespace(URI) + +def createClass(uri): + genomeGraph.add((uri,RDF.type,OWL.Class)) + genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) + return uri + +def fasta_parser(input_file): + createClass(coreURI["Genome"]) #Genome class + createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) + + genomeDict = {} + + #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds +# regex = re.compile('\[type=(.*?)\]') + sequence = "" + genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") + if genomeID == 'None': + genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") + + genomeURI = coreURI[genomeID] + for index, element in enumerate(sys.argv): + if '-organism' == element: + genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) + if '-ncbi_taxid' == element: + genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) + if '-idtag' == element: + genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) + if '-ids' == element: + genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) + + genomeDict[genomeID] = {} + # typDict = {"plasmid":0,"scaffold":0,"chromosome":0} + + #Generating genome dictionary + data = open(input_file).readlines() + fastadict = {} + key = "" + for index, line in enumerate(data): + if ">" == line[0]: + key = line.strip(">").strip() + fastadict[key] = "" + else: + fastadict[key] += line.strip() + + # for line in fastadict: + # typ = regex.findall(line) + # value = 0 + #If something is found + # if len(typ) > 0: + # typ = typ[0] + #If something is not found + # elif typ == []: + # typ = "scaffold" + #If something is found but does not contain a value + # elif "_" in typ: + # value = typ.split("_")[-1] + # try: + # value = int(value) + # except: + # value = 1 + #Not a integer + + #If a value is not given it is automatically assigned as the first one + #If a value is given... + # if value > -1: + #If a second scaffold of a chromosome_1 is found + # if typ in genomeDict[genome]: + #Retrieve how many + # value = len(genomeDict[genome][typ]) + 1 + # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]} + # else: + # genomeDict[genome][typ] = {} + # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]} + + #Genome dictionary to TTL + genomeClass = createClass(coreURI["Genome"]) + typeClass = createClass(coreURI["DnaObject"]) + for index, genome in enumerate(fastadict): + # for typ in genomeDict[genome]: + # for scaf in genomeDict[genome][typ]: + # for con in genomeDict[genome][typ][scaf]: + #A note is required here... + #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. + #Previously it was + # Genome > Class > Scaffold > Contig + #Now it will be + # Genome > Class/Scaffold/Contig + #typeURI = coreURI[genome + "/" + typ] + #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf] + #Was contigURI + typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con] + # sequence = genomeDict[genome][typ][scaf][con] + sequence = fastadict[genome] + genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) + genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) + genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) + genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + genomeGraph.add((genomeURI, RDF.type,genomeClass)) + genomeGraph.add((typeURI, RDF.type,typeClass)) + +def save(): + data = genomeGraph.serialize(format='turtle') + open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) + +def main(): + store = plugin.get('IOMemory', Store)() + global genomeGraph + genomeGraph = Graph(store,URIRef(URI)) + genomeGraph.bind("ssb",coreURI) + input_file = sys.argv[sys.argv.index("-input")+1] + fasta_parser(input_file) + save() + +if __name__ == '__main__': + main() +