Mercurial > repos > jjkoehorst > sapp
comparison fasta2rdf/fastatordf.py @ 9:3f4f1cd22a6a
FASTA 2 RDF code cleanup
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:38:26 +0100 |
parents | ec73c34af97b |
children |
comparison legend: equal | deleted | inserted | replaced
7:c79025539d9b | 9:3f4f1cd22a6a |
---|---|
1 #!/usr/bin/env python3.4 | 1 #!/usr/bin/env python3.4 |
2 # Author: Jasper Jan Koehorst | 2 # Author: Jasper Jan Koehorst |
3 # Date created: Jan 22 2015 | 3 # Date created: Jan 22 2015 |
4 # Function: generation of a RDF file from a genome fasta file | 4 # Function: generation of a RDF file from a genome fasta file |
5 | 5 |
6 def delete_galaxy(): | |
7 import sys | |
8 for index, path in enumerate(sys.path): | |
9 if "galaxy-dist/" in path: | |
10 sys.path[index] = '' | |
11 | |
12 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. | |
13 delete_galaxy() | |
14 | 6 |
15 # from io import StringIO | 7 # from io import StringIO |
16 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin | 8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin |
17 # import rdflib | 9 # import rdflib |
18 from rdflib.store import Store | 10 from rdflib.store import Store |
24 URI = "http://csb.wur.nl/genome/" | 16 URI = "http://csb.wur.nl/genome/" |
25 global seeAlso | 17 global seeAlso |
26 seeAlso = "rdfs:seeAlso" | 18 seeAlso = "rdfs:seeAlso" |
27 global coreURI | 19 global coreURI |
28 coreURI = Namespace(URI) | 20 coreURI = Namespace(URI) |
21 global genomeGraph | |
22 store = plugin.get('IOMemory', Store)() | |
23 genomeGraph = Graph(store,URIRef(URI)) | |
24 genomeGraph.bind("ssb",coreURI) | |
25 | |
26 def delete_galaxy(): | |
27 for index, path in enumerate(sys.path): | |
28 if "galaxy-dist/" in path: | |
29 sys.path[index] = '' | |
29 | 30 |
30 def createClass(uri): | 31 def createClass(uri): |
31 genomeGraph.add((uri,RDF.type,OWL.Class)) | 32 genomeGraph.add((uri,RDF.type,OWL.Class)) |
32 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) | 33 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) |
33 return uri | 34 return uri |
36 createClass(coreURI["Genome"]) #Genome class | 37 createClass(coreURI["Genome"]) #Genome class |
37 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) | 38 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) |
38 | 39 |
39 genomeDict = {} | 40 genomeDict = {} |
40 | 41 |
41 #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds | |
42 # regex = re.compile('\[type=(.*?)\]') | |
43 sequence = "" | 42 sequence = "" |
44 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") | 43 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") |
45 if genomeID == 'None': | 44 if genomeID == 'None': |
46 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") | 45 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") |
47 | 46 |
55 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) | 54 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) |
56 if '-ids' == element: | 55 if '-ids' == element: |
57 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) | 56 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) |
58 | 57 |
59 genomeDict[genomeID] = {} | 58 genomeDict[genomeID] = {} |
60 # typDict = {"plasmid":0,"scaffold":0,"chromosome":0} | |
61 | 59 |
62 #Generating genome dictionary | 60 #Generating genome dictionary |
63 data = open(input_file).readlines() | 61 data = open(input_file).readlines() |
64 fastadict = {} | 62 fastadict = {} |
65 key = "" | 63 key = "" |
68 key = line.strip(">").strip() | 66 key = line.strip(">").strip() |
69 fastadict[key] = "" | 67 fastadict[key] = "" |
70 else: | 68 else: |
71 fastadict[key] += line.strip() | 69 fastadict[key] += line.strip() |
72 | 70 |
73 # for line in fastadict: | |
74 # typ = regex.findall(line) | |
75 # value = 0 | |
76 #If something is found | |
77 # if len(typ) > 0: | |
78 # typ = typ[0] | |
79 #If something is not found | |
80 # elif typ == []: | |
81 # typ = "scaffold" | |
82 #If something is found but does not contain a value | |
83 # elif "_" in typ: | |
84 # value = typ.split("_")[-1] | |
85 # try: | |
86 # value = int(value) | |
87 # except: | |
88 # value = 1 | |
89 #Not a integer | |
90 | |
91 #If a value is not given it is automatically assigned as the first one | |
92 #If a value is given... | |
93 # if value > -1: | |
94 #If a second scaffold of a chromosome_1 is found | |
95 # if typ in genomeDict[genome]: | |
96 #Retrieve how many | |
97 # value = len(genomeDict[genome][typ]) + 1 | |
98 # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]} | |
99 # else: | |
100 # genomeDict[genome][typ] = {} | |
101 # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]} | |
102 | |
103 #Genome dictionary to TTL | |
104 genomeClass = createClass(coreURI["Genome"]) | 71 genomeClass = createClass(coreURI["Genome"]) |
105 typeClass = createClass(coreURI["DnaObject"]) | 72 typeClass = createClass(coreURI["DnaObject"]) |
106 for index, genome in enumerate(fastadict): | 73 for index, genome in enumerate(fastadict): |
107 # for typ in genomeDict[genome]: | 74 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] |
108 # for scaf in genomeDict[genome][typ]: | 75 sequence = fastadict[genome] |
109 # for con in genomeDict[genome][typ][scaf]: | 76 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) |
110 #A note is required here... | 77 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) |
111 #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. | 78 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) |
112 #Previously it was | 79 genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) |
113 # Genome > Class > Scaffold > Contig | 80 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) |
114 #Now it will be | 81 genomeGraph.add((genomeURI, RDF.type,genomeClass)) |
115 # Genome > Class/Scaffold/Contig | 82 genomeGraph.add((typeURI, RDF.type,typeClass)) |
116 #typeURI = coreURI[genome + "/" + typ] | |
117 #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf] | |
118 #Was contigURI | |
119 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con] | |
120 # sequence = genomeDict[genome][typ][scaf][con] | |
121 sequence = fastadict[genome] | |
122 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) | |
123 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
124 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) | |
125 genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) | |
126 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
127 genomeGraph.add((genomeURI, RDF.type,genomeClass)) | |
128 genomeGraph.add((typeURI, RDF.type,typeClass)) | |
129 | 83 |
130 def save(): | 84 def save(): |
131 data = genomeGraph.serialize(format='turtle') | 85 data = genomeGraph.serialize(format='turtle') |
132 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) | 86 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) |
133 | 87 |
134 def main(): | 88 def main(): |
135 store = plugin.get('IOMemory', Store)() | |
136 global genomeGraph | |
137 genomeGraph = Graph(store,URIRef(URI)) | |
138 genomeGraph.bind("ssb",coreURI) | |
139 input_file = sys.argv[sys.argv.index("-input")+1] | 89 input_file = sys.argv[sys.argv.index("-input")+1] |
140 fasta_parser(input_file) | 90 fasta_parser(input_file) |
141 save() | 91 save() |
142 | 92 |
143 if __name__ == '__main__': | 93 if __name__ == '__main__': |
94 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. | |
95 delete_galaxy() | |
144 main() | 96 main() |
145 | 97 |