Mercurial > repos > jjkoehorst > sapp
annotate fasta2rdf/fastatordf.py @ 9:3f4f1cd22a6a
FASTA 2 RDF code cleanup
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:38:26 +0100 |
parents | ec73c34af97b |
children |
rev | line source |
---|---|
6 | 1 #!/usr/bin/env python3.4 |
2 # Author: Jasper Jan Koehorst | |
3 # Date created: Jan 22 2015 | |
4 # Function: generation of a RDF file from a genome fasta file | |
5 | |
6 | |
7 # from io import StringIO | |
8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin | |
9 # import rdflib | |
10 from rdflib.store import Store | |
11 import sys | |
12 | |
# Base URI shared by the genome graph identifier and the vocabulary namespace.
URI = "http://csb.wur.nl/genome/"
# Kept for backward compatibility; nothing in this file reads it.
seeAlso = "rdfs:seeAlso"
# Namespace used to mint every class, predicate and resource URI below.
coreURI = Namespace(URI)

# One store instance is enough: the original built a second, identical store
# and discarded the first. The module-level ``global`` statements were
# dropped because ``global`` has no effect outside a function body.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
# Serialised output abbreviates the namespace with the "ssb" prefix.
genomeGraph.bind("ssb", coreURI)
3f4f1cd22a6a
FASTA 2 RDF code cleanup
jjkoehorst <jasperkoehorst@gmail.com>
parents:
6
diff
changeset
|
25 |
3f4f1cd22a6a
FASTA 2 RDF code cleanup
jjkoehorst <jasperkoehorst@gmail.com>
parents:
6
diff
changeset
|
def delete_galaxy():
    """Remove every Galaxy distribution directory from ``sys.path``.

    Any entry containing ``"galaxy-dist/"`` is dropped. The entries are
    removed rather than overwritten with ``''``: an empty string in
    ``sys.path`` means "the current working directory", so blanking an entry
    would add the working directory as an import location once per removed
    path.

    The list is modified in place so that existing references to
    ``sys.path`` observe the change. Returns ``None``.
    """
    sys.path[:] = [entry for entry in sys.path if "galaxy-dist/" not in entry]
6 | 30 |
def createClass(uri):
    """Declare *uri* as an OWL class in the module-level ``genomeGraph``.

    Two triples are added: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``.

    Returns the same *uri* so the call can be used inline in an assignment.
    """
    class_triples = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in class_triples:
        genomeGraph.add((uri, predicate, target))
    return uri
35 | |
def _argument_after(flag):
    """Return the command-line token that directly follows *flag* in ``sys.argv``.

    Raises ``ValueError`` when *flag* is absent and ``IndexError`` when it is
    the last token.
    """
    return sys.argv[sys.argv.index(flag) + 1]


def _read_fasta(input_file):
    """Read *input_file* into an ordered mapping of FASTA header -> sequence.

    Records keep the order in which their header first appears, so the
    position of a record is reproducible on every Python version. A header
    that occurs more than once keeps its first position and the sequence of
    its last occurrence. Blank lines are ignored.

    Raises ``ValueError`` if sequence data appears before the first header.
    """
    # Local import keeps this block self-contained.
    from collections import OrderedDict

    chunks = OrderedDict()
    header = None
    # The context manager closes the file even if parsing fails.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                header = line.strip(">").strip()
                chunks[header] = []
                continue
            text = line.strip()
            if not text:
                continue
            if header is None:
                raise ValueError(
                    "sequence data found before the first FASTA header in "
                    + str(input_file)
                )
            chunks[header].append(text)
    # Join once per record instead of growing a string line by line.
    return OrderedDict((name, "".join(parts)) for name, parts in chunks.items())


def fasta_parser(input_file):
    """Populate ``genomeGraph`` with the records of a genome FASTA file.

    Command-line options read from ``sys.argv``:

    * ``-idtag`` (required): genome identifier; spaces become underscores.
    * ``-id_alternative``: used instead when the ``-idtag`` value is the
      literal string ``'None'``; spaces and dots become underscores.
    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-ids``: stored as
      literals on the genome resource.
    * ``-sourcedb``: required as soon as the file holds at least one record.

    Each FASTA record becomes a ``<genome>/dnaobject_<n>`` resource carrying
    its sequence, header and source database, numbered in file order.

    Raises ``ValueError`` when a required option is missing or the FASTA
    file has sequence data before its first header.
    """
    genomeClass = createClass(coreURI["Genome"])  # Genome class
    createClass(coreURI["Type"])  # Type class (Chr,Pls,Scaffold)

    genomeID = _argument_after('-idtag').replace(" ", "_")
    if genomeID == 'None':
        genomeID = (
            _argument_after('-id_alternative').replace(" ", "_").replace(".", "_")
        )
    genomeURI = coreURI[genomeID]

    # Option name -> predicate (local name under coreURI) for genome metadata.
    # NOTE(review): '-idtag' and '-ids' both feed "id_tag", as in the
    # original code; confirm that this is intended.
    metadata_predicates = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-ids': "id_tag",
    }
    for position, argument in enumerate(sys.argv):
        predicate = metadata_predicates.get(argument)
        if predicate is not None:
            genomeGraph.add(
                (genomeURI, coreURI[predicate], Literal(sys.argv[position + 1]))
            )

    records = _read_fasta(input_file)
    typeClass = createClass(coreURI["DnaObject"])
    if not records:
        # As before, a file without records adds no genome-level typing and
        # does not require '-sourcedb'.
        return

    # Loop-invariant work: look the option up once and add the genome-level
    # triples once (the original re-added the same triples per record).
    source_db = Literal(_argument_after("-sourcedb"))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], source_db))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))

    for index, (header, sequence) in enumerate(records.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], source_db))
        genomeGraph.add((typeURI, RDF.type, typeClass))
6 | 83 |
def save():
    """Serialise ``genomeGraph`` as Turtle to the file named after ``-output``.

    The output path is the ``sys.argv`` token following ``-output``.

    Raises ``ValueError`` when ``-output`` is missing from the command line.
    """
    output_path = sys.argv[sys.argv.index("-output") + 1]
    data = genomeGraph.serialize(format='turtle')
    # Older rdflib releases return bytes here, newer ones return str; the
    # file is opened in binary mode, so normalise to bytes before writing.
    if isinstance(data, str):
        data = data.encode("utf-8")
    # The context manager guarantees the handle is flushed and closed.
    with open(output_path, "wb") as handle:
        handle.write(data)
87 | |
def main():
    """Convert the FASTA file named after ``-input`` and write the result.

    Reads the token following ``-input`` from ``sys.argv``, passes it to
    ``fasta_parser`` and then calls ``save``.
    """
    arguments = sys.argv
    flag_position = arguments.index("-input")
    fasta_parser(arguments[flag_position + 1])
    save()
92 | |
if __name__ == '__main__':
    # Galaxy ships its own copies of some modules that rdflib requires, and
    # those copies interfere with rdflib's imports, so the Galaxy entries are
    # taken off sys.path before the conversion runs.
    # NOTE(review): the rdflib imports at the top of this file have already
    # executed by now, so this only affects imports performed later; confirm
    # that this is sufficient.
    delete_galaxy()
    main()
97 |