Mercurial > repos > jjkoehorst > sapp
comparison conversion/protein2rdf/protein_to_ttl.py @ 16:74b8ba5e2d5b
aragorn addition
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:17:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
15:10cad758ed0f | 16:74b8ba5e2d5b |
---|---|
def delete_galaxy():
    """Remove every Galaxy distribution directory from ``sys.path``.

    Any entry whose text contains ``"galaxy-dist/"`` is dropped. The list is
    edited in place (slice assignment), so references to ``sys.path`` held
    elsewhere observe the change.

    The previous implementation overwrote matching entries with ``''``.
    An empty string in ``sys.path`` means "the current working directory",
    so instead of removing the entries it added cwd lookups at those
    positions; filtering the entries out avoids that.
    """
    import sys

    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]


# Some modules required by rdflib also ship with Galaxy; leaving Galaxy's
# copies on sys.path breaks the rdflib import below, so strip them first.
delete_galaxy()
9 | |
10 # from io import StringIO | |
11 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin | |
12 # import rdflib | |
13 from rdflib.store import Store | |
14 import sys | |
15 import hashlib | |
16 | |
# Base IRI under which every resource created by this script is minted.
URI = "http://csb.wur.nl/genome/"

# Namespace wrapper around URI; indexed as coreURI["name"] throughout the
# script to build subjects, predicates and class identifiers.
coreURI = Namespace(URI)

# Prefixed name of rdfs:seeAlso. Not referenced by the code visible in this
# file.
seeAlso = "rdfs:seeAlso"

# Module-level rdflib store instance. main() creates its own store for the
# graph it builds, so this one is only relevant to external importers.
store = plugin.get('IOMemory', Store)()
25 | |
26 | |
def createClass(uri):
    """Declare ``uri`` as an OWL class in the global ``genomeGraph``.

    Two triples are added: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``.

    :param uri: identifier of the class to declare.
    :return: ``uri`` unchanged, so the call can be used inline.
    """
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in declarations:
        genomeGraph.add((uri, predicate, target))
    return uri
31 | |
def _read_fasta(input_file):
    """Parse a FASTA file into a ``{header_line: sequence}`` dict.

    Keys are the raw header lines (leading ``>`` and trailing newline
    included). Sequence lines are stripped and concatenated. Sequence lines
    that appear before the first header are kept under the key ``""``.
    An empty or blank-only file yields an empty dict, so no placeholder
    record is produced for it.
    """
    records = {}
    header = ""
    chunks = []
    # The context manager guarantees the handle is closed even on errors.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                # Flush the record collected so far before starting a new one.
                if header or chunks:
                    records[header] = "".join(chunks)
                header = line
                chunks = []
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    if header or chunks:
        records[header] = "".join(chunks)
    return records


def fasta_parser(input_file):
    """Convert a protein FASTA file into triples in the global ``genomeGraph``.

    Command-line options are read directly from ``sys.argv``:

    * ``-idtag`` (required) names the genome; spaces become underscores.
      When its value is empty, ``-id_alternative`` is used instead (spaces
      and dots become underscores).
    * ``-sourcedb`` (required) is attached to the genome, every CDS and
      every protein.
    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-diagnosis``,
      ``-country``, ``-location``, ``-date`` and ``-ids`` are optional
      metadata; the argument following each flag is stored as a literal on
      the genome.

    Every FASTA record becomes a protein resource identified by the MD5 of
    its sequence, linked from a synthetic CDS feature. CDS coordinates are
    theoretical: records are laid end to end starting at 0 so that a
    GenBank file can be generated later.

    :param input_file: path of the FASTA file to read.
    :raises ValueError: if ``-idtag`` or ``-sourcedb`` (or
        ``-id_alternative`` when it is needed) is absent from ``sys.argv``.
    :raises IndexError: if a recognised flag is the last argument.
    """
    argv = sys.argv

    genome = argv[argv.index('-idtag') + 1].replace(" ", "_")
    if genome == '':
        genome = (
            argv[argv.index('-id_alternative') + 1]
            .replace(" ", "_")
            .replace(".", "_")
        )
    genomeURI = coreURI[genome]

    # Command-line flag -> predicate name. '-idtag' and '-ids' both map to
    # the 'id_tag' predicate.
    metadata_predicates = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-diagnosis': "diagnosis",
        '-country': "country",
        '-location': "location",
        '-date': "date",
        '-ids': "id_tag",
    }
    for index, element in enumerate(argv):
        predicate = metadata_predicates.get(element)
        if predicate is not None:
            genomeGraph.add(
                (genomeURI, coreURI[predicate], Literal(argv[index + 1]))
            )

    fastadict = _read_fasta(input_file)

    # Create a class, to be the same as all the other genome conversions...
    # TODO: Proteins are part of cds, cds are part of dnaobject
    # If CDS is not there... how then?
    classURI = coreURI[genome + "/" + "protein_fasta"]
    proteinClass = createClass(coreURI["Protein"])
    genomeClass = createClass(coreURI["Genome"])
    typeClass = createClass(coreURI["DnaObject"])
    cdsClass = createClass(coreURI["Cds"])

    # Looked up once instead of on every loop iteration.
    sourcedb = Literal(argv[argv.index("-sourcedb") + 1])

    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
    genomeGraph.add((classURI, RDF.type, typeClass))

    # A theoretical begin/end is created to have a workable GBK generation.
    begin = 0
    for sequence in fastadict.values():
        end = begin + len(sequence)
        # Hash the UTF-8 bytes but keep the text for the literal: the encoded
        # bytes object is not a text string on Python 3.
        md5_protein = hashlib.md5(sequence.encode('utf-8')).hexdigest()
        proteinURI = coreURI["protein/" + md5_protein]

        cdsURI = coreURI[
            genome + "/protein_fasta/" + str(begin) + "_" + str(end)
        ]
        genomeGraph.add((classURI, coreURI["feature"], cdsURI))
        genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
        genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
        genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
        genomeGraph.add((cdsURI, RDF.type, cdsClass))

        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((proteinURI, RDF.type, proteinClass))
        genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
        # The next record starts where this one ended.
        begin = end
114 | |
def save():
    """Serialize the global ``genomeGraph`` as Turtle to the ``-output`` path.

    The destination is the command-line argument that follows ``-output``
    in ``sys.argv``.

    :raises ValueError: if ``-output`` is absent from ``sys.argv``.
    """
    output_path = sys.argv[sys.argv.index("-output") + 1]
    data = genomeGraph.serialize(format='turtle')
    # The file is opened in binary mode, so make sure the payload is bytes;
    # encode only when serialize() handed back text.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    # The context manager closes (and flushes) the file deterministically;
    # the previous bare open(...).write(...) never closed the handle.
    with open(output_path, "wb") as handle:
        handle.write(data)
118 | |
def main():
    """Script entry point: build the graph, convert the FASTA file, save.

    Creates the module-global ``genomeGraph`` on a fresh 'IOMemory' store,
    binds the ``ssb`` prefix to ``coreURI``, converts the file given after
    ``-input`` on the command line and writes the result via ``save()``.
    """
    global genomeGraph

    memory_store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(memory_store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)

    input_position = sys.argv.index("-input") + 1
    fasta_parser(sys.argv[input_position])
    save()


if __name__ == '__main__':
    main()
130 |