comparison conversion/gbk2rdf/gbktordf.py @ 16:74b8ba5e2d5b

aragorn addition
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 17:17:06 +0100
parents
children
comparison
equal deleted inserted replaced
15:10cad758ed0f 16:74b8ba5e2d5b
1 #!/usr/bin/env python3.4
2 # Author: Jasper Jan Koehorst
3 # Date created: Feb 21 2015
4 # Function: generation of a RDF file from Genbank/EMBL
5
6 import warnings
7 warnings.filterwarnings("ignore")
8
9 def delete_galaxy():
10 import sys
11 for index, path in enumerate(sys.path):
12 if "galaxy-dist/" in path:
13 sys.path[index] = ''
14
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
16 delete_galaxy()
17
18 from Bio import SeqIO
19 # Import RDFLib's default Graph implementation.
20 import os, sys
21 from Bio.Seq import Seq
22
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
24 from rdflib.store import Store
25 import hashlib
26 store = plugin.get('IOMemory', Store)()
27
28 global URI
29 URI = "http://csb.wur.nl/genome/"
30 global seeAlso
31 seeAlso = "rdfs:seeAlso"
32 global coreURI
33 coreURI = Namespace(URI)
34
35 global SubClassOfDict
36 SubClassOfDict = {}
37 global SubClassOfDictRna
38 SubClassOfDictRna = {}
39
40 def createClass(uri, root=True):
41 genomeGraph.add((uri,RDF.type,OWL.Class))
42 if root:
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
44 return uri
45
46 def tmp():
47 import time
48 global tmpFolder
49 tmpFolder = "/tmp/"+str(time.time())+"/"
50 os.mkdir(tmpFolder)
51
52 def cleantmp():
53 os.system("ls "+tmpFolder)
54 os.system("rm -rf "+tmpFolder)
55
56 def crawler():
57 #From input folder it looks for GBK file (gz files are in progress)
58 input_file = sys.argv[sys.argv.index("-input")+1]
59 gbk_parser(input_file)
60
61 def gbk_parser():
62 prevObjStart = -1
63 prevObjStop = -1
64 store = plugin.get('IOMemory', Store)()
65 global genomeGraph
66 genomeGraph = Graph(store,URIRef(URI))
67 genomeGraph.bind("ssb",coreURI)
68 input_file = sys.argv[sys.argv.index("-input")+1]
69
70 #CLASS definitions
71 genomeClass = createClass(coreURI["Genome"], root=True)
72 typeClass = createClass(coreURI["DnaObject"], root=True)
73 createClass(coreURI["Protein"], root=True)
74 pubmedClass = createClass(coreURI["Pubmed"], root=True)
75 miscClass = createClass(coreURI["MiscFeature"], root=False)
76 createClass(coreURI["Feature"], root=True)
77 SubClassOfDict["MiscFeature"] = 1
78 SubClassOfDictRna["Trna"] = 1
79 SubClassOfDictRna["Rrna"] = 1
80 SubClassOfDictRna["Tmrna"] = 1
81 SubClassOfDictRna["Ncrna"] = 1
82
83 # codon = "11" #Default initialization if no CDS are present
84 ##################
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
86 scaf_value = 0
87 #Which files are already done
88 ########
89 formatGBK = sys.argv[sys.argv.index("-format")+1]
90 for record in SeqIO.parse(input_file, formatGBK):
91 #Read first feature for genome name and information...
92 #Ignore the empty GBK file due to the lack of features?
93
94 for index, feature in enumerate(record.features):
95 if index == 0:
96 if "-identifier" in sys.argv:
97 genome = sys.argv[sys.argv.index("-identifier")+1]
98 else:
99 try:
100 genome = feature.qualifiers["organism"][0].replace(" ","_")
101 except:
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
103 genome = "XNoneX"
104 for char in weird_chars:
105 genome = genome.replace(char,"_")
106
107 try:
108 gi = record.annotations["gi"]
109 typ = str(gi)
110 except:
111 try:
112 gi = record.annotations["accessions"][0]
113 typ = str(gi)
114 except:
115 scaf_value += 1
116 typ = "scaffold_"+str(scaf_value)
117 genomeURI = coreURI[genome]
118 gbkURI = coreURI[genome + "/" + typ]
119 #To contig connection to connect all data to it
120 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
121
122 #General genome features also stored in the class...
123 if "genome" in feature.qualifiers:
124 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
125 if "strain" in feature.qualifiers:
126 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
127 if "taxonomy" in record.annotations:
128 for taxon in record.annotations["taxonomy"]:
129 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
130 record.annotations["taxonomy"] = []
131 #Genome sequence#
132 sequence = str(record.seq)
133 #Verify if sequence was not empty and is now full of X or N
134 filtered_sequence = sequence.replace("X","").replace("N","")
135 if len(filtered_sequence) == 0:
136 sequence = ""
137 #Record parsing#
138 for annot in record.annotations:
139 if type(record.annotations[annot]) == list:
140 if annot == "references":
141 for references in record.annotations[annot]:
142 if references.pubmed_id != "":
143 pubmed = references.pubmed_id
144 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
145 obj_dict = references.__dict__
146 for key in obj_dict:
147 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
148 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
149
150 else:
151 for a in record.annotations[annot]:
152 int_add(gbkURI,coreURI[annot.lower()],str(a))
153 else:
154 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
155
156 #####END of RECORD####
157 if len(sequence) > 0:
158 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence)))
159 genomeGraph.add((genomeURI, RDF.type,genomeClass))
160 genomeGraph.add((gbkURI, RDF.type,typeClass))
161 for key in feature.qualifiers:
162 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
163 #break
164 else: #The rest of the GBK file
165 feature_type = feature.type
166 end = str(feature.location.end).replace(">","").replace("<","")
167 start = str(feature.location.start).replace(">","").replace("<","")
168
169 strand = str(feature.location.strand)
170
171 if strand == 'None':
172 strand = 0
173 else:
174 if feature.type == "misc_feature": #Store as part of previous cds or something...
175 if strand == "-1":
176 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
177 else:
178 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
179
180 # TODO: Check if biopython has an overlap function...
181 if int(prevObjStart) <= int(start):
182 if int(end) <= int(prevObjStop):
183 pass
184 # genomeGraph.add((typeURI,coreURI["feature"],miscURI))
185 # genomeGraph.add((miscURI,RDF.type,miscClass))
186 else:
187 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
188 genomeGraph.add((miscURI,RDF.type,miscClass))
189 else:
190 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
191 genomeGraph.add((miscURI,RDF.type,miscClass))
192
193 store_general_information(miscURI,feature,record)
194 else:
195 prevObjStart = start
196 prevObjStop = end
197
198 if strand == "-1":
199 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
200 else:
201 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
202
203 #Contig specific connection
204 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
205 ############################
206
207 store_general_information(typeURI,feature,record)
208
209 for subfeature in feature.sub_features:
210 strand = str(subfeature.location.strand)
211 subfeature_type = subfeature.type
212 end = str(subfeature.location.end).replace(">","").replace("<","")
213 start = str(subfeature.location.start).replace(">","").replace("<","")
214
215 if strand == "-1":
216 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
217 else:
218 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
219 genomeGraph.add((typeURI, coreURI["feature"] , subURI))
220 store_general_information(subURI,subfeature,record,feature)
221
222
223 def store_general_information(generalURI,feature,record,superfeature=""):
224 proteinClass = createClass(coreURI["Protein"], root=True)
225 sequence = str(record.seq)
226 cds_sequence = str(feature.extract(sequence))
227 #Fixes the 0 count instead of 1-count in biopython vs humans
228 feature_type = feature.type
229 end = str(feature.location.end).replace(">","").replace("<","")
230 start = str(feature.location.start).replace(">","").replace("<","")
231 strand = str(feature.location.strand)
232 if strand == "None":
233 strand = 0
234
235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
236
237 if strand == "-1":
238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
240 else:
241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
244 if feature.type != "misc_feature":
245 try:
246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
247 except: #When protein sequence is not given for whatever reason
248 print ("wrong?")
249
250 if feature.type == "misc_feature":
251 pass
252 else:
253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
255 SubClassOfDict[feature_type.lower().title()] = 1
256 for key in feature.qualifiers:
257 values = feature.qualifiers[key]
258 if key == "translation":
259 pass
260 elif type(values) == list:
261 for v in values:
262 int_add(generalURI,coreURI[key.lower()],v)
263 else:
264 int_add(generalURI,coreURI[key.lower()],values)
265 if feature.type == "CDS":
266 try:
267 #Feature is normally submitted to this function
268 #IF a subfeature is submitted it is submitted as a feature
269 #And subfeature variable will contain the superfeature
270 if superfeature:
271 codon = superfeature.qualifiers["transl_table"][0]
272 except:
273 #Default codon table 11
274 codon = "11"
275 #Protein linkage
276 translation = ""
277 try:
278 translation = feature.qualifiers["translation"][0].strip("*")
279 except KeyError:
280 #When protein sequence is not given...
281 if len(feature.location.parts) > 1:
282 #Exon boundaries?
283 seq = ''
284 for loc in feature.location:
285 seq += record.seq[loc]
286 if int(feature.location.strand) == -1:
287 seq = Seq(seq).complement()
288 else:
289 seq = Seq(seq)
290 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
291 elif int(feature.location.strand) == -1:
292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
295 else:
296 translation = ''
297 elif int(feature.location.strand) == +1:
298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
300 else:
301 translation = ''
302
303 if translation:
304 translation = list(translation)
305 translation[0] = "M"
306 translation = ''.join(translation).strip("*")
307 if "*" in translation:
308 pass
309
310 translation = translation.encode('utf-8')
311 md5_protein = hashlib.md5(translation).hexdigest()
312 proteinURI = coreURI["protein/"+md5_protein]
313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
314 for key in feature.qualifiers:
315 for v in feature.qualifiers[key]:
316 if key == "translation":
317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
319 genomeGraph.add((proteinURI,RDF.type,proteinClass))
320 else:
321 for v in feature.qualifiers[key]:
322 int_add(generalURI,coreURI[key.lower()],v)
323
324 def int_add(subject, predicate, obj):
325 try:
326 object_float = float(obj.replace('"',''))
327 object_int = int(obj.replace('"',''))
328 if object_int == object_float:
329 genomeGraph.add((subject,predicate,Literal(object_int)))
330 else:
331 genomeGraph.add((subject,predicate,Literal(object_float)))
332 except:
333 genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
334
335 def save():
336 data = genomeGraph.serialize(format='turtle')
337 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
338
339 def subClassOfBuilder():
340 for subclass in SubClassOfDict:
341 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
342 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
343
344 def subClassOfBuilderRna():
345 for subclass in SubClassOfDictRna:
346 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
347 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
348 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
349 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
350
351 def main():
352 tmp()
353 gbk_parser()
354 subClassOfBuilder()
355 subClassOfBuilderRna()
356 save()
357 cleantmp()
358
359 if __name__ == "__main__":
360 main()