Mercurial > repos > jjkoehorst > sapp
comparison conversion/gbk2rdf/gbktordf.py @ 16:74b8ba5e2d5b
aragorn addition
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:17:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
15:10cad758ed0f | 16:74b8ba5e2d5b |
---|---|
1 #!/usr/bin/env python3.4 | |
2 # Author: Jasper Jan Koehorst | |
3 # Date created: Feb 21 2015 | |
4 # Function: generation of a RDF file from Genbank/EMBL | |
5 | |
6 import warnings | |
7 warnings.filterwarnings("ignore") | |
8 | |
9 def delete_galaxy(): | |
10 import sys | |
11 for index, path in enumerate(sys.path): | |
12 if "galaxy-dist/" in path: | |
13 sys.path[index] = '' | |
14 | |
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now. | |
16 delete_galaxy() | |
17 | |
18 from Bio import SeqIO | |
19 # Import RDFLib's default Graph implementation. | |
20 import os, sys | |
21 from Bio.Seq import Seq | |
22 | |
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin | |
24 from rdflib.store import Store | |
25 import hashlib | |
26 store = plugin.get('IOMemory', Store)() | |
27 | |
28 global URI | |
29 URI = "http://csb.wur.nl/genome/" | |
30 global seeAlso | |
31 seeAlso = "rdfs:seeAlso" | |
32 global coreURI | |
33 coreURI = Namespace(URI) | |
34 | |
35 global SubClassOfDict | |
36 SubClassOfDict = {} | |
37 global SubClassOfDictRna | |
38 SubClassOfDictRna = {} | |
39 | |
40 def createClass(uri, root=True): | |
41 genomeGraph.add((uri,RDF.type,OWL.Class)) | |
42 if root: | |
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) | |
44 return uri | |
45 | |
46 def tmp(): | |
47 import time | |
48 global tmpFolder | |
49 tmpFolder = "/tmp/"+str(time.time())+"/" | |
50 os.mkdir(tmpFolder) | |
51 | |
52 def cleantmp(): | |
53 os.system("ls "+tmpFolder) | |
54 os.system("rm -rf "+tmpFolder) | |
55 | |
56 def crawler(): | |
57 #From input folder it looks for GBK file (gz files are in progress) | |
58 input_file = sys.argv[sys.argv.index("-input")+1] | |
59 gbk_parser(input_file) | |
60 | |
61 def gbk_parser(): | |
62 prevObjStart = -1 | |
63 prevObjStop = -1 | |
64 store = plugin.get('IOMemory', Store)() | |
65 global genomeGraph | |
66 genomeGraph = Graph(store,URIRef(URI)) | |
67 genomeGraph.bind("ssb",coreURI) | |
68 input_file = sys.argv[sys.argv.index("-input")+1] | |
69 | |
70 #CLASS definitions | |
71 genomeClass = createClass(coreURI["Genome"], root=True) | |
72 typeClass = createClass(coreURI["DnaObject"], root=True) | |
73 createClass(coreURI["Protein"], root=True) | |
74 pubmedClass = createClass(coreURI["Pubmed"], root=True) | |
75 miscClass = createClass(coreURI["MiscFeature"], root=False) | |
76 createClass(coreURI["Feature"], root=True) | |
77 SubClassOfDict["MiscFeature"] = 1 | |
78 SubClassOfDictRna["Trna"] = 1 | |
79 SubClassOfDictRna["Rrna"] = 1 | |
80 SubClassOfDictRna["Tmrna"] = 1 | |
81 SubClassOfDictRna["Ncrna"] = 1 | |
82 | |
83 # codon = "11" #Default initialization if no CDS are present | |
84 ################## | |
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''') | |
86 scaf_value = 0 | |
87 #Which files are already done | |
88 ######## | |
89 formatGBK = sys.argv[sys.argv.index("-format")+1] | |
90 for record in SeqIO.parse(input_file, formatGBK): | |
91 #Read first feature for genome name and information... | |
92 #Ignore the empty GBK file due to the lack of features? | |
93 | |
94 for index, feature in enumerate(record.features): | |
95 if index == 0: | |
96 if "-identifier" in sys.argv: | |
97 genome = sys.argv[sys.argv.index("-identifier")+1] | |
98 else: | |
99 try: | |
100 genome = feature.qualifiers["organism"][0].replace(" ","_") | |
101 except: | |
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS | |
103 genome = "XNoneX" | |
104 for char in weird_chars: | |
105 genome = genome.replace(char,"_") | |
106 | |
107 try: | |
108 gi = record.annotations["gi"] | |
109 typ = str(gi) | |
110 except: | |
111 try: | |
112 gi = record.annotations["accessions"][0] | |
113 typ = str(gi) | |
114 except: | |
115 scaf_value += 1 | |
116 typ = "scaffold_"+str(scaf_value) | |
117 genomeURI = coreURI[genome] | |
118 gbkURI = coreURI[genome + "/" + typ] | |
119 #To contig connection to connect all data to it | |
120 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI)) | |
121 | |
122 #General genome features also stored in the class... | |
123 if "genome" in feature.qualifiers: | |
124 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0]))) | |
125 if "strain" in feature.qualifiers: | |
126 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0]))) | |
127 if "taxonomy" in record.annotations: | |
128 for taxon in record.annotations["taxonomy"]: | |
129 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon))) | |
130 record.annotations["taxonomy"] = [] | |
131 #Genome sequence# | |
132 sequence = str(record.seq) | |
133 #Verify if sequence was not empty and is now full of X or N | |
134 filtered_sequence = sequence.replace("X","").replace("N","") | |
135 if len(filtered_sequence) == 0: | |
136 sequence = "" | |
137 #Record parsing# | |
138 for annot in record.annotations: | |
139 if type(record.annotations[annot]) == list: | |
140 if annot == "references": | |
141 for references in record.annotations[annot]: | |
142 if references.pubmed_id != "": | |
143 pubmed = references.pubmed_id | |
144 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed])) | |
145 obj_dict = references.__dict__ | |
146 for key in obj_dict: | |
147 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key])))) | |
148 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass)) | |
149 | |
150 else: | |
151 for a in record.annotations[annot]: | |
152 int_add(gbkURI,coreURI[annot.lower()],str(a)) | |
153 else: | |
154 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot])) | |
155 | |
156 #####END of RECORD#### | |
157 if len(sequence) > 0: | |
158 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence))) | |
159 genomeGraph.add((genomeURI, RDF.type,genomeClass)) | |
160 genomeGraph.add((gbkURI, RDF.type,typeClass)) | |
161 for key in feature.qualifiers: | |
162 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0]))) | |
163 #break | |
164 else: #The rest of the GBK file | |
165 feature_type = feature.type | |
166 end = str(feature.location.end).replace(">","").replace("<","") | |
167 start = str(feature.location.start).replace(">","").replace("<","") | |
168 | |
169 strand = str(feature.location.strand) | |
170 | |
171 if strand == 'None': | |
172 strand = 0 | |
173 else: | |
174 if feature.type == "misc_feature": #Store as part of previous cds or something... | |
175 if strand == "-1": | |
176 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)] | |
177 else: | |
178 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)] | |
179 | |
180 # TODO: Check if biopython has an overlap function... | |
181 if int(prevObjStart) <= int(start): | |
182 if int(end) <= int(prevObjStop): | |
183 pass | |
184 # genomeGraph.add((typeURI,coreURI["feature"],miscURI)) | |
185 # genomeGraph.add((miscURI,RDF.type,miscClass)) | |
186 else: | |
187 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) | |
188 genomeGraph.add((miscURI,RDF.type,miscClass)) | |
189 else: | |
190 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) | |
191 genomeGraph.add((miscURI,RDF.type,miscClass)) | |
192 | |
193 store_general_information(miscURI,feature,record) | |
194 else: | |
195 prevObjStart = start | |
196 prevObjStop = end | |
197 | |
198 if strand == "-1": | |
199 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)] | |
200 else: | |
201 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)] | |
202 | |
203 #Contig specific connection | |
204 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI)) | |
205 ############################ | |
206 | |
207 store_general_information(typeURI,feature,record) | |
208 | |
209 for subfeature in feature.sub_features: | |
210 strand = str(subfeature.location.strand) | |
211 subfeature_type = subfeature.type | |
212 end = str(subfeature.location.end).replace(">","").replace("<","") | |
213 start = str(subfeature.location.start).replace(">","").replace("<","") | |
214 | |
215 if strand == "-1": | |
216 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)] | |
217 else: | |
218 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)] | |
219 genomeGraph.add((typeURI, coreURI["feature"] , subURI)) | |
220 store_general_information(subURI,subfeature,record,feature) | |
221 | |
222 | |
223 def store_general_information(generalURI,feature,record,superfeature=""): | |
224 proteinClass = createClass(coreURI["Protein"], root=True) | |
225 sequence = str(record.seq) | |
226 cds_sequence = str(feature.extract(sequence)) | |
227 #Fixes the 0 count instead of 1-count in biopython vs humans | |
228 feature_type = feature.type | |
229 end = str(feature.location.end).replace(">","").replace("<","") | |
230 start = str(feature.location.start).replace(">","").replace("<","") | |
231 strand = str(feature.location.strand) | |
232 if strand == "None": | |
233 strand = 0 | |
234 | |
235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
236 | |
237 if strand == "-1": | |
238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1))) | |
239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end)))) | |
240 else: | |
241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1))) | |
242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end)))) | |
243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand)))) | |
244 if feature.type != "misc_feature": | |
245 try: | |
246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence))) | |
247 except: #When protein sequence is not given for whatever reason | |
248 print ("wrong?") | |
249 | |
250 if feature.type == "misc_feature": | |
251 pass | |
252 else: | |
253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False))) | |
254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna": | |
255 SubClassOfDict[feature_type.lower().title()] = 1 | |
256 for key in feature.qualifiers: | |
257 values = feature.qualifiers[key] | |
258 if key == "translation": | |
259 pass | |
260 elif type(values) == list: | |
261 for v in values: | |
262 int_add(generalURI,coreURI[key.lower()],v) | |
263 else: | |
264 int_add(generalURI,coreURI[key.lower()],values) | |
265 if feature.type == "CDS": | |
266 try: | |
267 #Feature is normally submitted to this function | |
268 #IF a subfeature is submitted it is submitted as a feature | |
269 #And subfeature variable will contain the superfeature | |
270 if superfeature: | |
271 codon = superfeature.qualifiers["transl_table"][0] | |
272 except: | |
273 #Default codon table 11 | |
274 codon = "11" | |
275 #Protein linkage | |
276 translation = "" | |
277 try: | |
278 translation = feature.qualifiers["translation"][0].strip("*") | |
279 except KeyError: | |
280 #When protein sequence is not given... | |
281 if len(feature.location.parts) > 1: | |
282 #Exon boundaries? | |
283 seq = '' | |
284 for loc in feature.location: | |
285 seq += record.seq[loc] | |
286 if int(feature.location.strand) == -1: | |
287 seq = Seq(seq).complement() | |
288 else: | |
289 seq = Seq(seq) | |
290 translation = str(seq.translate(feature.qualifiers["transl_table"][0])) | |
291 elif int(feature.location.strand) == -1: | |
292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation: | |
293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: | |
294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)) | |
295 else: | |
296 translation = '' | |
297 elif int(feature.location.strand) == +1: | |
298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: | |
299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon)) | |
300 else: | |
301 translation = '' | |
302 | |
303 if translation: | |
304 translation = list(translation) | |
305 translation[0] = "M" | |
306 translation = ''.join(translation).strip("*") | |
307 if "*" in translation: | |
308 pass | |
309 | |
310 translation = translation.encode('utf-8') | |
311 md5_protein = hashlib.md5(translation).hexdigest() | |
312 proteinURI = coreURI["protein/"+md5_protein] | |
313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI)) | |
314 for key in feature.qualifiers: | |
315 for v in feature.qualifiers[key]: | |
316 if key == "translation": | |
317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) | |
318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation))) | |
319 genomeGraph.add((proteinURI,RDF.type,proteinClass)) | |
320 else: | |
321 for v in feature.qualifiers[key]: | |
322 int_add(generalURI,coreURI[key.lower()],v) | |
323 | |
324 def int_add(subject, predicate, obj): | |
325 try: | |
326 object_float = float(obj.replace('"','')) | |
327 object_int = int(obj.replace('"','')) | |
328 if object_int == object_float: | |
329 genomeGraph.add((subject,predicate,Literal(object_int))) | |
330 else: | |
331 genomeGraph.add((subject,predicate,Literal(object_float))) | |
332 except: | |
333 genomeGraph.add((subject,predicate,Literal(obj.replace('"','')))) | |
334 | |
335 def save(): | |
336 data = genomeGraph.serialize(format='turtle') | |
337 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) | |
338 | |
339 def subClassOfBuilder(): | |
340 for subclass in SubClassOfDict: | |
341 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) | |
342 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"])) | |
343 | |
344 def subClassOfBuilderRna(): | |
345 for subclass in SubClassOfDictRna: | |
346 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) | |
347 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"])) | |
348 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) | |
349 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class)) | |
350 | |
351 def main(): | |
352 tmp() | |
353 gbk_parser() | |
354 subClassOfBuilder() | |
355 subClassOfBuilderRna() | |
356 save() | |
357 cleantmp() | |
358 | |
359 if __name__ == "__main__": | |
360 main() |