annotate gbk2rdf/gbktordf.py @ 3:db04e12b8779

Uploaded
author jjkoehorst
date Sat, 21 Feb 2015 07:28:39 -0500
parents
children ec73c34af97b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
1 #!/usr/bin/env python3.4
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
2 # Author: Jasper Jan Koehorst
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
3 # Date created: Feb 21 2015
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
4 # Function: generate an RDF file from a GenBank/EMBL file
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
5
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
6 import warnings
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
7 warnings.filterwarnings("ignore")
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
8
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def delete_galaxy():
    """Drop every Galaxy distribution directory from the import path.

    Entries of ``sys.path`` containing ``"galaxy-dist/"`` are removed in
    place. They are removed rather than blanked because an empty string in
    ``sys.path`` stands for the current working directory, which would put
    whatever happens to be in that directory on the import path.
    """
    import sys
    # Slice assignment keeps the same list object that the import system uses.
    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]


# Some modules required by rdflib are also shipped with Galaxy, which breaks
# the rdflib import. Stripping the Galaxy paths before importing is not an
# elegant solution, but it works for now.
delete_galaxy()
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
17
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
18 from Bio import SeqIO
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
19 # Import RDFLib's default Graph implementation.
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
20 import os, sys
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
21 from Bio.Seq import Seq
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
22
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
24 from rdflib.store import Store
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
25 import hashlib
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
# In-memory rdflib store created at import time.
store = plugin.get('IOMemory', Store)()

# Base URI under which every resource and predicate of the graph is minted.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
# Namespace helper: coreURI["x"] yields the URIRef URI + "x".
coreURI = Namespace(URI)

# Class names (as keys) that subClassOfBuilder() declares as subclasses of
# "Feature"; filled while features are parsed.
SubClassOfDict = {}
# Class names (as keys) that subClassOfBuilderRna() declares as subclasses of
# "Rna".
SubClassOfDictRna = {}
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
39
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def createClass(uri, root=True):
    """Declare *uri* as an ``owl:Class`` in ``genomeGraph`` and return it.

    When *root* is true the class is additionally made a direct subclass of
    ``owl:Thing``.
    """
    triples = [(uri, RDF.type, OWL.Class)]
    if root:
        triples.append((uri, RDFS.subClassOf, OWL.Thing))
    for triple in triples:
        genomeGraph.add(triple)
    return uri
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
45
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def tmp():
    """Create a private scratch directory and publish it as ``tmpFolder``.

    The directory is created with ``tempfile.mkdtemp`` so its name is unique
    and it lives in the platform's temporary directory. The module-level
    ``tmpFolder`` holds the path with a trailing separator, so code that
    builds file names by concatenating onto it keeps working.
    """
    import tempfile
    global tmpFolder
    # os.path.join(path, "") appends the trailing path separator.
    tmpFolder = os.path.join(tempfile.mkdtemp(prefix="gbktordf_"), "")
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
51
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def cleantmp():
    """Delete the scratch directory named by the module-level ``tmpFolder``.

    The tree is removed with ``shutil.rmtree`` instead of a shell command, so
    the path is never interpreted by a shell. A directory that is already
    gone is not an error.
    """
    import shutil
    shutil.rmtree(tmpFolder, ignore_errors=True)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
55
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def crawler():
    """Parse the file given with ``-input`` (gz files are in progress).

    ``gbk_parser`` takes no parameters and reads the ``-input`` option from
    ``sys.argv`` itself, so it is called without arguments.
    """
    gbk_parser()
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
60
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def _feature_uri(genome, typ, feature_type, start, end, strand):
    """Build the URI of a feature from its contig, type and coordinates.

    Features on the reverse strand (``strand == "-1"``) are keyed as
    ``end_start``; all others as ``start_end``.
    """
    if strand == "-1":
        span = str(end) + "_" + str(start)
    else:
        span = str(start) + "_" + str(end)
    return coreURI[genome + "/" + typ + "/" + feature_type + "/gbk/" + span]


def gbk_parser():
    """Convert the sequence file given on the command line into ``genomeGraph``.

    Options read from ``sys.argv``:

    * ``-input``       path of the file to parse
    * ``-format``      format name handed to ``SeqIO.parse``
    * ``-identifier``  optional genome name; otherwise the ``organism``
      qualifier of the first feature is used

    The module-level ``genomeGraph`` is (re)created. For every record the
    first feature supplies the genome/contig level information; every later
    feature is stored through ``store_general_information``.
    """
    prevObjStart = -1
    prevObjStop = -1
    store = plugin.get('IOMemory', Store)()
    global genomeGraph
    genomeGraph = Graph(store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    input_file = sys.argv[sys.argv.index("-input") + 1]

    # CLASS definitions
    genomeClass = createClass(coreURI["Genome"], root=True)
    typeClass = createClass(coreURI["DnaObject"], root=True)
    createClass(coreURI["Protein"], root=True)
    pubmedClass = createClass(coreURI["Pubmed"], root=True)
    miscClass = createClass(coreURI["MiscFeature"], root=False)
    createClass(coreURI["Feature"], root=True)
    SubClassOfDict["MiscFeature"] = 1
    for rna_class in ("Trna", "Rrna", "Tmrna", "Ncrna"):
        SubClassOfDictRna[rna_class] = 1

    # Characters that are replaced by "_" in the genome name before it is
    # used inside a URI.
    weird_chars = ''',./?<>:;"'|\\}]{[+=_-)(*&^%$#@!±§~` '''
    sanitize = str.maketrans(dict.fromkeys(weird_chars, "_"))
    scaf_value = 0

    formatGBK = sys.argv[sys.argv.index("-format") + 1]
    for record in SeqIO.parse(input_file, formatGBK):
        # The first feature carries the genome name and general information;
        # a record without features contributes nothing.
        for index, feature in enumerate(record.features):
            if index == 0:
                if "-identifier" in sys.argv:
                    genome = sys.argv[sys.argv.index("-identifier") + 1]
                else:
                    try:
                        genome = feature.qualifiers["organism"][0].replace(" ", "_")
                    except (KeyError, IndexError):
                        # Temporary fallback; pass -identifier to avoid it.
                        genome = "XNoneX"
                genome = genome.translate(sanitize)

                try:
                    typ = str(record.annotations["gi"])
                except KeyError:
                    # No GI number: number the contigs in file order instead.
                    scaf_value += 1
                    typ = "scaffold_" + str(scaf_value)
                genomeURI = coreURI[genome]
                gbkURI = coreURI[genome + "/" + typ]
                # Link the genome to this contig so all data hangs off it.
                genomeGraph.add((genomeURI, coreURI["dnaobject"], gbkURI))

                # General genome information is also stored on the genome.
                if "organism" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["organism"], Literal(feature.qualifiers["organism"][0])))
                if "strain" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["strain"], Literal(feature.qualifiers["strain"][0])))
                if "taxonomy" in record.annotations:
                    for taxon in record.annotations["taxonomy"]:
                        genomeGraph.add((genomeURI, coreURI["taxonomy"], Literal(taxon)))
                # Emptied so the generic annotation loop below does not store
                # the taxonomy a second time on the contig.
                record.annotations["taxonomy"] = []

                # Genome sequence: a sequence made only of X/N is treated as
                # absent.
                sequence = str(record.seq)
                if not sequence.replace("X", "").replace("N", ""):
                    sequence = ""

                # Record annotations
                for annot in record.annotations:
                    value = record.annotations[annot]
                    predicate = coreURI[annot.lower()]
                    if not isinstance(value, list):
                        int_add(gbkURI, predicate, str(value))
                    elif annot == "references":
                        for reference in value:
                            if reference.pubmed_id != "":
                                pubmedURI = coreURI["pubmed/" + reference.pubmed_id]
                                genomeGraph.add((gbkURI, predicate, pubmedURI))
                                # Every attribute of the reference object is
                                # stored as a string literal.
                                for key, attribute in vars(reference).items():
                                    genomeGraph.add((pubmedURI, coreURI[key.lower()], Literal(str(attribute))))
                                genomeGraph.add((pubmedURI, RDF.type, pubmedClass))
                    else:
                        for item in value:
                            int_add(gbkURI, predicate, str(item))

                # End of record-level information
                if len(sequence) > 0:
                    genomeGraph.add((gbkURI, coreURI["sequence"], Literal(sequence)))
                genomeGraph.add((genomeURI, RDF.type, genomeClass))
                genomeGraph.add((gbkURI, RDF.type, typeClass))
                for key in feature.qualifiers:
                    genomeGraph.add((gbkURI, coreURI[key.lower()], Literal(feature.qualifiers[key][0])))
                continue

            # The rest of the record's features
            feature_type = feature.type
            end = str(feature.location.end).replace(">", "").replace("<", "")
            start = str(feature.location.start).replace(">", "").replace("<", "")
            strand = str(feature.location.strand)

            # NOTE(review): a feature whose location has no strand is not
            # written to the graph. This keeps the only syntactically valid
            # reading of the original if/else pairing; confirm it is intended.
            if strand == 'None':
                continue

            if feature.type == "misc_feature":
                # A misc_feature is linked to the contig unless it lies
                # completely inside the previous non-misc feature.
                miscURI = _feature_uri(genome, typ, feature_type, start, end, strand)
                # TODO: Check if biopython has an overlap function...
                contained = int(prevObjStart) <= int(start) and int(end) <= int(prevObjStop)
                if not contained:
                    genomeGraph.add((gbkURI, coreURI["feature"], miscURI))
                    genomeGraph.add((miscURI, RDF.type, miscClass))
                store_general_information(miscURI, feature, record)
                continue

            # NOTE(review): everything below is taken to apply to
            # non-misc features only; confirm against the original layout.
            prevObjStart = start
            prevObjStop = end

            typeURI = _feature_uri(genome, typ, feature_type, start, end, strand)
            # Contig specific connection
            genomeGraph.add((gbkURI, coreURI["feature"], typeURI))
            store_general_information(typeURI, feature, record)

            for subfeature in feature.sub_features:
                sub_strand = str(subfeature.location.strand)
                sub_end = str(subfeature.location.end).replace(">", "").replace("<", "")
                sub_start = str(subfeature.location.start).replace(">", "").replace("<", "")
                subURI = _feature_uri(genome, typ, subfeature.type, sub_start, sub_end, sub_strand)
                genomeGraph.add((typeURI, coreURI["feature"], subURI))
                # The parent feature is passed along as the superfeature.
                store_general_information(subURI, subfeature, record, feature)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
230
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def store_general_information(generalURI,feature,record,superfeature=""):
    """Write one feature's location, type, qualifiers and protein to the graph.

    Parameters:
        generalURI:   URI of the feature; subject of the triples added here.
        feature:      the feature (or sub-feature) to describe.
        record:       the record the feature belongs to; supplies the sequence.
        superfeature: parent feature when *feature* is a sub-feature; its
                      ``transl_table`` is then used for CDS translation.

    The value following ``-sourcedb`` in ``sys.argv`` is stored as the
    feature's source database. For CDS features the protein is linked through
    a URI derived from the MD5 of its sequence.
    """
    proteinClass = createClass(coreURI["Protein"], root=True)
    sequence = str(record.seq)
    cds_sequence = str(feature.extract(sequence))
    feature_type = feature.type
    end = str(feature.location.end).replace(">", "").replace("<", "")
    start = str(feature.location.start).replace(">", "").replace("<", "")
    strand = str(feature.location.strand)
    if strand == "None":
        strand = 0

    genomeGraph.add((generalURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb") + 1])))

    # Biopython counts from 0 where humans count from 1, hence the +1 on the
    # start coordinate. On the reverse strand begin/end are swapped.
    if strand == "-1":
        genomeGraph.add((generalURI, coreURI["end"], Literal(int(start) + 1)))
        genomeGraph.add((generalURI, coreURI["begin"], Literal(int(end))))
    else:
        genomeGraph.add((generalURI, coreURI["begin"], Literal(int(start) + 1)))
        genomeGraph.add((generalURI, coreURI["end"], Literal(int(end))))
    genomeGraph.add((generalURI, coreURI["strand"], Literal(int(strand))))

    # misc_feature entries get neither a sequence nor a class of their own.
    if feature.type != "misc_feature":
        try:
            genomeGraph.add((generalURI, coreURI["sequence"], Literal(cds_sequence)))
        except Exception:
            print ("wrong?")
        class_name = feature_type.lower().title()
        genomeGraph.add((generalURI, RDF.type, createClass(coreURI[class_name], root=False)))
        # RNA classes are placed under "Rna" by subClassOfBuilderRna();
        # everything else is registered as a direct subclass of Feature.
        if feature_type.lower() not in ("rrna", "trna", "tmrna", "ncrna"):
            SubClassOfDict[class_name] = 1

    # Qualifiers; the translation is stored on the protein, not the feature.
    for key in feature.qualifiers:
        values = feature.qualifiers[key]
        if key == "translation":
            continue
        if isinstance(values, list):
            for v in values:
                int_add(generalURI, coreURI[key.lower()], v)
        else:
            int_add(generalURI, coreURI[key.lower()], values)

    if feature.type != "CDS":
        return

    # Translation table: taken from the parent feature for a sub-feature,
    # otherwise from the feature itself; table 11 when none is given.
    table_source = superfeature if superfeature else feature
    try:
        codon = table_source.qualifiers["transl_table"][0]
    except (KeyError, IndexError):
        codon = "11"

    # Protein linkage
    translation = ""
    try:
        translation = feature.qualifiers["translation"][0].strip("*")
    except KeyError:
        # No protein sequence given: translate the nucleotide sequence.
        location = feature.location
        if len(location.parts) > 1:
            # Multi-part location: concatenate the bases position by position.
            seq = Seq("".join(record.seq[position] for position in location))
            if int(location.strand) == -1:
                seq = seq.complement()
            table = feature.qualifiers.get("transl_table", [codon])[0]
            translation = str(seq.translate(table))
        elif int(location.strand) == -1:
            region = record.seq[location.nofuzzy_start:location.nofuzzy_end]
            protein = str(region.reverse_complement().translate(codon))
            # Kept only when it is more than stop codons and the region is a
            # whole number of codons.
            if protein.strip("*") and len(region) % 3 == 0:
                translation = protein
        elif int(location.strand) == +1:
            region = record.seq[location.nofuzzy_start:location.nofuzzy_end]
            if len(region) % 3 == 0:
                translation = str(region.translate(codon))

    if not translation:
        return

    # The first residue is always written as methionine.
    translation = ("M" + translation[1:]).strip("*")
    # The hash is taken over the UTF-8 bytes; the literal stores the text.
    md5_protein = hashlib.md5(translation.encode('utf-8')).hexdigest()
    proteinURI = coreURI["protein/" + md5_protein]
    genomeGraph.add((generalURI, coreURI["protein"], proteinURI))
    # NOTE(review): the protein node is only described when the translation
    # came from the feature's qualifiers; a computed translation is linked
    # but not described. Confirm whether that is intended.
    if "translation" in feature.qualifiers:
        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(translation)))
        genomeGraph.add((proteinURI, RDF.type, proteinClass))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
333
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def int_add(subject, predicate, obj):
    """Add ``(subject, predicate, obj)`` to the graph, as an integer if possible.

    Double quotes are stripped from *obj*. If the remaining text parses as an
    integer it is stored as an integer literal of that exact value; any other
    text is stored as a plain string literal.
    """
    text = str(obj).replace('"', '')
    try:
        value = int(text)
    except ValueError:
        # Not an integer (including decimal numbers): keep it as text.
        value = text
    genomeGraph.add((subject, predicate, Literal(value)))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
344
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def save():
    """Serialize ``genomeGraph`` as Turtle into the file given with ``-output``."""
    data = genomeGraph.serialize(format='turtle')
    # NOTE(review): serialize() is assumed to return bytes on some rdflib
    # releases and text on others; text is encoded so the binary write below
    # works either way.
    if isinstance(data, str):
        data = data.encode("utf-8")
    with open(sys.argv[sys.argv.index("-output") + 1], "wb") as handle:
        handle.write(data)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
348
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def subClassOfBuilder():
    """Declare every class named in ``SubClassOfDict`` a subclass of Feature.

    For each name, Feature is (re)stated as a subclass of ``owl:Thing`` and
    the named class as a subclass of Feature.
    """
    feature_class = coreURI["Feature"]
    for class_name in SubClassOfDict:
        genomeGraph.add((feature_class, RDFS.subClassOf, OWL.Thing))
        genomeGraph.add((coreURI[class_name], RDFS.subClassOf, feature_class))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
353
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def subClassOfBuilderRna():
    """Declare every class named in ``SubClassOfDictRna`` a subclass of Rna.

    For each name the hierarchy ``owl:Thing > Feature > Rna > name`` is
    stated, and both Rna and the named class are declared as ``owl:Class``.
    """
    feature_class = coreURI["Feature"]
    rna_class = coreURI["Rna"]
    for class_name in SubClassOfDictRna:
        genomeGraph.add((feature_class, RDFS.subClassOf, OWL.Thing))
        genomeGraph.add((rna_class, RDFS.subClassOf, feature_class))
        # Rna is declared as a class like the other classes in the graph.
        genomeGraph.add((rna_class, RDF.type, OWL.Class))
        genomeGraph.add((coreURI[class_name], RDFS.subClassOf, rna_class))
        genomeGraph.add((coreURI[class_name], RDF.type, OWL.Class))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
361
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def main():
    """Run the conversion: parse the input, build the class tree, save.

    The scratch directory created by ``tmp()`` is removed even when one of
    the steps raises.
    """
    tmp()
    try:
        gbk_parser()
        subClassOfBuilder()
        subClassOfBuilderRna()
        save()
    finally:
        cleantmp()


if __name__ == "__main__":
    main()