annotate gbk2rdf/gbktordf.py @ 13:1efd1975a68d

cutadapters sample added
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 16:58:00 +0100
parents ec73c34af97b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
1 #!/usr/bin/env python3.4
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
2 # Author: Jasper Jan Koehorst
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
3 # Date created: Feb 21 2015
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
4 # Function: generation of a RDF file from Genbank/EMBL
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
5
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
6 import warnings
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
7 warnings.filterwarnings("ignore")
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
8
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def delete_galaxy():
    """Remove Galaxy's bundled library directories from ``sys.path``.

    Some modules that RDFLib requires are also shipped inside Galaxy
    (paths containing ``galaxy-dist/``) and those copies break the RDFLib
    import.  Every such search-path entry is dropped before RDFLib is
    imported.  This is not an elegant solution but it works for now.
    """
    # Local import: this function is called before the module-level imports.
    import sys

    # Drop the entries instead of blanking them: an empty string in sys.path
    # means "current working directory", so blanking silently made whatever
    # sits in the cwd importable.  Slice assignment mutates the existing list
    # object, which is the one the import machinery consults.
    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]


delete_galaxy()
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
17
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
18 from Bio import SeqIO
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
19 # Import RDFLib's default Graph implementation.
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
20 import os, sys
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
21 from Bio.Seq import Seq
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
22
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
24 from rdflib.store import Store
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
25 import hashlib
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
# Module-level in-memory rdflib store.  gbk_parser() builds its own store
# instance for the graph it creates.
store = plugin.get('IOMemory', Store)()

# Base URI under which every resource and predicate of the graph is minted.
URI = "http://csb.wur.nl/genome/"
# Prefixed name of rdfs:seeAlso (not referenced elsewhere in the visible source).
seeAlso = "rdfs:seeAlso"
# rdflib namespace wrapper around URI; coreURI["x"] yields URI + "x".
coreURI = Namespace(URI)

# Class names (used as keys, values are always 1) that subClassOfBuilder()
# declares as subclasses of Feature.
SubClassOfDict = {}
# Class names that subClassOfBuilderRna() declares as subclasses of Rna.
SubClassOfDictRna = {}
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
39
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def createClass(uri, root=True):
    """Declare *uri* as an ``owl:Class`` in the global ``genomeGraph``.

    uri  -- URI of the class to declare.
    root -- when true, the class is additionally made a direct subclass of
            ``owl:Thing``.

    Returns *uri* unchanged so the call can be used inline.
    """
    triples = [(uri, RDF.type, OWL.Class)]
    if root:
        triples.append((uri, RDFS.subClassOf, OWL.Thing))
    for triple in triples:
        genomeGraph.add(triple)
    return uri
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
45
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def tmp():
    """Create a private scratch directory and store its path in ``tmpFolder``.

    The directory is created with ``tempfile.mkdtemp`` so the name is unique
    and unpredictable; the former ``/tmp/<time.time()>`` scheme could collide
    between concurrently started jobs and was guessable by other users.

    The stored path keeps a trailing ``/`` because the rest of the module
    builds paths by string concatenation.

    Returns the path that was stored in the global ``tmpFolder``.
    """
    # Local import keeps this block independent of the module import list.
    import tempfile
    global tmpFolder
    tmpFolder = tempfile.mkdtemp() + "/"
    return tmpFolder
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
51
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def cleantmp():
    """Delete the scratch directory recorded in the global ``tmpFolder``.

    The directory is removed recursively.  A directory that no longer exists
    is ignored, mirroring the ``rm -rf`` the function used to shell out to.
    """
    # Local import keeps this block independent of the module import list.
    import shutil

    # shutil.rmtree takes the path as data; the previous os.system("rm -rf "
    # + path) passed it through a shell, so a path containing spaces or shell
    # metacharacters would delete the wrong thing.  The preceding debug "ls"
    # of the folder has been dropped along with the shell call.
    shutil.rmtree(tmpFolder, ignore_errors=True)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
55
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def crawler():
    """Check that an ``-input`` argument was given, then run ``gbk_parser``.

    Raises ValueError when ``-input`` is absent from ``sys.argv`` and
    IndexError when it is the last argument.
    """
    # From input folder it looks for GBK file (gz files are in progress)
    # The lookup is kept only to fail fast on a missing -input argument.
    sys.argv[sys.argv.index("-input") + 1]
    # gbk_parser() takes no parameters and reads -input from sys.argv itself;
    # passing the file name positionally raised TypeError on every call.
    gbk_parser()
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
60
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def _feature_uri(genome, typ, feature_type, start, end, strand):
    """Return the URI of a feature of *feature_type* on contig *typ*.

    The URI ends in ``<start>_<end>``; for reverse-strand features
    (``strand == "-1"``) the two coordinates are swapped.
    """
    if strand == "-1":
        first, second = end, start
    else:
        first, second = start, end
    return coreURI[genome + "/" + typ + "/" + feature_type + "/gbk/" + str(first) + "_" + str(second)]


def gbk_parser():
    """Parse the input GenBank/EMBL file into the global ``genomeGraph``.

    Command-line arguments read from ``sys.argv``:
      -input       path of the file to parse (required)
      -format      format name handed to ``SeqIO.parse`` (required)
      -identifier  optional genome name; when absent the ``organism``
                   qualifier of the first feature is used

    For every record the first feature provides the genome- and contig-level
    triples; every later feature that has a strand is stored through
    ``store_general_information``.  Features whose location has no strand are
    skipped.

    NOTE(review): the nesting of this function was reconstructed from a
    source listing that had lost its indentation -- confirm against the
    repository copy.
    """
    store = plugin.get('IOMemory', Store)()
    global genomeGraph
    genomeGraph = Graph(store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    input_file = sys.argv[sys.argv.index("-input") + 1]

    # CLASS definitions
    genomeClass = createClass(coreURI["Genome"], root=True)
    typeClass = createClass(coreURI["DnaObject"], root=True)
    createClass(coreURI["Protein"], root=True)
    pubmedClass = createClass(coreURI["Pubmed"], root=True)
    miscClass = createClass(coreURI["MiscFeature"], root=False)
    createClass(coreURI["Feature"], root=True)
    SubClassOfDict["MiscFeature"] = 1
    for rna_class in ("Trna", "Rrna", "Tmrna", "Ncrna"):
        SubClassOfDictRna[rna_class] = 1

    # Characters replaced by "_" in the genome name.  Raw string: the literal
    # contains a backslash that is not a valid escape sequence.
    weird_chars = list(r''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
    scaf_value = 0
    formatGBK = sys.argv[sys.argv.index("-format") + 1]

    for record in SeqIO.parse(input_file, formatGBK):
        # Extent of the last non-misc feature, used to decide whether a
        # misc_feature lies inside it.  Reset per record so a misc_feature is
        # never compared with a feature of the previous contig.
        prevObjStart = -1
        prevObjStop = -1

        for index, feature in enumerate(record.features):
            if index == 0:
                # Read first feature for genome name and information...
                if "-identifier" in sys.argv:
                    genome = sys.argv[sys.argv.index("-identifier") + 1]
                else:
                    try:
                        genome = feature.qualifiers["organism"][0].replace(" ", "_")
                    except (KeyError, IndexError):
                        # BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
                        genome = "XNoneX"
                for char in weird_chars:
                    genome = genome.replace(char, "_")

                # Contig identifier: GI number, else first accession, else a
                # running scaffold counter.
                try:
                    typ = str(record.annotations["gi"])
                except KeyError:
                    try:
                        typ = str(record.annotations["accessions"][0])
                    except (KeyError, IndexError):
                        scaf_value += 1
                        typ = "scaffold_" + str(scaf_value)

                genomeURI = coreURI[genome]
                gbkURI = coreURI[genome + "/" + typ]
                # To contig connection to connect all data to it
                genomeGraph.add((genomeURI, coreURI["dnaobject"], gbkURI))

                # General genome features also stored in the class...
                # (the original tested for a "genome" qualifier but then read
                # "organism"; the test now matches the key that is read)
                if "organism" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["organism"], Literal(feature.qualifiers["organism"][0])))
                if "strain" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["strain"], Literal(feature.qualifiers["strain"][0])))
                if "taxonomy" in record.annotations:
                    for taxon in record.annotations["taxonomy"]:
                        genomeGraph.add((genomeURI, coreURI["taxonomy"], Literal(taxon)))
                    # Emptied so the annotation loop below does not attach
                    # the taxonomy to the contig as well.
                    record.annotations["taxonomy"] = []

                # Genome sequence
                sequence = str(record.seq)
                # Verify if sequence was not empty and is now full of X or N
                if len(sequence.replace("X", "").replace("N", "")) == 0:
                    sequence = ""

                # Record parsing
                for annot in record.annotations:
                    value = record.annotations[annot]
                    predicate = coreURI[annot.lower()]
                    if not isinstance(value, list):
                        int_add(gbkURI, predicate, str(value))
                    elif annot == "references":
                        for reference in value:
                            if reference.pubmed_id != "":
                                pubmedURI = coreURI["pubmed/" + reference.pubmed_id]
                                genomeGraph.add((gbkURI, predicate, pubmedURI))
                                # Every attribute of the reference object
                                # becomes a literal on the pubmed node.
                                attributes = reference.__dict__
                                for key in attributes:
                                    genomeGraph.add((pubmedURI, coreURI[key.lower()], Literal(str(attributes[key]))))
                                genomeGraph.add((pubmedURI, RDF.type, pubmedClass))
                    else:
                        for item in value:
                            int_add(gbkURI, predicate, str(item))

                # END of RECORD
                if len(sequence) > 0:
                    genomeGraph.add((gbkURI, coreURI["sequence"], Literal(sequence)))
                genomeGraph.add((genomeURI, RDF.type, genomeClass))
                genomeGraph.add((gbkURI, RDF.type, typeClass))
                for key in feature.qualifiers:
                    genomeGraph.add((gbkURI, coreURI[key.lower()], Literal(feature.qualifiers[key][0])))
                continue

            # The rest of the GBK file
            feature_type = feature.type
            end = str(feature.location.end).replace(">", "").replace("<", "")
            start = str(feature.location.start).replace(">", "").replace("<", "")
            strand = str(feature.location.strand)

            if strand == 'None':
                # Features without a strand are not stored.
                continue

            if feature.type == "misc_feature":
                # Store as part of previous cds or something...
                miscURI = _feature_uri(genome, typ, feature_type, start, end, strand)
                # TODO: Check if biopython has an overlap function...
                # A misc_feature lying entirely inside the previous feature
                # is described but not linked to the contig.
                contained = int(prevObjStart) <= int(start) and int(end) <= int(prevObjStop)
                if not contained:
                    genomeGraph.add((gbkURI, coreURI["feature"], miscURI))
                    genomeGraph.add((miscURI, RDF.type, miscClass))
                store_general_information(miscURI, feature, record)
                continue

            prevObjStart = start
            prevObjStop = end
            typeURI = _feature_uri(genome, typ, feature_type, start, end, strand)

            # Contig specific connection
            genomeGraph.add((gbkURI, coreURI["feature"], typeURI))
            store_general_information(typeURI, feature, record)

            # NOTE(review): newer Biopython SeqFeature objects presumably no
            # longer carry sub_features -- confirm; a missing attribute is
            # treated as "no sub-features" instead of raising AttributeError.
            for subfeature in getattr(feature, "sub_features", None) or []:
                sub_strand = str(subfeature.location.strand)
                sub_end = str(subfeature.location.end).replace(">", "").replace("<", "")
                sub_start = str(subfeature.location.start).replace(">", "").replace("<", "")
                subURI = _feature_uri(genome, typ, subfeature.type, sub_start, sub_end, sub_strand)
                genomeGraph.add((typeURI, coreURI["feature"], subURI))
                store_general_information(subURI, subfeature, record, feature)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
221
6
ec73c34af97b FASTA2RDF
jjkoehorst <jasperkoehorst@gmail.com>
parents: 3
diff changeset
222
3
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def _translate_cds(feature, record, codon):
    """Translate the DNA under a CDS *feature* of *record*; "" when impossible.

    codon -- translation table used for single-part locations and as the
             fallback for multi-part ones lacking a ``transl_table`` qualifier.

    Single-part locations are only translated when their length is a
    multiple of three; a translation consisting solely of stop codons counts
    as no translation.
    """
    location = feature.location
    if len(location.parts) > 1:
        # Exon boundaries?  Iterating a location yields the positions one by
        # one; the reverse strand is handled by complementing the collected
        # bases, as the original code did.
        bases = Seq("".join(record.seq[position] for position in location))
        if location.strand == -1:
            bases = bases.complement()
        table = feature.qualifiers.get("transl_table", [codon])[0]
        return str(bases.translate(table))

    span = record.seq[location.nofuzzy_start:location.nofuzzy_end]
    if location.strand == -1:
        span = span.reverse_complement()
    elif location.strand != 1:
        # No usable strand (the original raised TypeError on int(None)).
        return ""
    if len(span) % 3 != 0:
        return ""
    translated = str(span.translate(codon))
    return translated if translated.strip("*") else ""


def store_general_information(generalURI, feature, record, superfeature=""):
    """Add the triples describing one feature to the global ``genomeGraph``.

    generalURI   -- URI of the feature node.
    feature      -- the feature (or sub-feature) to describe.
    record       -- the sequence record the feature belongs to.
    superfeature -- parent feature when *feature* is a sub-feature; its
                    ``transl_table`` qualifier then selects the codon table.

    Stored: source database (``-sourcedb`` argument), begin/end/strand, the
    nucleotide sequence and class (not for ``misc_feature``), all qualifiers
    except ``translation``, and for CDS features a link to a protein node
    keyed by the MD5 of the protein sequence.

    NOTE(review): the nesting of this function was reconstructed from a
    source listing that had lost its indentation -- confirm against the
    repository copy.
    """
    proteinClass = createClass(coreURI["Protein"], root=True)
    sequence = str(record.seq)
    cds_sequence = str(feature.extract(sequence))
    feature_type = feature.type
    end = str(feature.location.end).replace(">", "").replace("<", "")
    start = str(feature.location.start).replace(">", "").replace("<", "")
    strand = str(feature.location.strand)
    if strand == "None":
        strand = 0

    genomeGraph.add((generalURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb") + 1])))

    # Fixes the 0 count instead of 1-count in biopython vs humans: the lower
    # coordinate is shifted by one.  On the reverse strand begin and end swap.
    low = Literal(int(start) + 1)
    high = Literal(int(end))
    if strand == "-1":
        genomeGraph.add((generalURI, coreURI["end"], low))
        genomeGraph.add((generalURI, coreURI["begin"], high))
    else:
        genomeGraph.add((generalURI, coreURI["begin"], low))
        genomeGraph.add((generalURI, coreURI["end"], high))
    genomeGraph.add((generalURI, coreURI["strand"], Literal(int(strand))))

    if feature.type != "misc_feature":
        try:
            genomeGraph.add((generalURI, coreURI["sequence"], Literal(cds_sequence)))
        except Exception:
            print("wrong?")
        class_name = feature_type.lower().title()
        genomeGraph.add((generalURI, RDF.type, createClass(coreURI[class_name], root=False)))
        # RNA classes are placed in the hierarchy by subClassOfBuilderRna().
        if feature_type.lower() not in ("rrna", "trna", "tmrna", "ncrna"):
            SubClassOfDict[class_name] = 1

    for key in feature.qualifiers:
        if key == "translation":
            # The protein sequence goes on the protein node, not the feature.
            continue
        values = feature.qualifiers[key]
        if isinstance(values, list):
            for value in values:
                int_add(generalURI, coreURI[key.lower()], value)
        else:
            int_add(generalURI, coreURI[key.lower()], values)

    if feature.type != "CDS":
        return

    # Feature is normally submitted to this function.  IF a subfeature is
    # submitted it is submitted as a feature and superfeature holds its
    # parent, whose translation table is then used.  Default codon table 11.
    # (Previously codon stayed unbound when no superfeature was passed, which
    # raised NameError as soon as a CDS without /translation was translated.)
    codon = "11"
    table_source = superfeature if superfeature else feature
    try:
        codon = table_source.qualifiers["transl_table"][0]
    except (AttributeError, KeyError, IndexError):
        pass

    # Protein linkage
    try:
        translation = feature.qualifiers["translation"][0].strip("*")
    except (KeyError, IndexError):
        # When protein sequence is not given...
        translation = _translate_cds(feature, record, codon)

    if not translation:
        # Nothing to link: avoid minting a protein node for the empty string.
        return

    # The first residue is always written as methionine.
    translation = ("M" + translation[1:]).strip("*")
    md5_protein = hashlib.md5(translation.encode('utf-8')).hexdigest()
    proteinURI = coreURI["protein/" + md5_protein]
    genomeGraph.add((generalURI, coreURI["protein"], proteinURI))
    # The protein node is described whether the sequence came from the
    # /translation qualifier or was computed; before, a computed protein was
    # linked but left without md5, sequence and type.
    genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
    genomeGraph.add((proteinURI, coreURI["sequence"], Literal(translation)))
    genomeGraph.add((proteinURI, RDF.type, proteinClass))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
323
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def int_add(subject, predicate, obj):
    """Add ``(subject, predicate, obj)`` to ``genomeGraph`` with a typed literal.

    Double quotes are removed from *obj* first.  The remaining text is stored
    as an integer literal when it parses as an int, as a float literal when
    it parses as a finite float, and as a plain string literal otherwise.

    Note that purely numeric identifiers are therefore stored as numbers.
    """
    # Local import keeps this block independent of the module import list.
    import math

    # str() makes non-string objects acceptable; the original called
    # .replace() on them inside its except handler and crashed.
    text = str(obj).replace('"', '')
    try:
        value = int(text)
    except ValueError:
        # The original converted to float and int inside one try block, so a
        # decimal such as "1.5" failed on int() and was stored as a string;
        # its float branch could never run.
        try:
            value = float(text)
        except ValueError:
            value = text
        else:
            # "nan", "inf" and friends parse as floats but are ordinary words
            # in annotation text; keep them as strings.
            if not math.isfinite(value):
                value = text
    genomeGraph.add((subject, predicate, Literal(value)))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
334
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def save():
    """Serialize ``genomeGraph`` as Turtle to the path given after ``-output``.

    Raises ValueError when ``-output`` is absent from ``sys.argv``.
    """
    data = genomeGraph.serialize(format='turtle')
    # NOTE(review): depending on the rdflib version serialize() presumably
    # returns either bytes or str -- confirm; text is encoded so the binary
    # write below works in both cases.
    if isinstance(data, str):
        data = data.encode("utf-8")
    # The context manager closes (and flushes) the file deterministically;
    # the original left the handle to the garbage collector.
    with open(sys.argv[sys.argv.index("-output") + 1], "wb") as handle:
        handle.write(data)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
338
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def subClassOfBuilder():
    """Declare every class named in ``SubClassOfDict`` a subclass of Feature.

    For each name the triple ``Feature rdfs:subClassOf owl:Thing`` is
    (re-)asserted as well.
    """
    feature_class = coreURI["Feature"]
    for class_name in SubClassOfDict:
        child_class = coreURI[class_name]
        genomeGraph.add((feature_class, RDFS.subClassOf, OWL.Thing))
        genomeGraph.add((child_class, RDFS.subClassOf, feature_class))
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
343
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def subClassOfBuilderRna():
    """Declare every class named in ``SubClassOfDictRna`` a subclass of Rna.

    For each name the chain ``Rna -> Feature -> owl:Thing`` is (re-)asserted
    and the class itself is typed as ``owl:Class``.
    """
    feature_class = coreURI["Feature"]
    rna_class = coreURI["Rna"]
    for class_name in SubClassOfDictRna:
        child_class = coreURI[class_name]
        for triple in (
            (feature_class, RDFS.subClassOf, OWL.Thing),
            (rna_class, RDFS.subClassOf, feature_class),
            (child_class, RDFS.subClassOf, rna_class),
            (child_class, RDF.type, OWL.Class),
        ):
            genomeGraph.add(triple)
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
350
db04e12b8779 Uploaded
jjkoehorst
parents:
diff changeset
def main():
    """Run the conversion: parse the input, build the class tree, save.

    The scratch folder created by ``tmp()`` is removed even when one of the
    steps raises.
    """
    tmp()
    try:
        gbk_parser()
        subClassOfBuilder()
        subClassOfBuilderRna()
        save()
    finally:
        # Without the finally clause a failing step left the folder behind.
        cleantmp()


if __name__ == "__main__":
    main()