annotate conversion/gbk2rdf/gbktordf.py @ 23:48fda68e50b1

Merge
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 19:24:30 +0100
parents 74b8ba5e2d5b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python3.4
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
2 # Author: Jasper Jan Koehorst
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
3 # Date created: Feb 21 2015
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
4 # Function: generation of a RDF file from Genbank/EMBL
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
5
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
6 import warnings
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
7 warnings.filterwarnings("ignore")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
8
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def delete_galaxy():
    """Blank out every ``sys.path`` entry that points into a Galaxy checkout.

    Entries containing the substring ``"galaxy-dist/"`` are overwritten in
    place with the empty string, so the length and order of ``sys.path``
    are left unchanged.
    """
    import sys
    galaxy_slots = [
        position
        for position, entry in enumerate(sys.path)
        if "galaxy-dist/" in entry
    ]
    for position in galaxy_slots:
        sys.path[position] = ''


# Galaxy ships some of the modules that RDFLib needs, and those copies break
# the RDF import. Masking the Galaxy entries is not elegant, but it works for
# now.
delete_galaxy()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
17
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
18 from Bio import SeqIO
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
19 # Import RDFLib's default Graph implementation.
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
20 import os, sys
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
21 from Bio.Seq import Seq
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
22
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
24 from rdflib.store import Store
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
25 import hashlib
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
# In-memory rdflib store obtained through the plugin registry.
store = plugin.get('IOMemory', Store)()

# Base URI under which every resource of the generated graph is minted,
# and the rdflib namespace built from it.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# Registries of class names seen while parsing, keyed by class name.
# The code in this file only ever stores the value 1 in them.
SubClassOfDict = dict()
SubClassOfDictRna = dict()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
39
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def createClass(uri, root=True):
    """Declare *uri* as an ``owl:Class`` in the global ``genomeGraph``.

    uri  -- the resource to declare.
    root -- when true, the class is additionally made a direct subclass of
            ``owl:Thing``.

    Returns *uri* unchanged so the call can be used inline.
    """
    statements = [(uri, RDF.type, OWL.Class)]
    if root:
        statements.append((uri, RDFS.subClassOf, OWL.Thing))
    for statement in statements:
        genomeGraph.add(statement)
    return uri
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
45
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def tmp():
    """Create a fresh private scratch directory and publish it as ``tmpFolder``.

    The path is stored in the module-level ``tmpFolder`` and always ends
    with a ``/`` so callers can append file names directly.

    ``tempfile.mkdtemp`` is used instead of a timestamp-derived name under
    ``/tmp/``: the name is unpredictable, two runs started at the same
    moment cannot collide, and the platform's temporary directory is
    honoured.
    """
    import tempfile
    global tmpFolder
    tmpFolder = tempfile.mkdtemp() + "/"
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
51
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def cleantmp():
    """List and then delete the scratch directory named by ``tmpFolder``.

    The visible (non-dot) entries of the directory are printed one per
    line, after which the whole tree is removed. A directory that is
    already gone is not an error.

    The work is done with ``os.listdir`` and ``shutil.rmtree`` rather than
    by concatenating the path into ``ls`` / ``rm -rf`` shell commands, so
    paths containing spaces or shell metacharacters are handled safely.
    """
    import shutil
    try:
        entries = sorted(os.listdir(tmpFolder))
    except OSError:
        # Nothing to list when the directory is missing or unreadable.
        entries = []
    for entry in entries:
        if not entry.startswith("."):
            print(entry)
    shutil.rmtree(tmpFolder, ignore_errors=True)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
55
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def crawler():
    """Entry point: convert the file named after ``-input`` on the command line.

    ``gbk_parser`` reads the ``-input`` argument from ``sys.argv`` itself,
    so it is called without arguments. Passing the file name positionally
    to a parser that declares no parameters raises ``TypeError``.
    """
    # From the input it looks for a GBK file (gz files are in progress).
    gbk_parser()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
60
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def _position_text(position):
    """Return *position* as text with the ``<`` / ``>`` boundary markers removed."""
    return str(position).replace(">", "").replace("<", "")


def _feature_uri(genome, typ, feature_type, start, end, strand):
    """Build a feature URI; features on strand ``"-1"`` put the end coordinate first."""
    if strand == "-1":
        first, second = end, start
    else:
        first, second = start, end
    return coreURI[genome + "/" + typ + "/" + feature_type + "/gbk/" + str(first) + "_" + str(second)]


def gbk_parser(input_file=None):
    """Parse a GenBank/EMBL file and load it into the global ``genomeGraph``.

    input_file -- path of the file to parse. When omitted, the value that
                  follows ``-input`` in ``sys.argv`` is used.

    Command-line arguments read from ``sys.argv``:
      -input       file to parse (only when *input_file* is not given)
      -format      format name handed to ``SeqIO.parse``
      -identifier  optional genome identifier; without it the ``organism``
                   qualifier of the first feature is used

    For every record, the first feature supplies the genome-level and
    record-level triples. Every later feature is stored as a feature of
    the record through ``store_general_information``.

    Raises ``ValueError`` / ``IndexError`` when a required command-line
    argument is missing.
    """
    global genomeGraph
    prevObjStart = -1
    prevObjStop = -1
    store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    if input_file is None:
        input_file = sys.argv[sys.argv.index("-input") + 1]

    # CLASS definitions
    genomeClass = createClass(coreURI["Genome"], root=True)
    typeClass = createClass(coreURI["DnaObject"], root=True)
    createClass(coreURI["Protein"], root=True)
    pubmedClass = createClass(coreURI["Pubmed"], root=True)
    miscClass = createClass(coreURI["MiscFeature"], root=False)
    createClass(coreURI["Feature"], root=True)
    SubClassOfDict["MiscFeature"] = 1
    for rna_class in ("Trna", "Rrna", "Tmrna", "Ncrna"):
        SubClassOfDictRna[rna_class] = 1

    # Characters that are replaced by "_" in the genome identifier.
    # Raw string: the backslash is a literal character, not an escape.
    weird_chars = list(r''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
    scaf_value = 0
    formatGBK = sys.argv[sys.argv.index("-format") + 1]

    for record in SeqIO.parse(input_file, formatGBK):
        # The first feature carries the genome name and general information.
        for index, feature in enumerate(record.features):
            if index == 0:
                if "-identifier" in sys.argv:
                    genome = sys.argv[sys.argv.index("-identifier") + 1]
                else:
                    try:
                        genome = feature.qualifiers["organism"][0].replace(" ", "_")
                    except (KeyError, IndexError):
                        # TEMP FIX: use the Galaxy -identifier option to capture this.
                        genome = "XNoneX"
                # NOTE(review): sanitising is applied to both identifier
                # sources; confirm against the original nesting.
                for char in weird_chars:
                    genome = genome.replace(char, "_")

                # Record identifier: GI number, else first accession, else a
                # running scaffold counter.
                try:
                    typ = str(record.annotations["gi"])
                except KeyError:
                    try:
                        typ = str(record.annotations["accessions"][0])
                    except (KeyError, IndexError):
                        scaf_value += 1
                        typ = "scaffold_" + str(scaf_value)

                genomeURI = coreURI[genome]
                gbkURI = coreURI[genome + "/" + typ]
                # Genome-to-contig connection to attach all data to it.
                genomeGraph.add((genomeURI, coreURI["dnaobject"], gbkURI))

                # General genome features are also stored on the genome.
                # The guard tests the same key that is read below.
                if "organism" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["organism"], Literal(feature.qualifiers["organism"][0])))
                if "strain" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["strain"], Literal(feature.qualifiers["strain"][0])))
                if "taxonomy" in record.annotations:
                    for taxon in record.annotations["taxonomy"]:
                        genomeGraph.add((genomeURI, coreURI["taxonomy"], Literal(taxon)))
                    # Emptied so the generic annotation loop below does not
                    # store the taxonomy a second time on the record.
                    record.annotations["taxonomy"] = []

                # Genome sequence; a sequence made only of X / N is treated
                # as empty.
                sequence = str(record.seq)
                if not sequence.replace("X", "").replace("N", ""):
                    sequence = ""

                # Record annotations
                for annot in record.annotations:
                    value = record.annotations[annot]
                    predicate = coreURI[annot.lower()]
                    if not isinstance(value, list):
                        int_add(gbkURI, predicate, str(value))
                    elif annot == "references":
                        for references in value:
                            # References without a PubMed id are skipped, so
                            # the URI is always built from this reference's
                            # own id.
                            if references.pubmed_id != "":
                                pubmedURI = coreURI["pubmed/" + references.pubmed_id]
                                genomeGraph.add((gbkURI, predicate, pubmedURI))
                                obj_dict = references.__dict__
                                for key in obj_dict:
                                    genomeGraph.add((pubmedURI, coreURI[key.lower()], Literal(str(obj_dict[key]))))
                                    genomeGraph.add((pubmedURI, RDF.type, pubmedClass))
                    else:
                        for a in value:
                            int_add(gbkURI, predicate, str(a))

                # End of record-level data.
                if sequence:
                    genomeGraph.add((gbkURI, coreURI["sequence"], Literal(sequence)))
                # NOTE(review): the two type triples are assumed to be
                # unconditional, not nested under the sequence check; confirm.
                genomeGraph.add((genomeURI, RDF.type, genomeClass))
                genomeGraph.add((gbkURI, RDF.type, typeClass))
                for key in feature.qualifiers:
                    genomeGraph.add((gbkURI, coreURI[key.lower()], Literal(feature.qualifiers[key][0])))
            else:
                # The rest of the GBK file.
                feature_type = feature.type
                end = _position_text(feature.location.end)
                start = _position_text(feature.location.start)
                strand = str(feature.location.strand)

                if strand == 'None':
                    # NOTE(review): features without a strand are not stored
                    # here, as in the original control flow; confirm this is
                    # intended.
                    strand = 0
                elif feature.type == "misc_feature":
                    # Stored relative to the previous non-misc feature.
                    miscURI = _feature_uri(genome, typ, feature_type, start, end, strand)

                    # TODO: Check if biopython has an overlap function...
                    # A misc_feature lying entirely inside the previous
                    # feature is not linked to the record.
                    contained = int(prevObjStart) <= int(start) and int(end) <= int(prevObjStop)
                    if not contained:
                        genomeGraph.add((gbkURI, coreURI["feature"], miscURI))
                        genomeGraph.add((miscURI, RDF.type, miscClass))

                    store_general_information(miscURI, feature, record)
                else:
                    prevObjStart = start
                    prevObjStop = end

                    typeURI = _feature_uri(genome, typ, feature_type, start, end, strand)

                    # Contig-specific connection
                    genomeGraph.add((gbkURI, coreURI["feature"], typeURI))
                    store_general_information(typeURI, feature, record)

                    # NOTE(review): falls back to no sub-features when the
                    # attribute is absent; confirm against the Biopython
                    # version in use.
                    for subfeature in getattr(feature, "sub_features", []):
                        sub_strand = str(subfeature.location.strand)
                        sub_end = _position_text(subfeature.location.end)
                        sub_start = _position_text(subfeature.location.start)
                        subURI = _feature_uri(genome, typ, subfeature.type, sub_start, sub_end, sub_strand)
                        genomeGraph.add((typeURI, coreURI["feature"], subURI))
                        store_general_information(subURI, subfeature, record, feature)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
221
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
222
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
223 def store_general_information(generalURI,feature,record,superfeature=""):
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
224 proteinClass = createClass(coreURI["Protein"], root=True)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
225 sequence = str(record.seq)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
226 cds_sequence = str(feature.extract(sequence))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
227 #Fixes the 0 count instead of 1-count in biopython vs humans
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
228 feature_type = feature.type
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
229 end = str(feature.location.end).replace(">","").replace("<","")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
230 start = str(feature.location.start).replace(">","").replace("<","")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
231 strand = str(feature.location.strand)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
232 if strand == "None":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
233 strand = 0
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
234
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
236
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
237 if strand == "-1":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
240 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
244 if feature.type != "misc_feature":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
245 try:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
247 except: #When protein sequence is not given for whatever reason
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
248 print ("wrong?")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
249
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
250 if feature.type == "misc_feature":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
251 pass
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
252 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
255 SubClassOfDict[feature_type.lower().title()] = 1
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
256 for key in feature.qualifiers:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
257 values = feature.qualifiers[key]
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
258 if key == "translation":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
259 pass
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
260 elif type(values) == list:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
261 for v in values:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
262 int_add(generalURI,coreURI[key.lower()],v)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
263 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
264 int_add(generalURI,coreURI[key.lower()],values)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
265 if feature.type == "CDS":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
266 try:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
267 #Feature is normally submitted to this function
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
268 #IF a subfeature is submitted it is submitted as a feature
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
269 #And subfeature variable will contain the superfeature
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
270 if superfeature:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
271 codon = superfeature.qualifiers["transl_table"][0]
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
272 except:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
273 #Default codon table 11
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
274 codon = "11"
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
275 #Protein linkage
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
276 translation = ""
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
277 try:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
278 translation = feature.qualifiers["translation"][0].strip("*")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
279 except KeyError:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
280 #When protein sequence is not given...
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
281 if len(feature.location.parts) > 1:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
282 #Exon boundaries?
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
283 seq = ''
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
284 for loc in feature.location:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
285 seq += record.seq[loc]
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
286 if int(feature.location.strand) == -1:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
287 seq = Seq(seq).complement()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
288 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
289 seq = Seq(seq)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
290 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
291 elif int(feature.location.strand) == -1:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
295 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
296 translation = ''
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
297 elif int(feature.location.strand) == +1:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
300 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
301 translation = ''
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
302
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
303 if translation:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
304 translation = list(translation)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
305 translation[0] = "M"
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
306 translation = ''.join(translation).strip("*")
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
307 if "*" in translation:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
308 pass
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
309
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
310 translation = translation.encode('utf-8')
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
311 md5_protein = hashlib.md5(translation).hexdigest()
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
312 proteinURI = coreURI["protein/"+md5_protein]
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
314 for key in feature.qualifiers:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
315 for v in feature.qualifiers[key]:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
316 if key == "translation":
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
319 genomeGraph.add((proteinURI,RDF.type,proteinClass))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
320 else:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
321 for v in feature.qualifiers[key]:
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
322 int_add(generalURI,coreURI[key.lower()],v)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
323
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def int_add(subject, predicate, obj):
    """Add a (subject, predicate, obj) triple, typing numeric values.

    Double quotes are removed from ``obj``. The cleaned text is stored as an
    integer literal when it parses as an int, as a float literal when it
    parses as a finite float, and as a plain string literal otherwise.

    Args:
        subject: Node used as the subject of the triple.
        predicate: Node used as the predicate of the triple.
        obj: Value as text; it must provide ``replace`` (i.e. be a ``str``).

    Raises:
        AttributeError: if ``obj`` has no ``replace`` method.
    """
    import math

    text = obj.replace('"', '')
    try:
        # int() is tried first so whole numbers stay exact integers instead
        # of passing through a float, which loses precision on large values.
        value = int(text)
    except ValueError:
        try:
            value = float(text)
        except ValueError:
            # Not numeric at all: keep the cleaned text.
            value = text
        else:
            # float() also accepts spellings such as "nan" and "inf"; those
            # are kept as text rather than turned into non-finite numbers.
            if not math.isfinite(value):
                value = text
    genomeGraph.add((subject, predicate, Literal(value)))
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
334
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def save():
    """Serialize ``genomeGraph`` as Turtle and write it to the output file.

    The output path is the command-line argument that follows ``-output``
    in ``sys.argv``.

    Raises:
        ValueError: if ``-output`` is not present in ``sys.argv``.
        IndexError: if ``-output`` is the last command-line argument.
    """
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib version, serialize() returns bytes or str;
    # the file is opened in binary mode, so text is encoded first.
    if isinstance(data, str):
        data = data.encode("utf-8")
    output_path = sys.argv[sys.argv.index("-output") + 1]
    # The context manager closes the handle even if the write fails.
    with open(output_path, "wb") as handle:
        handle.write(data)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
338
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def subClassOfBuilder():
    """Declare every class named in SubClassOfDict a subclass of Feature.

    For each key of ``SubClassOfDict`` two triples are added to
    ``genomeGraph``: Feature is a subclass of ``OWL.Thing``, and the class
    named by the key is a subclass of Feature. Nothing is added when the
    dictionary is empty.
    """
    for class_name in SubClassOfDict:
        feature_node = coreURI["Feature"]
        hierarchy = (
            (feature_node, RDFS.subClassOf, OWL.Thing),
            (coreURI[class_name], RDFS.subClassOf, feature_node),
        )
        for triple in hierarchy:
            genomeGraph.add(triple)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
343
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def subClassOfBuilderRna():
    """Place every class named in SubClassOfDictRna beneath Rna.

    For each key of ``SubClassOfDictRna`` four triples are added to
    ``genomeGraph``: Feature is a subclass of ``OWL.Thing``, Rna is a
    subclass of Feature, the class named by the key is a subclass of Rna,
    and that class is typed as ``OWL.Class``. Nothing is added when the
    dictionary is empty.
    """
    for rna_name in SubClassOfDictRna:
        feature_node = coreURI["Feature"]
        rna_node = coreURI["Rna"]
        rna_subclass = coreURI[rna_name]
        hierarchy = (
            (feature_node, RDFS.subClassOf, OWL.Thing),
            (rna_node, RDFS.subClassOf, feature_node),
            (rna_subclass, RDFS.subClassOf, rna_node),
            (rna_subclass, RDF.type, OWL.Class),
        )
        for triple in hierarchy:
            genomeGraph.add(triple)
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
350
74b8ba5e2d5b aragorn addition
jjkoehorst <jasperkoehorst@gmail.com>
parents:
diff changeset
def main():
    """Run the whole conversion from start to finish.

    Calls, in order: ``tmp()``, ``gbk_parser()``, ``subClassOfBuilder()``,
    ``subClassOfBuilderRna()`` and ``save()``, then ``cleantmp()``.
    Any exception raised by a step propagates to the caller.
    """
    tmp()
    try:
        gbk_parser()
        subClassOfBuilder()
        subClassOfBuilderRna()
        save()
    finally:
        # Run the cleanup step even when parsing or saving fails.
        # NOTE(review): presumably cleantmp() removes what tmp() created;
        # confirm against their definitions.
        cleantmp()


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()