6
|
1 #!/usr/bin/env python3.4
|
|
2 # Author: Jasper Jan Koehorst
|
|
3 # Date created: Jan 22 2015
|
|
4 # Function: generation of a RDF file from a genome fasta file
|
|
5
|
|
6 def delete_galaxy():
|
|
7 import sys
|
|
8 for index, path in enumerate(sys.path):
|
|
9 if "galaxy-dist/" in path:
|
|
10 sys.path[index] = ''
|
|
11
|
|
12 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
|
|
13 delete_galaxy()
|
|
14
|
|
15 # from io import StringIO
|
|
16 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
|
|
17 # import rdflib
|
|
18 from rdflib.store import Store
|
|
19 import sys
|
|
20
|
|
21 store = plugin.get('IOMemory', Store)()
|
|
22
|
|
23 global URI
|
|
24 URI = "http://csb.wur.nl/genome/"
|
|
25 global seeAlso
|
|
26 seeAlso = "rdfs:seeAlso"
|
|
27 global coreURI
|
|
28 coreURI = Namespace(URI)
|
|
29
|
|
30 def createClass(uri):
|
|
31 genomeGraph.add((uri,RDF.type,OWL.Class))
|
|
32 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
|
|
33 return uri
|
|
34
|
|
35 def fasta_parser(input_file):
|
|
36 createClass(coreURI["Genome"]) #Genome class
|
|
37 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold)
|
|
38
|
|
39 genomeDict = {}
|
|
40
|
|
41 #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds
|
|
42 # regex = re.compile('\[type=(.*?)\]')
|
|
43 sequence = ""
|
|
44 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
|
|
45 if genomeID == 'None':
|
|
46 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
|
|
47
|
|
48 genomeURI = coreURI[genomeID]
|
|
49 for index, element in enumerate(sys.argv):
|
|
50 if '-organism' == element:
|
|
51 genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
|
|
52 if '-ncbi_taxid' == element:
|
|
53 genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
|
|
54 if '-idtag' == element:
|
|
55 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
|
|
56 if '-ids' == element:
|
|
57 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
|
|
58
|
|
59 genomeDict[genomeID] = {}
|
|
60 # typDict = {"plasmid":0,"scaffold":0,"chromosome":0}
|
|
61
|
|
62 #Generating genome dictionary
|
|
63 data = open(input_file).readlines()
|
|
64 fastadict = {}
|
|
65 key = ""
|
|
66 for index, line in enumerate(data):
|
|
67 if ">" == line[0]:
|
|
68 key = line.strip(">").strip()
|
|
69 fastadict[key] = ""
|
|
70 else:
|
|
71 fastadict[key] += line.strip()
|
|
72
|
|
73 # for line in fastadict:
|
|
74 # typ = regex.findall(line)
|
|
75 # value = 0
|
|
76 #If something is found
|
|
77 # if len(typ) > 0:
|
|
78 # typ = typ[0]
|
|
79 #If something is not found
|
|
80 # elif typ == []:
|
|
81 # typ = "scaffold"
|
|
82 #If something is found but does not contain a value
|
|
83 # elif "_" in typ:
|
|
84 # value = typ.split("_")[-1]
|
|
85 # try:
|
|
86 # value = int(value)
|
|
87 # except:
|
|
88 # value = 1
|
|
89 #Not a integer
|
|
90
|
|
91 #If a value is not given it is automatically assigned as the first one
|
|
92 #If a value is given...
|
|
93 # if value > -1:
|
|
94 #If a second scaffold of a chromosome_1 is found
|
|
95 # if typ in genomeDict[genome]:
|
|
96 #Retrieve how many
|
|
97 # value = len(genomeDict[genome][typ]) + 1
|
|
98 # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]}
|
|
99 # else:
|
|
100 # genomeDict[genome][typ] = {}
|
|
101 # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]}
|
|
102
|
|
103 #Genome dictionary to TTL
|
|
104 genomeClass = createClass(coreURI["Genome"])
|
|
105 typeClass = createClass(coreURI["DnaObject"])
|
|
106 for index, genome in enumerate(fastadict):
|
|
107 # for typ in genomeDict[genome]:
|
|
108 # for scaf in genomeDict[genome][typ]:
|
|
109 # for con in genomeDict[genome][typ][scaf]:
|
|
110 #A note is required here...
|
|
111 #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig.
|
|
112 #Previously it was
|
|
113 # Genome > Class > Scaffold > Contig
|
|
114 #Now it will be
|
|
115 # Genome > Class/Scaffold/Contig
|
|
116 #typeURI = coreURI[genome + "/" + typ]
|
|
117 #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf]
|
|
118 #Was contigURI
|
|
119 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con]
|
|
120 # sequence = genomeDict[genome][typ][scaf][con]
|
|
121 sequence = fastadict[genome]
|
|
122 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
|
|
123 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
|
|
124 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence)))
|
|
125 genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
|
|
126 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
|
|
127 genomeGraph.add((genomeURI, RDF.type,genomeClass))
|
|
128 genomeGraph.add((typeURI, RDF.type,typeClass))
|
|
129
|
|
130 def save():
|
|
131 data = genomeGraph.serialize(format='turtle')
|
|
132 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
|
|
133
|
|
134 def main():
|
|
135 store = plugin.get('IOMemory', Store)()
|
|
136 global genomeGraph
|
|
137 genomeGraph = Graph(store,URIRef(URI))
|
|
138 genomeGraph.bind("ssb",coreURI)
|
|
139 input_file = sys.argv[sys.argv.index("-input")+1]
|
|
140 fasta_parser(input_file)
|
|
141 save()
|
|
142
|
|
143 if __name__ == '__main__':
|
|
144 main()
|
|
145
|