Mercurial > repos > jjkoehorst > sapp
comparison fasta2rdf/fastatordf.py @ 6:ec73c34af97b
FASTA2RDF
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:19:42 +0100 |
parents | |
children | 3f4f1cd22a6a |
comparison
equal
deleted
inserted
replaced
4:47d1b27466ee | 6:ec73c34af97b |
---|---|
1 #!/usr/bin/env python3.4 | |
2 # Author: Jasper Jan Koehorst | |
3 # Date created: Jan 22 2015 | |
4 # Function: generation of a RDF file from a genome fasta file | |
5 | |
6 def delete_galaxy(): | |
7 import sys | |
8 for index, path in enumerate(sys.path): | |
9 if "galaxy-dist/" in path: | |
10 sys.path[index] = '' | |
11 | |
12 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. | |
13 delete_galaxy() | |
14 | |
15 # from io import StringIO | |
16 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin | |
17 # import rdflib | |
18 from rdflib.store import Store | |
19 import sys | |
20 | |
21 store = plugin.get('IOMemory', Store)() | |
22 | |
23 global URI | |
24 URI = "http://csb.wur.nl/genome/" | |
25 global seeAlso | |
26 seeAlso = "rdfs:seeAlso" | |
27 global coreURI | |
28 coreURI = Namespace(URI) | |
29 | |
30 def createClass(uri): | |
31 genomeGraph.add((uri,RDF.type,OWL.Class)) | |
32 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) | |
33 return uri | |
34 | |
35 def fasta_parser(input_file): | |
36 createClass(coreURI["Genome"]) #Genome class | |
37 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) | |
38 | |
39 genomeDict = {} | |
40 | |
41 #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds | |
42 # regex = re.compile('\[type=(.*?)\]') | |
43 sequence = "" | |
44 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") | |
45 if genomeID == 'None': | |
46 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") | |
47 | |
48 genomeURI = coreURI[genomeID] | |
49 for index, element in enumerate(sys.argv): | |
50 if '-organism' == element: | |
51 genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) | |
52 if '-ncbi_taxid' == element: | |
53 genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) | |
54 if '-idtag' == element: | |
55 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) | |
56 if '-ids' == element: | |
57 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) | |
58 | |
59 genomeDict[genomeID] = {} | |
60 # typDict = {"plasmid":0,"scaffold":0,"chromosome":0} | |
61 | |
62 #Generating genome dictionary | |
63 data = open(input_file).readlines() | |
64 fastadict = {} | |
65 key = "" | |
66 for index, line in enumerate(data): | |
67 if ">" == line[0]: | |
68 key = line.strip(">").strip() | |
69 fastadict[key] = "" | |
70 else: | |
71 fastadict[key] += line.strip() | |
72 | |
73 # for line in fastadict: | |
74 # typ = regex.findall(line) | |
75 # value = 0 | |
76 #If something is found | |
77 # if len(typ) > 0: | |
78 # typ = typ[0] | |
79 #If something is not found | |
80 # elif typ == []: | |
81 # typ = "scaffold" | |
82 #If something is found but does not contain a value | |
83 # elif "_" in typ: | |
84 # value = typ.split("_")[-1] | |
85 # try: | |
86 # value = int(value) | |
87 # except: | |
88 # value = 1 | |
89 #Not a integer | |
90 | |
91 #If a value is not given it is automatically assigned as the first one | |
92 #If a value is given... | |
93 # if value > -1: | |
94 #If a second scaffold of a chromosome_1 is found | |
95 # if typ in genomeDict[genome]: | |
96 #Retrieve how many | |
97 # value = len(genomeDict[genome][typ]) + 1 | |
98 # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]} | |
99 # else: | |
100 # genomeDict[genome][typ] = {} | |
101 # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]} | |
102 | |
103 #Genome dictionary to TTL | |
104 genomeClass = createClass(coreURI["Genome"]) | |
105 typeClass = createClass(coreURI["DnaObject"]) | |
106 for index, genome in enumerate(fastadict): | |
107 # for typ in genomeDict[genome]: | |
108 # for scaf in genomeDict[genome][typ]: | |
109 # for con in genomeDict[genome][typ][scaf]: | |
110 #A note is required here... | |
111 #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. | |
112 #Previously it was | |
113 # Genome > Class > Scaffold > Contig | |
114 #Now it will be | |
115 # Genome > Class/Scaffold/Contig | |
116 #typeURI = coreURI[genome + "/" + typ] | |
117 #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf] | |
118 #Was contigURI | |
119 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con] | |
120 # sequence = genomeDict[genome][typ][scaf][con] | |
121 sequence = fastadict[genome] | |
122 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) | |
123 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
124 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) | |
125 genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) | |
126 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
127 genomeGraph.add((genomeURI, RDF.type,genomeClass)) | |
128 genomeGraph.add((typeURI, RDF.type,typeClass)) | |
129 | |
130 def save(): | |
131 data = genomeGraph.serialize(format='turtle') | |
132 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) | |
133 | |
134 def main(): | |
135 store = plugin.get('IOMemory', Store)() | |
136 global genomeGraph | |
137 genomeGraph = Graph(store,URIRef(URI)) | |
138 genomeGraph.bind("ssb",coreURI) | |
139 input_file = sys.argv[sys.argv.index("-input")+1] | |
140 fasta_parser(input_file) | |
141 save() | |
142 | |
143 if __name__ == '__main__': | |
144 main() | |
145 |