annotate sparql_import.py @ 2:62cfd14e2520 draft

Uploaded
author saml
date Wed, 21 Nov 2012 12:21:20 -0500
parents
children 4b4bbcf5db31
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
62cfd14e2520 Uploaded
saml
parents:
diff changeset
1 #!/usr/bin/python
62cfd14e2520 Uploaded
saml
parents:
diff changeset
2 # --------------------------------------------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
3 # A Galaxy plugin for querying external SPARQL Endpoints
62cfd14e2520 Uploaded
saml
parents:
diff changeset
4 # Samuel Lampa, samuel.lampa@gmail.com
62cfd14e2520 Uploaded
saml
parents:
diff changeset
5 # Created: 2012-11-16
62cfd14e2520 Uploaded
saml
parents:
diff changeset
6 # --------------------------------------------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
7
62cfd14e2520 Uploaded
saml
parents:
diff changeset
8 from xml.etree import ElementTree as et
62cfd14e2520 Uploaded
saml
parents:
diff changeset
9 from optparse import OptionParser
62cfd14e2520 Uploaded
saml
parents:
diff changeset
10 import urllib, sys, re
62cfd14e2520 Uploaded
saml
parents:
diff changeset
11
62cfd14e2520 Uploaded
saml
parents:
diff changeset
12 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
13 # Option parsing
62cfd14e2520 Uploaded
saml
parents:
diff changeset
14 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
15
62cfd14e2520 Uploaded
saml
parents:
diff changeset
parser = OptionParser()
parser.add_option("-u", "--url",
                  help="The URL to the SPARQL endpoint")
parser.add_option("-q", "--sparql_query",
                  help="A SPARQL query to send to a SPARQL endpoint")
parser.add_option("-o", "--output_file",
                  help="An output file for storing the results")
(options, args) = parser.parse_args()

# All three options are mandatory; stop at the first one that is missing.
if not options.url:
    sys.exit("You have to specify an URL! Use the -h flag to view command line options!")
if not options.sparql_query:
    sys.exit("You have to specify a SPARQL query! Use the -h flag to view command line options!")
if not options.output_file:
    sys.exit("You have to specify an output file! Use the -h flag to view command line options!")

# Queries shorter than 9 characters are rejected and echoed back to the user.
if len(options.sparql_query) < 9:
    sys.exit("Your SPARQL query is too short (printed below)!\n" + options.sparql_query)

# Require a real HTTP(S) scheme prefix; a bare "http" prefix check would
# also let through strings such as "httpfoo".
if not options.url.startswith(("http://", "https://")):
    sys.exit("The URL has to start with 'http://' or 'https://'! Please try again!")
62cfd14e2520 Uploaded
saml
parents:
diff changeset
37
62cfd14e2520 Uploaded
saml
parents:
diff changeset
38 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
39 # The main code
62cfd14e2520 Uploaded
saml
parents:
diff changeset
40 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
41
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def main():
    '''Query the SPARQL endpoint and store the result as tabular text

    Reads the module-level options (url, sparql_query, output_file),
    sends the query to the endpoint as a "query" URL parameter, converts
    the returned SPARQL XML via extract_xml() and xml_to_tabular(), and
    writes the resulting text to the output file as UTF-8.

    Exits through sys.exit() with the returned data in the message when
    the response does not contain "<sparql".
    '''
    # urllib was split into urllib.parse / urllib.request in Python 3;
    # fall back to the flat Python 2 module when those are unavailable.
    try:
        from urllib.parse import quote_plus
        from urllib.request import urlopen
    except ImportError:
        from urllib import quote_plus, urlopen

    # Restore the characters that arrive as "__xx__" placeholder tokens
    # (presumably produced by Galaxy's parameter sanitizing - confirm).
    # The two line-break tokens are turned into plain spaces. The order
    # of the replacements is kept as it was.
    sparql_query = options.sparql_query
    placeholders = (
        ("__oc__", "{"),
        ("__ob__", "["),
        ("__cc__", "}"),
        ("__cb__", "]"),
        ("__cr__", " "),
        ("__cn__", " "),
    )
    for token, replacement in placeholders:
        sparql_query = sparql_query.replace(token, replacement)
    sparql_query = quote_plus(sparql_query)

    url = options.url
    output_file = options.output_file

    # Create SPARQL query URL; do not add a second "?" when the endpoint
    # URL already carries a query string.
    if "?" not in url:
        separator = "?"
    elif url.endswith(("?", "&")):
        separator = ""
    else:
        separator = "&"
    sparql_query_url = url + separator + "query=" + sparql_query

    # Read from SPARQL Endpoint, closing the connection even if the
    # read fails.
    sparql_endpoint = urlopen(sparql_query_url)
    try:
        results = sparql_endpoint.read()
    finally:
        sparql_endpoint.close()

    # On Python 3 the response is bytes; decode it so the text handling
    # below works. On Python 2 the response is already str and is left
    # untouched. Assumes the endpoint answers in UTF-8.
    if not isinstance(results, str):
        results = results.decode("utf-8")

    # Convert to tabular format
    if "<sparql" not in results:
        sys.exit("No SPARQL content found in returned data!\nReturned data:\n" + "-"*80 + "\n" + results)
    xmldata = extract_xml(results)
    tabular = xml_to_tabular(xmldata)

    # Print to file. Encoding explicitly keeps non-ASCII values from
    # failing on the interpreter's default codec.
    if not isinstance(tabular, bytes):
        tabular = tabular.encode("utf-8")
    with open(output_file, "wb") as of:
        of.write(tabular)
62cfd14e2520 Uploaded
saml
parents:
diff changeset
75
62cfd14e2520 Uploaded
saml
parents:
diff changeset
76 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
77 # Helper methods
62cfd14e2520 Uploaded
saml
parents:
diff changeset
78 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
79
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def extract_xml(content):
    '''Extract the part of the document starting with <sparql ...

    Anything before the first "<sparql" (for example an XML declaration
    or other leading text) is dropped. Everything from there to the end
    of content is returned unchanged.

    Raises ValueError when content contains no "<sparql".
    '''
    start = content.find("<sparql")
    if start < 0:
        raise ValueError("No '<sparql' element found in the content")
    return content[start:]
62cfd14e2520 Uploaded
saml
parents:
diff changeset
84
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def xml_to_tabular(xmldata):
    '''Convert SPARQL result set XML format to tabular text

    Every child of the <results> element becomes one output line: the
    text of each binding's value, separated by tabs and terminated by a
    newline. No header line is written.

    When the <head> element declares variables, the columns follow that
    order and a variable that is unbound in a result yields an empty
    field, so all rows have the same number of columns. Without declared
    variables the bindings are written in document order.

    Returns an empty string when there is no <results> element or it
    has no children. Malformed XML raises the parser's ParseError.
    '''
    def local_name(element):
        # Tags look like "{namespace}name" when a namespace is declared
        return element.tag.rsplit("}", 1)[-1]

    root = et.fromstring(xmldata)

    # Locate the sections by tag name instead of relying on position
    variables = []
    results = None
    for section in root:
        section_name = local_name(section)
        if section_name == "head":
            variables = [variable.get("name") for variable in section
                         if local_name(variable) == "variable"]
        elif section_name == "results" and results is None:
            results = section
    if results is None:
        return ""

    lines = []
    for result in results:
        by_name = {}
        in_document_order = []
        for binding in result:
            terms = list(binding)
            # An empty term such as <literal/> has text None; a binding
            # without any term has no value at all. Both become "".
            value = (terms[0].text if terms else None) or ""
            by_name[binding.get("name")] = value
            in_document_order.append(value)
        if variables:
            line_bits = [by_name.get(name, "") for name in variables]
        else:
            line_bits = in_document_order
        lines.append("\t".join(line_bits) + "\n")
    return "".join(lines)
62cfd14e2520 Uploaded
saml
parents:
diff changeset
97
62cfd14e2520 Uploaded
saml
parents:
diff changeset
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()