annotate sparql_import.py @ 4:ee072a7d271b draft

Uploaded
author saml
date Wed, 21 Nov 2012 12:53:24 -0500
parents 4b4bbcf5db31
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
62cfd14e2520 Uploaded
saml
parents:
diff changeset
1 #!/usr/bin/python
62cfd14e2520 Uploaded
saml
parents:
diff changeset
2 # --------------------------------------------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
3 # A Galaxy plugin for querying external SPARQL Endpoints
62cfd14e2520 Uploaded
saml
parents:
diff changeset
4 # Samuel Lampa, samuel.lampa@gmail.com
62cfd14e2520 Uploaded
saml
parents:
diff changeset
5 # Created: 2012-11-16
62cfd14e2520 Uploaded
saml
parents:
diff changeset
6 # --------------------------------------------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
7
62cfd14e2520 Uploaded
saml
parents:
diff changeset
8 from xml.etree import ElementTree as et
62cfd14e2520 Uploaded
saml
parents:
diff changeset
9 from optparse import OptionParser
62cfd14e2520 Uploaded
saml
parents:
diff changeset
10 import urllib, sys, re
62cfd14e2520 Uploaded
saml
parents:
diff changeset
11
62cfd14e2520 Uploaded
saml
parents:
diff changeset
12 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
13 # The main code
62cfd14e2520 Uploaded
saml
parents:
diff changeset
14 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
15
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def main():
    '''Run a SPARQL query against an endpoint and save the result as a table.

    Reads the endpoint URL, the query and the output path from the command
    line (see parse_options), fetches the query result over HTTP, converts
    the SPARQL result XML to tab-separated text and writes it to the output
    file. Exits with an error message when the response contains no
    "<sparql" content.

    Uses urllib.quote_plus and urllib.urlopen, i.e. the Python 2 urllib API.
    '''
    # Parse command line options (positional arguments are not used)
    options, _unused_args = parse_options()

    # Extract command line options; the query arrives with some characters
    # replaced by __xx__ placeholder tokens, which are restored before the
    # query is URL-encoded.
    sparql_query = restore_escaped_chars( options.sparql_query )
    sparql_query = urllib.quote_plus(sparql_query)
    url = options.url
    output_file = options.output_file

    # Create SPARQL query URL. If the endpoint URL already carries a query
    # string, the query has to be appended with "&" rather than a second "?".
    if "?" not in url:
        separator = "?"
    elif url.endswith(("?", "&")):
        separator = ""
    else:
        separator = "&"
    sparql_query_url = url + separator + "query=" + sparql_query

    # Read from SPARQL Endpoint; always close the connection, even if the
    # read fails.
    sparql_endpoint = urllib.urlopen(sparql_query_url)
    try:
        results = sparql_endpoint.read()
    finally:
        sparql_endpoint.close()

    # Convert to tabular format
    if "<sparql" not in results:
        sys.exit("No SPARQL content found in returned data!\nReturned data:\n" + "-"*10 + "\n" + results)
    xmldata = extract_xml( results )
    tabular = xml_to_tabular( xmldata )

    # Print to file; the with-block closes the file even if the write fails.
    with open(output_file, "w") as of:
        of.write(tabular)
62cfd14e2520 Uploaded
saml
parents:
diff changeset
46
62cfd14e2520 Uploaded
saml
parents:
diff changeset
47 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
48 # Helper methods
62cfd14e2520 Uploaded
saml
parents:
diff changeset
49 # -----------------------
62cfd14e2520 Uploaded
saml
parents:
diff changeset
50
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def extract_xml( content ):
    '''Return the part of content that starts at the first "<sparql" marker.

    Everything before the marker (for example an XML declaration or other
    leading output) is dropped; everything from the marker up to the end of
    content is returned unchanged.

    Raises ValueError if content contains no "<sparql" marker.
    '''
    start = content.find("<sparql")
    if start < 0:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("No '<sparql' element found in content")
    return content[start:]
62cfd14e2520 Uploaded
saml
parents:
diff changeset
55
62cfd14e2520 Uploaded
saml
parents:
diff changeset
def xml_to_tabular( xmldata ):
    '''Convert SPARQL result set XML format to tabular text.

    xmldata is an XML string whose root element contains a "results"
    element; each child of "results" becomes one output line, and each
    child (binding) of that row becomes one tab-separated cell holding the
    text of the binding's first child wrapped in "<" and ">".

    Returns the lines joined together, each terminated by a newline. An
    empty string is returned when there is no "results" element or it has
    no children.
    '''
    root = et.fromstring(xmldata)

    # Find <results> by its local tag name (ignoring any "{namespace}"
    # prefix) instead of relying on it being the second child of the root.
    results = None
    for child in root:
        if child.tag.rsplit('}', 1)[-1] == 'results':
            results = child
            break
    if results is None:
        return ""

    lines = []
    for result in results:
        line_bits = []
        # Elements are iterated directly; Element.getchildren() no longer
        # exists in current Python versions.
        for binding in result:
            values = list(binding)
            # An empty value element has text None; render it as empty.
            text = values[0].text if values else None
            line_bits.append('<' + (text or '') + '>')
        lines.append("\t".join(line_bits) + "\n")
    return "".join(lines)
62cfd14e2520 Uploaded
saml
parents:
diff changeset
68
3
4b4bbcf5db31 Uploaded
saml
parents: 2
diff changeset
def restore_escaped_chars( sparql_query ):
    '''Replace the __xx__ placeholder tokens in a query with real characters.

    The tokens are substituted one after another in the fixed order listed
    below, and the resulting query string is returned.
    '''
    # (token, replacement) pairs; the order is significant because each
    # substitution operates on the output of the previous one.
    substitutions = (
        ("__oc__", "{"),
        ("__ob__", "["),
        ("__cc__", "}"),
        ("__cb__", "]"),
        ("__cr__", " "),
        ("__cn__", " "),
        ("__at__", "@"),
    )
    restored = sparql_query
    for token, character in substitutions:
        restored = restored.replace(token, character)
    return restored
4b4bbcf5db31 Uploaded
saml
parents: 2
diff changeset
78
4b4bbcf5db31 Uploaded
saml
parents: 2
diff changeset
def parse_options(argv=None):
    '''Parse and validate the command line options.

    argv is an optional list of argument strings; when it is None the
    arguments are taken from sys.argv[1:].

    Returns the (options, args) pair produced by OptionParser, where
    options carries the url, sparql_query and output_file attributes.

    Exits via sys.exit with an explanatory message when an option is
    missing, the query is shorter than 9 characters, or the URL does not
    start with "http://" or "https://".
    '''
    parser = OptionParser()
    parser.add_option("-u", "--url",
                      help = "The URL to the SPARQL endpoint")
    parser.add_option("-q", "--sparql_query",
                      help = "A SPARQL query to send to a SPARQL endpoint")
    parser.add_option("-o", "--output_file",
                      help = "An output file for storing the results")
    (options, args) = parser.parse_args(argv)

    if not options.url:
        sys.exit("You have to specify an URL! Use the -h flag to view command line options!")
    if not options.sparql_query:
        sys.exit("You have to specify a SPARQL query! Use the -h flag to view command line options!")
    if not options.output_file:
        sys.exit("You have to specify an output file! Use the -h flag to view command line options!")

    if len(options.sparql_query) < 9:
        sys.exit("Your SPARQL query is too short (printed below)!\n" + options.sparql_query)

    # Require a complete scheme prefix; a bare "http" prefix check would let
    # strings such as "httpfoo" through.
    if not options.url.startswith(("http://", "https://")):
        sys.exit("The URL has to start with 'http://' or 'https://'! Please try again!")

    return options, args
4b4bbcf5db31 Uploaded
saml
parents: 2
diff changeset
103
2
62cfd14e2520 Uploaded
saml
parents:
diff changeset
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()