comparison sparql_import.py @ 2:62cfd14e2520 draft

Uploaded
author saml
date Wed, 21 Nov 2012 12:21:20 -0500
parents
children 4b4bbcf5db31
comparison
equal deleted inserted replaced
1:5972a5799e8f 2:62cfd14e2520
1 #!/usr/bin/python
2 # --------------------------------------------------------
3 # A Galaxy plugin for querying external SPARQL Endpoints
4 # Samuel Lampa, samuel.lampa@gmail.com
5 # Created: 2012-11-16
6 # --------------------------------------------------------
7
8 from xml.etree import ElementTree as et
9 from optparse import OptionParser
10 import urllib, sys, re
11
12 # -----------------------
13 # Option parsing
14 # -----------------------
15
# Declare the three command line options this tool understands.
parser = OptionParser()
parser.add_option("-u", "--url",
                  help = "The URL to the SPARQL endpoint")
parser.add_option("-q", "--sparql_query",
                  help = "A SPARQL query to send to a SPARQL endpoint")
parser.add_option("-o", "--output_file",
                  help = "An output file for storing the results")
(options, args) = parser.parse_args()

# All three options are mandatory; stop with a hint as soon as one is missing.
if not options.url:
    sys.exit("You have to specify an URL! Use the -h flag to view command line options!")
if not options.sparql_query:
    sys.exit("You have to specify a SPARQL query! Use the -h flag to view command line options!")
if not options.output_file:
    sys.exit("You have to specify an output file! Use the -h flag to view command line options!")

# Reject queries shorter than 9 characters and echo the offending query back.
# NOTE(review): the threshold of 9 is not explained in this file -- confirm.
if len(options.sparql_query) < 9:
    sys.exit("Your SPARQL query is too short (printed below)!\n" + options.sparql_query)

# Require a real scheme prefix. The previous "^http" pattern also accepted
# strings such as "httpfoo", and its message did not mention https although
# https URLs were accepted.
if not options.url.startswith(("http://", "https://")):
    sys.exit("The URL has to start with 'http://' or 'https://'! Please try again!")
37
38 # -----------------------
39 # The main code
40 # -----------------------
41
def main():
    '''Send the SPARQL query to the endpoint and store the result as tabular text

    Reads url, sparql_query and output_file from the module-level "options"
    object. The response is converted with extract_xml() and
    xml_to_tabular() and written to the output file.

    Exits through sys.exit(), echoing the raw response, when the response
    does not contain a "<sparql" marker.
    '''
    # Restore characters that arrive as placeholder tokens (presumably
    # substituted by Galaxy before the command line is built -- confirm).
    # The __cr__ and __cn__ tokens are both replaced by a single space.
    placeholders = (
        ("__oc__", "{"),
        ("__ob__", "["),
        ("__cc__", "}"),
        ("__cb__", "]"),
        ("__cr__", " "),
        ("__cn__", " "),
    )
    sparql_query = options.sparql_query
    for token, replacement in placeholders:
        sparql_query = sparql_query.replace(token, replacement)
    sparql_query = urllib.quote_plus(sparql_query)

    # Create SPARQL query URL. Use "&" when the endpoint URL already carries
    # a query string, so the "query" parameter is appended instead of
    # producing a second "?".
    url = options.url
    separator = "&" if "?" in url else "?"
    sparql_query_url = url + separator + "query=" + sparql_query

    # Read from SPARQL Endpoint; close the connection even if read() fails.
    sparql_endpoint = urllib.urlopen(sparql_query_url)
    try:
        results = sparql_endpoint.read()
    finally:
        sparql_endpoint.close()

    # Bail out early when the response does not look like a SPARQL result.
    if "<sparql" not in results:
        sys.exit("No SPARQL content found in returned data!\nReturned data:\n" + "-"*80 + "\n" + results)

    # Convert to tabular format
    tabular = xml_to_tabular( extract_xml( results ) )

    # Under Python 2, ElementTree returns unicode objects for non-ASCII text;
    # writing those to a plain file raises UnicodeEncodeError, so encode
    # explicitly before writing.
    if not isinstance(tabular, str):
        tabular = tabular.encode("utf-8")

    # Print to file; close the handle even if write() fails.
    of = open(options.output_file, "w")
    try:
        of.write(tabular)
    finally:
        of.close()
75
76 # -----------------------
77 # Helper methods
78 # -----------------------
79
def extract_xml( content ):
    '''Extract the part of the document starting with <sparql ...

    Returns everything from the first occurrence of "<sparql" up to the end
    of content, newlines included.

    Raises ValueError when content contains no "<sparql" marker.
    '''
    # re.DOTALL lets "." span newlines, so the match runs to the end of content.
    match = re.search("<sparql.*", content, re.DOTALL)
    if match is None:
        raise ValueError("No '<sparql' element found in the supplied content")
    return match.group(0)
84
def xml_to_tabular( xmldata ):
    '''Convert SPARQL result set XML format to tabular text

    Every <result> element becomes one line. A line holds the text of the
    first child element of each <binding>, separated by tabs and terminated
    by a newline. A binding without a child element, or whose child has no
    text, contributes an empty field.

    Returns an empty string when the document has no <results> element.
    '''
    root = et.fromstring(xmldata)

    # Locate <results> by its local tag name (ignoring any "{namespace}"
    # prefix) instead of relying on it being the second child of the root.
    results = None
    for child in root:
        if child.tag.rsplit("}", 1)[-1] == "results":
            results = child
            break
    if results is None:
        return ""

    # Elements are iterated directly; Element.getchildren() is deprecated
    # and no longer exists in Python 3.9 and later.
    lines = []
    for result in results:
        line_bits = []
        for binding in result:
            values = list(binding)
            text = values[0].text if values else None
            # An empty element has text None, which str.join() would reject.
            line_bits.append(text or "")
        lines.append("\t".join(line_bits) + "\n")
    return "".join(lines)
97
# Run the query only when this file is executed as a script.
if __name__ == "__main__":
    main()