Mercurial > repos > saml > semweb_tools
diff sparql_import.py @ 2:62cfd14e2520 draft
Uploaded
author | saml |
---|---|
date | Wed, 21 Nov 2012 12:21:20 -0500 |
parents | |
children | 4b4bbcf5db31 |
line wrap: on
line diff
#!/usr/bin/python
# --------------------------------------------------------
# A Galaxy plugin for querying external SPARQL Endpoints
# Samuel Lampa, samuel.lampa@gmail.com
# Created: 2012-11-16
# --------------------------------------------------------

from xml.etree import ElementTree as et
from optparse import OptionParser
import io
import re
import sys

try:  # Python 3
    from urllib.parse import quote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote_plus, urlopen

# -----------------------
# Option parsing
# -----------------------

parser = OptionParser()
parser.add_option("-u", "--url",
                  help="The URL to the SPARQL endpoint")
parser.add_option("-q", "--sparql_query",
                  help="A SPARQL query to send to a SPARQL endpoint")
parser.add_option("-o", "--output_file",
                  help="An output file for storing the results")

# Placeholder tokens found in the incoming query and the text each one is
# replaced with before the query is sent.
_PLACEHOLDERS = (
    ("__oc__", "{"),
    ("__ob__", "["),
    ("__cc__", "}"),
    ("__cb__", "]"),
    ("__cr__", " "),
    ("__cn__", " "),
)


def _parse_options(argv=None):
    '''Parse and validate the command line, exiting with a message on error.

    argv -- list of arguments to parse; None means sys.argv[1:].

    Returns the optparse values object with url, sparql_query and
    output_file set. Parsing happens here rather than at module level so
    that importing this file has no side effects.
    '''
    (options, _args) = parser.parse_args(argv)

    if not options.url:
        sys.exit("You have to specify an URL! Use the -h flag to view command line options!")
    if not options.sparql_query:
        sys.exit("You have to specify a SPARQL query! Use the -h flag to view command line options!")
    if not options.output_file:
        sys.exit("You have to specify an output file! Use the -h flag to view command line options!")

    if len(options.sparql_query) < 9:
        sys.exit("Your SPARQL query is too short (printed below)!\n" + options.sparql_query)

    # Require a real scheme prefix; a bare "^http" match also let through
    # strings such as "httpfoo".
    if not options.url.startswith(("http://", "https://")):
        sys.exit("The URL has to start with 'http://' or 'https://'! Please try again!")

    return options


def _decode_placeholders(sparql_query):
    '''Return the query with every placeholder token replaced by its text.'''
    for token, replacement in _PLACEHOLDERS:
        sparql_query = sparql_query.replace(token, replacement)
    return sparql_query


def _build_query_url(url, sparql_query):
    '''Return the endpoint URL with the URL-encoded query appended.

    Uses "&" instead of "?" when the endpoint URL already carries a query
    string, so existing parameters are kept intact.
    '''
    separator = "&" if "?" in url else "?"
    return url + separator + "query=" + quote_plus(sparql_query)


def _local_name(tag):
    '''Strip a leading "{namespace}" from an ElementTree tag name.'''
    return tag.rsplit("}", 1)[-1]

# -----------------------
# The main code
# -----------------------


def main():
    '''Run the query given on the command line and store the result as TSV.'''
    options = _parse_options()

    sparql_query = _decode_placeholders(options.sparql_query)
    sparql_query_url = _build_query_url(options.url, sparql_query)

    # Read from SPARQL Endpoint (try/finally because the Python 2 response
    # object is not a context manager)
    sparql_endpoint = urlopen(sparql_query_url)
    try:
        results = sparql_endpoint.read()
    finally:
        sparql_endpoint.close()

    # The response is a byte string on both Python 2 and 3
    if b"<sparql" not in results:
        if isinstance(results, str):
            shown = results
        else:
            shown = results.decode("utf-8", "replace")
        sys.exit("No SPARQL content found in returned data!\nReturned data:\n" + "-"*80 + "\n" + shown)

    # Convert to tabular format
    tabular = xml_to_tabular(extract_xml(results))
    if isinstance(tabular, bytes):
        # Python 2 yields a plain (ASCII-only) str when no value needed unicode
        tabular = tabular.decode("utf-8")

    # Write as UTF-8 so that non-ASCII values do not depend on the locale
    with io.open(options.output_file, "w", encoding="utf-8") as of:
        of.write(tabular)

# -----------------------
# Helper methods
# -----------------------


def extract_xml(content):
    '''Extract the part of the document starting with <sparql ...

    content -- the endpoint response, as text or as a byte string.

    Returns everything from the first "<sparql" to the end of content, in
    the same type as content. Raises ValueError if "<sparql" does not occur.
    '''
    # The pattern must have the same string type as the data it is applied to
    pattern = b"<sparql.*" if isinstance(content, bytes) else u"<sparql.*"
    match = re.search(pattern, content, re.DOTALL)
    if match is None:
        raise ValueError("No <sparql> element found in content")
    return match.group(0)


def xml_to_tabular(xmldata):
    '''Convert SPARQL result set XML format to tabular text

    xmldata -- a document whose root element holds a "results" child, which
               in turn holds one element per row, each with one child per
               column that wraps a single value element.

    Returns one line per row, the values joined by tabs and every line
    terminated by a newline. Returns an empty string when there is no
    "results" element. Empty values become empty fields.

    NOTE(review): columns are emitted in the order found in each row; if an
    endpoint leaves a column out of some rows, those rows will be shorter.
    '''
    root = et.fromstring(xmldata)

    # Look the result set up by name instead of assuming it is child [1]
    results = None
    for child in root:
        if _local_name(child.tag) == "results":
            results = child
            break
    if results is None:
        return ""

    lines = []
    for result in results:
        line_bits = []
        for binding in result:
            values = list(binding)
            text = values[0].text if values else None
            # .text is None for an empty element; join() needs strings
            line_bits.append(text or "")
        lines.append("\t".join(line_bits) + "\n")
    return "".join(lines)


if __name__ == '__main__':
    main()