view sparql_import.py @ 5:eee537fd4600 draft default tip

Uploaded
author saml
date Wed, 21 Nov 2012 12:55:30 -0500
parents 4b4bbcf5db31
children
line wrap: on
line source

#!/usr/bin/python
# --------------------------------------------------------
# A Galaxy plugin for querying external SPARQL Endpoints
# Samuel Lampa, samuel.lampa@gmail.com
# Created: 2012-11-16
# --------------------------------------------------------

from xml.etree import ElementTree as et
from optparse import OptionParser
import urllib, sys, re

# -----------------------
# The main code
# -----------------------

def main():
	# Parse command line options
	(options, args) = parse_options()

	# Extract command line options
	sparql_query = options.sparql_query
	sparql_query = restore_escaped_chars( sparql_query ) 
	sparql_query = urllib.quote_plus(sparql_query)
	url = options.url
	output_file = options.output_file

	# Create SPARQL query URL
	sparql_query_url = url + "?query=" + sparql_query

	# Read from SPARQL Endpoint
	sparql_endpoint = urllib.urlopen(sparql_query_url)
	results = sparql_endpoint.read()
	sparql_endpoint.close()

        # Convert to tabular format
        if "<sparql" in results:
                xmldata = extract_xml( results )
                tabular = xml_to_tabular( xmldata )
        else:
                sys.exit("No SPARQL content found in returned data!\nReturned data:\n" + "-"*10 + "\n" + results)

	# Print to file
	of = open(output_file, "w")
	of.write(tabular)
	of.close()

# -----------------------
# Helper methods
# -----------------------

def extract_xml( content ):
	'''Extract the part of the document starting with <?xml ...'''
	xmlcontent = re.search("<sparql.*", content, re.DOTALL).group(0)
	return xmlcontent

def xml_to_tabular( xmldata ):
	'''Convert SPARQL result set XML format to tabular text'''
	root = et.fromstring(xmldata)
	tree = et.ElementTree(root)
	tabular = ""

	results = root.getchildren()[1]
	for result in results:
		line_bits = ['<' + binding.getchildren()[0].text + '>' for binding in result.getchildren()]
		line = "\t".join(line_bits)
		tabular += line + "\n"	
	return tabular

def restore_escaped_chars( sparql_query ):
	sparql_query = sparql_query.replace("__oc__","{")
	sparql_query = sparql_query.replace("__ob__","[")
	sparql_query = sparql_query.replace("__cc__","}")
	sparql_query = sparql_query.replace("__cb__","]")
	sparql_query = sparql_query.replace("__cr__"," ")
	sparql_query = sparql_query.replace("__cn__"," ")
	sparql_query = sparql_query.replace("__at__","@")
	return sparql_query

def parse_options():
	parser = OptionParser()
	parser.add_option("-u", "--url",
		help = "The URL to the SPARQL endpoint")
	parser.add_option("-q", "--sparql_query",
		help = "A SPARQL query to send to a SPARQL endpoint")
	parser.add_option("-o", "--output_file",
		help = "An output file for storing the results")
	(options, args) = parser.parse_args()

	if not options.url:
		sys.exit("You have to specify an URL! Use the -h flag to view command line options!")
	if not options.sparql_query:
		sys.exit("You have to specify a SPARQL query! Use the -h flag to view command line options!")
	if not options.output_file:
		sys.exit("You have to specify an output file! Use the -h flag to view command line options!")

	if len(options.sparql_query) < 9: 
		sys.exit("Your SPARQL query is too short (printed below)!\n" + options.sparql_query)

	if not re.match("^http", options.url):
		sys.exit("The URL has to start with 'http://'! Please try again!")

	return options, args

if __name__ == '__main__':
	main()