# HG changeset patch
# User fabad
# Date 1462222200 14400
# Node ID 9bb153d42a1c272c814df8b198802ab314a82738
# Parent c76273c080a18474da00b452326748a6c9f6bade
Uploaded
diff -r c76273c080a1 -r 9bb153d42a1c sparql_uniprot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sparql_uniprot.py Mon May 02 16:50:00 2016 -0400
@@ -0,0 +1,230 @@
+from SPARQLWrapper import SPARQLWrapper, JSON
+import sys
+
# Namespace prefixes shared by every query sent to the UniProt SPARQL
# endpoint.  Each declaration must carry its IRI: a bare "PREFIX up:" is
# not valid SPARQL.
COMMON_PREFIXES = """
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX ec: <http://purl.uniprot.org/enzyme/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
"""

# Names of the variables selected by the query.  printResults uses them,
# in this order, both as the column headers of the output file and as the
# keys looked up in every result binding.
paramList = [
    'protein',
    'proteinFullName',
    'geneName',
    'organismName',
    'diseaseAnnotationText',
    'domainFullName',
    'similarityAnnotationText',
    'locationAnnotationText',
    'functionAnnotationText',
    'pharmaceuticalAnnotationText',
]
+
def _escapeSparqlString(text):
    """Escape text so it can sit inside a double-quoted SPARQL string literal."""
    # The backslash must be handled first, otherwise the backslashes added
    # by the later replacements would themselves be doubled.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'),
                         ('\n', '\\n'), ('\r', '\\r'), ('\t', '\\t')):
        text = text.replace(raw, escaped)
    return text


def _fieldClause(variable, value, patterns):
    """Return the query lines that bind one output variable.

    With an empty value the triple patterns are wrapped in OPTIONAL, so
    proteins lacking the field are still returned.  With a non-empty value
    the patterns become mandatory and a case-insensitive regex filter on
    ?variable is appended.
    """
    if value == '':
        return (["  OPTIONAL {"]
                + ["    " + pattern for pattern in patterns]
                + ["  }"])
    regexFilter = '  filter( regex(str(?%s), "%s","i" )) .' % (
        variable, _escapeSparqlString(value))
    return ["  " + pattern for pattern in patterns] + [regexFilter]


def buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation):
    """Build the SPARQL query text for a UniProt protein search.

    Every argument is a string; the empty string means "no restriction".

    proteinId -- when non-empty, restricts ?protein to uniprotkb:<proteinId>.
    The remaining arguments are regular expressions matched
    case-insensitively against, respectively, the protein full name, gene
    name, organism scientific name, disease comment, domain full name and
    the similarity, subcellular-location, function and pharmaceutical
    annotation comments.

    Returns the query as a string: COMMON_PREFIXES followed by a
    "select distinct ... where{ ... }" statement.
    """
    # (selected variable, user-supplied filter, triple patterns binding it)
    fields = [
        ('proteinFullName', proteinName, [
            "?protein up:recommendedName ?proteinName .",
            "?proteinName up:fullName ?proteinFullName .",
        ]),
        ('geneName', geneName, [
            "?protein up:encodedBy ?gene .",
            "?gene skos:prefLabel ?geneName .",
        ]),
        ('organismName', organismName, [
            "?protein up:organism ?organism .",
            "?organism up:scientificName ?organismName .",
        ]),
        ('diseaseAnnotationText', diseaseAnnotation, [
            "?protein up:annotation ?diseaseAnnotation .",
            "?diseaseAnnotation a up:Disease_Annotation .",
            "?diseaseAnnotation up:disease ?disease .",
            "?disease rdfs:comment ?diseaseAnnotationText .",
        ]),
        ('domainFullName', domainName, [
            "?protein up:domain ?domain .",
            "?domain up:recommendedName ?domainName .",
            "?domainName up:fullName ?domainFullName .",
        ]),
        ('similarityAnnotationText', similarityAnnotation, [
            "?protein up:annotation ?similarityAnnotation .",
            "?similarityAnnotation a up:Similarity_Annotation .",
            "?similarityAnnotation rdfs:comment ?similarityAnnotationText .",
        ]),
        ('locationAnnotationText', locationAnnotation, [
            "?protein up:annotation ?locationAnnotation .",
            "?locationAnnotation a up:Subcellular_Location_Annotation .",
            "?locationAnnotation up:locatedIn ?location .",
            "?location up:cellularComponent ?cellComponent .",
            "?cellComponent rdfs:comment ?locationAnnotationText .",
        ]),
        ('functionAnnotationText', functionAnnotation, [
            "?protein up:annotation ?functionAnnotation .",
            "?functionAnnotation a up:Function_Annotation .",
            "?functionAnnotation rdfs:comment ?functionAnnotationText .",
        ]),
        ('pharmaceuticalAnnotationText', pharmaceuticalAnnotation, [
            "?protein up:annotation ?pharmaceuticalAnnotation .",
            "?pharmaceuticalAnnotation a up:Pharmaceutical_Annotation .",
            "?pharmaceuticalAnnotation rdfs:comment ?pharmaceuticalAnnotationText .",
        ]),
    ]

    lines = ["select distinct", "  ?protein"]
    lines.extend("  ?" + variable for variable, _, _ in fields)
    lines.append("where{")
    lines.append("  ?protein a up:Protein .")

    if proteinId != '':
        # NOTE(review): proteinId is interpolated verbatim as the local part
        # of a prefixed name; it is assumed to be a plain UniProt accession.
        lines.append("  VALUES ?protein {uniprotkb:" + proteinId + "}")

    for variable, value, patterns in fields:
        lines.append("")
        lines.extend(_fieldClause(variable, value, patterns))

    lines.append("}")
    return COMMON_PREFIXES + "\n".join(lines) + "\n"
+
def printResults(json, output):
    """Write the query result to a tab-separated text file.

    json   -- the converted SPARQL JSON result; its
              json["results"]["bindings"] entry is iterated, one output
              row per binding.
    output -- path of the file to create (overwritten if it exists).

    The first line holds the names in paramList.  Each following line
    holds, for every name in paramList, the "value" of that binding or an
    empty cell when the binding lacks it.  Every cell, including the last
    one of a line, is followed by a tab.
    """
    import io

    # io.open with an explicit encoding keeps non-ASCII annotation text
    # from failing on write under Python 2 or an ASCII locale, and the
    # with-block closes the file even if a write raises.
    with io.open(output, 'w', encoding='utf-8') as fileRes:
        # Header row.
        fileRes.write(u"".join(name + u"\t" for name in paramList) + u"\n")

        for entrada in json["results"]["bindings"]:
            cells = []
            for param in paramList:
                binding = entrada.get(param)
                cells.append(binding["value"] if binding else u"")
            fileRes.write(u"".join(cell + u"\t" for cell in cells) + u"\n")
+
def sparqlwrap(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation, output):
    """Run a UniProt SPARQL search and save the result to a file.

    The first ten arguments are passed unchanged to buildQuery to produce
    the query text; output is the path handed to printResults.  The query
    text, progress messages and the response metadata are printed to
    standard output along the way.
    """
    query = buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation)
    # Single-argument print(...) emits the same text whether it is parsed
    # as the Python 2 print statement or the Python 3 print function.
    print(query)

    # Client object bound to the address of the service that accepts
    # SPARQL queries and answers them.
    sparql = SPARQLWrapper('http://sparql.uniprot.org/sparql')

    # The query to run.
    sparql.setQuery(query)

    # Ask for the results as JSON (other formats such as XML, RDF or
    # Turtle exist; they are just different text serialisations).
    sparql.setReturnFormat(JSON)

    # Send the query to UniProt; the returned object still has to be
    # converted before use.
    print("Ejecutando query")
    results = sparql.query()

    # Convert the response into the format requested above (JSON).
    print("Conviertiendo a json")
    resultsJson = results.convert()
    print("Fin conversion a json")

    # results also carries information (metadata) about what the UniProt
    # server returned.
    print(results.info())

    # Write the results to the output file.
    printResults(resultsJson, output)
+
+
def main(argv):
    """Command-line entry point.

    argv is the full argument vector (program name first).  The eleven
    arguments after the program name are, in order: proteinId,
    proteinName, geneName, organismName, diseaseAnnotation, domainName,
    similarityAnnotation, locationAnnotation, functionAnnotation,
    pharmaceuticalAnnotation and the output file path.  Any further
    arguments are ignored.

    Exits with a usage message when fewer than eleven are given.
    """
    if len(argv) < 12:
        sys.exit(
            "Usage: %s proteinId proteinName geneName organismName "
            "diseaseAnnotation domainName similarityAnnotation "
            "locationAnnotation functionAnnotation "
            "pharmaceuticalAnnotation output" % argv[0])

    # Read the parameters from the command line.
    (proteinId, proteinName, geneName, organismName, diseaseAnnotation,
     domainName, similarityAnnotation, locationAnnotation,
     functionAnnotation, pharmaceuticalAnnotation, output) = argv[1:12]

    # Call the function that performs the query.
    sparqlwrap(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation, output)


# Only run the query when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
+