# Reconstructed from a flattened Mercurial changeset patch ("Uploaded"):
#   User     fabad
#   Date     1462222200 14400 (Mon May 02 16:50:00 2016 -0400)
#   Node ID  9bb153d42a1c272c814df8b198802ab314a82738
#   Parent   c76273c080a18474da00b452326748a6c9f6bade
#   File     sparql_uniprot.py (new file)
"""Query the UniProt SPARQL endpoint and write the result as a tab-separated file.

Command-line usage (eleven positional arguments, pass '' to leave a filter
unset)::

    sparql_uniprot.py proteinId proteinName geneName organismName \
        diseaseAnnotation domainName similarityAnnotation \
        locationAnnotation functionAnnotation pharmaceuticalAnnotation output
"""
import io
import re
import sys

try:
    from SPARQLWrapper import SPARQLWrapper, JSON
except ImportError:
    # Building queries and writing results do not need the client library;
    # sparqlwrap() reports the missing dependency when it is actually required.
    SPARQLWrapper = None
    JSON = None

# Prefixes shared by every query.
# NOTE(review): the namespace IRIs were missing from the flattened source
# (a PREFIX without an IRI is not valid SPARQL); they have been restored from
# the namespaces published for the UniProt endpoint - confirm against the
# original file.
COMMON_PREFIXES = """
PREFIX up:<http://purl.uniprot.org/core/>
PREFIX keywords:<http://purl.uniprot.org/keywords/>
PREFIX uniprotkb:<http://purl.uniprot.org/uniprot/>
PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX ec:<http://purl.uniprot.org/enzyme/>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
PREFIX owl:<http://www.w3.org/2002/07/owl#>
PREFIX bibo:<http://purl.org/ontology/bibo/>
PREFIX dc:<http://purl.org/dc/terms/>
PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
PREFIX faldo:<http://biohackathon.org/resource/faldo#>
"""

# Names of the variables selected by the query, in output-column order.
paramList = [
    'protein',
    'proteinFullName',
    'geneName',
    'organismName',
    'diseaseAnnotationText',
    'domainFullName',
    'similarityAnnotationText',
    'locationAnnotationText',
    'functionAnnotationText',
    'pharmaceuticalAnnotationText',
]

# Characters accepted in a protein identifier; it is spliced into the query
# as the local part of a "uniprotkb:" prefixed name, so nothing else is safe.
_ACCESSION_RE = re.compile(r'^[A-Za-z0-9_-]+$')


def _sparqlString(text):
    """Escape *text* for use inside a double-quoted SPARQL string literal."""
    return (text.replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r'))


def _clause(patterns, variable, filterText):
    """Return the query fragment for one output column.

    With an empty *filterText* the triple *patterns* are wrapped in an
    OPTIONAL group, so rows lacking the value are still returned. Otherwise
    the patterns are required and a case-insensitive regex filter on
    ?*variable* is appended.
    """
    body = "".join("    " + pattern + "\n" for pattern in patterns)
    if filterText == '':
        return "  OPTIONAL {\n" + body + "  }\n\n"
    return (body
            + '    filter( regex(str(?' + variable + '), "'
            + _sparqlString(filterText) + '","i" )) .\n\n')


def buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation):
    """Build the SPARQL query text for the given filters.

    Every argument is a string; the empty string means "no filter".
    *proteinId* restricts ?protein to a single ``uniprotkb:`` resource; each
    other argument is applied as a case-insensitive regular expression to
    the corresponding selected variable.

    Returns the complete query, prefixes included.
    Raises ValueError if *proteinId* contains characters other than
    letters, digits, '_' or '-'.
    """
    if proteinId != '' and not _ACCESSION_RE.match(proteinId):
        raise ValueError("invalid UniProt protein identifier: %r" % (proteinId,))

    parts = [COMMON_PREFIXES, "select distinct\n"]
    parts.extend("  ?" + name + "\n" for name in paramList)
    parts.append("where{\n")
    parts.append("  ?protein a up:Protein .\n")
    if proteinId != '':
        parts.append("  VALUES ?protein {uniprotkb:" + proteinId + "}\n")
    parts.append("\n")

    parts.append(_clause([
        "?protein up:recommendedName ?proteinName .",
        "?proteinName up:fullName ?proteinFullName .",
    ], 'proteinFullName', proteinName))

    parts.append(_clause([
        "?protein up:encodedBy ?gene .",
        "?gene skos:prefLabel ?geneName .",
    ], 'geneName', geneName))

    parts.append(_clause([
        "?protein up:organism ?organism .",
        "?organism up:scientificName ?organismName .",
    ], 'organismName', organismName))

    parts.append(_clause([
        "?protein up:annotation ?diseaseAnnotation .",
        "?diseaseAnnotation a up:Disease_Annotation .",
        "?diseaseAnnotation up:disease ?disease .",
        "?disease rdfs:comment ?diseaseAnnotationText .",
    ], 'diseaseAnnotationText', diseaseAnnotation))

    parts.append(_clause([
        "?protein up:domain ?domain .",
        "?domain up:recommendedName ?domainName .",
        "?domainName up:fullName ?domainFullName .",
    ], 'domainFullName', domainName))

    # The filter variable must match the selected one exactly; a misspelt
    # name is an unbound variable and the filter would reject every row.
    parts.append(_clause([
        "?protein up:annotation ?similarityAnnotation .",
        "?similarityAnnotation a up:Similarity_Annotation .",
        "?similarityAnnotation rdfs:comment ?similarityAnnotationText .",
    ], 'similarityAnnotationText', similarityAnnotation))

    parts.append(_clause([
        "?protein up:annotation ?locationAnnotation .",
        "?locationAnnotation a up:Subcellular_Location_Annotation .",
        "?locationAnnotation up:locatedIn ?location .",
        "?location up:cellularComponent ?cellComponent .",
        "?cellComponent rdfs:comment ?locationAnnotationText .",
    ], 'locationAnnotationText', locationAnnotation))

    parts.append(_clause([
        "?protein up:annotation ?functionAnnotation .",
        "?functionAnnotation a up:Function_Annotation .",
        "?functionAnnotation rdfs:comment ?functionAnnotationText .",
    ], 'functionAnnotationText', functionAnnotation))

    parts.append(_clause([
        "?protein up:annotation ?pharmaceuticalAnnotation .",
        "?pharmaceuticalAnnotation a up:Pharmaceutical_Annotation .",
        "?pharmaceuticalAnnotation rdfs:comment ?pharmaceuticalAnnotationText .",
    ], 'pharmaceuticalAnnotationText', pharmaceuticalAnnotation))

    parts.append("}\n")
    return "".join(parts)


def printResults(json, output):
    """Write the query result *json* to the file *output* as tab-separated text.

    *json* is the converted JSON response; its ["results"]["bindings"] list
    is iterated. The first line holds the names in paramList; each following
    line holds one binding, with an empty field for every variable the
    binding does not provide. Every field, including the last one on a line,
    is followed by a tab. The file is written as UTF-8.
    """
    with io.open(output, 'w', encoding='utf-8') as handle:
        # Header row.
        handle.write(u"".join(name + u"\t" for name in paramList) + u"\n")

        for binding in json["results"]["bindings"]:
            fields = []
            for name in paramList:
                cell = binding.get(name)
                fields.append((cell["value"] if cell else u"") + u"\t")
            handle.write(u"".join(fields) + u"\n")


def sparqlwrap(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation, output):
    """Run the query for the given filters and write the results to *output*.

    The query text, progress messages and the response metadata are printed
    to standard output.

    Raises ImportError if the SPARQLWrapper package is not installed, and
    ValueError (from buildQuery) for an invalid *proteinId*.
    """
    if SPARQLWrapper is None:
        raise ImportError("the SPARQLWrapper package is required to query UniProt")

    query = buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation)
    print(query)

    # Client object bound to the address of the service that accepts SPARQL
    # queries and answers them.
    sparql = SPARQLWrapper('http://sparql.uniprot.org/sparql')
    sparql.setQuery(query)

    # Ask for the results as JSON.
    sparql.setReturnFormat(JSON)

    # This call performs the request against UniProt.
    print("Ejecutando query")
    results = sparql.query()

    # Convert the response into the format requested above.
    print("Conviertiendo a json")
    json = results.convert()
    print("Fin conversion a json")

    # Metadata about what the server returned.
    print(results.info())

    printResults(json, output)


def main(argv):
    """Read the ten filters and the output path from *argv* and run the query."""
    if len(argv) < 12:
        sys.exit(
            "usage: sparql_uniprot.py proteinId proteinName geneName "
            "organismName diseaseAnnotation domainName similarityAnnotation "
            "locationAnnotation functionAnnotation pharmaceuticalAnnotation "
            "output"
        )
    sparqlwrap(*argv[1:12])


if __name__ == "__main__":
    main(sys.argv)