Mercurial > repos > fabad > sparql_uniprot
view sparql_uniprot.py @ 15:ccee4fa38909 draft
Uploaded
author | fabad |
---|---|
date | Mon, 02 May 2016 17:45:04 -0400 |
parents | 9bb153d42a1c |
children | 478f5587f12a |
line wrap: on
line source
from SPARQLWrapper import SPARQLWrapper, JSON import sys # Constante con los prefijos comunes a usar en queries COMMON_PREFIXES = """ PREFIX up:<http://purl.uniprot.org/core/> PREFIX keywords:<http://purl.uniprot.org/keywords/> PREFIX uniprotkb:<http://purl.uniprot.org/uniprot/> PREFIX taxon:<http://purl.uniprot.org/taxonomy/> PREFIX ec:<http://purl.uniprot.org/enzyme/> PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> PREFIX skos:<http://www.w3.org/2004/02/skos/core#> PREFIX owl:<http://www.w3.org/2002/07/owl#> PREFIX bibo:<http://purl.org/ontology/bibo/> PREFIX dc:<http://purl.org/dc/terms/> PREFIX xsd:<http://www.w3.org/2001/XMLSchema#> PREFIX faldo:<http://biohackathon.org/resource/faldo#> """ # Lista con los nombres de las variables que obtenemos de la base de datos. paramList = ['protein', 'proteinFullName', 'geneName', 'organismName', 'diseaseAnnotationText', 'domainFullName', 'similarityAnnotationText', 'locationAnnotationText', 'functionAnnotationText', 'pharmaceuticalAnnotationText']; def buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation): query = COMMON_PREFIXES query += "select distinct \n" query += " ?protein\n" query += " ?proteinFullName\n" query += " ?geneName\n" query += " ?organismName\n" query += " ?diseaseAnnotationText\n" query += " ?domainFullName\n" query += " ?similarityAnnotationText\n" query += " ?locationAnnotationText\n" query += " ?functionAnnotationText\n" query += " ?pharmaceuticalAnnotationText\n" query += "where{\n" query += " ?protein a up:Protein .\n" if (proteinId != ''): query += " VALUES ?protein {uniprotkb:"+ proteinId + "}\n" query += "\n" if (proteinName == ''): query += " OPTIONAL {\n" query += " ?protein up:recommendedName ?proteinName .\n" query += " ?proteinName up:fullName ?proteinFullName . \n" if (proteinName !=''): query += " filter( regex(str(?proteinFullName), " + '"' + proteinName + '"' + ",\"i\" )) .\n" if (proteinName == ''): query += " }\n" query += "\n" if (geneName == ''): query += " OPTIONAL {\n" query += " ?protein up:encodedBy ?gene .\n" query += " ?gene skos:prefLabel ?geneName .\n" if (geneName != ''): query += " filter( regex(str(?geneName), " + '"' + geneName + '"' + ",\"i\" )) .\n" if (geneName == ''): query += " }\n" query += "\n" if (organismName == ''): query += " OPTIONAL {\n" query += " ?protein up:organism ?organism .\n" query += " ?organism up:scientificName ?organismName .\n" if (organismName != ''): query += " filter( regex(str(?organismName), " + '"' + organismName + '"' + ",\"i\" )) .\n" if (organismName == ''): query += " }\n" query += "\n" if (diseaseAnnotation == ''): query += " OPTIONAL {\n" query += " ?protein up:annotation ?diseaseAnnotation .\n" query += " ?diseaseAnnotation a up:Disease_Annotation .\n" query += " ?diseaseAnnotation up:disease ?disease .\n" query += " ?disease rdfs:comment ?diseaseAnnotationText\n" if (diseaseAnnotation != ''): query += " filter( regex(str(?diseaseAnnotationText), " + '"' + diseaseAnnotation + '"' + ",\"i\" )) .\n" if (diseaseAnnotation == ''): query += " }\n" query += "\n" if (domainName == ''): query += " OPTIONAL {\n" query += " ?protein up:domain ?domain .\n" query += " ?domain up:recommendedName ?domainName .\n" query += " ?domainName up:fullName ?domainFullName .\n" if (domainName != ''): query += " filter( regex(str(?domainFullName), " + '"' + domainName + '"' + ",\"i\" )) .\n" if (domainName == ''): query += " }\n" query += "\n" if (similarityAnnotation == ''): query += " OPTIONAL {\n" query += " ?protein up:annotation ?similarityAnnotation .\n" query += " ?similarityAnnotation a up:Similarity_Annotation .\n" query += " ?similarityAnnotation rdfs:comment ?similarityAnnotationText .\n" if (similarityAnnotation != ''): query += " filter( regex(str(?similarityAnotationText), " + '"' + similarityAnnotation + '"' + ",\"i\" )) .\n" if (similarityAnnotation == ''): query += " }\n" query += "\n" if (locationAnnotation == ''): query += " OPTIONAL {\n" query += " ?protein up:annotation ?locationAnnotation .\n" query += " ?locationAnnotation a up:Subcellular_Location_Annotation .\n" query += " ?locationAnnotation up:locatedIn ?location .\n" query += " ?location up:cellularComponent ?cellComponent .\n" query += " ?cellComponent rdfs:comment ?locationAnnotationText .\n" if (locationAnnotation != ''): query += " filter( regex(str(?locationAnnotationText), " + '"' + locationAnnotation + '"' + ",\"i\" )) .\n" if (locationAnnotation == ''): query += " }\n" query += "\n" if (functionAnnotation == ''): query += " OPTIONAL {\n" query += " ?protein up:annotation ?functionAnnotation .\n" query += " ?functionAnnotation a up:Function_Annotation .\n" query += " ?functionAnnotation rdfs:comment ?functionAnnotationText .\n" if (functionAnnotation != ''): query += " filter( regex(str(?functionAnnotationText), " + '"' + functionAnnotation + '"' + ",\"i\" )) .\n" if(functionAnnotation == ''): query += " }\n" query += "\n" if (pharmaceuticalAnnotation == ''): query += " OPTIONAL {\n" query += " ?protein up:annotation ?pharmaceuticalAnnotation .\n" query += " ?pharmaceuticalAnnotation a up:Pharmaceutical_Annotation .\n" query += " ?pharmaceuticalAnnotation rdfs:comment ?pharmaceuticalAnnotationText .\n" if (pharmaceuticalAnnotation != ''): query += " filter( regex(str(?pharmaceuticalAnnotationText), " + '"' + pharmaceuticalAnnotation + '"' + ",\"i\" )) .\n" if (pharmaceuticalAnnotation == ''): query += " }\n" query += "}\n" #query += "limit 30\n" return query def printResults(json, output): # Abrir fichero para escritura fileRes = open(output, 'w') # Imprimir cabecera for param in paramList: fileRes.write(param + "\t") fileRes.write("\n") # El formato json se puede recorrer de esta manera # para ir obteniendo valores de la respuesta. for entrada in json["results"]["bindings"]: for param in paramList: if (entrada.get(param)): fileRes.write(entrada.get(param)["value"] + "\t") else: fileRes.write("\t") fileRes.write("\n") fileRes.close() def sparqlwrap(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation, output): query = buildQuery(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation) print query # Creamos un objeto del tipo SPARQLWrapper indicando en que # direccion esta el servicio que recibe consultas en sparql # y responde a estas. sparql = SPARQLWrapper('http://sparql.uniprot.org/sparql') # Especificamos la consulta que queremos hacer en sparql. sparql.setQuery(query) # Indicamos en que formato queremos que nos devuelva # los resultados de la consulta. Puede ser json, xml, # rfd, turtle... Simplemente son distintos formatos # para representar los datos en ficheros de texto. sparql.setReturnFormat(JSON) # Esta es la instruccion que realiza la consulta a # uniprot. Devuelve un objeto de python que hay que # tratar. print "Ejecutando query" results = sparql.query() # Con esto, convertimos el objeto devuelto por # el servicio al formato que especificamos antes. # En este caso, json. print "Conviertiendo a json" json = results.convert() print "Fin conversion a json" # Dentro de la variable results tenemos informacion # (metadatos) de lo que ha devuelto el servidor de # uniprot. print results.info() # Imprimir resultados printResults(json, output) # Obtener parametros de la linea de comandos. proteinId = sys.argv[1] proteinName = sys.argv[2] geneName = sys.argv[3] organismName = sys.argv[4] diseaseAnnotation = sys.argv[5] domainName =sys.argv[6] similarityAnnotation = sys.argv[7] locationAnnotation = sys.argv[8] functionAnnotation = sys.argv[9] pharmaceuticalAnnotation = sys.argv[10] output = sys.argv[11] # Llamada a la funcion que realiza la consulta. sparqlwrap(proteinId, proteinName, geneName, organismName, diseaseAnnotation, domainName, similarityAnnotation, locationAnnotation, functionAnnotation, pharmaceuticalAnnotation, output)