# Mercurial > repos > in_silico > cravat_annotate_mutations
# view cravat_annotate/cravat_annotate.py @ 10:152227fa7851 draft
# Uploaded
# author: in_silico
# date: Tue, 12 Jun 2018 11:04:25 -0400
# parents: (none listed)
# children: (none listed)
# line wrap: on
# line source
""" A galaxy wrapper for the /rest/service/query API endpoint on Cravat. Notes on Mapping: ----------------- The CravatQuery class uses static method 'from_array' to interpret an array of values into a query string for the /rest/service/query API service on the cravat server. This involves using a mapping dictionary to know how to associate the array's index positions in to query-ing attributes, such as the chromosome, position, etc. The CravatQuery class contains a default value ('default_mapping'); however, this could also be offered as a user-configurable option. Remaining Items (including possible expansion features): ----------------- TODO: Possibly provide user-configurability of CravatQuery array index mapping TODO: Possibly provide user-configurability of delimiter value TODO: Check if chromosomes are 0 or 1 based indexing TODO: Port 'write headers' option and include in user prompts in galaxy xml TODO: Try-catch structure on the query call to cravat so if one bad query doesn't get back a response, the rest of the run can still execute. Report this to user. """ import requests import json import sys import re ### import ipdb class CravatQueryException(Exception): def __init__(self, message, errors=None): super(CravatQueryException, self).__init__(message) # Support for custom error codes self.errors = errors class CravatQuery(object): """ : A class for handling Cravat query strings. : Args (all required): : chr - Chromosome : pos - Position : strand - Strand : ref - Reference Base : alt - Alternate Base """ # The endpoint that CravatQuerys are submitted to endpoint = 'http://cravat.us/CRAVAT/rest/service/query' # The value delimiter used in the Cravat input file to delimit values delimiter = "\t" # Defualt indices for intepretting a cravat file's row of data in to a CravatQuery default_mapping = { 'chromosome': 1, 'position': 2, 'strand': 3, 'reference': 4, 'alternate': 5 } # Defualt values. 
Used as backup for CravatQuery to resolve query with incomplete information default_values = { 'strand': '+' } # The neccessary attributes neeeded to submit a query. query_keys = [ 'chromosome', 'position', 'strand', 'reference', 'alternate' ] # Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run. # If cravat server returns additional keys, they are appended to and included in output. response_keys = [ "Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)", "HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology", "S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)", "gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)", "gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)", "1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)", "ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change", "COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class", "TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic", "cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef", "GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant" ] def __init__(self, _chr, pos, strand, ref, alt): # '_chr' used to avoid naming confliction with python built-in 'chr' self.chromosome = CravatQuery.format_chromosome(_chr) self.position = pos self.strand = strand self.reference = ref self.alternate = alt self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate] def __str__(self): """ : Represent the CravatQuery as a valid query string for call to Cravat server """ return "_".join(map(lambda x: str(x), self.values)) def as_query_string(self): return str(self) @staticmethod def from_dictionary(d): """ : Instantiate a CravatQuery from a dictionary 
representation. : Args: : d <dictionary>: A dictionary representing a CravatQuery, containing keys: [{}] """.format(CravatQuery.query_keys) for key in CravatQuery.query_keys: if key not in d: raise CravatQueryException("CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided " .format(CravatQuery.query_keys, key)) return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"]) @staticmethod def from_array(array, mapping=None): """ : Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file. : Args: : fmt <str> - Either 'cr' or 'vcf', describing input format : array <list> - The values to instantiate the CravatQuery from : mapping <dict> - Optional. A dictionary associating cravat parameters to indicies in the array. Valid values are: 'chromosome', 'position', 'strand', 'reference', 'alternate' """ # Set the mapping value. Either recieved from user, or obtained via defualt associated to 'fmt' if mapping == None: mapping = CravatQuery.default_mapping # Build a dict of cravat querying keys to values. d = {} for key in CravatQuery.query_keys: # Try to get index position from mapping by the key, and value from array by the index if key in mapping: index = mapping[key] d[key] = array[index] # If index not provided in mapping, check if there is a defualt value elif key in CravatQuery.default_values: d[key] = CravatQuery.default_values[key] # Unable to get value for querying key, meaning can't construct the minimum requirements for query else: raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key)) return CravatQuery.from_dictionary(d) @staticmethod def format_chromosome(_chr): """ : Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name confliction. 
: Args: : _chr - Either an interger [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form : 'chr<z>' where '<z>' is one of the previously described values """ inRange = lambda x: 1 <= x and x <= 23 _chr = _chr.lower() _chr = _chr.strip('chr') # Handler interger chromosomes 1 to 23 try: _chr = int(_chr) if inRange(_chr): return 'chr' + str(_chr) else: raise CravatQueryException("Chromsomme of '{}' was out of range [1,23]".format(_chr)) except: pass # Handle chromosomes chromosomes x and y if _chr == 'x' or _chr == 'y': return 'chr' + _chr raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(_chr)) @staticmethod def jump_header(in_file, out_file, headerlines=0): """ : Jumps over a header space of line number 'headerlines'. Sets up in_file so that : the next execution of in_file.readline() will return the first non-header line. """ in_file.seek(0) for line in range(headerlines): in_file.readline() def main(in_path, out_path, pre_callback=None, user_mapping=None): """ : Read the file line by line and use data to query cravat server. : Args: : - fmt <str>: 'cr' or 'vcf'. The input format : - in_path <str>: Path to input file : - in_path <str>: Path to output file : - header_callback <function>: A function to handle the header space. Executed before main loop. Recieves in_file, out_file, and fmt as argumnets """ with open(in_path, 'r') as in_file, \ open(out_path, 'w') as out_file: # Perform any pre-processing steps, such as jumping a header space if pre_callback: pre_callback(in_file, out_file, fmt) # main loop for line in in_file: # Create query from line of input data line = line.strip().split('\t') query = CravatQuery.from_array(line, user_mapping) # Make request, and write respone data call = requests.get(CravatQuery.endpoint, params={ 'mutation': query.as_query_string }) ipdb.set_trace() try: if call.status_code != 200 or call.text == "": raise CravatQueryException("Bad Server Response. 
Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text)) json_response = json.loads(call.text) wrote = False for key, val in json_response.items(): # Set numeric values to uniform format try: val = float(val) val = format(val, ".4f") except: pass if wrote: out_file.write("\t") out_file.write(val) wrote = True out_file.write("\n") except CravatQueryException as e: print(e) if __name__ == "__main__": # Input and output file paths, obtained form command line in_path = sys.argv[1] out_path = sys.argv[2] # Possibly allow user mapping configuration thourgh here. Not fully implemented if len(sys.argv) > 2: user_mapping = sys.argv[3] # Run the main operation main(in_path, out_path)