14
|
1 """
|
|
2 A galaxy wrapper for the /rest/service/query API endpoint on Cravat.
|
|
3
|
|
4
|
|
5 Notes on Mapping:
|
|
6 -----------------
|
|
7 The CravatQuery class uses static method 'from_array' to interpret an array of values
|
|
8 into a query string for the /rest/service/query API service on the cravat server.
|
|
9 This involves using a mapping dictionary to know how to associate the array's index positions
|
|
10 into querying attributes, such as the chromosome, position, etc. The CravatQuery
|
|
11 class contains a default value ('default_mapping'); however, this could also be
|
|
12 offered as a user-configurable option.
|
|
13 """
|
|
14
|
|
15
|
|
16 import requests
|
|
17 import json
|
|
18 import sys
|
|
19 import re
|
|
20
|
|
21
|
|
class CravatQueryException(Exception):
    """
    : Error raised when a CravatQuery cannot be built or the server response is unusable.
    : Args:
    :   message - Human readable description of the failure
    :   errors - Optional. Custom error codes / details attached to the exception
    """

    def __init__(self, message, errors=None):
        # Keep the optional custom error codes available to whoever catches this
        self.errors = errors
        super(CravatQueryException, self).__init__(message)
|
|
28
|
|
29
|
|
class CravatQuery(object):
    """
    A single variant query for the Cravat /rest/service/query endpoint.

    Args (all required):
        _chr - Chromosome
        pos - Position
        strand - Strand
        ref - Reference Base
        alt - Alternate Base
    """

    # The endpoint that CravatQuerys are submitted to
    endpoint = 'http://cravat.us/CRAVAT/rest/service/query'

    # The value delimiter used in the Cravat input file to delimit values
    delimiter = "\t"

    # Default indices for interpreting a cravat file's row of data into a CravatQuery
    default_mapping = {
        'chromosome': 1,
        'position': 2,
        'strand': 3,
        'reference': 4,
        'alternate': 5
    }

    # Default values. Used as backup for CravatQuery to resolve a query with incomplete information
    default_values = {
        'strand': '+'
    }

    # The necessary attributes needed to submit a query.
    query_keys = [
        'chromosome', 'position', 'strand', 'reference', 'alternate'
    ]

    # Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run.
    # If cravat server returns additional keys, they are appended to and included in output.
    # NOTE(review): "gnomAD AF (Amrican)" looks like a typo for "American", but it is kept as-is
    # because it may mirror the key the server actually returns - confirm before changing.
    response_keys = [
        "Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)",
        "HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology",
        "S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)",
        "gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)",
        "gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)",
        "1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)",
        "ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change",
        "COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class",
        "TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic",
        "cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef",
        "GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant"
    ]

    def __init__(self, _chr, pos, strand, ref, alt):
        # '_chr' used to avoid naming conflict with python built-in 'chr'
        self.chromosome = CravatQuery.format_chromosome(_chr)
        self.position = pos
        self.strand = strand
        self.reference = ref
        self.alternate = alt
        # Ordered exactly as the server expects them in the query string
        self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate]

    def __str__(self):
        """Represent the CravatQuery as a valid query string for a call to the Cravat server."""
        return "_".join(map(str, self.values))

    def as_query_string(self):
        """Return the underscore-joined query string (same value as str(self))."""
        return str(self)

    @staticmethod
    def from_dictionary(d):
        """
        Instantiate a CravatQuery from a dictionary representation.

        Args:
            d <dictionary>: A dictionary representing a CravatQuery, containing keys:
                'chromosome', 'position', 'strand', 'reference', 'alternate'

        Raises:
            CravatQueryException: if any of the required keys is missing from 'd'.
        """
        for key in CravatQuery.query_keys:
            if key not in d:
                raise CravatQueryException("CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided "
                                           .format(CravatQuery.query_keys, key))
        return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"])

    @staticmethod
    def from_array(array, mapping=None):
        """
        Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file.

        Args:
            array <list> - The values to instantiate the CravatQuery from
            mapping <dict> - Optional. A dictionary associating cravat parameters to indices in the array.
                Valid keys are: 'chromosome', 'position', 'strand', 'reference', 'alternate'.
                Defaults to CravatQuery.default_mapping.

        Raises:
            CravatQueryException: if a query key has neither a mapping index nor a default value.
            IndexError: if a mapped index lies outside 'array'.
        """
        # Set the mapping value. Either received from the caller, or the class default
        if mapping is None:
            mapping = CravatQuery.default_mapping

        # Build a dict of cravat querying keys to values.
        d = {}
        for key in CravatQuery.query_keys:
            # Try to get index position from mapping by the key, and value from array by the index
            if key in mapping:
                d[key] = array[mapping[key]]
            # If index not provided in mapping, check if there is a default value
            elif key in CravatQuery.default_values:
                d[key] = CravatQuery.default_values[key]
            # Unable to get value for querying key, meaning can't construct the minimum requirements for query
            else:
                raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key))
        return CravatQuery.from_dictionary(d)

    @staticmethod
    def format_chromosome(_chr):
        """
        Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name conflict.

        Args:
            _chr - Either an integer [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form
                'chr<z>' where '<z>' is one of the previously described values

        Returns:
            The chromosome as a lower-case string of the form 'chr<z>'.

        Raises:
            CravatQueryException: if the value is out of range or not a recognised chromosome.
        """
        # str() so that integer input, as promised above, is accepted as well
        label = str(_chr).strip().lower()
        # Remove the 'chr' prefix only. str.strip('chr') would strip any of the characters
        # 'c', 'h', 'r' from both ends and silently accept malformed input.
        if label.startswith('chr'):
            label = label[3:]

        # Handle chromosomes x and y
        if label in ('x', 'y'):
            return 'chr' + label

        # Handle integer chromosomes 1 to 23
        try:
            number = int(label)
        except ValueError:
            raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(_chr))
        if 1 <= number <= 23:
            return 'chr' + str(number)
        raise CravatQueryException("Chromosome of '{}' was out of range [1,23]".format(number))

    @staticmethod
    def jump_header(in_file, out_file, headerlines=0):
        """
        Jumps over a header space of line number 'headerlines'. Sets up in_file so that
        the next execution of in_file.readline() will return the first non-header line.

        'out_file' is accepted but not used by this method.
        """
        in_file.seek(0)
        for _ in range(headerlines):
            in_file.readline()
|
|
184
|
|
185
|
|
def _format_value(val):
    """Render one JSON response value as the text written to an output column."""
    # Set numeric values to uniform format
    try:
        return format(float(val), ".4f")
    except (TypeError, ValueError, OverflowError):
        pass
    # JSON null becomes an empty column rather than crashing the write
    if val is None:
        return ""
    # Nested structures are serialised so that the column stays on a single line
    if isinstance(val, (dict, list)):
        return json.dumps(val)
    return val if isinstance(val, str) else str(val)


def main(in_path, out_path, pre_callback=None, user_mapping=None):
    """
    Read the input file line by line and use each line's data to query the cravat server.

    One tab-delimited row of response values is written to the output file for every
    successful query. Bad server responses are printed and the loop continues.

    Args:
        in_path <str>: Path to input file
        out_path <str>: Path to output file
        pre_callback <function>: Optional. A function to handle any pre-processing, such as
            jumping a header space. Executed once before the main loop and called as
            pre_callback(in_file, out_file).
        user_mapping <dict>: Optional. Mapping of query keys to column indices, passed through
            to CravatQuery.from_array. The class default mapping is used when omitted.
    """
    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # Perform any pre-processing steps, such as jumping a header space.
        # Previously an undefined name 'fmt' was passed here, so any callback raised NameError.
        if pre_callback:
            pre_callback(in_file, out_file)

        # main loop
        for line in in_file:

            # Blank lines carry no query data; skip them instead of failing on the column lookup
            line = line.strip()
            if not line:
                continue

            # Create query from line of input data
            query = CravatQuery.from_array(line.split(CravatQuery.delimiter), user_mapping)

            # Make request, and write response data. The query string method must be called;
            # passing the bound method itself sends its repr to the server.
            call = requests.get(CravatQuery.endpoint, params={'mutation': query.as_query_string()})
            try:
                if call.status_code != 200 or call.text == "":
                    raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))
                json_response = json.loads(call.text)
                out_file.write("\t".join(_format_value(val) for val in json_response.values()))
                out_file.write("\n")
            except CravatQueryException as e:
                print(e)
|
|
231
|
|
232
|
|
233
|
|
234
|
|
if __name__ == "__main__":

    # Input and output file paths, obtained from the command line
    if len(sys.argv) < 3:
        sys.exit("usage: {} <in_path> <out_path> [user_mapping]".format(sys.argv[0]))
    in_path = sys.argv[1]
    out_path = sys.argv[2]

    # Possibly allow user mapping configuration through here. Not fully implemented:
    # the value is read but not yet forwarded to main(). The guard must be '> 3',
    # because sys.argv[3] only exists when a third argument was supplied.
    user_mapping = sys.argv[3] if len(sys.argv) > 3 else None

    # Run the main operation
    main(in_path, out_path)