10
|
1 """
|
|
2 A galaxy wrapper for the /rest/service/query API endpoint on Cravat.
|
|
3
|
|
4
|
|
5 Notes on Mapping:
|
|
6 -----------------
|
|
7 The CravatQuery class uses static method 'from_array' to interpret an array of values
|
|
8 into a query string for the /rest/service/query API service on the cravat server.
|
|
9 This involves using a mapping dictionary to know how to associate the array's index positions
|
|
10 into querying attributes, such as the chromosome, position, etc. The CravatQuery
|
|
11 class contains a default value ('default_mapping'); however, this could also be
|
|
12 offered as a user-configurable option.
|
|
13
|
|
14
|
|
15 Remaining Items (including possible expansion features):
|
|
16 -----------------
|
|
17 TODO: Possibly provide user-configurability of CravatQuery array index mapping
|
|
18 TODO: Possibly provide user-configurability of delimiter value
|
|
19 TODO: Check if chromosomes are 0 or 1 based indexing
|
|
20 TODO: Port 'write headers' option and include in user prompts in galaxy xml
|
|
21 TODO: Try-catch structure on the query call to cravat so if one bad query doesn't get back a response,
|
|
22 the rest of the run can still execute. Report this to user.
|
|
23 """
|
|
24
|
|
25
|
|
26 import requests
|
|
27 import json
|
|
28 import sys
|
|
29 import re
|
|
30 ###
|
|
31 import ipdb
|
|
32
|
|
33
|
|
class CravatQueryException(Exception):
    """
    : Exception type raised by the Cravat query code in this module.
    : Args:
    :   message - Human readable description of the failure
    :   errors  - Optional. Extra payload (e.g. custom error codes) kept on the instance
    """

    def __init__(self, message, errors=None):
        # Let the base Exception store the message so str(exc) returns it
        Exception.__init__(self, message)
        # Kept as-is for callers that want to inspect custom error codes
        self.errors = errors
|
|
40
|
|
41
|
|
class CravatQuery(object):
    """
    : A class for handling Cravat query strings.
    : Args (all required):
    :   _chr   - Chromosome (normalised through 'format_chromosome')
    :   pos    - Position
    :   strand - Strand
    :   ref    - Reference Base
    :   alt    - Alternate Base
    """

    # The endpoint that CravatQuerys are submitted to
    endpoint = 'http://cravat.us/CRAVAT/rest/service/query'

    # The value delimiter used in the Cravat input file to delimit values
    delimiter = "\t"

    # Default indices for interpreting a cravat file's row of data into a CravatQuery
    default_mapping = {
        'chromosome': 1,
        'position': 2,
        'strand': 3,
        'reference': 4,
        'alternate': 5
    }

    # Default values. Used as backup for CravatQuery to resolve query with incomplete information
    default_values = {
        'strand': '+'
    }

    # The necessary attributes needed to submit a query.
    query_keys = [
        'chromosome', 'position', 'strand', 'reference', 'alternate'
    ]

    # Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run.
    # If cravat server returns additional keys, they are appended to and included in output.
    # NOTE(review): "gnomAD AF (Amrican)" looks like a typo for "American"; left unchanged
    # because it must match the server's key exactly -- confirm against a live response.
    response_keys = [
        "Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)",
        "HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology",
        "S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)",
        "gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)",
        "gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)",
        "1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)",
        "ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change",
        "COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class",
        "TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic",
        "cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef",
        "GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant"
    ]

    def __init__(self, _chr, pos, strand, ref, alt):
        # '_chr' used to avoid naming confliction with python built-in 'chr'
        self.chromosome = CravatQuery.format_chromosome(_chr)
        self.position = pos
        self.strand = strand
        self.reference = ref
        self.alternate = alt
        # Ordered snapshot of the query parts; this is what __str__ joins together
        self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate]

    def __str__(self):
        """
        : Represent the CravatQuery as a valid query string for call to Cravat server.
        : The five query values are joined with underscores, e.g. 'chr1_100_+_A_T'.
        """
        return "_".join(map(str, self.values))

    def as_query_string(self):
        """
        : Return the query string form of this CravatQuery (same as str(self)).
        """
        return str(self)

    @staticmethod
    def from_dictionary(d):
        """
        : Instantiate a CravatQuery from a dictionary representation.
        : Args:
        :   d <dictionary>: A dictionary representing a CravatQuery, containing every key
        :                   listed in CravatQuery.query_keys
        : Raises:
        :   CravatQueryException if any of the required keys is missing from 'd'
        """
        for key in CravatQuery.query_keys:
            if key not in d:
                raise CravatQueryException(
                    "CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided "
                    .format(CravatQuery.query_keys, key))
        return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"])

    @staticmethod
    def from_array(array, mapping=None):
        """
        : Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file.
        : Args:
        :   array <list>   - The values to instantiate the CravatQuery from
        :   mapping <dict> - Optional. A dictionary associating cravat parameters to indices in the array.
        :                    Valid keys are: 'chromosome', 'position', 'strand', 'reference', 'alternate'.
        :                    When omitted, CravatQuery.default_mapping is used.
        : Raises:
        :   CravatQueryException if a query key has neither a mapping index nor a default value
        :   IndexError if a mapped index lies outside 'array'
        """
        # Set the mapping value. Either received from caller, or the class default
        if mapping is None:
            mapping = CravatQuery.default_mapping

        # Build a dict of cravat querying keys to values.
        d = {}
        for key in CravatQuery.query_keys:
            if key in mapping:
                # Index position comes from the mapping, value from the array at that index
                d[key] = array[mapping[key]]
            elif key in CravatQuery.default_values:
                # No index provided in mapping, fall back to the default value
                d[key] = CravatQuery.default_values[key]
            else:
                # Unable to get value for querying key, meaning can't construct the minimum requirements for query
                raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key))
        return CravatQuery.from_dictionary(d)

    @staticmethod
    def format_chromosome(_chr):
        """
        : Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name confliction.
        : Args:
        :   _chr - Either an integer [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form
        :          'chr<z>' where '<z>' is one of the previously described values
        : Returns:
        :   A lowercase string of the form 'chr<z>'
        : Raises:
        :   CravatQueryException if the value is numeric but outside [1,23], or cannot be
        :   interpreted as a chromosome at all
        """
        # str() first so that plain integers are accepted, as documented
        label = str(_chr).strip().lower()
        # Drop only a leading 'chr' prefix (str.strip('chr') would strip a character set)
        if label.startswith('chr'):
            label = label[3:]

        # Handle integer chromosomes 1 to 23
        try:
            number = int(label)
        except ValueError:
            number = None
        if number is not None:
            if 1 <= number <= 23:
                return 'chr' + str(number)
            raise CravatQueryException("Chromsomme of '{}' was out of range [1,23]".format(number))

        # Handle chromosomes x and y
        if label in ('x', 'y'):
            return 'chr' + label
        raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(label))

    @staticmethod
    def jump_header(in_file, out_file, headerlines=0):
        """
        : Jumps over a header space of line number 'headerlines'. Sets up in_file so that
        : the next execution of in_file.readline() will return the first non-header line.
        : Args:
        :   in_file     - Readable, seekable file object; rewound to its start first
        :   out_file    - Unused here; accepted so the signature matches main's pre_callback
        :   headerlines - Number of leading lines to skip
        """
        in_file.seek(0)
        for _ in range(headerlines):
            in_file.readline()
|
|
198
|
|
199
|
|
def _format_value(val):
    """
    : Render one value of the server response as text for the output file.
    : Numeric values (anything float() accepts) are written with 4 decimal places,
    : None becomes an empty cell, everything else is written via str().
    """
    if val is None:
        return ""
    try:
        return format(float(val), ".4f")
    except (TypeError, ValueError):
        return str(val)


def main(in_path, out_path, pre_callback=None, user_mapping=None):
    """
    : Read the file line by line and use data to query cravat server.
    : One tab-separated output row is written per successfully answered query; rows
    : that cannot be turned into a query, or whose request fails, are reported with
    : print() and skipped so the rest of the run can still execute.
    : Args:
    :   - in_path <str>: Path to input file
    :   - out_path <str>: Path to output file
    :   - pre_callback <function>: Optional. A function to handle the header space. Executed
    :       before main loop. Receives in_file and out_file as arguments
    :       (e.g. CravatQuery.jump_header)
    :   - user_mapping <dict>: Optional. Index mapping forwarded to CravatQuery.from_array
    """

    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # Perform any pre-processing steps, such as jumping a header space
        if pre_callback:
            pre_callback(in_file, out_file)

        # main loop
        for line in in_file:

            # Nothing to query on an empty line
            if not line.strip():
                continue

            # Create query from line of input data
            fields = line.strip().split(CravatQuery.delimiter)
            try:
                query = CravatQuery.from_array(fields, user_mapping)
                # Make request; the query string must be built by *calling* as_query_string
                call = requests.get(CravatQuery.endpoint, params={'mutation': query.as_query_string()})
                if call.status_code != 200 or call.text == "":
                    raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))
                # json.loads raises ValueError (JSONDecodeError) on a malformed body
                json_response = json.loads(call.text)
            except (CravatQueryException, IndexError, ValueError, requests.RequestException) as e:
                # One bad row or failed request must not abort the remaining queries
                print(e)
                continue

            # Write response data, values in the order the server returned them
            out_file.write("\t".join(_format_value(val) for val in json_response.values()))
            out_file.write("\n")
|
|
246
|
|
247
|
|
248
|
|
249
|
|
if __name__ == "__main__":

    # Input and output file paths, obtained from command line
    if len(sys.argv) < 3:
        sys.exit("usage: {} <in_path> <out_path> [user_mapping]".format(sys.argv[0]))
    in_path = sys.argv[1]
    out_path = sys.argv[2]

    # Possibly allow user mapping configuration through here. Not fully implemented:
    # the raw argument is captured but deliberately not forwarded to main().
    # sys.argv[3] only exists when more than three entries are present.
    user_mapping = sys.argv[3] if len(sys.argv) > 3 else None

    # Run the main operation
    main(in_path, out_path)