46
|
1 from __future__ import division
|
|
2 import sys
|
|
3 import pandas as pd
|
|
4 import collections
|
|
5 import pickle as pk
|
|
6 import math
|
|
7 import argparse
|
|
8
|
|
9 ########################## argparse ##########################################
|
|
10
|
|
def process_args(args):
    """Build the command-line parser for this tool and parse the arguments.

    The ``args`` parameter is kept for call-site compatibility but is not
    consulted: ``parse_args()`` is called without arguments, so argparse
    reads the process command line itself.

    Returns:
        The ``argparse.Namespace`` holding ``rules_selector``, ``custom``,
        ``none``, ``tool_dir``, ``out_log``, ``input`` and ``ras_output``.
    """
    parser = argparse.ArgumentParser(
        usage='%(prog)s [options]',
        description="process some value's genes to create a comparison's map.")

    # (flags, keyword settings) for every supported option, in the order in
    # which they are registered on the parser.
    option_table = (
        (('-rs', '--rules_selector'),
         {'type': str,
          'default': 'HMRcore',
          'choices': ['HMRcore', 'Recon', 'Custom'],
          'help': 'chose which type of dataset you want use'}),
        (('-cr', '--custom'),
         {'type': str,
          'help': 'your dataset if you want custom rules'}),
        (('-n', '--none'),
         {'type': str,
          'default': 'true',
          'choices': ['true', 'false'],
          'help': 'compute Nan values'}),
        (('-td', '--tool_dir'),
         {'type': str,
          'required': True,
          'help': 'your tool directory'}),
        (('-ol', '--out_log'),
         {'help': "Output log"}),
        (('-id', '--input'),
         {'type': str,
          'help': 'input dataset'}),
        (('-ra', '--ras_output'),
         {'type': str,
          'required': True,
          'help': 'ras output'}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser.parse_args()
|
|
44
|
|
45 ########################### warning ###########################################
|
|
46
|
|
def warning(s):
    """Append the message ``s`` to the log file named by ``--out_log``.

    The command line is re-parsed through ``process_args`` on every call to
    find the log path. ``--out_log`` is an optional argument, so when it was
    not supplied the message is written to standard error instead of failing
    inside ``open(None, ...)``.
    """
    args = process_args(sys.argv)
    if args.out_log is None:
        sys.stderr.write(s)
        return
    with open(args.out_log, 'a') as log:
        log.write(s)
|
|
51
|
|
52 ############################ dataset input ####################################
|
|
53
|
|
def read_dataset(data, name):
    """Load a tab-separated dataset whose first row is the header.

    Args:
        data: path or file-like object handed to ``pandas.read_csv``.
        name: label used in the abort message.

    Returns:
        The parsed ``DataFrame``.

    Exits the process (``sys.exit``) when pandas reports an empty input or
    when fewer than two columns were parsed.
    """
    def abort():
        # Built lazily so the message is only formatted on failure.
        sys.exit('Execution aborted: wrong format of ' + name + '\n')

    try:
        table = pd.read_csv(data, sep='\t', header=0, engine='python')
    except pd.errors.EmptyDataError:
        abort()
    if len(table.columns) >= 2:
        return table
    abort()
|
|
62
|
|
63 ############################ dataset name #####################################
|
|
64
|
|
def name_dataset(name_data, count):
    """Return the dataset label as a string.

    The generic label ``'Dataset'`` gets ``'_' + count`` appended; any other
    label is returned unchanged (after conversion to ``str``).
    """
    label = str(name_data)
    if label != 'Dataset':
        return label
    return label + '_' + str(count)
|
|
70
|
|
71 ############################ load id e rules ##################################
|
|
72
|
|
def load_id_rules(reactions):
    """Split a ``{reaction id: rule}`` mapping into two parallel lists.

    Returns:
        ``(ids, rules)`` in the iteration order of ``reactions.items()``.
    """
    pairs = list(reactions.items())
    ids = [key for key, _ in pairs]
    rules = [value for _, value in pairs]
    return (ids, rules)
|
|
79
|
|
80 ############################ check_methods ####################################
|
|
81
|
|
def gene_type(l, name):
    """Classify the gene identifier string ``l``.

    The format checks are tried in a fixed order (HGNC, Ensembl, symbol,
    Entrez) and the label of the first one that accepts ``l`` is returned:
    ``'hugo_id'``, ``'ensembl_gene_id'``, ``'symbol'`` or ``'entrez_id'``.

    Args:
        l: identifier to classify.
        name: dataset label used in the abort message.

    Exits the process (``sys.exit``) when no check accepts ``l``.
    """
    checks = (
        (check_hgnc, 'hugo_id'),
        (check_ensembl, 'ensembl_gene_id'),
        (check_symbol, 'symbol'),
        (check_entrez, 'entrez_id'),
    )
    for accepts, label in checks:
        if accepts(l):
            return label
    # The original message lacked the space between "ID" and "types".
    sys.exit('Execution aborted:\n' +
             'gene ID type in ' + name + ' not supported. Supported ID ' +
             'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
|
|
95
|
|
def check_hgnc(l):
    """Return True when ``l`` is ``HGNC:`` (any case) followed by digits only."""
    return (len(l) > 5
            and l.upper().startswith('HGNC:')
            and l[5:].isdigit())
|
|
104
|
|
def check_ensembl(l):
    """Return True for a 15-character ID starting with ``ENS`` (any case)
    whose characters from index 4 onwards are all digits."""
    return (len(l) == 15
            and l.upper().startswith('ENS')
            and l[4:].isdigit())
|
|
113
|
|
def check_symbol(l):
    """Return True when ``l`` starts with a letter and the remainder is a
    non-empty alphanumeric string.

    A single-character input is rejected because ``''.isalnum()`` is False.
    """
    return len(l) > 0 and l[0].isalpha() and l[1:].isalnum()
|
|
122
|
|
def check_entrez(l):
    """Return True when ``l`` is a non-empty string made only of digits."""
    return len(l) > 0 and l.isdigit()
|
|
128
|
|
def check_bool(b):
    """Map the strings ``'true'`` / ``'false'`` to ``True`` / ``False``.

    Any other value yields ``None``.
    """
    if b == 'true':
        return True
    return False if b == 'false' else None
|
|
134
|
|
135 ############################ resolve_methods ##################################
|
|
136
|
|
def replace_gene_value(l, d):
    """Substitute gene names in a (possibly nested) rule with their values.

    Args:
        l: rule as a list of tokens; nested lists are sub-expressions.
        d: mapping from gene name to its value.

    Returns:
        ``(resolved, missing)`` where ``resolved`` mirrors the structure of
        ``l`` with every token passed through ``replace_gene`` and
        ``missing`` lists the tokens whose replacement was ``None``.
    """
    resolved = []
    missing = []
    # Plain iteration instead of repeatedly re-slicing the list (l = l[1:]),
    # which copied the remaining tail on every step.
    for token in l:
        if isinstance(token, list):
            sub_resolved, sub_missing = replace_gene_value(token, d)
            resolved.append(sub_resolved)
            missing.extend(sub_missing)
        else:
            value = replace_gene(token, d)
            resolved.append(value)
            if value is None:
                missing.append(token)
    return (resolved, missing)
|
|
152
|
|
153
|
|
def replace_gene(l, d):
    """Return the value for token ``l``.

    The operators ``'and'`` and ``'or'`` are returned unchanged. Any other
    token is looked up in ``d``; a missing key yields ``None``.

    Exits the process (``sys.exit``) when the stored value is neither
    ``None`` nor an ``int``/``float``.
    """
    if l == 'and' or l == 'or':
        return l
    value = d.get(l, None)
    if value is not None and not isinstance(value, (int, float)):
        # str() keeps the abort message working for non-string values;
        # concatenating them directly raised TypeError instead.
        sys.exit('Execution aborted: ' + str(value) + ' value not valid\n')
    return value
|
|
162
|
|
def computes(val1, op, val2, cn):
    """Combine two operand values with a rule operator.

    When both values are present, ``'and'`` yields their minimum and any
    other operator yields their sum. When at least one value is ``None``:
    for ``'and'`` with ``cn`` not ``True`` the result is ``None``; otherwise
    the result is whichever value is present (``None`` if neither is).
    """
    if val1 is not None and val2 is not None:
        return min(val1, val2) if op == 'and' else val1 + val2

    # From here on at least one operand is missing.
    if op == 'and' and cn is not True:
        return None
    return val1 if val1 is not None else val2
|
|
186
|
|
def control(ris, l, cn):
    """Evaluate a resolved rule ``l``.

    A one-element rule evaluates to that element when it is a number or
    ``None``, or to the evaluation of the nested list it holds. Rules with
    more than two elements are delegated to ``control_list``.

    Returns:
        The computed value (possibly ``None``), or ``False`` when the rule
        cannot be evaluated (empty, two elements, or a single element of an
        unsupported type).
    """
    size = len(l)
    if size > 2:
        return control_list(ris, l, cn)
    if size != 1:
        return False

    only = l[0]
    if isinstance(only, (float, int)) or only is None:
        return only
    if isinstance(only, list):
        return control(None, only, cn)
    return False
|
|
199
|
|
def control_list(ris, l, cn):
    """Evaluate a resolved rule made of operands and 'and'/'or' operators.

    The list is consumed from the left in chunks:

    * ``value op operand``  -> ``ris = computes(value, op, operand)``
    * ``op operand``        -> ``ris = computes(ris, op, operand)``
    * ``[list] op operand`` -> the nested list is evaluated with ``control``
      and then combined as above.

    An operand is a number, ``None``, or a nested list (evaluated through
    ``control``).

    Returns:
        The accumulated result ``ris``, or ``False`` as soon as a chunk does
        not match one of the shapes above or a nested evaluation fails.
    """
    operators = ['and', 'or']

    def is_value(item):
        # A resolved operand: a number, or None for a gene without a value.
        return isinstance(item, (float, int)) or item is None

    while l:
        if len(l) == 1:
            # A dangling operand/operator cannot be combined with anything.
            return False

        first, second = l[0], l[1]

        if is_value(first) and second in operators:
            third = l[2]
            if is_value(third):
                ris = computes(first, second, third, cn)
            elif isinstance(third, list):
                nested = control(None, third, cn)
                if nested is False:
                    return False
                ris = computes(first, second, nested, cn)
            else:
                return False
            l = l[3:]

        elif first in operators:
            # Continuation chunk: combine the running result with the operand.
            if is_value(second):
                ris = computes(ris, first, second, cn)
            elif isinstance(second, list):
                nested = control(None, second, cn)
                if nested is False:
                    return False
                ris = computes(ris, first, nested, cn)
            else:
                return False
            l = l[2:]

        elif isinstance(first, list) and second in operators:
            third = l[2]
            if is_value(third):
                left = control(None, first, cn)
                if left is False:
                    return False
                ris = computes(left, second, third, cn)
            elif isinstance(third, list):
                left = control(None, first, cn)
                right = control(None, third, cn)
                if left is False or right is False:
                    return False
                ris = computes(left, second, right, cn)
            else:
                return False
            l = l[3:]

        else:
            return False
    return ris
|
|
249
|
|
250 ############################ make recon #######################################
|
|
251
|
|
def check_and_doWord(l):
    """Tokenise a rule given as a list of characters.

    Parentheses become their own tokens, spaces are separators, and every
    other run of characters becomes one word.

    Args:
        l: list of single characters. As in the original implementation the
            processed prefix is removed from this list in place (the whole
            list when scanning reaches the end).

    Returns:
        ``(tokens, genes)`` where ``genes`` holds every word other than
        ``'or'`` / ``'and'``, or ``False`` when the parentheses are not
        balanced (a ``)`` closes more than was opened, or some ``(`` is left
        open).
    """
    tokens = []
    genes = []
    depth = 0
    pos = 0
    size = len(l)
    balanced = True

    # Index-based scan: the former pop(0) per character shifted the whole
    # list each time, making tokenisation quadratic in the rule length.
    while pos < size:
        if depth < 0:
            balanced = False
            break
        char = l[pos]
        if char == '(':
            depth += 1
            tokens.append(char)
            pos += 1
        elif char == ')':
            depth -= 1
            tokens.append(char)
            pos += 1
        elif char == ' ':
            pos += 1
        else:
            start = pos
            while pos < size and l[pos] not in (' ', '(', ')'):
                pos += 1
            word = ''.join(l[start:pos])
            tokens.append(word)
            if word not in ('or', 'and'):
                genes.append(word)

    # Keep the in-place consumption callers could observe before.
    del l[:pos]

    if not balanced or depth != 0:
        return False
    return (tokens, genes)
|
|
286
|
|
def brackets_to_list(l):
    """Turn a flat token list with parentheses into nested lists.

    Every ``'('`` starts a sub-list that ``resolve_brackets`` fills up to the
    matching ``')'``; all other tokens are copied as they are. The input
    list is consumed in place.
    """
    nested = []
    while l:
        token = l.pop(0)
        if token == '(':
            nested.append(resolve_brackets(l))
        else:
            nested.append(token)
    return nested
|
|
297
|
|
def resolve_brackets(l):
    """Collect tokens up to the next unmatched ``')'`` into a list.

    Called right after an opening parenthesis has been removed from ``l``.
    Nested ``'('`` tokens recurse. Tokens are consumed from ``l`` in place,
    including the closing ``')'``.

    Raises:
        IndexError: when ``l`` runs out before a closing ``')'`` is found.
    """
    group = []
    while True:
        token = l.pop(0)
        if token == ')':
            return group
        if token == '(':
            group.append(resolve_brackets(l))
        else:
            group.append(token)
|
|
309
|
|
def priorityAND(l):
    """Group runs of ``and`` so they bind tighter than ``or``.

    Walks the (possibly nested) rule ``l``. A run of operands joined by
    ``'and'`` is wrapped in its own sub-list whenever an ``'or'`` has been
    seen at this level before or directly after the run; when no ``'or'``
    has been seen the run is spliced in flat. Nested lists are processed
    recursively.

    Returns:
        The regrouped rule as a new list.

    Raises:
        ValueError: when the tokens do not alternate operand / operator
            (for example two adjacent operands). The previous implementation
            never advanced past such a token and looped forever.
    """
    def regroup(item):
        # Recurse into sub-expressions, pass plain tokens through.
        return priorityAND(item) if isinstance(item, list) else item

    tmp = []
    flag = True  # stays True while no 'or' has been seen at this level
    while l:
        if len(l) == 1:
            tmp.append(regroup(l[0]))
            l = l[1:]
        elif l[0] == 'or':
            tmp.append(l[0])
            flag = False
            l = l[1:]
        elif l[1] == 'or':
            tmp.append(regroup(l[0]))
            tmp.append(l[1])
            flag = False
            l = l[2:]
        elif l[1] == 'and':
            tmpAnd = [regroup(l[0]), l[1], regroup(l[2])]
            l = l[3:]
            # Extend the run with every directly following "and operand".
            while l:
                if l[0] == 'and':
                    tmpAnd.append(l[0])
                    tmpAnd.append(regroup(l[1]))
                    l = l[2:]
                elif l[0] == 'or':
                    flag = False
                    break
                else:
                    raise ValueError('malformed rule: expected an operator '
                                     'but found ' + repr(l[0]))
            if flag:
                # Only 'and' so far: keep the run flat.
                tmp.extend(tmpAnd)
            else:
                tmp.append(tmpAnd)
        else:
            raise ValueError('malformed rule: expected an operator '
                             'but found ' + repr(l[1]))
    return tmp
|
|
360
|
|
def checkRule(l):
    """Validate the shape of a nested rule.

    A one-element rule is valid unless that element is a list that itself
    fails ``checkRule``. Rules with more than two elements are validated by
    ``checkRule2``. Empty and two-element rules are invalid.
    """
    size = len(l)
    if size == 1:
        only = l[0]
        if isinstance(only, list) and checkRule(only) is False:
            return False
        return True
    if size > 2:
        return checkRule2(l) is not False
    return False
|
|
372
|
|
def checkRule2(l):
    """Check that ``l`` is ``operand (operator operand)*``.

    Operators are ``'and'`` / ``'or'``. An operand is any other token, or a
    nested list accepted by ``checkRule``.

    The previous chunk-based check accepted sequences with adjacent operands
    or adjacent operators (e.g. ``a and b c and d``), which ``priorityAND``
    cannot process, and indexed past the end of the list for inputs such as
    ``a and b and and``. Strict alternation rejects both.

    Returns:
        ``True`` when the sequence is well formed (an empty sequence is
        still accepted, as before), ``False`` otherwise.
    """
    operators = ('and', 'or')

    def valid_operand(item):
        if isinstance(item, list):
            return checkRule(item) is not False
        return item not in operators

    if not l:
        return True
    if len(l) % 2 == 0:
        # operand/operator alternation always has odd length
        return False
    for position, item in enumerate(l):
        if position % 2 == 0:
            if not valid_operand(item):
                return False
        elif item not in operators:
            return False
    return True
|
|
397
|
|
def do_rules(rules):
    """Parse every rule string into its nested-list form.

    Each rule is tokenised (``check_and_doWord``), nested by parentheses
    (``brackets_to_list``), validated (``checkRule``) and regrouped
    (``priorityAND``). Empty rules and rules that fail a step contribute an
    empty list; failing rules are also reported through ``warning``.

    Returns:
        ``(split_rules, genes)`` where ``split_rules`` is parallel to
        ``rules`` and ``genes`` is the de-duplicated list of gene names seen
        in the successfully tokenised rules.
    """
    split_rules = []
    err_rules = []
    tmp_gene_in_rule = []
    for rule in rules:
        chars = list(rule)
        if not chars:
            split_rules.append([])
            continue

        # check_and_doWord returns False (not a tuple) for unbalanced
        # parentheses, so test the result before unpacking it.
        parsed = check_and_doWord(chars)
        if parsed is False:
            split_rules.append([])
            err_rules.append(rule)
            continue

        tokens, tmp_genes = parsed
        tmp_gene_in_rule.extend(tmp_genes)
        nested = brackets_to_list(tokens)
        if checkRule(nested):
            split_rules.append(priorityAND(nested))
        else:
            split_rules.append([])
            err_rules.append(rule)
    if err_rules:
        warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
    return (split_rules, list(set(tmp_gene_in_rule)))
|
|
422
|
|
def make_recon(data):
    """Load custom reaction rules from ``data``.

    ``data`` is first read as an SBML model through cobra (with warnings
    silenced). If cobra raises ``CobraSBMLError`` the input is read instead
    as a tab-separated table whose first column holds the reaction ids and
    whose second column holds the rules; extra columns are reported through
    ``warning`` and ignored.

    Returns:
        ``(ids, split_rules, gene_in_rule)`` where ``split_rules`` comes from
        ``do_rules`` and ``gene_in_rule`` maps every gene found to ``'ok'``.

    Exits the process (``sys.exit``) when the table has fewer than two
    columns or pandas cannot parse it.
    """
    try:
        import cobra as cb
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            model = cb.io.read_sbml_model(data)
        reactions = model.reactions
        rules = [reaction.gene_reaction_rule for reaction in reactions]
        ids = [reaction.id for reaction in reactions]
    except cb.io.sbml3.CobraSBMLError:
        try:
            table = pd.read_csv(data, sep='\t', dtype=str, engine='python')
            table = table.fillna('')
            column_count = len(table.columns)
            if column_count < 2:
                sys.exit('Execution aborted: wrong format of ' +
                         'custom datarules\n')
            if column_count != 2:
                warning('Warning: more than 2 columns in custom datarules.\n' +
                        'Extra columns have been disregarded\n')
            ids = list(table.iloc[:, 0])
            rules = list(table.iloc[:, 1])
        except pd.errors.EmptyDataError:
            sys.exit('Execution aborted: wrong format of custom datarules\n')
        except pd.errors.ParserError:
            sys.exit('Execution aborted: wrong format of custom datarules\n')

    split_rules, tmp_genes = do_rules(rules)
    gene_in_rule = dict.fromkeys(tmp_genes, 'ok')
    return (ids, split_rules, gene_in_rule)
|
|
453
|
|
454 ############################ gene #############################################
|
|
455
|
|
def data_gene(gene, type_gene, name, gene_custom):
    """Clean the gene column, check for duplicates and index by gene.

    Values in the first column that start or end with a space are stripped
    in place. If a gene ID occurs more than once and is also marked ``'ok'``
    in the rule gene set, execution is aborted. Duplicated column labels
    are only reported through ``warning``.

    Args:
        gene: dataset ``DataFrame``; the first column holds the gene IDs
            (modified in place).
        type_gene: key used to select the gene set inside the pickled file.
        name: dataset label used in messages.
        gene_custom: gene set of a custom rule file, or ``None`` to load the
            set for ``--rules_selector`` from ``<tool_dir>/local``.

    Returns:
        ``gene.set_index(<first column>).to_dict()``.
    """
    args = process_args(sys.argv)
    for i in range(len(gene)):
        tmp = gene.iloc[i, 0]
        if tmp.startswith(' ') or tmp.endswith(' '):
            gene.iloc[i, 0] = tmp.strip()
    gene_dup = [item for item, count in
                collections.Counter(gene[gene.columns[0]]).items()
                if count > 1]
    pat_dup = [item for item, count in
               collections.Counter(list(gene.columns)).items() if count > 1]

    if gene_dup:
        if gene_custom is None:
            # NOTE: pickle executes code on load; these files are expected to
            # be the ones shipped inside the tool directory.
            if args.rules_selector == 'HMRcore':
                with open(args.tool_dir +
                          '/local/HMRcore_genes.p', 'rb') as handle:
                    gene_in_rule = pk.load(handle)
            elif args.rules_selector == 'Recon':
                with open(args.tool_dir +
                          '/local/Recon_genes.p', 'rb') as handle:
                    gene_in_rule = pk.load(handle)
            gene_in_rule = gene_in_rule.get(type_gene)
        else:
            gene_in_rule = gene_custom
        # Only duplicates that are actually used by some rule are fatal.
        tmp = [item for item in gene_dup if gene_in_rule.get(item) == 'ok']
        if tmp:
            sys.exit('Execution aborted because gene ID '
                     + str(tmp) + ' in ' + name + ' is duplicated\n')
    if pat_dup:
        # A space was missing between the label list and "in".
        warning('Warning: duplicated label\n' + str(pat_dup) + ' in ' + name +
                '\n')

    return (gene.set_index(gene.columns[0])).to_dict()
|
|
490
|
|
491 ############################ resolve ##########################################
|
|
492
|
|
def resolve(genes, rules, ids, resolve_none, name):
    """Compute the score of every rule for every entry of ``genes``.

    Args:
        genes: mapping ``key -> {gene name: value}``; one result list is
            produced per key.
        rules: parsed rules (nested lists); empty rules score ``None``.
        ids: unused here, kept for interface compatibility.
        resolve_none: forwarded to ``control`` (how 'and' treats a missing
            operand).
        name: label used in the warning message.

    Returns:
        ``(resolve_rules, not_found)`` where ``resolve_rules`` maps each key
        to a list parallel to ``rules`` and ``not_found`` lists the rule
        tokens that had no value. ``(None, None)`` is returned, after a
        warning, when no rule could be computed for any key.
    """
    resolve_rules = {}
    # A set avoids accumulating one entry per key for every missing gene.
    not_found = set()
    flag = False
    for key, value in genes.items():
        tmp_resolve = []
        for rule in rules:
            if not rule:
                tmp_resolve.append(None)
                continue
            substituted, err = replace_gene_value(rule, value)
            not_found.update(err)
            ris = control(None, substituted, resolve_none)
            if ris is False or ris is None:
                tmp_resolve.append(None)
            else:
                tmp_resolve.append(ris)
                flag = True
        resolve_rules[key] = tmp_resolve
    if flag is False:
        # The two message halves were previously joined without a space.
        warning('Warning: no computable score (due to missing gene values) ' +
                'for class ' + name + ', the class has been disregarded\n')
        return (None, None)
    return (resolve_rules, list(not_found))
|
|
519
|
|
520 ############################ split class ######################################
|
|
521
|
|
def split_class(classes, resolve_rules):
    """Group the per-sample rule scores by class.

    Args:
        classes: ``DataFrame`` whose first column holds sample ids and whose
            second column holds the class label. The class column is
            overwritten with ``None`` for every processed row.
        resolve_rules: mapping ``sample id -> list of scores``.

    Returns:
        Mapping ``class label -> transposed score lists`` (one inner list per
        rule, one value per sample of the class). Classes with no sample in
        ``resolve_rules`` are reported through ``warning`` and omitted.
    """
    class_pat = {}
    for i in range(len(classes)):
        classe = classes.iloc[i, 1]
        if not pd.isnull(classe):
            l = []
            for j in range(i, len(classes)):
                if classes.iloc[j, 1] == classe:
                    pat_id = classes.iloc[j, 0]
                    # The lookup was missing, leaving `tmp` undefined
                    # (NameError on the first matching row).
                    tmp = resolve_rules.get(pat_id, None)
                    if tmp is not None:
                        l.append(tmp)
                    # Mark the row as consumed so it is not grouped again.
                    classes.iloc[j, 1] = None
            if l:
                class_pat[classe] = list(map(list, zip(*l)))
            else:
                warning('Warning: no sample found in class ' + classe +
                        ', the class has been disregarded\n')
    return class_pat
|
|
540
|
|
541 ############################ create_ras #######################################
|
|
542
|
47
|
def create_ras (resolve_rules, dataset_name, rules, ids, file):
    """Write the computed scores as a tab-separated table.

    The table has a ``Reactions`` column filled from ``ids`` followed by one
    column per key of ``resolve_rules``. Missing scores (``None``) are
    replaced in place by the string ``'None'`` before writing.

    Args:
        resolve_rules: mapping ``column name -> list of scores``, or ``None``
            when nothing could be computed.
        dataset_name: label used in the warning message.
        rules: unused here, kept for interface compatibility.
        ids: reaction ids, parallel to the score lists.
        file: output path.
    """
    if resolve_rules is None:
        warning("Couldn't generate RAS for current dataset: " + dataset_name)
        # Nothing to write; continuing would fail on None.values().
        return

    for scores in resolve_rules.values():
        for i, score in enumerate(scores):
            if score is None:
                scores[i] = 'None'

    output_ras = pd.DataFrame.from_dict(resolve_rules)
    output_ras.insert(0, 'Reactions', ids)
    output_to_csv = output_ras.to_csv(sep='\t', index=False)

    with open(file, "w") as text_file:
        text_file.write(output_to_csv)
|
|
562
|
|
563 ############################ MAIN #############################################
|
|
564
|
|
def main():
    """Run the tool: load the rules, read the dataset, write the scores.

    Steps: parse the command line; load the built-in rule set (pickled under
    ``<tool_dir>/local``) or parse the custom rule file; read the dataset and
    detect the gene ID type from its first value; compute the score of every
    rule per dataset column and write them to ``--ras_output``; finally
    report the genes that had no value.
    """
    args = process_args(sys.argv)

    # NOTE: pickle executes code on load; these files are expected to be the
    # ones shipped inside the tool directory.
    if args.rules_selector == 'HMRcore':
        with open(args.tool_dir + '/local/HMRcore_rules.p', 'rb') as handle:
            recon = pk.load(handle)
    elif args.rules_selector == 'Recon':
        with open(args.tool_dir + '/local/Recon_rules.p', 'rb') as handle:
            recon = pk.load(handle)
    elif args.rules_selector == 'Custom':
        ids, rules, gene_in_rule = make_recon(args.custom)

    resolve_none = check_bool(args.none)

    name = "RAS Dataset"
    dataset = read_dataset(args.input, "dataset")
    if dataset.empty:
        # A header-only file has no first gene ID to inspect below.
        sys.exit('Execution aborted: wrong format of dataset\n')

    # Replace the whole column rather than assigning through iloc: setting
    # strings into a numeric column in place is an incompatible-dtype
    # assignment in recent pandas versions.
    first_column = dataset.columns[0]
    dataset[first_column] = dataset[first_column].astype(str)

    type_gene = gene_type(dataset.iloc[0, 0], name)

    if args.rules_selector != 'Custom':
        genes = data_gene(dataset, type_gene, name, None)
        ids, rules = load_id_rules(recon.get(type_gene))
    elif args.rules_selector == 'Custom':
        genes = data_gene(dataset, type_gene, name, gene_in_rule)

    resolve_rules, err = resolve(genes, rules, ids, resolve_none, name)

    create_ras(resolve_rules, name, rules, ids, args.ras_output)

    if err is not None and err:
        warning('Warning: gene\n' + str(err) + '\nnot found in class '
                + name + ', the expression level for this gene ' +
                'will be considered NaN\n')

    print('Execution succeded')

    return None
|
|
604
|
|
605 ###############################################################################
|
|
606
|
|
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
|