comparison Marea/marea_cluster.py @ 0:23ac9cf12788 draft

Uploaded
author bimib
date Tue, 06 Nov 2018 03:16:21 -0500
parents
children 5721182715a7
comparison
equal deleted inserted replaced
-1:000000000000 0:23ac9cf12788
1
2 from __future__ import division
3 import os
4 import sys
5 import pandas as pd
6 import collections
7 import pickle as pk
8 import argparse
9 from sklearn.cluster import KMeans
10 import matplotlib.pyplot as plt
11
12 ########################## argparse ###########################################
13
def process_args(args):
    """Parse the command-line options of the clustering tool.

    Args:
        args: argument vector laid out like ``sys.argv`` (program name
            first, options after it). ``None`` lets argparse read
            ``sys.argv`` on its own.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'process some value\'s' +
                                     ' genes to create class.')
    parser.add_argument('-rs', '--rules_selector',
                        type = str,
                        default = 'HMRcore',
                        choices = ['HMRcore', 'Recon', 'Custom'],
                        help = 'chose which type of dataset you want use')
    parser.add_argument('-cr', '--custom',
                        type = str,
                        help='your dataset if you want custom rules')
    parser.add_argument('-ch', '--cond_hier',
                        type = str,
                        default = 'no',
                        choices = ['no', 'yes'],
                        help = 'chose if you wanna hierical dendrogram')
    parser.add_argument('-lk', '--k_min',
                        type = int,
                        help = 'min number of cluster')
    parser.add_argument('-uk', '--k_max',
                        type = int,
                        help = 'max number of cluster')
    parser.add_argument('-li', '--linkage',
                        type = str,
                        choices = ['single', 'complete', 'average'],
                        help='linkage hierarchical cluster')
    parser.add_argument('-d', '--data',
                        type = str,
                        required = True,
                        help = 'input dataset')
    parser.add_argument('-n', '--none',
                        type = str,
                        default = 'true',
                        choices = ['true', 'false'],
                        help = 'compute Nan values')
    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')
    parser.add_argument('-na', '--name',
                        type = str,
                        help = 'name of dataset')
    parser.add_argument('-de', '--dendro',
                        help = "Dendrogram out")
    parser.add_argument('-ol', '--out_log',
                        help = "Output log")
    parser.add_argument('-el', '--elbow',
                        help = "Out elbow")
    # Honour the vector supplied by the caller. Element 0 is the program
    # name and must not be handed to argparse as if it were an option.
    argv = None if args is None else list(args)[1:]
    return parser.parse_args(argv)
65
66 ########################### warning ###########################################
67
def warning(s):
    """Append the message ``s`` to the log file given by ``--out_log``.

    The command line is re-parsed on every call to find the log path.
    """
    log_path = process_args(sys.argv).out_log
    with open(log_path, 'a') as handle:
        handle.write(s)
72
73 ############################ dataset input ####################################
74
def read_dataset(data, name):
    """Load a tab-separated table whose first row is the header.

    Args:
        data: path or file-like object handed to ``pandas.read_csv``.
        name: dataset label used in the abort message.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        SystemExit: if the input is empty or has fewer than two columns.
    """
    def abort():
        # Built lazily so ``name`` is only touched on the failure path.
        sys.exit('Execution aborted: wrong format of '+name+'\n')

    try:
        table = pd.read_csv(data, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        abort()
    if len(table.columns) < 2:
        abort()
    return table
83
84 ############################ dataset name #####################################
85
def name_dataset(name_data, count):
    """Return the dataset label, numbering it when it is the default one.

    The literal name ``'Dataset'`` becomes ``'Dataset_<count>'``; any other
    name is returned unchanged (as a string).
    """
    label = str(name_data)
    if label != 'Dataset':
        return label
    return label + '_' + str(count)
91
92 ############################ load ids and rules ###############################
93
def load_id_rules(reactions):
    """Split a ``{reaction id: rule}`` mapping into two parallel lists.

    Returns:
        Tuple ``(ids, rules)`` in the mapping's iteration order.
    """
    pairs = list(reactions.items())
    ids = [reaction_id for reaction_id, _ in pairs]
    rules = [rule for _, rule in pairs]
    return (ids, rules)
100
101 ############################ check_methods ####################################
102
def gene_type(l, name):
    """Guess which gene identifier scheme the string ``l`` belongs to.

    The checks run in a fixed order (HGNC ID, Ensembl ID, symbol, Entrez
    ID) and the first one that matches wins.

    Args:
        l: a gene identifier taken from the dataset.
        name: dataset label used in the abort message.

    Returns:
        One of ``'hugo_id'``, ``'ensembl_gene_id'``, ``'symbol'`` or
        ``'entrez_id'``.

    Raises:
        SystemExit: if no scheme matches.
    """
    detectors = (
        (check_hgnc, 'hugo_id'),
        (check_ensembl, 'ensembl_gene_id'),
        (check_symbol, 'symbol'),
        (check_entrez, 'entrez_id'),
    )
    for matches, id_type in detectors:
        if matches(l):
            return id_type
    # The space after "Supported ID" keeps the two concatenated pieces from
    # running together as "IDtypes".
    sys.exit('Execution aborted:\n' +
             'gene ID type in ' + name + ' not supported. Supported ID ' +
             'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
116
def check_hgnc(l):
    """Return True if ``l`` looks like ``HGNC:<digits>``.

    The prefix is matched case-insensitively and at least one character
    must follow it.
    """
    return (len(l) > 5
            and l.upper().startswith('HGNC:')
            and l[5:].isdigit())
125
def check_ensembl(l):
    """Return True if ``l`` has the shape of an Ensembl identifier.

    The string must be exactly 15 characters long, start with ``ENS``
    (case-insensitive) and contain only digits from index 4 onwards; the
    character at index 3 is not inspected.
    """
    return (len(l) == 15
            and l.upper().startswith('ENS')
            and l[4:].isdigit())
134
def check_symbol(l):
    """Return True if ``l`` is a letter followed by alphanumeric characters.

    A one-character string is rejected because the (empty) remainder is not
    alphanumeric.
    """
    return len(l) > 0 and l[0].isalpha() and l[1:].isalnum()
143
def check_entrez(l):
    """Return True if ``l`` is a non-empty string made only of digits."""
    if not len(l) > 0:
        return False
    return l.isdigit()
149
def check_bool(b):
    """Translate the strings ``'true'``/``'false'`` into booleans.

    Any other value yields ``None``.
    """
    if b == 'true':
        return True
    return False if b == 'false' else None
155
156 ############################ make recon #######################################
157
def check_and_doWord(l):
    """Tokenise a rule given as a list of characters.

    The list ``l`` is consumed in place. Brackets become single tokens,
    blanks are skipped and every other run of characters becomes one word.

    Returns:
        ``(tokens, genes)`` where ``genes`` holds every word that is not
        ``'or'``/``'and'``, or ``False`` when the brackets are unbalanced
        (a closing bracket comes too early or some bracket is left open).
    """
    tokens = []
    genes = []
    depth = 0
    while l:
        if depth < 0:
            # A ')' appeared before its '(' and input is still pending.
            return False
        head = l.pop(0)
        if head == '(':
            depth += 1
            tokens.append(head)
        elif head == ')':
            depth -= 1
            tokens.append(head)
        elif head != ' ':
            chars = [head]
            while l and l[0] not in (' ', '(', ')'):
                chars.append(l.pop(0))
            word = ''.join(chars)
            tokens.append(word)
            if word not in ('or', 'and'):
                genes.append(word)
    return (tokens, genes) if depth == 0 else False
192
def brackets_to_list(l):
    """Turn a flat token list into nested lists, one per bracket pair.

    The list ``l`` is consumed in place; each ``'('`` hands the following
    tokens to ``resolve_brackets`` and the resulting sub-list replaces the
    bracketed group.
    """
    nested = []
    while l:
        token = l.pop(0)
        nested.append(resolve_brackets(l) if token == '(' else token)
    return nested
203
def resolve_brackets(l):
    """Collect tokens up to the matching ``')'`` into a (nested) list.

    Called right after an opening bracket has been removed from ``l``. The
    tokens of the group, and its closing bracket, are consumed in place;
    anything after the closing bracket is left in ``l``.
    """
    group = []
    while l[0] != ')':
        token = l.pop(0)
        group.append(resolve_brackets(l) if token == '(' else token)
    l.pop(0)  # discard the closing bracket of this group
    return group
215
def priorityAND(l):
    """Regroup a nested rule so that ``and`` binds tighter than ``or``.

    Runs of operands joined by ``and`` are wrapped in their own sub-list as
    soon as an ``or`` is present at the same nesting level; a level made
    only of ``and`` is left flat. Nested lists are processed recursively.
    The input list is not modified.
    """
    def expand(item):
        # Recurse into bracketed groups, pass plain tokens through.
        return priorityAND(item) if isinstance(item, list) else item

    result = []
    only_and = True
    while l:
        if len(l) == 1:
            result.append(expand(l[0]))
            l = l[1:]
        elif l[0] == 'or':
            result.append(l[0])
            only_and = False
            l = l[1:]
        elif l[1] == 'or':
            result.append(expand(l[0]))
            result.append(l[1])
            only_and = False
            l = l[2:]
        elif l[1] == 'and':
            chain = [expand(l[0]), l[1], expand(l[2])]
            l = l[3:]
            while l:
                if l[0] == 'and':
                    chain.append(l[0])
                    chain.append(expand(l[1]))
                    l = l[2:]
                elif l[0] == 'or':
                    only_and = False
                    break
            if only_and:
                # only ANDs in the list so far: keep the chain flat
                result.extend(chain)
            else:
                result.append(chain)
    return result
266
def checkRule(l):
    """Check the shape of a nested rule.

    A rule is accepted when it is a single element (a nested list is
    checked recursively) or when it has more than two elements and
    ``checkRule2`` accepts the sequence. Empty and two-element rules are
    rejected.

    Returns:
        ``True`` if the rule is well formed, ``False`` otherwise.
    """
    size = len(l)
    if size == 1:
        only = l[0]
        return not (isinstance(only, list) and checkRule(only) is False)
    if size > 2:
        return checkRule2(l) is not False
    return False
278
def checkRule2(l):
    """Check that a token sequence alternates operands and operators.

    The sequence is consumed from the left either as
    ``operand operator operand`` (three tokens) or as ``operator operand``
    (two tokens); nested lists in operand position are validated with
    ``checkRule``. The input list is not modified.

    Returns:
        ``True`` when the whole sequence is consumed, ``False`` as soon as
        a malformed piece is met.
    """
    operators = ('and', 'or')
    while l:
        if len(l) == 1:
            return False
        if l[1] in operators:
            # "operand op operand" needs three tokens; a dangling
            # "operand op" pair is malformed, not an indexing error.
            if len(l) < 3:
                return False
            if isinstance(l[0], list) and checkRule(l[0]) is False:
                return False
            if isinstance(l[2], list) and checkRule(l[2]) is False:
                return False
            l = l[3:]
        elif l[0] in operators:
            if isinstance(l[1], list) and checkRule(l[1]) is False:
                return False
            l = l[2:]
        else:
            return False
    return True
303
def do_rules(rules):
    """Parse rule strings into nested lists and collect the genes they use.

    Every rule is tokenised, its brackets are turned into nested lists, the
    result is validated and finally regrouped so that ``and`` takes
    priority over ``or``. Empty rules and rules that fail tokenisation or
    validation yield an empty list; the failing ones are also reported
    through ``warning``.

    Args:
        rules: sequence of rule strings.

    Returns:
        Tuple ``(split_rules, genes)``: one parsed rule per input rule and
        the list of distinct gene names found by the tokeniser.
    """
    split_rules = []
    err_rules = []
    genes_seen = []
    for rule in rules:
        chars = list(rule)
        if not chars:
            split_rules.append([])
            continue
        parsed = check_and_doWord(chars)
        # The tokeniser signals unbalanced brackets with a bare False, so
        # the result must be inspected before it is unpacked.
        if parsed is False:
            split_rules.append([])
            err_rules.append(rule)
            continue
        tokens, genes = parsed
        genes_seen.extend(genes)
        nested = brackets_to_list(tokens)
        if checkRule(nested):
            split_rules.append(priorityAND(nested))
        else:
            split_rules.append([])
            err_rules.append(rule)
    if err_rules:
        warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
    return (split_rules, list(set(genes_seen)))
328
def make_recon(data):
    """Load custom rules from an SBML model or a tab-separated table.

    The file is first read as an SBML model with cobra. If cobra raises
    ``CobraSBMLError`` the file is read as a tab-separated table whose
    first two columns are the reaction IDs and the rules; extra columns
    are ignored with a warning.

    Args:
        data: path of the custom rules file.

    Returns:
        Tuple ``(ids, split_rules, gene_in_rule)`` where ``split_rules`` is
        the output of ``do_rules`` and ``gene_in_rule`` maps every gene
        found in the rules to ``'ok'``.

    Raises:
        SystemExit: if the table is empty, unparsable or has fewer than
            two columns.
    """
    try:
        import cobra as cb
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            recon = cb.io.read_sbml_model(data)
        reactions = recon.reactions
        rules = [reaction.gene_reaction_rule for reaction in reactions]
        ids = [reaction.id for reaction in reactions]
    except cb.io.sbml3.CobraSBMLError:
        try:
            table = pd.read_csv(data, sep = '\t', dtype = str).fillna('')
            n_columns = len(table.columns)
            if n_columns < 2:
                sys.exit('Execution aborted: wrong format of ' +
                         'custom GPR rules\n')
            if n_columns != 2:
                warning('WARNING: more than 2 columns in custom GPR rules.\n' +
                        'Extra columns have been disregarded\n')
            ids = list(table.iloc[:, 0])
            rules = list(table.iloc[:, 1])
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            sys.exit('Execution aborted: wrong format of custom GPR rules\n')
    split_rules, tmp_genes = do_rules(rules)
    gene_in_rule = dict.fromkeys(tmp_genes, 'ok')
    return (ids, split_rules, gene_in_rule)
359
360 ############################ resolve_methods ##################################
361
def replace_gene_value(l, d):
    """Replace every gene of a nested rule with its value from ``d``.

    Args:
        l: nested rule (lists of gene names, operators and sub-lists).
        d: mapping from gene name to expression value.

    Returns:
        Tuple ``(rule, missing)``: the rule with the same nesting where
        each gene is replaced by the result of ``replace_gene``, and the
        list of tokens for which that result was ``None``.
    """
    replaced = []
    missing = []
    for item in l:
        if isinstance(item, list):
            sub_rule, sub_missing = replace_gene_value(item, d)
            replaced.append(sub_rule)
            missing.extend(sub_missing)
            continue
        value = replace_gene(item, d)
        replaced.append(value)
        if value is None:
            missing.append(item)
    return (replaced, missing)
377
def replace_gene(l, d):
    """Return the value of token ``l``, leaving operators untouched.

    Args:
        l: a rule token, i.e. ``'and'``, ``'or'`` or a gene name.
        d: mapping from gene name to expression value.

    Returns:
        The operator itself, the numeric value stored for the gene, or
        ``None`` when the gene is not in ``d``.

    Raises:
        SystemExit: if the stored value is neither ``None`` nor a number.
    """
    if l == 'and' or l == 'or':
        return l
    value = d.get(l, None)
    if not (value is None or isinstance(value, (int, float))):
        # str() keeps the abort message usable for any offending type;
        # concatenating a non-string would raise TypeError instead.
        sys.exit('Execution aborted: ' + str(value) + ' value not valid\n')
    return value
386
def compute(val1, op, val2, cn):
    """Combine two values with a rule operator.

    With both values present, ``'and'`` gives their minimum and any other
    operator their sum. With a value missing (``None``), the one that is
    present is returned; the exception is ``'and'`` when ``cn`` is not
    ``True``, which gives ``None``.

    Args:
        val1: left value or ``None``.
        op: ``'and'`` or ``'or'``.
        val2: right value or ``None``.
        cn: ``True`` to let ``'and'`` ignore a missing operand.
    """
    if val1 is not None and val2 is not None:
        return min(val1, val2) if op == 'and' else val1 + val2
    if op == 'and' and cn is not True:
        return None
    return val1 if val1 is not None else val2
410
def control(ris, l, cn):
    """Evaluate a nested rule whose genes were replaced by values.

    A single number or ``None`` is returned as is, a single nested list is
    evaluated recursively and a sequence of more than two elements is
    delegated to ``control_list``.

    Args:
        ris: running result forwarded to ``control_list``.
        l: rule to evaluate.
        cn: flag forwarded to ``compute``.

    Returns:
        The score (possibly ``None``), or ``False`` for a malformed rule.
    """
    size = len(l)
    if size > 2:
        return control_list(ris, l, cn)
    if size != 1:
        return False
    only = l[0]
    if isinstance(only, (float, int)) or only == None:
        return only
    if isinstance(only, list):
        return control(None, only, cn)
    return False
423
def control_list(ris, l, cn):
    """Evaluate a flat ``operand operator operand ...`` sequence.

    The sequence is consumed from the left. A leading
    ``operand op operand`` triple starts a new result, while an
    ``op operand`` pair combines the running result with the operand.
    Operands are numbers, ``None`` or nested lists (evaluated with
    ``control``); values are combined with ``compute``.

    Args:
        ris: running result used by ``op operand`` pairs.
        l: sequence to evaluate (not modified).
        cn: flag forwarded to ``compute`` and ``control``.

    Returns:
        The final result, or ``False`` when the sequence is malformed or a
        nested evaluation fails.
    """
    operators = ['and', 'or']

    def is_score(item):
        return isinstance(item, (float, int)) or item == None

    while l:
        if len(l) == 1:
            return False
        head = l[0]
        if is_score(head) and l[1] in operators:
            right = l[2]
            if isinstance(right, list):
                right = control(None, right, cn)
                if right is False:
                    return False
            elif not is_score(right):
                return False
            ris = compute(head, l[1], right, cn)
            l = l[3:]
        elif head in operators:
            right = l[1]
            if isinstance(right, list):
                right = control(None, right, cn)
                if right is False:
                    return False
            elif not is_score(right):
                return False
            ris = compute(ris, head, right, cn)
            l = l[2:]
        elif isinstance(head, list) and l[1] in operators:
            right = l[2]
            if is_score(right):
                left = control(None, head, cn)
                if left is False:
                    return False
            elif isinstance(right, list):
                left = control(None, head, cn)
                right = control(None, right, cn)
                if left is False or right is False:
                    return False
            else:
                return False
            ris = compute(left, l[1], right, cn)
            l = l[3:]
        else:
            return False
    return ris
473
474 ############################ gene #############################################
475
def data_gene(gene, type_gene, name, gene_custom):
    """Validate an expression table and convert it to nested dictionaries.

    Blanks around the gene IDs in the first column are stripped in place.
    The run is aborted when a gene ID used by the selected rule set occurs
    more than once, or when two columns share the same label.

    Args:
        gene: ``pandas.DataFrame`` whose first column holds the gene IDs
            as strings.
        type_gene: identifier scheme used as key into the pickled gene
            sets of the built-in rule sets.
        name: dataset label used in the abort messages.
        gene_custom: mapping of the genes used by custom rules, or
            ``None`` to load the gene set of the built-in rules chosen on
            the command line.

    Returns:
        ``gene.set_index(<first column>).to_dict()``: one dictionary per
        remaining column, keyed by gene ID.

    Raises:
        SystemExit: on duplicated relevant gene IDs or column labels.
    """
    args = process_args(sys.argv)
    for i in range(len(gene)):
        tmp = gene.iloc[i, 0]
        if tmp.startswith(' ') or tmp.endswith(' '):
            gene.iloc[i, 0] = tmp.strip()
    gene_dup = [item for item, count in
                collections.Counter(gene[gene.columns[0]]).items()
                if count > 1]
    pat_dup = [item for item, count in
               collections.Counter(list(gene.columns)).items() if count > 1]
    if gene_dup:
        if gene_custom is None:
            # Context managers close the pickle files once they are read.
            if args.rules_selector == 'HMRcore':
                with open(args.tool_dir +
                          '/local/HMRcore_genes.p', 'rb') as handle:
                    gene_in_rule = pk.load(handle)
            elif args.rules_selector == 'Recon':
                with open(args.tool_dir +
                          '/local/Recon_genes.p', 'rb') as handle:
                    gene_in_rule = pk.load(handle)
            gene_in_rule = gene_in_rule.get(type_gene)
        else:
            gene_in_rule = gene_custom
        # Only duplicates of genes that the rules actually use are fatal.
        used_dup = [item for item in gene_dup
                    if gene_in_rule.get(item) == 'ok']
        if used_dup:
            sys.exit('Execution aborted because gene ID '
                     + str(used_dup) + ' in ' + name + ' is duplicated\n')
    if pat_dup:
        sys.exit('Execution aborted: duplicated label\n'
                 + str(pat_dup) + 'in ' + name + '\n')
    return (gene.set_index(gene.columns[0])).to_dict()
508
509 ############################ resolve ##########################################
510
def resolve(genes, rules, ids, resolve_none, name):
    """Score every rule for every entry of ``genes``.

    Args:
        genes: mapping ``{key: {gene: value}}``; one score list is
            produced per key.
        rules: parsed rules (nested lists); empty rules score ``None``.
        ids: reaction identifiers (not used by the computation).
        resolve_none: flag forwarded to ``control``.
        name: dataset label used in the abort message.

    Returns:
        Tuple ``(resolve_rules, not_found)``: the scores per key (``None``
        where a rule could not be computed) and the distinct genes that
        had no value.

    Raises:
        SystemExit: if not a single score could be computed.
    """
    resolve_rules = {}
    not_found = []
    any_score = False
    for key, value in genes.items():
        scores = []
        for rule in rules:
            if not rule:
                scores.append(None)
                continue
            replaced, missing = replace_gene_value(rule, value)
            if missing:
                not_found.extend(missing)
            ris = control(None, replaced, resolve_none)
            if ris is False or ris is None:
                scores.append(None)
            else:
                scores.append(ris)
                any_score = True
        resolve_rules[key] = scores
    if not any_score:
        sys.exit('Execution aborted: no computable score' +
                 ' (due to missing gene values) for class '
                 + name + ', the class has been disregarded\n')
    return (resolve_rules, list(set(not_found)))
537
538 ################################# clustering ##################################
539
def f_cluster(resolve_rules):
    """Cluster the samples with k-means and write the result files.

    For every ``k`` between ``--k_min`` and ``--k_max`` a k-means model is
    fitted and the assignment of each sample is written to
    ``cluster_out/K=<k>_<name>.tsv``. The inertia of each model is plotted
    to the ``--elbow`` file and, when ``--cond_hier`` is ``'yes'``, a
    hierarchical dendrogram is written to the ``--dendro`` file.

    Args:
        resolve_rules: mapping ``{sample: [score, ...]}``; samples become
            the rows and rules the columns of the clustered table.
    """
    # Tolerate an output directory left over from a previous run.
    if not os.path.isdir('cluster_out'):
        os.makedirs('cluster_out')
    args = process_args(sys.argv)
    cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index')
    # Drop the rules whose first sample has no score. ``iloc`` keeps the
    # lookup positional whatever the sample labels are.
    unscored = [col for col in cluster_data.columns
                if cluster_data[col].iloc[0] is None]
    if unscored:
        cluster_data = cluster_data.drop(columns = unscored)
    distorsion = []
    for i in range(args.k_min, args.k_max+1):
        tmp_kmeans = KMeans(n_clusters = i,
                            n_init = 100,
                            max_iter = 300,
                            random_state = 0).fit(cluster_data)
        distorsion.append(tmp_kmeans.inertia_)
        # Cluster labels are reported starting from 1.
        predict = [x+1 for x in tmp_kmeans.predict(cluster_data)]
        classe = pd.DataFrame(list(zip(cluster_data.index,
                                       predict))).astype(str)
        dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv'
        classe.to_csv(dest, sep = '\t', index = False,
                      header = ['Patient_ID', 'Class'])
    plt.figure(0)
    plt.plot(range(args.k_min, args.k_max+1), distorsion, marker = 'o')
    plt.xlabel('Number of cluster')
    plt.ylabel('Distorsion')
    plt.savefig(args.elbow, dpi = 240, format = 'pdf')
    if args.cond_hier == 'yes':
        import scipy.cluster.hierarchy as hier
        lin = hier.linkage(cluster_data, args.linkage)
        plt.figure(1, figsize=(10, 5))
        hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index)
        plt.savefig(args.dendro, dpi = 480, format = 'pdf')
    return None
574
575 ################################# main ########################################
576
def main():
    """Run the pipeline: load rules, score the dataset, cluster, log.

    Raises:
        SystemExit: if ``--k_min`` is greater than ``--k_max`` (other
            aborts come from the helpers that are called).
    """
    args = process_args(sys.argv)
    if args.k_min > args.k_max:
        sys.exit('Execution aborted: min cluster > max cluster')
    # Context managers close the pickle files once they are read.
    if args.rules_selector == 'HMRcore':
        with open(args.tool_dir + '/local/HMRcore_rules.p', 'rb') as handle:
            recon = pk.load(handle)
    elif args.rules_selector == 'Recon':
        with open(args.tool_dir + '/local/Recon_rules.p', 'rb') as handle:
            recon = pk.load(handle)
    elif args.rules_selector == 'Custom':
        ids, rules, gene_in_rule = make_recon(args.custom)
    resolve_none = check_bool(args.none)
    dataset = read_dataset(args.data, args.name)
    dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)
    # The ID scheme is guessed from the first gene of the dataset.
    type_gene = gene_type(dataset.iloc[0, 0], args.name)
    if args.rules_selector != 'Custom':
        genes = data_gene(dataset, type_gene, args.name, None)
        ids, rules = load_id_rules(recon.get(type_gene))
    elif args.rules_selector == 'Custom':
        genes = data_gene(dataset, type_gene, args.name, gene_in_rule)
    resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name)
    if err:
        warning('WARNING: gene\n' + str(err) + '\nnot found in class '
                + args.name + ', the expression level for this gene ' +
                'will be considered NaN\n')
    f_cluster(resolve_rules)
    warning('Execution succeeded')
    return None
604
605 ###############################################################################
606
# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()