Mercurial > repos > bimib > marea
comparison Marea/marea_cluster.py @ 0:23ac9cf12788 draft
Uploaded
author | bimib |
---|---|
date | Tue, 06 Nov 2018 03:16:21 -0500 |
parents | |
children | 5721182715a7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:23ac9cf12788 |
---|---|
1 | |
2 from __future__ import division | |
3 import os | |
4 import sys | |
5 import pandas as pd | |
6 import collections | |
7 import pickle as pk | |
8 import argparse | |
9 from sklearn.cluster import KMeans | |
10 import matplotlib.pyplot as plt | |
11 | |
12 ########################## argparse ########################################### | |
13 | |
14 def process_args(args): | |
15 parser = argparse.ArgumentParser(usage = '%(prog)s [options]', | |
16 description = 'process some value\'s' + | |
17 ' genes to create class.') | |
18 parser.add_argument('-rs', '--rules_selector', | |
19 type = str, | |
20 default = 'HMRcore', | |
21 choices = ['HMRcore', 'Recon', 'Custom'], | |
22 help = 'chose which type of dataset you want use') | |
23 parser.add_argument('-cr', '--custom', | |
24 type = str, | |
25 help='your dataset if you want custom rules') | |
26 parser.add_argument('-ch', '--cond_hier', | |
27 type = str, | |
28 default = 'no', | |
29 choices = ['no', 'yes'], | |
30 help = 'chose if you wanna hierical dendrogram') | |
31 parser.add_argument('-lk', '--k_min', | |
32 type = int, | |
33 help = 'min number of cluster') | |
34 parser.add_argument('-uk', '--k_max', | |
35 type = int, | |
36 help = 'max number of cluster') | |
37 parser.add_argument('-li', '--linkage', | |
38 type = str, | |
39 choices = ['single', 'complete', 'average'], | |
40 help='linkage hierarchical cluster') | |
41 parser.add_argument('-d', '--data', | |
42 type = str, | |
43 required = True, | |
44 help = 'input dataset') | |
45 parser.add_argument('-n', '--none', | |
46 type = str, | |
47 default = 'true', | |
48 choices = ['true', 'false'], | |
49 help = 'compute Nan values') | |
50 parser.add_argument('-td', '--tool_dir', | |
51 type = str, | |
52 required = True, | |
53 help = 'your tool directory') | |
54 parser.add_argument('-na', '--name', | |
55 type = str, | |
56 help = 'name of dataset') | |
57 parser.add_argument('-de', '--dendro', | |
58 help = "Dendrogram out") | |
59 parser.add_argument('-ol', '--out_log', | |
60 help = "Output log") | |
61 parser.add_argument('-el', '--elbow', | |
62 help = "Out elbow") | |
63 args = parser.parse_args() | |
64 return args | |
65 | |
66 ########################### warning ########################################### | |
67 | |
68 def warning(s): | |
69 args = process_args(sys.argv) | |
70 with open(args.out_log, 'a') as log: | |
71 log.write(s) | |
72 | |
73 ############################ dataset input #################################### | |
74 | |
75 def read_dataset(data, name): | |
76 try: | |
77 dataset = pd.read_csv(data, sep = '\t', header = 0) | |
78 except pd.errors.EmptyDataError: | |
79 sys.exit('Execution aborted: wrong format of '+name+'\n') | |
80 if len(dataset.columns) < 2: | |
81 sys.exit('Execution aborted: wrong format of '+name+'\n') | |
82 return dataset | |
83 | |
84 ############################ dataset name ##################################### | |
85 | |
86 def name_dataset(name_data, count): | |
87 if str(name_data) == 'Dataset': | |
88 return str(name_data) + '_' + str(count) | |
89 else: | |
90 return str(name_data) | |
91 | |
92 ############################ load id e rules ################################## | |
93 | |
94 def load_id_rules(reactions): | |
95 ids, rules = [], [] | |
96 for key, value in reactions.items(): | |
97 ids.append(key) | |
98 rules.append(value) | |
99 return (ids, rules) | |
100 | |
101 ############################ check_methods #################################### | |
102 | |
103 def gene_type(l, name): | |
104 if check_hgnc(l): | |
105 return 'hugo_id' | |
106 elif check_ensembl(l): | |
107 return 'ensembl_gene_id' | |
108 elif check_symbol(l): | |
109 return 'symbol' | |
110 elif check_entrez(l): | |
111 return 'entrez_id' | |
112 else: | |
113 sys.exit('Execution aborted:\n' + | |
114 'gene ID type in ' + name + ' not supported. Supported ID' + | |
115 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n') | |
116 | |
117 def check_hgnc(l): | |
118 if len(l) > 5: | |
119 if (l.upper()).startswith('HGNC:'): | |
120 return l[5:].isdigit() | |
121 else: | |
122 return False | |
123 else: | |
124 return False | |
125 | |
126 def check_ensembl(l): | |
127 if len(l) == 15: | |
128 if (l.upper()).startswith('ENS'): | |
129 return l[4:].isdigit() | |
130 else: | |
131 return False | |
132 else: | |
133 return False | |
134 | |
135 def check_symbol(l): | |
136 if len(l) > 0: | |
137 if l[0].isalpha() and l[1:].isalnum(): | |
138 return True | |
139 else: | |
140 return False | |
141 else: | |
142 return False | |
143 | |
144 def check_entrez(l): | |
145 if len(l) > 0: | |
146 return l.isdigit() | |
147 else: | |
148 return False | |
149 | |
150 def check_bool(b): | |
151 if b == 'true': | |
152 return True | |
153 elif b == 'false': | |
154 return False | |
155 | |
156 ############################ make recon ####################################### | |
157 | |
158 def check_and_doWord(l): | |
159 tmp = [] | |
160 tmp_genes = [] | |
161 count = 0 | |
162 while l: | |
163 if count >= 0: | |
164 if l[0] == '(': | |
165 count += 1 | |
166 tmp.append(l[0]) | |
167 l.pop(0) | |
168 elif l[0] == ')': | |
169 count -= 1 | |
170 tmp.append(l[0]) | |
171 l.pop(0) | |
172 elif l[0] == ' ': | |
173 l.pop(0) | |
174 else: | |
175 word = [] | |
176 while l: | |
177 if l[0] in [' ', '(', ')']: | |
178 break | |
179 else: | |
180 word.append(l[0]) | |
181 l.pop(0) | |
182 word = ''.join(word) | |
183 tmp.append(word) | |
184 if not(word in ['or', 'and']): | |
185 tmp_genes.append(word) | |
186 else: | |
187 return False | |
188 if count == 0: | |
189 return (tmp, tmp_genes) | |
190 else: | |
191 return False | |
192 | |
193 def brackets_to_list(l): | |
194 tmp = [] | |
195 while l: | |
196 if l[0] == '(': | |
197 l.pop(0) | |
198 tmp.append(resolve_brackets(l)) | |
199 else: | |
200 tmp.append(l[0]) | |
201 l.pop(0) | |
202 return tmp | |
203 | |
204 def resolve_brackets(l): | |
205 tmp = [] | |
206 while l[0] != ')': | |
207 if l[0] == '(': | |
208 l.pop(0) | |
209 tmp.append(resolve_brackets(l)) | |
210 else: | |
211 tmp.append(l[0]) | |
212 l.pop(0) | |
213 l.pop(0) | |
214 return tmp | |
215 | |
216 def priorityAND(l): | |
217 tmp = [] | |
218 flag = True | |
219 while l: | |
220 if len(l) == 1: | |
221 if isinstance(l[0], list): | |
222 tmp.append(priorityAND(l[0])) | |
223 else: | |
224 tmp.append(l[0]) | |
225 l = l[1:] | |
226 elif l[0] == 'or': | |
227 tmp.append(l[0]) | |
228 flag = False | |
229 l = l[1:] | |
230 elif l[1] == 'or': | |
231 if isinstance(l[0], list): | |
232 tmp.append(priorityAND(l[0])) | |
233 else: | |
234 tmp.append(l[0]) | |
235 tmp.append(l[1]) | |
236 flag = False | |
237 l = l[2:] | |
238 elif l[1] == 'and': | |
239 tmpAnd = [] | |
240 if isinstance(l[0], list): | |
241 tmpAnd.append(priorityAND(l[0])) | |
242 else: | |
243 tmpAnd.append(l[0]) | |
244 tmpAnd.append(l[1]) | |
245 if isinstance(l[2], list): | |
246 tmpAnd.append(priorityAND(l[2])) | |
247 else: | |
248 tmpAnd.append(l[2]) | |
249 l = l[3:] | |
250 while l: | |
251 if l[0] == 'and': | |
252 tmpAnd.append(l[0]) | |
253 if isinstance(l[1], list): | |
254 tmpAnd.append(priorityAND(l[1])) | |
255 else: | |
256 tmpAnd.append(l[1]) | |
257 l = l[2:] | |
258 elif l[0] == 'or': | |
259 flag = False | |
260 break | |
261 if flag == True: #se ci sono solo AND nella lista | |
262 tmp.extend(tmpAnd) | |
263 elif flag == False: | |
264 tmp.append(tmpAnd) | |
265 return tmp | |
266 | |
267 def checkRule(l): | |
268 if len(l) == 1: | |
269 if isinstance(l[0], list): | |
270 if checkRule(l[0]) is False: | |
271 return False | |
272 elif len(l) > 2: | |
273 if checkRule2(l) is False: | |
274 return False | |
275 else: | |
276 return False | |
277 return True | |
278 | |
279 def checkRule2(l): | |
280 while l: | |
281 if len(l) == 1: | |
282 return False | |
283 elif isinstance(l[0], list) and l[1] in ['and', 'or']: | |
284 if checkRule(l[0]) is False: | |
285 return False | |
286 if isinstance(l[2], list): | |
287 if checkRule(l[2]) is False: | |
288 return False | |
289 l = l[3:] | |
290 elif l[1] in ['and', 'or']: | |
291 if isinstance(l[2], list): | |
292 if checkRule(l[2]) is False: | |
293 return False | |
294 l = l[3:] | |
295 elif l[0] in ['and', 'or']: | |
296 if isinstance(l[1], list): | |
297 if checkRule(l[1]) is False: | |
298 return False | |
299 l = l[2:] | |
300 else: | |
301 return False | |
302 return True | |
303 | |
304 def do_rules(rules): | |
305 split_rules = [] | |
306 err_rules = [] | |
307 tmp_gene_in_rule = [] | |
308 for i in range(len(rules)): | |
309 tmp = list(rules[i]) | |
310 if tmp: | |
311 tmp, tmp_genes = check_and_doWord(tmp) | |
312 tmp_gene_in_rule.extend(tmp_genes) | |
313 if tmp is False: | |
314 split_rules.append([]) | |
315 err_rules.append(rules[i]) | |
316 else: | |
317 tmp = brackets_to_list(tmp) | |
318 if checkRule(tmp): | |
319 split_rules.append(priorityAND(tmp)) | |
320 else: | |
321 split_rules.append([]) | |
322 err_rules.append(rules[i]) | |
323 else: | |
324 split_rules.append([]) | |
325 if err_rules: | |
326 warning('Warning: wrong format rule in ' + str(err_rules) + '\n') | |
327 return (split_rules, list(set(tmp_gene_in_rule))) | |
328 | |
329 def make_recon(data): | |
330 try: | |
331 import cobra as cb | |
332 import warnings | |
333 with warnings.catch_warnings(): | |
334 warnings.simplefilter('ignore') | |
335 recon = cb.io.read_sbml_model(data) | |
336 react = recon.reactions | |
337 rules = [react[i].gene_reaction_rule for i in range(len(react))] | |
338 ids = [react[i].id for i in range(len(react))] | |
339 except cb.io.sbml3.CobraSBMLError: | |
340 try: | |
341 data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('') | |
342 if len(data.columns) < 2: | |
343 sys.exit('Execution aborted: wrong format of ' + | |
344 'custom GPR rules\n') | |
345 if not len(data.columns) == 2: | |
346 warning('WARNING: more than 2 columns in custom GPR rules.\n' + | |
347 'Extra columns have been disregarded\n') | |
348 ids = list(data.iloc[:, 0]) | |
349 rules = list(data.iloc[:, 1]) | |
350 except pd.errors.EmptyDataError: | |
351 sys.exit('Execution aborted: wrong format of custom GPR rules\n') | |
352 except pd.errors.ParserError: | |
353 sys.exit('Execution aborted: wrong format of custom GPR rules\n') | |
354 split_rules, tmp_genes = do_rules(rules) | |
355 gene_in_rule = {} | |
356 for i in tmp_genes: | |
357 gene_in_rule[i] = 'ok' | |
358 return (ids, split_rules, gene_in_rule) | |
359 | |
360 ############################ resolve_methods ################################## | |
361 | |
362 def replace_gene_value(l, d): | |
363 tmp = [] | |
364 err = [] | |
365 while l: | |
366 if isinstance(l[0], list): | |
367 tmp_rules, tmp_err = replace_gene_value(l[0], d) | |
368 tmp.append(tmp_rules) | |
369 err.extend(tmp_err) | |
370 else: | |
371 value = replace_gene(l[0],d) | |
372 tmp.append(value) | |
373 if value == None: | |
374 err.append(l[0]) | |
375 l = l[1:] | |
376 return (tmp, err) | |
377 | |
378 def replace_gene(l, d): | |
379 if l =='and' or l == 'or': | |
380 return l | |
381 else: | |
382 value = d.get(l, None) | |
383 if not(value == None or isinstance(value, (int, float))): | |
384 sys.exit('Execution aborted: ' + value + ' value not valid\n') | |
385 return value | |
386 | |
387 def compute(val1, op, val2, cn): | |
388 if val1 != None and val2 != None: | |
389 if op == 'and': | |
390 return min(val1, val2) | |
391 else: | |
392 return val1 + val2 | |
393 elif op == 'and': | |
394 if cn is True: | |
395 if val1 != None: | |
396 return val1 | |
397 elif val2 != None: | |
398 return val2 | |
399 else: | |
400 return None | |
401 else: | |
402 return None | |
403 else: | |
404 if val1 != None: | |
405 return val1 | |
406 elif val2 != None: | |
407 return val2 | |
408 else: | |
409 return None | |
410 | |
411 def control(ris, l, cn): | |
412 if len(l) == 1: | |
413 if isinstance(l[0], (float, int)) or l[0] == None: | |
414 return l[0] | |
415 elif isinstance(l[0], list): | |
416 return control(None, l[0], cn) | |
417 else: | |
418 return False | |
419 elif len(l) > 2: | |
420 return control_list(ris, l, cn) | |
421 else: | |
422 return False | |
423 | |
424 def control_list(ris, l, cn): | |
425 while l: | |
426 if len(l) == 1: | |
427 return False | |
428 elif (isinstance(l[0], (float, int)) or | |
429 l[0] == None) and l[1] in ['and', 'or']: | |
430 if isinstance(l[2], (float, int)) or l[2] == None: | |
431 ris = compute(l[0], l[1], l[2], cn) | |
432 elif isinstance(l[2], list): | |
433 tmp = control(None, l[2], cn) | |
434 if tmp is False: | |
435 return False | |
436 else: | |
437 ris = compute(l[0], l[1], tmp, cn) | |
438 else: | |
439 return False | |
440 l = l[3:] | |
441 elif l[0] in ['and', 'or']: | |
442 if isinstance(l[1], (float, int)) or l[1] == None: | |
443 ris = compute(ris, l[0], l[1], cn) | |
444 elif isinstance(l[1], list): | |
445 tmp = control(None,l[1], cn) | |
446 if tmp is False: | |
447 return False | |
448 else: | |
449 ris = compute(ris, l[0], tmp, cn) | |
450 else: | |
451 return False | |
452 l = l[2:] | |
453 elif isinstance(l[0], list) and l[1] in ['and', 'or']: | |
454 if isinstance(l[2], (float, int)) or l[2] == None: | |
455 tmp = control(None, l[0], cn) | |
456 if tmp is False: | |
457 return False | |
458 else: | |
459 ris = compute(tmp, l[1], l[2], cn) | |
460 elif isinstance(l[2], list): | |
461 tmp = control(None, l[0], cn) | |
462 tmp2 = control(None, l[2], cn) | |
463 if tmp is False or tmp2 is False: | |
464 return False | |
465 else: | |
466 ris = compute(tmp, l[1], tmp2, cn) | |
467 else: | |
468 return False | |
469 l = l[3:] | |
470 else: | |
471 return False | |
472 return ris | |
473 | |
474 ############################ gene ############################################# | |
475 | |
476 def data_gene(gene, type_gene, name, gene_custom): | |
477 args = process_args(sys.argv) | |
478 for i in range(len(gene)): | |
479 tmp = gene.iloc[i, 0] | |
480 if tmp.startswith(' ') or tmp.endswith(' '): | |
481 gene.iloc[i, 0] = (tmp.lstrip()).rstrip() | |
482 gene_dup = [item for item, count in | |
483 collections.Counter(gene[gene.columns[0]]).items() if count > 1] | |
484 pat_dup = [item for item, count in | |
485 collections.Counter(list(gene.columns)).items() if count > 1] | |
486 if gene_dup: | |
487 if gene_custom == None: | |
488 if args.rules_selector == 'HMRcore': | |
489 gene_in_rule = pk.load(open(args.tool_dir + | |
490 '/local/HMRcore_genes.p', 'rb')) | |
491 elif args.rules_selector == 'Recon': | |
492 gene_in_rule = pk.load(open(args.tool_dir + | |
493 '/local/Recon_genes.p', 'rb')) | |
494 gene_in_rule = gene_in_rule.get(type_gene) | |
495 else: | |
496 gene_in_rule = gene_custom | |
497 tmp = [] | |
498 for i in gene_dup: | |
499 if gene_in_rule.get(i) == 'ok': | |
500 tmp.append(i) | |
501 if tmp: | |
502 sys.exit('Execution aborted because gene ID ' | |
503 + str(tmp) + ' in ' + name + ' is duplicated\n') | |
504 if pat_dup: | |
505 sys.exit('Execution aborted: duplicated label\n' | |
506 + str(pat_dup) + 'in ' + name + '\n') | |
507 return (gene.set_index(gene.columns[0])).to_dict() | |
508 | |
509 ############################ resolve ########################################## | |
510 | |
511 def resolve(genes, rules, ids, resolve_none, name): | |
512 resolve_rules = {} | |
513 not_found = [] | |
514 flag = False | |
515 for key, value in genes.items(): | |
516 tmp_resolve = [] | |
517 for i in range(len(rules)): | |
518 tmp = rules[i] | |
519 if tmp: | |
520 tmp, err = replace_gene_value(tmp, value) | |
521 if err: | |
522 not_found.extend(err) | |
523 ris = control(None, tmp, resolve_none) | |
524 if ris is False or ris == None: | |
525 tmp_resolve.append(None) | |
526 else: | |
527 tmp_resolve.append(ris) | |
528 flag = True | |
529 else: | |
530 tmp_resolve.append(None) | |
531 resolve_rules[key] = tmp_resolve | |
532 if flag is False: | |
533 sys.exit('Execution aborted: no computable score' + | |
534 ' (due to missing gene values) for class ' | |
535 + name + ', the class has been disregarded\n') | |
536 return (resolve_rules, list(set(not_found))) | |
537 | |
538 ################################# clustering ################################## | |
539 | |
540 def f_cluster(resolve_rules): | |
541 os.makedirs('cluster_out') | |
542 args = process_args(sys.argv) | |
543 cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index') | |
544 for i in cluster_data.columns: | |
545 tmp = cluster_data[i][0] | |
546 if tmp == None: | |
547 cluster_data = cluster_data.drop(columns=[i]) | |
548 distorsion = [] | |
549 for i in range(args.k_min, args.k_max+1): | |
550 tmp_kmeans = KMeans(n_clusters = i, | |
551 n_init = 100, | |
552 max_iter = 300, | |
553 random_state = 0).fit(cluster_data) | |
554 distorsion.append(tmp_kmeans.inertia_) | |
555 predict = tmp_kmeans.predict(cluster_data) | |
556 predict = [x+1 for x in predict] | |
557 classe = (pd.DataFrame(zip(cluster_data.index, predict))).astype(str) | |
558 dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv' | |
559 classe.to_csv(dest, sep = '\t', index = False, | |
560 header = ['Patient_ID', 'Class']) | |
561 plt.figure(0) | |
562 plt.plot(range(args.k_min, args.k_max+1), distorsion, marker = 'o') | |
563 plt.xlabel('Number of cluster') | |
564 plt.ylabel('Distorsion') | |
565 plt.savefig(args.elbow, dpi = 240, format = 'pdf') | |
566 if args.cond_hier == 'yes': | |
567 import scipy.cluster.hierarchy as hier | |
568 lin = hier.linkage(cluster_data, args.linkage) | |
569 plt.figure(1) | |
570 plt.figure(figsize=(10, 5)) | |
571 hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index) | |
572 plt.savefig(args.dendro, dpi = 480, format = 'pdf') | |
573 return None | |
574 | |
575 ################################# main ######################################## | |
576 | |
577 def main(): | |
578 args = process_args(sys.argv) | |
579 if args.k_min > args.k_max: | |
580 sys.exit('Execution aborted: max cluster > min cluster') | |
581 if args.rules_selector == 'HMRcore': | |
582 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) | |
583 elif args.rules_selector == 'Recon': | |
584 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) | |
585 elif args.rules_selector == 'Custom': | |
586 ids, rules, gene_in_rule = make_recon(args.custom) | |
587 resolve_none = check_bool(args.none) | |
588 dataset = read_dataset(args.data, args.name) | |
589 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | |
590 type_gene = gene_type(dataset.iloc[0, 0], args.name) | |
591 if args.rules_selector != 'Custom': | |
592 genes = data_gene(dataset, type_gene, args.name, None) | |
593 ids, rules = load_id_rules(recon.get(type_gene)) | |
594 elif args.rules_selector == 'Custom': | |
595 genes = data_gene(dataset, type_gene, args.name, gene_in_rule) | |
596 resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name) | |
597 if err: | |
598 warning('WARNING: gene\n' + str(err) + '\nnot found in class ' | |
599 + args.name + ', the expression level for this gene ' + | |
600 'will be considered NaN\n') | |
601 f_cluster(resolve_rules) | |
602 warning('Execution succeeded') | |
603 return None | |
604 | |
605 ############################################################################### | |
606 | |
607 if __name__ == "__main__": | |
608 main() |