Mercurial > repos > bimib > marea
view Marea/marea_cluster.py @ 13:e96f3b85e5a0 draft
Uploaded
author | bimib |
---|---|
date | Wed, 13 Feb 2019 05:42:20 -0500 |
parents | 3d77287caf22 |
children | 1a0c8c2780f2 |
line wrap: on
line source
from __future__ import division import os import sys import pandas as pd import collections import pickle as pk import argparse from sklearn.cluster import KMeans import matplotlib matplotlib.use('GTKAgg') import matplotlib.pyplot as plt ########################## argparse ########################################### def process_args(args): parser = argparse.ArgumentParser(usage = '%(prog)s [options]', description = 'process some value\'s' + ' genes to create class.') parser.add_argument('-rs', '--rules_selector', type = str, default = 'HMRcore', choices = ['HMRcore', 'Recon', 'Custom'], help = 'chose which type of dataset you want use') parser.add_argument('-cr', '--custom', type = str, help='your dataset if you want custom rules') parser.add_argument('-ch', '--cond_hier', type = str, default = 'no', choices = ['no', 'yes'], help = 'chose if you wanna hierical dendrogram') parser.add_argument('-lk', '--k_min', type = int, help = 'min number of cluster') parser.add_argument('-uk', '--k_max', type = int, help = 'max number of cluster') parser.add_argument('-li', '--linkage', type = str, choices = ['single', 'complete', 'average'], help='linkage hierarchical cluster') parser.add_argument('-d', '--data', type = str, required = True, help = 'input dataset') parser.add_argument('-n', '--none', type = str, default = 'true', choices = ['true', 'false'], help = 'compute Nan values') parser.add_argument('-td', '--tool_dir', type = str, required = True, help = 'your tool directory') parser.add_argument('-na', '--name', type = str, help = 'name of dataset') parser.add_argument('-de', '--dendro', help = "Dendrogram out") parser.add_argument('-ol', '--out_log', help = "Output log") parser.add_argument('-el', '--elbow', help = "Out elbow") args = parser.parse_args() return args ########################### warning ########################################### def warning(s): args = process_args(sys.argv) with open(args.out_log, 'a') as log: log.write(s) ############################ dataset input #################################### def read_dataset(data, name): try: dataset = pd.read_csv(data, sep = '\t', header = 0) except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of '+name+'\n') if len(dataset.columns) < 2: sys.exit('Execution aborted: wrong format of '+name+'\n') return dataset ############################ dataset name ##################################### def name_dataset(name_data, count): if str(name_data) == 'Dataset': return str(name_data) + '_' + str(count) else: return str(name_data) ############################ load id e rules ################################## def load_id_rules(reactions): ids, rules = [], [] for key, value in reactions.items(): ids.append(key) rules.append(value) return (ids, rules) ############################ check_methods #################################### def gene_type(l, name): if check_hgnc(l): return 'hugo_id' elif check_ensembl(l): return 'ensembl_gene_id' elif check_symbol(l): return 'symbol' elif check_entrez(l): return 'entrez_id' else: sys.exit('Execution aborted:\n' + 'gene ID type in ' + name + ' not supported. Supported ID' + 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n') def check_hgnc(l): if len(l) > 5: if (l.upper()).startswith('HGNC:'): return l[5:].isdigit() else: return False else: return False def check_ensembl(l): if len(l) == 15: if (l.upper()).startswith('ENS'): return l[4:].isdigit() else: return False else: return False def check_symbol(l): if len(l) > 0: if l[0].isalpha() and l[1:].isalnum(): return True else: return False else: return False def check_entrez(l): if len(l) > 0: return l.isdigit() else: return False def check_bool(b): if b == 'true': return True elif b == 'false': return False ############################ make recon ####################################### def check_and_doWord(l): tmp = [] tmp_genes = [] count = 0 while l: if count >= 0: if l[0] == '(': count += 1 tmp.append(l[0]) l.pop(0) elif l[0] == ')': count -= 1 tmp.append(l[0]) l.pop(0) elif l[0] == ' ': l.pop(0) else: word = [] while l: if l[0] in [' ', '(', ')']: break else: word.append(l[0]) l.pop(0) word = ''.join(word) tmp.append(word) if not(word in ['or', 'and']): tmp_genes.append(word) else: return False if count == 0: return (tmp, tmp_genes) else: return False def brackets_to_list(l): tmp = [] while l: if l[0] == '(': l.pop(0) tmp.append(resolve_brackets(l)) else: tmp.append(l[0]) l.pop(0) return tmp def resolve_brackets(l): tmp = [] while l[0] != ')': if l[0] == '(': l.pop(0) tmp.append(resolve_brackets(l)) else: tmp.append(l[0]) l.pop(0) l.pop(0) return tmp def priorityAND(l): tmp = [] flag = True while l: if len(l) == 1: if isinstance(l[0], list): tmp.append(priorityAND(l[0])) else: tmp.append(l[0]) l = l[1:] elif l[0] == 'or': tmp.append(l[0]) flag = False l = l[1:] elif l[1] == 'or': if isinstance(l[0], list): tmp.append(priorityAND(l[0])) else: tmp.append(l[0]) tmp.append(l[1]) flag = False l = l[2:] elif l[1] == 'and': tmpAnd = [] if isinstance(l[0], list): tmpAnd.append(priorityAND(l[0])) else: tmpAnd.append(l[0]) tmpAnd.append(l[1]) if isinstance(l[2], list): tmpAnd.append(priorityAND(l[2])) else: tmpAnd.append(l[2]) l = l[3:] while l: if l[0] == 'and': tmpAnd.append(l[0]) if isinstance(l[1], list): tmpAnd.append(priorityAND(l[1])) else: tmpAnd.append(l[1]) l = l[2:] elif l[0] == 'or': flag = False break if flag == True: #se ci sono solo AND nella lista tmp.extend(tmpAnd) elif flag == False: tmp.append(tmpAnd) return tmp def checkRule(l): if len(l) == 1: if isinstance(l[0], list): if checkRule(l[0]) is False: return False elif len(l) > 2: if checkRule2(l) is False: return False else: return False return True def checkRule2(l): while l: if len(l) == 1: return False elif isinstance(l[0], list) and l[1] in ['and', 'or']: if checkRule(l[0]) is False: return False if isinstance(l[2], list): if checkRule(l[2]) is False: return False l = l[3:] elif l[1] in ['and', 'or']: if isinstance(l[2], list): if checkRule(l[2]) is False: return False l = l[3:] elif l[0] in ['and', 'or']: if isinstance(l[1], list): if checkRule(l[1]) is False: return False l = l[2:] else: return False return True def do_rules(rules): split_rules = [] err_rules = [] tmp_gene_in_rule = [] for i in range(len(rules)): tmp = list(rules[i]) if tmp: tmp, tmp_genes = check_and_doWord(tmp) tmp_gene_in_rule.extend(tmp_genes) if tmp is False: split_rules.append([]) err_rules.append(rules[i]) else: tmp = brackets_to_list(tmp) if checkRule(tmp): split_rules.append(priorityAND(tmp)) else: split_rules.append([]) err_rules.append(rules[i]) else: split_rules.append([]) if err_rules: warning('Warning: wrong format rule in ' + str(err_rules) + '\n') return (split_rules, list(set(tmp_gene_in_rule))) def make_recon(data): try: import cobra as cb import warnings with warnings.catch_warnings(): warnings.simplefilter('ignore') recon = cb.io.read_sbml_model(data) react = recon.reactions rules = [react[i].gene_reaction_rule for i in range(len(react))] ids = [react[i].id for i in range(len(react))] except cb.io.sbml3.CobraSBMLError: try: data = (pd.read_csv(data, sep = '\t', dtype = str)).fillna('') if len(data.columns) < 2: sys.exit('Execution aborted: wrong format of ' + 'custom GPR rules\n') if not len(data.columns) == 2: warning('WARNING: more than 2 columns in custom GPR rules.\n' + 'Extra columns have been disregarded\n') ids = list(data.iloc[:, 0]) rules = list(data.iloc[:, 1]) except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of custom GPR rules\n') except pd.errors.ParserError: sys.exit('Execution aborted: wrong format of custom GPR rules\n') split_rules, tmp_genes = do_rules(rules) gene_in_rule = {} for i in tmp_genes: gene_in_rule[i] = 'ok' return (ids, split_rules, gene_in_rule) ############################ resolve_methods ################################## def replace_gene_value(l, d): tmp = [] err = [] while l: if isinstance(l[0], list): tmp_rules, tmp_err = replace_gene_value(l[0], d) tmp.append(tmp_rules) err.extend(tmp_err) else: value = replace_gene(l[0],d) tmp.append(value) if value == None: err.append(l[0]) l = l[1:] return (tmp, err) def replace_gene(l, d): if l =='and' or l == 'or': return l else: value = d.get(l, None) if not(value == None or isinstance(value, (int, float))): sys.exit('Execution aborted: ' + value + ' value not valid\n') return value def compute(val1, op, val2, cn): if val1 != None and val2 != None: if op == 'and': return min(val1, val2) else: return val1 + val2 elif op == 'and': if cn is True: if val1 != None: return val1 elif val2 != None: return val2 else: return None else: return None else: if val1 != None: return val1 elif val2 != None: return val2 else: return None def control(ris, l, cn): if len(l) == 1: if isinstance(l[0], (float, int)) or l[0] == None: return l[0] elif isinstance(l[0], list): return control(None, l[0], cn) else: return False elif len(l) > 2: return control_list(ris, l, cn) else: return False def control_list(ris, l, cn): while l: if len(l) == 1: return False elif (isinstance(l[0], (float, int)) or l[0] == None) and l[1] in ['and', 'or']: if isinstance(l[2], (float, int)) or l[2] == None: ris = compute(l[0], l[1], l[2], cn) elif isinstance(l[2], list): tmp = control(None, l[2], cn) if tmp is False: return False else: ris = compute(l[0], l[1], tmp, cn) else: return False l = l[3:] elif l[0] in ['and', 'or']: if isinstance(l[1], (float, int)) or l[1] == None: ris = compute(ris, l[0], l[1], cn) elif isinstance(l[1], list): tmp = control(None,l[1], cn) if tmp is False: return False else: ris = compute(ris, l[0], tmp, cn) else: return False l = l[2:] elif isinstance(l[0], list) and l[1] in ['and', 'or']: if isinstance(l[2], (float, int)) or l[2] == None: tmp = control(None, l[0], cn) if tmp is False: return False else: ris = compute(tmp, l[1], l[2], cn) elif isinstance(l[2], list): tmp = control(None, l[0], cn) tmp2 = control(None, l[2], cn) if tmp is False or tmp2 is False: return False else: ris = compute(tmp, l[1], tmp2, cn) else: return False l = l[3:] else: return False return ris ############################ gene ############################################# def data_gene(gene, type_gene, name, gene_custom): args = process_args(sys.argv) for i in range(len(gene)): tmp = gene.iloc[i, 0] if tmp.startswith(' ') or tmp.endswith(' '): gene.iloc[i, 0] = (tmp.lstrip()).rstrip() gene_dup = [item for item, count in collections.Counter(gene[gene.columns[0]]).items() if count > 1] pat_dup = [item for item, count in collections.Counter(list(gene.columns)).items() if count > 1] if gene_dup: if gene_custom == None: if args.rules_selector == 'HMRcore': gene_in_rule = pk.load(open(args.tool_dir + '/local/HMRcore_genes.p', 'rb')) elif args.rules_selector == 'Recon': gene_in_rule = pk.load(open(args.tool_dir + '/local/Recon_genes.p', 'rb')) gene_in_rule = gene_in_rule.get(type_gene) else: gene_in_rule = gene_custom tmp = [] for i in gene_dup: if gene_in_rule.get(i) == 'ok': tmp.append(i) if tmp: sys.exit('Execution aborted because gene ID ' + str(tmp) + ' in ' + name + ' is duplicated\n') if pat_dup: sys.exit('Execution aborted: duplicated label\n' + str(pat_dup) + 'in ' + name + '\n') return (gene.set_index(gene.columns[0])).to_dict() ############################ resolve ########################################## def resolve(genes, rules, ids, resolve_none, name): resolve_rules = {} not_found = [] flag = False for key, value in genes.items(): tmp_resolve = [] for i in range(len(rules)): tmp = rules[i] if tmp: tmp, err = replace_gene_value(tmp, value) if err: not_found.extend(err) ris = control(None, tmp, resolve_none) if ris is False or ris == None: tmp_resolve.append(None) else: tmp_resolve.append(ris) flag = True else: tmp_resolve.append(None) resolve_rules[key] = tmp_resolve if flag is False: sys.exit('Execution aborted: no computable score' + ' (due to missing gene values) for class ' + name + ', the class has been disregarded\n') return (resolve_rules, list(set(not_found))) ################################# clustering ################################## def f_cluster(resolve_rules): os.makedirs('cluster_out') args = process_args(sys.argv) k_min = args.k_min k_max = args.k_max if k_min > k_max: warning('k range boundaries inverted.\n') tmp = k_min k_min = k_max k_max = tmp else: warning('k range correct.\n') cluster_data = pd.DataFrame.from_dict(resolve_rules, orient = 'index') for i in cluster_data.columns: tmp = cluster_data[i][0] if tmp == None: cluster_data = cluster_data.drop(columns=[i]) distorsion = [] for i in range(k_min, k_max+1): tmp_kmeans = KMeans(n_clusters = i, n_init = 100, max_iter = 300, random_state = 0).fit(cluster_data) distorsion.append(tmp_kmeans.inertia_) predict = tmp_kmeans.predict(cluster_data) predict = [x+1 for x in predict] classe = (pd.DataFrame(list(zip(cluster_data.index, predict)))).astype(str) dest = 'cluster_out/K=' + str(i) + '_' + args.name+'.tsv' classe.to_csv(dest, sep = '\t', index = False, header = ['Patient_ID', 'Class']) plt.figure(0) plt.plot(range(k_min, k_max+1), distorsion, marker = 'o') plt.xlabel('Number of cluster') plt.ylabel('Distorsion') plt.savefig(args.elbow, dpi = 240, format = 'pdf') if args.cond_hier == 'yes': import scipy.cluster.hierarchy as hier lin = hier.linkage(cluster_data, args.linkage) plt.figure(1) plt.figure(figsize=(10, 5)) hier.dendrogram(lin, leaf_font_size = 2, labels = cluster_data.index) plt.savefig(args.dendro, dpi = 480, format = 'pdf') return None ################################# main ######################################## def main(): args = process_args(sys.argv) if args.rules_selector == 'HMRcore': recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) elif args.rules_selector == 'Recon': recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) elif args.rules_selector == 'Custom': ids, rules, gene_in_rule = make_recon(args.custom) resolve_none = check_bool(args.none) dataset = read_dataset(args.data, args.name) dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) type_gene = gene_type(dataset.iloc[0, 0], args.name) if args.rules_selector != 'Custom': genes = data_gene(dataset, type_gene, args.name, None) ids, rules = load_id_rules(recon.get(type_gene)) elif args.rules_selector == 'Custom': genes = data_gene(dataset, type_gene, args.name, gene_in_rule) resolve_rules, err = resolve(genes, rules, ids, resolve_none, args.name) if err: warning('WARNING: gene\n' + str(err) + '\nnot found in class ' + args.name + ', the expression level for this gene ' + 'will be considered NaN\n') f_cluster(resolve_rules) warning('Execution succeeded') return None ############################################################################### if __name__ == "__main__": main()