Mercurial > repos > bimib > marea
view Marea/marea.py @ 28:e6831924df01 draft
small fixes (elbow plot and output managment)
author | bimib |
---|---|
date | Mon, 14 Oct 2019 05:01:08 -0400 |
parents | c71ac0bb12de |
children | 944e15aa970a |
line wrap: on
line source
from __future__ import division import sys import pandas as pd import itertools as it import scipy.stats as st import collections import lxml.etree as ET import shutil import pickle as pk import math import os import argparse from svglib.svglib import svg2rlg from reportlab.graphics import renderPDF ########################## argparse ########################################## def process_args(args): parser = argparse.ArgumentParser(usage = '%(prog)s [options]', description = 'process some value\'s'+ ' genes to create a comparison\'s map.') parser.add_argument('-rs', '--rules_selector', type = str, default = 'HMRcore', choices = ['HMRcore', 'Recon', 'Custom'], help = 'chose which type of dataset you want use') parser.add_argument('-cr', '--custom', type = str, help='your dataset if you want custom rules') parser.add_argument('-na', '--names', type = str, nargs = '+', help = 'input names') parser.add_argument('-n', '--none', type = str, default = 'true', choices = ['true', 'false'], help = 'compute Nan values') parser.add_argument('-pv' ,'--pValue', type = float, default = 0.05, help = 'P-Value threshold (default: %(default)s)') parser.add_argument('-fc', '--fChange', type = float, default = 1.5, help = 'Fold-Change threshold (default: %(default)s)') parser.add_argument('-td', '--tool_dir', type = str, required = True, help = 'your tool directory') parser.add_argument('-op', '--option', type = str, choices = ['datasets', 'dataset_class', 'datasets_rasonly'], help='dataset or dataset and class') parser.add_argument('-ol', '--out_log', help = "Output log") parser.add_argument('-ids', '--input_datas', type = str, nargs = '+', help = 'input datasets') parser.add_argument('-id', '--input_data', type = str, help = 'input dataset') parser.add_argument('-ic', '--input_class', type = str, help = 'sample group specification') parser.add_argument('-cm', '--custom_map', type = str, help = 'custom map') parser.add_argument('-yn', '--yes_no', type = str, choices = ['yes', 'no'], help = 'if make or not custom map') parser.add_argument('-gs', '--generate_svg', type = str, default = 'true', choices = ['true', 'false'], help = 'generate svg map') parser.add_argument('-gp', '--generate_pdf', type = str, default = 'true', choices = ['true', 'false'], help = 'generate pdf map') parser.add_argument('-gr', '--generate_ras', type = str, default = 'true', choices = ['true', 'false'], help = 'generate reaction activity score') parser.add_argument('-sr', '--single_ras_file', type = str, help = 'file that will contain ras') args = parser.parse_args() return args ########################### warning ########################################### def warning(s): args = process_args(sys.argv) with open(args.out_log, 'a') as log: log.write(s) ############################ dataset input #################################### def read_dataset(data, name): try: dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python') except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of ' + name + '\n') if len(dataset.columns) < 2: sys.exit('Execution aborted: wrong format of ' + name + '\n') return dataset ############################ dataset name ##################################### def name_dataset(name_data, count): if str(name_data) == 'Dataset': return str(name_data) + '_' + str(count) else: return str(name_data) ############################ load id e rules ################################## def load_id_rules(reactions): ids, rules = [], [] for key, value in reactions.items(): ids.append(key) rules.append(value) return (ids, rules) ############################ check_methods #################################### def gene_type(l, name): if check_hgnc(l): return 'hugo_id' elif check_ensembl(l): return 'ensembl_gene_id' elif check_symbol(l): return 'symbol' elif check_entrez(l): return 'entrez_id' else: sys.exit('Execution aborted:\n' + 'gene ID type in ' + name + ' not supported. Supported ID'+ 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n') def check_hgnc(l): if len(l) > 5: if (l.upper()).startswith('HGNC:'): return l[5:].isdigit() else: return False else: return False def check_ensembl(l): if len(l) == 15: if (l.upper()).startswith('ENS'): return l[4:].isdigit() else: return False else: return False def check_symbol(l): if len(l) > 0: if l[0].isalpha() and l[1:].isalnum(): return True else: return False else: return False def check_entrez(l): if len(l) > 0: return l.isdigit() else: return False def check_bool(b): if b == 'true': return True elif b == 'false': return False ############################ resolve_methods ################################## def replace_gene_value(l, d): tmp = [] err = [] while l: if isinstance(l[0], list): tmp_rules, tmp_err = replace_gene_value(l[0], d) tmp.append(tmp_rules) err.extend(tmp_err) else: value = replace_gene(l[0], d) tmp.append(value) if value == None: err.append(l[0]) l = l[1:] return (tmp, err) def replace_gene(l, d): if l =='and' or l == 'or': return l else: value = d.get(l, None) if not(value == None or isinstance(value, (int, float))): sys.exit('Execution aborted: ' + value + ' value not valid\n') return value def computes(val1, op, val2, cn): if val1 != None and val2 != None: if op == 'and': return min(val1, val2) else: return val1 + val2 elif op == 'and': if cn is True: if val1 != None: return val1 elif val2 != None: return val2 else: return None else: return None else: if val1 != None: return val1 elif val2 != None: return val2 else: return None def control(ris, l, cn): if len(l) == 1: if isinstance(l[0], (float, int)) or l[0] == None: return l[0] elif isinstance(l[0], list): return control(None, l[0], cn) else: return False elif len(l) > 2: return control_list(ris, l, cn) else: return False def control_list(ris, l, cn): while l: if len(l) == 1: return False elif (isinstance(l[0], (float, int)) or l[0] == None) and l[1] in ['and', 'or']: if isinstance(l[2], (float, int)) or l[2] == None: ris = computes(l[0], l[1], l[2], cn) elif isinstance(l[2], list): tmp = control(None, l[2], cn) if tmp is False: return False else: ris = computes(l[0], l[1], tmp, cn) else: return False l = l[3:] elif l[0] in ['and', 'or']: if isinstance(l[1], (float, int)) or l[1] == None: ris = computes(ris, l[0], l[1], cn) elif isinstance(l[1], list): tmp = control(None,l[1], cn) if tmp is False: return False else: ris = computes(ris, l[0], tmp, cn) else: return False l = l[2:] elif isinstance(l[0], list) and l[1] in ['and', 'or']: if isinstance(l[2], (float, int)) or l[2] == None: tmp = control(None, l[0], cn) if tmp is False: return False else: ris = computes(tmp, l[1], l[2], cn) elif isinstance(l[2], list): tmp = control(None, l[0], cn) tmp2 = control(None, l[2], cn) if tmp is False or tmp2 is False: return False else: ris = computes(tmp, l[1], tmp2, cn) else: return False l = l[3:] else: return False return ris ############################ map_methods ###################################### def fold_change(avg1, avg2): if avg1 == 0 and avg2 == 0: return 0 elif avg1 == 0: return '-INF' elif avg2 == 0: return 'INF' else: return math.log(avg1 / avg2, 2) def fix_style(l, col, width, dash): tmp = l.split(';') flag_col = False flag_width = False flag_dash = False for i in range(len(tmp)): if tmp[i].startswith('stroke:'): tmp[i] = 'stroke:' + col flag_col = True if tmp[i].startswith('stroke-width:'): tmp[i] = 'stroke-width:' + width flag_width = True if tmp[i].startswith('stroke-dasharray:'): tmp[i] = 'stroke-dasharray:' + dash flag_dash = True if not flag_col: tmp.append('stroke:' + col) if not flag_width: tmp.append('stroke-width:' + width) if not flag_dash: tmp.append('stroke-dasharray:' + dash) return ';'.join(tmp) def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C): maxT = 12 minT = 2 grey = '#BEBEBE' blue = '#0000FF' red = '#E41A1C' for el in core_map.iter(): el_id = str(el.get('id')) if el_id.startswith('R_'): tmp = d.get(el_id[2:]) if tmp != None: p_val = tmp[0] f_c = tmp[1] if p_val < threshold_P_V: if not isinstance(f_c, str): if abs(f_c) < math.log(threshold_F_C, 2): col = grey width = str(minT) else: if f_c < 0: col = blue elif f_c > 0: col = red width = str(max((abs(f_c) * maxT) / max_F_C, minT)) else: if f_c == '-INF': col = blue elif f_c == 'INF': col = red width = str(maxT) dash = 'none' else: dash = '5,5' col = grey width = str(minT) el.set('style', fix_style(el.get('style'), col, width, dash)) return core_map ############################ make recon ####################################### def check_and_doWord(l): tmp = [] tmp_genes = [] count = 0 while l: if count >= 0: if l[0] == '(': count += 1 tmp.append(l[0]) l.pop(0) elif l[0] == ')': count -= 1 tmp.append(l[0]) l.pop(0) elif l[0] == ' ': l.pop(0) else: word = [] while l: if l[0] in [' ', '(', ')']: break else: word.append(l[0]) l.pop(0) word = ''.join(word) tmp.append(word) if not(word in ['or', 'and']): tmp_genes.append(word) else: return False if count == 0: return (tmp, tmp_genes) else: return False def brackets_to_list(l): tmp = [] while l: if l[0] == '(': l.pop(0) tmp.append(resolve_brackets(l)) else: tmp.append(l[0]) l.pop(0) return tmp def resolve_brackets(l): tmp = [] while l[0] != ')': if l[0] == '(': l.pop(0) tmp.append(resolve_brackets(l)) else: tmp.append(l[0]) l.pop(0) l.pop(0) return tmp def priorityAND(l): tmp = [] flag = True while l: if len(l) == 1: if isinstance(l[0], list): tmp.append(priorityAND(l[0])) else: tmp.append(l[0]) l = l[1:] elif l[0] == 'or': tmp.append(l[0]) flag = False l = l[1:] elif l[1] == 'or': if isinstance(l[0], list): tmp.append(priorityAND(l[0])) else: tmp.append(l[0]) tmp.append(l[1]) flag = False l = l[2:] elif l[1] == 'and': tmpAnd = [] if isinstance(l[0], list): tmpAnd.append(priorityAND(l[0])) else: tmpAnd.append(l[0]) tmpAnd.append(l[1]) if isinstance(l[2], list): tmpAnd.append(priorityAND(l[2])) else: tmpAnd.append(l[2]) l = l[3:] while l: if l[0] == 'and': tmpAnd.append(l[0]) if isinstance(l[1], list): tmpAnd.append(priorityAND(l[1])) else: tmpAnd.append(l[1]) l = l[2:] elif l[0] == 'or': flag = False break if flag == True: #when there are only AND in list tmp.extend(tmpAnd) elif flag == False: tmp.append(tmpAnd) return tmp def checkRule(l): if len(l) == 1: if isinstance(l[0], list): if checkRule(l[0]) is False: return False elif len(l) > 2: if checkRule2(l) is False: return False else: return False return True def checkRule2(l): while l: if len(l) == 1: return False elif isinstance(l[0], list) and l[1] in ['and', 'or']: if checkRule(l[0]) is False: return False if isinstance(l[2], list): if checkRule(l[2]) is False: return False l = l[3:] elif l[1] in ['and', 'or']: if isinstance(l[2], list): if checkRule(l[2]) is False: return False l = l[3:] elif l[0] in ['and', 'or']: if isinstance(l[1], list): if checkRule(l[1]) is False: return False l = l[2:] else: return False return True def do_rules(rules): split_rules = [] err_rules = [] tmp_gene_in_rule = [] for i in range(len(rules)): tmp = list(rules[i]) if tmp: tmp, tmp_genes = check_and_doWord(tmp) tmp_gene_in_rule.extend(tmp_genes) if tmp is False: split_rules.append([]) err_rules.append(rules[i]) else: tmp = brackets_to_list(tmp) if checkRule(tmp): split_rules.append(priorityAND(tmp)) else: split_rules.append([]) err_rules.append(rules[i]) else: split_rules.append([]) if err_rules: warning('Warning: wrong format rule in ' + str(err_rules) + '\n') return (split_rules, list(set(tmp_gene_in_rule))) def make_recon(data): try: import cobra as cb import warnings with warnings.catch_warnings(): warnings.simplefilter('ignore') recon = cb.io.read_sbml_model(data) react = recon.reactions rules = [react[i].gene_reaction_rule for i in range(len(react))] ids = [react[i].id for i in range(len(react))] except cb.io.sbml3.CobraSBMLError: try: data = (pd.read_csv(data, sep = '\t', dtype = str, engine='python')).fillna('') if len(data.columns) < 2: sys.exit('Execution aborted: wrong format of '+ 'custom datarules\n') if not len(data.columns) == 2: warning('Warning: more than 2 columns in custom datarules.\n' + 'Extra columns have been disregarded\n') ids = list(data.iloc[:, 0]) rules = list(data.iloc[:, 1]) except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of custom datarules\n') except pd.errors.ParserError: sys.exit('Execution aborted: wrong format of custom datarules\n') split_rules, tmp_genes = do_rules(rules) gene_in_rule = {} for i in tmp_genes: gene_in_rule[i] = 'ok' return (ids, split_rules, gene_in_rule) ############################ gene ############################################# def data_gene(gene, type_gene, name, gene_custom): args = process_args(sys.argv) for i in range(len(gene)): tmp = gene.iloc[i, 0] if tmp.startswith(' ') or tmp.endswith(' '): gene.iloc[i, 0] = (tmp.lstrip()).rstrip() gene_dup = [item for item, count in collections.Counter(gene[gene.columns[0]]).items() if count > 1] pat_dup = [item for item, count in collections.Counter(list(gene.columns)).items() if count > 1] if gene_dup: if gene_custom == None: if args.rules_selector == 'HMRcore': gene_in_rule = pk.load(open(args.tool_dir + '/local/HMRcore_genes.p', 'rb')) elif args.rules_selector == 'Recon': gene_in_rule = pk.load(open(args.tool_dir + '/local/Recon_genes.p', 'rb')) gene_in_rule = gene_in_rule.get(type_gene) else: gene_in_rule = gene_custom tmp = [] for i in gene_dup: if gene_in_rule.get(i) == 'ok': tmp.append(i) if tmp: sys.exit('Execution aborted because gene ID ' +str(tmp)+' in '+name+' is duplicated\n') if pat_dup: warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name + '\n') return (gene.set_index(gene.columns[0])).to_dict() ############################ resolve ########################################## def resolve(genes, rules, ids, resolve_none, name): resolve_rules = {} not_found = [] flag = False for key, value in genes.items(): tmp_resolve = [] for i in range(len(rules)): tmp = rules[i] if tmp: tmp, err = replace_gene_value(tmp, value) if err: not_found.extend(err) ris = control(None, tmp, resolve_none) if ris is False or ris == None: tmp_resolve.append(None) else: tmp_resolve.append(ris) flag = True else: tmp_resolve.append(None) resolve_rules[key] = tmp_resolve if flag is False: warning('Warning: no computable score (due to missing gene values)' + 'for class ' + name + ', the class has been disregarded\n') return (None, None) return (resolve_rules, list(set(not_found))) ############################ split class ###################################### def split_class(classes, resolve_rules): class_pat = {} for i in range(len(classes)): classe = classes.iloc[i, 1] if not pd.isnull(classe): l = [] for j in range(i, len(classes)): if classes.iloc[j, 1] == classe: pat_id = classes.iloc[j, 0] tmp = resolve_rules.get(pat_id, None) if tmp != None: l.append(tmp) classes.iloc[j, 1] = None if l: class_pat[classe] = list(map(list, zip(*l))) else: warning('Warning: no sample found in class ' + classe + ', the class has been disregarded\n') return class_pat ############################ create_ras ####################################### def create_ras (resolve_rules, dataset_name, single_ras): if resolve_rules == None: warning("Couldn't generate RAS for current dataset: " + dataset_name) for geni in resolve_rules.values(): for i, valori in enumerate(geni): if valori == None: geni[i] = 'None' output_ras = pd.DataFrame.from_dict(resolve_rules) output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) if (single_ras): args = process_args(sys.argv) text_file = open(args.single_ras_file, "w") else: text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") text_file.write(output_to_csv) text_file.close() ############################ map ############################################## def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf): args = process_args(sys.argv) if (not class_pat) or (len(class_pat.keys()) < 2): sys.exit('Execution aborted: classes provided for comparisons are ' + 'less than two\n') for i, j in it.combinations(class_pat.keys(), 2): tmp = {} count = 0 max_F_C = 0 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): try: stat_D, p_value = st.ks_2samp(l1, l2) avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) if not isinstance(avg, str): if max_F_C < abs(avg): max_F_C = abs(avg) tmp[ids[count]] = [float(p_value), avg] count += 1 except (TypeError, ZeroDivisionError): count += 1 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") tmp_csv = tmp_csv.reset_index() header = ['ids', 'P_Value', 'Average'] tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) if create_svg or create_pdf: if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' and args.yes_no == 'yes'): fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg' with open(file_svg, 'wb') as new_map: new_map.write(ET.tostring(core_map)) if create_pdf: file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf' renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) if not create_svg: #Ho utilizzato il file svg per generare il pdf, #ma l'utente non ne ha richiesto il ritorno, quindi #lo elimino os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg') return None ############################ MAIN ############################################# def main(): args = process_args(sys.argv) create_svg = check_bool(args.generate_svg) create_pdf = check_bool(args.generate_pdf) generate_ras = check_bool(args.generate_ras) os.makedirs('result') if generate_ras: os.makedirs('ras') if args.rules_selector == 'HMRcore': recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) elif args.rules_selector == 'Recon': recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) elif args.rules_selector == 'Custom': ids, rules, gene_in_rule = make_recon(args.custom) resolve_none = check_bool(args.none) class_pat = {} if args.option == 'datasets_rasonly': name = "RAS Dataset" dataset = read_dataset(args.input_datas[0],"dataset") dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) type_gene = gene_type(dataset.iloc[0, 0], name) if args.rules_selector != 'Custom': genes = data_gene(dataset, type_gene, name, None) ids, rules = load_id_rules(recon.get(type_gene)) elif args.rules_selector == 'Custom': genes = data_gene(dataset, type_gene, name, gene_in_rule) resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) create_ras(resolve_rules, name, True) if err != None and err: warning('Warning: gene\n' + str(err) + '\nnot found in class ' + name + ', the expression level for this gene ' + 'will be considered NaN\n') print('execution succeded') return None elif args.option == 'datasets': num = 1 for i, j in zip(args.input_datas, args.names): name = name_dataset(j, num) dataset = read_dataset(i, name) dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) type_gene = gene_type(dataset.iloc[0, 0], name) if args.rules_selector != 'Custom': genes = data_gene(dataset, type_gene, name, None) ids, rules = load_id_rules(recon.get(type_gene)) elif args.rules_selector == 'Custom': genes = data_gene(dataset, type_gene, name, gene_in_rule) resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) if generate_ras: create_ras(resolve_rules, name, False) if err != None and err: warning('Warning: gene\n' + str(err) + '\nnot found in class ' + name + ', the expression level for this gene ' + 'will be considered NaN\n') if resolve_rules != None: class_pat[name] = list(map(list, zip(*resolve_rules.values()))) num += 1 elif args.option == 'dataset_class': name = 'RNAseq' dataset = read_dataset(args.input_data, name) dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) type_gene = gene_type(dataset.iloc[0, 0], name) classes = read_dataset(args.input_class, 'class') if not len(classes.columns) == 2: warning('Warning: more than 2 columns in class file. Extra' + 'columns have been disregarded\n') classes = classes.astype(str) if args.rules_selector != 'Custom': genes = data_gene(dataset, type_gene, name, None) ids, rules = load_id_rules(recon.get(type_gene)) elif args.rules_selector == 'Custom': genes = data_gene(dataset, type_gene, name, gene_in_rule) resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) if err != None and err: warning('Warning: gene\n'+str(err)+'\nnot found in class ' + name + ', the expression level for this gene ' + 'will be considered NaN\n') if resolve_rules != None: class_pat = split_class(classes, resolve_rules) if args.rules_selector == 'Custom': if args.yes_no == 'yes': try: core_map = ET.parse(args.custom_map) except (ET.XMLSyntaxError, ET.XMLSchemaParseError): sys.exit('Execution aborted: custom map in wrong format') elif args.yes_no == 'no': core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg') else: core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg') maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf) print('Execution succeded') return None ############################################################################### if __name__ == "__main__": main()