Mercurial > repos > bimib > marea
comparison Marea/ras_generator.py @ 47:3af9d394367c draft
Uploaded
| author | bimib | 
|---|---|
| date | Wed, 19 Feb 2020 10:44:52 -0500 | 
| parents | 5d5d01ef1d68 | 
| children | 3b0e71e28c0b | 
   comparison
  equal
  deleted
  inserted
  replaced
| 46:5d5d01ef1d68 | 47:3af9d394367c | 
|---|---|
| 1 from __future__ import division | 1 from __future__ import division | 
| 2 import sys | 2 import sys | 
| 3 import pandas as pd | 3 import pandas as pd | 
| 4 import itertools as it | |
| 5 import scipy.stats as st | |
| 6 import collections | 4 import collections | 
| 7 import lxml.etree as ET | |
| 8 import pickle as pk | 5 import pickle as pk | 
| 9 import math | 6 import math | 
| 10 import os | |
| 11 import argparse | 7 import argparse | 
| 12 from svglib.svglib import svg2rlg | |
| 13 from reportlab.graphics import renderPDF | |
| 14 | 8 | 
| 15 ########################## argparse ########################################## | 9 ########################## argparse ########################################## | 
| 16 | 10 | 
| 17 def process_args(args): | 11 def process_args(args): | 
| 18 parser = argparse.ArgumentParser(usage = '%(prog)s [options]', | 12 parser = argparse.ArgumentParser(usage = '%(prog)s [options]', | 
| 24 choices = ['HMRcore', 'Recon', 'Custom'], | 18 choices = ['HMRcore', 'Recon', 'Custom'], | 
| 25 help = 'chose which type of dataset you want use') | 19 help = 'chose which type of dataset you want use') | 
| 26 parser.add_argument('-cr', '--custom', | 20 parser.add_argument('-cr', '--custom', | 
| 27 type = str, | 21 type = str, | 
| 28 help='your dataset if you want custom rules') | 22 help='your dataset if you want custom rules') | 
| 29 parser.add_argument('-na', '--names', | |
| 30 type = str, | |
| 31 nargs = '+', | |
| 32 help = 'input names') | |
| 33 parser.add_argument('-n', '--none', | 23 parser.add_argument('-n', '--none', | 
| 34 type = str, | 24 type = str, | 
| 35 default = 'true', | 25 default = 'true', | 
| 36 choices = ['true', 'false'], | 26 choices = ['true', 'false'], | 
| 37 help = 'compute Nan values') | 27 help = 'compute Nan values') | 
| 38 parser.add_argument('-pv' ,'--pValue', | |
| 39 type = float, | |
| 40 default = 0.05, | |
| 41 help = 'P-Value threshold (default: %(default)s)') | |
| 42 parser.add_argument('-fc', '--fChange', | |
| 43 type = float, | |
| 44 default = 1.5, | |
| 45 help = 'Fold-Change threshold (default: %(default)s)') | |
| 46 parser.add_argument('-td', '--tool_dir', | 28 parser.add_argument('-td', '--tool_dir', | 
| 47 type = str, | 29 type = str, | 
| 48 required = True, | 30 required = True, | 
| 49 help = 'your tool directory') | 31 help = 'your tool directory') | 
| 50 parser.add_argument('-op', '--option', | |
| 51 type = str, | |
| 52 choices = ['datasets', 'dataset_class', 'datasets_rasonly'], | |
| 53 help='dataset or dataset and class') | |
| 54 parser.add_argument('-ol', '--out_log', | 32 parser.add_argument('-ol', '--out_log', | 
| 55 help = "Output log") | 33 help = "Output log") | 
| 56 parser.add_argument('-ids', '--input_datas', | 34 parser.add_argument('-id', '--input', | 
| 57 type = str, | |
| 58 nargs = '+', | |
| 59 help = 'input datasets') | |
| 60 parser.add_argument('-id', '--input_data', | |
| 61 type = str, | 35 type = str, | 
| 62 help = 'input dataset') | 36 help = 'input dataset') | 
| 63 parser.add_argument('-ic', '--input_class', | 37 parser.add_argument('-ra', '--ras_output', | 
| 64 type = str, | |
| 65 help = 'sample group specification') | |
| 66 parser.add_argument('-cm', '--custom_map', | |
| 67 type = str, | |
| 68 help = 'custom map') | |
| 69 parser.add_argument('-yn', '--yes_no', | |
| 70 type = str, | 38 type = str, | 
| 71 choices = ['yes', 'no'], | 39 required = True, | 
| 72 help = 'if make or not custom map') | 40 help = 'ras output') | 
| 73 parser.add_argument('-gs', '--generate_svg', | 41 | 
| 74 type = str, | |
| 75 default = 'true', | |
| 76 choices = ['true', 'false'], | |
| 77 help = 'generate svg map') | |
| 78 parser.add_argument('-gp', '--generate_pdf', | |
| 79 type = str, | |
| 80 default = 'true', | |
| 81 choices = ['true', 'false'], | |
| 82 help = 'generate pdf map') | |
| 83 parser.add_argument('-gr', '--generate_ras', | |
| 84 type = str, | |
| 85 default = 'true', | |
| 86 choices = ['true', 'false'], | |
| 87 help = 'generate reaction activity score') | |
| 88 parser.add_argument('-sr', '--single_ras_file', | |
| 89 type = str, | |
| 90 help = 'file that will contain ras') | |
| 91 | |
| 92 args = parser.parse_args() | 42 args = parser.parse_args() | 
| 93 return args | 43 return args | 
| 94 | 44 | 
| 95 ########################### warning ########################################### | 45 ########################### warning ########################################### | 
| 96 | 46 | 
| 294 return False | 244 return False | 
| 295 l = l[3:] | 245 l = l[3:] | 
| 296 else: | 246 else: | 
| 297 return False | 247 return False | 
| 298 return ris | 248 return ris | 
| 299 | |
| 300 ############################ map_methods ###################################### | |
| 301 | |
| 302 def fold_change(avg1, avg2): | |
| 303 if avg1 == 0 and avg2 == 0: | |
| 304 return 0 | |
| 305 elif avg1 == 0: | |
| 306 return '-INF' | |
| 307 elif avg2 == 0: | |
| 308 return 'INF' | |
| 309 else: | |
| 310 return math.log(avg1 / avg2, 2) | |
| 311 | |
| 312 def fix_style(l, col, width, dash): | |
| 313 tmp = l.split(';') | |
| 314 flag_col = False | |
| 315 flag_width = False | |
| 316 flag_dash = False | |
| 317 for i in range(len(tmp)): | |
| 318 if tmp[i].startswith('stroke:'): | |
| 319 tmp[i] = 'stroke:' + col | |
| 320 flag_col = True | |
| 321 if tmp[i].startswith('stroke-width:'): | |
| 322 tmp[i] = 'stroke-width:' + width | |
| 323 flag_width = True | |
| 324 if tmp[i].startswith('stroke-dasharray:'): | |
| 325 tmp[i] = 'stroke-dasharray:' + dash | |
| 326 flag_dash = True | |
| 327 if not flag_col: | |
| 328 tmp.append('stroke:' + col) | |
| 329 if not flag_width: | |
| 330 tmp.append('stroke-width:' + width) | |
| 331 if not flag_dash: | |
| 332 tmp.append('stroke-dasharray:' + dash) | |
| 333 return ';'.join(tmp) | |
| 334 | |
| 335 def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C): | |
| 336 maxT = 12 | |
| 337 minT = 2 | |
| 338 grey = '#BEBEBE' | |
| 339 blue = '#0000FF' | |
| 340 red = '#E41A1C' | |
| 341 for el in core_map.iter(): | |
| 342 el_id = str(el.get('id')) | |
| 343 if el_id.startswith('R_'): | |
| 344 tmp = d.get(el_id[2:]) | |
| 345 if tmp != None: | |
| 346 p_val = tmp[0] | |
| 347 f_c = tmp[1] | |
| 348 if p_val < threshold_P_V: | |
| 349 if not isinstance(f_c, str): | |
| 350 if abs(f_c) < math.log(threshold_F_C, 2): | |
| 351 col = grey | |
| 352 width = str(minT) | |
| 353 else: | |
| 354 if f_c < 0: | |
| 355 col = blue | |
| 356 elif f_c > 0: | |
| 357 col = red | |
| 358 width = str(max((abs(f_c) * maxT) / max_F_C, minT)) | |
| 359 else: | |
| 360 if f_c == '-INF': | |
| 361 col = blue | |
| 362 elif f_c == 'INF': | |
| 363 col = red | |
| 364 width = str(maxT) | |
| 365 dash = 'none' | |
| 366 else: | |
| 367 dash = '5,5' | |
| 368 col = grey | |
| 369 width = str(minT) | |
| 370 el.set('style', fix_style(el.get('style'), col, width, dash)) | |
| 371 return core_map | |
| 372 | 249 | 
| 373 ############################ make recon ####################################### | 250 ############################ make recon ####################################### | 
| 374 | 251 | 
| 375 def check_and_doWord(l): | 252 def check_and_doWord(l): | 
| 376 tmp = [] | 253 tmp = [] | 
| 613 | 490 | 
| 614 ############################ resolve ########################################## | 491 ############################ resolve ########################################## | 
| 615 | 492 | 
| 616 def resolve(genes, rules, ids, resolve_none, name): | 493 def resolve(genes, rules, ids, resolve_none, name): | 
| 617 resolve_rules = {} | 494 resolve_rules = {} | 
| 618 names_array = [] | |
| 619 not_found = [] | 495 not_found = [] | 
| 620 flag = False | 496 flag = False | 
| 621 for key, value in genes.items(): | 497 for key, value in genes.items(): | 
| 622 tmp_resolve = [] | 498 tmp_resolve = [] | 
| 623 for i in range(len(rules)): | 499 for i in range(len(rules)): | 
| 650 if not pd.isnull(classe): | 526 if not pd.isnull(classe): | 
| 651 l = [] | 527 l = [] | 
| 652 for j in range(i, len(classes)): | 528 for j in range(i, len(classes)): | 
| 653 if classes.iloc[j, 1] == classe: | 529 if classes.iloc[j, 1] == classe: | 
| 654 pat_id = classes.iloc[j, 0] | 530 pat_id = classes.iloc[j, 0] | 
| 655 tmp = resolve_rules.get(pat_id, None) | |
| 656 if tmp != None: | 531 if tmp != None: | 
| 657 l.append(tmp) | 532 l.append(tmp) | 
| 658 classes.iloc[j, 1] = None | 533 classes.iloc[j, 1] = None | 
| 659 if l: | 534 if l: | 
| 660 class_pat[classe] = list(map(list, zip(*l))) | 535 class_pat[classe] = list(map(list, zip(*l))) | 
| 663 ', the class has been disregarded\n') | 538 ', the class has been disregarded\n') | 
| 664 return class_pat | 539 return class_pat | 
| 665 | 540 | 
| 666 ############################ create_ras ####################################### | 541 ############################ create_ras ####################################### | 
| 667 | 542 | 
| 668 def create_ras (resolve_rules, dataset_name, single_ras, rules, ids): | 543 def create_ras (resolve_rules, dataset_name, rules, ids, file): | 
| 669 | 544 | 
| 670 if resolve_rules == None: | 545 if resolve_rules == None: | 
| 671 warning("Couldn't generate RAS for current dataset: " + dataset_name) | 546 warning("Couldn't generate RAS for current dataset: " + dataset_name) | 
| 672 | 547 | 
| 673 for geni in resolve_rules.values(): | 548 for geni in resolve_rules.values(): | 
| 678 output_ras = pd.DataFrame.from_dict(resolve_rules) | 553 output_ras = pd.DataFrame.from_dict(resolve_rules) | 
| 679 | 554 | 
| 680 output_ras.insert(0, 'Reactions', ids) | 555 output_ras.insert(0, 'Reactions', ids) | 
| 681 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) | 556 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) | 
| 682 | 557 | 
| 683 if (single_ras): | 558 text_file = open(file, "w") | 
| 684 args = process_args(sys.argv) | |
| 685 text_file = open(args.single_ras_file, "w") | |
| 686 else: | |
| 687 text_file = open("ras/Reaction_Activity_Score_Of_" + dataset_name + ".tsv", "w") | |
| 688 | 559 | 
| 689 text_file.write(output_to_csv) | 560 text_file.write(output_to_csv) | 
| 690 text_file.close() | 561 text_file.close() | 
| 691 | 562 | 
| 692 ############################ map ############################################## | |
| 693 | |
| 694 def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf): | |
| 695 args = process_args(sys.argv) | |
| 696 if (not class_pat) or (len(class_pat.keys()) < 2): | |
| 697 sys.exit('Execution aborted: classes provided for comparisons are ' + | |
| 698 'less than two\n') | |
| 699 for i, j in it.combinations(class_pat.keys(), 2): | |
| 700 tmp = {} | |
| 701 count = 0 | |
| 702 max_F_C = 0 | |
| 703 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)): | |
| 704 try: | |
| 705 stat_D, p_value = st.ks_2samp(l1, l2) | |
| 706 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2)) | |
| 707 if not isinstance(avg, str): | |
| 708 if max_F_C < abs(avg): | |
| 709 max_F_C = abs(avg) | |
| 710 tmp[ids[count]] = [float(p_value), avg] | |
| 711 count += 1 | |
| 712 except (TypeError, ZeroDivisionError): | |
| 713 count += 1 | |
| 714 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv' | |
| 715 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index") | |
| 716 tmp_csv = tmp_csv.reset_index() | |
| 717 header = ['ids', 'P_Value', 'Log2(fold change)'] | |
| 718 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header) | |
| 719 | |
| 720 if create_svg or create_pdf: | |
| 721 if args.rules_selector == 'HMRcore' or (args.rules_selector == 'Custom' | |
| 722 and args.yes_no == 'yes'): | |
| 723 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C) | |
| 724 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg' | |
| 725 with open(file_svg, 'wb') as new_map: | |
| 726 new_map.write(ET.tostring(core_map)) | |
| 727 | |
| 728 | |
| 729 if create_pdf: | |
| 730 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf' | |
| 731 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf) | |
| 732 | |
| 733 if not create_svg: | |
| 734 #Ho utilizzato il file svg per generare il pdf, | |
| 735 #ma l'utente non ne ha richiesto il ritorno, quindi | |
| 736 #lo elimino | |
| 737 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg') | |
| 738 | |
| 739 return None | |
| 740 | |
| 741 ############################ MAIN ############################################# | 563 ############################ MAIN ############################################# | 
| 742 | 564 | 
| 743 def main(): | 565 def main(): | 
| 744 args = process_args(sys.argv) | 566 args = process_args(sys.argv) | 
| 745 | 567 | 
| 746 create_svg = check_bool(args.generate_svg) | |
| 747 create_pdf = check_bool(args.generate_pdf) | |
| 748 generate_ras = check_bool(args.generate_ras) | |
| 749 | |
| 750 os.makedirs('result') | |
| 751 | |
| 752 if generate_ras: | |
| 753 os.makedirs('ras') | |
| 754 | |
| 755 if args.rules_selector == 'HMRcore': | 568 if args.rules_selector == 'HMRcore': | 
| 756 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) | 569 recon = pk.load(open(args.tool_dir + '/local/HMRcore_rules.p', 'rb')) | 
| 757 elif args.rules_selector == 'Recon': | 570 elif args.rules_selector == 'Recon': | 
| 758 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) | 571 recon = pk.load(open(args.tool_dir + '/local/Recon_rules.p', 'rb')) | 
| 759 elif args.rules_selector == 'Custom': | 572 elif args.rules_selector == 'Custom': | 
| 760 ids, rules, gene_in_rule = make_recon(args.custom) | 573 ids, rules, gene_in_rule = make_recon(args.custom) | 
| 761 | 574 | 
| 762 resolve_none = check_bool(args.none) | 575 resolve_none = check_bool(args.none) | 
| 763 | 576 | 
| 764 class_pat = {} | 577 | 
| 765 | 578 name = "RAS Dataset" | 
| 766 if args.option == 'datasets_rasonly': | 579 dataset = read_dataset(args.input, "dataset") | 
| 767 name = "RAS Dataset" | 580 | 
| 768 dataset = read_dataset(args.input_datas[0],"dataset") | 581 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | 
| 769 | 582 | 
| 770 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | 583 type_gene = gene_type(dataset.iloc[0, 0], name) | 
| 771 | |
| 772 type_gene = gene_type(dataset.iloc[0, 0], name) | |
| 773 | |
| 774 if args.rules_selector != 'Custom': | |
| 775 genes = data_gene(dataset, type_gene, name, None) | |
| 776 ids, rules = load_id_rules(recon.get(type_gene)) | |
| 777 elif args.rules_selector == 'Custom': | |
| 778 genes = data_gene(dataset, type_gene, name, gene_in_rule) | |
| 779 | 584 | 
| 780 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) | 585 if args.rules_selector != 'Custom': | 
| 781 | 586 genes = data_gene(dataset, type_gene, name, None) | 
| 782 create_ras(resolve_rules, name, True, rules, ids) | 587 ids, rules = load_id_rules(recon.get(type_gene)) | 
| 783 | 588 elif args.rules_selector == 'Custom': | 
| 784 if err != None and err: | 589 genes = data_gene(dataset, type_gene, name, gene_in_rule) | 
| 785 warning('Warning: gene\n' + str(err) + '\nnot found in class ' | 590 | 
| 786 + name + ', the expression level for this gene ' + | 591 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) | 
| 787 'will be considered NaN\n') | 592 | 
| 788 | 593 create_ras(resolve_rules, name, rules, ids, args.ras_output) | 
| 789 print('execution succeded') | 594 | 
| 790 return None | 595 if err != None and err: | 
| 791 | 596 warning('Warning: gene\n' + str(err) + '\nnot found in class ' | 
| 792 | 597 + name + ', the expression level for this gene ' + | 
| 793 elif args.option == 'datasets': | 598 'will be considered NaN\n') | 
| 794 num = 1 | 599 | 
| 795 for i, j in zip(args.input_datas, args.names): | 600 | 
| 796 | |
| 797 name = name_dataset(j, num) | |
| 798 dataset = read_dataset(i, name) | |
| 799 | |
| 800 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | |
| 801 | |
| 802 type_gene = gene_type(dataset.iloc[0, 0], name) | |
| 803 | |
| 804 if args.rules_selector != 'Custom': | |
| 805 genes = data_gene(dataset, type_gene, name, None) | |
| 806 ids, rules = load_id_rules(recon.get(type_gene)) | |
| 807 elif args.rules_selector == 'Custom': | |
| 808 genes = data_gene(dataset, type_gene, name, gene_in_rule) | |
| 809 | |
| 810 | |
| 811 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) | |
| 812 | |
| 813 if generate_ras: | |
| 814 create_ras(resolve_rules, name, False, rules, ids) | |
| 815 | |
| 816 if err != None and err: | |
| 817 warning('Warning: gene\n' + str(err) + '\nnot found in class ' | |
| 818 + name + ', the expression level for this gene ' + | |
| 819 'will be considered NaN\n') | |
| 820 if resolve_rules != None: | |
| 821 class_pat[name] = list(map(list, zip(*resolve_rules.values()))) | |
| 822 num += 1 | |
| 823 elif args.option == 'dataset_class': | |
| 824 name = 'RNAseq' | |
| 825 dataset = read_dataset(args.input_data, name) | |
| 826 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | |
| 827 type_gene = gene_type(dataset.iloc[0, 0], name) | |
| 828 classes = read_dataset(args.input_class, 'class') | |
| 829 if not len(classes.columns) == 2: | |
| 830 warning('Warning: more than 2 columns in class file. Extra' + | |
| 831 'columns have been disregarded\n') | |
| 832 classes = classes.astype(str) | |
| 833 if args.rules_selector != 'Custom': | |
| 834 genes = data_gene(dataset, type_gene, name, None) | |
| 835 ids, rules = load_id_rules(recon.get(type_gene)) | |
| 836 elif args.rules_selector == 'Custom': | |
| 837 genes = data_gene(dataset, type_gene, name, gene_in_rule) | |
| 838 resolve_rules, err = resolve(genes, rules, ids, resolve_none, name) | |
| 839 if err != None and err: | |
| 840 warning('Warning: gene\n'+str(err)+'\nnot found in class ' | |
| 841 + name + ', the expression level for this gene ' + | |
| 842 'will be considered NaN\n') | |
| 843 if resolve_rules != None: | |
| 844 class_pat = split_class(classes, resolve_rules) | |
| 845 if generate_ras: | |
| 846 create_ras(resolve_rules, name, False, rules, ids) | |
| 847 | |
| 848 | |
| 849 if args.rules_selector == 'Custom': | |
| 850 if args.yes_no == 'yes': | |
| 851 try: | |
| 852 core_map = ET.parse(args.custom_map) | |
| 853 except (ET.XMLSyntaxError, ET.XMLSchemaParseError): | |
| 854 sys.exit('Execution aborted: custom map in wrong format') | |
| 855 elif args.yes_no == 'no': | |
| 856 core_map = ET.parse(args.tool_dir + '/local/HMRcoreMap.svg') | |
| 857 else: | |
| 858 core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg') | |
| 859 | |
| 860 maps(core_map, class_pat, ids, args.pValue, args.fChange, create_svg, create_pdf) | |
| 861 | |
| 862 print('Execution succeded') | 601 print('Execution succeded') | 
| 863 | 602 | 
| 864 return None | 603 return None | 
| 865 | 604 | 
| 866 ############################################################################### | 605 ############################################################################### | 
