Mercurial > repos > fubar > lifelines_km_cph_tool
annotate lifelines_tool/plotlykm.py @ 2:dd5e65893cb8 draft default tip
add survival and collapsed life table outputs suggested by Wolfgang
author | fubar |
---|---|
date | Thu, 10 Aug 2023 22:52:45 +0000 (20 months ago) |
parents | 232b874046a7 |
children |
rev | line source |
---|---|
0 | 1 # script for a lifelines ToolFactory KM/CPH tool for Galaxy |
2 # km models for https://github.com/galaxyproject/tools-iuc/issues/5393 | |
3 # test as | |
4 # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin" | |
1 | 5 # Ross Lazarus July 2023 |
6 import argparse | |
0 | 7 |
8 import os | |
9 import sys | |
10 | |
11 import lifelines | |
12 | |
13 from matplotlib import pyplot as plt | |
14 | |
15 import pandas as pd | |
16 | |
17 | |
1 | 18 def trimlegend(v): |
19 """ | |
20 for int64 quintiles - must be ints - otherwise get silly legends with long float values | |
21 """ | |
22 for i, av in enumerate(v): | |
23 x = int(av) | |
24 v[i] = str(x) | |
25 return v | |
0 | 26 |
27 kmf = lifelines.KaplanMeierFitter() | |
28 cph = lifelines.CoxPHFitter() | |
29 | |
30 parser = argparse.ArgumentParser() | |
31 a = parser.add_argument | |
1 | 32 a('--input_tab', default='rossi.tab', required=True) |
0 | 33 a('--header', default='') |
34 a('--htmlout', default="test_run.html") | |
35 a('--group', default='') | |
36 a('--time', default='', required=True) | |
37 a('--status',default='', required=True) | |
38 a('--cphcols',default='') | |
39 a('--title', default='Default plot title') | |
40 a('--image_type', default='png') | |
41 a('--image_dir', default='images') | |
42 a('--readme', default='run_log.txt') | |
43 args = parser.parse_args() | |
44 sys.stdout = open(args.readme, 'w') | |
45 df = pd.read_csv(args.input_tab, sep='\t') | |
46 NCOLS = df.columns.size | |
47 NROWS = len(df.index) | |
1 | 48 QVALS = [.2, .4, .6, .8] # for partial cox ph plots |
0 | 49 defaultcols = ['col%d' % (x+1) for x in range(NCOLS)] |
50 testcols = df.columns | |
51 if len(args.header.strip()) > 0: | |
52 newcols = args.header.split(',') | |
53 if len(newcols) == NCOLS: | |
54 if (args.time in newcols) and (args.status in newcols): | |
55 df.columns = newcols | |
56 else: | |
57 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and/or status %s not found in supplied header parameter %s' % (args.time, args.status, args.header)) | |
58 sys.exit(4) | |
59 else: | |
60 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): Supplied header %s has %d comma delimited header names - does not match the input tabular file %d columns' % (args.header, len(newcols), NCOLS)) | |
61 sys.exit(5) | |
62 else: # no header supplied - check for a real one that matches the x and y axis column names | |
63 colsok = (args.time in testcols) and (args.status in testcols) # if they match, probably ok...should use more code and logic.. | |
64 if colsok: | |
65 df.columns = testcols # use actual header | |
66 else: | |
67 colsok = (args.time in defaultcols) and (args.status in defaultcols) | |
68 if colsok: | |
1 | 69 print('Replacing first row of data derived header %s with %s' % (testcols, defaultcols)) |
0 | 70 df.columns = defaultcols |
71 else: | |
72 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols)) | |
1 | 73 print('## Lifelines tool\nInput data header =', df.columns, 'time column =', args.time, 'status column =', args.status) |
0 | 74 os.makedirs(args.image_dir, exist_ok=True) |
75 fig, ax = plt.subplots() | |
76 if args.group > '': | |
77 names = [] | |
78 times = [] | |
79 events = [] | |
80 for name, grouped_df in df.groupby(args.group): | |
81 T = grouped_df[args.time] | |
82 E = grouped_df[args.status] | |
83 gfit = kmf.fit(T, E, label=name) | |
84 kmf.plot_survival_function(ax=ax) | |
85 names.append(str(name)) | |
86 times.append(T) | |
87 events.append(E) | |
1 | 88 ax.set_title(args.title) |
89 fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) | |
0 | 90 ngroup = len(names) |
91 if ngroup == 2: # run logrank test if 2 groups | |
92 results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99) | |
1 | 93 print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1])) |
0 | 94 results.print_summary() |
95 else: | |
96 kmf.fit(df[args.time], df[args.status]) | |
97 kmf.plot_survival_function(ax=ax) | |
1 | 98 ax.set_title(args.title) |
99 fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) | |
100 print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available') | |
2
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
101 survdf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status]) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
102 lifedf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status], collapse=True) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
103 print("Survival table using time %s and event %s" % (args.time, args.status)) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
104 with pd.option_context('display.max_rows', None, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
105 'display.max_columns', None, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
106 'display.precision', 3, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
107 ): |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
108 print(survdf) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
109 print("Life table using time %s and event %s" % (args.time, args.status)) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
110 with pd.option_context('display.max_rows', None, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
111 'display.max_columns', None, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
112 'display.precision', 3, |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
113 ): |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
114 print(lifedf) |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
115 outpath = os.path.join(args.image_dir,'survival_table.tabular') |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
116 survdf.to_csv(outpath, sep='\t') |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
117 outpath = os.path.join(args.image_dir,'life_table.tabular') |
dd5e65893cb8
add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents:
1
diff
changeset
|
118 lifedf.to_csv(outpath, sep='\t') |
0 | 119 if len(args.cphcols) > 0: |
120 fig, ax = plt.subplots() | |
1 | 121 ax.set_title('Cox-PH model: %s' % args.title) |
0 | 122 cphcols = args.cphcols.strip().split(',') |
123 cphcols = [x.strip() for x in cphcols] | |
124 notfound = sum([(x not in df.columns) for x in cphcols]) | |
125 if notfound > 0: | |
126 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns)) | |
127 sys.exit(6) | |
1 | 128 colsdf = df[cphcols] |
0 | 129 print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title)) |
1 | 130 cutcphcols = [args.time, args.status] + cphcols |
131 cphdf = df[cutcphcols] | |
132 ucolcounts = colsdf.nunique(axis=0) | |
0 | 133 cph.fit(cphdf, duration_col=args.time, event_col=args.status) |
134 cph.print_summary() | |
1 | 135 for i, cov in enumerate(colsdf.columns): |
136 if ucolcounts[i] > 10: # a hack - assume categories are sparse - if not imaginary quintiles will have to do | |
137 v = pd.Series.tolist(cphdf[cov].quantile(QVALS)) | |
138 vdt = df.dtypes[cov] | |
139 if vdt == 'int64': | |
140 v = trimlegend(v) | |
141 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) | |
142 axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title)) | |
143 figr = axp.get_figure() | |
144 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) | |
145 figr.savefig(oname) | |
146 else: | |
147 v = pd.unique(cphdf[cov]) | |
148 v = [str(x) for x in v] | |
149 try: | |
150 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) | |
151 axp.set_title('Cox-PH %s partials: %s' % (cov,args.title)) | |
152 figr = axp.get_figure() | |
153 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) | |
154 figr.savefig(oname) | |
155 except: | |
156 pass | |
0 | 157 cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True) |
158 for i, ax in enumerate(cphaxes): | |
159 figr = ax[0].get_figure() | |
160 titl = figr._suptitle.get_text().replace(' ','_').replace("'","") | |
161 oname = os.path.join(args.image_dir,'CPH%s.%s' % (titl, args.image_type)) | |
162 figr.savefig(oname) |