annotate lifelines_tool/plotlykm.py @ 2:dd5e65893cb8 draft default tip

add survival and collapsed life table outputs suggested by Wolfgang
author fubar
date Thu, 10 Aug 2023 22:52:45 +0000
parents 232b874046a7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
1 # script for a lifelines ToolFactory KM/CPH tool for Galaxy
dd49a7040643 Initial commit
fubar
parents:
diff changeset
2 # km models for https://github.com/galaxyproject/tools-iuc/issues/5393
dd49a7040643 Initial commit
fubar
parents:
diff changeset
3 # test as
dd49a7040643 Initial commit
fubar
parents:
diff changeset
4 # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin"
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
5 # Ross Lazarus July 2023
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
6 import argparse
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
7
dd49a7040643 Initial commit
fubar
parents:
diff changeset
8 import os
dd49a7040643 Initial commit
fubar
parents:
diff changeset
9 import sys
dd49a7040643 Initial commit
fubar
parents:
diff changeset
10
dd49a7040643 Initial commit
fubar
parents:
diff changeset
11 import lifelines
dd49a7040643 Initial commit
fubar
parents:
diff changeset
12
dd49a7040643 Initial commit
fubar
parents:
diff changeset
13 from matplotlib import pyplot as plt
dd49a7040643 Initial commit
fubar
parents:
diff changeset
14
dd49a7040643 Initial commit
fubar
parents:
diff changeset
15 import pandas as pd
dd49a7040643 Initial commit
fubar
parents:
diff changeset
16
dd49a7040643 Initial commit
fubar
parents:
diff changeset
17
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
18 def trimlegend(v):
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
19 """
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
20 for int64 quintiles - must be ints - otherwise get silly legends with long float values
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
21 """
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
22 for i, av in enumerate(v):
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
23 x = int(av)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
24 v[i] = str(x)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
25 return v
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
26
dd49a7040643 Initial commit
fubar
parents:
diff changeset
27 kmf = lifelines.KaplanMeierFitter()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
28 cph = lifelines.CoxPHFitter()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
29
dd49a7040643 Initial commit
fubar
parents:
diff changeset
30 parser = argparse.ArgumentParser()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
31 a = parser.add_argument
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
32 a('--input_tab', default='rossi.tab', required=True)
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
33 a('--header', default='')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
34 a('--htmlout', default="test_run.html")
dd49a7040643 Initial commit
fubar
parents:
diff changeset
35 a('--group', default='')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
36 a('--time', default='', required=True)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
37 a('--status',default='', required=True)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
38 a('--cphcols',default='')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
39 a('--title', default='Default plot title')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
40 a('--image_type', default='png')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
41 a('--image_dir', default='images')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
42 a('--readme', default='run_log.txt')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
43 args = parser.parse_args()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
44 sys.stdout = open(args.readme, 'w')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
45 df = pd.read_csv(args.input_tab, sep='\t')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
46 NCOLS = df.columns.size
dd49a7040643 Initial commit
fubar
parents:
diff changeset
47 NROWS = len(df.index)
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
48 QVALS = [.2, .4, .6, .8] # for partial cox ph plots
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
49 defaultcols = ['col%d' % (x+1) for x in range(NCOLS)]
dd49a7040643 Initial commit
fubar
parents:
diff changeset
50 testcols = df.columns
dd49a7040643 Initial commit
fubar
parents:
diff changeset
51 if len(args.header.strip()) > 0:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
52 newcols = args.header.split(',')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
53 if len(newcols) == NCOLS:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
54 if (args.time in newcols) and (args.status in newcols):
dd49a7040643 Initial commit
fubar
parents:
diff changeset
55 df.columns = newcols
dd49a7040643 Initial commit
fubar
parents:
diff changeset
56 else:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
57 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and/or status %s not found in supplied header parameter %s' % (args.time, args.status, args.header))
dd49a7040643 Initial commit
fubar
parents:
diff changeset
58 sys.exit(4)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
59 else:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
60 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): Supplied header %s has %d comma delimited header names - does not match the input tabular file %d columns' % (args.header, len(newcols), NCOLS))
dd49a7040643 Initial commit
fubar
parents:
diff changeset
61 sys.exit(5)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
62 else: # no header supplied - check for a real one that matches the x and y axis column names
dd49a7040643 Initial commit
fubar
parents:
diff changeset
63 colsok = (args.time in testcols) and (args.status in testcols) # if they match, probably ok...should use more code and logic..
dd49a7040643 Initial commit
fubar
parents:
diff changeset
64 if colsok:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
65 df.columns = testcols # use actual header
dd49a7040643 Initial commit
fubar
parents:
diff changeset
66 else:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
67 colsok = (args.time in defaultcols) and (args.status in defaultcols)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
68 if colsok:
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
69 print('Replacing first row of data derived header %s with %s' % (testcols, defaultcols))
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
70 df.columns = defaultcols
dd49a7040643 Initial commit
fubar
parents:
diff changeset
71 else:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
72 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols))
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
73 print('## Lifelines tool\nInput data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
74 os.makedirs(args.image_dir, exist_ok=True)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
75 fig, ax = plt.subplots()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
76 if args.group > '':
dd49a7040643 Initial commit
fubar
parents:
diff changeset
77 names = []
dd49a7040643 Initial commit
fubar
parents:
diff changeset
78 times = []
dd49a7040643 Initial commit
fubar
parents:
diff changeset
79 events = []
dd49a7040643 Initial commit
fubar
parents:
diff changeset
80 for name, grouped_df in df.groupby(args.group):
dd49a7040643 Initial commit
fubar
parents:
diff changeset
81 T = grouped_df[args.time]
dd49a7040643 Initial commit
fubar
parents:
diff changeset
82 E = grouped_df[args.status]
dd49a7040643 Initial commit
fubar
parents:
diff changeset
83 gfit = kmf.fit(T, E, label=name)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
84 kmf.plot_survival_function(ax=ax)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
85 names.append(str(name))
dd49a7040643 Initial commit
fubar
parents:
diff changeset
86 times.append(T)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
87 events.append(E)
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
88 ax.set_title(args.title)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
89 fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
90 ngroup = len(names)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
91 if ngroup == 2: # run logrank test if 2 groups
dd49a7040643 Initial commit
fubar
parents:
diff changeset
92 results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99)
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
93 print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1]))
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
94 results.print_summary()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
95 else:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
96 kmf.fit(df[args.time], df[args.status])
dd49a7040643 Initial commit
fubar
parents:
diff changeset
97 kmf.plot_survival_function(ax=ax)
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
98 ax.set_title(args.title)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
99 fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
100 print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available')
2
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
101 survdf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status])
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
102 lifedf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status], collapse=True)
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
103 print("Survival table using time %s and event %s" % (args.time, args.status))
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
104 with pd.option_context('display.max_rows', None,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
105 'display.max_columns', None,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
106 'display.precision', 3,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
107 ):
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
108 print(survdf)
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
109 print("Life table using time %s and event %s" % (args.time, args.status))
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
110 with pd.option_context('display.max_rows', None,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
111 'display.max_columns', None,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
112 'display.precision', 3,
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
113 ):
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
114 print(lifedf)
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
115 outpath = os.path.join(args.image_dir,'survival_table.tabular')
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
116 survdf.to_csv(outpath, sep='\t')
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
117 outpath = os.path.join(args.image_dir,'life_table.tabular')
dd5e65893cb8 add survival and collapsed life table outputs suggested by Wolfgang
fubar
parents: 1
diff changeset
118 lifedf.to_csv(outpath, sep='\t')
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
119 if len(args.cphcols) > 0:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
120 fig, ax = plt.subplots()
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
121 ax.set_title('Cox-PH model: %s' % args.title)
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
122 cphcols = args.cphcols.strip().split(',')
dd49a7040643 Initial commit
fubar
parents:
diff changeset
123 cphcols = [x.strip() for x in cphcols]
dd49a7040643 Initial commit
fubar
parents:
diff changeset
124 notfound = sum([(x not in df.columns) for x in cphcols])
dd49a7040643 Initial commit
fubar
parents:
diff changeset
125 if notfound > 0:
dd49a7040643 Initial commit
fubar
parents:
diff changeset
126 sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns))
dd49a7040643 Initial commit
fubar
parents:
diff changeset
127 sys.exit(6)
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
128 colsdf = df[cphcols]
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
129 print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title))
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
130 cutcphcols = [args.time, args.status] + cphcols
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
131 cphdf = df[cutcphcols]
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
132 ucolcounts = colsdf.nunique(axis=0)
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
133 cph.fit(cphdf, duration_col=args.time, event_col=args.status)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
134 cph.print_summary()
1
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
135 for i, cov in enumerate(colsdf.columns):
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
136 if ucolcounts[i] > 10: # a hack - assume categories are sparse - if not imaginary quintiles will have to do
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
137 v = pd.Series.tolist(cphdf[cov].quantile(QVALS))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
138 vdt = df.dtypes[cov]
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
139 if vdt == 'int64':
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
140 v = trimlegend(v)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
141 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
142 axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
143 figr = axp.get_figure()
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
144 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
145 figr.savefig(oname)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
146 else:
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
147 v = pd.unique(cphdf[cov])
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
148 v = [str(x) for x in v]
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
149 try:
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
150 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
151 axp.set_title('Cox-PH %s partials: %s' % (cov,args.title))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
152 figr = axp.get_figure()
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
153 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
154 figr.savefig(oname)
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
155 except:
232b874046a7 Uploaded
fubar
parents: 0
diff changeset
156 pass
0
dd49a7040643 Initial commit
fubar
parents:
diff changeset
157 cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True)
dd49a7040643 Initial commit
fubar
parents:
diff changeset
158 for i, ax in enumerate(cphaxes):
dd49a7040643 Initial commit
fubar
parents:
diff changeset
159 figr = ax[0].get_figure()
dd49a7040643 Initial commit
fubar
parents:
diff changeset
160 titl = figr._suptitle.get_text().replace(' ','_').replace("'","")
dd49a7040643 Initial commit
fubar
parents:
diff changeset
161 oname = os.path.join(args.image_dir,'CPH%s.%s' % (titl, args.image_type))
dd49a7040643 Initial commit
fubar
parents:
diff changeset
162 figr.savefig(oname)