Mercurial > repos > fubar > lifelines_km_cph_tool
changeset 1:232b874046a7 draft
Uploaded
author | fubar |
---|---|
date | Thu, 10 Aug 2023 07:15:22 +0000 |
parents | dd49a7040643 |
children | dd5e65893cb8 |
files | lifelines_tool/README.md lifelines_tool/lifelineskmcph.xml lifelines_tool/plotlykm.py lifelines_tool/run_log.txt lifelines_tool/test-data/readme_sample |
diffstat | 5 files changed, 91 insertions(+), 146 deletions(-) [+] |
line wrap: on
line diff
--- a/lifelines_tool/README.md Wed Aug 09 11:12:16 2023 +0000 +++ b/lifelines_tool/README.md Thu Aug 10 07:15:22 2023 +0000 @@ -2,7 +2,7 @@ ## Galaxy tool to run failure time models using lifelines -## Install to your Galaxy server from the toolshed - search for lifelines_tool owned by fubar2 +## Install to your Galaxy server from the toolshed - search for lifelines_km_cph_tool owned by fubar2 ### More at https://lazarus.name/demo/ @@ -25,7 +25,7 @@ Should work with any tabular data with the required columns - time and status for observations. Issues to https://github.com/fubar2/lifelines_tool please. -Autogenerated so pull requests are possibly meaningless but regeneration of a new version is easy so please tell me what is needed. +Autogenerated so pull requests are possibly meaningless but regeneration of a new version should work. ## Tool made with the Galaxy ToolFactory: https://github.com/fubar2/galaxy_tf_overlay The current release includes this and a generic tabular version, and a java .jar wrapper in a history where the generating
--- a/lifelines_tool/lifelineskmcph.xml Wed Aug 09 11:12:16 2023 +0000 +++ b/lifelines_tool/lifelineskmcph.xml Thu Aug 10 07:15:22 2023 +0000 @@ -1,6 +1,6 @@ <tool name="lifelineskmcph" id="lifelineskmcph" version="0.01"> <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay--> - <!--Created by toolfactory@galaxy.org at 09/08/2023 17:43:16 using the Galaxy Tool Factory.--> + <!--Created by toolfactory@galaxy.org at 10/08/2023 15:48:43 using the Galaxy Tool Factory.--> <description>Lifelines KM and optional Cox PH models</description> <requirements> <requirement version="1.5.3" type="package">pandas</requirement> @@ -40,8 +40,9 @@ # km models for https://github.com/galaxyproject/tools-iuc/issues/5393 # test as # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin" +# Ross Lazarus July 2023 +import argparse -import argparse import os import sys @@ -51,15 +52,22 @@ import pandas as pd -# Ross Lazarus July 2023 +def trimlegend(v): + """ + for int64 quintiles - must be ints - otherwise get silly legends with long float values + """ + for i, av in enumerate(v): + x = int(av) + v[i] = str(x) + return v kmf = lifelines.KaplanMeierFitter() cph = lifelines.CoxPHFitter() parser = argparse.ArgumentParser() a = parser.add_argument -a('--input_tab', default='', required=True) +a('--input_tab', default='rossi.tab', required=True) a('--header', default='') a('--htmlout', default="test_run.html") a('--group', default='') @@ -75,6 +83,7 @@ df = pd.read_csv(args.input_tab, sep='\t') NCOLS = df.columns.size NROWS = len(df.index) +QVALS = [.2, .4, .6, .8] # for partial cox ph plots defaultcols = ['col%d' % (x+1) for x in range(NCOLS)] testcols = df.columns if len(args.header.strip()) > 0: @@ -106,14 +115,11 @@ names = [] times = [] events = [] - rmst = [] for name, grouped_df in df.groupby(args.group): T = grouped_df[args.time] E = grouped_df[args.status] gfit = kmf.fit(T, E, label=name) kmf.plot_survival_function(ax=ax) - rst = lifelines.utils.restricted_mean_survival_time(gfit) - rmst.append(rst) names.append(str(name)) times.append(T) events.append(E) @@ -124,30 +130,50 @@ results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99) print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1])) results.print_summary() - elif ngroup > 1: - fig, ax = plt.subplots(nrows=ngroup, ncols=1, sharex=True) - for i, rst in rmst: - lifelines.plotting.rmst_plot(rst, ax=ax) - fig.savefig(os.path.join(args.image_dir,'RMST_%s.png' % args.title)) else: kmf.fit(df[args.time], df[args.status]) kmf.plot_survival_function(ax=ax) ax.set_title(args.title) fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) + print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available') if len(args.cphcols) > 0: fig, ax = plt.subplots() - ax.set_title('Cox PH model: %s' % args.title) + ax.set_title('Cox-PH model: %s' % args.title) cphcols = args.cphcols.strip().split(',') cphcols = [x.strip() for x in cphcols] notfound = sum([(x not in df.columns) for x in cphcols]) if notfound > 0: sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns)) sys.exit(6) + colsdf = df[cphcols] print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title)) - cphcols += [args.time, args.status] - cphdf = df[cphcols] + cutcphcols = [args.time, args.status] + cphcols + cphdf = df[cutcphcols] + ucolcounts = colsdf.nunique(axis=0) cph.fit(cphdf, duration_col=args.time, event_col=args.status) cph.print_summary() + for i, cov in enumerate(colsdf.columns): + if ucolcounts[i] > 10: + v = pd.Series.tolist(cphdf[cov].quantile(QVALS)) + vdt = df.dtypes[cov] + if vdt == 'int64': + v = trimlegend(v) + axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) + axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title)) + figr = axp.get_figure() + oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) + figr.savefig(oname) + else: + v = pd.unique(cphdf[cov]) + v = [str(x) for x in v] + try: + axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) + axp.set_title('Cox-PH %s partials: %s' % (cov,args.title)) + figr = axp.get_figure() + oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) + figr.savefig(oname) + except: + pass cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True) for i, ax in enumerate(cphaxes): figr = ax[0].get_figure()
--- a/lifelines_tool/plotlykm.py Wed Aug 09 11:12:16 2023 +0000 +++ b/lifelines_tool/plotlykm.py Thu Aug 10 07:15:22 2023 +0000 @@ -2,8 +2,9 @@ # km models for https://github.com/galaxyproject/tools-iuc/issues/5393 # test as # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin" +# Ross Lazarus July 2023 +import argparse -import argparse import os import sys @@ -13,15 +14,22 @@ import pandas as pd -# Ross Lazarus July 2023 +def trimlegend(v): + """ + for int64 quintiles - must be ints - otherwise get silly legends with long float values + """ + for i, av in enumerate(v): + x = int(av) + v[i] = str(x) + return v kmf = lifelines.KaplanMeierFitter() cph = lifelines.CoxPHFitter() parser = argparse.ArgumentParser() a = parser.add_argument -a('--input_tab', default='', required=True) +a('--input_tab', default='rossi.tab', required=True) a('--header', default='') a('--htmlout', default="test_run.html") a('--group', default='') @@ -37,6 +45,7 @@ df = pd.read_csv(args.input_tab, sep='\t') NCOLS = df.columns.size NROWS = len(df.index) +QVALS = [.2, .4, .6, .8] # for partial cox ph plots defaultcols = ['col%d' % (x+1) for x in range(NCOLS)] testcols = df.columns if len(args.header.strip()) > 0: @@ -57,62 +66,79 @@ else: colsok = (args.time in defaultcols) and (args.status in defaultcols) if colsok: - sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols)) + print('Replacing first row of data derived header %s with %s' % (testcols, defaultcols)) df.columns = defaultcols else: sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols)) -print('## Lifelines tool starting.\nUsing data header =', df.columns, 'time column =', args.time, 'status column =', args.status) +print('## Lifelines tool\nInput data header =', df.columns, 'time column =', args.time, 'status column =', args.status) os.makedirs(args.image_dir, exist_ok=True) fig, ax = plt.subplots() if args.group > '': names = [] times = [] events = [] - rmst = [] for name, grouped_df in df.groupby(args.group): T = grouped_df[args.time] E = grouped_df[args.status] gfit = kmf.fit(T, E, label=name) kmf.plot_survival_function(ax=ax) - rst = lifelines.utils.restricted_mean_survival_time(gfit) - rmst.append(rst) names.append(str(name)) times.append(T) events.append(E) + ax.set_title(args.title) + fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) ngroup = len(names) if ngroup == 2: # run logrank test if 2 groups results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99) - print(' vs '.join(names), results) + print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1])) results.print_summary() - elif ngroup > 1: - fig, ax = plt.subplots(nrows=ngroup, ncols=1, sharex=True) - for i, rst in rmst: - lifelines.plotting.rmst_plot(rst, ax=ax) - fig.savefig(os.path.join(args.image_dir,'RMST_%s.png' % args.title)) else: kmf.fit(df[args.time], df[args.status]) kmf.plot_survival_function(ax=ax) -fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) + ax.set_title(args.title) + fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title)) + print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available') if len(args.cphcols) > 0: fig, ax = plt.subplots() + ax.set_title('Cox-PH model: %s' % args.title) cphcols = args.cphcols.strip().split(',') cphcols = [x.strip() for x in cphcols] notfound = sum([(x not in df.columns) for x in cphcols]) if notfound > 0: sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns)) sys.exit(6) + colsdf = df[cphcols] print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title)) - cphcols += [args.time, args.status] - cphdf = df[cphcols] + cutcphcols = [args.time, args.status] + cphcols + cphdf = df[cutcphcols] + ucolcounts = colsdf.nunique(axis=0) cph.fit(cphdf, duration_col=args.time, event_col=args.status) cph.print_summary() + for i, cov in enumerate(colsdf.columns): + if ucolcounts[i] > 10: # a hack - assume categories are sparse - if not imaginary quintiles will have to do + v = pd.Series.tolist(cphdf[cov].quantile(QVALS)) + vdt = df.dtypes[cov] + if vdt == 'int64': + v = trimlegend(v) + axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) + axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title)) + figr = axp.get_figure() + oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) + figr.savefig(oname) + else: + v = pd.unique(cphdf[cov]) + v = [str(x) for x in v] + try: + axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v) + axp.set_title('Cox-PH %s partials: %s' % (cov,args.title)) + figr = axp.get_figure() + oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type)) + figr.savefig(oname) + except: + pass cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True) for i, ax in enumerate(cphaxes): figr = ax[0].get_figure() titl = figr._suptitle.get_text().replace(' ','_').replace("'","") oname = os.path.join(args.image_dir,'CPH%s.%s' % (titl, args.image_type)) figr.savefig(oname) - - - -
--- a/lifelines_tool/run_log.txt Wed Aug 09 11:12:16 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -## Lifelines tool starting. -Using data header = Index(['Unnamed: 0', 'week', 'arrest', 'fin', 'age', 'race', 'wexp', 'mar', - 'paro', 'prio'], - dtype='object') time column = week status column = arrest -### Lifelines test of Proportional Hazards results with prio, age, race, paro, mar, fin as covariates on test -<lifelines.CoxPHFitter: fitted with 432 total observations, 318 right-censored observations> - duration col = 'week' - event col = 'arrest' - baseline estimation = breslow - number of observations = 432 -number of events observed = 114 - partial log-likelihood = -659.00 - time fit was run = 2023-08-09 00:18:43 UTC - ---- - coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% -covariate -prio 0.10 1.10 0.03 0.04 0.15 1.04 1.16 -age -0.06 0.94 0.02 -0.10 -0.02 0.90 0.98 -race 0.32 1.38 0.31 -0.28 0.92 0.75 2.52 -paro -0.09 0.91 0.20 -0.47 0.29 0.62 1.34 -mar -0.48 0.62 0.38 -1.22 0.25 0.30 1.29 -fin -0.38 0.68 0.19 -0.75 -0.00 0.47 1.00 - - cmp to z p -log2(p) -covariate -prio 0.00 3.53 <0.005 11.26 -age 0.00 -2.95 <0.005 8.28 -race 0.00 1.04 0.30 1.75 -paro 0.00 -0.46 0.65 0.63 -mar 0.00 -1.28 0.20 2.32 -fin 0.00 -1.98 0.05 4.40 ---- -Concordance = 0.63 -Partial AIC = 1330.00 -log-likelihood ratio test = 32.77 on 6 df --log2(p) of ll-ratio test = 16.39 - - - Bootstrapping lowess lines. May take a moment... - - - Bootstrapping lowess lines. May take a moment... - -The ``p_value_threshold`` is set at 0.01. Even under the null hypothesis of no violations, some -covariates will be below the threshold by chance. This is compounded when there are many covariates. -Similarly, when there are lots of observations, even minor deviances from the proportional hazard -assumption will be flagged. - -With that in mind, it's best to use a combination of statistical tests and visual tests to determine -the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)`` -and looking for non-constant lines. See link [A] below for a full example. - -<lifelines.StatisticalResult: proportional_hazard_test> - null_distribution = chi squared -degrees_of_freedom = 1 - model = <lifelines.CoxPHFitter: fitted with 432 total observations, 318 right-censored observations> - test_name = proportional_hazard_test - ---- - test_statistic p -log2(p) -age km 6.99 0.01 6.93 - rank 7.40 0.01 7.26 -fin km 0.02 0.90 0.15 - rank 0.01 0.91 0.13 -mar km 1.64 0.20 2.32 - rank 1.80 0.18 2.48 -paro km 0.06 0.81 0.31 - rank 0.07 0.79 0.34 -prio km 0.92 0.34 1.57 - rank 0.88 0.35 1.52 -race km 1.70 0.19 2.38 - rank 1.68 0.19 2.36 - - -1. Variable 'age' failed the non-proportional test: p-value is 0.0065. - - Advice 1: the functional form of the variable 'age' might be incorrect. That is, there may be -non-linear terms missing. The proportional hazard test used is very sensitive to incorrect -functional forms. See documentation in link [D] below on how to specify a functional form. - - Advice 2: try binning the variable 'age' using pd.cut, and then specify it in `strata=['age', -...]` in the call in `.fit`. See documentation in link [B] below. - - Advice 3: try adding an interaction term with your time variable. See documentation in link [C] -below. - - - Bootstrapping lowess lines. May take a moment... - - - Bootstrapping lowess lines. May take a moment... - - - Bootstrapping lowess lines. May take a moment... - - - Bootstrapping lowess lines. May take a moment... - - ---- -[A] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html -[B] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it -[C] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates -[D] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form -[E] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification -
--- a/lifelines_tool/test-data/readme_sample Wed Aug 09 11:12:16 2023 +0000 +++ b/lifelines_tool/test-data/readme_sample Thu Aug 10 07:15:22 2023 +0000 @@ -22,7 +22,7 @@ number of observations = 432 number of events observed = 114 partial log-likelihood = -659.00 - time fit was run = 2023-08-09 07:43:37 UTC + time fit was run = 2023-08-10 05:49:04 UTC --- coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95%