changeset 1:232b874046a7 draft

Uploaded
author fubar
date Thu, 10 Aug 2023 07:15:22 +0000
parents dd49a7040643
children dd5e65893cb8
files lifelines_tool/README.md lifelines_tool/lifelineskmcph.xml lifelines_tool/plotlykm.py lifelines_tool/run_log.txt lifelines_tool/test-data/readme_sample
diffstat 5 files changed, 91 insertions(+), 146 deletions(-) [+]
line wrap: on
line diff
--- a/lifelines_tool/README.md	Wed Aug 09 11:12:16 2023 +0000
+++ b/lifelines_tool/README.md	Thu Aug 10 07:15:22 2023 +0000
@@ -2,7 +2,7 @@
 
 ## Galaxy tool to run failure time models using lifelines
 
-## Install to your Galaxy server from the toolshed - search for lifelines_tool owned by fubar2
+## Install to your Galaxy server from the toolshed - search for lifelines_km_cph_tool owned by fubar2
 
 ### More at https://lazarus.name/demo/
 
@@ -25,7 +25,7 @@
 Should work with any tabular data with the required columns - time and status for observations.
 
 Issues to https://github.com/fubar2/lifelines_tool please.
-Autogenerated so pull requests are possibly meaningless but regeneration of a new version is easy so please tell me what is needed.
+Autogenerated so pull requests are possibly meaningless but regeneration of a new version should work.
 
 ## Tool made with the Galaxy ToolFactory: https://github.com/fubar2/galaxy_tf_overlay
 The current release includes this and a generic tabular version, and a java .jar wrapper in a history where the generating
--- a/lifelines_tool/lifelineskmcph.xml	Wed Aug 09 11:12:16 2023 +0000
+++ b/lifelines_tool/lifelineskmcph.xml	Thu Aug 10 07:15:22 2023 +0000
@@ -1,6 +1,6 @@
 <tool name="lifelineskmcph" id="lifelineskmcph" version="0.01">
   <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
-  <!--Created by toolfactory@galaxy.org at 09/08/2023 17:43:16 using the Galaxy Tool Factory.-->
+  <!--Created by toolfactory@galaxy.org at 10/08/2023 15:48:43 using the Galaxy Tool Factory.-->
   <description>Lifelines KM and optional Cox PH models</description>
   <requirements>
     <requirement version="1.5.3" type="package">pandas</requirement>
@@ -40,8 +40,9 @@
 # km models for https://github.com/galaxyproject/tools-iuc/issues/5393
 # test as
 # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin"
+# Ross Lazarus July 2023
+import argparse
 
-import argparse
 import os
 import sys
 
@@ -51,15 +52,22 @@
 
 import pandas as pd
 
-# Ross Lazarus July 2023
 
+def trimlegend(v):
+    """
+    for int64 quintiles - must be ints - otherwise get silly legends with long float values
+    """
+    for i, av in enumerate(v):
+        x = int(av)
+        v[i] = str(x)
+    return v
 
 kmf = lifelines.KaplanMeierFitter()
 cph = lifelines.CoxPHFitter()
 
 parser = argparse.ArgumentParser()
 a = parser.add_argument
-a('--input_tab', default='', required=True)
+a('--input_tab', default='rossi.tab', required=True)
 a('--header', default='')
 a('--htmlout', default="test_run.html")
 a('--group', default='')
@@ -75,6 +83,7 @@
 df = pd.read_csv(args.input_tab, sep='\t')
 NCOLS = df.columns.size
 NROWS = len(df.index)
+QVALS = [.2, .4, .6, .8] # for partial cox ph plots
 defaultcols = ['col%d' % (x+1) for x in range(NCOLS)]
 testcols = df.columns
 if len(args.header.strip()) > 0:
@@ -106,14 +115,11 @@
     names = []
     times = []
     events = []
-    rmst = []
     for name, grouped_df in df.groupby(args.group):
         T = grouped_df[args.time]
         E = grouped_df[args.status]
         gfit = kmf.fit(T, E, label=name)
         kmf.plot_survival_function(ax=ax)
-        rst = lifelines.utils.restricted_mean_survival_time(gfit)
-        rmst.append(rst)
         names.append(str(name))
         times.append(T)
         events.append(E)
@@ -124,30 +130,50 @@
         results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99)
         print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1]))
         results.print_summary()
-    elif ngroup > 1:
-        fig, ax = plt.subplots(nrows=ngroup, ncols=1, sharex=True)
-        for i, rst in rmst:
-            lifelines.plotting.rmst_plot(rst, ax=ax)
-        fig.savefig(os.path.join(args.image_dir,'RMST_%s.png' % args.title))
 else:
     kmf.fit(df[args.time], df[args.status])
     kmf.plot_survival_function(ax=ax)
     ax.set_title(args.title)
     fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
+    print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available')
 if len(args.cphcols) > 0:
     fig, ax = plt.subplots()
-    ax.set_title('Cox PH model: %s' % args.title)
+    ax.set_title('Cox-PH model: %s' % args.title)
     cphcols = args.cphcols.strip().split(',')
     cphcols = [x.strip() for x in cphcols]
     notfound = sum([(x not in df.columns) for x in cphcols])
     if notfound > 0:
         sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns))
         sys.exit(6)
+    colsdf = df[cphcols]
     print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title))
-    cphcols += [args.time, args.status]
-    cphdf = df[cphcols]
+    cutcphcols = [args.time, args.status] + cphcols
+    cphdf = df[cutcphcols]
+    ucolcounts = colsdf.nunique(axis=0)
     cph.fit(cphdf, duration_col=args.time, event_col=args.status)
     cph.print_summary()
+    for i, cov in enumerate(colsdf.columns):
+         if ucolcounts[i] > 10:
+             v = pd.Series.tolist(cphdf[cov].quantile(QVALS))
+             vdt = df.dtypes[cov]
+             if vdt == 'int64':
+                 v = trimlegend(v)
+             axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
+             axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title))
+             figr = axp.get_figure()
+             oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
+             figr.savefig(oname)
+         else:
+             v = pd.unique(cphdf[cov])
+             v = [str(x) for x in v]
+             try:
+                 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
+                 axp.set_title('Cox-PH %s partials: %s' % (cov,args.title))
+                 figr = axp.get_figure()
+                 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
+                 figr.savefig(oname)
+             except:
+                 pass
     cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True)
     for i, ax in enumerate(cphaxes):
         figr = ax[0].get_figure()
--- a/lifelines_tool/plotlykm.py	Wed Aug 09 11:12:16 2023 +0000
+++ b/lifelines_tool/plotlykm.py	Thu Aug 10 07:15:22 2023 +0000
@@ -2,8 +2,9 @@
 # km models for https://github.com/galaxyproject/tools-iuc/issues/5393
 # test as
 # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin"
+# Ross Lazarus July 2023
+import argparse
 
-import argparse
 import os
 import sys
 
@@ -13,15 +14,22 @@
 
 import pandas as pd
 
-# Ross Lazarus July 2023
 
+def trimlegend(v):
+    """
+    for int64 quintiles - must be ints - otherwise get silly legends with long float values
+    """
+    for i, av in enumerate(v):
+        x = int(av)
+        v[i] = str(x)
+    return v
 
 kmf = lifelines.KaplanMeierFitter()
 cph = lifelines.CoxPHFitter()
 
 parser = argparse.ArgumentParser()
 a = parser.add_argument
-a('--input_tab', default='', required=True)
+a('--input_tab', default='rossi.tab', required=True)
 a('--header', default='')
 a('--htmlout', default="test_run.html")
 a('--group', default='')
@@ -37,6 +45,7 @@
 df = pd.read_csv(args.input_tab, sep='\t')
 NCOLS = df.columns.size
 NROWS = len(df.index)
+QVALS = [.2, .4, .6, .8] # for partial cox ph plots
 defaultcols = ['col%d' % (x+1) for x in range(NCOLS)]
 testcols = df.columns
 if len(args.header.strip()) > 0:
@@ -57,62 +66,79 @@
     else:
         colsok = (args.time in defaultcols) and (args.status in defaultcols)
         if colsok:
-            sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols))
+            print('Replacing first row of data derived header %s with %s' % (testcols, defaultcols))
             df.columns = defaultcols
         else:
             sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols))
-print('## Lifelines tool starting.\nUsing data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
+print('## Lifelines tool\nInput data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
 os.makedirs(args.image_dir, exist_ok=True)
 fig, ax = plt.subplots()
 if args.group > '':
     names = []
     times = []
     events = []
-    rmst = []
     for name, grouped_df in df.groupby(args.group):
         T = grouped_df[args.time]
         E = grouped_df[args.status]
         gfit = kmf.fit(T, E, label=name)
         kmf.plot_survival_function(ax=ax)
-        rst = lifelines.utils.restricted_mean_survival_time(gfit)
-        rmst.append(rst)
         names.append(str(name))
         times.append(T)
         events.append(E)
+    ax.set_title(args.title)
+    fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
     ngroup = len(names)
     if  ngroup == 2: # run logrank test if 2 groups
         results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99)
-        print(' vs '.join(names), results)
+        print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1]))
         results.print_summary()
-    elif ngroup > 1:
-        fig, ax = plt.subplots(nrows=ngroup, ncols=1, sharex=True)
-        for i, rst in rmst:
-            lifelines.plotting.rmst_plot(rst, ax=ax)
-        fig.savefig(os.path.join(args.image_dir,'RMST_%s.png' % args.title))
 else:
     kmf.fit(df[args.time], df[args.status])
     kmf.plot_survival_function(ax=ax)
-fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
+    ax.set_title(args.title)
+    fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
+    print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available')
 if len(args.cphcols) > 0:
     fig, ax = plt.subplots()
+    ax.set_title('Cox-PH model: %s' % args.title)
     cphcols = args.cphcols.strip().split(',')
     cphcols = [x.strip() for x in cphcols]
     notfound = sum([(x not in df.columns) for x in cphcols])
     if notfound > 0:
         sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns))
         sys.exit(6)
+    colsdf = df[cphcols]
     print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title))
-    cphcols += [args.time, args.status]
-    cphdf = df[cphcols]
+    cutcphcols = [args.time, args.status] + cphcols
+    cphdf = df[cutcphcols]
+    ucolcounts = colsdf.nunique(axis=0)
     cph.fit(cphdf, duration_col=args.time, event_col=args.status)
     cph.print_summary()
+    for i, cov in enumerate(colsdf.columns):
+         if ucolcounts[i] > 10: # a hack - assume categories are sparse - if not imaginary quintiles will have to do
+             v = pd.Series.tolist(cphdf[cov].quantile(QVALS))
+             vdt = df.dtypes[cov]
+             if vdt == 'int64':
+                 v = trimlegend(v)
+             axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
+             axp.set_title('Cox-PH %s quintile partials: %s' % (cov,args.title))
+             figr = axp.get_figure()
+             oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
+             figr.savefig(oname)
+         else:
+             v = pd.unique(cphdf[cov])
+             v = [str(x) for x in v]
+             try:
+                 axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
+                 axp.set_title('Cox-PH %s partials: %s' % (cov,args.title))
+                 figr = axp.get_figure()
+                 oname = os.path.join(args.image_dir,'%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
+                 figr.savefig(oname)
+             except:
+                 pass
     cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True)
     for i, ax in enumerate(cphaxes):
         figr = ax[0].get_figure()
         titl = figr._suptitle.get_text().replace(' ','_').replace("'","")
         oname = os.path.join(args.image_dir,'CPH%s.%s' % (titl, args.image_type))
         figr.savefig(oname)
-
-
-
-
--- a/lifelines_tool/run_log.txt	Wed Aug 09 11:12:16 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,107 +0,0 @@
-## Lifelines tool starting.
-Using data header = Index(['Unnamed: 0', 'week', 'arrest', 'fin', 'age', 'race', 'wexp', 'mar',
-       'paro', 'prio'],
-      dtype='object') time column = week status column = arrest
-### Lifelines test of Proportional Hazards results with prio, age, race, paro, mar, fin as covariates on test
-<lifelines.CoxPHFitter: fitted with 432 total observations, 318 right-censored observations>
-             duration col = 'week'
-                event col = 'arrest'
-      baseline estimation = breslow
-   number of observations = 432
-number of events observed = 114
-   partial log-likelihood = -659.00
-         time fit was run = 2023-08-09 00:18:43 UTC
-
----
-            coef  exp(coef)   se(coef)   coef lower 95%   coef upper 95%  exp(coef) lower 95%  exp(coef) upper 95%
-covariate                                                                                                         
-prio        0.10       1.10       0.03             0.04             0.15                 1.04                 1.16
-age        -0.06       0.94       0.02            -0.10            -0.02                 0.90                 0.98
-race        0.32       1.38       0.31            -0.28             0.92                 0.75                 2.52
-paro       -0.09       0.91       0.20            -0.47             0.29                 0.62                 1.34
-mar        -0.48       0.62       0.38            -1.22             0.25                 0.30                 1.29
-fin        -0.38       0.68       0.19            -0.75            -0.00                 0.47                 1.00
-
-            cmp to     z      p   -log2(p)
-covariate                                 
-prio          0.00  3.53 <0.005      11.26
-age           0.00 -2.95 <0.005       8.28
-race          0.00  1.04   0.30       1.75
-paro          0.00 -0.46   0.65       0.63
-mar           0.00 -1.28   0.20       2.32
-fin           0.00 -1.98   0.05       4.40
----
-Concordance = 0.63
-Partial AIC = 1330.00
-log-likelihood ratio test = 32.77 on 6 df
--log2(p) of ll-ratio test = 16.39
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-The ``p_value_threshold`` is set at 0.01. Even under the null hypothesis of no violations, some
-covariates will be below the threshold by chance. This is compounded when there are many covariates.
-Similarly, when there are lots of observations, even minor deviances from the proportional hazard
-assumption will be flagged.
-
-With that in mind, it's best to use a combination of statistical tests and visual tests to determine
-the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)``
-and looking for non-constant lines. See link [A] below for a full example.
-
-<lifelines.StatisticalResult: proportional_hazard_test>
- null_distribution = chi squared
-degrees_of_freedom = 1
-             model = <lifelines.CoxPHFitter: fitted with 432 total observations, 318 right-censored observations>
-         test_name = proportional_hazard_test
-
----
-           test_statistic    p  -log2(p)
-age  km              6.99 0.01      6.93
-     rank            7.40 0.01      7.26
-fin  km              0.02 0.90      0.15
-     rank            0.01 0.91      0.13
-mar  km              1.64 0.20      2.32
-     rank            1.80 0.18      2.48
-paro km              0.06 0.81      0.31
-     rank            0.07 0.79      0.34
-prio km              0.92 0.34      1.57
-     rank            0.88 0.35      1.52
-race km              1.70 0.19      2.38
-     rank            1.68 0.19      2.36
-
-
-1. Variable 'age' failed the non-proportional test: p-value is 0.0065.
-
-   Advice 1: the functional form of the variable 'age' might be incorrect. That is, there may be
-non-linear terms missing. The proportional hazard test used is very sensitive to incorrect
-functional forms. See documentation in link [D] below on how to specify a functional form.
-
-   Advice 2: try binning the variable 'age' using pd.cut, and then specify it in `strata=['age',
-...]` in the call in `.fit`. See documentation in link [B] below.
-
-   Advice 3: try adding an interaction term with your time variable. See documentation in link [C]
-below.
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-
-   Bootstrapping lowess lines. May take a moment...
-
-
----
-[A]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html
-[B]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it
-[C]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates
-[D]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form
-[E]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification
-
--- a/lifelines_tool/test-data/readme_sample	Wed Aug 09 11:12:16 2023 +0000
+++ b/lifelines_tool/test-data/readme_sample	Thu Aug 10 07:15:22 2023 +0000
@@ -22,7 +22,7 @@
    number of observations = 432
 number of events observed = 114
    partial log-likelihood = -659.00
-         time fit was run = 2023-08-09 07:43:37 UTC
+         time fit was run = 2023-08-10 05:49:04 UTC
 
 ---
             coef  exp(coef)   se(coef)   coef lower 95%   coef upper 95%  exp(coef) lower 95%  exp(coef) upper 95%