diff lifelines_tool/lifelineskmcph.xml @ 2:dd5e65893cb8 draft default tip

Add survival table and collapsed life table outputs, as suggested by Wolfgang
author fubar
date Thu, 10 Aug 2023 22:52:45 +0000
parents 232b874046a7
children
line wrap: on
line diff
--- a/lifelines_tool/lifelineskmcph.xml	Thu Aug 10 07:15:22 2023 +0000
+++ b/lifelines_tool/lifelineskmcph.xml	Thu Aug 10 22:52:45 2023 +0000
@@ -1,6 +1,6 @@
 <tool name="lifelineskmcph" id="lifelineskmcph" version="0.01">
   <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
-  <!--Created by toolfactory@galaxy.org at 10/08/2023 15:48:43 using the Galaxy Tool Factory.-->
+  <!--Created by toolfactory@galaxy.org at 10/08/2023 21:59:53 using the Galaxy Tool Factory.-->
   <description>Lifelines KM and optional Cox PH models</description>
   <requirements>
     <requirement version="1.5.3" type="package">pandas</requirement>
@@ -104,11 +104,11 @@
     else:
         colsok = (args.time in defaultcols) and (args.status in defaultcols)
         if colsok:
-            sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols))
+            print('Replacing first row of data derived header %s with %s' % (testcols, defaultcols))
             df.columns = defaultcols
         else:
             sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols))
-print('## Lifelines tool starting.\nUsing data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
+print('## Lifelines tool\nInput data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
 os.makedirs(args.image_dir, exist_ok=True)
 fig, ax = plt.subplots()
 if args.group > '':
@@ -136,6 +136,24 @@
     ax.set_title(args.title)
     fig.savefig(os.path.join(args.image_dir,'KM_%s.png' % args.title))
     print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available')
+survdf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status])
+lifedf = lifelines.utils.survival_table_from_events(df[args.time], df[args.status], collapse=True)
+print("#### Survival table using time %s and event %s" % (args.time, args.status))
+with pd.option_context('display.max_rows', None,
+                       'display.max_columns', None,
+                       'display.precision', 3,
+                       ):
+    print(survdf)
+print("#### Life table using time %s and event %s" % (args.time, args.status))
+with pd.option_context('display.max_rows', None,
+                       'display.max_columns', None,
+                       'display.precision', 3,
+                       ):
+    print(lifedf)
+outpath = os.path.join(args.image_dir,'survival_table.tabular')
+survdf.to_csv(outpath, sep='\t')
+outpath = os.path.join(args.image_dir,'life_table.tabular')
+lifedf.to_csv(outpath, sep='\t')
 if len(args.cphcols) > 0:
     fig, ax = plt.subplots()
     ax.set_title('Cox-PH model: %s' % args.title)
@@ -153,7 +171,7 @@
     cph.fit(cphdf, duration_col=args.time, event_col=args.status)
     cph.print_summary()
     for i, cov in enumerate(colsdf.columns):
-         if ucolcounts[i] > 10:
+         if ucolcounts[i] > 10: # a hack - assumes categorical covariates are sparse (few distinct values); for anything with more than 10, imaginary quantiles (QVALS) will have to do
              v = pd.Series.tolist(cphdf[cov].quantile(QVALS))
              vdt = df.dtypes[cov]
              if vdt == 'int64':