0
|
1 <tool name="lifelineskmcph" id="lifelineskmcph" version="0.01">
|
|
2 <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
|
1
|
3 <!--Created by toolfactory@galaxy.org at 10/08/2023 15:48:43 using the Galaxy Tool Factory.-->
|
0
|
<!-- One-line summary shown next to the tool name in the Galaxy tool panel. -->
<description>Lifelines KM and optional Cox PH models</description>
<!-- Pinned packages that the embedded Python script imports. -->
<requirements>
    <requirement type="package" version="1.5.3">pandas</requirement>
    <requirement type="package" version="3.7.2">matplotlib</requirement>
    <requirement type="package" version="0.27.7">lifelines</requirement>
</requirements>
<!-- Any non-zero exit status from the script is treated as a fatal job error. -->
<stdio>
    <exit_code level="fatal" range="1:"/>
</stdio>
<!-- The reported version is the wrapper version, echoed verbatim. -->
<version_command><![CDATA[echo "0.01"]]></version_command>
|
|
<!--
    Runs the embedded "runme" configfile script with one option per tool parameter.
    Every substituted value, including the script, input and readme paths, is
    single-quoted so that a path or value containing whitespace or shell
    metacharacters reaches the script as a single argument.
    'image_dir' is a literal directory name relative to the job working directory;
    it must match the directory attribute of discover_datasets in the outputs section.
-->
<command><![CDATA[python
'$runme'
--input_tab
'$input_tab'
--readme
'$readme'
--time
'$time'
--status
'$status'
--cphcols
'$CPHcovariatecolumnnames'
--title
'$title'
--header
'$header'
--group
'$group'
--image_type
'$image_type'
--image_dir
'image_dir']]></command>
|
|
36 <configfiles>
|
|
37 <configfile name="runme"><![CDATA[#raw
|
|
38
|
|
39 # script for a lifelines ToolFactory KM/CPH tool for Galaxy
|
|
40 # km models for https://github.com/galaxyproject/tools-iuc/issues/5393
|
|
41 # test as
|
|
42 # python plotlykm.py --input_tab rossi.tab --htmlout "testfoo" --time "week" --status "arrest" --title "test" --image_dir images --cphcol="prio,age,race,paro,mar,fin"
|
1
|
43 # Ross Lazarus July 2023
|
|
44 import argparse
|
0
|
45
|
|
46 import os
|
|
47 import sys
|
|
48
|
|
49 import lifelines
|
|
50
|
|
51 from matplotlib import pyplot as plt
|
|
52
|
|
53 import pandas as pd
|
|
54
|
|
55
|
1
|
def trimlegend(v):
    """
    Overwrite each entry of the list v, in place, with the string form of its
    integer truncation, then return that same list.

    Intended for int64 covariate quantiles, so plot legends show short whole
    numbers instead of long float values.
    """
    for position in range(len(v)):
        v[position] = str(int(v[position]))
    return v
|
0
|
64
|
|
# Fitter objects used by the Kaplan-Meier and Cox PH sections further down.
kmf = lifelines.KaplanMeierFitter()
cph = lifelines.CoxPHFitter()

# Command line interface: each entry is (option name, add_argument keyword settings),
# registered in the order listed.
parser = argparse.ArgumentParser()
a = parser.add_argument
for flag, settings in (
    ('--input_tab', {'default': 'rossi.tab', 'required': True}),
    ('--header', {'default': ''}),
    ('--htmlout', {'default': "test_run.html"}),
    ('--group', {'default': ''}),
    ('--time', {'default': '', 'required': True}),
    ('--status', {'default': '', 'required': True}),
    ('--cphcols', {'default': ''}),
    ('--title', {'default': 'Default plot title'}),
    ('--image_type', {'default': 'png'}),
    ('--image_dir', {'default': 'images'}),
    ('--readme', {'default': 'run_log.txt'}),
):
    a(flag, **settings)
args = parser.parse_args()

# From here on, everything written to stdout (print, lifelines summaries) lands in the readme file.
sys.stdout = open(args.readme, 'w')

# Load the tab separated input; pandas takes the first row as the column header.
df = pd.read_csv(args.input_tab, sep='\t')
NROWS = len(df.index)
NCOLS = df.columns.size
QVALS = [.2, .4, .6, .8]  # for partial cox ph plots
|
0
|
# Work out which column names the data frame should carry, in order of preference:
#   1. the comma delimited --header list, if one was supplied,
#   2. the header pandas read from the file, if it contains the time and status names,
#   3. automatic names col1..colN, if the time and status names are among those.
# Any other combination is a usage error and the script stops with a distinct exit code.
defaultcols = ['col%d' % (x + 1) for x in range(NCOLS)]
testcols = df.columns
if len(args.header.strip()) > 0:
    # Strip blanks around each supplied name so "a, b" matches a request for column "b",
    # the same way the Cox PH covariate names are stripped.
    newcols = [x.strip() for x in args.header.split(',')]
    if len(newcols) == NCOLS:
        if (args.time in newcols) and (args.status in newcols):
            df.columns = newcols
        else:
            sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and/or status %s not found in supplied header parameter %s' % (args.time, args.status, args.header))
            sys.exit(4)
    else:
        sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): Supplied header %s has %d comma delimited header names - does not match the input tabular file %d columns' % (args.header, len(newcols), NCOLS))
        sys.exit(5)
else:  # no header supplied - check for a real one that matches the x and y axis column names
    colsok = (args.time in testcols) and (args.status in testcols)  # if they match, probably ok...should use more code and logic..
    if colsok:
        df.columns = testcols  # use actual header
    else:
        colsok = (args.time in defaultcols) and (args.status in defaultcols)
        if colsok:
            # NOTE(review): the first row of the file was already consumed as the header by
            # read_csv, so if that row was really data it is not part of df - confirm intended.
            sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols))
            df.columns = defaultcols
        else:
            sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): time %s and status %s do not match anything in the file header, supplied header or automatic default column names %s' % (args.time, args.status, defaultcols))
            # Stop here like the other usage errors; continuing would only fail later
            # with a KeyError when the missing time/status columns are accessed.
            sys.exit(7)
|
|
# Kaplan-Meier section: always draws a survival curve, one line per group when a
# grouping column is given, and runs a log-rank test when there are exactly two groups.
print('## Lifelines tool starting.\nUsing data header =', df.columns, 'time column =', args.time, 'status column =', args.status)
os.makedirs(args.image_dir, exist_ok=True)
# Honour the requested image format for the KM plot, as the Cox PH plots already do.
kmname = os.path.join(args.image_dir, 'KM_%s.%s' % (args.title, args.image_type))
fig, ax = plt.subplots()
if args.group > '':
    if args.group not in df.columns:
        # Report a missing group column as a usage error rather than a groupby traceback.
        sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): group column %s not found in column header %s' % (args.group, df.columns))
        sys.exit(8)
    names = []
    times = []
    events = []
    for name, grouped_df in df.groupby(args.group):
        T = grouped_df[args.time]
        E = grouped_df[args.status]
        # Refit the shared fitter for this group and add its curve to the common axes.
        kmf.fit(T, E, label=name)
        kmf.plot_survival_function(ax=ax)
        names.append(str(name))
        times.append(T)
        events.append(E)
    ax.set_title(args.title)
    fig.savefig(kmname)
    ngroup = len(names)
    if ngroup == 2:  # run logrank test if 2 groups
        results = lifelines.statistics.logrank_test(times[0], times[1], events[0], events[1], alpha=.99)
        print('Logrank test for %s - %s vs %s\n' % (args.group, names[0], names[1]))
        results.print_summary()
else:
    kmf.fit(df[args.time], df[args.status])
    kmf.plot_survival_function(ax=ax)
    ax.set_title(args.title)
    fig.savefig(kmname)
    print('#### No grouping variable, so no log rank or other Kaplan-Meier statistical output is available')
|
0
|
# Cox proportional hazards section: only runs when covariate column names were supplied.
# Fits the model, prints its summary, saves one partial-effects plot per covariate and
# then saves the plots produced by the proportional hazards assumption check.
if len(args.cphcols.strip()) > 0:
    cphcols = [x.strip() for x in args.cphcols.strip().split(',')]
    missing = [x for x in cphcols if x not in df.columns]
    if missing:
        sys.stderr.write('## CRITICAL USAGE ERROR (not a bug!): One or more requested Cox PH columns %s not found in supplied column header %s' % (args.cphcols, df.columns))
        sys.exit(6)
    print('### Lifelines test of Proportional Hazards results with %s as covariates on %s' % (', '.join(cphcols), args.title))
    cphdf = df[[args.time, args.status] + cphcols]
    # Number of distinct values per covariate, indexed by column name.
    ucolcounts = df[cphcols].nunique(axis=0)
    cph.fit(cphdf, duration_col=args.time, event_col=args.status)
    cph.print_summary()
    for cov in cphcols:
        oname = os.path.join(args.image_dir, '%s_CoxPH_%s.%s' % (args.title, cov, args.image_type))
        # Look the count up by label; positional integer lookup on a label-indexed
        # Series is deprecated in pandas.
        if ucolcounts[cov] > 10:
            # Many distinct values: plot the partial effect at the QVALS quantiles.
            v = cphdf[cov].quantile(QVALS).tolist()
            if df.dtypes[cov] == 'int64':
                v = trimlegend(v)
            axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
            axp.set_title('Cox-PH %s quintile partials: %s' % (cov, args.title))
            axp.get_figure().savefig(oname)
        else:
            # Few distinct values: plot the partial effect at every observed value.
            v = [str(x) for x in pd.unique(cphdf[cov])]
            try:
                axp = cph.plot_partial_effects_on_outcome(cov, cmap='coolwarm', values=v)
                axp.set_title('Cox-PH %s partials: %s' % (cov, args.title))
                axp.get_figure().savefig(oname)
            except Exception as e:
                # Still best effort: one covariate that cannot be plotted must not stop the
                # run, but record why in the log instead of hiding the failure.
                print('## Could not plot Cox-PH partial effects for covariate %s: %s' % (cov, e))
    cphaxes = cph.check_assumptions(cphdf, p_value_threshold=0.01, show_plots=True)
    for i, axlist in enumerate(cphaxes):
        figr = axlist[0].get_figure()
        # The figure super-title names the output file; fall back to a numbered name
        # when the figure has none.
        suptitle = getattr(figr, '_suptitle', None)
        if suptitle is not None:
            titl = suptitle.get_text().replace(' ', '_').replace("'", "")
        else:
            titl = '_assumption_%d' % i
        oname = os.path.join(args.image_dir, 'CPH%s.%s' % (titl, args.image_type))
        figr.savefig(oname)
|
|
183
|
|
184
|
|
185 #end raw]]></configfile>
|
|
186 </configfiles>
|
|
<!--
    Tool form parameters. Each one is passed to the embedded script as a command line option.
    The status label follows the lifelines convention for the event column:
    1 means the event was observed, 0 means the observation was censored.
-->
<inputs>
    <param name="input_tab" type="data" format="tabular" optional="false" multiple="false"
           label="Tabular input file for failure time testing."
           help="Must have a column with a measure of time and status (0,1) at observation."/>
    <param name="time" type="text" value="week"
           label="Name of column containing a time to observation"
           help="Use a column name from the file header if the data has one, or use one from the list supplied below, or use col1....colN otherwise to select the correct column"/>
    <param name="status" type="text" value="arrest"
           label="Status at observation. Typically 1=event observed (e.g. deceased), 0=censored (e.g. still alive at last observation) for life-table observations"
           help="Use a column name from the header if the file has one, or use one from the list supplied below, or use col1....colN otherwise to select the correct column"/>
    <param name="CPHcovariatecolumnnames" type="text" value="prio,age,race,paro,mar,fin"
           label="Optional comma delimited column names to use as covariates in the Cox Proportional Hazards model"
           help="Leave blank for no Cox PH model tests "/>
    <param name="title" type="text" value="KM and CPH in lifelines test"
           label="Title for this lifelines analysis"
           help="Special characters will probably be escaped so do not use them"/>
    <param name="header" type="text" value=""
           label="Optional comma delimited list of column names to use for this tabular file. Default is None when col1...coln will be used if no header row in the input data"
           help="The column names supplied for time, status and so on MUST match either this supplied list, or if none, the original file header if it exists, or col1...coln as the default of last resort."/>
    <param name="group" type="text" value="race"
           label="Optional group column name for KM plot"
           help="If there are exactly 2 groups, a log-rank statistic will be generated as part of the Kaplan-Meier test."/>
    <param name="image_type" type="select" label="Output format for all images" help="">
        <option value="png">Portable Network Graphics .png format</option>
        <option value="jpg">JPEG</option>
        <option value="pdf">PDF</option>
        <option value="tiff">TIFF</option>
    </param>
</inputs>
|
|
<!--
    Two outputs: a list collection holding every image the script writes into the
    image_dir directory, and the text log the script writes to the readme path.
-->
<outputs>
    <collection type="list" name="image_dir" label="Images from $title on $input_tab.element_identifier">
        <!-- Element name and datatype are both taken from each discovered file name. -->
        <discover_datasets directory="image_dir" pattern="__name_and_ext__" visible="false"/>
    </collection>
    <data format="txt" name="readme" label="Lifelines_km_cph $title on $input_tab.element_identifier" hidden="false"/>
</outputs>
|
|
<!--
    Single functional test using the form defaults. The readme log is compared
    by size only, with a tolerance of 1000 bytes against the stored sample.
-->
<tests>
    <test>
        <output_collection name="image_dir"/>
        <output name="readme" compare="sim_size" delta="1000" value="readme_sample"/>
        <param name="input_tab" value="input_tab_sample"/>
        <param name="time" value="week"/>
        <param name="status" value="arrest"/>
        <param name="CPHcovariatecolumnnames" value="prio,age,race,paro,mar,fin"/>
        <param name="title" value="KM and CPH in lifelines test"/>
        <param name="header" value=""/>
        <param name="group" value="race"/>
        <param name="image_type" value="png"/>
    </test>
</tests>
|
|
222 <help><![CDATA[
|
|
223
|
|
224 This is a wrapper for some elementary life table analysis functions from the Lifelines package - see https://lifelines.readthedocs.io/en/latest for the full story
|
|
225
|
|
226
|
|
227
|
|
228 Given a Galaxy tabular dataset with suitable indicators for time and status at observation, this tool can perform some simple life-table analyses and produce some useful plots. Kaplan-Meier is the default. Cox Proportional Hazards model will be tested if covariates to include are provided.
|
|
229
|
|
230
|
|
231
|
|
232 1. Kaplan-Meier survival analysis - see https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html
|
|
233
|
|
234 This is always performed and a survival curve is plotted.
|
|
235
|
|
236 If there is an optional "group" column, the plot will show each group separately. If there are *exactly* two groups, a log-rank test for difference is performed and reported
|
|
237
|
|
238
|
|
239
|
|
240 2. The Cox Proportional Hazards model can be tested, if a comma separated list of covariate column names is supplied on the tool form.
|
|
241
|
|
242 These are used as covariates.
|
|
243
|
|
244 Although not usually a real problem, some diagnostics and advice about the assumption of proportional hazards are also provided as outputs - see https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html
|
|
245
|
|
246
|
|
247
|
|
248 A big shout out to the lifelines authors - no R code needed - nice job, thanks!
|
|
249
|
|
250 ]]></help>
|
|
<!-- Reference shown on the tool form. NOTE(review): presumably the Galaxy Tool Factory paper; confirm the DOI target. -->
<citations>
    <citation type="doi">10.1093/bioinformatics/bts573</citation>
</citations>
|
|
254 </tool>
|
|
255
|