comparison src/lib/MaaslinPlots.R @ 0:e0b5980139d9

maaslin
author george-weingart
date Tue, 13 May 2014 22:00:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0b5980139d9
1 #####################################################################################
2 #Copyright (C) <2012>
3 #
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of
5 #this software and associated documentation files (the "Software"), to deal in the
6 #Software without restriction, including without limitation the rights to use, copy,
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
8 #and to permit persons to whom the Software is furnished to do so, subject to
9 #the following conditions:
10 #
11 #The above copyright notice and this permission notice shall be included in all copies
12 #or substantial portions of the Software.
13 #
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 #
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
22 # authored by the Huttenhower lab at the Harvard School of Public Health
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
24 #####################################################################################
25
26 inlinedocs <- function(
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
28 ##description<< Holds MaAsLin related plotting
29 ) { return( pArgs ) }
30
31 funcPDF <- function(
32 ### Function to plot raw data with linear model information.
33 ### Continuous and integer variables are plotted with a line of best fit.
34 ### Other data is plotted as boxplots.
35 frmeTmp,
36 lsCur,
37 ### Linear model information
38 curPValue,
39 ### Pvalue to display
40 curQValue,
41 ### Qvalue to display
42 strFilePDF,
43 ### PDF file to create or to which to append
44 strBaseOut,
45 ### Project directory to place pdf in
46 strName,
47 ### Name of taxon
48 funcUnTransform=NULL,
49 ### If a transform is used the appropriate of that transfor must be used on the residuals in the partial residual plots
50 fDoResidualPlot = TRUE,
51 ### Plot the residual plots
52 fInvert = FALSE,
53 ### Invert the figure so the background is black
54 liNaIndices = c()
55 ### Indices of NA data that was imputed
56 ){
57 if( is.na( strFilePDF ) )
58 {
59 strFilePDF <- sprintf( "%s-%s.pdf", strBaseOut, strName )
60 pdf( strFilePDF, width = 11, useDingbats=FALSE )
61 }
62
63 #Invert plots
64 adColorMin <- c(1, 0, 0)
65 adColorMax <- c(0, 1, 0)
66 adColorMed <- c(0, 0, 0)
67 if( fInvert )
68 {
69 par( bg = "black", fg = "white", col.axis = "white", col.lab = "white", col.main = "white", col.sub = "white" )
70 adColorMin <- c(1, 1, 0)
71 adColorMax <- c(0, 1, 1)
72 adColorMed <- c(1, 1, 1)
73 }
74
75 #Create linear model title data string
76 strTitle <- sprintf( "%s (%.3g sd %.3g, p=%.3g, q=%.3g)", lsCur$orig, lsCur$value, lsCur$std, curPValue, curQValue )
77 adMar <- c(5, 4, 4, 2) + 0.1
78 dLine <- NA
79 strTaxon <- lsCur$taxon
80 if( nchar( strTaxon ) > 80 )
81 {
82 dCEX <- 0.75
83 iLen <- nchar( strTaxon )
84 if( iLen > 120 )
85 {
86 dLine <- 2.5
87 i <- round( iLen / 2 )
88 strTaxon <- paste( substring( strTaxon, 0, i ), substring( strTaxon, i + 1 ), sep = "\n" )
89 adMar[2] <- adMar[2] + 1
90 }
91 } else { dCEX = 1 }
92
93 #Plot 1x2 graphs per page
94 if(fDoResidualPlot){par(mfrow=c(1,2))}
95
96 # Plot factor data as boxplot if is descrete data
97 # Otherwise plot as a line
98 adCur <- lsCur$metadata
99 adY <- lsCur$data
100
101 # Remove NAs from data visualization if set to do so (if liNaIndices is not empty)
102 if(lsCur$name %in% names(liNaIndices)&&(length(liNaIndices[[lsCur$name]])>0))
103 {
104 adY <- adY[-1*liNaIndices[[lsCur$name]]]
105 adCur = adCur[-1*liNaIndices[[lsCur$name]]]
106 if(class(adCur)=="factor")
107 {
108 adCur = factor(adCur)
109 }
110 }
111
112 # Set the factor levels to include NA if they still exist
113 # This is so if something is not imputed, then if there are NAs they will be plotted (to show no imputing)
114 if( class( lsCur$metadata ) == "factor" )
115 {
116 sCurLevels = levels(adCur)
117 adCur = (as.character(adCur))
118 aiCurNAs = which(is.na(adCur))
119 if(length(aiCurNAs) > 0)
120 {
121 adCur[aiCurNAs]="NA"
122 sCurLevels = c(sCurLevels,"NA")
123 }
124 adCur = factor(adCur, levels = sCurLevels)
125 }
126
127 if( class( lsCur$metadata ) == "factor" )
128 {
129 astrNames <- c()
130 astrColors <- c()
131 dMed <- median( adY[adCur == levels( adCur )[1]], na.rm = TRUE )
132 adIQR <- quantile( adY, probs = c(0.25, 0.75), na.rm = TRUE )
133 dIQR <- adIQR[2] - adIQR[1]
134 if( dIQR <= 0 )
135 {
136 dIQR <- sd( adY, na.rm = TRUE )
137 }
138 dIQR <- dIQR / 2
139
140 #Print boxplots/strip charts of raw data. Add model data to it.
141 for( strLevel in levels( adCur ) )
142 {
143 c_iLen <- 32
144 strLength <- strLevel
145 if( nchar( strLength ) > c_iLen )
146 {
147 iTmp <- ( c_iLen / 2 ) - 2
148 strLength <- paste( substr( strLength, 1, iTmp ), substring( strLength, nchar( strLength ) - iTmp ), sep = "..." )
149 }
150 astrNames <- c(astrNames, sprintf( "%s (%d)", strLength, sum( adCur == strLevel, na.rm = TRUE ) ))
151 astrColors <- c(astrColors, sprintf( "%sAA", funcColor( ( median( adY[adCur == strLevel], na.rm = TRUE ) - dMed ) /
152 dIQR, dMax = 3, dMin = -3, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) ))
153 }
154 #Controls boxplot labels
155 #(When there are many factor levels some are skipped and not plotted
156 #So this must be reduced)
157 dBoxPlotLabelCex = dCEX
158 if(length(astrNames)>8)
159 {
160 dBoxPlotLabelCex = dBoxPlotLabelCex * 1.5/(length(astrNames)/8)
161 }
162 par(cex.axis = dBoxPlotLabelCex)
163 boxplot( adY ~ adCur, notch = TRUE, names = astrNames, mar = adMar, col = astrColors,
164 main = strTitle, cex.main = 48/nchar(strTitle), xlab = lsCur$name, ylab = NA, cex.lab = dCEX, outpch = 4, outcex = 0.5 )
165 par(cex.axis = dCEX)
166 stripchart( adY ~ adCur, add = TRUE, col = astrColors, method = "jitter", vertical = TRUE, pch = 20 )
167 title( ylab = strTaxon, cex.lab = dCEX, line = dLine )
168 } else {
169 #Plot continuous data
170 plot( adCur, adY, mar = adMar, main = strTitle, xlab = lsCur$name, pch = 20,
171 col = sprintf( "%s99", funcGetColor( ) ), ylab = NA, xaxt = "s" )
172 title( ylab = strTaxon, cex.lab = dCEX )
173 lmod <- lm( adY ~ adCur )
174 dColor <- lmod$coefficients[2] * mean( adCur, na.rm = TRUE ) / mean( adY, na.rm = TRUE )
175 strColor <- sprintf( "%sDD", funcColor( dColor, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) )
176 abline( reg = lmod, col = strColor, lwd = 3 )
177 }
178 ### Plot the residual plot
179 if(fDoResidualPlot){funcResidualPlot(lsCur=lsCur, frmeTmp=frmeTmp, adColorMin=adColorMin, adColorMax=adColorMax, adColorMed=adColorMed, adMar, funcUnTransform=funcUnTransform, liNaIndices)}
180 return(strFilePDF)
181 ### File to which the pdf was written
182 }
183
184 ### Plot 1
185 # axis 1 gene expression (one bug)
186 # axis 2 PC1 (bugs and metadata)(MFA)
187 # over plot real data vs the residuals (real data verses the prediction)
188 # remove all but 1 bug + metadata
189 ### Plot 2
190 #residuals (y) PCL1 (1 bug + metadata)
191 #Plot 3
192 ### Can plot the residuals against all the variables in a grid/lattic
193 funcGetFactorBoxColors = function(adCur,adY,adColorMin,adColorMax,adColorMed)
194 {
195 astrColors = c()
196
197 if( class( adCur ) == "factor" )
198 {
199 if( "NA" %in% levels( adCur ) )
200 {
201 afNA = adCur == "NA"
202 adY = adY[!afNA]
203 adCur = adCur[!afNA]
204 adCur = factor( adCur, levels = setdiff( levels( adCur ), "NA" ) )
205 }
206 dMed = median( adY[adCur == levels( adCur )[1]], na.rm = TRUE )
207 adIQR = quantile( adY, probs = c(0.25, 0.75), na.rm = TRUE )
208 dIQR = adIQR[2] - adIQR[1]
209 if( dIQR <= 0 )
210 {
211 dIQR <- sd( adY, na.rm = TRUE )
212 }
213 dIQR <- dIQR / 2
214
215 for( strLevel in levels( adCur ) )
216 {
217 astrColors <- c(astrColors, sprintf( "%sAA", funcColor( ( median( adY[adCur == strLevel], na.rm = TRUE ) - dMed ) /
218 dIQR, dMax = 3, dMin = -3, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) ))
219 }
220 }
221 return(astrColors)
222 }
223
224 funcResidualPlotHelper <- function(
225 frmTmp,
226 ### The dataframe to plot from
227 sResponseFeature,
228 lsFullModelCovariateNames,
229 ### All covariates in lm (Column Names)
230 lsCovariateToControlForNames,
231 ### Y Axis: All the covariate which will be plotted together respectively * their beta with the residuals of the full model added. (These should dummied names for factor data; not column names).
232 sCovariateOfInterest,
233 ### X Axis: raw data of the covariate of interest. (Column Name)
234 adColorMin,
235 ### Min color in color range for markers
236 adColorMax,
237 ### Max color in color range for markers
238 adColorMed,
239 ### Medium color in color range for markers
240 adMar,
241 ### Standardized margins
242 funcUnTransform = NULL,
243 ### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots
244 liNaIndices = c()
245 ### Indices of NA data that was imputed
246 ){
247 # Get model matrix (raw data)
248 adCur = frmTmp[[sResponseFeature]]
249
250 # Make a formula to calculate the new model to get the full model residuals
251 strFormula = paste("adCur",paste(sprintf( "`%s`", lsFullModelCovariateNames ),sep="", collapse="+"),sep="~")
252
253 # Calculate lm
254 lmod = (lm(as.formula(strFormula),frmTmp))
255
256 # Get all coefficient data in the new model
257 dfdCoefs = coefficients(lmod)
258
259 # Get all coefficient names in the new model
260 asAllCoefNames = names(dfdCoefs)
261
262 # Get Y
263 # For each covariate that is being plotted on the y axis
264 # Convert the coefficient name to the column name
265 # If they are equal then the data is not discontinuous and you can use the raw data as is and multiply it by the coefficient in the model
266 # If they are not equal than the data is discontinuous, get the value for the data, set all but the levels equal to it to zero and multiply by the ceofficient from the model.
267 vY = rep(coefficients(lmod)[["(Intercept)"]],dim(frmTmp)[1])
268 # vY = rep(0,dim(frmTmp)[1])
269
270 #Here we are not dealing with column names but, if factor data, the coefficient names
271 for(iCvIndex in 1:length(lsCovariateToControlForNames))
272 {
273 sCurrentCovariate = lsCovariateToControlForNames[iCvIndex]
274 #Get the column name of the current covariate (this must be used to compare to other column names)
275 sCurCovariateColumnName = funcCoef2Col(sCurrentCovariate, frmTmp)
276
277 #This is continuous data
278 if(sCurrentCovariate == sCurCovariateColumnName)
279 {
280 vY = vY + dfdCoefs[sCurrentCovariate]*frmTmp[[sCurCovariateColumnName]]
281 } else {
282 #Discontinuous data
283 # Get level
284 xLevel = substr(sCurrentCovariate,nchar(sCurCovariateColumnName)+1,nchar(sCurrentCovariate))
285
286 # Get locations where the data = level
287 aiLevelIndices = rep(0,dim(frmTmp)[1])
288 aiLevelIndices[which(frmTmp[sCurCovariateColumnName] == xLevel)]=1
289 sCurrentCovariateBeta = dfdCoefs[sCurrentCovariate]
290 if(is.na(sCurrentCovariateBeta)){sCurrentCovariateBeta=0}
291 vY = vY + sCurrentCovariateBeta * aiLevelIndices
292 }
293 }
294 #TODO based on transform vY = vY+sin(residuals(lmod))^2
295 if(!is.null(funcUnTransform))
296 {
297 vY = vY + funcUnTransform(residuals(lmod))
298 } else {
299 vY = vY + residuals(lmod) }
300
301 # Plot x, raw data
302 ## y label
303 sYLabel = paste(paste("B",lsCovariateToControlForNames,sep="*",collapse="+"),"e",sep="+")
304 sTitle = "Partial Residual Plot"
305
306 adCurXValues = frmTmp[[sCovariateOfInterest]]
307
308 # If not plotting metadata that was originally NA then remove the imputed values here
309 if(sCovariateOfInterest %in% names(liNaIndices)&&(length(liNaIndices[[sCovariateOfInterest]])>0))
310 {
311 adCurXValues = adCurXValues[-1*liNaIndices[[sCovariateOfInterest]]]
312 vY <- vY[-1*liNaIndices[[sCovariateOfInterest]]]
313 if(is.factor(adCurXValues)){adCurXValues = factor(adCurXValues)}
314 }
315
316 # Set the factor levels to include NA if they still exist
317 # This is so if something is not imputed, then if there are NAs they will be plotted (to show no imputing)
318 # Do not forget to keep te level order incase it was changed by the custom scripts.
319 if( class( adCurXValues ) == "factor" )
320 {
321 vsLevels = levels(adCurXValues)
322 if(sum(is.na(adCurXValues))>0)
323 {
324 adCurXValues = as.character(adCurXValues)
325 adCurXValues[is.na(adCurXValues)]="NA"
326 adCurXValues = factor(adCurXValues, levels=c(vsLevels,"NA"))
327 }
328 }
329
330 # Scale to the original range
331 if(!(class( adCurXValues ) == "factor" ))
332 {
333 vY = vY + mean(adCurXValues,rm.na=TRUE)
334 }
335
336 # Plot Partial Residual Plot
337 # If we are printing discontinuous data
338 # Get the color of the box plots
339 # Plot box plots
340 # Plot data as strip charts
341 if(is.factor(adCurXValues))
342 {
343 # adCurXValues = factor(adCurXValues)
344 astrColors = funcGetFactorBoxColors(adCurXValues,vY,adColorMin,adColorMax,adColorMed)
345 asNames = c()
346 for(sLevel in levels(adCurXValues))
347 {
348 asNames = c(asNames,sprintf( "%s (%d)", sLevel, sum( adCurXValues == sLevel, na.rm = TRUE ) ))
349 }
350
351 plot(adCurXValues, vY, xlab=sCovariateOfInterest, ylab=sYLabel, names=asNames, notch = TRUE,mar = adMar,col = astrColors, main=sTitle, outpch = 4, outcex = 0.5 )
352 stripchart( vY ~ adCurXValues, add = TRUE, col = astrColors, method = "jitter", vertical = TRUE, pch = 20 )
353
354 } else {
355 plot( adCurXValues, vY, mar = adMar, main = sTitle, xlab=sCovariateOfInterest, col = sprintf( "%s99", funcGetColor( ) ), pch = 20,ylab = sYLabel, xaxt = "s" )
356
357 lmodLine = lm(vY~adCurXValues)
358
359 dColor <- lmodLine$coefficients[2] * mean( adCurXValues, na.rm = TRUE ) / mean( vY, na.rm = TRUE )
360 strColor <- sprintf( "%sDD", funcColor( dColor, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) )
361 abline( reg =lmodLine, col = strColor, lwd = 3 )
362 }
363 }
364
365 funcBoostInfluencePlot <- function(
366 # Plot to show the rel.inf from boosting, what to know if the rank order is correct, better ranks for spiked data.
367 # Show the cut off and features identified as uneven.
368 vdRelInf,
369 sFeature,
370 vsPredictorNames,
371 vstrKeepMetadata,
372 vstrUneven = c()
373 ){
374 vsCol = rep("black",length(vdRelInf))
375 vsCol[which(vsPredictorNames %in% vstrKeepMetadata)]="green"
376 vsCol[which(vsPredictorNames %in% vstrUneven)] = "orange"
377 plot(vdRelInf, col=vsCol, main=sFeature, xlab="Index", ylab="Relative Influence")
378 legend("topright", pch = paste(1:length(vsPredictorNames)), legend= vsPredictorNames, text.col=vsCol, col=vsCol)
379 }
380
381 funcResidualPlot <- function(
382 ### Plot to data after confounding.
383 ### That is, in a linear model with significant coefficient b1 for variable x1,
384 ### that's been sparsified to some subset of terms: y = b0 + b1*x1 + sum(bi*xi)
385 ### Plot x1 on the X axis, and instead of y on the Y axis, instead plot:
386 ### y' = b0 + sum(bi*xi)
387 lsCur,
388 ### Assocation to plot
389 frmeTmp,
390 ### Data frame of orginal data
391 adColorMin,
392 ### Min color in color range for markers
393 adColorMax,
394 ### Max color in color range for markers
395 adColorMed,
396 ### Medium color in color range for markers
397 adMar,
398 ### Standardized margins
399 funcUnTransform,
400 ### If a transform is used the opporite of that transfor must be used on the residuals in the partial residual plots
401 liNaIndices = c()
402 ### Indices of NA data that was imputed
403 ){
404 #Now plot residual hat plot
405 #Get coefficient names
406 asAllCoefs = setdiff(names(lsCur$allCoefs),c("(Intercept)"))
407 asAllColNames = c()
408 for(sCoef in asAllCoefs)
409 {
410 asAllColNames = c(asAllColNames,funcCoef2Col(sCoef,frmeData=frmeTmp))
411 }
412 asAllColNames = unique(asAllColNames)
413
414 # All coefficients except for the one of interest
415 lsOtherCoefs = setdiff(asAllColNames, c(lsCur$name))
416
417 lsCovariatesToPlot = NULL
418 if(is.factor(lsCur$metadata))
419 {
420 lsCovariatesToPlot = paste(lsCur$name,levels(lsCur$metadata),sep="")
421 }else{lsCovariatesToPlot=c(lsCur$orig)}
422
423 # If there are no other coefficients then skip plot
424 # if(!length(lsOtherCoefs)){return()}
425
426 # Plot residuals
427 funcResidualPlotHelper(frmTmp=frmeTmp, sResponseFeature=lsCur$taxon, lsFullModelCovariateNames=asAllColNames, lsCovariateToControlForNames=lsCovariatesToPlot, sCovariateOfInterest=lsCur$name, adColorMin=adColorMin, adColorMax=adColorMax, adColorMed=adColorMed, adMar=adMar, funcUnTransform=funcUnTransform, liNaIndices=liNaIndices)
428 }