Mercurial > repos > george-weingart > maaslin
comparison src/lib/MaaslinPlots.R @ 0:e0b5980139d9
maaslin
author | george-weingart |
---|---|
date | Tue, 13 May 2014 22:00:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0b5980139d9 |
---|---|
1 ##################################################################################### | |
2 #Copyright (C) <2012> | |
3 # | |
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
5 #this software and associated documentation files (the "Software"), to deal in the | |
6 #Software without restriction, including without limitation the rights to use, copy, | |
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
8 #and to permit persons to whom the Software is furnished to do so, subject to | |
9 #the following conditions: | |
10 # | |
11 #The above copyright notice and this permission notice shall be included in all copies | |
12 #or substantial portions of the Software. | |
13 # | |
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
20 # | |
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
22 # authored by the Huttenhower lab at the Harvard School of Public Health | |
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
24 ##################################################################################### | |
25 | |
26 inlinedocs <- function( | |
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> | |
28 ##description<< Holds MaAsLin related plotting | |
29 ) { return( pArgs ) } | |
30 | |
31 funcPDF <- function( | |
32 ### Function to plot raw data with linear model information. | |
33 ### Continuous and integer variables are plotted with a line of best fit. | |
34 ### Other data is plotted as boxplots. | |
35 frmeTmp, | |
36 lsCur, | |
37 ### Linear model information | |
38 curPValue, | |
39 ### Pvalue to display | |
40 curQValue, | |
41 ### Qvalue to display | |
42 strFilePDF, | |
43 ### PDF file to create or to which to append | |
44 strBaseOut, | |
45 ### Project directory to place pdf in | |
46 strName, | |
47 ### Name of taxon | |
48 funcUnTransform=NULL, | |
49 ### If a transform is used the appropriate of that transfor must be used on the residuals in the partial residual plots | |
50 fDoResidualPlot = TRUE, | |
51 ### Plot the residual plots | |
52 fInvert = FALSE, | |
53 ### Invert the figure so the background is black | |
54 liNaIndices = c() | |
55 ### Indices of NA data that was imputed | |
56 ){ | |
57 if( is.na( strFilePDF ) ) | |
58 { | |
59 strFilePDF <- sprintf( "%s-%s.pdf", strBaseOut, strName ) | |
60 pdf( strFilePDF, width = 11, useDingbats=FALSE ) | |
61 } | |
62 | |
63 #Invert plots | |
64 adColorMin <- c(1, 0, 0) | |
65 adColorMax <- c(0, 1, 0) | |
66 adColorMed <- c(0, 0, 0) | |
67 if( fInvert ) | |
68 { | |
69 par( bg = "black", fg = "white", col.axis = "white", col.lab = "white", col.main = "white", col.sub = "white" ) | |
70 adColorMin <- c(1, 1, 0) | |
71 adColorMax <- c(0, 1, 1) | |
72 adColorMed <- c(1, 1, 1) | |
73 } | |
74 | |
75 #Create linear model title data string | |
76 strTitle <- sprintf( "%s (%.3g sd %.3g, p=%.3g, q=%.3g)", lsCur$orig, lsCur$value, lsCur$std, curPValue, curQValue ) | |
77 adMar <- c(5, 4, 4, 2) + 0.1 | |
78 dLine <- NA | |
79 strTaxon <- lsCur$taxon | |
80 if( nchar( strTaxon ) > 80 ) | |
81 { | |
82 dCEX <- 0.75 | |
83 iLen <- nchar( strTaxon ) | |
84 if( iLen > 120 ) | |
85 { | |
86 dLine <- 2.5 | |
87 i <- round( iLen / 2 ) | |
88 strTaxon <- paste( substring( strTaxon, 0, i ), substring( strTaxon, i + 1 ), sep = "\n" ) | |
89 adMar[2] <- adMar[2] + 1 | |
90 } | |
91 } else { dCEX = 1 } | |
92 | |
93 #Plot 1x2 graphs per page | |
94 if(fDoResidualPlot){par(mfrow=c(1,2))} | |
95 | |
96 # Plot factor data as boxplot if is descrete data | |
97 # Otherwise plot as a line | |
98 adCur <- lsCur$metadata | |
99 adY <- lsCur$data | |
100 | |
101 # Remove NAs from data visualization if set to do so (if liNaIndices is not empty) | |
102 if(lsCur$name %in% names(liNaIndices)&&(length(liNaIndices[[lsCur$name]])>0)) | |
103 { | |
104 adY <- adY[-1*liNaIndices[[lsCur$name]]] | |
105 adCur = adCur[-1*liNaIndices[[lsCur$name]]] | |
106 if(class(adCur)=="factor") | |
107 { | |
108 adCur = factor(adCur) | |
109 } | |
110 } | |
111 | |
112 # Set the factor levels to include NA if they still exist | |
113 # This is so if something is not imputed, then if there are NAs they will be plotted (to show no imputing) | |
114 if( class( lsCur$metadata ) == "factor" ) | |
115 { | |
116 sCurLevels = levels(adCur) | |
117 adCur = (as.character(adCur)) | |
118 aiCurNAs = which(is.na(adCur)) | |
119 if(length(aiCurNAs) > 0) | |
120 { | |
121 adCur[aiCurNAs]="NA" | |
122 sCurLevels = c(sCurLevels,"NA") | |
123 } | |
124 adCur = factor(adCur, levels = sCurLevels) | |
125 } | |
126 | |
127 if( class( lsCur$metadata ) == "factor" ) | |
128 { | |
129 astrNames <- c() | |
130 astrColors <- c() | |
131 dMed <- median( adY[adCur == levels( adCur )[1]], na.rm = TRUE ) | |
132 adIQR <- quantile( adY, probs = c(0.25, 0.75), na.rm = TRUE ) | |
133 dIQR <- adIQR[2] - adIQR[1] | |
134 if( dIQR <= 0 ) | |
135 { | |
136 dIQR <- sd( adY, na.rm = TRUE ) | |
137 } | |
138 dIQR <- dIQR / 2 | |
139 | |
140 #Print boxplots/strip charts of raw data. Add model data to it. | |
141 for( strLevel in levels( adCur ) ) | |
142 { | |
143 c_iLen <- 32 | |
144 strLength <- strLevel | |
145 if( nchar( strLength ) > c_iLen ) | |
146 { | |
147 iTmp <- ( c_iLen / 2 ) - 2 | |
148 strLength <- paste( substr( strLength, 1, iTmp ), substring( strLength, nchar( strLength ) - iTmp ), sep = "..." ) | |
149 } | |
150 astrNames <- c(astrNames, sprintf( "%s (%d)", strLength, sum( adCur == strLevel, na.rm = TRUE ) )) | |
151 astrColors <- c(astrColors, sprintf( "%sAA", funcColor( ( median( adY[adCur == strLevel], na.rm = TRUE ) - dMed ) / | |
152 dIQR, dMax = 3, dMin = -3, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) )) | |
153 } | |
154 #Controls boxplot labels | |
155 #(When there are many factor levels some are skipped and not plotted | |
156 #So this must be reduced) | |
157 dBoxPlotLabelCex = dCEX | |
158 if(length(astrNames)>8) | |
159 { | |
160 dBoxPlotLabelCex = dBoxPlotLabelCex * 1.5/(length(astrNames)/8) | |
161 } | |
162 par(cex.axis = dBoxPlotLabelCex) | |
163 boxplot( adY ~ adCur, notch = TRUE, names = astrNames, mar = adMar, col = astrColors, | |
164 main = strTitle, cex.main = 48/nchar(strTitle), xlab = lsCur$name, ylab = NA, cex.lab = dCEX, outpch = 4, outcex = 0.5 ) | |
165 par(cex.axis = dCEX) | |
166 stripchart( adY ~ adCur, add = TRUE, col = astrColors, method = "jitter", vertical = TRUE, pch = 20 ) | |
167 title( ylab = strTaxon, cex.lab = dCEX, line = dLine ) | |
168 } else { | |
169 #Plot continuous data | |
170 plot( adCur, adY, mar = adMar, main = strTitle, xlab = lsCur$name, pch = 20, | |
171 col = sprintf( "%s99", funcGetColor( ) ), ylab = NA, xaxt = "s" ) | |
172 title( ylab = strTaxon, cex.lab = dCEX ) | |
173 lmod <- lm( adY ~ adCur ) | |
174 dColor <- lmod$coefficients[2] * mean( adCur, na.rm = TRUE ) / mean( adY, na.rm = TRUE ) | |
175 strColor <- sprintf( "%sDD", funcColor( dColor, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) ) | |
176 abline( reg = lmod, col = strColor, lwd = 3 ) | |
177 } | |
178 ### Plot the residual plot | |
179 if(fDoResidualPlot){funcResidualPlot(lsCur=lsCur, frmeTmp=frmeTmp, adColorMin=adColorMin, adColorMax=adColorMax, adColorMed=adColorMed, adMar, funcUnTransform=funcUnTransform, liNaIndices)} | |
180 return(strFilePDF) | |
181 ### File to which the pdf was written | |
182 } | |
183 | |
184 ### Plot 1 | |
185 # axis 1 gene expression (one bug) | |
186 # axis 2 PC1 (bugs and metadata)(MFA) | |
187 # over plot real data vs the residuals (real data verses the prediction) | |
188 # remove all but 1 bug + metadata | |
189 ### Plot 2 | |
190 #residuals (y) PCL1 (1 bug + metadata) | |
191 #Plot 3 | |
192 ### Can plot the residuals against all the variables in a grid/lattic | |
193 funcGetFactorBoxColors = function(adCur,adY,adColorMin,adColorMax,adColorMed) | |
194 { | |
195 astrColors = c() | |
196 | |
197 if( class( adCur ) == "factor" ) | |
198 { | |
199 if( "NA" %in% levels( adCur ) ) | |
200 { | |
201 afNA = adCur == "NA" | |
202 adY = adY[!afNA] | |
203 adCur = adCur[!afNA] | |
204 adCur = factor( adCur, levels = setdiff( levels( adCur ), "NA" ) ) | |
205 } | |
206 dMed = median( adY[adCur == levels( adCur )[1]], na.rm = TRUE ) | |
207 adIQR = quantile( adY, probs = c(0.25, 0.75), na.rm = TRUE ) | |
208 dIQR = adIQR[2] - adIQR[1] | |
209 if( dIQR <= 0 ) | |
210 { | |
211 dIQR <- sd( adY, na.rm = TRUE ) | |
212 } | |
213 dIQR <- dIQR / 2 | |
214 | |
215 for( strLevel in levels( adCur ) ) | |
216 { | |
217 astrColors <- c(astrColors, sprintf( "%sAA", funcColor( ( median( adY[adCur == strLevel], na.rm = TRUE ) - dMed ) / | |
218 dIQR, dMax = 3, dMin = -3, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) )) | |
219 } | |
220 } | |
221 return(astrColors) | |
222 } | |
223 | |
224 funcResidualPlotHelper <- function( | |
225 frmTmp, | |
226 ### The dataframe to plot from | |
227 sResponseFeature, | |
228 lsFullModelCovariateNames, | |
229 ### All covariates in lm (Column Names) | |
230 lsCovariateToControlForNames, | |
231 ### Y Axis: All the covariate which will be plotted together respectively * their beta with the residuals of the full model added. (These should dummied names for factor data; not column names). | |
232 sCovariateOfInterest, | |
233 ### X Axis: raw data of the covariate of interest. (Column Name) | |
234 adColorMin, | |
235 ### Min color in color range for markers | |
236 adColorMax, | |
237 ### Max color in color range for markers | |
238 adColorMed, | |
239 ### Medium color in color range for markers | |
240 adMar, | |
241 ### Standardized margins | |
242 funcUnTransform = NULL, | |
243 ### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots | |
244 liNaIndices = c() | |
245 ### Indices of NA data that was imputed | |
246 ){ | |
247 # Get model matrix (raw data) | |
248 adCur = frmTmp[[sResponseFeature]] | |
249 | |
250 # Make a formula to calculate the new model to get the full model residuals | |
251 strFormula = paste("adCur",paste(sprintf( "`%s`", lsFullModelCovariateNames ),sep="", collapse="+"),sep="~") | |
252 | |
253 # Calculate lm | |
254 lmod = (lm(as.formula(strFormula),frmTmp)) | |
255 | |
256 # Get all coefficient data in the new model | |
257 dfdCoefs = coefficients(lmod) | |
258 | |
259 # Get all coefficient names in the new model | |
260 asAllCoefNames = names(dfdCoefs) | |
261 | |
262 # Get Y | |
263 # For each covariate that is being plotted on the y axis | |
264 # Convert the coefficient name to the column name | |
265 # If they are equal then the data is not discontinuous and you can use the raw data as is and multiply it by the coefficient in the model | |
266 # If they are not equal than the data is discontinuous, get the value for the data, set all but the levels equal to it to zero and multiply by the ceofficient from the model. | |
267 vY = rep(coefficients(lmod)[["(Intercept)"]],dim(frmTmp)[1]) | |
268 # vY = rep(0,dim(frmTmp)[1]) | |
269 | |
270 #Here we are not dealing with column names but, if factor data, the coefficient names | |
271 for(iCvIndex in 1:length(lsCovariateToControlForNames)) | |
272 { | |
273 sCurrentCovariate = lsCovariateToControlForNames[iCvIndex] | |
274 #Get the column name of the current covariate (this must be used to compare to other column names) | |
275 sCurCovariateColumnName = funcCoef2Col(sCurrentCovariate, frmTmp) | |
276 | |
277 #This is continuous data | |
278 if(sCurrentCovariate == sCurCovariateColumnName) | |
279 { | |
280 vY = vY + dfdCoefs[sCurrentCovariate]*frmTmp[[sCurCovariateColumnName]] | |
281 } else { | |
282 #Discontinuous data | |
283 # Get level | |
284 xLevel = substr(sCurrentCovariate,nchar(sCurCovariateColumnName)+1,nchar(sCurrentCovariate)) | |
285 | |
286 # Get locations where the data = level | |
287 aiLevelIndices = rep(0,dim(frmTmp)[1]) | |
288 aiLevelIndices[which(frmTmp[sCurCovariateColumnName] == xLevel)]=1 | |
289 sCurrentCovariateBeta = dfdCoefs[sCurrentCovariate] | |
290 if(is.na(sCurrentCovariateBeta)){sCurrentCovariateBeta=0} | |
291 vY = vY + sCurrentCovariateBeta * aiLevelIndices | |
292 } | |
293 } | |
294 #TODO based on transform vY = vY+sin(residuals(lmod))^2 | |
295 if(!is.null(funcUnTransform)) | |
296 { | |
297 vY = vY + funcUnTransform(residuals(lmod)) | |
298 } else { | |
299 vY = vY + residuals(lmod) } | |
300 | |
301 # Plot x, raw data | |
302 ## y label | |
303 sYLabel = paste(paste("B",lsCovariateToControlForNames,sep="*",collapse="+"),"e",sep="+") | |
304 sTitle = "Partial Residual Plot" | |
305 | |
306 adCurXValues = frmTmp[[sCovariateOfInterest]] | |
307 | |
308 # If not plotting metadata that was originally NA then remove the imputed values here | |
309 if(sCovariateOfInterest %in% names(liNaIndices)&&(length(liNaIndices[[sCovariateOfInterest]])>0)) | |
310 { | |
311 adCurXValues = adCurXValues[-1*liNaIndices[[sCovariateOfInterest]]] | |
312 vY <- vY[-1*liNaIndices[[sCovariateOfInterest]]] | |
313 if(is.factor(adCurXValues)){adCurXValues = factor(adCurXValues)} | |
314 } | |
315 | |
316 # Set the factor levels to include NA if they still exist | |
317 # This is so if something is not imputed, then if there are NAs they will be plotted (to show no imputing) | |
318 # Do not forget to keep te level order incase it was changed by the custom scripts. | |
319 if( class( adCurXValues ) == "factor" ) | |
320 { | |
321 vsLevels = levels(adCurXValues) | |
322 if(sum(is.na(adCurXValues))>0) | |
323 { | |
324 adCurXValues = as.character(adCurXValues) | |
325 adCurXValues[is.na(adCurXValues)]="NA" | |
326 adCurXValues = factor(adCurXValues, levels=c(vsLevels,"NA")) | |
327 } | |
328 } | |
329 | |
330 # Scale to the original range | |
331 if(!(class( adCurXValues ) == "factor" )) | |
332 { | |
333 vY = vY + mean(adCurXValues,rm.na=TRUE) | |
334 } | |
335 | |
336 # Plot Partial Residual Plot | |
337 # If we are printing discontinuous data | |
338 # Get the color of the box plots | |
339 # Plot box plots | |
340 # Plot data as strip charts | |
341 if(is.factor(adCurXValues)) | |
342 { | |
343 # adCurXValues = factor(adCurXValues) | |
344 astrColors = funcGetFactorBoxColors(adCurXValues,vY,adColorMin,adColorMax,adColorMed) | |
345 asNames = c() | |
346 for(sLevel in levels(adCurXValues)) | |
347 { | |
348 asNames = c(asNames,sprintf( "%s (%d)", sLevel, sum( adCurXValues == sLevel, na.rm = TRUE ) )) | |
349 } | |
350 | |
351 plot(adCurXValues, vY, xlab=sCovariateOfInterest, ylab=sYLabel, names=asNames, notch = TRUE,mar = adMar,col = astrColors, main=sTitle, outpch = 4, outcex = 0.5 ) | |
352 stripchart( vY ~ adCurXValues, add = TRUE, col = astrColors, method = "jitter", vertical = TRUE, pch = 20 ) | |
353 | |
354 } else { | |
355 plot( adCurXValues, vY, mar = adMar, main = sTitle, xlab=sCovariateOfInterest, col = sprintf( "%s99", funcGetColor( ) ), pch = 20,ylab = sYLabel, xaxt = "s" ) | |
356 | |
357 lmodLine = lm(vY~adCurXValues) | |
358 | |
359 dColor <- lmodLine$coefficients[2] * mean( adCurXValues, na.rm = TRUE ) / mean( vY, na.rm = TRUE ) | |
360 strColor <- sprintf( "%sDD", funcColor( dColor, adMax = adColorMin, adMin = adColorMax, adMed = adColorMed ) ) | |
361 abline( reg =lmodLine, col = strColor, lwd = 3 ) | |
362 } | |
363 } | |
364 | |
365 funcBoostInfluencePlot <- function( | |
366 # Plot to show the rel.inf from boosting, what to know if the rank order is correct, better ranks for spiked data. | |
367 # Show the cut off and features identified as uneven. | |
368 vdRelInf, | |
369 sFeature, | |
370 vsPredictorNames, | |
371 vstrKeepMetadata, | |
372 vstrUneven = c() | |
373 ){ | |
374 vsCol = rep("black",length(vdRelInf)) | |
375 vsCol[which(vsPredictorNames %in% vstrKeepMetadata)]="green" | |
376 vsCol[which(vsPredictorNames %in% vstrUneven)] = "orange" | |
377 plot(vdRelInf, col=vsCol, main=sFeature, xlab="Index", ylab="Relative Influence") | |
378 legend("topright", pch = paste(1:length(vsPredictorNames)), legend= vsPredictorNames, text.col=vsCol, col=vsCol) | |
379 } | |
380 | |
381 funcResidualPlot <- function( | |
382 ### Plot to data after confounding. | |
383 ### That is, in a linear model with significant coefficient b1 for variable x1, | |
384 ### that's been sparsified to some subset of terms: y = b0 + b1*x1 + sum(bi*xi) | |
385 ### Plot x1 on the X axis, and instead of y on the Y axis, instead plot: | |
386 ### y' = b0 + sum(bi*xi) | |
387 lsCur, | |
388 ### Assocation to plot | |
389 frmeTmp, | |
390 ### Data frame of orginal data | |
391 adColorMin, | |
392 ### Min color in color range for markers | |
393 adColorMax, | |
394 ### Max color in color range for markers | |
395 adColorMed, | |
396 ### Medium color in color range for markers | |
397 adMar, | |
398 ### Standardized margins | |
399 funcUnTransform, | |
400 ### If a transform is used the opporite of that transfor must be used on the residuals in the partial residual plots | |
401 liNaIndices = c() | |
402 ### Indices of NA data that was imputed | |
403 ){ | |
404 #Now plot residual hat plot | |
405 #Get coefficient names | |
406 asAllCoefs = setdiff(names(lsCur$allCoefs),c("(Intercept)")) | |
407 asAllColNames = c() | |
408 for(sCoef in asAllCoefs) | |
409 { | |
410 asAllColNames = c(asAllColNames,funcCoef2Col(sCoef,frmeData=frmeTmp)) | |
411 } | |
412 asAllColNames = unique(asAllColNames) | |
413 | |
414 # All coefficients except for the one of interest | |
415 lsOtherCoefs = setdiff(asAllColNames, c(lsCur$name)) | |
416 | |
417 lsCovariatesToPlot = NULL | |
418 if(is.factor(lsCur$metadata)) | |
419 { | |
420 lsCovariatesToPlot = paste(lsCur$name,levels(lsCur$metadata),sep="") | |
421 }else{lsCovariatesToPlot=c(lsCur$orig)} | |
422 | |
423 # If there are no other coefficients then skip plot | |
424 # if(!length(lsOtherCoefs)){return()} | |
425 | |
426 # Plot residuals | |
427 funcResidualPlotHelper(frmTmp=frmeTmp, sResponseFeature=lsCur$taxon, lsFullModelCovariateNames=asAllColNames, lsCovariateToControlForNames=lsCovariatesToPlot, sCovariateOfInterest=lsCur$name, adColorMin=adColorMin, adColorMax=adColorMax, adColorMed=adColorMed, adMar=adMar, funcUnTransform=funcUnTransform, liNaIndices=liNaIndices) | |
428 } |