0
|
1 #####################################################################################
|
|
2 #Copyright (C) <2012>
|
|
3 #
|
|
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
5 #this software and associated documentation files (the "Software"), to deal in the
|
|
6 #Software without restriction, including without limitation the rights to use, copy,
|
|
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
8 #and to permit persons to whom the Software is furnished to do so, subject to
|
|
9 #the following conditions:
|
|
10 #
|
|
11 #The above copyright notice and this permission notice shall be included in all copies
|
|
12 #or substantial portions of the Software.
|
|
13 #
|
|
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
20 #
|
|
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
|
|
22 # authored by the Huttenhower lab at the Harvard School of Public Health
|
|
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
|
|
24 #####################################################################################
|
|
25
|
|
26 inlinedocs <- function(
|
|
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
|
|
28 ##description<< Allows one to plug in new modules to perform analysis (univariate or multivariate), regularization, and data (response) transformation.
|
|
29 ) { return( pArgs ) }
|
|
30
|
|
31 # Libraries
|
|
32 suppressMessages(library( agricolae, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
33 # Needed for the pot-hoc Kruskal wallis comparisons
|
|
34 suppressMessages(library( penalized, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
35 # Needed for stepAIC
|
|
36 suppressMessages(library( MASS, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
37 # Needed for na action behavior
|
|
38 suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
39 # Needed for boosting
|
|
40 suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
41 # Needed for LASSO
|
|
42 suppressMessages(library( glmnet, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
43 # Needed for mixed models
|
|
44 #suppressMessages(library( lme4, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
45 suppressMessages(library( nlme, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
46
|
|
47 # Needed for zero inflated models
|
|
48 #suppressMessages(library( MCMCglmm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
49 suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
50 suppressMessages(library( gamlss, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
51 # Do not use #suppressMessages(library( glmmADMB, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
|
|
52
|
|
53 fAddBack = TRUE
|
|
54 dUnevenMax = .9
|
|
55
|
|
56
|
|
57 ### Helper functions
|
|
58 # OK
|
|
59 funcMakeContrasts <- function
|
|
60 ### Makes univariate contrasts of all predictors in the model formula with the response.
|
|
61 (strFormula,
|
|
62 ### lm style string defining reponse and predictors
|
|
63 strRandomFormula,
|
|
64 ### mixed model string defining the fixed covariates
|
|
65 frmeTmp,
|
|
66 ### The data frame to find predictor data in
|
|
67 iTaxon,
|
|
68 ### Taxon
|
|
69 functionContrast,
|
|
70 ### functionContrast The univariate test to perform
|
|
71 lsQCCounts
|
|
72 ### QC info
|
|
73 ){
|
|
74 #TODO are we updating the QCCounts?
|
|
75 lsSig = list()
|
|
76 ### Holds all the significance results from the tests
|
|
77 adP = c()
|
|
78 ### Holds the p-values
|
|
79 sCurDataName = names(frmeTmp)[iTaxon]
|
|
80 ### The name of the taxon (data row) that is being associated (always assumed to be numeric)
|
|
81 #Get test comparisons (predictor names from formula string)
|
|
82 asComparisons = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula)))
|
|
83
|
|
84 #Change metadata in formula to univariate comparisons
|
|
85 for(sComparison in asComparisons)
|
|
86 {
|
|
87 # Metadata values
|
|
88 vxTest = frmeTmp[[sComparison]]
|
|
89
|
|
90 # Get the levels in the comparison
|
|
91 # Can ignore the first level because it is the reference level
|
|
92 asLevels = sComparison
|
|
93 if(is.factor(vxTest)){asLevels = levels(vxTest)[2:length(vxTest)]}
|
|
94
|
|
95 lasComparisonResults = functionContrast(x=sComparison, adCur=frmeTmp[[sCurDataName]], dfData=frmeTmp)
|
|
96 for(asComparison in lasComparisonResults)
|
|
97 {
|
|
98 if( is.na( asComparison$p.value ) ) { next }
|
|
99 # Get pvalue
|
|
100 adP = c(adP, asComparison$p.value)
|
|
101 # Get SD, if not available, give SD of covariate
|
|
102 dSTD = asComparison$SD
|
|
103 # TODO Is using sd on factor and binary data correct?
|
|
104 if(is.na(dSTD) || is.null(dSTD)){dSTD = sd(vxTest)}
|
|
105
|
|
106 lsSig[[length( lsSig ) + 1]] <- list(
|
|
107 #Current metadata name (string column name) ok
|
|
108 name = sComparison,
|
|
109 #Current metadatda name (string, as a factor level if existing as such) ok
|
|
110 orig = asComparison$name,
|
|
111 #Taxon feature name (string) ok
|
|
112 taxon = colnames( frmeTmp )[iTaxon],
|
|
113 #Taxon data / response (double vector) ok
|
|
114 data = frmeTmp[,iTaxon],
|
|
115 #Name of column ok
|
|
116 factors = sComparison,
|
|
117 #Metadata values (metadata as a factor or raw numeric) ok
|
|
118 metadata = vxTest,
|
|
119 #Current coefficient value (named coef value with level name (from coefs) ok
|
|
120 value = asComparison$coef,
|
|
121 #Standard deviation (numeric) ok
|
|
122 std = dSTD,
|
|
123 #Model coefficients (output from coefs with intercept) ok
|
|
124 allCoefs = asComparison$coef)
|
|
125 }
|
|
126 }
|
|
127 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
|
|
128 ### Returns a list of p-value, standard deviation, and comparison which produced the p-value
|
|
129 }
|
|
130
|
|
131 #Ok
|
|
132 funcGetStepPredictors <- function
|
|
133 ### Retrieve the predictors of the reduced model after stepwise model selection
|
|
134 (lmod,
|
|
135 ### Linear model resulting from step-wise model selection
|
|
136 frmeTmp,
|
|
137 strLog
|
|
138 ### File to document logging
|
|
139 ){
|
|
140 #Document
|
|
141 funcWrite( "#model", strLog )
|
|
142 funcWrite( lmod$fit, strLog )
|
|
143 #TODO funcWrite( lmod$train.error, strLog )
|
|
144 #TODO funcWrite( lmod$Terms, strLog )
|
|
145 funcWrite( "#summary-gbm", strLog )
|
|
146 funcWrite( summary(lmod), strLog )
|
|
147
|
|
148 #Get Names from coefficients
|
|
149 asStepCoefsFactors = coefficients(lmod)
|
|
150 astrCoefNames = setdiff(names(asStepCoefsFactors[as.vector(!is.na(asStepCoefsFactors))==TRUE]),"(Intercept)")
|
|
151 asStepCoefsFactors = unique(as.vector(sapply(astrCoefNames,funcCoef2Col, frmeData=frmeTmp)))
|
|
152
|
|
153 if(length(asStepCoefsFactors)<1){ return(NA) }
|
|
154 return( asStepCoefsFactors )
|
|
155 ### Vector of string predictor names
|
|
156 }
|
|
157
|
|
158 funcGetUnivariateResults <- function
|
|
159 ### Reduce the list of list of results to the correct format
|
|
160 ( llmod,
|
|
161 ### The list of univariate models
|
|
162 frmeData,
|
|
163 ### Data analysis is performed on
|
|
164 liTaxon,
|
|
165 ### The response id
|
|
166 dSig,
|
|
167 ### Significance level for q-values
|
|
168 adP,
|
|
169 ### List of pvalues from all associations performed
|
|
170 lsSig,
|
|
171 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
|
|
172 strLog,
|
|
173 ### File to which to document logging
|
|
174 lsQCCounts,
|
|
175 ### Records of counts associated with quality control
|
|
176 lastrCols,
|
|
177 ### Predictors used in the association
|
|
178 asSuppressCovariates=c()
|
|
179 ### Vector of covariates to suppress and not give results for
|
|
180 ){
|
|
181 adP = c()
|
|
182 lsSig = list()
|
|
183 for(lmod in llmod)
|
|
184 {
|
|
185 adP = c(adP,lmod$adP)
|
|
186 lsSig = c(lsSig,lmod$lsSig)
|
|
187 }
|
|
188 return(list(adP=adP, lsSig=lsSig, lsQCCounts=llmod[length(llmod)]$lsQCCounts))
|
|
189 }
|
|
190
|
|
191 # OK
|
|
192 funcGetLMResults <- function
|
|
193 ### Reduce the lm object return to just the data needed for further analysis
|
|
194 ( llmod,
|
|
195 ### The result from a linear model
|
|
196 frmeData,
|
|
197 ### Data analysis is performed on
|
|
198 liTaxon,
|
|
199 ### The response id
|
|
200 dSig,
|
|
201 ### Significance level for q-values
|
|
202 adP,
|
|
203 ### List of pvalues from all associations performed
|
|
204 lsSig,
|
|
205 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
|
|
206 strLog,
|
|
207 ### File to which to document logging
|
|
208 lsQCCounts,
|
|
209 ### Records of counts associated with quality control
|
|
210 lastrCols,
|
|
211 ### Predictors used in the association
|
|
212 asSuppressCovariates=c()
|
|
213 ### Vector of covariates to suppress and not give results for
|
|
214 ){
|
|
215 ilmodIndex = 0
|
|
216 for( lmod in llmod )
|
|
217 {
|
|
218 ilmodIndex = ilmodIndex + 1
|
|
219 lmod = llmod[[ ilmodIndex ]]
|
|
220 iTaxon = liTaxon[[ ilmodIndex ]]
|
|
221 astrCols = lastrCols[[ ilmodIndex ]]
|
|
222
|
|
223 #Exclude none and errors
|
|
224 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
|
|
225 {
|
|
226 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
|
|
227 iPValuePosition = 4
|
|
228
|
|
229 #Get the column name of the iTaxon index
|
|
230 #frmeTmp needs to be what?
|
|
231 strTaxon = colnames( frmeData )[iTaxon]
|
|
232 #Get summary information from the linear model
|
|
233 lsSum = try( summary( lmod ) )
|
|
234 #The following can actually happen when the stranger regressors return broken results
|
|
235 if( class( lsSum ) == "try-error" )
|
|
236 {
|
|
237 next
|
|
238 }
|
|
239
|
|
240 #Write summary information to log file
|
|
241 funcWrite( "#model", strLog )
|
|
242 funcWrite( lmod, strLog )
|
|
243 funcWrite( "#summary", strLog )
|
|
244 #Unbelievably, some of the more unusual regression methods crash out when _printing_ their results
|
|
245 try( funcWrite( lsSum, strLog ) )
|
|
246
|
|
247 #Get the coefficients
|
|
248 #This should work for linear models
|
|
249 frmeCoefs <- try( coefficients( lsSum ) )
|
|
250
|
|
251 if( ( class(frmeCoefs ) == "try-error" ) || is.null( frmeCoefs ) )
|
|
252 {
|
|
253 adCoefs = try( coefficients( lmod ))
|
|
254 if(class( adCoefs ) == "try-error")
|
|
255 {
|
|
256 adCoefs = coef(lmod)
|
|
257 }
|
|
258 frmeCoefs <- NA
|
|
259 } else {
|
|
260 if( class( frmeCoefs ) == "list" )
|
|
261 {
|
|
262 frmeCoefs <- frmeCoefs$count
|
|
263 }
|
|
264 adCoefs = frmeCoefs[,1]
|
|
265 }
|
|
266
|
|
267 #Go through each coefficient
|
|
268 astrRows <- names( adCoefs )
|
|
269
|
|
270 ##lmm
|
|
271 if( is.null( astrRows ) )
|
|
272 {
|
|
273 astrRows = rownames( lsSum$tTable )
|
|
274 frmeCoefs = lsSum$tTable
|
|
275 iPValuePosition = 5
|
|
276 adCoefs = frmeCoefs[,1]
|
|
277 }
|
|
278
|
|
279 for( iMetadata in 1:length( astrRows ) )
|
|
280 {
|
|
281 #Current coef which is being evaluated
|
|
282 strOrig = astrRows[ iMetadata ]
|
|
283 #Skip y interscept
|
|
284 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
|
|
285 #Skip suppressed covariates
|
|
286 if( funcCoef2Col( strOrig, frmeData ) %in% asSuppressCovariates){ next }
|
|
287
|
|
288 #Extract pvalue and std in standard model
|
|
289 dP = frmeCoefs[ strOrig, iPValuePosition ]
|
|
290 dStd = frmeCoefs[ strOrig, 2 ]
|
|
291
|
|
292 #Attempt to extract the pvalue and std in mixed effects summary
|
|
293 #Could not get the pvalue so skip result
|
|
294 if( is.nan( dP ) || is.na( dP ) || is.null( dP ) ) { next }
|
|
295
|
|
296 dCoef = adCoefs[ iMetadata ]
|
|
297
|
|
298 #Setting adMetadata
|
|
299 #Metadata values
|
|
300 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
|
|
301 if( is.na( strMetadata ) )
|
|
302 {
|
|
303 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
|
|
304 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
|
|
305 }
|
|
306 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
|
|
307 adMetadata <- frmeData[, strMetadata ]
|
|
308
|
|
309 # Store (factor level modified) p-value
|
|
310 # Store general results for each coef
|
|
311 adP <- c( adP, dP )
|
|
312
|
|
313 # Append to the list of information about associations
|
|
314 lsSig[[ length( lsSig ) + 1 ]] <- list(
|
|
315 # Current metadata name
|
|
316 name = strMetadata,
|
|
317 # Current metadatda name (as a factor level if existing as such)
|
|
318 orig = strOrig,
|
|
319 # Taxon feature name
|
|
320 taxon = strTaxon,
|
|
321 # Taxon data / response
|
|
322 data = frmeData[, iTaxon ],
|
|
323 # All levels
|
|
324 factors = c( strMetadata ),
|
|
325 # Metadata values
|
|
326 metadata = adMetadata,
|
|
327 # Current coeficient value
|
|
328 value = dCoef,
|
|
329 # Standard deviation
|
|
330 std = dStd,
|
|
331 # Model coefficients
|
|
332 allCoefs = adCoefs )
|
|
333 }
|
|
334 }
|
|
335 }
|
|
336 return( list( adP = adP, lsSig = lsSig, lsQCCounts = lsQCCounts ) )
|
|
337 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
|
|
338 }
|
|
339
|
|
340 funcGetZeroInflatedResults <- function
|
|
341 ### Reduce the lm object return to just the data needed for further analysis
|
|
342 ( llmod,
|
|
343 ### The result from a linear model
|
|
344 frmeData,
|
|
345 ### Data analysis is performed on
|
|
346 liTaxon,
|
|
347 ### The response id
|
|
348 dSig,
|
|
349 ### Significance level for q-values
|
|
350 adP,
|
|
351 ### List of pvalues from all associations performed
|
|
352 lsSig,
|
|
353 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
|
|
354 strLog,
|
|
355 ### File to which to document logging
|
|
356 lsQCCounts,
|
|
357 ### Records of counts associated with quality control
|
|
358 lastrCols,
|
|
359 ### Predictors used in the association
|
|
360 asSuppressCovariates=c()
|
|
361 ### Vector of covariates to suppress and not give results for
|
|
362 ){
|
|
363 ilmodIndex = 0
|
|
364 for(lmod in llmod)
|
|
365 {
|
|
366 ilmodIndex = ilmodIndex + 1
|
|
367 lmod = llmod[[ilmodIndex]]
|
|
368 iTaxon = liTaxon[[ilmodIndex]]
|
|
369 astrCols = lastrCols[[ilmodIndex]]
|
|
370
|
|
371 #Exclude none and errors
|
|
372 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
|
|
373 {
|
|
374 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
|
|
375 iPValuePosition = 4
|
|
376
|
|
377 #Get the column name of the iTaxon index
|
|
378 #frmeTmp needs to be what?
|
|
379 strTaxon = colnames( frmeData )[iTaxon]
|
|
380
|
|
381 #Write summary information to log file
|
|
382 funcWrite( "#model", strLog )
|
|
383 funcWrite( lmod, strLog )
|
|
384
|
|
385 #Get the coefficients
|
|
386 #This should work for linear models
|
|
387 frmeCoefs <- summary(lmod)
|
|
388 if(! is.null( frmeCoefs$coefficients$count ) ) # Needed for zeroinfl
|
|
389 {
|
|
390 frmeCoefs = frmeCoefs$coefficients$count
|
|
391 }
|
|
392 adCoefs = frmeCoefs[,1]
|
|
393 names(adCoefs) = row.names(frmeCoefs)
|
|
394
|
|
395 funcWrite( "#Coefs", strLog )
|
|
396 funcWrite( frmeCoefs, strLog )
|
|
397
|
|
398 #Go through each coefficient
|
|
399 astrRows <- row.names( frmeCoefs )
|
|
400
|
|
401 for( iMetadata in 1:length( astrRows ) )
|
|
402 {
|
|
403 #Current coef which is being evaluated
|
|
404 strOrig = astrRows[iMetadata]
|
|
405 #Skip y interscept
|
|
406 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
|
|
407 #Skip suppressed covariates
|
|
408 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
|
|
409
|
|
410 #Extract pvalue and std in standard model
|
|
411 dP = frmeCoefs[strOrig, iPValuePosition]
|
|
412 if(is.nan(dP)){next}
|
|
413 dStd = frmeCoefs[strOrig,2]
|
|
414
|
|
415 dCoef = adCoefs[iMetadata]
|
|
416
|
|
417 #Setting adMetadata
|
|
418 #Metadata values
|
|
419 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
|
|
420 if( is.na( strMetadata ) )
|
|
421 {
|
|
422 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
|
|
423 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
|
|
424 }
|
|
425 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
|
|
426 adMetadata <- frmeData[,strMetadata]
|
|
427
|
|
428 #Store (factor level modified) p-value
|
|
429 #Store general results for each coef
|
|
430 adP <- c(adP, dP)
|
|
431 lsSig[[length( lsSig ) + 1]] <- list(
|
|
432 #Current metadata name
|
|
433 name = strMetadata,
|
|
434 #Current metadatda name (as a factor level if existing as such)
|
|
435 orig = strOrig,#
|
|
436 #Taxon feature name
|
|
437 taxon = strTaxon,
|
|
438 #Taxon data / response
|
|
439 data = frmeData[,iTaxon],
|
|
440 #All levels
|
|
441 factors = c(strMetadata),
|
|
442 #Metadata values
|
|
443 metadata = adMetadata,
|
|
444 #Current coeficient value
|
|
445 value = dCoef,
|
|
446 #Standard deviation
|
|
447 std = dStd,
|
|
448 #Model coefficients
|
|
449 allCoefs = adCoefs)
|
|
450 }
|
|
451 }
|
|
452 }
|
|
453 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
|
|
454 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
|
|
455 }
|
|
456
|
|
457 oldfuncGetZeroInflatedResults <- function
|
|
458 ### Reduce the lm object return to just the data needed for further analysis
|
|
459 ( llmod,
|
|
460 ### The result from a linear model
|
|
461 frmeData,
|
|
462 ### Data analysis is performed on
|
|
463 liTaxon,
|
|
464 ### The response id
|
|
465 dSig,
|
|
466 ### Significance level for q-values
|
|
467 adP,
|
|
468 ### List of pvalues from all associations performed
|
|
469 lsSig,
|
|
470 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
|
|
471 strLog,
|
|
472 ### File to which to document logging
|
|
473 lsQCCounts,
|
|
474 ### Records of counts associated with quality control
|
|
475 lastrCols,
|
|
476 ### Predictors used in the association
|
|
477 asSuppressCovariates=c()
|
|
478 ### Vector of covariates to suppress and not give results for
|
|
479 ){
|
|
480 ilmodIndex = 0
|
|
481 for(lmod in llmod)
|
|
482 {
|
|
483 ilmodIndex = ilmodIndex + 1
|
|
484 lmod = llmod[[ilmodIndex]]
|
|
485 iTaxon = liTaxon[[ilmodIndex]]
|
|
486 astrCols = lastrCols[[ilmodIndex]]
|
|
487
|
|
488 #Exclude none and errors
|
|
489 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
|
|
490 {
|
|
491 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
|
|
492 iPValuePosition = 4
|
|
493
|
|
494 #Get the column name of the iTaxon index
|
|
495 #frmeTmp needs to be what?
|
|
496 strTaxon = colnames( frmeData )[iTaxon]
|
|
497
|
|
498 #Write summary information to log file
|
|
499 funcWrite( "#model", strLog )
|
|
500 funcWrite( lmod, strLog )
|
|
501
|
|
502 #Get the coefficients
|
|
503 #This should work for linear models
|
|
504 frmeCoefs <- summary(lmod)
|
|
505 adCoefs = frmeCoefs[,1]
|
|
506 names(adCoefs) = row.names(frmeCoefs)
|
|
507
|
|
508 #Go through each coefficient
|
|
509 astrRows <- row.names( frmeCoefs )
|
|
510
|
|
511 for( iMetadata in 1:length( astrRows ) )
|
|
512 {
|
|
513 #Current coef which is being evaluated
|
|
514 strOrig = astrRows[iMetadata]
|
|
515 #Skip y interscept
|
|
516 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
|
|
517 #Skip suppressed covariates
|
|
518 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
|
|
519
|
|
520 #Extract pvalue and std in standard model
|
|
521 dP = frmeCoefs[strOrig, iPValuePosition]
|
|
522 dStd = frmeCoefs[strOrig,2]
|
|
523
|
|
524 dCoef = adCoefs[iMetadata]
|
|
525
|
|
526 #Setting adMetadata
|
|
527 #Metadata values
|
|
528 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
|
|
529 if( is.na( strMetadata ) )
|
|
530 {
|
|
531 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
|
|
532 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
|
|
533 }
|
|
534 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
|
|
535 adMetadata <- frmeData[,strMetadata]
|
|
536
|
|
537 #Store (factor level modified) p-value
|
|
538 #Store general results for each coef
|
|
539 adP <- c(adP, dP)
|
|
540 lsSig[[length( lsSig ) + 1]] <- list(
|
|
541 #Current metadata name
|
|
542 name = strMetadata,
|
|
543 #Current metadatda name (as a factor level if existing as such)
|
|
544 orig = strOrig,#
|
|
545 #Taxon feature name
|
|
546 taxon = strTaxon,
|
|
547 #Taxon data / response
|
|
548 data = frmeData[,iTaxon],
|
|
549 #All levels
|
|
550 factors = c(strMetadata),
|
|
551 #Metadata values
|
|
552 metadata = adMetadata,
|
|
553 #Current coeficient value
|
|
554 value = dCoef,
|
|
555 #Standard deviation
|
|
556 std = dStd,
|
|
557 #Model coefficients
|
|
558 allCoefs = adCoefs)
|
|
559 }
|
|
560 }
|
|
561 }
|
|
562 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
|
|
563 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
|
|
564 }
|
|
565
|
|
566
|
|
567 notfuncGetZeroInflatedResults <- function
|
|
568 ### Reduce the lm object return to just the data needed for further analysis
|
|
569 ( llmod,
|
|
570 ### The result from a linear model
|
|
571 frmeData,
|
|
572 ### Data analysis is performed on
|
|
573 liTaxon,
|
|
574 ### The response id
|
|
575 dSig,
|
|
576 ### Significance level for q-values
|
|
577 adP,
|
|
578 ### List of pvalues from all associations performed
|
|
579 lsSig,
|
|
580 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
|
|
581 strLog,
|
|
582 ### File to which to document logging
|
|
583 lsQCCounts,
|
|
584 ### Records of counts associated with quality control
|
|
585 lastrCols,
|
|
586 ### Predictors used in the association
|
|
587 asSuppressCovariates=c()
|
|
588 ### Vector of covariates to suppress and not give results for
|
|
589 ){
|
|
590 ilmodIndex = 0
|
|
591 for(lmod in llmod)
|
|
592 {
|
|
593 ilmodIndex = ilmodIndex + 1
|
|
594 lmod = llmod[[ilmodIndex]]
|
|
595 iTaxon = liTaxon[[ilmodIndex]]
|
|
596 astrCols = lastrCols[[ilmodIndex]]
|
|
597
|
|
598 #Exclude none and errors
|
|
599 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
|
|
600 {
|
|
601 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
|
|
602 iPValuePosition = 4
|
|
603
|
|
604 #Get the column name of the iTaxon index
|
|
605 #frmeTmp needs to be what?
|
|
606 strTaxon = colnames( frmeData )[iTaxon]
|
|
607
|
|
608 #Write summary information to log file
|
|
609 funcWrite( "#model", strLog )
|
|
610 funcWrite( lmod, strLog )
|
|
611
|
|
612 #Get the coefficients
|
|
613 #This should work for linear models
|
|
614 frmeCoefs <- summary(lmod)
|
|
615 frmeCoefs = frmeCoefs$coefficients$count
|
|
616 funcWrite( "#Coefs", strLog )
|
|
617 funcWrite( frmeCoefs, strLog )
|
|
618
|
|
619 adCoefs = frmeCoefs[,1]
|
|
620 names(adCoefs) = row.names(frmeCoefs)
|
|
621
|
|
622 #Go through each coefficient
|
|
623 astrRows <- row.names( frmeCoefs )
|
|
624
|
|
625 for( iMetadata in 1:length( astrRows ) )
|
|
626 {
|
|
627 #Current coef which is being evaluated
|
|
628 strOrig = astrRows[iMetadata]
|
|
629
|
|
630 #Skip y interscept
|
|
631 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
|
|
632 #Skip suppressed covariates
|
|
633 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
|
|
634
|
|
635 #Extract pvalue and std in standard model
|
|
636 dP = frmeCoefs[strOrig, iPValuePosition]
|
|
637 if(is.nan(dP)){ next }
|
|
638 dStd = frmeCoefs[strOrig,2]
|
|
639 dCoef = adCoefs[iMetadata]
|
|
640
|
|
641 #Setting adMetadata
|
|
642 #Metadata values
|
|
643 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
|
|
644 if( is.na( strMetadata ) )
|
|
645 {
|
|
646 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
|
|
647 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
|
|
648 }
|
|
649 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
|
|
650 adMetadata <- frmeData[,strMetadata]
|
|
651
|
|
652 #Store (factor level modified) p-value
|
|
653 #Store general results for each coef
|
|
654 adP <- c(adP, dP)
|
|
655 lsSig[[length( lsSig ) + 1]] <- list(
|
|
656 #Current metadata name
|
|
657 name = strMetadata,
|
|
658 #Current metadatda name (as a factor level if existing as such)
|
|
659 orig = strOrig,#
|
|
660 #Taxon feature name
|
|
661 taxon = strTaxon,
|
|
662 #Taxon data / response
|
|
663 data = frmeData[,iTaxon],
|
|
664 #All levels
|
|
665 factors = c(strMetadata),
|
|
666 #Metadata values
|
|
667 metadata = adMetadata,
|
|
668 #Current coeficient value
|
|
669 value = dCoef,
|
|
670 #Standard deviation
|
|
671 std = dStd,
|
|
672 #Model coefficients
|
|
673 allCoefs = adCoefs)
|
|
674 }
|
|
675 }
|
|
676 }
|
|
677 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
|
|
678 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
|
|
679 }
|
|
680
|
|
681
|
|
682 ### Options for variable selection
|
|
683 # OK
|
|
684 funcBoostModel <- function(
|
|
685 ### Perform model selection / regularization with boosting
|
|
686 strFormula,
|
|
687 ### The formula of the full model before boosting
|
|
688 frmeTmp,
|
|
689 ### The data on which to perform analysis
|
|
690 adCur,
|
|
691 ### The response data
|
|
692 lsParameters,
|
|
693 ### User controlled parameters needed specific to boosting
|
|
694 lsForcedParameters = NULL,
|
|
695 ### Force these predictors to be in the model
|
|
696 strLog
|
|
697 ### File to which to document logging
|
|
698 ){
|
|
699 funcWrite( c("#Boost formula", strFormula), strLog )
|
|
700 lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="laplace", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) )
|
|
701 #TODO# lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="gaussian", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) )
|
|
702
|
|
703 astrTerms <- c()
|
|
704 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
|
|
705 {
|
|
706 #Get boosting summary results
|
|
707 lsSum <- summary( lmod, plotit = FALSE )
|
|
708
|
|
709 #Document
|
|
710 funcWrite( "#model-gbm", strLog )
|
|
711 funcWrite( lmod$fit, strLog )
|
|
712 funcWrite( lmod$train.error, strLog )
|
|
713 funcWrite( lmod$Terms, strLog )
|
|
714 funcWrite( "#summary-gbm", strLog )
|
|
715 funcWrite( lsSum, strLog )
|
|
716
|
|
717 # Uneven metadata
|
|
718 vstrUneven = c()
|
|
719 # Kept metadata
|
|
720 vstrKeepMetadata = c()
|
|
721
|
|
722 #Select model predictors
|
|
723 #Check the frequency of selection and skip if not selected more than set threshold dFreq
|
|
724 for( strMetadata in lmod$var.names )
|
|
725 {
|
|
726 #Get the name of the metadata
|
|
727 strTerm <- funcCoef2Col( strMetadata, frmeTmp, c(astrMetadata, astrGenetics) )
|
|
728
|
|
729 #Add back in uneven metadata
|
|
730 if(fAddBack)
|
|
731 {
|
|
732 ldMetadata = frmeTmp[[strMetadata]]
|
|
733 if(length(which(table(ldMetadata)/length(ldMetadata)>dUnevenMax))>0)
|
|
734 {
|
|
735 astrTerms <- c(astrTerms, strTerm)
|
|
736 vstrUneven = c(vstrUneven,strMetadata)
|
|
737 next
|
|
738 }
|
|
739 }
|
|
740
|
|
741 #If the selprob is less than a certain frequency, skip
|
|
742 dSel <- lsSum$rel.inf[which( lsSum$var == strMetadata )] / 100
|
|
743 if( is.na(dSel) || ( dSel <= lsParameters$dFreq ) ){ next }
|
|
744 #TODO# if( is.na(dSel) || ( dSel < lsParameters$dFreq ) ){ next }
|
|
745
|
|
746 #If you should ignore the metadata, continue
|
|
747 if( is.null( strTerm ) ) { next }
|
|
748
|
|
749 #If you cant find the metadata name, write
|
|
750 if( is.na( strTerm ) )
|
|
751 {
|
|
752 c_logrMaaslin$error( "Unknown coefficient: %s", strMetadata )
|
|
753 next
|
|
754 }
|
|
755
|
|
756 #Collect metadata names
|
|
757 astrTerms <- c(astrTerms, strTerm)
|
|
758 vstrKeepMetadata = c(vstrKeepMetadata,strTerm)
|
|
759 }
|
|
760 } else { astrTerms = lsForcedParameters }
|
|
761
|
|
762 # funcBoostInfluencePlot(vdRelInf=lsSum$rel.inf, sFeature=lsParameters$sBugName, vsPredictorNames=lsSum$var, vstrKeepMetadata=vstrKeepMetadata, vstrUneven=vstrUneven)
|
|
763
|
|
764 return(unique(c(astrTerms,lsForcedParameters)))
|
|
765 ### Return a vector of predictor names to use in a reduced model
|
|
766 }
|
|
767
|
|
768 #Glmnet default is to standardize the variables.
|
|
769 #used as an example for implementation
|
|
770 #http://r.789695.n4.nabble.com/estimating-survival-times-with-glmnet-and-coxph-td4614225.html
|
|
771 funcPenalizedModel <- function(
|
|
772 ### Perform penalized regularization for variable selection
|
|
773 strFormula,
|
|
774 ### The formula of the full model before boosting
|
|
775 frmeTmp,
|
|
776 ### The data on which to perform analysis
|
|
777 adCur,
|
|
778 ### The response data
|
|
779 lsParameters,
|
|
780 ### User controlled parameters needed specific to boosting
|
|
781 lsForcedParameters = NULL,
|
|
782 ### Force these predictors to be in the model
|
|
783 strLog
|
|
784 ### File to which to document logging
|
|
785 ){
|
|
786 #Convert the data frame to a model matrix
|
|
787 mtrxDesign = model.matrix(as.formula(strFormula), data=frmeTmp)
|
|
788
|
|
789 #Cross validate the lambda
|
|
790 cvRet = cv.glmnet(x=mtrxDesign,y=adCur,alpha=lsParameters$dPAlpha)
|
|
791
|
|
792 #Perform lasso
|
|
793 glmnetMod = glmnet(x=mtrxDesign,y=adCur,family=lsParameters$family,alpha=lsParameters$dPAlpha,lambda=cvRet$lambda.min)
|
|
794
|
|
795 #Get non zero beta and return column names for covariate names.
|
|
796 ldBeta = glmnetMod$beta[,which.max(glmnetMod$dev.ratio)]
|
|
797 ldBeta = names(ldBeta[which(abs(ldBeta)>0)])
|
|
798 return(sapply(ldBeta,funcCoef2Col,frmeData=frmeTmp))
|
|
799 }
|
|
800
|
|
801 # OK
|
|
802 funcForwardModel <- function(
|
|
803 ### Perform model selection with forward stepwise selection
|
|
804 strFormula,
|
|
805 ### lm style string defining reposne and predictors
|
|
806 frmeTmp,
|
|
807 ### Data on which to perform analysis
|
|
808 adCur,
|
|
809 ### Response data
|
|
810 lsParameters,
|
|
811 ### User controlled parameters needed specific to boosting
|
|
812 lsForcedParameters = NULL,
|
|
813 ### Force these predictors to be in the model
|
|
814 strLog
|
|
815 ### File to which to document logging
|
|
816 ){
|
|
817 funcWrite( c("#Forward formula", strFormula), strLog )
|
|
818
|
|
819 strNullFormula = "adCur ~ 1"
|
|
820 if(!is.null(lsForcedParameters))
|
|
821 {
|
|
822 strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " ))
|
|
823 }
|
|
824 lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp))
|
|
825 lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp ))
|
|
826 if(!("try-error" %in% c(class( lmodNull ),class( lmodFull ))))
|
|
827 {
|
|
828 lmod = stepAIC(lmodNull, scope=list(lower=lmodNull,upper=lmodFull), direction="forward", trace=0)
|
|
829 return(funcGetStepPredictors(lmod, frmeTmp, strLog))
|
|
830 }
|
|
831 return( lsForcedParameters )
|
|
832 ### Return a vector of predictor names to use in a reduced model or NA on error
|
|
833 }
|
|
834
|
|
835 # OK
|
|
836 # Select model with backwards selection
|
|
837 funcBackwardsModel <- function(
|
|
838 ### Perform model selection with backwards stepwise selection
|
|
839 strFormula,
|
|
840 ### lm style string defining reponse and predictors
|
|
841 frmeTmp,
|
|
842 ### Data on which to perform analysis
|
|
843 adCur,
|
|
844 ### Response data
|
|
845 lsParameters,
|
|
846 ### User controlled parameters needed specific to boosting
|
|
847 lsForcedParameters = NULL,
|
|
848 ### Force these predictors to be in the model
|
|
849 strLog
|
|
850 ### File to which to document logging
|
|
851 ){
|
|
852 funcWrite( c("#Backwards formula", strFormula), strLog )
|
|
853
|
|
854 strNullFormula = "adCur ~ 1"
|
|
855 if(!is.null(lsForcedParameters))
|
|
856 {
|
|
857 strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " ))
|
|
858 }
|
|
859
|
|
860 lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp))
|
|
861 lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp ))
|
|
862
|
|
863 if(! class( lmodFull ) == "try-error" )
|
|
864 {
|
|
865 lmod = stepAIC(lmodFull, scope=list(lower=lmodNull, upper=lmodFull), direction="backward")
|
|
866 return(funcGetStepPredictors(lmod, frmeTmp, strLog))
|
|
867 } else {
|
|
868 return( lsForcedParameters ) }
|
|
869 ### Return a vector of predictor names to use in a reduced model or NA on error
|
|
870 }
|
|
871
|
|
872 ### Analysis methods
|
|
873 ### Univariate options
|
|
874
|
|
875 # Sparse Dir. Model
|
|
876 #TODO# Implement in sfle
|
|
877
|
|
878 # Tested
|
|
879 # Correlation
|
|
880 # NOTE: Ignores the idea of random and fixed covariates
|
|
881 funcSpearman <- function(
|
|
882 ### Perform multiple univariate comparisons producing spearman correlations for association
|
|
883 strFormula,
|
|
884 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
885 frmeTmp,
|
|
886 ### Data on which to perform analysis
|
|
887 iTaxon,
|
|
888 ### Index of the response data
|
|
889 lsQCCounts,
|
|
890 ### List recording anything important to QC
|
|
891 strRandomFormula = NULL
|
|
892 ### Has the formula for random covariates
|
|
893 ){
|
|
894 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
|
|
895 functionContrast=function(x,adCur,dfData)
|
|
896 {
|
|
897 retList = list()
|
|
898 ret = cor.test(as.formula(paste("~",x,"+ adCur")), data=dfData, method="spearman", na.action=c_strNA_Action)
|
|
899 #Returning rho for the coef in a named vector
|
|
900 vdCoef = c()
|
|
901 vdCoef[[x]]=ret$estimate
|
|
902 retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef)
|
|
903 return(retList)
|
|
904 }, lsQCCounts))
|
|
905 ### List of contrast information, pvalue, contrast and std per univariate test
|
|
906 }
|
|
907
|
|
908 # Tested
|
|
909 # Wilcoxon (T-Test)
|
|
910 # NOTE: Ignores the idea of random and fixed covariates
|
|
911 funcWilcoxon <- function(
|
|
912 ### Perform multiple univariate comparisons performing wilcoxon tests on discontinuous data with 2 levels
|
|
913 strFormula,
|
|
914 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
915 frmeTmp,
|
|
916 ### Data on which to perform analysis
|
|
917 iTaxon,
|
|
918 ### Index of the response data
|
|
919 lsQCCounts,
|
|
920 ### List recording anything important to QC
|
|
921 strRandomFormula = NULL
|
|
922 ### Has the formula for random covariates
|
|
923 ){
|
|
924 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
|
|
925 functionContrast=function(x,adCur,dfData)
|
|
926 {
|
|
927 retList = list()
|
|
928 ret = wilcox.test(as.formula(paste("adCur",x,sep=" ~ ")), data=dfData, na.action=c_strNA_Action)
|
|
929 #Returning NA for the coef in a named vector
|
|
930 vdCoef = c()
|
|
931 vdCoef[[x]]=ret$statistic
|
|
932 retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef)
|
|
933 return(retList)
|
|
934 }, lsQCCounts))
|
|
935 ### List of contrast information, pvalue, contrast and std per univariate test
|
|
936 }
|
|
937
|
|
938 # Tested
|
|
939 # Kruskal.Wallis (Nonparameteric anova)
|
|
940 # NOTE: Ignores the idea of random and fixed covariates
|
|
941 funcKruskalWallis <- function(
|
|
942 ### Perform multiple univariate comparisons performing Kruskal wallis rank sum tests on discontuous data with more than 2 levels
|
|
943 strFormula,
|
|
944 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
945 frmeTmp,
|
|
946 ### Data on which to perform analysis
|
|
947 iTaxon,
|
|
948 ### Index of the response data
|
|
949 lsQCCounts,
|
|
950 ### List recording anything important to QC
|
|
951 strRandomFormula = NULL
|
|
952 ### Has the formula for random covariates
|
|
953 ){
|
|
954 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
|
|
955 functionContrast=function(x,adCur,dfData)
|
|
956 {
|
|
957 retList = list()
|
|
958 lmodKW = kruskal(adCur,dfData[[x]],group=FALSE,p.adj="holm")
|
|
959
|
|
960 asLevels = levels(dfData[[x]])
|
|
961 # The names of the generated comparisons, sometimes the control is first sometimes it is not so
|
|
962 # We will just check which is in the names and use that
|
|
963 asComparisons = row.names(lmodKW$comparisons)
|
|
964 #Get the comparison with the control
|
|
965 for(sLevel in asLevels[2:length(asLevels)])
|
|
966 {
|
|
967 sComparison = intersect(c(paste(asLevels[1],sLevel,sep=" - "),paste(sLevel,asLevels[1],sep=" - ")),asComparisons)
|
|
968 #Returning NA for the coef in a named vector
|
|
969 vdCoef = c()
|
|
970 vdCoef[[paste(x,sLevel,sep="")]]=lmodKW$comparisons[sComparison,"Difference"]
|
|
971 # vdCoef[[paste(x,sLevel,sep="")]]=NA
|
|
972 retList[[length(retList)+1]]=list(p.value=lmodKW$comparisons[sComparison,"p.value"],SD=1.0,name=paste(x,sLevel,sep=""),coef=vdCoef)
|
|
973 }
|
|
974 return(retList)
|
|
975 }, lsQCCounts))
|
|
976 ### List of contrast information, pvalue, contrast and std per univariate test
|
|
977 }
|
|
978
|
|
979 # Tested
|
|
980 # NOTE: Ignores the idea of random and fixed covariates
|
|
981 funcDoUnivariate <- function(
|
|
982 ### Perform multiple univariate comparisons producing spearman correlations for association
|
|
983 strFormula,
|
|
984 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
985 frmeTmp,
|
|
986 ### Data on which to perform analysis
|
|
987 iTaxon,
|
|
988 ### Index of the response data
|
|
989 lsHistory,
|
|
990 ### List recording p-values, association information, and QC counts
|
|
991 strRandomFormula = NULL,
|
|
992 ### Has the formula for random covariates
|
|
993 fZeroInflate = FALSE
|
|
994 ){
|
|
995 if(fZeroInflate)
|
|
996 {
|
|
997 throw("There are no zero-inflated univariate models to perform your analysis.")
|
|
998 }
|
|
999
|
|
1000 # Get covariates
|
|
1001 astrCovariates = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula)))
|
|
1002
|
|
1003 # For each covariate
|
|
1004 for(sCovariate in astrCovariates)
|
|
1005 {
|
|
1006 ## Check to see if it is discrete
|
|
1007 axData = frmeTmp[[sCovariate]]
|
|
1008 lsRet = NA
|
|
1009 if(is.factor(axData) || is.logical(axData))
|
|
1010 {
|
|
1011 ## If discrete check how many levels
|
|
1012 lsDataLevels = levels(axData)
|
|
1013 ## If 2 levels do wilcoxon test
|
|
1014 if(length(lsDataLevels) < 3)
|
|
1015 {
|
|
1016 lsRet = funcWilcoxon(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
|
|
1017 } else {
|
|
1018 ## If 3 or more levels do kruskal wallis test
|
|
1019 lsRet = funcKruskalWallis(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
|
|
1020 }
|
|
1021 } else {
|
|
1022 ## If not discrete do spearman test (list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
|
|
1023 lsRet = funcSpearman(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
|
|
1024 }
|
|
1025 lsHistory[["adP"]] = c(lsHistory[["adP"]], lsRet[["adP"]])
|
|
1026 lsHistory[["lsSig"]] = c(lsHistory[["lsSig"]], lsRet[["lsSig"]])
|
|
1027 lsHistory[["lsQCCounts"]] = lsRet[["lsQCCounts"]]
|
|
1028 }
|
|
1029 return(lsHistory)
|
|
1030 }
|
|
1031
|
|
1032 ### Multivariate
|
|
1033
|
|
1034 # Tested
|
|
1035 funcLM <- function(
|
|
1036 ### Perform vanilla linear regression
|
|
1037 strFormula,
|
|
1038 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
1039 frmeTmp,
|
|
1040 ### Data on which to perform analysis
|
|
1041 iTaxon,
|
|
1042 ### Index of the response data
|
|
1043 lsHistory,
|
|
1044 ### List recording p-values, association information, and QC counts
|
|
1045 strRandomFormula = NULL,
|
|
1046 ### Has the formula for random covariates
|
|
1047 fZeroInflated = FALSE
|
|
1048 ### Turns on the zero inflated model
|
|
1049 ){
|
|
1050 adCur = frmeTmp[,iTaxon]
|
|
1051 if(fZeroInflated)
|
|
1052 {
|
|
1053 return(try(gamlss(formula=as.formula(strFormula), family=BEZI, data=frmeTmp))) # gamlss
|
|
1054 } else {
|
|
1055 if(!is.null(strRandomFormula))
|
|
1056 {
|
|
1057 return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=gaussian(link="identity"), data=frmeTmp)))
|
|
1058 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
|
|
1059 #return(try( lmer(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) ))
|
|
1060 } else {
|
|
1061 return(try( lm(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) ))
|
|
1062 }
|
|
1063 }
|
|
1064 ### lmod result object from lm
|
|
1065 }
|
|
1066
|
|
1067 # Tested
|
|
1068 funcBinomialMult <- function(
|
|
1069 ### Perform linear regression with negative binomial link
|
|
1070 strFormula,
|
|
1071 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
1072 frmeTmp,
|
|
1073 ### Data on which to perform analysis
|
|
1074 iTaxon,
|
|
1075 ### Index of the response data
|
|
1076 lsHistory,
|
|
1077 ### List recording p-values, association information, and QC counts
|
|
1078 strRandomFormula = NULL,
|
|
1079 ### Has the formula for random covariates
|
|
1080 fZeroInflated = FALSE
|
|
1081 ### Turns on the zero inflated model
|
|
1082 ){
|
|
1083 adCur = frmeTmp[,iTaxon]
|
|
1084 if(fZeroInflated)
|
|
1085 {
|
|
1086 return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="negbin"))) # pscl
|
|
1087 # return(try(gamlss(as.formula(strFormula), family=ZINBI, data=frmeTmp))) # pscl
|
|
1088 } else {
|
|
1089 if(!is.null(strRandomFormula))
|
|
1090 {
|
|
1091 throw("This analysis flow is not completely developed, please choose an option other than negative bionomial with random covariates")
|
|
1092 #TODO need to estimate the theta
|
|
1093 #return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=negative.binomial(theta = 2, link=log), data=frmeTmp)))
|
|
1094 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
|
|
1095 } else {
|
|
1096 return(try( glm.nb(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action )))
|
|
1097 }
|
|
1098 }
|
|
1099 ### lmod result object from lm
|
|
1100 }
|
|
1101
|
|
1102 # Tested
|
|
1103 funcQuasiMult <- function(
|
|
1104 ### Perform linear regression with quasi-poisson link
|
|
1105 strFormula,
|
|
1106 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
|
|
1107 frmeTmp,
|
|
1108 ### Data on which to perform analysis
|
|
1109 iTaxon,
|
|
1110 ### Index of the response data
|
|
1111 lsHistory,
|
|
1112 ### List recording p-values, association information, and QC counts
|
|
1113 strRandomFormula = NULL,
|
|
1114 ### Has the formula for random covariates
|
|
1115 fZeroInflated = FALSE
|
|
1116 ### Turns on a zero infalted model
|
|
1117 ){
|
|
1118 adCur = frmeTmp[,iTaxon]
|
|
1119 if(fZeroInflated)
|
|
1120 {
|
|
1121 # return(try(gamlss(formula=as.formula(strFormula), family=ZIP, data=frmeTmp))) # gamlss
|
|
1122 return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="poisson"))) # pscl
|
|
1123 } else {
|
|
1124 #Check to see if | is in the model, if so use a lmm otherwise the standard glm is ok
|
|
1125 if(!is.null(strRandomFormula))
|
|
1126 {
|
|
1127 return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family= quasipoisson, data=frmeTmp)))
|
|
1128 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
|
|
1129 #return(try ( glmer(as.formula(strFormula), data=frmeTmp, family=quasipoisson, na.action=c_strNA_Action) ))
|
|
1130 } else {
|
|
1131 return(try( glm(as.formula(strFormula), family=quasipoisson, data=frmeTmp, na.action=c_strNA_Action) ))
|
|
1132 }
|
|
1133 }
|
|
1134 ### lmod result object from lm
|
|
1135 }
|
|
1136
|
|
1137 ### Transformations
|
|
1138 # Tested
|
|
1139 funcArcsinSqrt <- function(
|
|
1140 # Transform data with arcsin sqrt transformation
|
|
1141 aData
|
|
1142 ### The data on which to perform the transformation
|
|
1143 ){
|
|
1144 return(asin(sqrt(aData)))
|
|
1145 ### Transformed data
|
|
1146 }
|
|
1147
|
|
1148 funcSquareSin <- function(
|
|
1149 # Transform data with square sin transformation
|
|
1150 # Opposite of the funcArcsinSqrt
|
|
1151 aData
|
|
1152 ### The data on which to perform the transformation
|
|
1153 ){
|
|
1154 return(sin(aData)^2)
|
|
1155 ### Transformed data
|
|
1156 }
|
|
1157
|
|
1158 # Tested
|
|
1159 funcNoTransform <-function(
|
|
1160 ### Pass data without transform
|
|
1161 aData
|
|
1162 ### The data on which to perform the transformation
|
|
1163 ### Only given here to preserve the pattern, not used.
|
|
1164 ){
|
|
1165 return(aData)
|
|
1166 ### Transformed data
|
|
1167 }
|
|
1168
|
|
1169 funcGetAnalysisMethods <- function(
|
|
1170 ### Returns the appropriate functions for regularization, analysis, data transformation, and analysis object inspection.
|
|
1171 ### This allows modular customization per analysis step.
|
|
1172 ### To add a new method insert an entry in the switch for either the selection, transform, or method
|
|
1173 ### Insert them by using the pattern optparse_keyword_without_quotes = function_in_AnalysisModules
|
|
1174 ### Order in the return listy is currently set and expected to be selection, transforms/links, analysis method
|
|
1175 ### none returns null
|
|
1176 sModelSelectionKey,
|
|
1177 ### Keyword defining the method of model selection
|
|
1178 sTransformKey,
|
|
1179 ### Keyword defining the method of data transformation
|
|
1180 sMethodKey,
|
|
1181 ### Keyword defining the method of analysis
|
|
1182 fZeroInflated = FALSE
|
|
1183 ### Indicates if using zero inflated models
|
|
1184 ){
|
|
1185 lRetMethods = list()
|
|
1186 #Insert selection methods here
|
|
1187 lRetMethods[[c_iSelection]] = switch(sModelSelectionKey,
|
|
1188 boost = funcBoostModel,
|
|
1189 penalized = funcPenalizedModel,
|
|
1190 forward = funcForwardModel,
|
|
1191 backward = funcBackwardsModel,
|
|
1192 none = NA)
|
|
1193
|
|
1194 #Insert transforms
|
|
1195 lRetMethods[[c_iTransform]] = switch(sTransformKey,
|
|
1196 asinsqrt = funcArcsinSqrt,
|
|
1197 none = funcNoTransform)
|
|
1198
|
|
1199 #Insert untransform
|
|
1200 lRetMethods[[c_iUnTransform]] = switch(sTransformKey,
|
|
1201 asinsqrt = funcNoTransform,
|
|
1202 none = funcNoTransform)
|
|
1203
|
|
1204 #Insert analysis
|
|
1205 lRetMethods[[c_iAnalysis]] = switch(sMethodKey,
|
|
1206 neg_binomial = funcBinomialMult,
|
|
1207 quasi = funcQuasiMult,
|
|
1208 univariate = funcDoUnivariate,
|
|
1209 lm = funcLM,
|
|
1210 none = NA)
|
|
1211
|
|
1212 # If a univariate method is used it is required to set this to true
|
|
1213 # For correct handling.
|
|
1214 lRetMethods[[c_iIsUnivariate]]=sMethodKey=="univariate"
|
|
1215
|
|
1216 #Insert method to get results
|
|
1217 if(fZeroInflated)
|
|
1218 {
|
|
1219 lRetMethods[[c_iResults]] = switch(sMethodKey,
|
|
1220 neg_binomial = funcGetZeroInflatedResults,
|
|
1221 quasi = funcGetZeroInflatedResults,
|
|
1222 univariate = funcGetUnivariateResults,
|
|
1223 lm = funcGetZeroInflatedResults,
|
|
1224 none = NA)
|
|
1225 } else {
|
|
1226 lRetMethods[[c_iResults]] = switch(sMethodKey,
|
|
1227 neg_binomial = funcGetLMResults,
|
|
1228 quasi = funcGetLMResults,
|
|
1229 univariate = funcGetUnivariateResults,
|
|
1230 lm = funcGetLMResults,
|
|
1231 none = NA)
|
|
1232 }
|
|
1233
|
|
1234 return(lRetMethods)
|
|
1235 ### Returns a list of functions to be passed for regularization, data transformation, analysis,
|
|
1236 ### and custom analysis results introspection functions to pull from return objects data of interest
|
|
1237 }
|