Mercurial > repos > george-weingart > maaslin
comparison src/lib/BoostGLM.R @ 0:e0b5980139d9
maaslin
| author | george-weingart | 
|---|---|
| date | Tue, 13 May 2014 22:00:40 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:e0b5980139d9 | 
|---|---|
| 1 ##################################################################################### | |
| 2 #Copyright (C) <2012> | |
| 3 # | |
| 4 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| 5 #this software and associated documentation files (the "Software"), to deal in the | |
| 6 #Software without restriction, including without limitation the rights to use, copy, | |
| 7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
| 8 #and to permit persons to whom the Software is furnished to do so, subject to | |
| 9 #the following conditions: | |
| 10 # | |
| 11 #The above copyright notice and this permission notice shall be included in all copies | |
| 12 #or substantial portions of the Software. | |
| 13 # | |
| 14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
| 15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
| 16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
| 17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
| 18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
| 19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
| 20 # | |
| 21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
| 22 # authored by the Huttenhower lab at the Harvard School of Public Health | |
| 23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
| 24 ##################################################################################### | |
| 25 | |
| 26 inlinedocs <- function( | |
| 27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> | |
| 28 ##description<< Manages the quality control of data and the performance of analysis (univariate or multivariate), regularization, and data (response) transformation. | |
| 29 ) { return( pArgs ) } | |
| 30 | |
| 31 ### Load libraries quietly | |
| 32 suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 33 suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 34 suppressMessages(library( logging, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 35 suppressMessages(library( outliers, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 36 suppressMessages(library( robustbase, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 37 suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
| 38 | |
| 39 ### Get constants | |
| 40 #source(file.path("input","maaslin","src","Constants.R")) | |
| 41 #source("Constants.R") | |
| 42 | |
| 43 ## Get logger | |
| 44 c_logrMaaslin <- getLogger( "maaslin" ) | |
| 45 | |
| 46 funcDoGrubbs <- function( | |
| 47 ### Use the Grubbs Test to identify outliers | |
| 48 iData, | |
| 49 ### Column index in the data frame to test | |
| 50 frmeData, | |
| 51 ### The data frame holding the data | |
| 52 dPOutlier, | |
| 53 ### P-value threshold to indicate an outlier is significant | |
| 54 lsQC | |
| 55 ### List holding the QC info of the cleaning step. Which indices are outliers is added. | |
| 56 ){ | |
| 57 adData <- frmeData[,iData] | |
| 58 | |
| 59 # Original number of NA | |
| 60 viNAOrig = which(is.na(adData)) | |
| 61 | |
| 62 while( TRUE ) | |
| 63 { | |
| 64 lsTest <- try( grubbs.test( adData ), silent = TRUE ) | |
| 65 if( ( class( lsTest ) == "try-error" ) || is.na( lsTest$p.value ) || ( lsTest$p.value > dPOutlier ) ) | |
| 66 {break} | |
| 67 viOutliers = outlier( adData, logical = TRUE ) | |
| 68 adData[viOutliers] <- NA | |
| 69 } | |
| 70 | |
| 71 # Record removed data | |
| 72 viNAAfter = which(is.na(adData)) | |
| 73 | |
| 74 # If all were set to NA then ignore the filtering | |
| 75 if(length(adData)==length(viNAAfter)) | |
| 76 { | |
| 77 viNAAfter = viNAOrig | |
| 78 adData = frmeData[,iData] | |
| 79 c_logrMaaslin$info( paste("Grubbs Test:: Identifed all data as outliers so was inactived for index=",iData," data=",paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep = " " )) | |
| 80 } else if(mean(adData, na.rm=TRUE) == 0) { | |
| 81 viNAAfter = viNAOrig | |
| 82 adData = frmeData[,iData] | |
| 83 c_logrMaaslin$info( paste("Grubbs Test::Removed all values but 0, ignored. Index=",iData,".",sep=" " ) ) | |
| 84 } else { | |
| 85 # Document removal | |
| 86 if( sum( is.na( adData ) ) ) | |
| 87 { | |
| 88 c_logrMaaslin$info( "Grubbs Test::Removing %d outliers from %s", sum( is.na( adData ) ), colnames(frmeData)[iData] ) | |
| 89 c_logrMaaslin$info( format( rownames( frmeData )[is.na( adData )] )) | |
| 90 } | |
| 91 } | |
| 92 | |
| 93 return(list(data=adData,outliers=length(viNAAfter)-length(viNAOrig),indices=setdiff(viNAAfter,viNAOrig))) | |
| 94 } | |
| 95 | |
| 96 funcDoFenceTest <- function( | |
| 97 ### Use a threshold based on the quartiles of the data to identify outliers | |
| 98 iData, | |
| 99 ### Column index in the data frame to test | |
| 100 frmeData, | |
| 101 ### The data frame holding the data | |
| 102 dFence | |
| 103 ### The fence outside the first and third quartiles to use as a threshold for cutt off. | |
| 104 ### This many times the interquartile range +/- to the 3rd/1st quartiles | |
| 105 ){ | |
| 106 # Establish fence | |
| 107 adData <- frmeData[,iData] | |
| 108 adQ <- quantile( adData, c(0.25, 0.5, 0.75), na.rm = TRUE ) | |
| 109 | |
| 110 dIQR <- adQ[3] - adQ[1] | |
| 111 if(!dIQR) | |
| 112 { | |
| 113 dIQR = sd(adData,na.rm = TRUE) | |
| 114 } | |
| 115 dUF <- adQ[3] + ( dFence * dIQR ) | |
| 116 dLF <- adQ[1] - ( dFence * dIQR ) | |
| 117 | |
| 118 # Record indices of values outside of fence to remove and remove. | |
| 119 aiRemove <- c() | |
| 120 for( j in 1:length( adData ) ) | |
| 121 { | |
| 122 d <- adData[j] | |
| 123 if( !is.na( d ) && ( ( d < dLF ) || ( d > dUF ) ) ) | |
| 124 { | |
| 125 aiRemove <- c(aiRemove, j) | |
| 126 } | |
| 127 } | |
| 128 | |
| 129 if(length(aiRemove)==length(adData)) | |
| 130 { | |
| 131 aiRemove = c() | |
| 132 c_logrMaaslin$info( "OutliersByFence:: Identified all data as outlier so was inactivated for index=", iData,"data=", paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep=" " ) | |
| 133 } else { | |
| 134 adData[aiRemove] <- NA | |
| 135 | |
| 136 # Document to screen | |
| 137 if( length( aiRemove ) ) | |
| 138 { | |
| 139 c_logrMaaslin$info( "OutliersByFence::Removing %d outliers from %s", length( aiRemove ), colnames(frmeData)[iData] ) | |
| 140 c_logrMaaslin$info( format( rownames( frmeData )[aiRemove] )) | |
| 141 } | |
| 142 } | |
| 143 | |
| 144 return(list(data=adData,outliers=length(aiRemove),indices=aiRemove)) | |
| 145 } | |
| 146 | |
| 147 funcZerosAreUneven = function( | |
| 148 ### | |
| 149 vdRawData, | |
| 150 ### Raw data to be checked during transformation | |
| 151 funcTransform, | |
| 152 ### Data transform to perform | |
| 153 vsStratificationFeatures, | |
| 154 ### Groupings to check for unevenness | |
| 155 dfData | |
| 156 ### Data frame holding the features | |
| 157 ){ | |
| 158 # Return indicator of unevenness | |
| 159 fUneven = FALSE | |
| 160 | |
| 161 # Transform the data to compare | |
| 162 vdTransformed = funcTransform( vdRawData ) | |
| 163 | |
| 164 # Go through each stratification of data | |
| 165 for( sStratification in vsStratificationFeatures ) | |
| 166 { | |
| 167 # Current stratification | |
| 168 vFactorStrats = dfData[[ sStratification ]] | |
| 169 | |
| 170 # If the metadata is not a factor then skip | |
| 171 # Only binned data can be evaluated this way. | |
| 172 if( !is.factor( vFactorStrats )){ next } | |
| 173 | |
| 174 viZerosCountsRaw = c() | |
| 175 for( sLevel in levels( vFactorStrats ) ) | |
| 176 { | |
| 177 vdTest = vdRawData[ which( vFactorStrats == sLevel ) ] | |
| 178 viZerosCountsRaw = c( viZerosCountsRaw, length(which(vdTest == 0))) | |
| 179 vdTest = vdTransformed[ which( vFactorStrats == sLevel ) ] | |
| 180 } | |
| 181 dExpectation = 1 / length( viZerosCountsRaw ) | |
| 182 dMin = dExpectation / 2 | |
| 183 dMax = dExpectation + dMin | |
| 184 viZerosCountsRaw = viZerosCountsRaw / sum( viZerosCountsRaw ) | |
| 185 if( ( length( which( viZerosCountsRaw <= dMin ) ) > 0 ) || ( length( which( viZerosCountsRaw >= dMax ) ) > 0 ) ) | |
| 186 { | |
| 187 return( TRUE ) | |
| 188 } | |
| 189 } | |
| 190 return( fUneven ) | |
| 191 } | |
| 192 | |
| 193 funcTransformIncreasesOutliers = function( | |
| 194 ### Checks if a data transform increases outliers in a distribution | |
| 195 vdRawData, | |
| 196 ### Raw data to check for outlier zeros | |
| 197 funcTransform | |
| 198 ){ | |
| 199 iUnOutliers = length( boxplot( vdRawData, plot = FALSE )$out ) | |
| 200 iTransformedOutliers = length( boxplot( funcTransform( vdRawData ), plot = FALSE )$out ) | |
| 201 | |
| 202 return( iUnOutliers <= iTransformedOutliers ) | |
| 203 } | |
| 204 | |
| 205 funcClean <- function( | |
| 206 ### Properly clean / get data ready for analysis | |
| 207 ### Includes custom analysis from the custom R script if it exists | |
| 208 frmeData, | |
| 209 ### Data frame, input data to be acted on | |
| 210 funcDataProcess, | |
| 211 ### Custom script that can be given to perform specialized processing before MaAsLin does. | |
| 212 aiMetadata, | |
| 213 ### Indices of columns in frmeData which are metadata for analysis. | |
| 214 aiData, | |
| 215 ### Indices of column in frmeData which are (abundance) data for analysis. | |
| 216 lsQCCounts, | |
| 217 ### List that will hold the quality control information which is written in the output directory. | |
| 218 astrNoImpute = c(), | |
| 219 ### An array of column names of frmeData not to impute. | |
| 220 dMinSamp, | |
| 221 ### Minimum number of samples | |
| 222 dMinAbd, | |
| 223 # Minimum sample abundance | |
| 224 dFence, | |
| 225 ### How many quartile ranges defines the fence to define outliers. | |
| 226 funcTransform, | |
| 227 ### The data transformation function or a dummy function that does not affect the data | |
| 228 dPOutlier = 0.05 | |
| 229 ### The significance threshold for the grubbs test to identify an outlier. | |
| 230 ){ | |
| 231 # Call the custom script and set current data and indicies to the processes data and indicies. | |
| 232 c_logrMaaslin$debug( "Start Clean") | |
| 233 if( !is.null( funcDataProcess ) ) | |
| 234 { | |
| 235 c_logrMaaslin$debug("Additional preprocess function attempted.") | |
| 236 | |
| 237 pTmp <- funcDataProcess( frmeData=frmeData, aiMetadata=aiMetadata, aiData=aiData) | |
| 238 frmeData = pTmp$frmeData | |
| 239 aiMetadata = pTmp$aiMetadata | |
| 240 aiData = pTmp$aiData | |
| 241 lsQCCounts$lsQCCustom = pTmp$lsQCCounts | |
| 242 } | |
| 243 # Set data indicies after custom QC process. | |
| 244 lsQCCounts$aiAfterPreprocess = aiData | |
| 245 | |
| 246 # Remove missing data, remove any sample that has less than dMinSamp * the number of data or low abundance | |
| 247 aiRemove = c() | |
| 248 aiRemoveLowAbundance = c() | |
| 249 for( iCol in aiData ) | |
| 250 { | |
| 251 adCol = frmeData[,iCol] | |
| 252 adCol[!is.finite( adCol )] <- NA | |
| 253 if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || | |
| 254 ( length( unique( na.omit( adCol ) ) ) < 2 ) ) | |
| 255 { | |
| 256 aiRemove = c(aiRemove, iCol) | |
| 257 } | |
| 258 if( sum(adCol > dMinAbd, na.rm=TRUE ) < (dMinSamp * length( adCol))) | |
| 259 { | |
| 260 aiRemoveLowAbundance = c(aiRemoveLowAbundance, iCol) | |
| 261 } | |
| 262 } | |
| 263 # Remove and document | |
| 264 aiData = setdiff( aiData, aiRemove ) | |
| 265 aiData = setdiff( aiData, aiRemoveLowAbundance ) | |
| 266 lsQCCounts$iMissingData = aiRemove | |
| 267 lsQCCounts$iLowAbundanceData = aiRemoveLowAbundance | |
| 268 if(length(aiRemove)) | |
| 269 { | |
| 270 c_logrMaaslin$info( "Removing the following for data lower bound.") | |
| 271 c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) | |
| 272 } | |
| 273 if(length(aiRemoveLowAbundance)) | |
| 274 { | |
| 275 c_logrMaaslin$info( "Removing the following for too many low abundance bugs.") | |
| 276 c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveLowAbundance] )) | |
| 277 } | |
| 278 | |
| 279 #Transform data | |
| 280 iTransformed = 0 | |
| 281 viNotTransformedData = c() | |
| 282 for(aiDatum in aiData) | |
| 283 { | |
| 284 adValues = frmeData[,aiDatum] | |
| 285 # if( ! funcTransformIncreasesOutliers( adValues, funcTransform ) ) | |
| 286 # { | |
| 287 frmeData[,aiDatum] = funcTransform( adValues ) | |
| 288 # iTransformed = iTransformed + 1 | |
| 289 # } else { | |
| 290 # viNotTransformedData = c( viNotTransformedData, aiDatum ) | |
| 291 # } | |
| 292 } | |
| 293 c_logrMaaslin$info(paste("Number of features transformed = ",iTransformed)) | |
| 294 | |
| 295 # Metadata: Properly factorize all logical data and integer and number data with less than iNonFactorLevelThreshold | |
| 296 # Also record which are numeric metadata | |
| 297 aiNumericMetadata = c() | |
| 298 for( i in aiMetadata ) | |
| 299 { | |
| 300 if( ( class( frmeData[,i] ) %in% c("integer", "numeric", "logical") ) && | |
| 301 ( length( unique( frmeData[,i] ) ) < c_iNonFactorLevelThreshold ) ) { | |
| 302 c_logrMaaslin$debug(paste("Changing metadatum from numeric/integer/logical to factor",colnames(frmeData)[i],sep="=")) | |
| 303 frmeData[,i] = factor( frmeData[,i] ) | |
| 304 } | |
| 305 if( class( frmeData[,i] ) %in% c("integer","numeric") ) | |
| 306 { | |
| 307 aiNumericMetadata = c(aiNumericMetadata,i) | |
| 308 } | |
| 309 } | |
| 310 | |
| 311 # Remove outliers | |
| 312 # If the dFence Value is set use the method of defining the outllier as | |
| 313 # dFence * the interquartile range + or - the 3rd and first quartile respectively. | |
| 314 # If not the gibbs test is used. | |
| 315 lsQCCounts$aiDataSumOutlierPerDatum = c() | |
| 316 lsQCCounts$aiMetadataSumOutlierPerDatum = c() | |
| 317 lsQCCounts$liOutliers = list() | |
| 318 | |
| 319 if( dFence > 0.0 ) | |
| 320 { | |
| 321 # For data | |
| 322 for( iData in aiData ) | |
| 323 { | |
| 324 lOutlierInfo <- funcDoFenceTest(iData=iData,frmeData=frmeData,dFence=dFence) | |
| 325 frmeData[,iData] <- lOutlierInfo[["data"]] | |
| 326 lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
| 327 if(lOutlierInfo[["outliers"]]>0) | |
| 328 { | |
| 329 lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] | |
| 330 } | |
| 331 } | |
| 332 | |
| 333 # Remove outlier non-factor metadata | |
| 334 for( iMetadata in aiNumericMetadata ) | |
| 335 { | |
| 336 lOutlierInfo <- funcDoFenceTest(iData=iMetadata,frmeData=frmeData,dFence=dFence) | |
| 337 frmeData[,iMetadata] <- lOutlierInfo[["data"]] | |
| 338 lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
| 339 if(lOutlierInfo[["outliers"]]>0) | |
| 340 { | |
| 341 lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] | |
| 342 } | |
| 343 } | |
| 344 #Do not use the fence, use the Grubbs test | |
| 345 } else if(dPOutlier!=0.0){ | |
| 346 # For data | |
| 347 for( iData in aiData ) | |
| 348 { | |
| 349 lOutlierInfo <- funcDoGrubbs(iData=iData,frmeData=frmeData,dPOutlier=dPOutlier) | |
| 350 frmeData[,iData] <- lOutlierInfo[["data"]] | |
| 351 lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
| 352 if(lOutlierInfo[["outliers"]]>0) | |
| 353 { | |
| 354 lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] | |
| 355 } | |
| 356 } | |
| 357 for( iMetadata in aiNumericMetadata ) | |
| 358 { | |
| 359 lOutlierInfo <- funcDoGrubbs(iData=iMetadata,frmeData=frmeData,dPOutlier=dPOutlier) | |
| 360 frmeData[,iMetadata] <- lOutlierInfo[["data"]] | |
| 361 lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
| 362 if(lOutlierInfo[["outliers"]]>0) | |
| 363 { | |
| 364 lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] | |
| 365 } | |
| 366 } | |
| 367 } | |
| 368 | |
| 369 # Metadata: Remove missing data | |
| 370 # This is defined as if there is only one non-NA value or | |
| 371 # if the number of NOT NA data is less than a percentage of the data defined by dMinSamp | |
| 372 aiRemove = c() | |
| 373 for( iCol in c(aiMetadata) ) | |
| 374 { | |
| 375 adCol = frmeData[,iCol] | |
| 376 if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || | |
| 377 ( length( unique( na.omit( adCol ) ) ) < 2 ) ) | |
| 378 { | |
| 379 aiRemove = c(aiRemove, iCol) | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 # Remove metadata | |
| 384 aiMetadata = setdiff( aiMetadata, aiRemove ) | |
| 385 | |
| 386 # Update the data which was removed. | |
| 387 lsQCCounts$iMissingMetadata = aiRemove | |
| 388 if(length(aiRemove)) | |
| 389 { | |
| 390 c_logrMaaslin$info("Removing the following metadata for too much missing data or only one data value outside of NA.") | |
| 391 c_logrMaaslin$info(format(colnames( frmeData )[aiRemove])) | |
| 392 } | |
| 393 | |
| 394 # Keep track of factor levels in a list for later use | |
| 395 lslsFactors <- list() | |
| 396 for( iCol in c(aiMetadata) ) | |
| 397 { | |
| 398 aCol <- frmeData[,iCol] | |
| 399 if( class( aCol ) == "factor" ) | |
| 400 { | |
| 401 lslsFactors[[length( lslsFactors ) + 1]] <- list(iCol, levels( aCol )) | |
| 402 } | |
| 403 } | |
| 404 | |
| 405 # Replace missing data values by the mean of the data column. | |
| 406 # Remove samples that were all NA from the cleaning and so could not be imputed. | |
| 407 aiRemoveData = c() | |
| 408 for( iCol in aiData ) | |
| 409 { | |
| 410 adCol <- frmeData[,iCol] | |
| 411 adCol[is.infinite( adCol )] <- NA | |
| 412 adCol[is.na( adCol )] <- mean( adCol[which(adCol>0)], na.rm = TRUE ) | |
| 413 frmeData[,iCol] <- adCol | |
| 414 | |
| 415 if(length(which(is.na(frmeData[,iCol]))) == length(frmeData[,iCol])) | |
| 416 { | |
| 417 c_logrMaaslin$info( paste("Removing data", iCol, "for being all NA after QC")) | |
| 418 aiRemoveData = c(aiRemoveData,iCol) | |
| 419 } | |
| 420 } | |
| 421 | |
| 422 # Remove and document | |
| 423 aiData = setdiff( aiData, aiRemoveData ) | |
| 424 lsQCCounts$iMissingData = c(lsQCCounts$iMissingData,aiRemoveData) | |
| 425 if(length(aiRemoveData)) | |
| 426 { | |
| 427 c_logrMaaslin$info( "Removing the following for having only NAs after cleaning (maybe due to only having NA after outlier testing).") | |
| 428 c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveData] )) | |
| 429 } | |
| 430 | |
| 431 #Use na.gam.replace to manage NA metadata | |
| 432 aiTmp <- setdiff( aiMetadata, which( colnames( frmeData ) %in% astrNoImpute ) ) | |
| 433 # Keep tack of NAs so the may not be plotted later. | |
| 434 liNaIndices = list() | |
| 435 lsNames = names(frmeData) | |
| 436 for( i in aiTmp) | |
| 437 { | |
| 438 liNaIndices[[lsNames[i]]] = which(is.na(frmeData[,i])) | |
| 439 } | |
| 440 frmeData[,aiTmp] <- na.gam.replace( frmeData[,aiTmp] ) | |
| 441 | |
| 442 #If NA is a value in factor data, set the NA as a level. | |
| 443 for( lsFactor in lslsFactors ) | |
| 444 { | |
| 445 iCol <- lsFactor[[1]] | |
| 446 aCol <- frmeData[,iCol] | |
| 447 if( "NA" %in% levels( aCol ) ) | |
| 448 { | |
| 449 if(! lsNames[iCol] %in% astrNoImpute) | |
| 450 { | |
| 451 liNaIndices[[lsNames[iCol]]] = union(which(is.na(frmeData[,iCol])),which(frmeData[,iCol]=="NA")) | |
| 452 } | |
| 453 frmeData[,iCol] <- factor( aCol, levels = c(lsFactor[[2]], "NA") ) | |
| 454 } | |
| 455 } | |
| 456 | |
| 457 # Make sure there is a minimum number of non-0 measurements | |
| 458 aiRemove = c() | |
| 459 for( iCol in aiData ) | |
| 460 { | |
| 461 adCol = frmeData[,iCol] | |
| 462 if(length( which(adCol!=0)) < ( dMinSamp * length( adCol ) ) ) | |
| 463 { | |
| 464 aiRemove = c(aiRemove, iCol) | |
| 465 } | |
| 466 } | |
| 467 | |
| 468 # Remove and document | |
| 469 aiData = setdiff( aiData, aiRemove) | |
| 470 lsQCCounts$iZeroDominantData = aiRemove | |
| 471 if(length(aiRemove)) | |
| 472 { | |
| 473 c_logrMaaslin$info( "Removing the following for having not enough non-zero measurments for analysis.") | |
| 474 c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) | |
| 475 } | |
| 476 | |
| 477 c_logrMaaslin$debug("End FuncClean") | |
| 478 return( list(frmeData = frmeData, aiMetadata = aiMetadata, aiData = aiData, lsQCCounts = lsQCCounts, liNaIndices=liNaIndices, viNotTransformedData = viNotTransformedData) ) | |
| 479 ### Return list of | |
| 480 ### frmeData: The Data after cleaning | |
| 481 ### aiMetadata: The indices of the metadata still being used after filtering | |
| 482 ### aiData: The indices of the data still being used after filtering | |
| 483 ### lsQCCOunts: QC info | |
| 484 } | |
| 485 | |
| 486 funcBugs <- function( | |
| 487 ### Run analysis of all data features against all metadata | |
| 488 frmeData, | |
| 489 ### Cleaned data including metadata, and data | |
| 490 lsData, | |
| 491 ### This list is a general container for data as the analysis occurs, think about it as a cache for the analysis | |
| 492 aiMetadata, | |
| 493 ### Indices of metadata used in analysis | |
| 494 aiData, | |
| 495 ### Indices of response data | |
| 496 aiNotTransformedData, | |
| 497 ### Indicies of the data not transformed | |
| 498 strData, | |
| 499 ### Log file name | |
| 500 dSig, | |
| 501 ### Significance threshold for the qvalue cut off | |
| 502 fInvert=FALSE, | |
| 503 ### Invert images to have a black background | |
| 504 strDirOut = NA, | |
| 505 ### Output project directory | |
| 506 funcReg=NULL, | |
| 507 ### Function for regularization | |
| 508 funcTransform=NULL, | |
| 509 ### Function used to transform the data | |
| 510 funcUnTransform=NULL, | |
| 511 ### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots | |
| 512 lsNonPenalizedPredictors=NULL, | |
| 513 ### These predictors will not be penalized in the feature (model) selection step | |
| 514 funcAnalysis=NULL, | |
| 515 ### Function to perform association analysis | |
| 516 lsRandomCovariates=NULL, | |
| 517 ### List of string names of metadata which will be treated as random covariates | |
| 518 funcGetResults=NULL, | |
| 519 ### Function to unpack results from analysis | |
| 520 fDoRPlot=TRUE, | |
| 521 ### Plot residuals | |
| 522 fOmitLogFile = FALSE, | |
| 523 ### Stops the creation of the log file | |
| 524 fAllvAll=FALSE, | |
| 525 ### Flag to turn on all against all comparisons | |
| 526 liNaIndices = list(), | |
| 527 ### Indicies of imputed NA data | |
| 528 lxParameters=list(), | |
| 529 ### List holds parameters for different variable selection techniques | |
| 530 strTestingCorrection = "BH", | |
| 531 ### Correction for multiple testing | |
| 532 fIsUnivariate = FALSE, | |
| 533 ### Indicates if the function is univariate | |
| 534 fZeroInflated = FALSE | |
| 535 ### Indicates to use a zero infalted model | |
| 536 ){ | |
| 537 c_logrMaaslin$debug("Start funcBugs") | |
| 538 # If no output directory is indicated | |
| 539 # Then make it the current directory | |
| 540 if( is.na( strDirOut ) || is.null( strDirOut ) ) | |
| 541 { | |
| 542 if( !is.na( strData ) ) | |
| 543 { | |
| 544 strDirOut <- paste( dirname( strData ), "/", sep = "" ) | |
| 545 } else { strDirOut = "" } | |
| 546 } | |
| 547 | |
| 548 # Make th log file and output file names based on the log file name | |
| 549 strLog = NA | |
| 550 strBase = "" | |
| 551 if(!is.na(strData)) | |
| 552 { | |
| 553 strBaseOut <- paste( strDirOut, sub( "\\.([^.]+)$", "", basename(strData) ), sep = "/" ) | |
| 554 strLog <- paste( strBaseOut,c_sLogFileSuffix, ".txt", sep = "" ) | |
| 555 } | |
| 556 | |
| 557 # If indicated, stop the creation of the log file | |
| 558 # Otherwise delete the log file if it exists and log | |
| 559 if(fOmitLogFile){ strLog = NA } | |
| 560 if(!is.na(strLog)) | |
| 561 { | |
| 562 c_logrMaaslin$info( "Outputting to: %s", strLog ) | |
| 563 unlink( strLog ) | |
| 564 } | |
| 565 | |
| 566 # Will contain pvalues | |
| 567 adP = c() | |
| 568 adPAdj = c() | |
| 569 | |
| 570 # List of lists with association information | |
| 571 lsSig <- list() | |
| 572 # Go through each data that was not previously removed and perform inference | |
| 573 for( iTaxon in aiData ) | |
| 574 { | |
| 575 # Log to screen progress per 10 associations. | |
| 576 # Can be thown off if iTaxon is missing a mod 10 value | |
| 577 # So the taxons may not be logged every 10 but not a big deal | |
| 578 if( !( iTaxon %% 10 ) ) | |
| 579 { | |
| 580 c_logrMaaslin$info( "Taxon %d/%d", iTaxon, max( aiData ) ) | |
| 581 } | |
| 582 | |
| 583 # Call analysis method | |
| 584 lsOne <- funcBugHybrid( iTaxon=iTaxon, frmeData=frmeData, lsData=lsData, aiMetadata=aiMetadata, dSig=dSig, adP=adP, lsSig=lsSig, funcTransform=funcTransform, funcUnTransform=funcUnTransform, strLog=strLog, funcReg=funcReg, lsNonPenalizedPredictors=lsNonPenalizedPredictors, funcAnalysis=funcAnalysis, lsRandomCovariates=lsRandomCovariates, funcGetResult=funcGetResults, fAllvAll=fAllvAll, fIsUnivariate=fIsUnivariate, lxParameters=lxParameters, fZeroInflated=fZeroInflated, fIsTransformed= ! iTaxon %in% aiNotTransformedData ) | |
| 585 | |
| 586 # If you get a NA (happens when the lmm gets all random covariates) move on | |
| 587 if( is.na( lsOne ) ){ next } | |
| 588 | |
| 589 # The updating of the following happens in the inference method call in the funcBugHybrid call | |
| 590 # New pvalue array | |
| 591 adP <- lsOne$adP | |
| 592 # New lsSig contains data about significant feature v metadata comparisons | |
| 593 lsSig <- lsOne$lsSig | |
| 594 # New qc data | |
| 595 lsData$lsQCCounts = lsOne$lsQCCounts | |
| 596 } | |
| 597 | |
| 598 # Log the QC info | |
| 599 c_logrMaaslin$debug("lsData$lsQCCounts") | |
| 600 c_logrMaaslin$debug(format(lsData$lsQCCounts)) | |
| 601 | |
| 602 if( is.null( adP ) ) { return( NULL ) } | |
| 603 | |
| 604 # Perform bonferonni corrections on factor data (for levels), calculate the number of tests performed, and FDR adjust for multiple hypotheses | |
| 605 # Perform Bonferonni adjustment on factor data | |
| 606 for( iADIndex in 1:length( adP ) ) | |
| 607 { | |
| 608 # Only perform on factor data | |
| 609 if( is.factor( lsSig[[ iADIndex ]]$metadata ) ) | |
| 610 { | |
| 611 adPAdj = c( adPAdj, funcBonferonniCorrectFactorData( dPvalue = adP[ iADIndex ], vsFactors = lsSig[[ iADIndex ]]$metadata, fIgnoreNAs = length(liNaIndices)>0) ) | |
| 612 } else { | |
| 613 adPAdj = c( adPAdj, adP[ iADIndex ] ) | |
| 614 } | |
| 615 } | |
| 616 | |
| 617 iTests = funcCalculateTestCounts(iDataCount = length(aiData), asMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ), asForced = lsNonPenalizedPredictors, asRandom = lsRandomCovariates, fAllvAll = fAllvAll) | |
| 618 | |
| 619 #Get indices of sorted data after the factor correction but before the multiple hypothesis corrections. | |
| 620 aiSig <- sort.list( adPAdj ) | |
| 621 | |
| 622 # Perform FDR BH | |
| 623 adQ = p.adjust(adPAdj, method=strTestingCorrection, n=max(length(adPAdj), iTests)) | |
| 624 | |
| 625 # Find all covariates that had significant associations | |
| 626 astrNames <- c() | |
| 627 for( i in 1:length( lsSig ) ) | |
| 628 { | |
| 629 astrNames <- c(astrNames, lsSig[[i]]$name) | |
| 630 } | |
| 631 astrNames <- unique( astrNames ) | |
| 632 | |
| 633 # Sets up named label return for global plotting | |
| 634 lsReturnTaxa <- list() | |
| 635 for( j in aiSig ) | |
| 636 { | |
| 637 if( adQ[j] > dSig ) { next } | |
| 638 strTaxon <- lsSig[[j]]$taxon | |
| 639 if(strTaxon %in% names(lsReturnTaxa)) | |
| 640 { | |
| 641 lsReturnTaxa[[strTaxon]] = min(lsReturnTaxa[[strTaxon]],adQ[j]) | |
| 642 } else { lsReturnTaxa[[strTaxon]] = adQ[j]} | |
| 643 } | |
| 644 | |
| 645 # For each covariate with significant associations | |
| 646 # Write out a file with association information | |
| 647 for( strName in astrNames ) | |
| 648 { | |
| 649 strFileTXT <- NA | |
| 650 strFilePDF <- NA | |
| 651 for( j in aiSig ) | |
| 652 { | |
| 653 lsCur <- lsSig[[j]] | |
| 654 strCur <- lsCur$name | |
| 655 | |
| 656 if( strCur != strName ) { next } | |
| 657 | |
| 658 strTaxon <- lsCur$taxon | |
| 659 adData <- lsCur$data | |
| 660 astrFactors <- lsCur$factors | |
| 661 adCur <- lsCur$metadata | |
| 662 adY <- adData | |
| 663 | |
| 664 if( is.na( strData ) ) { next } | |
| 665 | |
| 666 ## If the text file output is not written to yet | |
| 667 ## make the file names, and delete any previous file output | |
| 668 if( is.na( strFileTXT ) ) | |
| 669 { | |
| 670 strFileTXT <- sprintf( "%s-%s.txt", strBaseOut, strName ) | |
| 671 unlink(strFileTXT) | |
| 672 funcWrite( c("Variable", "Feature", "Value", "Coefficient", "N", "N not 0", "P-value", "Q-value"), strFileTXT ) | |
| 673 } | |
| 674 | |
| 675 ## Write text output | |
| 676 funcWrite( c(strName, strTaxon, lsCur$orig, lsCur$value, length( adData ), sum( adData > 0 ), adP[j], adQ[j]), strFileTXT ) | |
| 677 | |
| 678 ## If the significance meets the threshold | |
| 679 ## Write PDF file output | |
| 680 if( adQ[j] > dSig ) { next } | |
| 681 | |
| 682 # Do not make residuals plots if univariate is selected | |
| 683 strFilePDF = funcPDF( frmeTmp=frmeData, lsCur=lsCur, curPValue=adP[j], curQValue=adQ[j], strFilePDF=strFilePDF, strBaseOut=strBaseOut, strName=strName, funcUnTransform= funcUnTransform, fDoResidualPlot=fDoRPlot, fInvert=fInvert, liNaIndices=liNaIndices ) | |
| 684 } | |
| 685 if( dev.cur( ) != 1 ) { dev.off( ) } | |
| 686 } | |
| 687 aiTmp <- aiData | |
| 688 | |
| 689 logdebug("End funcBugs", c_logMaaslin) | |
| 690 return(list(lsReturnBugs=lsReturnTaxa, lsQCCounts=lsData$lsQCCounts)) | |
| 691 ### List of data features successfully associated without error and quality control data | |
| 692 } | |
| 693 | |
| 694 #Lightly Tested | |
| 695 ### Performs analysis for 1 feature | |
| 696 ### iTaxon: integer Taxon index to be associated with data | |
| 697 ### frmeData: Data frame The full data | |
| 698 ### lsData: List of all associated data | |
| 699 ### aiMetadata: Numeric vector of indices | |
| 700 ### dSig: Numeric significance threshold for q-value cut off | |
| 701 ### adP: List of pvalues from associations | |
| 702 ### lsSig: List which serves as a cache of data about significant associations | |
| 703 ### strLog: String file to log to | |
| 704 funcBugHybrid <- function( | |
| 705 ### Performs analysis for 1 feature | |
| 706 iTaxon, | |
| 707 ### integer Taxon index to be associated with data | |
| 708 frmeData, | |
| 709 ### Data frame, the full data | |
| 710 lsData, | |
| 711 ### List of all associated data | |
| 712 aiMetadata, | |
| 713 ### Numeric vector of indices | |
| 714 dSig, | |
| 715 ### Numeric significance threshold for q-value cut off | |
| 716 adP, | |
| 717 ### List of pvalues from associations | |
| 718 lsSig, | |
| 719 ### List which serves as a cache of data about significant associations | |
| 720 funcTransform, | |
| 721 ### The tranform used on the data | |
| 722 funcUnTransform, | |
| 723 ### The reverse transform on the data | |
| 724 strLog = NA, | |
| 725 ### String, file to which to log | |
| 726 funcReg=NULL, | |
| 727 ### Function to perform regularization | |
| 728 lsNonPenalizedPredictors=NULL, | |
| 729 ### These predictors will not be penalized in the feature (model) selection step | |
| 730 funcAnalysis=NULL, | |
| 731 ### Function to perform association analysis | |
| 732 lsRandomCovariates=NULL, | |
| 733 ### List of string names of metadata which will be treated as random covariates | |
| 734 funcGetResult=NULL, | |
| 735 ### Function to unpack results from analysis | |
| 736 fAllvAll=FALSE, | |
| 737 ### Flag to turn on all against all comparisons | |
| 738 fIsUnivariate = FALSE, | |
| 739 ### Indicates the analysis function is univariate | |
| 740 lxParameters=list(), | |
| 741 ### List holds parameters for different variable selection techniques | |
| 742 fZeroInflated = FALSE, | |
| 743 ### Indicates if to use a zero infalted model | |
| 744 fIsTransformed = TRUE | |
| 745 ### Indicates that the bug is transformed | |
| 746 ){ | |
| 747 #dTime00 <- proc.time()[3] | |
| 748 #Get metadata column names | |
| 749 astrMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ) | |
| 750 | |
| 751 #Get data measurements that are not NA | |
| 752 aiRows <- which( !is.na( frmeData[,iTaxon] ) ) | |
| 753 | |
| 754 #Get the dataframe of non-na data measurements | |
| 755 frmeTmp <- frmeData[aiRows,] | |
| 756 | |
| 757 #Set the min boosting selection frequency to a default if not given | |
| 758 if( is.na( lxParameters$dFreq ) ) | |
| 759 { | |
| 760 lxParameters$dFreq <- 0.5 / length( c(astrMetadata) ) | |
| 761 } | |
| 762 | |
| 763 # Get the full data for the bug feature | |
| 764 adCur = frmeTmp[,iTaxon] | |
| 765 lxParameters$sBugName = names(frmeTmp[iTaxon]) | |
| 766 | |
| 767 # This can run multiple models so some of the results are held in lists and some are not | |
| 768 llmod = list() | |
| 769 liTaxon = list() | |
| 770 lastrTerms = list() | |
| 771 | |
| 772 # Build formula for simple mixed effects models | |
| 773 # Removes random covariates from variable selection | |
| 774 astrMetadata = setdiff(astrMetadata, lsRandomCovariates) | |
| 775 strFormula <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadata ), collapse = " + " ), sep = " " ) | |
| 776 | |
| 777 # Document the model | |
| 778 funcWrite( c("#taxon", colnames( frmeTmp )[iTaxon]), strLog ) | |
| 779 funcWrite( c("#metadata", astrMetadata), strLog ) | |
| 780 funcWrite( c("#samples", rownames( frmeTmp )), strLog ) | |
| 781 | |
| 782 #Model terms | |
| 783 astrTerms <- c() | |
| 784 | |
| 785 # Attempt feature (model) selection | |
| 786 if(!is.na(funcReg)) | |
| 787 { | |
| 788 #Count model selection method attempts | |
| 789 lsData$lsQCCounts$iBoosts = lsData$lsQCCounts$iBoosts + 1 | |
| 790 #Perform model selection | |
| 791 astrTerms <- funcReg(strFormula=strFormula, frmeTmp=frmeTmp, adCur=adCur, lsParameters=lxParameters, lsForcedParameters=lsNonPenalizedPredictors, strLog=strLog) | |
| 792 #If the feature selection function is set to None, set all terms of the model to all the metadata | |
| 793 } else { astrTerms = astrMetadata } | |
| 794 | |
| 795 # Get look through the boosting results to get a model | |
| 796 # Holds the predictors in the predictors in the model that were selected by the boosting | |
| 797 if(is.null( astrTerms )){lsData$lsQCCounts$iBoostErrors = lsData$lsQCCounts$iBoostErrors + 1} | |
| 798 | |
| 799 # Get the indices that are transformed | |
| 800 # Of those indices check for uneven metadata | |
| 801 # Untransform any of the metadata that failed | |
| 802 # Failed means true for uneven occurences of zeros | |
| 803 # if( fIsTransformed ) | |
| 804 # { | |
| 805 # vdUnevenZeroCheck = funcUnTransform( frmeData[[ iTaxon ]] ) | |
| 806 # if( funcZerosAreUneven( vdRawData=vdUnevenZeroCheck, funcTransform=funcTransform, vsStratificationFeatures=astrTerms, dfData=frmeData ) ) | |
| 807 # { | |
| 808 # frmeData[[ iTaxon ]] = vdUnevenZeroCheck | |
| 809 # c_logrMaaslin$debug( paste( "Taxon transformation reversed due to unevenness of zero distribution.", iTaxon ) ) | |
| 810 # } | |
| 811 # } | |
| 812 | |
| 813 # Run association analysis if predictors exist and an analysis function is specified | |
| 814 # Run analysis | |
| 815 if(!is.na(funcAnalysis) ) | |
| 816 { | |
| 817 #If there are selected and forced fixed covariates | |
| 818 if( length( astrTerms ) ) | |
| 819 { | |
| 820 #Count the association attempt | |
| 821 lsData$lsQCCounts$iLms = lsData$lsQCCounts$iLms + 1 | |
| 822 | |
| 823 #Make the lm formula | |
| 824 #Build formula for simple mixed effects models using random covariates | |
| 825 strRandomCovariatesFormula = NULL | |
| 826 #Random covariates are forced | |
| 827 if(length(lsRandomCovariates)>0) | |
| 828 { | |
| 829 #Format for lme | |
| 830 #Needed for changes to not allowing random covariates through the boosting process | |
| 831 strRandomCovariatesFormula <- paste( "adCur ~ ", paste( sprintf( "1|`%s`", lsRandomCovariates), collapse = " + " )) | |
| 832 } | |
| 833 | |
| 834 #Set up a list of formula containing selected fixed variables changing and the forced fixed covariates constant | |
| 835 vstrFormula = c() | |
| 836 #Set up suppressing forced covariates in a all v all scenario only | |
| 837 asSuppress = c() | |
| 838 #Enable all against all comparisons | |
| 839 if(fAllvAll && !fIsUnivariate) | |
| 840 { | |
| 841 lsVaryingCovariates = setdiff(astrTerms,lsNonPenalizedPredictors) | |
| 842 lsConstantCovariates = setdiff(lsNonPenalizedPredictors,lsRandomCovariates) | |
| 843 strConstantFormula = paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " ) | |
| 844 asSuppress = lsConstantCovariates | |
| 845 | |
| 846 if(length(lsVaryingCovariates)==0L) | |
| 847 { | |
| 848 vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )) ) | |
| 849 } else { | |
| 850 for( sVarCov in lsVaryingCovariates ) | |
| 851 { | |
| 852 strTempFormula = paste( "adCur ~ `", sVarCov,"`",sep="") | |
| 853 if(length(lsConstantCovariates)>0){ strTempFormula = paste(strTempFormula,strConstantFormula,sep=" + ") } | |
| 854 vstrFormula <- c( vstrFormula, strTempFormula ) | |
| 855 } | |
| 856 } | |
| 857 } else { | |
| 858 #This is either the multivariate case formula for all covariates in an lm or fixed covariates in the lmm | |
| 859 vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", astrTerms ), collapse = " + " )) ) | |
| 860 } | |
| 861 | |
| 862 #Run the association | |
| 863 for( strAnalysisFormula in vstrFormula ) | |
| 864 { | |
| 865 i = length(llmod)+1 | |
| 866 llmod[[i]] = funcAnalysis(strFormula=strAnalysisFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, lsHistory=list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts), strRandomFormula=strRandomCovariatesFormula, fZeroInflated=fZeroInflated) | |
| 867 | |
| 868 liTaxon[[i]] = iTaxon | |
| 869 lastrTerms[[i]] = funcFormulaStrToList(strAnalysisFormula) | |
| 870 } | |
| 871 } else { | |
| 872 #If there are no selected or forced fixed covariates | |
| 873 lsData$lsQCCounts$iNoTerms = lsData$lsQCCounts$iNoTerms + 1 | |
| 874 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) | |
| 875 } | |
| 876 } | |
| 877 | |
| 878 #Call funcBugResults and return it's return | |
| 879 if(!is.na(funcGetResult)) | |
| 880 { | |
| 881 #Format the results to a consistent expected result. | |
| 882 return( funcGetResult( llmod=llmod, frmeData=frmeData, liTaxon=liTaxon, dSig=dSig, adP=adP, lsSig=lsSig, strLog=strLog, lsQCCounts=lsData$lsQCCounts, lastrCols=lastrTerms, asSuppressCovariates=asSuppress ) ) | |
| 883 } else { | |
| 884 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) | |
| 885 } | |
| 886 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
| 887 } | 
