--- title: "DI examples" author: "Raymaekers, J. and Rousseeuw, P.J." date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{DI examples} %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} --- ```{r, echo = FALSE} knitr::opts_chunk$set( fig.width = 6, fig.height = 6, fig.align ='center' ) ``` # Introduction This file contains some examples of the functions related to the DI and cellHandler routines. ```{r} library("cellWise") ``` # Artificial data In this example we consider an artificial dataset with cellwise outliers. First we construct a correlation matrix and then use it to generate the data. ```{r,fig.height=8,fig.width=8} d <- 10 mu <- rep(0, 10) Sigma <- generateCorMat(d = d, corrType = "A09") n <- 100 # number of observations outlierType <- "cellwiseStructured" # type of cellwise outliers perout <- 0.2 # percentage of outliers gamma <- 5 # how far the outliers are from the center data <- generateData(n, d, mu, Sigma, perout, gamma, outlierType, seed = 1) X <- data$X pairs(X) # we clearly see some marginal outliers, but also some more tricky ones. ``` Now we run the DI algorithm to detect cellwise outliers ```{r,fig.height=15,fig.width=8} tic = Sys.time() DI.out = DI(X) toc = Sys.time(); toc - tic DI.out$nits # the algorithm converges in 5 iterations and takes under 1 second flaggedCells <- DI.out$indcells # indices of the flagged Cells length(intersect(data$indcells, flaggedCells)) # 155 of the 200 cells are flagged # We can now compare this with the marginal flagging of outliers. locScale.out <- estLocScale(X) # robust location and scale of X Z <- scale(X, locScale.out$loc, locScale.out$scale) flaggedCells_marginal <- which(abs(Z) > sqrt(qchisq(p = 0.99, df = 1))) length(intersect(data$indcells, flaggedCells_marginal)) # only 61 of the 200 cells are flagged cellMap(D = X, R = DI.out$Zres, indcells = flaggedCells, mTitle = "cellHandler") cellMap(D = X, R = Z, indcells = flaggedCells_marginal, mTitle = "marginal analysis") ``` # VOC data We first load an inspect the data. ```{r} data("data_VOC") # ?VOC # The first 16 variables are the VOCs, the last 3 are: # SMD460: How many people who live here smoke tobacco? # SMD470: How many people smoke inside this home? # RIDAGEYR: age colnames(data_VOC) range(data_VOC$RIDAGEYR) # the subjects in this dataset are children between 3 and 10 ``` Now extract the VOC data and run the DI algorithm to estimate a covariance matrix and flag cellwise outliers. ```{r,fig.height=6,fig.width=6} X <- data_VOC[, -c(17:19)] # extract the VOC data dim(X) rownames(X) = 1:512 # Run the Detection Imputation (DI) algorithm: tic = Sys.time() DI.out = DI(X) toc = Sys.time(); toc - tic DI.out$nits # the algorithm converges in 4 iterations and takes roughly 2 seconds Zres <- DI.out$Zres dimnames(Zres) <- dimnames(X) indcells <- DI.out$indcells # Draw cellmap: # pdf("VOCs_20_cellmap.pdf", height = 5, width = 5) rowsToShow = 1:20 cellMap(Zres, showrows = rowsToShow, mTitle = "VOCs in children", rowtitle = "first 20 children", columntitle = "volatile components") # dev.off() rm(rowsToShow) ``` Now analyze the flagged cells and compare them with the cells found based on marginal outlyingness. ```{r,fig.height=6,fig.width=6} W <- matrix(0, nrow(X), ncol(X)) W[indcells] <- 1 # Variable 8 has a substantial number of red cells. # Its total number of outlying cells: sum(W[,8])/nrow(X) # Variable 8 has 11% of outlying cells. # Since quant = 0.99 these are the cells with absolute # residual above sqrt(qchisq(p=0.99,df=1)) = 2.575829 . # Variable 8 is "URXCYM": # N-Acetyl-S-(2-cyanoethyl)-L-cysteine (ng/mL) # this is a well-known biomarker for exposure to tobacco # smoke. see e.g. # https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0210104 # Adults who smoke usually have high values. # How many URXCYM values in this set are marginally outlying? # If we would use univariate outlier detection, few of # the URXCYM values in this set would be considered suspicious: meds = apply(X,2,FUN="median") mads = apply(X,2,FUN="mad") Z = scale(X,center=meds,scale=mads) cutoff = sqrt(qchisq(p=0.99,df=1)); cutoff cellInd = which(abs(Zres[,8]) > cutoff) length(cellInd)/nrow(X) # almost 11% marginalInd = which(abs(Z[,8]) > cutoff) length(marginalInd)/nrow(X) # under 2% # Even for perfectly gaussian data this would already be 1%. # pdf("ZresVersusZ.pdf",width=5.4,height=5.4) # sizes in inches plot(Z[,8], Zres[,8], xlab = "", ylab = "",main = "",pch = 16, col = "black", xlim=c(-3,5)) title(main="log(URXCYM) in children aged 10 or younger", line=1) # , cex.lab=1.2, family="Calibri Light") title(ylab="standardized cellwise residuals", line=2.3) title(xlab="robustly standardized marginal values", line=2.3) abline(h=cutoff, col="red") abline(h=-cutoff, col="red") abline(v=cutoff, col="red") abline(v=-cutoff, col="red") # dev.off() # Many outlying residuals occur at inlying marginal values. # These persons' URXCYM is high relative to the other # compounds (variables) in the same person. ``` We now link the findings with the data on household smoking. ```{r,fig.height=6,fig.width=6} # Look at the residuals for the children who # live together with people who smoke. # We consider 4 categories: # children without smokers in their family: nonsmokers = which(data_VOC$SMD460 == 0) length(nonsmokers) # at least one adult smokes, but not in the home: noneInHome = which((data_VOC$SMD460 > 0) & (data_VOC$SMD470 == 0)) length(noneInHome) # children with 1 person smoking in their home: oneInHome = which(data_VOC$SMD470 == 1) length(oneInHome) # children with 2 people smoking in their home: twoInHome = which(data_VOC$SMD470 == 2) length(twoInHome) length(which(Zres[nonsmokers,8] > 0))/length(nonsmokers) length(which(Zres[noneInHome,8] > 0))/length(noneInHome) length(which(Zres[oneInHome,8] > 0))/length(oneInHome) length(which(Zres[twoInHome,8] > 0))/length(twoInHome) # So 36% of the children living in a house with one smoker # have suspiciously high levels for this biomarker. # 63% of the children living in a house with two smokers # have suspiciously high levels for this biomarker. cellMap(Zres, showrows = oneInHome, mTitle = "VOCs in children", rowtitle = "", columntitle = "volatile components") ``` ```{r,fig.height=5,fig.width=4,fig.align ='center'} cellMap(Zres, showrows = twoInHome, mTitle = "VOCs in children", rowtitle = "", columntitle = "volatile components") ``` ```{r,fig.height=7,fig.width=6} # For one or more smokers in the house: smokeInHome = c(oneInHome,twoInHome) length(smokeInHome) cellMap(Zres, showrows = smokeInHome, mTitle = "VOCs in children", rowtitle = "", columntitle = "volatile components") ``` In all of these cellmaps the variable URXCYM stands out! If we would use a univariate detection bound, many of these values wouldn't be considered suspicious: ```{r,fig.height=6,fig.width=6} length(which(Z[nonsmokers] > cutoff))/length(nonsmokers) length(which(Z[noneInHome] > cutoff))/length(noneInHome) length(which(Z[oneInHome] > cutoff))/length(oneInHome) length(which(Z[twoInHome] > cutoff))/length(twoInHome) # Here the fractions are not even increasing with the number of smokers. plotdata = matrix(c(length(which(Zres[nonsmokers,8] > 0))/length(nonsmokers), length(which(Zres[noneInHome,8] > 0))/length(noneInHome), length(which(Zres[oneInHome,8] > 0))/length(oneInHome), length(which(Zres[twoInHome,8] > 0))/length(twoInHome), length(which(Z[nonsmokers] > cutoff))/length(nonsmokers), length(which(Z[noneInHome] > cutoff))/length(noneInHome), length(which(Z[oneInHome] > cutoff))/length(oneInHome), length(which(Z[twoInHome] > cutoff))/length(twoInHome)), nrow = 2, byrow = TRUE) # pdf("cellwise_marginal.pdf",width=5.4,height=5.4) matplot(1:4, t(plotdata), type = "b", pch = 16, lwd = 3, cex = 2, xlab = "", xaxt = "n", ylab = "", yaxt = "n", ylim = c(0, 0.7), col = c("blue", "red"), lty = 1) axis(side = 1, labels = c("none", "0 in home", "1 in home", "2 in home"), at = 1:4, cex.axis = 1.3) axis(side = 2, labels = seq(0, 100, by = 20), at = seq(0, 1, by = 0.2), cex.axis = 1.3) legend("topleft", fill = c("blue", "red"), legend = c("cell residuals","marginal values"), cex = 1.3) title(main="Effect of smokers on elevated URXCYM in children", line=1.2) title(ylab="% of children with elevated URXCYM",cex.lab=1.3, line=2.3) title(xlab="smoking adults in household",cex.lab=1.3, line=2.3) # dev.off() ```