--- title: "Correspondence analysis examples" author: "Raymaekers, J. and Rousseeuw, P.J." date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Correspondence analysis examples} %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} --- ```{r, echo = FALSE} knitr::opts_chunk$set( fig.width = 6, fig.height = 6, fig.align ='center' ) ``` ```{r} library(cellWise) ``` # Introduction In this vignette we illustrate the use of MacroPCA with fixed center in the context of correspondence analysis. It reproduces the examples in the report Raymaekers and Rousseeuw (2022), Challenges of cellwise outliers. # Clothes data In this section we analyze the clothes data through correspondence analysis. We start by loading the data, and constructing the matrix S: ```{r} data("data_clothes") X <- data_clothes dim(X) # create matrix S as in paper: sum(X) # total count is 4373 P <- X / sum(X) # relative frequencies that add up to 1 rvec <- rowSums(P) # vector of row totals cvec <- colSums(P) # vector of column totals R <- X / rowSums(X) # row profiles rowSums(R) # all 1, OK S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% t(cvec)) %*% diag(1/sqrt(cvec)) dimnames(S) <- dimnames(X) round(S, 3) d <- ncol(S) # We verify that the points of S are on a hyperplane by construction: (eigvals <- eigen(t(S) %*% S)$values) ``` Now we analyze the data using DDC to identify suspicious cells: ```{r,fig.height=8,fig.width=3} DDC.out <- DDC(S) ggp <- cellMap(R = DDC.out$stdResid, mTitle = "clothes data", rowtitle = "countries", columntitle = "brackets") # pdf("clothes_cellmap.pdf", width = 3, height = 6) plot(ggp) # dev.off() ``` United Kingdom (GB), Slovakia, and Romania stand out. The outlier removal in Riani et al (2022) takes out GB, SK, BG, GR, RO, MT, LV which are precisely the countries with flagged cells here! Their analysis doesn't say much about why these countries are outlying, but here we see which brackets they import unusually much or little from. For instance, the cellmap suggests that GB imports a lot of cheap clothing. The webpage says that much is imported from Bangladesh, China, and India. ```{r} countryInds <- which(rownames(S) %in% c("GB", "SK", "MT", "GR", "BG", "LV", "RO")) matplot(t(S[countryInds, ]), pch = 16, col = 1:7) lines(apply(S, 2, median), lwd = 3) # median profile # The profile of these countries deviate in a similar way: # they trade a lot of cheap clothes, and fewer expensive ones. ``` We now continue with classical correspondence analysis. ```{r,fig.height=6,fig.width=6} svd.out <- svd(S) svd.out$d# S is indeed singular: (svd.out$d)^2 - eigen(t(S) %*% S)$values # ~0 diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2))) # This can be shown in a scree plot. # For plotting the rows: rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d) # For plotting the columns = variables: colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d) # pdf(file="clothes_ClassCorrespA.pdf", width=7, height=6) plot(rowproj[, 1:2], col = "white", xlab="", ylab="", xlim = c(-1, 0.6), ylim = c(-0.6, 0.6)) title(main="Classical correspondence analysis of clothes data", line=1) text(rowproj[, 1:2], labels = rownames(S)) abline(v=0); abline(h=0) text(colproj[, 1:2], labels = colnames(S), col="red") facs = c(0.93,0.85,0.78,0.84,0.87) shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], col = "red", arr.type="simple", arr.width=0.1, arr.length = 0.1) # , arr.adj = 0) title(xlab="Dimension 1", line=2.3) title(ylab="Dimension 2", line=2.3) # dev.off() ``` Now we perform a cellwise robust correspondence analysis. ```{r,fig.height=6,fig.width=6} # We apply MacroPCA with center fixed at zero. # As in classical CA, we do not prescale S. MacroPCApar0 <- list(scale = FALSE, center = rep(0,d)) MacroPCA.out <- MacroPCA(S, k=0, MacroPCApars = MacroPCApar0) MacroPCA.out <- MacroPCA(S, k=2, MacroPCApars = MacroPCApar0) (eigvals <- MacroPCA.out$eigenvalues) diff(c(0,cumsum(eigvals)/sum(eigvals))) # Compute the principal coordinates for the biplot. # To make the plot match the orientation in Riani et al: V <- -MacroPCA.out$loadings Tscores <- -MacroPCA.out$scores sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues) U <- Tscores %*% diag(1/sVals) rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals) colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals) # pdf(file="clothes_MacroCA_biplot.pdf", width=7, height=6) plot(rowproj[, 1:2], col = "white", xlim=c(-0.95,0.65), ylim=c(-0.6,0.6), xlab="", ylab="") title(main="Cellwise robust correspondence analysis of clothes data", line=1) text(rowproj[,1:2], labels = rownames(S)) abline(h=0); abline(v=0) text(colproj[, 1:2], labels = colnames(S), col="red") facs = c(0.9,0.8,0.5,0.75,0.85) shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], col = "red", arr.type="simple", arr.width=0.1, arr.length = 0.1) title(xlab="Dimension 1", line=2.3) title(ylab="Dimension 2", line=2.3) # dev.off() # Matches Fig 4 of Riani quite well. ``` # Brand perception example We start by loading the data and constructing the matrix S. ```{r} data("data_brands") X <- data_brands dim(X) sum(X) # total count is 11713 P <- X/sum(X) # relative frequencies that add up to 1 rvec <- rowSums(P) # vector of row totals hist(rvec) # Right tail: Chevrolet, Ford, Honda, Toyota # These brands are well known and sold a lot in the US. cvec <- colSums(P) # vector of column totals R <- X / rowSums(X) # row profiles S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% t(cvec)) %*% diag(1/sqrt(cvec)) dimnames(S) <- dimnames(X) d <- ncol(S) ``` Now we continue by executing DDC to identify suspicious cells. ```{r,fig.height=8,fig.width=3} DDC.out <- DDC(S) ggp <- cellMap(R = DDC.out$stdResid, mTitle = "brands data", rowtitle = "brands", columntitle = "perceptions") # pdf("brands_cellmap.pdf", width = 3.5, height = 8) plot(ggp) # dev.off() # Volvo is most deviating (3 cells), followed by Hyundai # (2 cells) and Maserati (2 cells). ``` Now we analyze the brands data by means of classical correspondence analysis. ```{r,fig.height=6,fig.width=6} svd.out <- svd(S) svd.out$d # S is singular: diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2))) # Can be plotted in a scree plot. # To match the plot in Riani at al: svd.out$v[, 2] = -svd.out$v[, 2] svd.out$u[, 2] = -svd.out$u[, 2] rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d) colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d) # pdf(file="brands_ClassCorrespA.pdf", width=7, height=6) plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1), ylim=c(-0.8,1.5), xlab="", ylab="") title(main="Classical correspondence analysis of brands data", line=1) text(rowproj[,1:2], labels = rownames(S)) abline(v=0); abline(h=0) text(colproj[, 1:2], labels = colnames(S), col="red") facs = c(0.8,0.9,0.8,0.65,0.92,0.8,0.65) shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], col = "red", arr.type="simple", arr.width=0.1, arr.length = 0.1) title(xlab="Dimension 1", line=2.3) title(ylab="Dimension 2", line=2.3) # dev.off() ``` Cellwise robust correspondence analysis of the brands data: ```{r,fig.height=6,fig.width=6} MacroPCApar0 <- list(scale = FALSE, center = rep(0, d)) MacroPCA.out <- MacroPCA(S, k = 0, MacroPCApars = MacroPCApar0) MacroPCA.out <- MacroPCA(S, k = 3, MacroPCApars = MacroPCApar0) # Computations for the biplot. V <- MacroPCA.out$loadings scores <- MacroPCA.out$scores # To make the plot match the orientation in Riani et al: V[,2] <- -V[,2] scores[,2] <- -scores[,2] sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues) U <- scores %*% diag(1/sVals) rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals) colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals) # pdf(file="brands_MacroCA_biplot.pdf", width=7, height=6) plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1), ylim=c(-0.6,0.6), xlab="", ylab="") title(main="Cellwise robust correspondence analysis of brands data", line=1) text(rowproj[,1:2], labels = rownames(S)) abline(h=0); abline(v=0) text(colproj[, 1:2], labels = colnames(S), col="red") facs = c(0.75,0.76,0.9,0.88,0.65,0.74,0.52) shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], col = "red", arr.type="simple", arr.width=0.1, arr.length = 0.1) title(xlab="Dimension 1", line=2.3) title(ylab="Dimension 2", line=2.3) # dev.off() # Roughly matches Figure 7 of Riani et al (2022). ```