---
title: "cellwise weights examples"
author: "Rousseeuw, P.J."
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{cellwise weights examples}
  %\VignetteEngine{knitr::knitr}
  %\VignetteEncoding{UTF-8}
---

```{r, echo = FALSE}
knitr::opts_chunk$set(
 fig.width = 6,
 fig.height = 6,
 fig.align ='center'
)
```


# Introduction

This file contains examples of the use of the weightedEM, unpack, and cwLocScat functions. It reproduces all the figures of the report "Analyzing cellwise weighted data" by P.J. Rousseeuw.


```{r}
library("cellWise")

n = 10
d = 3
A = matrix(0.7, d, d); diag(A) = 1
A
set.seed(12345)
library("MASS")
X = mvrnorm(n, rep(0,d), A)
colnames(X) = c("X1","X2","X3")
X[1,3] = X[2,2] = X[3,1] = X[4,1] = X[6,2] = NA
X # rows 1, 2, 3, 4, 6 have NAs
w = c(1,2,1,1,2,2,1,1,1,1) # rowwise weights
```

```{r, fig.width = 5, fig.height = 5}
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # number of iteration steps taken
out$mu
round(out$Sigma,6)
plot(1:out$niter, out$loglikhd[1:out$niter], type='l',
     lty=1, col=4, xlab='step', ylab='log(likelihood)',
     main='log(likelihood) of weighted EM iterations')
tail(out$loglikhd) # would have NAs for computeloglik=F
out$impX # imputed data, has no NA's
```

```{r}
# The weights may be multiplied by a constant:
#
(w = c(1,2,1,1,2,2,1,1,1,1)/3) # divide weights by 3
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # OK, same results:
out$mu # same
round(out$Sigma,6) # same
tail(out$loglikhd)
# converges to -11.3573 = -34.07189 / 3


# Create an equivalent matrix y without weights, by repeating
# some rows according to their integer weights:
#
Y = X[c(1,2,2,3,4,5,5,6,6,7,8,9,10),]
dim(Y)
Y # This gives the same results:
out = weightedEM(Y,crit=1e-12,computeloglik=T) # OK, same
out$niter
out$mu
round(out$Sigma,6)
tail(out$loglikhd)
# converges to -34.07189 like before.
```


#  Unpack the toy example in section 2 of the paper

```{r}
X = matrix(c(2.8,5.3,4.9,7.4,
             2.3,5.7,4.3,7.2,
             2.5,5.1,4.4,7.6),nrow=3,byrow=T)
W = matrix(c(0.8,1.0,0.3,0.4,
             0.3,0.5,0.9,0.5,
             1.0,0.6,0,0.7),nrow=3,byrow=T)
rownames(X) = rownames(W) = c("A","B","C")
colnames(X) = colnames(W) = c("V1","V2","V3","V4")
n = nrow(X); d = ncol(X)
X
W
out = unpack(X,W)
cbind(out$U,out$v) # OK
dim(out$U)
```

# Playing with the function cwLocScat

```{r}
set.seed(12345)
n = 1000; d = 2
A = matrix(0.7, d, d); diag(A) = 1
A
X = mvrnorm(n, rep(0,d), A)
head(X)
W = abs(mvrnorm(n, rep(0,d), diag(rep(1,2))))
W = W/max(as.vector(W))
W[2,1] = 0
W[5,2] = 0
head(W)

fit = cwLocScat(X,W)
fit$cwMLEiter # number of iteration steps

fit$cwMLEmu

fit$cwMean

fit$cwMLEsigma

fit$cwCov # similar to cwMLEsigma:

fit$sqrtCov # same diagonal:

```

# Personality traits example from section 4

```{r}
data("data_personality_traits")
X <- data_personality_traits$X
W <- data_personality_traits$W
cbind(X,W) # as in table in the paper

out = unpack(X,W)
cbind(out$U,out$v)
fit = cwLocScat(X,W)
fit$cwMLEiter

round(fit$cwMLEmu,2)

round(fit$cwMean,2)

round(fit$cwMLEsigma, 2)

round(eigen(fit$cwMLEsigma)$values, 2)

round(fit$cwCov, 2)

round(eigen(fit$cwCov)$values,5)

round(cov(X), 2) # unweighted

round(eigen(cov(X))$values, 2)
```

Now we reproduce the figure in the paper

```{r, fig.width = 5, fig.height = 5}

ellips = function(covmat, mu, quant=0.95, npoints = 120)
{ # computes points of the ellipse t(X-mu)%*%covmat%*%(X-mu) = c
  # with c = qchisq(quant,df=2)
  if (!all(dim(covmat) == c(2, 2))) stop("covmat is not 2 by 2")
  eig = eigen(covmat)
  U = eig$vectors
  R = U %*% diag(sqrt(eig$values)) %*% t(U) # square root of covmat
  angles = seq(0, 2*pi, length = npoints+1)
  xy = cbind(cos(angles),sin(angles)) # points on the unit circle
  fac = sqrt(qchisq(quant, df=2))
  scale(fac*xy%*%R, center = -mu, scale=FALSE)
}  


n = nrow(X)
j = 3; k = 6 # to plot variables t3 and t6
xy = X[,c(j,k)]
cov2cor(cov(X)[c(j,k),c(j,k)]) # unweighted correlation is 0.10
cov2cor(fit$cwMLEsigma[c(j,k),c(j,k)]) # now correlation is 0.31
(wxy = W[,c(j,k)])
duplicated(xy) # ties: row 4 equals row 3, and row 9 equals row 5
wxy[3,] = wxy[3,] + wxy[4,] # add cell weights of rows 3 and 4
wxy[5,] = wxy[5,] + wxy[9,] # add cell weights of rows 5 and 9
(wxy = wxy[-c(4,9),]) # remove duplicate rows
(xy = xy[-c(4,9),]) # remove duplicate rows

# pdf("personality_cwMLE_cwCov.pdf",width=5.5,height=5.5)
myxlim = c(2,14); myylim = c(1,13)
plot(xy, pch=16, col="white", xlim=myxlim, ylim=myylim,
     xlab="",ylab="")
fac = 0.3 # for the size of the lines representing the cell weights
for(i in seq_len(nrow(xy))){
  WY = c(xy[i,1] - fac*wxy[i,1],xy[i,2]) # (WestX, Y) 
  EY = c(xy[i,1] + fac*wxy[i,1],xy[i,2]) # (EastX, Y) 
  XS = c(xy[i,1],xy[i,2] - fac*wxy[i,2]) # (X, SouthY)
  XN = c(xy[i,1],xy[i,2] + fac*wxy[i,2]) # (X, NorthY)
  lines(rbind(WY,EY),lwd=3)
  lines(rbind(XS,XN),lwd=3)
}
title(main="tolerance ellipses with and without cell weights",
      line=0.8,cex.main=1) # 1.2)
title(xlab="trait 3",line=2.1,cex.lab=1.0)
title(ylab="trait 6",line=2.1,cex.lab=1.0)
center1 = colMeans(X[,c(j,k)])
covmat1 = (n-1)*cov(X[,c(j,k)])/n                   
ell1 = ellips(covmat1, center1)
lines(ell1,lwd=1.5,col="red") # ellipse from unweighted covariance
fit2 = cwLocScat(xy,wxy)
center2 = fit2$cwMLEmu
covmat2 = fit2$cwMLEsigma
ell2 = ellips(covmat2, center2) # ellipse from cwMLE estimates
lines(ell2,lwd=1.5,col="blue")
center3 = fit2$cwMean
covmat3 = fit2$cwCov
ell3 = ellips(covmat3, center3) # ellipse from cwMean and cwCov
lines(ell3,lwd=1.5,lty=2,col="blue")
legend("topleft",c("cwMLE","cwCov","MLE"),
       col=c("blue","blue","red"), lty=c(1,2,1), lwd=1.5, cex=1)
# dev.off()

# The blue ellipses, that use the cell weights, are a bit
# higher and more slanted than the red ellipse that doesn't,
# mainly due to the high cell weight of the y-coordinate of 
# the point in the top right corner.
```