---
title: "DDC examples"
author: "Raymaekers, J. and Rousseeuw, P.J."
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{DDC examples}
  %\VignetteEngine{knitr::knitr}
  %\VignetteEncoding{UTF-8}
---

```{r, echo = FALSE}
knitr::opts_chunk$set(
  fig.width  = 5 ,
  fig.height = 3.5,
  fig.align  = 'center'
)
```

# Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, `DDC` and `cellMap` will be illustrated.


```{r}
library("cellWise")
library("gridExtra") # has grid.arrange()

# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
               cleanNAfirst = "automatic", tolProb = 0.99, 
               corrlim = 0.5, combinRule = "wmean",
               returnBigXimp = FALSE, silent = FALSE,
               nLocScale = 25000, fastDDC = FALSE,
               standType = "1stepM", corrType = "gkwls",
               transFun = "wrap", nbngbrs = 100)

# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)

```
***

# Example with row and column selection

First we illustrate the selection of columns and rows by DDC.
This is actually done by the function `checkDataSet` which is called by DDC.

```{r}

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr

DDCdatafr = DDC(datafr,DDCpars)

remX = DDCdatafr$remX; dim(remX)
cellMap(DDCdatafr$stdResid)
# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.
```
***

# Small generated dataset

```{r,fig.height=6,fig.width=3}
set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix
library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10

# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)

cellMap(DDCx$stdResid)
# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.
```

```{R}
DDCx$DDCpars # These are the default options:

names(DDCx)

# We will now go through these outputs one by one:

DDCx$colInAnalysis # all columns X1,...,X20 remain:
DDCx$rowInAnalysis # all rows 1,...,50 remain:
DDCx$namesNotNumeric # no non-numeric columns:
DDCx$namesCaseNumber # no column was the case number:
DDCx$namesNAcol # no columns with too many NA's:
DDCx$namesNArow # no columns with too many NA's:
DDCx$namesDiscrete # no discrete columns:
DDCx$namesZeroScale # no columns with scale = 0:
dim(DDCx$remX) # remaining data matrix
round(DDCx$locX,2) # robust location estimates of all 20 columns:
round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:

round(DDCx$Z[1:10,1:10],1)
# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).

DDCx$nbngbrs
# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.

DDCx$ngbrs[1:3,]
# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...

round(DDCx$robcors[1:3,],2)
# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value. 

round(DDCx$robslopes[1:3,],2)
# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with 
# slope 0.97 and by its second neighbor with slope 0.81 .

round(DDCx$deshrinkage,2)
# For each column, the factor by which its prediction is multiplied.
```

```{R}
round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:

round(DDCx$stdResid[1:12,1:10],1)
# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.

qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:

as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:

plot(DDCx$Ti) # the Ti values of the rows. None are high.
qqnorm(DDCx$Ti) 
DDCx$medTi # median of the raw Ti (used to standardize Ti):
DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):

DDCx$indrows # numeric(0) means no rows are flagged:

as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:

as.vector(DDCx$indNAs) # indices of the missing cells:

round(DDCx$Ximp[1:10,1:10],2)
# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.

round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)
# The nonzero values and the NA's correspond to imputed cells.

```
***

# TopGear dataset
The Top Gear data contains information on 297 cars.

```{r,results='hide',message=FALSE,warning=FALSE}
library(robustHD)
data(TopGear)
```

```{R}
dim(TopGear)
rownames(TopGear)[1:13] # "1" to "297" are not useful names
rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)
# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.


```

```{r,fig.height=8,fig.width=6}
remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)

colSums(is.na(remX)) # There are still NAs, mainly in `Weight':

# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)
round(standX[1:5,],1) # has NAs where remX does
transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:

# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)

# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)

# Make two ggplot2 objects:
ggpcol = cellMap(standX, showcellvalues="D", D=tempX, 
                 mTitle="By column", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpcol)
```

```{r,fig.height=10,fig.width=8}
ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="D", D=tempX,  
                 mTitle="DetectDeviatingCells", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpDDC)

# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 12, height = 10)
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()

```
***

# Analyzing new data by DDCpredict
We now consider the 17 cars shown in the cellmap
as a `new' dataset.

```{r,fig.height=8,fig.width=6}
# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

dim(remX) # 296 11

ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="R", 
                 sizecellvalues = 0.7, mTitle="DetectDeviatingCells", 
                 showrows=showrows, adjustrowlabels=0.5)
plot(ggpDDC)
```

## Define the "initial" dataset as the rows not in these 17:
```{R}
initX = remX[-showrows,]
dim(initX) # 279 11

# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars) 
```

## Define the "new" dataset, and apply DDCpredict to it:
```{R}
newX = remX[showrows,]
dim(newX) # 17  11 

# Make predictions by DDCpredict. 
# Its inputs are:
#   Xnew       : the new data (test data)
#   InitialDDC : Must be provided.
#   DDCpars    : the input options to be used for the prediction.
#                By default the options of InitialDDC are used. 

predictDDC = DDCpredict(newX,DDCinitX)

names(DDCinitX)
# For comparison with:

names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:

# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE
```
```{r,fig.height=10,fig.width=8}

ggpnew = cellMap(predictDDC$stdResid, showcellvalues="R",
                 sizecellvalues = 0.7, mTitle="DDCpredict",
                 adjustrowlabels=0.5) 
plot(ggpnew) # Looks quite similar to the result using the entire dataset:

# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=12,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1)
# dev.off()
```

***

# Philips data
The philips data contains 9 measurements of TV parts from a production line.

```{R}
data(data_philips)
dim(data_philips) 
colnames(data_philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")

DDCphilips = DDC(data_philips)

qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:

DDCphilips$indcells # indices of the cells that were flagged:

DDCphilips$indrows # flagged rows:

```

## We also apply the rowwise method MCD to detect outlying rows:

```{R,results='hide',message=FALSE,warning=FALSE}
library(robustbase) # for covMcd
```

```{r,fig.height=4,fig.width=8}
MCDphilips = robustbase::covMcd(data_philips)
indrowsMCD = which(mahalanobis(data_philips,MCDphilips$center,
                               MCDphilips$cov) > qchisq(0.975,df=9))

plot(sqrt(mahalanobis(data_philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.
# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()
```

```{r,fig.height=8,fig.width=6}
# cellMaps with rectangular blocks:

ggpMCDphilips = cellMap(data_philips, indrows=indrowsMCD,
                        mTitle="MCD", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)
plot(ggpMCDphilips)

ggpDDCphilips = cellMap(DDCphilips$stdResid, indrows=DDCphilips$indrows,
                        mTitle="DetectDeviatingCells", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)
plot(ggpDDCphilips)
# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()
```
***

# Mortality dataset
This dataset contains the mortality of French males in the years 1816 to 2013.

```{r}
data(data_mortality)
dim(data_mortality)
# 198  91
rownames(data_mortality)[1:5] 
colnames(data_mortality)[1:5] 

DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCmortality = DDC(data_mortality,DDCpars) # 1 second

remX = DDCmortality$remX
dim(remX)
```
## We also apply the rowwise method ROBPCA to detect outlying rows:
```{r,results='hide',message=FALSE,warning=FALSE}
library(rrcov) # contains ROBPCA
```

```{r,fig.height=8,fig.width=6}
PCAmortality = rrcov::PcaHubert(data_mortality,alpha=0.75,scale=FALSE)

ggpROBPCA = cellMap(remX, indrows=which(PCAmortality@flag==FALSE),
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5, rowtitle = "Years",
                    columntitle = "Age", sizetitles = 1.5,
                    drawCircles = TRUE)

plot(ggpROBPCA)

ggpDDC = cellMap(DDCmortality$stdResid, 
                 mTitle="DetectDeviatingCells", nrowsinblock=5,
                 ncolumnsinblock=5, rowtitle = "Years",
                 columntitle = "Age", sizetitles = 1.5)
plot(ggpDDC) # Leads to a detailed interpretation:

# pdf("cellmap_mortality.pdf",width=9,height=8)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()
```


To illustrate the ability to make row blocks and
column blocks of arbitrary sizes, we create the
following artificial groupings:

```{r,fig.height=8,fig.width=6}
rowblocksizes = c(84,14,5,21,6,35,33)
rowlabels = c("19th century","1900-1913","WW1","1919-1939",
"WW2","1946-1980","recent")
colblocksizes = c(5,5,10,10,10,10,10,10,10,10,1)
collabels = c("upto 4","5 to 9","10 to 19","20 to 29","30 to 39","40 to 49","50 to 59","60 to 69","70 to 79","80 to 89","90+")

ggpDDC = cellMap(DDCmortality$stdResid,
                 mTitle = "Cellmap with manual blocks",
                 manualrowblocksizes = rowblocksizes,
                 rowblocklabels = rowlabels,
                 manualcolumnblocksizes = colblocksizes,
                 columnblocklabels = collabels,
                 rowtitle = "Epochs",
                 columntitle = "Age groups",
                 sizetitles = 1.2)
# pdf("cellmap_mortality_manual_blocks.pdf",width=5,height=3)
plot(ggpDDC)
# dev.off()
```
***

# Glass dataset
The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

```{R}
data(data_glass)
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCglass = DDC(data_glass,DDCpars) # takes 8 seconds
remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

dim(remX)
```
## We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:
```{R}

fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(data_glass, fastDDCpars) # takes 2 seconds
remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:
```
## We also apply the rowwise method ROBPCA to detect outlying rows:
```{r,results='hide',message=FALSE,warning=FALSE}
library(rrcov) # contains ROBPCA
```

```{R,fig.height=4,fig.width=8}
PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/nrowsinblock)] = "n";

d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/ncolumnsinblock)] = "d"

ggpROBPCA = cellMap(matrix(0,n,d),
                    indrows=which(PCAglass@flag==FALSE),
                    rowblocklabels=rowlabels,
                    columnblocklabels=columnlabels,
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5,
                    rowtitle="glass samples",
                    columntitle="wavelengths",
                    sizetitles=1.2,
                    columnangle=0,
                    drawCircles = TRUE)
plot(ggpROBPCA)

ggpDDC = cellMap(DDCglass$stdResid,
                 indrows=DDCglass$indrows,
                 rowblocklabels=rowlabels,
                 columnblocklabels=columnlabels,
                 mTitle="DDC", nrowsinblock=5,
                 ncolumnsinblock=5,
                 rowtitle="glass samples",
                 columntitle="wavelengths",
                 sizetitles=1.2,
                 columnangle=0,
                 drawCircles = TRUE)
plot(ggpDDC)
# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()

ggpfastDDC = cellMap(fastDDCglass$stdResid,
                     indrows=fastDDCglass$indrows,
                     rowblocklabels=rowlabels,
                     columnblocklabels=columnlabels,
                     mTitle="fastDDC", nrowsinblock=5,
                     ncolumnsinblock=5,
                     rowtitle="glass samples",
                     columntitle="wavelengths",
                     sizetitles=1.2,
                     columnangle=0,
                     drawCircles = TRUE)
plot(ggpfastDDC)
# pdf("cellmap_glass_DDC_fastDDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()
```