1 Liver toxicity data

Load the dataset liver.toxicity available in the mixOmics package:

# Load the liver toxicity data shipped with the mixOmics package
library("mixOmics")
data("liver.toxicity")
help(liver.toxicity)

# Predictors: gene expression matrix; response: albumin level from the
# clinical table (exact-name extraction with [[ ]] instead of $)
x <- liver.toxicity[["gene"]]
y <- liver.toxicity[["clinic"]][["ALB.g.dL."]]

# Quick look at the dimensions and content
dim(x)
x[1:6, 1:6]
str(y)
  1. Build a RF with default parameters which computes the permutation variable importance index. Note its OOB error.
library(randomForest)
# Default regression RF (ntree = 500). importance = TRUE additionally
# computes the permutation variable importance used in the next questions.
rfDef <- randomForest(x, y, importance=TRUE)
# Printing the fit reports the OOB error ("Mean of squared residuals").
# NOTE: for regression, mtry defaults to floor(p/3), hence the 1038 below.
rfDef
## 
## Call:
##  randomForest(x = x, y = y, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1038
## 
##           Mean of squared residuals: 0.04233247
##                     % Var explained: 36.66
  1. Plot VI scores (sorted in decreasing order), then plot only the scores associated to the 100 most important variables.
    [The sort() function can be used to sort VI scores, and the index.return=TRUE must be specified to output the indices associated to the permutation performed during the sort]
# Permutation VI is the first column of the importance matrix (the forest
# was grown with importance = TRUE).
rfDefImp <- rfDef$importance[, 1]
# Sort the scores decreasingly; index.return = TRUE also returns the
# ranking of the variables in the $ix component (used for selection later).
rfDefImpSort <- sort(rfDefImp, decreasing=TRUE, index.return=TRUE)
# All sorted VI scores (seq_along() is safer than 1:ncol(x))
plot(seq_along(rfDefImpSort$x), rfDefImpSort$x, type = "l")

# Zoom on the 100 largest scores to look for an elbow
plot(seq_len(100), rfDefImpSort$x[seq_len(100)], type = "b")

  1. Find a subset of variables containing the most important variables, based on the previous graph (you can apply an elbow rule, like in PCA for example). Keep the indices of the selected variables. We note \(p_{\mathrm{sel}}\) the number of selected variables.
# Elbow rule on the previous plot: keep the p_sel top-ranked variables.
# Naming the cutoff (instead of hard-coding 14 twice) makes it easy to
# change and to reuse below.
p_sel <- 14
varsel <- rfDefImpSort$ix[seq_len(p_sel)]
xs <- x[, varsel]
varsel
##  [1] 2247 2864 2812 1382 1159 2939 2988 1389  941 2245 2308 2009 1063  488
# This choice is not very impartial; that is why (automatic) variable
# selection procedures exist.
  1. Build a RF only using the previously selected variables. Comment on the associated OOB error.
# RF restricted to the selected variables (default parameters;
# mtry = floor(14/3) = 4 here).
rfs <- randomForest(xs, y)
rfs
## 
## Call:
##  randomForest(x = xs, y = y) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.02898392
##                     % Var explained: 56.63
# This OOB error is biased, because we already used all the data to determine
# the subset of variables that are in the RF model.
# To better estimate the prediction error, we must do a cross-validation
# procedure as follows.
  1. Estimate the prediction error of a RF only using the \(p_{\mathrm{sel}}\) most important variables with a 4-fold cross-validation procedure.
    [Variables can vary from one fold to the other, only \(p_{\mathrm{sel}}\) is fixed]
# 4-fold CV of the whole "select then fit" pipeline: on each fold the
# p_sel most important variables are re-selected using the TRAINING fold
# only, so the test fold never influences selection nor fitting.
p_sel <- 14
# rep_len() covers every row even when nrow(x) is not a multiple of 4
cvInd <- sample(rep_len(1:4, nrow(x)))
pred_RF_sel <- rep(NA, nrow(x))

for (i in 1:4) {
  indi_test <- which(cvInd == i)
  xtrain <- x[-indi_test, ]
  ytrain <- y[-indi_test]
  xtest <- x[indi_test, ]

  # Rank variables by permutation VI computed on the training fold only
  # (importance = TRUE is required to get the permutation index)
  rf <- randomForest(xtrain, ytrain, importance = TRUE)
  imp <- rf$importance[, 1]
  imp.sort <- sort(imp, decreasing = TRUE, index.return = TRUE)
  varsel_i <- imp.sort$ix[seq_len(p_sel)]

  # Fit on the training fold restricted to the selected variables.
  # (Using the full x and y here would leak the test fold into training
  # and badly underestimate the prediction error.)
  rf_sel <- randomForest(xtrain[, varsel_i], ytrain)

  pred_RF_sel[indi_test] <- predict(rf_sel, xtest[, varsel_i])
}

errCV_RF_sel <- mean((pred_RF_sel - y)^2)
errCV_RF_sel
## NOTE(review): the 0.008028667 printed in the original document came from
## a leaky variant (fold models trained on the full data); with the
## corrected per-fold training the estimated error is larger.
# This prediction error estimate is still slightly biased, because we
# already used all the data to determine the number of variables to select.
#
# To conclude, to get an unbiased prediction error estimate, a
# cross-validation procedure must be applied, but on each fold a fully
# automatic variable selection procedure must be used, as VSURF for example.

2 Ozone data

Load the dataset, available in the mlbench package:

# Load the Ozone data shipped with the mlbench package
library("mlbench")
data("Ozone")
# Inspect the structure (variables V1-V13, several contain missing values)
str(Ozone)
  1. Build a RF with default parameters which computes the permutation variable importance index.
library(randomForest)
# RF regression predicting V4 (the ozone reading — see ?Ozone in mlbench
# for the exact variable description) from all other variables.
# na.action = na.omit drops every observation with at least one missing
# value before fitting; importance = TRUE computes the permutation VI.
rfOz <- randomForest(V4~., Ozone, na.action = na.omit, importance = TRUE)
  1. Plot VI scores (sorted in decreasing order). Interpret the result.
# Column 1 of $importance is the permutation importance, since the forest
# was grown with importance = TRUE.
rfOzImp <- rfOz$importance[, 1]
# index.return = TRUE also yields the ranking in $ix; only the sorted
# scores ($x) are used below.
rfOzImpSort <- sort(rfOzImp, decreasing = TRUE, index.return = TRUE)
# Decreasing barplot of the VI scores; bars keep the variable names,
# which makes the most influential predictors easy to read off.
barplot(rfOzImpSort$x)