PARTIAL CORRECTION

1 Random Forests basics

Load the dataset liver.toxicity available in the mixOmics package:

library("mixOmics")
# install.packages('mixOmics')
data("liver.toxicity")
help(liver.toxicity)
x <- liver.toxicity$gene
y <- liver.toxicity$clinic$ALB.g.dL.
dim(x)
x[1:6, 1:6]
str(y)
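
Every result below involves random resampling, so the printed numbers will vary from one run to the next. If reproducible results are wanted, a seed can be fixed first (the seed value below is arbitrary and not part of the original correction):

set.seed(1234)  # arbitrary seed, only needed for reproducibility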
  1. Load the randomForest package and consult the help page of the randomForest() function:

    library("randomForest")
    help(randomForest)

    Using the randomForest() function, build a Bagging predictor, named bag, made of maximal trees. Keep all other parameters at their default values for now.
    Determine its OOB error from the printed output.

    bag <- randomForest(x, y, mtry = ncol(x))
    bag
    ## 
    ## Call:
    ##  randomForest(x = x, y = y, mtry = ncol(x)) 
    ##                Type of random forest: regression
    ##                      Number of trees: 500
    ## No. of variables tried at each split: 3116
    ## 
    ##           Mean of squared residuals: 0.04337727
    ##                     % Var explained: 35.1
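
    The OOB error printed above can also be read directly from the fitted object: for regression, the mse component stores the OOB mean squared error computed after each tree (a minimal sketch, not part of the original correction):

    bag$mse[bag$ntree]  # OOB MSE after the last tree; matches the printed value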
  2. Now build a random forest predictor, named rf, which randomly selects \(p/3\) variables (the default value) at each node before optimizing the split.
    Compare its OOB error with the previous one.
    Apply the plot() function to the object rf.

    rf <- randomForest(x, y)
    rf
    ## 
    ## Call:
    ##  randomForest(x = x, y = y) 
    ##                Type of random forest: regression
    ##                      Number of trees: 500
    ## No. of variables tried at each split: 1038
    ## 
    ##           Mean of squared residuals: 0.04317001
    ##                     % Var explained: 35.41
    plot(rf)

  3. Tune the number of trees in the forest, while keeping the number of variables selected at each node fixed to its default value.
    Use the previous plot to guide your choice of tested values.

    # The previous plot (which displays the OOB mean squared error as a function
    # of the number of trees) shows that fewer than 100 trees is too few. Again
    # from the plot, we see that a number of trees between 200 and 500 is
    # reasonable. We can try to increase the number of trees and see what
    # happens (take care of the computational time for this example):
    
    rf1000 <- randomForest(x, y, ntree = 1000)
    plot(rf1000)

    # The OOB error is very stable once enough trees are grown, roughly 200 for
    # this example. Hence, to limit the computational time (which can be quite
    # large for this dataset), we fix the number of trees to 200 in what follows.
    
    nbtreeopt <- 200
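
    The stability claim can also be checked numerically by looking at the OOB MSE recorded after a few tree counts (a short sketch; the resulting values are not reproduced here):

    rf1000$mse[c(100, 200, 500, 1000)]  # OOB MSE after 100, 200, 500 and 1000 trees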
  4. Now tune the number of variables selected at each node, while fixing the number of trees to the value found in question 3.

    # The default value of mtry is 1038 in our case. We only test a few values
    # of mtry here because of the large computational time.
    
    p <- ncol(x)
    nbvars <- floor(c(sqrt(p), p/10, p/5, p/3, p/2, 2 * p/3))
    oobs_mtry <- rep(NA, length(nbvars))
    for (nbv in seq_along(nbvars)) {
        rf <- randomForest(x, y, ntree = nbtreeopt, mtry = nbvars[nbv])
        oobs_mtry[nbv] <- rf$mse[nbtreeopt]
    }
    cbind(nbvars, oobs_mtry)
    ##      nbvars  oobs_mtry
    ## [1,]     55 0.05222704
    ## [2,]    311 0.04813764
    ## [3,]    623 0.04786281
    ## [4,]   1038 0.04465826
    ## [5,]   1558 0.04646818
    ## [6,]   2077 0.04485864
    plot(nbvars, oobs_mtry, type = "l")

    nbvaropt <- nbvars[which.min(oobs_mtry)]
    nbvaropt
    ## [1] 1038
    oobs_mtry[which.min(oobs_mtry)]
    ## [1] 0.04465826
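
    As an alternative to the manual loop above, the randomForest package provides the tuneRF() helper, which searches mtry by successively multiplying or dividing it by stepFactor for as long as the OOB error keeps improving. A sketch (the parameter values here are illustrative, not those of the original correction):

    # automatic mtry search, starting from the default value p/3
    tunemtry <- tuneRF(x, y, ntreeTry = nbtreeopt, stepFactor = 2, improve = 0.01)
    tunemtry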
  5. Build a random forest predictor with the following parameter values:

    (replace = FALSE, sampsize = nrow(x), mtry = ncol(x), ntree = 10, maxnodes = 5)

    What are the characteristics of this RF?
    Look carefully at the forest component of the resulting object (which is a list) and figure out what its content means.

    # The "bootstrap" samples are actually not bootstrap samples here: draws are
    # made without replacement, and the number of observations drawn is equal to
    # the total number of observations in the dataset. So, in this case, all
    # trees are built on the full original dataset.
    
    # Furthermore, all variables are candidates at each node because mtry = p.
    
    # Finally, the forest contains 10 trees, and each tree has at most 5 leaves.
    
    # If we look at the details of the resulting object, we see that the trees
    # are not all identical, even though they should be! This phenomenon may
    # come from numerical precision issues in R...
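
    A sketch of the construction described above, together with two ways of inspecting the forest component (getTree() is part of the randomForest package; the object name rf5 is ours):

    rf5 <- randomForest(x, y, replace = FALSE, sampsize = nrow(x),
                        mtry = ncol(x), ntree = 10, maxnodes = 5)
    # forest is a list storing, for each tree, the split variables, split points,
    # node predictions, left/right children indices, etc.
    str(rf5$forest, max.level = 1)
    # readable display of the first tree (labelVar = TRUE shows variable names)
    randomForest::getTree(rf5, k = 1, labelVar = TRUE)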

2 Prediction error estimate using cross-validation

The goal of this section is to estimate the prediction error of an RF and of a regression tree on the Liver toxicity data using 10-fold cross-validation.

  1. Randomly assign each observation to one of the folds of the 10-fold cross-validation.

    n <- nrow(x)
    cvInd <- sample(rep(1:10, floor(n/10) + 1))[1:n]
    cvInd
    ##  [1]  1  4  5  3  4  8 10  4  2  2  9  5  7  4  1  3  9  3 10  8  8  5 10
    ## [24]  2  6 10  8  6  5  4  8  9  2  6 10  5  8  1  9  7  9  2  1  5  6  3
    ## [47]  6  4 10  6  7  1  6  3  1  4  3  8  5  1  7  7 10  3
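
    A quick sanity check (not in the original correction): the folds should all contain roughly n/10 observations.

    table(cvInd)  # number of observations assigned to each of the 10 folds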
  2. Build an RF and a CART tree on all Liver toxicity data except the observations belonging to the first fold.

    indi_test1 <- which(cvInd == 1)
    xtrain1 <- x[-indi_test1, ]
    ytrain1 <- y[-indi_test1]
    
    rf1 <- randomForest(xtrain1, ytrain1)
    df1 <- data.frame(xtrain1, albu = ytrain1)
    tree1 <- rpart::rpart(albu ~ ., df1)
  3. Predict the observations belonging to the first fold with both predictors (use the predict() method for this).

    xtest1 <- x[indi_test1, ]
    ytest1 <- y[indi_test1]
    predrf1 <- predict(rf1, xtest1)
    predtree1 <- predict(tree1, xtest1)
    cbind(predrf1, predtree1, ytest1)
    ##        predrf1 predtree1 ytest1
    ## ID202 4.934940  4.988889    4.9
    ## ID221 5.013387  4.894118    4.7
    ## ID407 5.188960  5.315385    5.4
    ## ID416 5.135990  5.315385    5.1
    ## ID506 5.267393  5.315385    5.2
    ## ID510 4.712023  4.571429    4.6
    ## ID518 5.197327  5.315385    5.4
  4. Compute the 10-fold cross-validation error estimate for the RF and the CART tree. Comment.

    pred_RF_def <- rep(NA, n)
    pred_tree_def <- rep(NA, n)
    
    for (i in 1:10) {
        indi_test <- which(cvInd == i)
        xtrain <- x[-indi_test, ]
        ytrain <- y[-indi_test]
        xtest <- x[indi_test, ]
    
        rf <- randomForest(xtrain, ytrain)
        df <- data.frame(xtrain, albu = ytrain)
        arbre <- rpart::rpart(albu ~ ., df)
    
        pred_RF_def[indi_test] <- predict(rf, xtest)
        pred_tree_def[indi_test] <- predict(arbre, xtest)
    }
    
    errCV_RF_def <- mean((pred_RF_def - y)^2)
    errCV_tree_def <- mean((pred_tree_def - y)^2)
    
    cbind(errCV_RF_def, errCV_tree_def)
    ##      errCV_RF_def errCV_tree_def
    ## [1,]   0.04511597     0.08454725
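
    A brief comment, as the question asks (values taken from the output above): the forest's cross-validation error (about 0.045) is roughly half that of the single CART tree (about 0.085), and it is consistent with the OOB errors obtained in Section 1.

    errCV_tree_def / errCV_RF_def  # about 1.9: the tree's CV error is nearly twice the forest's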