Hi Samira,
Here is an example of performing cross-validation on
`h2o.ensemble`. Right now you can run this as a stand-alone
function, but in the future you can expect an "nfolds" argument to
be added to `h2o.ensemble`, similar to the other algos. This
function runs the cross-validation process in a simple loop.
I have only tested with binary classification (demo below), but it
should work for regression as well. Let me know if you have any
issues.
kfold_h2o_ensemble <- function(x, y, training_frame, family,
                               learner, metalearner, nfolds = 5,
                               cvControl = list(V = 5, shuffle = TRUE),
                               seed = 1, fold_column = NULL) {
  # Cross-validate an h2o.ensemble fit using an external k-fold loop.
  #
  # Args:
  #   x, y:           predictor column names and response column name.
  #   training_frame: H2O Frame holding the full training data.
  #   family:         "binomial" or "gaussian".
  #   learner:        character vector of base learner wrapper names.
  #   metalearner:    name of the metalearner wrapper function.
  #   nfolds:         number of outer cross-validation folds.
  #   cvControl:      control list for the inner CV inside h2o.ensemble.
  #   seed:           if numeric, seeds the fold assignment and each fit.
  #   fold_column:    optional name of a column in training_frame that
  #                   already holds fold ids; overrides random folds.
  #
  # Returns: list(models = per-fold ensemble fits,
  #               folds  = 1-col H2O Frame of fold ids,
  #               preds  = H2O Frame of cross-validated predictions).
  #
  # Note: Only tested on a binary classification ensemble
  # TO DO: Test regression

  # Create the cross-validation folds (for external cross-validation)
  N <- nrow(training_frame)
  # If seed is specified, set seed prior to the fold assignment
  if (is.numeric(seed)) set.seed(seed)
  if (is.null(fold_column)) {
    # 1-col H2O Frame of fold ids, one per row
    folds <- as.h2o(sample(rep(seq(nfolds), ceiling(N / nfolds)))[1:N])
  } else {
    folds <- training_frame[, c(fold_column)]
  }

  # For storing results
  models <- list()
  preds <- h2o.createFrame(rows = N, cols = 1,
                           randomize = FALSE,
                           value = 0.0,
                           categorical_fraction = 0.0,
                           integer_fraction = 0.0,
                           missing_fraction = 0.0)

  for (k in seq_len(nfolds)) {
    print(paste0("Begin outer cross-validation loop: ", k, " of ", nfolds))
    # Train an ensemble model on folds != k
    fold_idx_test <- which(as.data.frame(folds == k)[, 1] == 1)
    fold_idx_train <- which(as.data.frame(folds == k)[, 1] == 0)
    fold_train <- training_frame[fold_idx_train, ]
    fold_test <- training_frame[fold_idx_test, ]
    fold_fit <- h2o.ensemble(x = x, y = y,
                             training_frame = fold_train,
                             family = family,
                             learner = learner,
                             metalearner = metalearner,
                             cvControl = cvControl,
                             seed = seed)
    # Generate predictions on the held-out fold
    pp <- predict.h2o.ensemble(fold_fit, fold_test)
    # Insert preds into the rows belonging to fold k
    if (family == "binomial") {
      preds[fold_idx_test, ] <- pp$pred$p1
    } else if (family == "gaussian") {
      preds[fold_idx_test, ] <- pp$pred$predict
    }
    # Collect models
    models[[length(models) + 1]] <- fold_fit
  }

  # Return the results
  list(models = models, folds = folds, preds = preds)
}
# An example of binary classification on a local machine, which
# cross-validates h2o.ensemble
library(h2oEnsemble)  # Requires version >=0.0.4 of h2oEnsemble
library(cvAUC)        # Calculates test set AUC (requires version >=1.0.1 of cvAUC)

# Start an H2O cluster with nthreads = number of cores on your machine
localH2O <- h2o.init(nthreads = -1)

# Import a sample binary outcome train/test set into H2O
train <- h2o.importFile(
  "http://www.stat.berkeley.edu/~ledell/data/higgs_10k.csv")

# Identify response variable and predictor cols
y <- "C1"
x <- setdiff(names(train), y)

# Convert response to a categorical (for binary classification)
family <- "binomial"
train[, y] <- as.factor(train[, y])
# Specify the base learner library & the metalearner
# Let's use a reproducible library (set seed on RF and GBM):
h2o.randomForest.1 <- function(..., ntrees = 100, seed = 1) {
  h2o.randomForest.wrapper(..., ntrees = ntrees, seed = seed)
}
h2o.gbm.1 <- function(..., ntrees = 100, seed = 1) {
  h2o.gbm.wrapper(..., ntrees = ntrees, seed = seed)
}
learner <- c("h2o.glm.wrapper", "h2o.randomForest.1", "h2o.gbm.1")
metalearner <- "h2o.glm.wrapper"
# Cross-validate the ensemble with nfolds = 5
# nfolds relates to outer loop cross-validation (cross-validate the ensemble)
# cvControl$V relates to inner loop cross-validation (inside the ensemble)
# Note: nfolds and cvControl$V do not have to be the same number
cve <- kfold_h2o_ensemble(x = x, y = y,
                          training_frame = train,
                          family = family,
                          learner = learner,
                          metalearner = metalearner,
                          nfolds = 5,
                          cvControl = list(V = 5, shuffle = TRUE),
                          seed = 1)

# If we want to calculate the CV AUC of the ensemble,
# we can use the cvAUC package, but that requires us to
# pull the preds, labels and folds into R as follows
library(cvAUC)
folds <- as.data.frame(cve$folds)[, 1]    # Folds vector
preds <- as.data.frame(cve$preds)[, 1]    # Cross-validated predicted values
labels <- as.data.frame(train[, y])[, 1]  # Response vector
auc <- cvAUC(predictions = preds, labels = labels, folds = folds)
auc
# $fold.AUC
# [1] 0.7822530 0.7957805 0.7834457 0.7837479 0.7821255
#
# $cvAUC
# [1] 0.7854705