I am performing nested resampling using the following code:
# Nested resampling: recursive feature elimination (inner loop) around a
# ranger classifier, scored by an outer repeated cross-validation.

# Data: the first CSV column supplies the row names; Status is the target.
MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
# Standardise columns 2..4399 in place (column 1 is left untouched).
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

# Base learner, restricted to 8 threads.
learner <- lrn(
  "classif.ranger",
  importance = "impurity",
  num.trees = 10000
)
set_threads(learner, n = 8)

# Inner-loop settings: no terminator budget, F1 as the score.
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)
measure <- msr("classif.fbeta", beta = 1, average = "micro")
terminator <- trm("none")

at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  )
)

# Outer loop; store_models is not passed here.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer)
At the end of the run, I get the following message:
Error: No model stored
If I specify store_models=TRUE, the run crashes for this large model due to RAM consumption. I am currently running the models on RStudio Workbench with 128GB of RAM. I had read that leaving out the store_models argument would avoid storing the intermediate models and so reduce RAM consumption, while still letting me extract predictions and some performance measures to report. However, that is when I ran into the error above. I haven't tried setting store_backends=FALSE. Would this potentially help in any way?
Any assistance/insight into where I may be going wrong and how to adjust parameters to get this nested resampling to run and be able to extract predictions/performance measures would be very helpful. Thanks!
You can try the new version. Install the development version with remotes::install_github("mlr-org/mlr3fselect#model"). The models are now saved until the importance is extracted. So if you use holdout resampling you need enough RAM for one model.
If I run the following code it uses about 30GB RAM (I have 128GB on my RStudio Pro System):
# Stand-alone RFE run (no outer resampling) with model storage switched on.

MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

# Set the log threshold of both the mlr3 and bbotk loggers to "info".
lgr::get_logger("mlr3")$set_threshold("info")
lgr::get_logger("bbotk")$set_threshold("info")

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

# Probability-predicting ranger learner with permutation importance.
learner <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "permutation",
  num.trees = 10000
)
set_threads(learner, n = 8)

measure <- msr("classif.fbeta", beta = 1, average = "micro")
terminator <- trm("none")
resampling <- rsmp("repeated_cv", folds = 10, repeats = 10)

instance <- FSelectInstanceSingleCrit$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = measure,
  terminator = terminator,
  store_models = TRUE
)

fselector <- fs(
  "rfe",
  n_features = 1,
  feature_fraction = 0.5,
  recursive = FALSE
)
fselector$optimize(instance)
When I would run the following code as part of nested resampling prior to the package update it would max out RAM and crash:
# Nested RFE with models stored in BOTH the AutoFSelector and resample().

MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

learner <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "permutation",
  num.trees = 10000
)
set_threads(learner, n = 8)

# Inner loop: repeated CV, F1 score, no terminator budget.
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)
measure <- msr("classif.fbeta", beta = 1, average = "micro")
terminator <- trm("none")

at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = TRUE
)

# Outer loop, also asked to keep its models.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer, store_models = TRUE)
Also, when I would run the following code as part of nested resampling prior to the package update it would max out RAM and crash:
# Nested RFE: model storage off in the AutoFSelector, on in resample().

MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

learner <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "permutation",
  num.trees = 10000
)
set_threads(learner, n = 8)

terminator <- trm("none")
measure <- msr("classif.fbeta", beta = 1, average = "micro")
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)

# Inner feature selection does not keep its models.
at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = FALSE
)

# Outer resampling keeps the fitted AutoFSelector of every iteration.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer, store_models = TRUE)
When I would run the following code as part of nested resampling prior to the package update it would also use about 30GB of RAM but wouldn't have any of the inner FSelect results:
# Nested RFE: model storage on in the AutoFSelector, not requested in
# resample().

MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

learner <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "permutation",
  num.trees = 10000
)
set_threads(learner, n = 8)

measure <- msr("classif.fbeta", beta = 1, average = "micro")
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)
terminator <- trm("none")

at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = TRUE
)

# store_models is left at its default for the outer loop.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer)
Then if I run the following code with the GitHub version of the package, the RAM maxes out and crashes:
# Nested RFE as run against the GitHub build of the package: inner
# store_models = FALSE, outer store_models = TRUE.

MSvCon <- read.csv("MS v Control Proteomics Final.csv", row.names = 1)
MSvCon$Status <- as.factor(MSvCon$Status)
MSvCon[, 2:4399] <- scale(MSvCon[, 2:4399], center = TRUE, scale = TRUE)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(MSvCon, target = "Status")

# 10000-tree ranger forest, probability predictions, permutation importance.
learner <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "permutation",
  num.trees = 10000
)
set_threads(learner, n = 8)

resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)
terminator <- trm("none")
measure <- msr("classif.fbeta", beta = 1, average = "micro")

at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = FALSE
)

resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer, store_models = TRUE)
That is why I was thinking the nested resampling was the cause of the issue. Any thoughts on any calls I should tweak? Maybe store_backends?
Thanks for all your help!
I have generated a nested resampling object with the following code:
# Nested resampling on Data.csv: RFE in the inner loop, repeated CV outside.

# The first CSV column supplies the row names; `factor` is the class label.
data <- read.csv("Data.csv", row.names = 1)
data$factor <- as.factor(data$factor)

set.seed(123, "L'Ecuyer")
task <- as_task_classif(data, target = "factor")

learner <- lrn(
  "classif.ranger",
  importance = "impurity",
  num.trees = 10000
)

# Inner loop configuration.
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)
measure <- msr("classif.fbeta", beta = 1)
terminator <- trm("none")

at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = TRUE
)

# Outer loop.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
rr <- resample(task, at, resampling_outer)
I have a .csv file with the factor variable permuted/randomized and would like to apply the models of the nested resampling paradigm to this dataset so I can demonstrate differences in model performance between the real dataset and the permuted/randomized dataset. I am interested in this as a way to validate predictive performance, because when sample sizes are small (which is common in biological contexts) prediction accuracy by chance alone can approach 70% or higher according to this paper (https://pubmed.ncbi.nlm.nih.gov/25596422/).
How would I do this using the resample object (rr)?
I think I figured out how to do it (do let me know if I went wrong somewhere):
# Benchmark the same nested-resampling learner on the real labels and on a
# label-permuted copy of the data, to show what performance chance alone gives.

# The first CSV column supplies the row names; `factor` is the class label.
data <- read.csv("Data.csv", row.names = 1)
data$factor <- as.factor(data$factor)

# Seed before any random operation so the permutation is reproducible.
set.seed(123, "L'Ecuyer")

# The permuted set was previously read from "Data.csv" again, which made both
# tasks identical. Shuffle the labels of a copy instead; features stay as-is.
# (If a pre-permuted CSV exists, read that file here instead.)
permuted <- data
permuted$factor <- sample(permuted$factor)

task1 <- as_task_classif(data, target = "factor")
task2 <- as_task_classif(permuted, target = "factor")
task_list <- list(task1, task2)

learner <- lrn(
  "classif.ranger",
  importance = "impurity",
  num.trees = 10000
)
measure <- msr("classif.fbeta", beta = 1)
terminator <- trm("none")
resampling_inner <- rsmp("repeated_cv", folds = 10, repeats = 10)

# Inner loop: recursive feature elimination wrapped around the learner.
at <- AutoFSelector$new(
  learner = learner,
  resampling = resampling_inner,
  measure = measure,
  terminator = terminator,
  fselect = fs(
    "rfe",
    n_features = 1,
    feature_fraction = 0.5,
    recursive = FALSE
  ),
  store_models = TRUE
)

# Outer loop, applied to both tasks through a benchmark design.
resampling_outer <- rsmp("repeated_cv", folds = 10, repeats = 10)
design <- benchmark_grid(
  task = task_list,
  learner = at,
  resampling = resampling_outer
)
bmr <- benchmark(design, store_models = TRUE)
Am I right in assuming that you have two tasks t1 and t2, where the task t2 is permuted and you wanted to compare the performance of a learner on these two tasks?
The way to go then is to use the benchmark() function instead of the resample function. You would have to create two different tasks (one permuted and one not permuted).
You might find the section Resampling and Benchmarking in our book helpful.
Hopefully a simple question but incredibly annoying lack of information in the mlr3 book! So I have a tuned learner (regr.bart) that I want to simply set one hyperparameter to a fixed (not tuned) value. The param in question is 'verbose' which annoyingly is set to TRUE so I get flooded with stupid messages I do not want. I cannot find a simple example where I can set verbose to FALSE. Please help.
library(mlr3tuning)
# keep_model is fixed at FALSE; only cp is marked for tuning via to_tune().
learner <- lrn(
  "classif.rpart",
  keep_model = FALSE,
  cp = to_tune(0.001, 0.1)
)

# Random search: 10 evaluations in batches of 5, scored by 3-fold CV error.
tune(
  method = "random_search",
  learner = learner,
  task = tsk("pima"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  batch_size = 5,
  term_evals = 10
)
or
library(mlr3tuning)
# Fix keep_model directly on the learner's parameter set ...
learner <- lrn("classif.rpart")
learner$param_set$values$keep_model <- FALSE

# ... and describe the tuned parameter (cp) in a separate search space.
search_space <- ps(cp = p_dbl(lower = 0.001, upper = 0.1))

# Random search: 10 evaluations in batches of 5, scored by 3-fold CV error.
tune(
  method = "random_search",
  learner = learner,
  search_space = search_space,
  task = tsk("pima"),
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  batch_size = 5,
  term_evals = 10
)
I am trying to fit coxph and parametric models and simultaneously perform feature selection and hyperparameter tuning. In the code below I can use either auto_fselector or auto_tuner inside resample, but not both. How do I combine them? Do I need three levels of nested resampling (inner for feature selection, middle for tuning and outer for performance evaluation)? In mlr this was easily done by applying a feature selection wrapper and then a tuning wrapper, but I am not sure how it is best done in mlr3.
I also want to get the selected features at the end. It seems learner$selected_features() does not work for survival models
# Survival example: feature selection and tuning wrappers around coxph,
# evaluated by an outer 10-fold CV.
task <- tsk("rats")
learner <- lrn("surv.coxph")

# The outer resampling may be instantiated on the full task.
outer_cv <- rsmp("cv", folds = 10)$instantiate(task)
# The inner resampling must NOT be instantiated: the wrappers apply it to the
# training portion of each outer fold, not to the full task.
inner_cv <- rsmp("cv", folds = 10)

# NOTE(review): "x" looks like a placeholder measure id — replace it with a
# real survival measure before running.
# Fixed function name: auto_fselector (was misspelled auto_fselecter).
Feat_select <- auto_fselector(
  method = "random_search",
  learner = learner,
  resampling = inner_cv,
  measure = msr("x"),
  term_evals = 200
)

# NOTE(review): the search space is empty, so there is nothing to tune yet.
model_tune <- auto_tuner(
  method = "irace",
  learner = learner,
  resampling = inner_cv,
  measure = msr("x"),
  search_space = ps()
)

# Only the tuner is resampled here; Feat_select is not part of this call.
model_res <- resample(task, model_tune, outer_cv, store_models = TRUE)
# Attempts to query selected features; the "Error:" lines are the console
# output reported for the call directly above each of them.
task = tsk("rats")
# Pipeline (encode -> cv_glmnet) converted into a learner.
learner2 = as_learner(po("encode") %>>% lrn("surv.cv_glmnet"))
learner2$selected_features()
Error: attempt to apply non-function
learner3 = mlr3extralearners::lrn("surv.rsfsrc")
# NOTE(review): this calls `learner`, although `learner3` was just created —
# confirm which object was intended.
learner$selected_features()
Error: attempt to apply non-function
You can nest AutoTuner and AutoFSelector in mlr3:
library(mlr3tuning)
library(mlr3fselect)
# Nest an AutoTuner inside an AutoFSelector, then resample the combination.
task <- tsk("pima")

# Innermost layer: tune cp of rpart by random search (5 evaluations).
at <- auto_tuner(
  method = "random_search",
  learner = lrn("classif.rpart", cp = to_tune(0.01, 0.1)),
  measure = msr("classif.ce"),
  resampling = rsmp("cv", folds = 3),
  term_evals = 5
)

# Middle layer: feature selection whose learner is the AutoTuner above.
afs <- auto_fselector(
  method = "random_search",
  learner = at,
  measure = msr("classif.ce"),
  resampling = rsmp("cv", folds = 3),
  term_evals = 5
)

# Outer layer: 3-fold CV; models are stored so the inner feature-selection
# results can be extracted afterwards.
rr <- resample(
  task,
  afs,
  resampling = rsmp("cv", folds = 3),
  store_models = TRUE
)
extract_inner_fselect_results(rr)
The way to create stratified folds for cv in caret is like this
library(caret)
library(data.table)
# Toy data for a grouped 5-fold CV with caret.
# NOTE(review): group/x1/x2 supply 15 values each while label supplies 30 —
# confirm the intended number of rows.
train_dat <- data.table(
  group = c(rep("group1", 10), rep("group2", 5)),
  x1 = rnorm(15),
  x2 = rnorm(15),
  label = factor(c(rep("treatment", 15), rep("control", 15)))
)

# Folds are balanced on `group`. trainControl(index = ) expects the rows used
# for TRAINING in each iteration, whereas createFolds() returns the held-out
# rows by default — so request training indices explicitly.
folds <- createFolds(train_dat[, group], k = 5, returnTrain = TRUE)

# Use TRUE/FALSE rather than the reassignable shorthands T/F.
fitCtrl <- trainControl(
  method = "cv",
  index = folds,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Fit on every column except `group`, optimising ROC.
train(
  label ~ .,
  data = train_dat[, !c("group"), with = FALSE],
  trControl = fitCtrl,
  method = "xgbTree",
  metric = "ROC"
)
To balance group1 and group2, the creation of fold indexes is based on "group" variable.
However, is there any way to create such folds for repeatedcv in caret, so that I can have a balanced split for repeatedcv as well? Should I combine the results of several createFolds calls and pass them to trainControl?
# NOTE(review): sketch only — `many_repeated_folds` is not defined in this
# snippet; it stands for the combined fold list asked about above.
trControl = trainControl(method = "cv", index = many_repeated_folds)
Thanks!
createMultiFolds is probably what you are interested in.