Need help getting method = cforest to work within train() from caret using leave one out cross-validation - random-forest

examples_dataset.csv
I have tried looking up so many ways to fix this issue, but no solution so far. I am trying to train conditional inference forests with caret, using the leave one out cross-validation method. I have about 20 (larger) datasets to run this method on, hence the functions to automate some.
A lot of what I have found suggests that my QuantBins are not factors, but I have checked after running prep_df() on the df and those are indeed factors. I get an error when running the conditional inference forests (cif_model()), but not with random forests (rf_model()). The output from trying to make that model is "Something is wrong; all the Accuracy metric values are missing" (pictured below).
Any help and guidance is appreciated!
## Example code
## GOAL: create train() code from caret that uses conditional inference forests to assess variable importance with categorical dependent variable using leave one out cross validation
rm(list=ls())
setwd()
ex.all <- read.csv("examples_dataset.csv", header = TRUE)
loo_ctrl <- trainControl(method = "LOOCV")
#This function works!
rf_model <- function(file.name) {
model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p
+ P_Cou, data = file.name, method = "rf", trControl = loo_ctrl, tuneLength = 10, control =
rpart.control(minbucket = 10), ntree = 50)
return(model)
}
#This does not.
cif_model <- function(file.name) {
model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p
+ P_Cou, data = file.name, method = "cforest", trControl = loo_ctrl, tuneLength = 10, control
= ctree_control(minbucket = 10), ntree = 50)
return(model)
}
##### functions used #####
prep_df <- function(file.name) {
file.name$BINARY <- ifelse(file.name$TOTAL >= 1, "yes", "no")
file.name$BINARY <- as.factor(file.name$BINARY)
file.name$L_u <- as.factor(file.name$L_u)
file.name$TOTAL <- as.numeric(file.name$TOTAL)
## Quantile distribution of breaks in Total Fruit
numbers_of_bins = 5 #this will return four groups
file.name <- file.name %>% mutate(QuantBins = cut(TOTAL, breaks = unique(quantile(TOTAL,
probs=seq.int(0,1, by=1/numbers_of_bins))), include.lowest=TRUE))
print(length(levels(file.name$QuantBins)))
temp <- levels(file.name$QuantBins)
file.name$QuantBins <- as.character(file.name$QuantBins)
for(i in 1:length(file.name$QuantBins)) {
temp1 <- strsplit(file.name$QuantBins[i], ",")
temp2 <- strsplit(temp1[[1]][1], "\\(")
temp3 <- strsplit(temp1[[1]][[2]], "\\]")
file.name$QuantBins[i] <- paste("Fruit", temp2[[1]][2], "to", temp3[[1]][1])
}
file.name$QuantBins <- as.factor(file.name$QuantBins)
file.name$QuantBins <- droplevels(file.name$QuantBins)
print(length(levels(file.name$QuantBins)))
return(file.name)
}
##### running trees #####
ex.all <- prep_df(ex.all)
ex.rf <- rf_model(ex.all)
print(ex.rf)
ex.rf
ex.rf$finalModel$importance
ex.cf <- cif_model(ex.all)
print(ex.cf)
ex.cf
ex.cf$finalModel$importance
Error using cif_model(ex.all) showing "Something is wrong; all the Accuracy metric values are missing"

Related

Clustered resampling for inner layer of Caret recursive feature elimination

I have data where IDs are contained within clusters.
I would like to perform recursive feature elimination using Caret's rfe function which performs the following procedure:
Clustered resampling for the outer layer (line 2.1) is straightforward, using the index parameter.
However, within each outer resample, I would like to tune tuning parameters using cluster-based cross-validation (inner resampling) (line 2.9). Model tuning in the inner layer is possible by specifying a tuneGrid in rfe and having an appropriate trControl. It is this trControl that I would like to change to allow clustered resampling.
The outer resampling is specified in the rfeControl parameter of rfe.
The inner resampling is specified by trControl of rfe which is passed to train.
The trouble I am having is that I can't seem to specify any inner indices, because after the outer resampling, those indices are no longer valid or no longer present in the outer-resampled data.
I am looking for a way to tell train to take an outer resample (which will be missing a cluster against which to validate), and to tune the model using inner resampling by based on folds of the remaining clusters.
The MWE is as minimal as possible:
library(caret)
library(tidyverse)
library(parallel)
library(doParallel)
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
### Create some random data, 10 features, with some influence over a binomial outcome
set.seed(42)
id <- 1:1000
cluster <- rep(1:10, each = 100)
dat <- data.frame(id, cluster, replicate(10,rnorm(n = 1000, mean = runif(1, 0,100)+cluster, sd = runif(1, 0,20))))
dat <- dat %>% mutate(temp = rowSums(across(X1:X10)), prob = range01(temp), outcome = rbinom(n = nrow(dat), size = 1, prob = prob))
dat$outcome <- as.factor(dat$outcome)
levels(dat$outcome) <- c("control", "case")
dat$outcome <- factor(dat$outcome, levels=rev(levels(dat$outcome)))
### Manual outer folds-based cluster ###
for(i in 1:10) {
assign(paste0("index", i), which(dat$cluster!=i))
}
unit_indices <- list(index1, index2, index3, index4, index5, index6, index7, index8, index9, index10)
### Inner resampling method (THIS IS WHAT I'D LIKE TO CHANGE) ###
cv5 <- trainControl(classProbs = TRUE, method = "cv", number = 5, allowParallel = F) ## Is there a way to have inner cluster-based resampling WITHIN the outer cluster-based resampling?
caret_rfe_functions <- list(summary = twoClassSummary,
fit = function (x, y, first, last, ...) {
train(x, y, ...)
},
pred = caretFuncs$pred,
rank = function(object, x, y) {
vimp <- varImp(object)$importance
vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
vimp$var <- rownames(vimp)
vimp
},
selectSize = function (x, metric = "ROC", tol = 1, maximize = TRUE)
{
if (!maximize) {
best <- min(x[, metric])
perf <- (x[, metric] - best)/best * 100
flag <- perf <= tol
}
else {
best <- max(x[, metric])
perf <- (best - x[, metric])/best * 100
flag <- perf <= tol
}
min(x[flag, "Variables"])
},
selectVar = caretFuncs$selectVar)
caret_rfe_ctrl <- rfeControl(
functions = caret_rfe_functions,
saveDetails = TRUE,
index = unit_indices,
indexOut = NULL,
returnResamp = "all",
allowParallel = T, ### change this if you don't want to / can't go parallel
verbose = TRUE
)
#### Feature selection ####
set.seed(42)
cl <- makePSOCKcluster(10) ### for parallel processing if available
registerDoParallel(cl)
rfe_profile_nnet <- rfe(
form = outcome ~
X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = dat,
sizes = seq(2,10,1),
rfeControl = caret_rfe_ctrl,
## pass options to train()
method = "nnet",
preProc = c("center", "scale"),
metric = "ROC",
tuneGrid = expand.grid(size = c(1:5), decay = 5),
trControl = cv5) ### I would like to change this to allow inner cluster-based resampling
stopCluster(cl)
rfe_profile_nnet
plot(rfe_profile_nnet)
Presumably the inner cluster-based resampling would be achieved by specifying a new trainControl containing some dynamic inner index based on the outer resample that is selected at the time:
inner_cluster_tune <- trainControl(classProbs = TRUE,
index = {insert magic here}, ### This is the important bit
returnResamp = "all",
summaryFunction = twoClassSummary,
allowParallel = F) ### especially if the outer resample is parallelised
If you try with the original cluster indices e.g.
inner_cluster_tune <- trainControl(classProbs = TRUE,
index = unit_indices,
returnResamp = "all",
summaryFunction = twoClassSummary,
allowParallel = F)
There are various warnings about missing data in the resamples, and things like 24: In [<-.data.frame(*tmp*, , object$method$center, value = structure(list( ... : provided 81 variables to replace 9 variables.
All help greatly appreciated.
As a postscript question , you can see which parameters were used within your rfe like so:
> rfe_profile_nnet$fit
Neural Network
1000 samples
8 predictor
2 classes: 'case', 'control'
Pre-processing: centered (8), scaled (8)
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 800, 800, 800, 800, 800
Resampling results across tuning parameters:
size Accuracy Kappa
1 0.616 0.1605071
2 0.616 0.1686937
3 0.620 0.1820503
4 0.618 0.1788491
5 0.618 0.1788063
Tuning parameter 'decay' was held constant at a value of 5
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were size = 3 and decay = 5.
But does anyone know if this refers to one, or all of the outer resamples? Presumably the same tuning parameters won't necessarily be chosen across all outer resamples

CFA in data with 3 levels - estimating factor scores at level 2?

I am working on a dataset with 3 levels:
Teacher
School
Country
Using survey responses from the teachers, the aim is to use Confirmatory Factor Analysis (CFA) with the ultimate goal of having the factor scores at the school level.
A further objective is to test for measurement invariance across countries.
I want to use the lavaan package in R, because it is able to deal with the complex survey design of my data trough the lavaan.survey-extension (sampling design, weights etc.)
I have done some preliminary analysis, where i use country-ID as the group argument in the cfa-function. This gives me the possibility to perform the measurement invariance analysis across countries. The issue is, that my factor scores are given at individual teacher level, and i am interested in the school-level.
Any ideas about how to get these factor scores at the school level?
Here are some examples of the functions i use. I do not think that i data sample is needed, but i will create some if it is requested.
library(lavaan)
library(SEMtools)
#define model
reduced_mod <-'
leadership_sup =~ TC3G22D + TC3G22E + TC3G22K
continous_develop_collab =~ TT3G32A + TT3G32B + TT3G32C + TT3G32D '
#Fit model with different restraints:
fit_no_restraint <- cfa(model = reduced_mod, data = cfa_data, group="countryID")
fit_metric <- cfa(model = reduced_mod, data = cfa_data, group="countryID", group.equal = c("loadings"))
fit_scalar <- cfa(model = reduced_mod, data = cfa_data, group="countryID", group.equal = c("loadings", "intercepts"))
#Compare fit statistics
compareFit(scalar = fit_scalar , metric = fit_metric , config = fit_no_restraint)
It seems that you want multilevel measurement invariance. You should use the measEq.syntax() from the semTools package:
## ---------------------
## Multilevel Invariance
## ---------------------
## To test invariance across levels in a MLSEM, specify syntax as though
## you are fitting to 2 groups instead of 2 levels.
mlsem <- ' f1 =~ y1 + y2 + y3
f2 =~ y4 + y5 + y6 '
## metric invariance
syntax.metric <- measEq.syntax(configural.model = mlsem, meanstructure = TRUE,
ID.fac = "std.lv", sample.nobs = c(1, 1),
group = "cluster", group.equal = "loadings")
## by definition, Level-1 means must be zero, so fix them
syntax.metric <- update(syntax.metric,
change.syntax = paste0("y", 1:6, " ~ c(0, NA)*1"))
## save as a character string
mod.metric <- as.character(syntax.metric, groups.as.blocks = TRUE)
## convert from multigroup to multilevel
mod.metric <- gsub(pattern = "group:", replacement = "level:",
x = mod.metric, fixed = TRUE)
## fit model to data
fit.metric <- lavaan(mod.metric, data = Demo.twolevel, cluster = "cluster")
summary(fit.metric)
Source

Simstudy package duplicate keys error and variables referenced not previously defined error

I tried running the following code and encountered several errors with the simstudy package.
library(simstudy)
clusterDef <- defData(varname = "u_3", dist = "normal", formula = 0,
variance = 25.77, id="clus") #cluster-level random effect
clusterDef <- defData(clusterDef, varname = "error", dist = "normal", formula = 0,
variance = 38.35) #error termeriod
clusterDef <- defData(clusterDef, varname = "ind", dist = "nonrandom",
formula = 25) #individuals per cluster
#Generate individual-level random effect and treatment variable
indDef <- defDataAdd(varname = "u_2", dist = "normal", formula = 0,
variance = 120.62)
#Generate clusters of data
set.seed(12345)
cohortsw <- genData(3, clusterDef)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "clus", perName = "period")
cohortswTm <- trtStepWedge(cohortswTm, "clus", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "trt")
cohortswTm <- genCluster(cohortswTm, cLevelVar = "clus", numIndsVar = "ind", level1ID = "id")
Error in vecseq(f__, len__, if (allow.cartesian || notjoin ||
!anyDuplicated(f__, : Join results in 2700 rows; more than 468 =
nrow(x)+nrow(i). Check for duplicate key values in i each of which
join to the same group in x over and over again. If that's ok, try
by=.EACHI to run j for each group to avoid the large allocation. If
you are sure you wish to proceed, rerun with allow.cartesian=TRUE.
Otherwise, please search for this error message in the FAQ, Wiki,
Stack Overflow and data.table issue tracker for advice.
cohortswTm <- addColumns(indDef, cohortswTm)
#Define coefficients for time as a categorical variable
timecoeff1 <- -5.42
timecoeff2 <- -5.72
timecoeff3 <- -7.03
timecoeff4 <- -6.13
timecoeff5 <- -9.13
#Generate outcome y
y <- defDataAdd(varname = "Y", formula = "17.87 + 5.0*trt + timecoeff1*I(period == 1) + timecoeff2*I(period == 2) + timecoeff3*I(period == 3) + timecoeff4*I(period == 4) + timecoeff5*I(period == 5) + u_3 + u_2 + error", dist = "normal")
#Add outcome to dataset
cohortswTm <- addColumns(y, cohortswTm)
Error: Variable(s) referenced not previously defined: timecoeff1,
timecoeff2, timecoeff3, timecoeff4, timecoeff5
Does anybody know why I am getting the errors that were highlighted above? How would I fix the code to prevent them from occuring?
Any help is much appreciated.
The first error is generated because you are trying to create individual level data within each cluster, but each cluster appears repeatedly (over 6 periods). genCluster is expecting that cLevelVar is a unique id. In this case, you can generate 6 individuals per cluster per time period by modifying the genCluster command to be
cohortswTm <- genCluster(cohortswTm, cLevelVar = "timeID",
numIndsVar = "ind", level1ID = "id")
This code creates a "closed" cohort, individuals are observed only in a single period. Generating an open cohort, where individuals might be observed over time as well, is a bit more involved, and is described here.
The second error is generated because simstudy data definitions can only include variables that have been defined in the context of the data definition. So, any constants need to be in the formula. (The formula itself can be updated using updateDef and updateDefAdd if you want to explore the effects of different covariate levels.)
This is how y should be defined:
y <- defDataAdd(varname = "Y", formula = "17.87 + 5.0*trt -
5.42*I(period == 1) - 5.72*I(period == 2) - 7.03*I(period == 3) -
6.13*I(period == 4) - 9.13*I(period == 5) + u_3 + u_2 + error",
dist = "normal")

R - RW metropolis using gibbs fails

I want to sample from the posterior, where LambdaA and LambdaB are exponential rates of A and B. Also, y is the observations of the r.v.'s.
The posterior is given by
and for numerical reasons, i am taking the log of this function.
Data:
n<-100
y<- c(rexp(n))
Logarithm of posterior:
dmix<-function(LambdaA,LambdaB,w){
ifelse( LambdaA<=0|LambdaB<=0|w<0|w>1 ,0,log(w*LambdaA*LambdaB*exp(-2*(LambdaA+LambdaB))*prod(w*LambdaA*exp(-
LambdaA*y) + (1-w)*LambdaB*exp(-LambdaB*y)) ))}
U-values
U.lambdaB <- runif(1)
U.lambdaA<- runif(1)
U.w<- runif(1)
Count steps
REJLambdaB <- 1
REJw <- 1
REJLambdaA<-1
Initial points
LambdaB <- LambdaA<- w<- numeric(n)
LambdaA[1]<-0.5
LambdaB[1] <- 0.5
w[1] <- 0.5
Random walk MH algorithm, updating each component at a time:
for (t in 2:n){
LambdaBprop<- rnorm(1,LB[t-1],0.5)
wprop<- rnorm(1,w[t-1],0.5)
LambdaAprop<- rnorm(1,LB[t-1],0.5)
logalpha1 = dmix(LambdaAprop,LambdaB[t-1],w[t-1])-dmix(LambdaA[t-1],LambdaB[t-
1],w[t-1])
logalpha2 = dmix(LambdaA[t-1],LambdaBprop,w[t-1])-dmix(LA[t-1],LB[t-1],w[t-
1])
if (!is.null(log(U.lambdaB) > logalpha2))
{LambdaB[t] <- LambdaBprop} ## accepted
else{LambdaB[t] <- LambdaB[t-1] ##rejected
REJLambdaB<-REJLambdaB+1}
if (!is.null(log(U.lambdaA) > logalpha1))
{LambdaA[t]<-LambdaAprop}
else {LambdaA[t]<-LambdaA[t-1]
REJLambdaA<-REJLambdaA+1}
if (w[t]<0|w[t]>1)
{w[t]<-w[t-1]}
else {w[t]<-wprop
REJw<-REJw+1}
}
Ultimately, I am having problems with my posterior since I keep getting either infinity or 0's when evaluating logalpha's. Note that i am looking to compare
log($\alpha(x'|x))$ with log(U). Any help to get this code to work ?
If you really think that a random walk means
lambdB[t]<- lambdB[t-1] + runif(1)
w[t]<- w[t-1] + runif(1)
lambdA[t] <- lambdB[t-1] + runif(1)
you should reconsider and invest into reading the bases of Markov chain theory and Markov chain Monte Carlo: At each iteration you add a Uniform U(0,1) variate to the current value. Therefore you always propose to increase the current value. Do you think this could ever produce an ergodic Markov chain?
There is also a mistake in dmix: since you work with the logarithm, remember that log(0)=-oo. And the quantities logalpha1 and logalpha2 are not updated correctly. And many more programming errors, like the incorrect use of !is.null... Anyway here is a corrected R code that works:
n<-100
y<- c(rexp(n))
#Logarithm of posterior:
dmix<-function(LambdaA,LambdaB,w){
ifelse( (LambdaA<=0)|(LambdaB<=0)|(w<0)|(w>1) ,
-1e50,log(w*LambdaA*LambdaB)-2*(LambdaA+LambdaB)+sum(log(w*LambdaA*exp(-
LambdaA*y) + (1-w)*LambdaB*exp(-LambdaB*y))) )}
#Count steps
REJLambdaB <- 1
REJw <- 1
REJLambdaA<-1
#Initial points
N <- 1e4
LambdaB <- LambdaA <- w<- numeric(N)
LambdaA[1] <- LambdaB[1] <- w[1] <- 0.5
U.lambdaB <- runif(N)
U.lambdaA<- runif(N)
U.w <- runif(N)
for (t in 2:N){
LambdaBprop=rnorm(1,LambdaB[t-1],0.5)
LambdaAprop=rnorm(1,LambdaA[t-1],0.5)
wprop=rnorm(1,w[t-1],0.05)
logalpha2 = dmix(LambdaA[t-1],LambdaBprop,w[t-1])-dmix(LambdaA[t-1],LambdaB[t-1],w[t-1])
if ((log(U.lambdaB[t]) < logalpha2))
{LambdaB[t] <- LambdaBprop} ## accepted
else{LambdaB[t] <- LambdaB[t-1] ##rejected
REJLambdaB<-REJLambdaB+1}
logalpha1 = dmix(LambdaAprop,LambdaB[t],w[t-1])-dmix(LambdaA[t-1],LambdaB[t],w[t-1])
if ((log(U.lambdaA[t]) < logalpha1))
{LambdaA[t]<-LambdaAprop}
else {LambdaA[t]<-LambdaA[t-1]
REJLambdaA<-REJLambdaA+1}
logw = dmix(LambdaA[t],LambdaB[t],wprop)-dmix(LambdaA[t],LambdaB[t],w[t-1])
if (w[t]<0|w[t]>1|(log(U.w[t])>logw))
{w[t]<-w[t-1]}
else {w[t]<-wprop
REJw<-REJw+1}
}
As shown by the outcome
the posterior produces a symmetric outcome in the Lambda's.

Tuning parameters in caret error despite assigning grids and as.factor

Any help appreciated. Been at this for weeks. :(
install.packages("klaR", dependencies=TRUE)
library(klaR)
install.packages("caret", dependencies=TRUE)
library(caret)
install.packages("e1071", dependencies=TRUE)
library(e1071)
install.packages("gmodels", dependencies=TRUE)
library(gmodels)
install.packages("gbm", dependencies=TRUE)
library(gbm)
install.packages("foreach", dependencies=TRUE)
library(foreach)
Load Grading Data
grading <- read.csv("~/PA_DataFinal/GradingData160315.csv")
create stratified sample # 1%
dfstrat <- stratified(grading, "FailPass", .01)
save(dfstrat, file = "c:/Users/gillisn/Documents/PA_DataFinal/RResults/GradingRResults/iteration 1/dfstrat.rda")
split data into train and test #75:25. FailPass is the responseVble
set.seed(1)
inTrainingSet <- createDataPartition(dfstrat$FailPass, p = .75, list = FALSE)
trainSet <- dfstrat[inTrainingSet,]
testSet <- dfstrat[-inTrainingSet, ]
set predictors and labels
There are 48 labels and its the last one that want to train on.
Take all the predictors 1-47
x,y is training data
x <- trainSet[,-48]
y <- as.factor(trainSet$FailPass)
i,j is test data
i <- testSet[,-48,]
j <- as.factor(testSet$FailPass)
Set Training control parameters
Bootstrapping itself around in 25 times.
bootControl <- trainControl(number = 25)
The grid is for the decision tree
gbmGrid <- expand.grid(.interaction.depth = (1:5) * 2, .n.trees = (1:10)*25, .shrinkage = .1)
nbGrid <- expand.grid(.fL=0, .usekernel=FALSE)
svmGrid >- expandGrid(.sigma=, .c=)
set.seed(2)
Train the models
naive bayes
nbFit <- train(x,y,method='nb',tuneGrid="nbGrid")
svm
svmFit <- train(x, y,method = "svmRadial", tuneLength = 10,trControl = bootControl, scaled = FALSE)
gbm
gbmFit <- train(x, y,method = "gbm", trControl = bootControl, verbose = FALSE, bag.fraction = 0.5, tuneGrid = gbmGrid)
predict the models on training data
models <- list(svm = svmFit, nb = nbFit, gbm = gbmFit)
predict(models)

Resources