Is it possible to retrieve the variable importance for one, many, or the full stacked model after running tidymodels/stacks? This is not yet supported by the VIP package, but is there an alternative method to extracting that information?
Using the bulk of the blog post from Simon Couch here, this is generally what I am attempting; instead of his models, I use random forests and SVMs and then try to retrieve variable importance.
library(tidyverse)
library(tidymodels)
library(stacks)
library(vip)
wind_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
wind <-
wind_raw %>%
dplyr::select(
province_territory,
total_project_capacity_mw,
turbine_rated_capacity_kw = turbine_rated_capacity_k_w,
rotor_diameter_m,
hub_height_m,
year = commissioning_date
) %>%
group_by(province_territory) %>%
mutate(
year = as.numeric(year),
province_territory = case_when(
n() < 50 ~ "Other",
TRUE ~ province_territory
)
) %>%
filter(!is.na(year)) %>%
ungroup() %>%
drop_na(turbine_rated_capacity_kw)
# split into training and testing sets
set.seed(1)
wind_split <- initial_split(wind)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)
# use a 5-fold cross-validation
set.seed(1)
folds <- rsample::vfold_cv(wind_train, v = 5)
# set up a basic recipe
wind_rec <-
recipe(turbine_rated_capacity_kw ~ ., data = wind_train) %>%
step_impute_knn(all_predictors()) %>%
step_dummy(all_nominal()) %>%
step_zv(all_predictors())
# define a minimal workflow
wind_wflow <-
workflow() %>%
add_recipe(wind_rec)
# save predictions and workflows so the tune_grid() results can be stacked
ctrl_grid <- control_stack_grid()
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_mode('regression') %>%
set_engine("ranger", importance = "impurity")
# add it to a workflow
rf_wflow <-
wind_wflow %>%
add_model(rf_spec)
# tune mtry and min_n and fit to the 5-fold cv
set.seed(1)
rf_res <-
tune_grid(
rf_wflow,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# define a model using parsnip
svm_spec <-
svm_rbf(
cost = tune(),
rbf_sigma = tune()
) %>%
set_engine("kernlab") %>%
set_mode("regression")
# add it to a workflow
svm_wflow <-
wind_wflow %>%
add_model(svm_spec)
# tune cost and rbf_sigma and fit to the 5-fold cv
set.seed(1)
svm_res <-
tune_grid(
svm_wflow,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# add the models to the stack
wind_data_st <-
stacks() %>%
add_candidates(rf_res) %>%
add_candidates(svm_res) %>%
blend_predictions() %>%
fit_members()
# attempt to plot the variable importance of the stacked model
wind_data_st %>%
vip()
This returns Error: Model-specific variable importance scores are currently not available for this type of model., which is self-explanatory, but is there a workaround to extract this information, maybe outside of vip? Is it possible to pluck one of the member models out of the stack and evaluate it? Does anyone know if vip is planning to put out a solution to this? Thanks in advance!
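For what it's worth, plucking a single member out of the stack does seem possible. Here is a minimal sketch, assuming the fitted stack keeps its member workflows in a member_fits list (as current versions of stacks do); note it can only work for members whose engine computes importance, i.e. the ranger models above, not the kernlab SVMs:
names(wind_data_st$member_fits)    # which members were kept after blending
wind_data_st$member_fits[[1]] %>%  # the fitted workflow for one member
extract_fit_parsnip() %>%          # pull out the underlying ranger fit
vip()                              # member-level importance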
I've had a similar issue, and what I've done is make a tibble of variable importance for each member of the stack, normalize them onto the same scale, multiply each by its relative weight in the stack, and then sum to get a total relative importance.
I couldn't reproduce your code, but here's an example of what you can try...
After you've run blend_predictions(), you can extract the weights. Then create a tibble for each member that includes a column for Variable and a column for importance. Join them together and you'll have the weighted importance.
library("DALEX")
library("dplyr")
library("tidymodels")
colnames(fifa)
fifa_small <- fifa %>%
select(value_eur, age,
attacking_crossing:attacking_volleys,
defending_marking:defending_sliding_tackle) %>%
as_tibble() %>% dplyr::slice_sample(n = 1000)
fifa_small_folds <- vfold_cv(fifa_small, v = 8, repeats = 1)
fifa_small_folds
basic_rec <-
recipe(value_eur ~ ., data = fifa_small) %>%
step_nzv(all_numeric_predictors()) %>%
step_normalize(all_numeric(), -all_outcomes())
model1 <-
boost_tree(trees = 1000) %>%
set_engine('xgboost', importance = TRUE) %>%
set_mode('regression')
model2 <-
linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine('glmnet')
model3 <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_engine('glmnet')
wfs <-
workflow_set(
preproc = list(basic_rec),
models = list(model1, model2, model3),
cross = TRUE)
wfs
doParallel::registerDoParallel()
wfs_rs <-
workflow_map(
wfs,
"tune_grid",
resamples = fifa_small_folds,
grid = 10,
control = control_grid(save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
) )
doParallel::stopImplicitCluster()
library(stacks)
tidymodels_prefer()
wfs_stack <-
stacks() %>%
add_candidates(wfs_rs)
blend_ens <- blend_predictions(wfs_stack, penalty = 10^seq(-2, 0, length = 10))
blend_ens
ens1_wt <- stacks:::top_coefs(blend_ens) %>% slice(1) %>% pull(weight)
ens2_wt <- stacks:::top_coefs(blend_ens) %>% slice(2) %>% pull(weight)
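As a side note, stacks also exports collect_parameters(), which may be a tidier way to see the stacking coefficients than the non-exported stacks:::top_coefs(). A sketch, assuming the candidate name matches the wflow_id added above:
collect_parameters(blend_ens, "recipe_boost_tree")   # candidates and their stacking coefficients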
## Get the workflowset
individ_ens1_best_fit <- extract_workflow(wfs_rs, id = "recipe_boost_tree")
## extract the tuned results from the workflow set
individ_ens1_best_tuned <- extract_workflow_set_result(wfs_rs, id = "recipe_boost_tree")
individ_ens1_lowest_rmse <- individ_ens1_best_tuned %>%
show_best("rmse") %>%
slice(1)
## fit the final model
individ_ens1_best_final <- finalize_workflow(individ_ens1_best_fit, individ_ens1_lowest_rmse)
individ_ens1_bestfinal_1 <- individ_ens1_best_final %>% fit(fifa_small)
individ_ens1_vi_tbl <- individ_ens1_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi() %>%
mutate(
ens1_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
## print the blend again to identify the second member kept in the stack
blend_ens
## Get the workflowset
individ_ens2_best_fit <- extract_workflow(wfs_rs, id = "recipe_linear_reg_3")
## extract the tuned results from the best workflow
individ_ens2_best_tuned <- extract_workflow_set_result(wfs_rs, id = "recipe_linear_reg_3")
individ_ens2_lowest_rmse <- individ_ens2_best_tuned %>%
show_best("rmse") %>% filter(.config == "Preprocessor1_Model01") %>% slice(1)
## fit the final model
individ_ens2_best_final <- finalize_workflow(individ_ens2_best_fit, individ_ens2_lowest_rmse)
individ_ens2_bestfinal_1 <- individ_ens2_best_final %>% fit(fifa_small)
individ_ens2_vi_tbl <- individ_ens2_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi(lambda = individ_ens2_lowest_rmse$penalty) %>% # include lambda for lasso or ridge
mutate(
ens2_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
ens_vi_joined <- individ_ens1_vi_tbl %>%
left_join(individ_ens2_vi_tbl, by = c("Variable")) %>%
mutate(across(2:ncol(.), ~ifelse(is.na(.), 0, .)),
ens1_normed = ens1_Importance/ sum(ens1_Importance),
ens2_normed = ens2_Importance/ sum(ens2_Importance),
ens1_wted = ens1_normed * ens1_wt,
ens2_wted = ens2_normed * ens2_wt,
) %>%
rowwise() %>%
mutate(summed_importance = sum(c_across(ends_with("wted"))) ) %>%
ungroup() %>%
mutate(
total_importance = summed_importance/ sum(summed_importance), #normalized
)
ens_vi_joined %>% select(Variable, total_importance) %>%
ggplot(aes(total_importance, fct_reorder(Variable, total_importance)))+
geom_col()
# Create a split object and build training and testing data sets
train_test_split <- rsample::initial_split(data = nomissingprep, prop = 0.80)
train.data <- train_test_split %>% training()
test.data <- train_test_split %>% testing()
## Recipe Creation
rec <- recipe(preprecentyear ~ ., data = train.data)
## Validation Set
cv_folds <-
vfold_cv(train.data,
v = 5,
strata = preprecentyear)
## Model Fitting -- Random Forest
library(ranger)
rf_spec <-
rand_forest() %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
## Workflow --Random Forest
rf_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(rf_spec)
##Random Forest Metrics
rf_res <-
rf_wflow %>%
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
recall, precision, f_meas,
accuracy, kap,
roc_auc, sens, spec),
control = control_resamples(save_pred = TRUE)
)
Error in validate_function_class():
! The combination of metric functions must be:
- only numeric metrics
- a mix of class metrics and class probability metrics
The following metric function types are being mixed:
- other (recall namespace:caret, precision namespace:caret, spec namespace:readr)
- class (f_meas, accuracy, kap, sens)
- prob (roc_auc)
I am unsure how to fix this error. All the code prior to the random forest metrics section ran fine. Any advice is more than welcome. Thanks!
Try it this way. The error message shows that recall, precision, and spec are being picked up from caret and readr, which mask the yardstick metrics. Call tidymodels_prefer() first so the yardstick versions win, then pass the metrics to metric_set():
tidymodels_prefer()
log_res <-
log_wflow %>%
fit_resamples(
resamples = cross_validation,
metrics = metric_set(
kap, sensitivity, specificity, precision, recall),
control = control_resamples(
save_pred = TRUE)
)
log_res
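Alternatively, you can leave the attached packages alone and namespace the masked metrics explicitly, which is equivalent. A sketch against the original workflow and folds:
rf_res <-
rf_wflow %>%
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
yardstick::recall, yardstick::precision, f_meas,
accuracy, kap,
roc_auc, sens, yardstick::spec),
control = control_resamples(save_pred = TRUE)
)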
I am working with multivariate data linking Leaflet and d3scatter plots. It works well for one variable. If I try to include a second variable in Leaflet via a second addCircleMarkers() and addLayersControl(), the SharedData links break: the filtering and the brushing stop working. Thanks in advance.
A MWE is attached:
library("crosstalk")
library("d3scatter")
library("leaflet")
Long <- c(117.4,117.5,117.6)
Lat<- c(-33.7,-33.8,-33.9)
var1 <- c(21,22,23)
var2 <- c(31,32,33)
species <- c(8,9,10)
df1<- data.frame(Long, Lat, var1, var2, species)
sdf1 <- SharedData$new(df1)
col_1 <- c( "yellow" ,"black" ,"orange")
col_2 <- c("red" ,"green" ,"blue")
l <- leaflet(sdf1)%>%
setView(117.5, -33.8, 10) %>%
addCircleMarkers(radius = 1, color = col_1, group = "1") %>%
# addCircleMarkers(radius = 1, color = col_2, group = "2") %>%
# PROBLEM - adding the second "addCircleMarkers" enables the overlayGroups but
# it breaks the link between the plots and breaks the filter
addLayersControl(overlayGroups=c("1","2"))
m <- list(l, filter_checkbox("unique_id_for_species", "Animal Species", sdf1, ~species))
n <- list(d3scatter(sdf1, ~var2, ~var1, color = ~species, x_lim = c(30,40), y_lim = c(20,25), width="70%", height=200),
d3scatter(sdf1, ~var1, ~var2, color = ~species, y_lim = c(30,40), x_lim = c(20,25), width="70%", height=200))
bscols(m, n)
I am trying to use drake for my workflow. It seems to have a lot of potential, but I noticed that drake takes a very long time to run: even simple steps that take less than a second when I run them "manually" can take 20 seconds or more when run with drake.
I'm aware that I did not provide enough details on this problem. Please tell me what kind of details to provide, and I will do so.
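For reference, per-target build times can be read from drake's cache with build_times() after a make(); a minimal sketch, assuming the default cache location:
library(drake)
build_times(digits = 4)   # per-target timings recorded in the cache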
The dataset contains protein levels (a few tens) measured in patients undergoing various treatments. The protein levels are read from an ExpressionSet object, and then a linear model (including contrasts) is performed on each of these proteins. Here are the essential parts of the code:
pt_df_for_lm <- function(protein, eset){
as.data.frame(exprs(eset)[protein,]) %>%
rownames_to_column(var = "Sample.Name") %>%
magrittr::set_colnames(c("Sample.Name","pt_level")) %>%
as_tibble() %>%
inner_join(pData(eset), by = "Sample.Name") %>%
mutate(drug.visit = ifelse(visit_id=="W0", "W0", paste0(drug.dose, ".", visit_id))) %>%
mutate(drug.visit = fct_relevel(factor(drug.visit), "W0") ) %>%
select(Sample.Name, drug.dose, patient_id, visit_id, drug.visit, pt_level) %>%
return()
}
lm_contrasts_drug_vs_placebo <- function(res_lm){
coef_names <- names(coef(res_lm))
contrasts_mat <-
tibble(coef = coef_names) %>%
filter(!grepl("patient_id",coef)) %>%
mutate(term=make.names(sub("drug.visit","",coef))) %>%
inner_join(possible_terms_df) %>%
filter(drug!="Placebo") %>%
mutate(contrast_name = paste0(make.names(drg.ds),".",week, " - Placebo.0.", week)) %>%
mutate(coef_placebo = paste0("drug.visitPlacebo.0.",week)) %>%
mutate(contrast_vector = map2(coef, coef_placebo, function(cf_drug, cf_placebo){
contrast_vector <- rep(0,length(coef_names))
contrast_vector[which(coef_names==cf_drug)] <- 1
contrast_vector[which(coef_names==cf_placebo)] <- -1
return(contrast_vector)
} )) %>%
transmute(contrast_tbl = map2(contrast_name, contrast_vector, function(cname, cvec){
ctbl <- enframe(cvec, name = NULL)
names(ctbl) <- cname
return(ctbl)
} )) %>%
deframe() %>%
bind_cols() %>%
as.matrix() %>%
magrittr::set_rownames(coef_names) %>%
t()
contrast_results_df <-
multcomp::glht(model=res_lm, linfct = contrasts_mat ) %>%
summary() %>%
broom::tidy() %>%
dplyr::select(-rhs) %>%
rename(term = lhs)
possible_terms_df %>%
inner_join(contrast_results_df) %>%
return()
}
plan <- drake_plan(
pt_eset = target(readRDS(paste0(INDIR,"pt_results.rds"))),
pt_df = target(pt_df_for_lm(prot, pt_eset),
transform=map(prot=!!all_proteins)),
res_pt_lm = target(lm(pt_level ~0 + patient_id + drug.visit, data = pt_df),
transform=map(prot, .id=prot)),
res_pt_lm_df = target(res_pt_lm %>%
broom::tidy() %>%
filter(!grepl("patient_id",term)) %>%
mutate(term = make.names(sub("drug.visit","",term))) %>%
mutate(protein = prot) %>%
select(protein, everything()),
transform=map(res_pt_lm, prot, .id=prot)),
res_pt_lm_contrasts_df = target(lm_contrasts_drug_vs_placebo(res_pt_lm) %>%
mutate(protein=prot),
transform=map(res_pt_lm, prot, .id=prot)),
combined_res_pt_lm_df = target(bind_rows(res_pt_lm_df, res_pt_lm_contrasts_df),
transform=combine(res_pt_lm_df, res_pt_lm_contrasts_df)),
output_res_pt_lm_df = write_csv(combined_res_pt_lm_df,
file_out(!!file.path(OUTDIR,"pt_lm_results.csv"))),
trace = TRUE
)
config <- drake_config(plan)
#vis_drake_graph(config)
make(plan, lock_envir=FALSE)
The code is placed within an rmarkdown notebook.
Gil
I tried it with the following code, but somehow it didn't work for me:
x4 is a dataframe, n1 a character string, and n2 the number of counts per word.
hchart(x4 ,"wordcloud", hcaes(name = "n1", weight = "n2"))
There is a word cloud chart type in highcharter; follow this code:
library(highcharter)
library(tidyverse)   # provides map(), reduce(), str_*(), count(), anti_join()
data(reuters, package = "kernlab")
text <- paste(reuters[[1]])
textcld <- text %>%
map(str_to_lower) %>%
reduce(str_c) %>%
str_split("\\s+") %>%
unlist() %>%
tibble(word = .) %>%   # data_frame() is deprecated; use tibble()
count(word, sort = TRUE) %>%
anti_join(tidytext::stop_words, by = "word")
hchart(textcld, "wordcloud", hcaes(name = word, weight = log(n)))
And you should get something like a word cloud of the most frequent Reuters terms.
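Note that hcaes() takes unquoted column names, just like ggplot2's aes(); with quoted strings the mapping fails. So for the data frame in the question, something like the line below should work (or use hcaes_string() to keep the quotes):
hchart(x4, "wordcloud", hcaes(name = n1, weight = n2))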
Any help appreciated. Been at this for weeks. :(
install.packages("klaR", dependencies=TRUE)
library(klaR)
install.packages("caret", dependencies=TRUE)
library(caret)
install.packages("e1071", dependencies=TRUE)
library(e1071)
install.packages("gmodels", dependencies=TRUE)
library(gmodels)
install.packages("gbm", dependencies=TRUE)
library(gbm)
install.packages("foreach", dependencies=TRUE)
library(foreach)
# Load the grading data
grading <- read.csv("~/PA_DataFinal/GradingData160315.csv")
# Create a 1% stratified sample; stratified() comes from the splitstackshape package
library(splitstackshape)
dfstrat <- stratified(grading, "FailPass", .01)
save(dfstrat, file = "c:/Users/gillisn/Documents/PA_DataFinal/RResults/GradingRResults/iteration 1/dfstrat.rda")
# Split the data into train and test (75:25); FailPass is the response variable
set.seed(1)
inTrainingSet <- createDataPartition(dfstrat$FailPass, p = .75, list = FALSE)
trainSet <- dfstrat[inTrainingSet,]
testSet <- dfstrat[-inTrainingSet, ]
# Set predictors and labels: there are 48 columns, and it's the last one we want to train on.
# Take all the predictors (columns 1-47); x, y are the training data
x <- trainSet[,-48]
y <- as.factor(trainSet$FailPass)
# i, j are the test data
i <- testSet[, -48]
j <- as.factor(testSet$FailPass)
# Set training control parameters: bootstrap 25 times
bootControl <- trainControl(number = 25)
# Tuning grids: gbmGrid is for the boosted trees, nbGrid for naive Bayes
gbmGrid <- expand.grid(.interaction.depth = (1:5) * 2, .n.trees = (1:10) * 25, .shrinkage = .1)
nbGrid <- expand.grid(.fL = 0, .usekernel = FALSE)
# svmGrid <- expand.grid(.sigma = , .C = )  # values left unfilled; svmFit below tunes via tuneLength instead
set.seed(2)
# Train the models
# naive Bayes (tuneGrid takes the grid object, not a quoted string)
nbFit <- train(x, y, method = 'nb', tuneGrid = nbGrid)
# svm
svmFit <- train(x, y,method = "svmRadial", tuneLength = 10,trControl = bootControl, scaled = FALSE)
# gbm
gbmFit <- train(x, y,method = "gbm", trControl = bootControl, verbose = FALSE, bag.fraction = 0.5, tuneGrid = gbmGrid)
# Predict the models on the training data
models <- list(svm = svmFit, nb = nbFit, gbm = gbmFit)
predict(models)
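To sanity-check the fits on the held-out data, here is a minimal sketch using the i and j test objects defined above (confusionMatrix() is from caret):
svmPred <- predict(svmFit, newdata = i)   # class predictions on the test predictors
confusionMatrix(svmPred, j)               # accuracy, kappa, sensitivity, etc. against the test labels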