Is it possible to create a Word Cloud with highcharter?

I tried it with the following code, but somehow it didn't work for me. x4 is a data frame, n1 is a character column containing the words, and n2 is the number of counts per word.
hchart(x4, "wordcloud", hcaes(name = "n1", weight = "n2"))

There is a word cloud chart type in highcharter; follow this code:
library(dplyr)
library(purrr)
library(stringr)
library(highcharter)

data(reuters, package = "kernlab")
text <- paste(reuters[[1]])

textcld <- text %>%
  map(str_to_lower) %>%
  reduce(str_c) %>%
  str_split("\\s+") %>%
  unlist() %>%
  tibble(word = .) %>%   # data_frame() is deprecated; tibble() is its replacement
  count(word, sort = TRUE) %>%
  anti_join(tidytext::stop_words, by = "word")

hchart(textcld, "wordcloud", hcaes(name = word, weight = log(n)))
You should get something like a word cloud of the most frequent Reuters terms.
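For reference, the likely problem with the original attempt is that hcaes() uses non-standard evaluation, like ggplot2's aes(), so column names must be unquoted. A minimal sketch, assuming x4 has the columns described in the question:
# n1 = word column, n2 = count column (names as described in the question)
hchart(x4, "wordcloud", hcaes(name = n1, weight = n2))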

Related

Create dummy variable for all neighboring regions depending on another dummy variable (sf object)

I have a spatial data frame (sf) of all European NUTS2 regions. Within this sf object certain regions have a dummy = 1, and others a dummy = 0.
How can I create a new dummy (let's say "dummy_neighbor") where all regions neighboring (st_touches?) a region with dummy == 1 get dummy_neighbor == 1, and all regions not touching a dummy == 1 region get dummy_neighbor == 0?
For the time being I have this work-around, but I guess there must be an easier solution?
# load packages
library(sf)
library(here)
library(tidyverse)
library(spdep)
library(expp)
# import nuts 2 sf object
nuts2_sf <- st_read(here("Data", "nuts2_data", "final_nuts.shp"))
# take row numbers as extra column for later
nuts2_sf$rownumber = 1:nrow(nuts2_sf)
#neighbouring list
neighbour <- poly2nb(nuts2_sf, row.names="NUTS_ID", queen=TRUE)
# transform nb into data frame
nb_df <- plyr::ldply(neighbour, rbind)
nb_df$rownumber = 1:nrow(nb_df) # get row number as column
# merge neighbour-list-df with sf by rownumber
df <- merge(nuts2_sf, nb_df, by = "rownumber")
# extract all neighbours of tp100_d=1
# (tp100_d is the name of the original dummy variable)
df_dummy <- df %>% filter(tp100_d == 1)
df_dummy$geometry <- NULL
all_neighbours <- as.vector(as.matrix(df_dummy[,c(66:76)])) %>% unique(.)
# create new neighbourhood-dummy for all neighbours (but not if its a neighbour that has a 1 in its original dummy)
nuts2_sf <- nuts2_sf %>% mutate(nb_dummy = ifelse(rownumber %in% all_neighbours & tp100_d == 0, 1, 0))
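A possibly simpler route (a sketch, not tested on this data, assuming the original dummy column is named tp100_d as above): st_touches() returns, for each feature in its first argument, the indices of the touching features in its second argument, so the neighbor dummy can be computed directly.
library(sf)
library(dplyr)
# for each region, indices of touching regions that have tp100_d == 1
touches_dummy <- st_touches(nuts2_sf, nuts2_sf[nuts2_sf$tp100_d == 1, ])
nuts2_sf <- nuts2_sf %>%
  mutate(dummy_neighbor = as.integer(lengths(touches_dummy) > 0 & tp100_d == 0))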

How do I fix the "Error in validate_function_class():" within Tidymodels when trying to explore Random Forest Metrics

library(tidymodels)
# Create a split object and build training and testing data sets
train_test_split <- rsample::initial_split(data = nomissingprep, prop = 0.80)
train.data <- train_test_split %>% training()
test.data <- train_test_split %>% testing()
## Recipe creation
rec <- recipe(preprecentyear ~ ., data = train.data)
## Validation set
cv_folds <-
  vfold_cv(train.data,
           v = 5,
           strata = preprecentyear)
## Model fitting -- random forest
library(ranger)
rf_spec <-
  rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")
## Workflow -- random forest
rf_wflow <-
  workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_spec)
## Random forest metrics
rf_res <-
  rf_wflow %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(
      recall, precision, f_meas,
      accuracy, kap,
      roc_auc, sens, spec),
    control = control_resamples(save_pred = TRUE)
  )
Error in validate_function_class():
! The combination of metric functions must be:
  - only numeric metrics
  - a mix of class metrics and class probability metrics
The following metric function types are being mixed:
  - other (recall namespace:caret, precision namespace:caret, spec namespace:readr)
  - class (f_meas, accuracy, kap, sens)
  - prob (roc_auc)
I am unsure how to fix this error. All the code prior to the random forest metrics ran fine. Any advice is more than welcome. Thanks.
Try it this way. The error message shows that recall, precision, and spec are being picked up from caret and readr, which mask yardstick's metric functions; calling tidymodels_prefer() before fitting resolves those conflicts in favor of the tidymodels packages:
library(tidymodels)
tidymodels_prefer()  # prefer yardstick's recall/precision/spec over caret and readr

log_res <-
  log_wflow %>%
  fit_resamples(
    resamples = cross_validation,
    metrics = metric_set(
      kap, sensitivity, specificity, precision, recall),
    control = control_resamples(
      save_pred = TRUE)
  )
log_res
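Alternatively (a sketch with the same intent), you can leave the attached packages alone and namespace the conflicting metric functions explicitly inside metric_set():
metrics = metric_set(
  yardstick::recall, yardstick::precision, f_meas,
  accuracy, kap,
  roc_auc, sens, yardstick::spec)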

Variable importance from a tidymodels/stacks output?

Is it possible to retrieve the variable importance for one, many, or the full stacked model after running tidymodels/stacks? This is not yet supported by the vip package, but is there an alternative method for extracting that information?
Using the bulk of the blog post from Simon Couch here, this is what I am generally trying to attempt. Instead I will use random forests and SVMs, and then try to retrieve a variable importance.
library(tidyverse)
library(tidymodels)
library(stacks)
library(vip)
wind_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
wind <-
wind_raw %>%
dplyr::select(
province_territory,
total_project_capacity_mw,
turbine_rated_capacity_kw = turbine_rated_capacity_k_w,
rotor_diameter_m,
hub_height_m,
year = commissioning_date
) %>%
group_by(province_territory) %>%
mutate(
year = as.numeric(year),
province_territory = case_when(
n() < 50 ~ "Other",
TRUE ~ province_territory
)
) %>%
filter(!is.na(year)) %>%
ungroup() %>%
drop_na(turbine_rated_capacity_kw)
# split into training and testing sets
set.seed(1)
wind_split <- initial_split(wind)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)
# use a 5-fold cross-validation
set.seed(1)
folds <- rsample::vfold_cv(wind_train, v = 5)
# set up a basic recipe
wind_rec <-
recipe(turbine_rated_capacity_kw ~ ., data = wind_train) %>%
step_impute_knn(all_predictors()) %>%
step_dummy(all_nominal()) %>%
step_zv(all_predictors())
# define a minimal workflow
wind_wflow <-
workflow() %>%
add_recipe(wind_rec)
ctrl_grid <- control_stack_grid()  # the tune_grid() calls below need the stacks grid control
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_mode('regression') %>%
set_engine("ranger", importance = "impurity")
# add it to a workflow
rf_wflow <-
wind_wflow %>%
add_model(rf_spec)
# tune mtry and min_n and fit to the 5-fold cv
set.seed(1)
rf_res <-
tune_grid(
rf_wflow ,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# define a model using parsnip
svm_spec <-
svm_rbf(
cost = tune(),
rbf_sigma = tune()
) %>%
set_engine("kernlab") %>%
set_mode("regression")
# add it to a workflow
svm_wflow <-
wind_wflow %>%
add_model(svm_spec)
# tune cost and rbf_sigma and fit to the 5-fold cv
set.seed(1)
svm_res <-
tune_grid(
svm_wflow,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# add the models to the stack
wind_data_st <-
stacks() %>%
add_candidates(rf_res) %>%
add_candidates(svm_res) %>%
blend_predictions() %>%
fit_members()
# attempt to plot the variable importance of the stacked model
wind_data_st %>%
vip()
This returns Error: Model-specific variable importance scores are currently not available for this type of model., which is self-explanatory, but is there a workaround to extract this information? Maybe outside of vip? Is it possible to pluck out one of the viable models that went into the stack to evaluate? Does anyone know if vip is planning on putting out a solution to this? Thanks in advance!
I've had a similar issue, and what I've done is make a tibble of variable importance for each member of the stack, normalize them onto the same scale, and multiply by their relative weight in the stack to get a summed total relative importance.
I couldn't reproduce your code, but here's an example of what you can try...
After you've run blend_predictions(), you can extract the weights. Then create a tibble for each member that includes a column for Variable and a column for Importance. Join them together and you'll have the weighted importance.
library("DALEX")
library("dplyr")
library("forcats")    # for fct_reorder() in the final plot
library("tidymodels")
colnames(fifa)
fifa_small <- fifa %>%
select(value_eur, age,
attacking_crossing:attacking_volleys,
defending_marking:defending_sliding_tackle) %>%
as_tibble() %>% dplyr::slice_sample(n = 1000)
fifa_small_folds <- vfold_cv(fifa_small, v = 8, repeats = 1)
fifa_small_folds
basic_rec <-
recipe(value_eur ~ ., data = fifa_small) %>%
step_nzv(all_numeric_predictors()) %>%
step_normalize(all_numeric(), -all_outcomes())
model1 <-
  boost_tree(trees = 1000) %>%
  set_engine('xgboost') %>%   # xgboost has no 'importance' engine argument; vi() works without it
  set_mode('regression')
model2 <-
linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine('glmnet')
model3 <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_engine('glmnet')
wfs <-
workflow_set(
preproc = list(basic_rec),
models = list(model1, model2, model3),
cross = TRUE )
wfs
doParallel::registerDoParallel()
wfs_rs <-
workflow_map(
wfs,
"tune_grid",
resamples = fifa_small_folds,
grid = 10,
control = control_grid(save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
) )
doParallel::stopImplicitCluster()
library(stacks)
tidymodels_prefer()
wfs_stack <-
stacks() %>%
add_candidates(wfs_rs)
blend_ens <- blend_predictions(wfs_stack, penalty = 10^seq(-2, 0, length = 10))
blend_ens
ens1_wt <- stacks:::top_coefs(blend_ens) %>% slice(1) %>% pull(weight)
ens2_wt <- stacks:::top_coefs(blend_ens) %>% slice(2) %>% pull(weight)
## Get the workflow from the workflow set
individ_ens1_best_fit <- extract_workflow(wfs_rs, id = "recipe_boost_tree")
## extract the tuned results from the workflow
individ_ens1_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_boost_tree",
"result"][[1]][[1]]
individ_ens1_lowest_rmse <- individ_ens1_best_tuned %>%
  show_best(metric = "rmse") %>%
  slice(1)
## fit the final model
individ_ens1_best_final <- finalize_workflow(individ_ens1_best_fit, individ_ens1_lowest_rmse)
individ_ens1_bestfinal_1 <- individ_ens1_best_final %>% fit(fifa_small)
individ_ens1_vi_tbl <- individ_ens1_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi() %>%
mutate(
ens1_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
# (ens_name_fn() and ens_id_fn() were ad-hoc helpers not shown here;
# their outputs are not used in the steps below)
## Get the workflow from the workflow set
individ_ens2_best_fit <- extract_workflow(wfs_rs, id = "recipe_linear_reg_3")
## extract the tuned results from the best workflow
individ_ens2_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_linear_reg_3",
"result"][[1]][[1]]
individ_ens2_lowest_rmse <- individ_ens2_best_tuned %>%
  show_best(metric = "rmse") %>%
  filter(.config == "Preprocessor1_Model01") %>%
  slice(1)
## fit the final model
individ_ens2_best_final <- finalize_workflow(individ_ens2_best_fit, individ_ens2_lowest_rmse)
individ_ens2_bestfinal_1 <- individ_ens2_best_final %>% fit(fifa_small)
individ_ens2_vi_tbl <- individ_ens2_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi(lambda = individ_ens2_lowest_rmse$penalty) %>% # include lambda for lasso or ridge
mutate(
ens2_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
ens_vi_joined <- individ_ens1_vi_tbl %>%
left_join(individ_ens2_vi_tbl, by = c("Variable")) %>%
mutate(across(2:ncol(.), ~ifelse(is.na(.), 0, .)),
ens1_normed = ens1_Importance/ sum(ens1_Importance),
ens2_normed = ens2_Importance/ sum(ens2_Importance),
ens1_wted = ens1_normed * ens1_wt,
ens2_wted = ens2_normed * ens2_wt,
) %>%
rowwise() %>%
mutate(summed_importance = sum(c_across(ends_with("wted"))) ) %>%
ungroup() %>%
mutate(
total_importance = summed_importance/ sum(summed_importance), #normalized
)
ens_vi_joined %>% select(Variable, total_importance) %>%
ggplot(aes(total_importance, fct_reorder(Variable, total_importance)))+
geom_col()
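If you only need the importance for a single member, you can also pull a fitted member straight out of the stack once it has been through fit_members(), as the question's wind_data_st has; a sketch (member names are illustrative, and model-based importance only works for members whose engine supports it, such as the ranger fits here):
library(vip)
names(wind_data_st[["member_fits"]])      # list the fitted member workflows
wind_data_st[["member_fits"]][[1]] %>%    # e.g. one of the random forest members
  workflows::extract_fit_parsnip() %>%
  vip()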

Crosstalk links broken by second Leaflet addCircleMarkers call

I am working with multivariate data, linking Leaflet and d3scatter plots. It works well for one variable. If I try to include a second variable in Leaflet via a second addCircleMarkers and addLayersControl, then the SharedData links break: the filtering doesn't work and the brushing doesn't work. Thanks in advance.
A MWE is attached:
library("crosstalk")
library("d3scatter")
library("leaflet")
Long <- c(117.4,117.5,117.6)
Lat<- c(-33.7,-33.8,-33.9)
var1 <- c(21,22,23)
var2 <- c(31,32,33)
species <- c(8,9,10)
df1<- data.frame(Long, Lat, var1, var2, species)
sdf1 <- SharedData$new(df1)
col_1 <- c( "yellow" ,"black" ,"orange")
col_2 <- c("red" ,"green" ,"blue")
l <- leaflet(sdf1) %>%
  setView(117.5, -33.8, 10) %>%
  addCircleMarkers(radius = 1, color = col_1, group = "1") %>%
  # addCircleMarkers(radius = 1, color = col_2, group = "2") %>%
  # PROBLEM: adding the second addCircleMarkers enables the overlayGroups, but
  # it breaks the link between the plots and breaks the filter
  addLayersControl(overlayGroups = c("1", "2"))
m <- list(l, filter_checkbox("unique_id_for_species", "Animal Species", sdf1, ~species))
n <- list(d3scatter(sdf1, ~var2, ~var1, color = ~species, x_lim = c(30,40), y_lim = c(20,25), width="70%", height=200),
d3scatter(sdf1, ~var1, ~var2, color = ~species, y_lim = c(30,40), x_lim = c(20,25), width="70%", height=200))
bscols(m, n)
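One crosstalk-level idea to try (a sketch, not verified against leaflet's crosstalk integration): give each marker layer its own SharedData object sharing the same key and group, so the selection and filter state stay linked while Leaflet sees two separate data sources; the d3scatter and filter_checkbox calls would then use any object from that group.
sdf_a <- SharedData$new(df1, key = ~seq_len(nrow(df1)), group = "shared_pts")
sdf_b <- SharedData$new(df1, key = ~seq_len(nrow(df1)), group = "shared_pts")
l2 <- leaflet() %>%
  setView(117.5, -33.8, 10) %>%
  addCircleMarkers(data = sdf_a, radius = 1, color = col_1, group = "1") %>%
  addCircleMarkers(data = sdf_b, radius = 1, color = col_2, group = "2") %>%
  addLayersControl(overlayGroups = c("1", "2"))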

Very long run times for drake, compared to when I run the code directly in R

I am trying to use drake for my workflow. It seems to have a lot of potential, but I noticed that drake takes a very long time to run: even simple steps that take less than a second when I run them "manually" can take 20 seconds or more when run with drake.
I'm aware that I did not provide enough details on this problem. Please tell me what kind of details to provide, and I will do so.
The dataset contains protein levels (a few tens of proteins) measured in patients undergoing various treatments. The protein levels are read from an ExpressionSet object, and then a linear model (including contrasts) is fitted for each of these proteins. Here are the essential parts of the code:
pt_df_for_lm <- function(protein, eset){
  as.data.frame(exprs(eset)[protein, ]) %>%
    rownames_to_column(var = "Sample.Name") %>%
    magrittr::set_colnames(c("Sample.Name", "pt_level")) %>%
    as_tibble() %>%
    inner_join(pData(eset), by = "Sample.Name") %>%
    mutate(drug.visit = ifelse(visit_id == "W0", "W0", paste0(drug.dose, ".", visit_id))) %>%
    mutate(drug.visit = fct_relevel(factor(drug.visit), "W0")) %>%
    select(Sample.Name, drug.dose, patient_id, visit_id, drug.visit, pt_level)
}
lm_contrasts_drug_vs_placebo <- function(res_lm){
  coef_names <- names(coef(res_lm))
  contrasts_mat <-
    tibble(coef = coef_names) %>%
    filter(!grepl("patient_id", coef)) %>%
    mutate(term = make.names(sub("drug.visit", "", coef))) %>%
    inner_join(possible_terms_df) %>%
    filter(drug != "Placebo") %>%
    mutate(contrast_name = paste0(make.names(drg.ds), ".", week, " - Placebo.0.", week)) %>%
    mutate(coef_placebo = paste0("drug.visitPlacebo.0.", week)) %>%
    mutate(contrast_vector = map2(coef, coef_placebo, function(cf_drug, cf_placebo){
      contrast_vector <- rep(0, length(coef_names))
      contrast_vector[which(coef_names == cf_drug)] <- 1
      contrast_vector[which(coef_names == cf_placebo)] <- -1
      return(contrast_vector)
    })) %>%
    transmute(contrast_tbl = map2(contrast_name, contrast_vector, function(cname, cvec){
      ctbl <- enframe(cvec, name = NULL)
      names(ctbl) <- cname
      return(ctbl)
    })) %>%
    deframe() %>%
    bind_cols() %>%
    as.matrix() %>%
    magrittr::set_rownames(coef_names) %>%
    t()
  contrast_results_df <-
    multcomp::glht(model = res_lm, linfct = contrasts_mat) %>%
    summary() %>%
    broom::tidy() %>%
    dplyr::select(-rhs) %>%
    rename(term = lhs)
  possible_terms_df %>%
    inner_join(contrast_results_df)
}
plan <- drake_plan(
  pt_eset = target(readRDS(paste0(INDIR, "pt_results.rds"))),
  pt_df = target(pt_df_for_lm(prot, pt_eset),
                 transform = map(prot = !!all_proteins)),
  res_pt_lm = target(lm(pt_level ~ 0 + patient_id + drug.visit, data = pt_df),
                     transform = map(prot, .id = prot)),
  res_pt_lm_df = target(res_pt_lm %>%
                          broom::tidy() %>%
                          filter(!grepl("patient_id", term)) %>%
                          mutate(term = make.names(sub("drug.visit", "", term))) %>%
                          mutate(protein = prot) %>%
                          select(protein, everything()),
                        transform = map(res_pt_lm, prot, .id = prot)),
  res_pt_lm_contrasts_df = target(lm_contrasts_drug_vs_placebo(res_pt_lm) %>%
                                    mutate(protein = prot),
                                  transform = map(res_pt_lm, prot, .id = prot)),
  combined_res_pt_lm_df = target(bind_rows(res_pt_lm_df, res_pt_lm_contrasts_df),
                                 transform = combine(res_pt_lm_df, res_pt_lm_contrasts_df)),
  output_res_pt_lm_df = write_csv(combined_res_pt_lm_df,
                                  file_out(!!file.path(OUTDIR, "pt_lm_results.csv"))),
  trace = TRUE
)
config <- drake_config(plan)
# vis_drake_graph(config)
make(plan, lock_envir = FALSE)
The code is placed within an R Markdown notebook.
Gil
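One thing worth checking (a sketch, on the assumption that the overhead comes from drake hashing and storing large targets such as the ExpressionSet): drake's make() exposes knobs for memory and caching behavior that can matter for plans with many targets.
make(plan,
     lock_envir = FALSE,
     memory_strategy = "autoclean",  # drop targets from memory once they are no longer needed
     garbage_collection = TRUE)      # run gc() between targets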
