How do I fix the "Error in validate_function_class():" within Tidymodels when trying to explore Random Forest Metrics - random-forest

`# Create a split object
train_test_split <-
rsample::initial_split(
data = nomissingprep,
prop = 0.80
)
# Split the data and build a training and testing data set
train_test_split <- rsample::initial_split(data = nomissingprep,prop = 0.80)
train.data <- train_test_split %>% training()
test.data <- train_test_split %>% testing()
## Recipe Creation
rec <- recipe(preprecentyear ~ ., data = train.data)
## Validation Set
cv_folds <-
vfold_cv(train.data,
v = 5,
strata = preprecentyear)
## Model Fitting -- Random Forest
library(ranger)
rf_spec <-
rand_forest() %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
## Workflow --Random Forest
rf_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(rf_spec)
##Random Forest Metrics
rf_res <-
rf_wflow %>%
fit_resamples(
resamples = cv_folds,
metrics = metric_set(
recall, precision, f_meas,
accuracy, kap,
roc_auc, sens, spec),
control = control_resamples(save_pred = TRUE)
)
`
Error in validate_function_class():
!
The combination of metric functions must be:
only numeric metrics
a mix of class metrics and class probability metrics
The following metric function types are being mixed:
other (recall namespace:caret, precision namespace:caret, spec namespace:readr)
class (f_meas, accuracy, kap, sens)
prob (roc_auc)
I am unsure of how to fix this error. All other code prior to the Random Forest Metrics fit well. Any advice is more than welcome. Thanks

log_res <-
tidymodels::tidymodels_prefer( log_wflow %>%
fit_resamples(
resamples = cross_validation,
metrics = metric_set(
kap, sensitivity, specificity, precision_vec, recall_vec),
control = control_resamples(
save_pred = TRUE)
)
)
log_res
Try it this way.

Related

Variable importance from a tidymodels/stacks output?

Is it possible to retrieve the variable importance for one, many, or the full stacked model after running tidymodels/stacks? This is not yet supported by the VIP package, but is there an alternative method to extracting that information?
Using the bulk of the blog from Simon Couch here this is what I am generally trying to attempt. Instead I will use random forests and SVMs to then try to retrieve a variable importance.
library(tidyverse)
library(tidymodels)
library(stacks)
library(vip)
wind_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
wind <-
wind_raw %>%
dplyr::select(
province_territory,
total_project_capacity_mw,
turbine_rated_capacity_kw = turbine_rated_capacity_k_w,
rotor_diameter_m,
hub_height_m,
year = commissioning_date
) %>%
group_by(province_territory) %>%
mutate(
year = as.numeric(year),
province_territory = case_when(
n() < 50 ~ "Other",
TRUE ~ province_territory
)
) %>%
filter(!is.na(year)) %>%
ungroup() %>%
drop_na(turbine_rated_capacity_kw)
# split into training and testing sets
set.seed(1)
wind_split <- initial_split(wind)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)
# use a 5-fold cross-validation
set.seed(1)
folds <- rsample::vfold_cv(wind_train, v = 5)
# set up a basic recipe
wind_rec <-
recipe(turbine_rated_capacity_kw ~ ., data = wind_train) %>%
step_impute_knn(all_predictors()) %>%
step_dummy(all_nominal()) %>%
step_zv(all_predictors())
# define a minimal workflow
wind_wflow <-
workflow() %>%
add_recipe(wind_rec)
ctrl_res <- control_stack_resamples()
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_mode('regression') %>%
set_engine("ranger", importance = "impurity")
# add it to a workflow
rf_wflow <-
wind_wflow %>%
add_model(rf_spec)
# tune cost and rand_forest and fit to the 5-fold cv
set.seed(1)
rf_res <-
tune_grid(
rf_wflow ,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# define a model using parsnip
svm_spec <-
svm_rbf(
cost = tune(),
rbf_sigma = tune()
) %>%
set_engine("kernlab") %>%
set_mode("regression")
# add it to a workflow
svm_wflow <-
wind_wflow %>%
add_model(svm_spec)
# tune cost and rbf_sigma and fit to the 5-fold cv
set.seed(1)
svm_res <-
tune_grid(
svm_wflow,
resamples = folds,
grid = 5,
control = ctrl_grid
)
# add the models to the stack
wind_data_st <-
stacks() %>%
add_candidates(rf_res) %>%
add_candidates(svm_res) %>%
blend_predictions() %>%
fit_members()
# attempt to plot the variable importance of the stacked model
wind_data_st %>%
vip()
I return Error: Model-specific variable importance scores are currently not available for this type of model., which is self explanatory, but is there a work around to extract this information? Maybe outside of VIP? Is it possible to pluck out one of the viable models that went into the stack to evaluated? Does anyone know if VIP is planning on putting out a solution to this? Thanks in advance!
I've had a similar issue, and what I've done is make a tibble of variable importance for each member of the stack, then normalize them onto the same scale, and multiply by their relative weight in the stack to have a summed total relative importance.
I couldn't reproduce your code, but here's an example of what you can try...
After you've ran blend_predictions(), you can extract the weights. Then create a tibble for each that includes a column for Variable and a column for importance. Then join together and you'll have the weight importance.
library("DALEX")
library("dplyr")
library("tidymodels")
colnames(fifa)
fifa_small <- fifa %>%
select(value_eur, age,
attacking_crossing:attacking_volleys,
defending_marking:defending_sliding_tackle) %>%
as_tibble() %>% dplyr::slice_sample(n = 1000)
fifa_small_folds <- vfold_cv(fifa_small, v = 8, repeats = 1)
fifa_small_folds
basic_rec <-
recipe(value_eur ~ ., data = fifa_small) %>%
step_nzv(all_numeric_predictors()) %>%
step_normalize(all_numeric(), -all_outcomes())
model1 <-
boost_tree(trees = 1000) %>%
set_engine('xgboost', importance = TRUE) %>%
set_mode('regression')
model2 <-
linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine('glmnet')
model3 <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_engine('glmnet')
wfs <-
workflow_set(
preproc = list(basic_rec),
models = list(model1, model2, model3),
cross = T )
wfs
doParallel::registerDoParallel()
wfs_rs <-
workflow_map(
wfs,
"tune_grid",
resamples = fifa_small_folds,
grid = 10,
control = control_grid(save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
) )
doParallel::stopImplicitCluster()
library(stacks)
tidymodels_prefer()
wfs_stack <-
stacks() %>%
add_candidates(wfs_rs)
blend_ens <- blend_predictions(wfs_stack, penalty = 10^seq(-2, 0, length = 10))
blend_ens
ens1_wt <- stacks:::top_coefs(blend_ens) %>% slice(1) %>% pull(weight)
ens2_wt <- stacks:::top_coefs(blend_ens) %>% slice(2) %>% pull(weight)
## Get the workflowset
individ_ens1_best_fit <- extract_workflow(wfs_rs, id = "recipe_boost_tree")
## extract the tuned results from the workflow
individ_ens1_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_boost_tree",
"result"][[1]][[1]]
individ_ens1_lowest_rmse <- individ_ens1_best_tuned %>%
show_best("rmse") %>%
slice(1)
## fit the final model
individ_ens1_best_final <- finalize_workflow(individ_ens1_best_fit, individ_ens1_lowest_rmse)
individ_ens1_bestfinal_1 <- individ_ens1_best_final %>% fit(fifa_small)
individ_ens1_vi_tbl <- individ_ens1_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi() %>%
mutate(
ens1_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
blend_ens
ens2_config <- ens_name_fn(blend_ens, 2)
ens2_id <- ens_id_fn(blend_ens, 2)
## Get the workflowset
individ_ens2_best_fit <- extract_workflow(wfs_rs, id = "recipe_linear_reg_3")
## extract the tuned results from the best workflow
individ_ens2_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_linear_reg_3",
"result"][[1]][[1]]
individ_ens2_lowest_rmse <- individ_ens2_best_tuned %>%
show_best("rmse") %>% filter(.config == "Preprocessor1_Model01") %>% slice(1)
## fit the final model
individ_ens2_best_final <- finalize_workflow(individ_ens2_best_fit, individ_ens2_lowest_rmse)
individ_ens2_bestfinal_1 <- individ_ens2_best_final %>% fit(fifa_small)
individ_ens2_vi_tbl <- individ_ens2_bestfinal_1 %>%
extract_fit_parsnip() %>%
vip::vi(lambda = individ_ens2_lowest_rmse$penalty) %>% # include lambda for lasso or ridge
mutate(
ens2_Importance = abs(Importance),
Variable = factor(Variable), .keep = "unused")
ens_vi_joined <- individ_ens1_vi_tbl %>%
left_join(individ_ens2_vi_tbl, by = c("Variable")) %>%
mutate(across(2:ncol(.), ~ifelse(is.na(.), 0, .)),
ens1_normed = ens1_Importance/ sum(ens1_Importance),
ens2_normed = ens2_Importance/ sum(ens2_Importance),
ens1_wted = ens1_normed * ens1_wt,
ens2_wted = ens2_normed * ens2_wt,
) %>%
rowwise() %>%
mutate(summed_importance = sum(c_across(ends_with("wted"))) ) %>%
ungroup() %>%
mutate(
total_importance = summed_importance/ sum(summed_importance), #normalized
)
ens_vi_joined %>% select(Variable, total_importance) %>%
ggplot(aes(total_importance, fct_reorder(Variable, total_importance)))+
geom_col()

Linear Regression in Tensor Flow - Error when modifying getting started code

I am very new to TensorFlow and I am in parallel learning traditional machine learning techniques. Previously, I was able to successfully implement linear regression modelling in matlab and in Python using scikit.
When I tried to reproduce it using Tensorflow with the same dataset, I am getting invalid outputs. Could someone advise me on where I am making the mistake or what I am missing!
Infact, I am using the code from tensor flow introductory tutorial and I just changed the x_train and y_train to a different data set.
# Loading the ML coursera course ex1 (Wk 2) data to try it out
'''
path = r'C:\Users\Prasanth\Dropbox\Python Folder\ML in Python\data\ex1data1.txt'
fh = open(path,'r')
l1 = []
l2 = []
for line in fh:
temp = (line.strip().split(','))
l1.append(float(temp[0]))
l2.append(float(temp[1]))
'''
l1 = [6.1101, 5.5277, 8.5186, 7.0032, 5.8598, 8.3829, 7.4764, 8.5781, 6.4862, 5.0546, 5.7107, 14.164, 5.734, 8.4084, 5.6407, 5.3794, 6.3654, 5.1301, 6.4296, 7.0708, 6.1891, 20.27, 5.4901, 6.3261, 5.5649, 18.945, 12.828, 10.957, 13.176, 22.203, 5.2524, 6.5894, 9.2482, 5.8918, 8.2111, 7.9334, 8.0959, 5.6063, 12.836, 6.3534, 5.4069, 6.8825, 11.708, 5.7737, 7.8247, 7.0931, 5.0702, 5.8014, 11.7, 5.5416, 7.5402, 5.3077, 7.4239, 7.6031, 6.3328, 6.3589, 6.2742, 5.6397, 9.3102, 9.4536, 8.8254, 5.1793, 21.279, 14.908, 18.959, 7.2182, 8.2951, 10.236, 5.4994, 20.341, 10.136, 7.3345, 6.0062, 7.2259, 5.0269, 6.5479, 7.5386, 5.0365, 10.274, 5.1077, 5.7292, 5.1884, 6.3557, 9.7687, 6.5159, 8.5172, 9.1802, 6.002, 5.5204, 5.0594, 5.7077, 7.6366, 5.8707, 5.3054, 8.2934, 13.394, 5.4369]
l2 = [17.592, 9.1302, 13.662, 11.854, 6.8233, 11.886, 4.3483, 12.0, 6.5987, 3.8166, 3.2522, 15.505, 3.1551, 7.2258, 0.71618, 3.5129, 5.3048, 0.56077, 3.6518, 5.3893, 3.1386, 21.767, 4.263, 5.1875, 3.0825, 22.638, 13.501, 7.0467, 14.692, 24.147, -1.22, 5.9966, 12.134, 1.8495, 6.5426, 4.5623, 4.1164, 3.3928, 10.117, 5.4974, 0.55657, 3.9115, 5.3854, 2.4406, 6.7318, 1.0463, 5.1337, 1.844, 8.0043, 1.0179, 6.7504, 1.8396, 4.2885, 4.9981, 1.4233, -1.4211, 2.4756, 4.6042, 3.9624, 5.4141, 5.1694, -0.74279, 17.929, 12.054, 17.054, 4.8852, 5.7442, 7.7754, 1.0173, 20.992, 6.6799, 4.0259, 1.2784, 3.3411, -2.6807, 0.29678, 3.8845, 5.7014, 6.7526, 2.0576, 0.47953, 0.20421, 0.67861, 7.5435, 5.3436, 4.2415, 6.7981, 0.92695, 0.152, 2.8214, 1.8451, 4.2959, 7.2029, 1.9869, 0.14454, 9.0551, 0.61705]
print ('List length and data type', len(l1), type(l1))
#------------------#
import tensorflow as tf
# Model parameters
W = tf.Variable([0], dtype=tf.float64)
b = tf.Variable([0], dtype=tf.float64)
# Model input and output
x = tf.placeholder(tf.float64)
linear_model = W * x + b
y = tf.placeholder(tf.float64)
# loss or cost function
loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
# optimizer (gradient descent) with learning rate = 0.01
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)
# training data (labelled input & output swt)
# Using coursera data instead of sample data
#x_train = [1.0, 2, 3, 4]
#y_train = [0, -1, -2, -3]
x_train = l1
y_train = l2
# training loop (1000 iterations)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init) # reset values to wrong
for i in range(1000):
sess.run(train, {x: x_train, y: y_train})
# evaluate training accuracy
curr_W, curr_b, curr_loss = sess.run([W, b, loss], {x: x_train, y: y_train})
print("W: %s b: %s loss: %s"%(curr_W, curr_b, curr_loss))
Output
List length and data type: 97 <class 'list'>
W: [ nan] b: [ nan] loss: nan
One major problem with your estimator is the loss function. Since you use tf.reduce_sum, the loss grows with the number of samples, which you have to compensate by using a smaller learning rate. A better solution would be to use mean square error loss
loss = tf.reduce_mean(tf.square(linear_model - y))

Tensorflow KNN : How can we assign the K parameter for defining number of neighbors in KNN?

I have started working on a machine learning project using K-Nearest-Neighbors method on python tensorflow library. I have no experience working with tensorflow tools, so I found some code in github and modified it for my data.
My dataset is like this:
2,2,2,2,0,0,3
2,2,2,2,0,1,0
2,2,2,4,2,2,1
...
2,2,2,4,2,0,0
And this is the code which actually works fine:
import tensorflow as tf
import numpy as np
# Whole dataset => 1428 samples
dataset = 'car-eval-data-1.csv'
# samples for train, remaining for test
samples = 1300
reader = np.loadtxt(open(dataset, "rb"), delimiter=",", skiprows=1, dtype=np.int32)
train_x, train_y = reader[:samples,:5], reader[:samples,6]
test_x, test_y = reader[samples:, :5], reader[samples:, 6]
# Placeholder you can assign values in future. its kind of a variable
# v = ("variable type",[None,4]) -- you can have multidimensional values here
training_values = tf.placeholder("float",[None,len(train_x[0])])
test_values = tf.placeholder("float",[len(train_x[0])])
# MANHATTAN distance
distance = tf.abs(tf.reduce_sum(tf.square(tf.subtract(training_values,test_values)),reduction_indices=1))
prediction = tf.arg_min(distance, 0)
init = tf.global_variables_initializer()
accuracy = 0.0
with tf.Session() as sess:
sess.run(init)
# Looping through the test set to compare against the training set
for i in range (len(test_x)):
# Tensor flow method to get the prediction near to the test parameters in the training set.
index_in_trainingset = sess.run(prediction, feed_dict={training_values:train_x,test_values:test_x[i]})
print("Test %d, and the prediction is %s, the real value is %s"%(i,train_y[index_in_trainingset],test_y[i]))
if train_y[index_in_trainingset] == test_y[i]:
# if prediction is right so accuracy increases.
accuracy += 1. / len(test_x)
print('Accuracy -> ', accuracy * 100, ' %')
The only thing I do not understand is that if it's the KNN method so there has to be some K parameter which defines the number of neighbors for predicting the label for each test sample.
How can we assign the K parameter to tune the number of nearest neighbors for the code?
Is there any way to modify this code to make use of K parameter?
You're right that the example above does not have the provision to select K-Nearest neighbours. In the code below, I have added the ability to add such a parameter(knn_size) along with other corrections
import tensorflow as tf
import numpy as np
# Whole dataset => 1428 samples
dataset = 'PATH_TO_DATASET_CSV'
knn_size = 1
# samples for train, remaining for test
samples = 1300
reader = np.loadtxt(open(dataset, "rb"), delimiter=",", skiprows=1, dtype=np.int32)
train_x, train_y = reader[:samples,:6], reader[:samples,6]
test_x, test_y = reader[samples:, :6], reader[samples:, 6]
# Placeholder you can assign values in future. its kind of a variable
# v = ("variable type",[None,4]) -- you can have multidimensional values here
training_values = tf.placeholder("float",[None, len(train_x[0])])
test_values = tf.placeholder("float",[len(train_x[0])])
# MANHATTAN distance
distance = tf.abs(tf.reduce_sum(tf.square(tf.subtract(training_values,test_values)),reduction_indices=1))
# Here, we multiply the distance by -1 to reverse the magnitude of distances, i.e. the largest distance becomes the smallest distance
# tf.nn.top_k returns the top k values and their indices, here k is controlled by the parameter knn_size
k_nearest_neighbour_values, k_nearest_neighbour_indices = tf.nn.top_k(tf.scalar_mul(-1,distance),k=knn_size)
#Based on the indices we obtain from the previous step, we locate the exact class label set of the k closest matches in the training data
best_training_labels = tf.gather(train_y,k_nearest_neighbour_indices)
if knn_size==1:
prediction = tf.squeeze(best_training_labels)
else:
# Now we make our prediction based on the class label that appears most frequently
# tf.unique_with_counts() gives us all unique values that appear in a 1-D tensor along with their indices and counts
values, indices, counts = tf.unique_with_counts(best_training_labels)
# This gives us the index of the class label that has repeated the most
max_count_index = tf.argmax(counts,0)
#Retrieve the required class label
prediction = tf.gather(values,max_count_index)
init = tf.global_variables_initializer()
accuracy = 0.0
with tf.Session() as sess:
sess.run(init)
# Looping through the test set to compare against the training set
for i in range (len(test_x)):
# Tensor flow method to get the prediction near to the test parameters in the training set.
prediction_value = sess.run([prediction], feed_dict={training_values:train_x,test_values:test_x[i]})
print("Test %d, and the prediction is %s, the real value is %s"%(i,prediction_value[0],test_y[i]))
if prediction_value[0] == test_y[i]:
# if prediction is right so accuracy increases.
accuracy += 1. / len(test_x)
print('Accuracy -> ', accuracy * 100, ' %')

How to split data on balanced training set and test set on sklearn

I am using sklearn for multi-classification task. I need to split alldata into train_set and test_set. I want to take randomly the same sample number from each class.
Actually, I amusing this function
X_train, X_test, y_train, y_test = cross_validation.train_test_split(Data, Target, test_size=0.3, random_state=0)
but it gives unbalanced dataset! Any suggestion.
Although Christian's suggestion is correct, technically train_test_split should give you stratified results by using the stratify param.
So you could do:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(Data, Target, test_size=0.3, random_state=0, stratify=Target)
The trick here is that it starts from version 0.17 in sklearn.
From the documentation about the parameter stratify:
stratify : array-like or None (default is None)
If not None, data is split in a stratified fashion, using this as the labels array.
New in version 0.17: stratify splitting
You can use StratifiedShuffleSplit to create datasets featuring the same percentage of classes as the original one:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
X = np.array([[1, 3], [3, 7], [2, 4], [4, 8]])
y = np.array([0, 1, 0, 1])
stratSplit = StratifiedShuffleSplit(y, n_iter=1, test_size=0.5, random_state=42)
for train_idx, test_idx in stratSplit:
X_train=X[train_idx]
y_train=y[train_idx]
print(X_train)
# [[3 7]
# [2 4]]
print(y_train)
# [1 0]
If the classes are not balanced but you want the split to be balanced, then stratifying isn't going to help. There doesn't seem to be a method for doing balanced sampling in sklearn but it's kind of easy using basic numpy, for example a function like this might help you:
def split_balanced(data, target, test_size=0.2):
classes = np.unique(target)
# can give test_size as fraction of input data size of number of samples
if test_size<1:
n_test = np.round(len(target)*test_size)
else:
n_test = test_size
n_train = max(0,len(target)-n_test)
n_train_per_class = max(1,int(np.floor(n_train/len(classes))))
n_test_per_class = max(1,int(np.floor(n_test/len(classes))))
ixs = []
for cl in classes:
if (n_train_per_class+n_test_per_class) > np.sum(target==cl):
# if data has too few samples for this class, do upsampling
# split the data to training and testing before sampling so data points won't be
# shared among training and test data
splitix = int(np.ceil(n_train_per_class/(n_train_per_class+n_test_per_class)*np.sum(target==cl)))
ixs.append(np.r_[np.random.choice(np.nonzero(target==cl)[0][:splitix], n_train_per_class),
np.random.choice(np.nonzero(target==cl)[0][splitix:], n_test_per_class)])
else:
ixs.append(np.random.choice(np.nonzero(target==cl)[0], n_train_per_class+n_test_per_class,
replace=False))
# take same num of samples from all classes
ix_train = np.concatenate([x[:n_train_per_class] for x in ixs])
ix_test = np.concatenate([x[n_train_per_class:(n_train_per_class+n_test_per_class)] for x in ixs])
X_train = data[ix_train,:]
X_test = data[ix_test,:]
y_train = target[ix_train]
y_test = target[ix_test]
return X_train, X_test, y_train, y_test
Note that if you use this and sample more points per class than in the input data, then those will be upsampled (sample with replacement). As a result, some data points will appear multiple times and this may have an effect on the accuracy measures etc. And if some class has only one data point, there will be an error. You can easily check the numbers of points per class for example with np.unique(target, return_counts=True)
Another approach is to over- or under- sample from your stratified test/train split. The imbalanced-learn library is quite handy for this, specially useful if you are doing online learning & want to guarantee balanced train data within your pipelines.
from imblearn.pipeline import Pipeline as ImbalancePipeline
model = ImbalancePipeline(steps=[
('data_balancer', RandomOverSampler()),
('classifier', SVC()),
])
This is my implementation that I use to get train/test data indexes
def get_safe_balanced_split(target, trainSize=0.8, getTestIndexes=True, shuffle=False, seed=None):
classes, counts = np.unique(target, return_counts=True)
nPerClass = float(len(target))*float(trainSize)/float(len(classes))
if nPerClass > np.min(counts):
print("Insufficient data to produce a balanced training data split.")
print("Classes found %s"%classes)
print("Classes count %s"%counts)
ts = float(trainSize*np.min(counts)*len(classes)) / float(len(target))
print("trainSize is reset from %s to %s"%(trainSize, ts))
trainSize = ts
nPerClass = float(len(target))*float(trainSize)/float(len(classes))
# get number of classes
nPerClass = int(nPerClass)
print("Data splitting on %i classes and returning %i per class"%(len(classes),nPerClass ))
# get indexes
trainIndexes = []
for c in classes:
if seed is not None:
np.random.seed(seed)
cIdxs = np.where(target==c)[0]
cIdxs = np.random.choice(cIdxs, nPerClass, replace=False)
trainIndexes.extend(cIdxs)
# get test indexes
testIndexes = None
if getTestIndexes:
testIndexes = list(set(range(len(target))) - set(trainIndexes))
# shuffle
if shuffle:
trainIndexes = random.shuffle(trainIndexes)
if testIndexes is not None:
testIndexes = random.shuffle(testIndexes)
# return indexes
return trainIndexes, testIndexes
This is the function I am using. You can adapt it and optimize it.
# Returns a Test dataset that contains an equal amounts of each class
# y should contain only two classes 0 and 1
def TrainSplitEqualBinary(X, y, samples_n): #samples_n per class
indicesClass1 = []
indicesClass2 = []
for i in range(0, len(y)):
if y[i] == 0 and len(indicesClass1) < samples_n:
indicesClass1.append(i)
elif y[i] == 1 and len(indicesClass2) < samples_n:
indicesClass2.append(i)
if len(indicesClass1) == samples_n and len(indicesClass2) == samples_n:
break
X_test_class1 = X[indicesClass1]
X_test_class2 = X[indicesClass2]
X_test = np.concatenate((X_test_class1,X_test_class2), axis=0)
#remove x_test from X
X_train = np.delete(X, indicesClass1 + indicesClass2, axis=0)
Y_test_class1 = y[indicesClass1]
Y_test_class2 = y[indicesClass2]
y_test = np.concatenate((Y_test_class1,Y_test_class2), axis=0)
#remove y_test from y
y_train = np.delete(y, indicesClass1 + indicesClass2, axis=0)
if (X_test.shape[0] != 2 * samples_n or y_test.shape[0] != 2 * samples_n):
raise Exception("Problem with split 1!")
if (X_train.shape[0] + X_test.shape[0] != X.shape[0] or y_train.shape[0] + y_test.shape[0] != y.shape[0]):
raise Exception("Problem with split 2!")
return X_train, X_test, y_train, y_test

Tuning parameters in caret error despite assigning grids and as.factor

Any help appreciated. Been at this for weeks. :(
install.packages("klaR", dependencies=TRUE)
library(klaR)
install.packages("caret", dependencies=TRUE)
library(caret)
install.packages("e1071", dependencies=TRUE)
library(e1071)
install.packages("gmodels", dependencies=TRUE)
library(gmodels)
install.packages("gbm", dependencies=TRUE)
library(gbm)
install.packages("foreach", dependencies=TRUE)
library(foreach)
Load Grading Data
grading <- read.csv("~/PA_DataFinal/GradingData160315.csv")
create stratified sample # 1%
dfstrat <- stratified(grading, "FailPass", .01)
save(dfstrat, file = "c:/Users/gillisn/Documents/PA_DataFinal/RResults/GradingRResults/iteration 1/dfstrat.rda")
split data into train and test #75:25. FailPass is the responseVble
set.seed(1)
inTrainingSet <- createDataPartition(dfstrat$FailPass, p = .75, list = FALSE)
trainSet <- dfstrat[inTrainingSet,]
testSet <- dfstrat[-inTrainingSet, ]
set predictors and labels
There are 48 labels and its the last one that want to train on.
Take all the predictors 1-47
x,y is training data
x <- trainSet[,-48]
y <- as.factor(trainSet$FailPass)
i,j is test data
i <- testSet[,-48,]
j <- as.factor(testSet$FailPass)
Set Training control parameters
Bootstrapping itself around in 25 times.
bootControl <- trainControl(number = 25)
The grid is for the decision tree
gbmGrid <- expand.grid(.interaction.depth = (1:5) * 2, .n.trees = (1:10)*25, .shrinkage = .1)
nbGrid <- expand.grid(.fL=0, .usekernel=FALSE)
svmGrid >- expandGrid(.sigma=, .c=)
set.seed(2)
Train the models
naive bayes
nbFit <- train(x,y,method='nb',tuneGrid="nbGrid")
svm
svmFit <- train(x, y,method = "svmRadial", tuneLength = 10,trControl = bootControl, scaled = FALSE)
gbm
gbmFit <- train(x, y,method = "gbm", trControl = bootControl, verbose = FALSE, bag.fraction = 0.5, tuneGrid = gbmGrid)
predict the models on training data
models <- list(svm = svmFit, nb = nbFit, gbm = gbmFit)
predict(models)

Resources