How to fix the dataTest error in a time series in R? (HoltWinters)

#Training and Test Split
split <- ceiling(0.7 * length(data))
dataTrain <- ts(data[1:split], frequency = 15, start = c(2021,5))
dataTest <- ts(data[c((split+1) : nrow(data))], frequency = 15, start = c(2022,5))
actual <- unclass(dataTrain)
actualFull <- unclass(data)
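One likely cause, hedged since the exact error message isn't shown: the code mixes length(data) and nrow(data). If data is a plain vector, nrow(data) is NULL and the (split + 1):nrow(data) subscript fails; if it is a data frame, length(data) counts columns rather than rows. A minimal sketch using one consistent length (the single-column extraction and the HoltWinters call at the end are assumptions about the intended workflow):
y <- if (is.data.frame(data)) data[[1]] else data  # assumes a single series
n <- length(y)
split <- ceiling(0.7 * n)
full_ts   <- ts(y, frequency = 15, start = c(2021, 5))
dataTrain <- window(full_ts, end = time(full_ts)[split])        # first 70%
dataTest  <- window(full_ts, start = time(full_ts)[split + 1])  # remaining 30%
fit   <- HoltWinters(dataTrain)
fcast <- predict(fit, n.ahead = length(dataTest))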

Related

how to use tune_nested() of mlr3tuning?

rm(list = ls())
library(mlr3verse)

task <- tsk("pima")
learner <- lrn("classif.rpart")
measure <- msr("classif.ce")
inner_resample <- rsmp("cv", folds = 5)
outer_resample <- rsmp("cv", folds = 5)
search_space <- ps(
  cp = p_dbl(lower = 0.001, upper = 0.1)
)
rr <- tune_nested(
  method = "grid_search",
  resolution = 5,
  task = task,
  learner = learner,
  inner_resampling = inner_resample,
  outer_resampling = outer_resample,
  search_space = search_space,
  term_evals = 5
)
I always get this error:
Error in terminator_selection(term_evals, term_time) :
Assertion on 'term_evals' failed: Must be of type 'single integerish value' (or 'NULL'), not 'ParamSet/R6'.
I don't know what's wrong with my code. Can someone give me some suggestions?
Thanks a lot.
You found a bug. We will fix this. However, if you set the measure, it should work.
library(mlr3verse)

task <- tsk("pima")
learner <- lrn("classif.rpart")
measure <- msr("classif.ce")
inner_resample <- rsmp("cv", folds = 5)
outer_resample <- rsmp("cv", folds = 5)
search_space <- ps(
  cp = p_dbl(lower = 0.001, upper = 0.1)
)
rr <- tune_nested(
  method = "grid_search",
  resolution = 5,
  task = task,
  learner = learner,
  inner_resampling = inner_resample,
  outer_resampling = outer_resample,
  measure = measure,
  search_space = search_space,
  term_evals = 5
)
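If the call runs through, a small follow-up sketch (assuming the objects defined above and a current mlr3tuning) for inspecting the nested resampling result:
# Unbiased performance estimate from the outer folds
rr$aggregate(measure)
rr$score(measure)
# Hyperparameters selected by each inner tuning run
extract_inner_tuning_results(rr)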

Need help getting method = cforest to work within train() from caret using leave one out cross-validation

examples_dataset.csv
I have tried looking up so many ways to fix this issue, but no solution so far. I am trying to train conditional inference forests with caret, using the leave-one-out cross-validation method. I have about 20 (larger) datasets to run this method on, hence the functions to automate some of the work.
A lot of what I have found suggests that my QuantBins are not factors, but I have checked after running prep_df() on the data frame and they are indeed factors. I get an error when running the conditional inference forests (cif_model()), but not with random forests (rf_model()). The output from trying to make that model is "Something is wrong; all the Accuracy metric values are missing" (pictured below).
Any help and guidance is appreciated!
## Example code
## GOAL: create train() code from caret that uses conditional inference forests to assess
## variable importance with a categorical dependent variable, using leave-one-out cross-validation
rm(list = ls())
library(caret)   # train(), trainControl()
library(dplyr)   # %>% and mutate()
library(rpart)   # rpart.control()
library(party)   # ctree_control()
# setwd("...")   # working directory path omitted in the original
ex.all <- read.csv("examples_dataset.csv", header = TRUE)
loo_ctrl <- trainControl(method = "LOOCV")

# This function works!
rf_model <- function(file.name) {
  model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p + P_Cou,
                 data = file.name, method = "rf", trControl = loo_ctrl, tuneLength = 10,
                 control = rpart.control(minbucket = 10), ntree = 50)
  return(model)
}

# This does not.
cif_model <- function(file.name) {
  model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p + P_Cou,
                 data = file.name, method = "cforest", trControl = loo_ctrl, tuneLength = 10,
                 control = ctree_control(minbucket = 10), ntree = 50)
  return(model)
}

##### functions used #####
prep_df <- function(file.name) {
  file.name$BINARY <- ifelse(file.name$TOTAL >= 1, "yes", "no")
  file.name$BINARY <- as.factor(file.name$BINARY)
  file.name$L_u <- as.factor(file.name$L_u)
  file.name$TOTAL <- as.numeric(file.name$TOTAL)
  ## Quantile distribution of breaks in Total Fruit
  numbers_of_bins <- 5 # this will return four groups
  file.name <- file.name %>%
    mutate(QuantBins = cut(TOTAL,
                           breaks = unique(quantile(TOTAL, probs = seq.int(0, 1, by = 1 / numbers_of_bins))),
                           include.lowest = TRUE))
  print(length(levels(file.name$QuantBins)))
  temp <- levels(file.name$QuantBins)
  file.name$QuantBins <- as.character(file.name$QuantBins)
  # Rename each bin label from "(a,b]" to "Fruit a to b"
  for (i in 1:length(file.name$QuantBins)) {
    temp1 <- strsplit(file.name$QuantBins[i], ",")
    temp2 <- strsplit(temp1[[1]][1], "\\(")
    temp3 <- strsplit(temp1[[1]][[2]], "\\]")
    file.name$QuantBins[i] <- paste("Fruit", temp2[[1]][2], "to", temp3[[1]][1])
  }
  file.name$QuantBins <- as.factor(file.name$QuantBins)
  file.name$QuantBins <- droplevels(file.name$QuantBins)
  print(length(levels(file.name$QuantBins)))
  return(file.name)
}

##### running trees #####
ex.all <- prep_df(ex.all)
ex.rf <- rf_model(ex.all)
print(ex.rf)
ex.rf$finalModel$importance
ex.cf <- cif_model(ex.all)
print(ex.cf)
ex.cf$finalModel$importance
Error using cif_model(ex.all) showing "Something is wrong; all the Accuracy metric values are missing"
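One possibility worth checking (an assumption on my part, not a verified fix): party::cforest(), which caret calls for method = "cforest", has no control or ntree arguments; it takes a single controls object built with cforest_unbiased()/cforest_control(). If the extra arguments make every LOOCV fit fail, caret reports exactly "all the Accuracy metric values are missing". A hedged sketch of what the call might look like under that assumption (untested against the example data):
library(party)
cif_model <- function(file.name) {
  # ntree and minbucket moved inside the controls object; values mirror the original intent
  model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p + P_Cou,
                 data = file.name, method = "cforest", trControl = loo_ctrl, tuneLength = 10,
                 controls = cforest_unbiased(ntree = 50, minbucket = 10))
  return(model)
}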

Forecasting using multiple seasonal STL and ARIMA

I am attempting to forecast half-hourly electricity data. The method I am using is to decompose the electricity consumption data using 'mstl' from the 'forecast' package by Rob Hyndman and then forecast the seasonally adjusted data using ARIMA.
df <- IntervalData %>% select(CONSUMPTION_MW)
length_test_set = 17520
h = 17520
# create msts object with daily, weekly and monthly seasonality
data_msts <- msts(df, seasonal.periods=c(48,48*7,365/12*48))
train_msts = msts(df[1:(nrow(df)-length_test_set),],seasonal.periods=c(48,48*7,365/12*48))
test_msts = msts(df[((nrow(df)-length_test_set)+1):(nrow(df)),],seasonal.periods=c(48,48*7,365/12*48))
fit_mstl = mstl(train_msts, iterate = 4, s.window = 19, robust = TRUE)
fcast_arima=forecast(fit_mstl,method='arima',h=h)
How do I specify the order of my ARIMA model eg. ARIMA(2,1,6)?
You will need to write your own forecast function like this (using fake data so it can be reproduced).
library(forecast)
df <- data.frame(y=rnorm(50000))
length_test_set <- 17520
h <- 17520
# create msts object with daily, weekly and monthly seasonality
data_msts <- msts(df, seasonal.periods = c(48, 48*7, 365/12*48))
train_msts <- msts(df[1:(nrow(df) - length_test_set), ], seasonal.periods = c(48, 48 * 7, 365 / 12 * 48))
test_msts <- msts(df[((nrow(df) - length_test_set) + 1):(nrow(df)), ], seasonal.periods = c(48, 48 * 7, 365 / 12 * 48))
fit_mstl <- mstl(train_msts, iterate = 4, s.window = 19, robust = TRUE)
# Function to fit specific ARIMA model and return forecasts
arima_forecast <- function(x, h, level, order, ...) {
  fit <- Arima(x, order = order, seasonal = c(0, 0, 0), ...)
  return(forecast(fit, h = h, level = level))
}
# Example using an ARIMA(3,0,0) model
fcast_arima <- forecast(fit_mstl, forecastfunction=arima_forecast, h = h, order=c(3,0,0))
Created on 2020-07-25 by the reprex package (v0.3.0)
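A small follow-up sketch, not part of the original answer: once fcast_arima exists, forecast::accuracy() can compare the forecasts with the held-out test period created above.
# Compare the forecasts against the held-out half-hours
accuracy(fcast_arima, test_msts)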

Oversampling or SMOTE in Pyspark

I have 7 classes and the total number of records is 115, and I want to run a Random Forest model on this data. But the data is not enough to get a high accuracy, so I want to apply oversampling over all the classes in such a way that the majority class itself gets a higher count and the minority classes are scaled up accordingly. Is this possible in PySpark?
+---------+-----+
| SubTribe|count|
+---------+-----+
| Chill| 10|
| Cool| 18|
|Adventure| 18|
| Quirk| 13|
| Mystery| 25|
| Party| 18|
|Glamorous| 13|
+---------+-----+
Here is another implementation of PySpark and Scala SMOTE that I have used in the past. I have copied the code across and referenced the source because it's quite small:
Pyspark:
import random
import numpy as np
from pyspark.sql import Row
from sklearn import neighbors
from pyspark.ml.feature import VectorAssembler

def vectorizerFunction(dataInput, TargetFieldName):
    if dataInput.select(TargetFieldName).distinct().count() != 2:
        raise ValueError("Target field must have only 2 distinct classes")
    columnNames = list(dataInput.columns)
    columnNames.remove(TargetFieldName)
    dataInput = dataInput.select((','.join(columnNames) + ',' + TargetFieldName).split(','))
    assembler = VectorAssembler(inputCols=columnNames, outputCol='features')
    pos_vectorized = assembler.transform(dataInput)
    vectorized = pos_vectorized.select('features', TargetFieldName).withColumn('label', pos_vectorized[TargetFieldName]).drop(TargetFieldName)
    return vectorized

def SmoteSampling(vectorized, k=5, minorityClass=1, majorityClass=0, percentageOver=200, percentageUnder=100):
    if percentageUnder > 100 or percentageUnder < 10:
        raise ValueError("Percentage Under must be in range 10 - 100")
    if percentageOver < 100:
        raise ValueError("Percentage Over must be at least 100")
    dataInput_min = vectorized[vectorized['label'] == minorityClass]
    dataInput_maj = vectorized[vectorized['label'] == majorityClass]
    # Collect the minority-class feature vectors and fit a k-NN model on them
    feature = dataInput_min.select('features')
    feature = feature.rdd
    feature = feature.map(lambda x: x[0])
    feature = feature.collect()
    feature = np.asarray(feature)
    nbrs = neighbors.NearestNeighbors(n_neighbors=k, algorithm='auto').fit(feature)
    neighbours = nbrs.kneighbors(feature)
    gap = neighbours[0]
    neighbours = neighbours[1]
    min_rdd = dataInput_min.drop('label').rdd
    pos_rddArray = min_rdd.map(lambda x: list(x))
    pos_ListArray = pos_rddArray.collect()
    min_Array = list(pos_ListArray)
    newRows = []
    nt = len(min_Array)
    nexs = int(percentageOver / 100)  # synthetic samples generated per minority record
    for i in range(nt):
        for j in range(nexs):
            neigh = random.randint(1, k)
            difs = min_Array[neigh][0] - min_Array[i][0]
            newRec = (min_Array[i][0] + random.random() * difs)
            newRows.insert(0, (newRec))
    # sc is the existing SparkContext (spark.sparkContext)
    newData_rdd = sc.parallelize(newRows)
    newData_rdd_new = newData_rdd.map(lambda x: Row(features=x, label=1))
    new_data = newData_rdd_new.toDF()
    new_data_minor = dataInput_min.unionAll(new_data)
    new_data_major = dataInput_maj.sample(False, (float(percentageUnder) / float(100)))
    return new_data_major.unionAll(new_data_minor)

dataInput = spark.read.format('csv').options(header='true', inferSchema='true').load("sam.csv").dropna()
SmoteSampling(vectorizerFunction(dataInput, 'Y'), k=2, minorityClass=1, majorityClass=0, percentageOver=90, percentageUnder=5)
Scala:
// Import the necessary packages
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.expressions.Window
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.functions.rand
import org.apache.spark.sql.functions._

object smoteClass {

  def KNNCalculation(
      dataFinal: org.apache.spark.sql.DataFrame,
      feature: String,
      reqrows: Int,
      BucketLength: Int,
      NumHashTables: Int): org.apache.spark.sql.DataFrame = {
    val b1 = dataFinal.withColumn("index", row_number().over(Window.partitionBy("label").orderBy("label")))
    val brp = new BucketedRandomProjectionLSH().setBucketLength(BucketLength).setNumHashTables(NumHashTables).setInputCol(feature).setOutputCol("values")
    val model = brp.fit(b1)
    val transformedA = model.transform(b1)
    val transformedB = model.transform(b1)
    val b2 = model.approxSimilarityJoin(transformedA, transformedB, 2000000000.0)
    require(b2.count > reqrows, println("Change bucket length or reduce the percentageOver"))
    val b3 = b2.selectExpr("datasetA.index as id1",
      "datasetA.feature as k1",
      "datasetB.index as id2",
      "datasetB.feature as k2",
      "distCol").filter("distCol>0.0").orderBy("id1", "distCol").dropDuplicates().limit(reqrows)
    return b3
  }

  def smoteCalc(key1: org.apache.spark.ml.linalg.Vector, key2: org.apache.spark.ml.linalg.Vector) = {
    val resArray = Array(key1, key2)
    val res = key1.toArray.zip(key2.toArray.zip(key1.toArray).map(x => x._1 - x._2).map(_ * 0.2)).map(x => x._1 + x._2)
    resArray :+ org.apache.spark.ml.linalg.Vectors.dense(res)
  }

  def Smote(
      inputFrame: org.apache.spark.sql.DataFrame,
      feature: String,
      label: String,
      percentOver: Int,
      BucketLength: Int,
      NumHashTables: Int): org.apache.spark.sql.DataFrame = {
    val groupedData = inputFrame.groupBy(label).count
    require(groupedData.count == 2, println("Only 2 labels allowed"))
    val classAll = groupedData.collect()
    val minorityclass = if (classAll(0)(1).toString.toInt > classAll(1)(1).toString.toInt) classAll(1)(0).toString else classAll(0)(0).toString
    val frame = inputFrame.select(feature, label).where(label + " == " + minorityclass)
    val rowCount = frame.count
    val reqrows = (rowCount * (percentOver / 100)).toInt
    val md = udf(smoteCalc _)
    val b1 = KNNCalculation(frame, feature, reqrows, BucketLength, NumHashTables)
    val b2 = b1.withColumn("ndtata", md($"k1", $"k2")).select("ndtata")
    val b3 = b2.withColumn("AllFeatures", explode($"ndtata")).select("AllFeatures").dropDuplicates
    val b4 = b3.withColumn(label, lit(minorityclass).cast(frame.schema(1).dataType))
    return inputFrame.union(b4).dropDuplicates
  }
}
Source
Maybe this project can be useful for your goal:
Spark SMOTE
But I think that 115 records aren't enough for a random forest. You could use a simpler technique, such as decision trees.
You can check this answer:
Is Random Forest suitable for very small data sets?

Tuning parameters in caret error despite assigning grids and as.factor

Any help appreciated. Been at this for weeks. :(
install.packages("klaR", dependencies=TRUE)
library(klaR)
install.packages("caret", dependencies=TRUE)
library(caret)
install.packages("e1071", dependencies=TRUE)
library(e1071)
install.packages("gmodels", dependencies=TRUE)
library(gmodels)
install.packages("gbm", dependencies=TRUE)
library(gbm)
install.packages("foreach", dependencies=TRUE)
library(foreach)
# Load the grading data
grading <- read.csv("~/PA_DataFinal/GradingData160315.csv")
# Create a stratified sample (1%); stratified() comes from the splitstackshape package
dfstrat <- stratified(grading, "FailPass", .01)
save(dfstrat, file = "c:/Users/gillisn/Documents/PA_DataFinal/RResults/GradingRResults/iteration 1/dfstrat.rda")
# Split the data into train and test (75:25). FailPass is the response variable.
set.seed(1)
inTrainingSet <- createDataPartition(dfstrat$FailPass, p = .75, list = FALSE)
trainSet <- dfstrat[inTrainingSet, ]
testSet <- dfstrat[-inTrainingSet, ]
# Set predictors and labels.
# There are 48 columns and it's the last one that I want to train on.
# Take all the predictors (1-47).
# x, y is the training data.
x <- trainSet[, -48]
y <- as.factor(trainSet$FailPass)
# i, j is the test data.
i <- testSet[, -48]
j <- as.factor(testSet$FailPass)
# Set training control parameters: bootstrap resampling, 25 repetitions.
bootControl <- trainControl(number = 25)
# The grid is for the decision tree.
gbmGrid <- expand.grid(.interaction.depth = (1:5) * 2, .n.trees = (1:10) * 25, .shrinkage = .1)
nbGrid <- expand.grid(.fL = 0, .usekernel = FALSE)
svmGrid <- expand.grid(.sigma = , .c = )
set.seed(2)
# Train the models.
# Naive Bayes
nbFit <- train(x, y, method = 'nb', tuneGrid = "nbGrid")
# SVM
svmFit <- train(x, y, method = "svmRadial", tuneLength = 10, trControl = bootControl, scaled = FALSE)
# GBM
gbmFit <- train(x, y, method = "gbm", trControl = bootControl, verbose = FALSE, bag.fraction = 0.5, tuneGrid = gbmGrid)
# Predict the models on the training data.
models <- list(svm = svmFit, nb = nbFit, gbm = gbmFit)
predict(models)
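One detail that stands out (an observation, not a confirmed fix from the thread): train()'s tuneGrid argument expects the grid data frame itself, so passing the name in quotes, as in tuneGrid = "nbGrid", makes caret fail before tuning starts. A sketch of the naive Bayes call with the grid object passed directly:
# Pass the grid data frame, not its name as a string
nbFit <- train(x, y, method = "nb", tuneGrid = nbGrid, trControl = bootControl)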
