Forecasting using mutiple seasonal STL and arima - time-series

I am attempting to forecast half hourly electricity data. The method I am using is to decompose the electricity consumption data using 'mstl' from the 'Forecast' package by Rob Hyndman and then forecast the seasonally adjusted data using ARIMA.
df <- IntervalData %>% select(CONSUMPTION_MW)
length_test_set = 17520
h = 17520
# create msts object with daily, weekly and monthly seasonality
data_msts <- msts(df, seasonal.periods=c(48,48*7,365/12*48))
train_msts = msts(df[1:(nrow(df)-length_test_set),],seasonal.periods=c(48,48*7,365/12*48))
test_msts = msts(df[((nrow(df)-length_test_set)+1):(nrow(df)),],seasonal.periods=c(48,48*7,365/12*48))
fit_mstl = mstl(train_msts, iterate = 4, s.window = 19, robust = TRUE)
fcast_arima=forecast(fit_mstl,method='arima',h=h)
How do I specify the order of my ARIMA model eg. ARIMA(2,1,6)?

You will need to write your own forecast function like this (using fake data so it can be reproduced).
library(forecast)
df <- data.frame(y=rnorm(50000))
length_test_set <- 17520
h <- 17520
# create msts object with daily, weekly and monthly seasonality
data_msts <- msts(df, seasonal.periods = c(48, 48*7, 365/12*48))
train_msts <- msts(df[1:(nrow(df) - length_test_set), ], seasonal.periods = c(48, 48 * 7, 365 / 12 * 48))
test_msts <- msts(df[((nrow(df) - length_test_set) + 1):(nrow(df)), ], seasonal.periods = c(48, 48 * 7, 365 / 12 * 48))
fit_mstl <- mstl(train_msts, iterate = 4, s.window = 19, robust = TRUE)
# Function to fit specific ARIMA model and return forecasts
arima_forecast <- function(x, h, level, order, ...) {
fit <- Arima(x, order=order, seasonal = c(0,0,0), ...)
return(forecast(fit, h = h, level = level))
}
# Example using an ARIMA(3,0,0) model
fcast_arima <- forecast(fit_mstl, forecastfunction=arima_forecast, h = h, order=c(3,0,0))
Created on 2020-07-25 by the reprex package (v0.3.0)

Related

Clustered resampling for inner layer of Caret recursive feature elimination

I have data where IDs are contained within clusters.
I would like to perform recursive feature elimination using Caret's rfe function which performs the following procedure:
Clustered resampling for the outer layer (line 2.1) is straightforward, using the index parameter.
However, within each outer resample, I would like to tune tuning parameters using cluster-based cross-validation (inner resampling) (line 2.9). Model tuning in the inner layer is possible by specifying a tuneGrid in rfe and having an appropriate trControl. It is this trControl that I would like to change to allow clustered resampling.
The outer resampling is specified in the rfeControl parameter of rfe.
The inner resampling is specified by trControl of rfe which is passed to train.
The trouble I am having is that I can't seem to specify any inner indices, because after the outer resampling, those indices are no longer valid or no longer present in the outer-resampled data.
I am looking for a way to tell train to take an outer resample (which will be missing a cluster against which to validate), and to tune the model using inner resampling by based on folds of the remaining clusters.
The MWE is as minimal as possible:
library(caret)
library(tidyverse)
library(parallel)
library(doParallel)
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
### Create some random data, 10 features, with some influence over a binomial outcome
set.seed(42)
id <- 1:1000
cluster <- rep(1:10, each = 100)
dat <- data.frame(id, cluster, replicate(10,rnorm(n = 1000, mean = runif(1, 0,100)+cluster, sd = runif(1, 0,20))))
dat <- dat %>% mutate(temp = rowSums(across(X1:X10)), prob = range01(temp), outcome = rbinom(n = nrow(dat), size = 1, prob = prob))
dat$outcome <- as.factor(dat$outcome)
levels(dat$outcome) <- c("control", "case")
dat$outcome <- factor(dat$outcome, levels=rev(levels(dat$outcome)))
### Manual outer folds-based cluster ###
for(i in 1:10) {
assign(paste0("index", i), which(dat$cluster!=i))
}
unit_indices <- list(index1, index2, index3, index4, index5, index6, index7, index8, index9, index10)
### Inner resampling method (THIS IS WHAT I'D LIKE TO CHANGE) ###
cv5 <- trainControl(classProbs = TRUE, method = "cv", number = 5, allowParallel = F) ## Is there a way to have inner cluster-based resampling WITHIN the outer cluster-based resampling?
caret_rfe_functions <- list(summary = twoClassSummary,
fit = function (x, y, first, last, ...) {
train(x, y, ...)
},
pred = caretFuncs$pred,
rank = function(object, x, y) {
vimp <- varImp(object)$importance
vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
vimp$var <- rownames(vimp)
vimp
},
selectSize = function (x, metric = "ROC", tol = 1, maximize = TRUE)
{
if (!maximize) {
best <- min(x[, metric])
perf <- (x[, metric] - best)/best * 100
flag <- perf <= tol
}
else {
best <- max(x[, metric])
perf <- (best - x[, metric])/best * 100
flag <- perf <= tol
}
min(x[flag, "Variables"])
},
selectVar = caretFuncs$selectVar)
caret_rfe_ctrl <- rfeControl(
functions = caret_rfe_functions,
saveDetails = TRUE,
index = unit_indices,
indexOut = NULL,
returnResamp = "all",
allowParallel = T, ### change this if you don't want to / can't go parallel
verbose = TRUE
)
#### Feature selection ####
set.seed(42)
cl <- makePSOCKcluster(10) ### for parallel processing if available
registerDoParallel(cl)
rfe_profile_nnet <- rfe(
form = outcome ~
X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = dat,
sizes = seq(2,10,1),
rfeControl = caret_rfe_ctrl,
## pass options to train()
method = "nnet",
preProc = c("center", "scale"),
metric = "ROC",
tuneGrid = expand.grid(size = c(1:5), decay = 5),
trControl = cv5) ### I would like to change this to allow inner cluster-based resampling
stopCluster(cl)
rfe_profile_nnet
plot(rfe_profile_nnet)
Presumably the inner cluster-based resampling would be achieved by specifying a new trainControl containing some dynamic inner index based on the outer resample that is selected at the time:
inner_cluster_tune <- trainControl(classProbs = TRUE,
index = {insert magic here}, ### This is the important bit
returnResamp = "all",
summaryFunction = twoClassSummary,
allowParallel = F) ### especially if the outer resample is parallelised
If you try with the original cluster indices e.g.
inner_cluster_tune <- trainControl(classProbs = TRUE,
index = unit_indices,
returnResamp = "all",
summaryFunction = twoClassSummary,
allowParallel = F)
There are various warnings about missing data in the resamples, and things like 24: In [<-.data.frame(*tmp*, , object$method$center, value = structure(list( ... : provided 81 variables to replace 9 variables.
All help greatly appreciated.
As a postscript question , you can see which parameters were used within your rfe like so:
> rfe_profile_nnet$fit
Neural Network
1000 samples
8 predictor
2 classes: 'case', 'control'
Pre-processing: centered (8), scaled (8)
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 800, 800, 800, 800, 800
Resampling results across tuning parameters:
size Accuracy Kappa
1 0.616 0.1605071
2 0.616 0.1686937
3 0.620 0.1820503
4 0.618 0.1788491
5 0.618 0.1788063
Tuning parameter 'decay' was held constant at a value of 5
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were size = 3 and decay = 5.
But does anyone know if this refers to one, or all of the outer resamples? Presumably the same tuning parameters won't necessarily be chosen across all outer resamples

Need help getting method = cforest to work within train() from caret using leave one out cross-validation

examples_dataset.csv
I have tried looking up so many ways to fix this issue, but no solution so far. I am trying to train conditional inference forests with caret, using the leave one out cross-validation method. I have about 20 (larger) datasets to run this method on, hence the functions to automate some.
A lot of what I have found suggests that my QuantBins are not factors, but I have checked after running prep_df() on the df and those are indeed factors. I get an error when running the conditional inference forests (cif_model()), but not with random forests (rf_model()). The output from trying to make that model is "Something is wrong; all the Accuracy metric values are missing" (pictured below).
Any help and guidance is appreciated!
## Example code
## GOAL: create train() code from caret that uses conditional inference forests to assess variable importance with categorical dependent variable using leave one out cross validation
rm(list=ls())
setwd()
ex.all <- read.csv("examples_dataset.csv", header = TRUE)
loo_ctrl <- trainControl(method = "LOOCV")
#This function works!
rf_model <- function(file.name) {
model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p
+ P_Cou, data = file.name, method = "rf", trControl = loo_ctrl, tuneLength = 10, control =
rpart.control(minbucket = 10), ntree = 50)
return(model)
}
#This does not.
cif_model <- function(file.name) {
model <- train(QuantBins ~ F_Cou + B_Cou + Height + GBH + N_b + N_f + L_u + D_w + N_p
+ P_Cou, data = file.name, method = "cforest", trControl = loo_ctrl, tuneLength = 10, control
= ctree_control(minbucket = 10), ntree = 50)
return(model)
}
##### functions used #####
prep_df <- function(file.name) {
file.name$BINARY <- ifelse(file.name$TOTAL >= 1, "yes", "no")
file.name$BINARY <- as.factor(file.name$BINARY)
file.name$L_u <- as.factor(file.name$L_u)
file.name$TOTAL <- as.numeric(file.name$TOTAL)
## Quantile distribution of breaks in Total Fruit
numbers_of_bins = 5 #this will return four groups
file.name <- file.name %>% mutate(QuantBins = cut(TOTAL, breaks = unique(quantile(TOTAL,
probs=seq.int(0,1, by=1/numbers_of_bins))), include.lowest=TRUE))
print(length(levels(file.name$QuantBins)))
temp <- levels(file.name$QuantBins)
file.name$QuantBins <- as.character(file.name$QuantBins)
for(i in 1:length(file.name$QuantBins)) {
temp1 <- strsplit(file.name$QuantBins[i], ",")
temp2 <- strsplit(temp1[[1]][1], "\\(")
temp3 <- strsplit(temp1[[1]][[2]], "\\]")
file.name$QuantBins[i] <- paste("Fruit", temp2[[1]][2], "to", temp3[[1]][1])
}
file.name$QuantBins <- as.factor(file.name$QuantBins)
file.name$QuantBins <- droplevels(file.name$QuantBins)
print(length(levels(file.name$QuantBins)))
return(file.name)
}
##### running trees #####
ex.all <- prep_df(ex.all)
ex.rf <- rf_model(ex.all)
print(ex.rf)
ex.rf
ex.rf$finalModel$importance
ex.cf <- cif_model(ex.all)
print(ex.cf)
ex.cf
ex.cf$finalModel$importance
Error using cif_model(ex.all) showing "Something is wrong; all the Accuracy metric values are missing"

CFA in data with 3 levels - estimating factor scores at level 2?

I am working on a dataset with 3 levels:
Teacher
School
Country
Using survey responses from the teachers, the aim is to use Confirmatory Factor Analysis (CFA) with the ultimate goal of having the factor scores at the school level.
A further objective is to test for measurement invariance across countries.
I want to use the lavaan package in R, because it is able to deal with the complex survey design of my data trough the lavaan.survey-extension (sampling design, weights etc.)
I have done some preliminary analysis, where i use country-ID as the group argument in the cfa-function. This gives me the possibility to perform the measurement invariance analysis across countries. The issue is, that my factor scores are given at individual teacher level, and i am interested in the school-level.
Any ideas about how to get these factor scores at the school level?
Here are some examples of the functions i use. I do not think that i data sample is needed, but i will create some if it is requested.
library(lavaan)
library(SEMtools)
#define model
reduced_mod <-'
leadership_sup =~ TC3G22D + TC3G22E + TC3G22K
continous_develop_collab =~ TT3G32A + TT3G32B + TT3G32C + TT3G32D '
#Fit model with different restraints:
fit_no_restraint <- cfa(model = reduced_mod, data = cfa_data, group="countryID")
fit_metric <- cfa(model = reduced_mod, data = cfa_data, group="countryID", group.equal = c("loadings"))
fit_scalar <- cfa(model = reduced_mod, data = cfa_data, group="countryID", group.equal = c("loadings", "intercepts"))
#Compare fit statistics
compareFit(scalar = fit_scalar , metric = fit_metric , config = fit_no_restraint)
It seems that you want multilevel measurement invariance. You should use the measEq.syntax() from the semTools package:
## ---------------------
## Multilevel Invariance
## ---------------------
## To test invariance across levels in a MLSEM, specify syntax as though
## you are fitting to 2 groups instead of 2 levels.
mlsem <- ' f1 =~ y1 + y2 + y3
f2 =~ y4 + y5 + y6 '
## metric invariance
syntax.metric <- measEq.syntax(configural.model = mlsem, meanstructure = TRUE,
ID.fac = "std.lv", sample.nobs = c(1, 1),
group = "cluster", group.equal = "loadings")
## by definition, Level-1 means must be zero, so fix them
syntax.metric <- update(syntax.metric,
change.syntax = paste0("y", 1:6, " ~ c(0, NA)*1"))
## save as a character string
mod.metric <- as.character(syntax.metric, groups.as.blocks = TRUE)
## convert from multigroup to multilevel
mod.metric <- gsub(pattern = "group:", replacement = "level:",
x = mod.metric, fixed = TRUE)
## fit model to data
fit.metric <- lavaan(mod.metric, data = Demo.twolevel, cluster = "cluster")
summary(fit.metric)
Source

Tuning parameters in caret error despite assigning grids and as.factor

Any help appreciated. Been at this for weeks. :(
install.packages("klaR", dependencies=TRUE)
library(klaR)
install.packages("caret", dependencies=TRUE)
library(caret)
install.packages("e1071", dependencies=TRUE)
library(e1071)
install.packages("gmodels", dependencies=TRUE)
library(gmodels)
install.packages("gbm", dependencies=TRUE)
library(gbm)
install.packages("foreach", dependencies=TRUE)
library(foreach)
Load Grading Data
grading <- read.csv("~/PA_DataFinal/GradingData160315.csv")
create stratified sample # 1%
dfstrat <- stratified(grading, "FailPass", .01)
save(dfstrat, file = "c:/Users/gillisn/Documents/PA_DataFinal/RResults/GradingRResults/iteration 1/dfstrat.rda")
split data into train and test #75:25. FailPass is the responseVble
set.seed(1)
inTrainingSet <- createDataPartition(dfstrat$FailPass, p = .75, list = FALSE)
trainSet <- dfstrat[inTrainingSet,]
testSet <- dfstrat[-inTrainingSet, ]
set predictors and labels
There are 48 labels and its the last one that want to train on.
Take all the predictors 1-47
x,y is training data
x <- trainSet[,-48]
y <- as.factor(trainSet$FailPass)
i,j is test data
i <- testSet[,-48,]
j <- as.factor(testSet$FailPass)
Set Training control parameters
Bootstrapping itself around in 25 times.
bootControl <- trainControl(number = 25)
The grid is for the decision tree
gbmGrid <- expand.grid(.interaction.depth = (1:5) * 2, .n.trees = (1:10)*25, .shrinkage = .1)
nbGrid <- expand.grid(.fL=0, .usekernel=FALSE)
svmGrid >- expandGrid(.sigma=, .c=)
set.seed(2)
Train the models
naive bayes
nbFit <- train(x,y,method='nb',tuneGrid="nbGrid")
svm
svmFit <- train(x, y,method = "svmRadial", tuneLength = 10,trControl = bootControl, scaled = FALSE)
gbm
gbmFit <- train(x, y,method = "gbm", trControl = bootControl, verbose = FALSE, bag.fraction = 0.5, tuneGrid = gbmGrid)
predict the models on training data
models <- list(svm = svmFit, nb = nbFit, gbm = gbmFit)
predict(models)

combine time series plot by using R

I wanna combine three graphics on one graph. The data from inside of R which is " nottem ". Can someone help me to write code to put a seasonal mean and harmonic (cosine model) and its time series plots together by using different colors? I already wrote model code just don't know how to combine them together to compare.
Code :library(TSA)
nottem
month.=season(nottem)
model=lm(nottem~month.-1)
summary(nottem)
har.=harmonic(nottem,1)
model1=lm(nottem~har.)
summary(model1)
plot(nottem,type="l",ylab="Average monthly temperature at Nottingham castle")
points(y=nottem,x=time(nottem), pch=as.vector(season(nottem)))
Just put your time series inside a matrix:
x = cbind(serie1 = ts(cumsum(rnorm(100)), freq = 12, start = c(2013, 2)),
serie2 = ts(cumsum(rnorm(100)), freq = 12, start = c(2013, 2)))
plot(x)
Or configure the plot region:
par(mfrow = c(2, 1)) # 2 rows, 1 column
serie1 = ts(cumsum(rnorm(100)), freq = 12, start = c(2013, 2))
serie2 = ts(cumsum(rnorm(100)), freq = 12, start = c(2013, 2))
require(zoo)
plot(serie1)
lines(rollapply(serie1, width = 10, FUN = mean), col = 'red')
plot(serie2)
lines(rollapply(serie2, width = 10, FUN = mean), col = 'blue')
hope it helps.
PS.: zoo package is not needed in this example, you could use the filter function.
You can extract the seasonal mean with:
s.mean = tapply(serie, cycle(serie), mean)
# January, assuming serie is monthly data
print(s.mean[1])
This graph is pretty hard to read, because your three sets of values are so similar. Still, if you want to simply want to graph all of these on the sample plot, you can do it pretty easily by using the coefficients generated by your models.
Step 1: Plot the raw data. This comes from your original code.
plot(nottem,type="l",ylab="Average monthly temperature at Nottingham castle")
Step 2: Set up x-values for the mean and cosine plots.
x <- seq(1920, (1940 - 1/12), by=1/12)
Step 3: Plot the seasonal means by repeating the coefficients from the first model.
lines(x=x, y=rep(model$coefficients, 20), col="blue")
Step 4: Calculate the y-values for the cosine function using the coefficients from the second model, and then plot.
y <- model1$coefficients[2] * cos(2 * pi * x) + model1$coefficients[1]
lines(x=x, y=y, col="red")
ggplot variant: If you decide to switch to the popular 'ggplot2' package for your plot, you would do it like so:
x <- seq(1920, (1940 - 1/12), by=1/12)
y.seas.mean <- rep(model$coefficients, 20)
y.har.cos <- model1$coefficients[2] * cos(2 * pi * x) + model1$coefficients[1]
plot_Data <- melt(data.frame(x=x, temp=nottem, seas.mean=y.seas.mean, har.cos=y.har.cos), id="x")
ggplot(plot_Data, aes(x=x, y=value, col=variable)) + geom_line()

Resources