How to change format of dataset using tensorflow? - machine-learning

Is there an efficient way to deal with datasets using the tensorflow library? I would like to manipulate them, e.g. delete columns/rows, edit values, etc.

This sample code should help you get started with importing a CSV, doing some basic manipulation, and running neural-network training in tensorflow. You can get the data here.
import numpy as np
import pandas as pd
import tensorflow as tf
# Read and manipulate data from CSV
df = pd.read_csv('df.csv')
df = df.dropna(how='any')
df = df.drop('DayOfWeek', axis=1)
df.Customers = df.Customers / 1000.0
df.CompetitionDistance = df.CompetitionDistance / 1000.0
df.Sales = df.Sales / 1000.0
# Parameters
features = 2
hidden = 3
learning_rate = 0.2
# Prepare input and output arrays
train_x = np.array(df[['CompetitionDistance', 'Customers']])
train_y = np.array(df[['Sales']]).reshape([-1])
# Build a simple TF graph
x = tf.placeholder(tf.float32, shape=[None, features], name='x')
y = tf.placeholder(tf.float32, shape=[None], name='y')
W = tf.get_variable(name='W', shape=[features, hidden])
b = tf.get_variable(name='b', shape=[hidden], initializer=tf.zeros_initializer)
z = tf.matmul(x, W) + b
predict = tf.reduce_sum(z, axis=1)
loss = tf.reduce_mean(tf.square(y - predict))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# Run the training
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in xrange(101):
        _, loss_value = session.run([optimizer, loss],
                                    feed_dict={x: train_x, y: train_y})
        if i % 10 == 0:
            print "epoch=%03i, loss=%.5f" % (i, loss_value)

Related

How to forecast actual future values using XGBoost?

I have a solar irradiation dataset with around 61,000+ rows and 2 columns. I have built a model using XGBoost to predict future values.
I have split the data into two parts, train and test, and trained the model accordingly. Furthermore, I have made predictions on the test data set. Everything is going fine. But now I want to predict the actual forecast. How can I do that?
import os
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBRegressor, plot_importance
import matplotlib.pyplot as plt
from sklearn import metrics
# Dataset
df=pd.read_csv('Readings_last_7yr.csv')
df.index = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
## Copied the dataset
df2 = df.copy()
del df2['Date']
## Test Train Split
from pandas import read_csv
from matplotlib import pyplot
# series = read_csv('sunspots.csv', header=0, index_col=0)
X = df2
train_size = int(len(X) * 0.75)
train, test = X[0:train_size], X[train_size:len(X)]
print('Observations: %d' % (len(X)))
print('Training Observations: %d' % (len(train)))
print('Testing Observations: %d' % (len(test)))
pyplot.plot(train)
pyplot.plot(test)
pyplot.show()
## Creating Features
def create_features(df, target_variable):
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    X = df[['hour','dayofweek','quarter','month','year',
            'dayofyear','dayofmonth','weekofyear']]
    if target_variable:
        y = df[target_variable]
        return X, y
    return X
## METRICS
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def timeseries_evaluation_metrics_func(y_true, y_pred):
    print(f'MSE is : {metrics.mean_squared_error(y_true, y_pred)}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}')
    print(f'MAPE is : {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}', end='\n\n')
train_copy = train.copy()
test_copy = test.copy()
trainX, trainY = create_features(train_copy, target_variable='Irr')
testX, testY = create_features(test_copy, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
# Predictions
predicted_results = xgb.predict(testX)
# Metrics
timeseries_evaluation_metrics_func(testY, predicted_results)
# Plotting graph for test and Predicted
plt.figure(figsize=(13,8))
plt.plot(list(testY))
plt.plot(list(predicted_results))
plt.title("Actual vs Predicted")
plt.ylabel("Irr")
plt.legend(('Actual','predicted'))
plt.show()
# Making graph of predicted on the whole dataframe
test['Prediction'] = predicted_results
Irr_all = pd.concat([test, train], sort=False)
Irr_all = Irr_all.rename(columns={'Irradiation':'Original_Value'})
Overview_Complete_Data_And_Prediction = Irr_all[['Irr','Prediction']].plot(figsize=(12, 5))
I am getting the results as:
So now I want to predict future values from the year 2021 till 2023.
FUTURE PREDICTION
dti = pd.date_range("2021-01-01 00:30:00", periods=20000, freq="H")
df_future_dates = pd.DataFrame(dti, columns = ['Date'])
df_future_dates['Irr'] = np.nan
df_future_dates.index = pd.to_datetime(df_future_dates['Date'], format='%Y-%m-%d %H:%M:%S')
df_future_dates_copy = df_future_dates.copy()
testX_future, testY_future = create_features(df_future_dates, target_variable='Irr')
xgb = XGBRegressor(objective= 'reg:linear', n_estimators=1000)
xgb
## Now here I have used train and test from above
xgb.fit(trainX, trainY,
        eval_set=[(trainX, trainY), (testX, testY)],
        early_stopping_rounds=50,
        verbose=False)
predicted_results_future = xgb.predict(testX_future)
# Graph
plt.figure(figsize=(13,8))
plt.plot(list(predicted_results_future))
plt.title("Predicted")
plt.ylabel("Irr")
plt.legend(['predicted'])
plt.show()
df_future_dates_copy['Prediction'] = predicted_results_future
Irr_all_future = pd.concat([df2, df_future_dates_copy], sort=False)
# Future Graph
Overview_Complete_Data_And_Prediction_future = Irr_all_future[['Irr','Prediction']].plot(figsize=(15, 5))

Multiple Linear Regression Machine Learning in Python --ValueError: shapes (8,15) and (390,) not aligned

I am trying to evaluate output based on certain input using multiple linear regression. I have trained the model and am getting the correct, expected values when running the code below:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
#dataset = pd.read_csv('50_Startups.csv')
dataset = pd.read_excel('MAHI2.xlsx')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 5].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
labelencoder1 = LabelEncoder()
X[:, 1] = labelencoder.fit_transform(X[:, 1])
labelencoder2 = LabelEncoder()
X[:, 2] = labelencoder.fit_transform(X[:, 2])
labelencoder3 = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = "all")
#X = onehotencoder.fit_transform(X).toarray()
X = onehotencoder.fit_transform(X).toarray()
# Avoiding the Dummy Variable Trap
X = X[:, 1:]
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X, y)
y_pred = regressor.predict(X)
df = pd.DataFrame({'Actual': y.flatten(), 'Predicted': y_pred.flatten()})
df
Now I am trying to use the same model to evaluate another set of input data, as below:
dataset1 = pd.read_excel('MAHI3.xlsx')
#dataset2 = pd.get_dummies(dataset1)
X1 = dataset1.iloc[:, :-1].values
y2 = dataset1.iloc[:, 5].values
# Encoding categorical data
#labelencoder3 = LabelEncoder()
X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
#labelencoder4 = LabelEncoder()
X1[:, 1] = labelencoder.fit_transform(X1[:, 1])
#labelencoder5 = LabelEncoder()
X1[:, 2] = labelencoder.fit_transform(X1[:, 2])
#labelencoder6 = LabelEncoder()
X1[:, 3] = labelencoder.fit_transform(X1[:, 3])
#onehotencoder2 = OneHotEncoder(categorical_features = "all")
X1 = onehotencoder.fit_transform(X1).toarray()
output = regressor.predict(X1)
df1 = pd.DataFrame({'Actual1': y2.flatten(), 'Predicted1': output.flatten()})
df1
But when I run this code I get the error below:
ValueError: shapes (6,13) and (390,) not aligned: 13 (dim 1) != 390 (dim 0)
It would be great if anyone could help me resolve this issue.
I don't have access to your dataset, but it seems that your problem is a dimensionality mismatch. The thing that changes the dimensions is the OneHotEncoder. Try to use the same fitted one-hot encoder for both datasets:
ohe = onehotencoder.fit(X)
X = ohe.transform(X).toarray()
X1 = ohe.transform(X1).toarray()
You should make sure that the number of features the regressor receives at prediction time is the same as when it was trained.
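For newer scikit-learn versions, where the categorical_features argument no longer exists, the same fit-once / transform-twice idea could look like the sketch below; the toy arrays and the assumption that the first four columns are categorical are for illustration only and are not taken from MAHI2.xlsx / MAHI3.xlsx:
# Hedged sketch of the fit-once / transform-twice pattern with a current
# scikit-learn OneHotEncoder; the toy data and column split are assumptions.
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

X_train = np.array([['a', 'x', 'p', 'q', 1.0],
                    ['b', 'y', 'p', 'r', 2.0]], dtype=object)
y_train = np.array([10.0, 20.0])
X_new = np.array([['a', 'y', 'p', 'q', 3.0]], dtype=object)

# Fit the encoder ONCE on the training categories ...
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[:, :4])
# ... then reuse the fitted encoder for both sets, so the column counts match.
X_train_enc = np.hstack([ohe.transform(X_train[:, :4]).toarray(),
                         X_train[:, 4:].astype(float)])
X_new_enc = np.hstack([ohe.transform(X_new[:, :4]).toarray(),
                       X_new[:, 4:].astype(float)])

reg = LinearRegression().fit(X_train_enc, y_train)
print(reg.predict(X_new_enc))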

Binary Classification using logistic regression with Tensorflow

I just took an ML course and am trying to get better at tensorflow. To that end, I purchased the book by Nishant Shukla (Machine Learning with TensorFlow) and am trying to run the two-feature example with a different data set.
With the fake dataset in the book, my code runs fine. However, with the data I used in the ML course, the code refuses to converge. With a really small learning rate it does converge, but the learned weights are wrong.
I am also attaching the plot of the feature data. It should not be a feature-scaling issue, as the values of both features vary between 30 and 100 units.
I am really struggling with how opaque tensorflow is; any help would be appreciated:
""" Solution for simple logistic regression model
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt
# Define paramaters for the model
learning_rate = 0.0001
training_epochs = 300
data = np.loadtxt('ex2data1.txt', delimiter=',')
x1s = np.array(data[:,0]).astype(np.float32)
x2s = np.array(data[:,1]).astype(np.float32)
ys = np.array(data[:,2]).astype(np.float32)
print('Plotting data with + indicating (y = 1) examples and o \n indicating (y = 0) examples.\n')
color = ['red' if l == 0 else 'blue' for l in ys]
myplot = plt.scatter(x1s, x2s, color = color)
# Put some labels
plt.xlabel("Exam 1 score")
plt.ylabel("Exam 2 score")
# Specified in plot order
plt.show()
# Step 2: Create datasets
X1 = tf.placeholder(tf.float32, shape=(None,), name="x1")
X2 = tf.placeholder(tf.float32, shape=(None,), name="x2")
Y = tf.placeholder(tf.float32, shape=(None,), name="y")
w = tf.Variable(np.random.rand(3,1), name='w', dtype='float32',trainable=True)
y_model = tf.sigmoid(w[2]*X2 + w[1]*X1 + w[0])
cost = tf.reduce_mean(-tf.log(y_model*Y + (1-y_model)*(1-Y)))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
writer = tf.summary.FileWriter('./graphs/logreg', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    prev_error = 0.0
    for epoch in range(training_epochs):
        error, loss = sess.run([cost, train_op], feed_dict={X1: x1s, X2: x2s, Y: ys})
        print("epoch = ", epoch, "loss = ", loss)
        if abs(prev_error - error) < 0.0001:
            break
        prev_error = error
    w_val = sess.run(w, {X1: x1s, X2: x2s, Y: ys})
    print("w learned = ", w_val)
writer.close()
sess.close()
Both X1 and X2 range from roughly 20 to 100. However, once I scaled them, the solution converged just fine; a sketch of the scaling is below.
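The exact scaling used is not shown in the answer, so the following is only an assumed sketch of standardizing both features before feeding them into the graph above:
# Hedged sketch (assumed, not the answer's exact code): standardize the
# feature arrays loaded from ex2data1.txt before training.
x1s = (x1s - x1s.mean()) / x1s.std()
x2s = (x2s - x2s.mean()) / x2s.std()
# Apply the same means and standard deviations to any new exam-score pairs
# before scoring them with the learned weights.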

value prediction with tensorflow and python

I have a data set which contains a list of stock prices. I need to use tensorflow and python to predict the close price.
Q1: I have the following code, which takes the first 2000 records as training and records 2001 to 20000 as test, but I don't know how to change the code to predict the close price of today and one day later. Please advise!
#!/usr/bin/env python2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
def feature_scaling(input_pd, scaling_meathod):
    if scaling_meathod == 'z-score':
        scaled_pd = (input_pd - input_pd.mean()) / input_pd.std()
    elif scaling_meathod == 'min-max':
        scaled_pd = (input_pd - input_pd.min()) / (input_pd.max() - input_pd.min())
    return scaled_pd
def input_reshape(input_pd, start, end, batch_size, batch_shift, n_features):
    temp_pd = input_pd[start-1: end+batch_size-1]
    output_pd = map(lambda y: temp_pd[y:y+batch_size], xrange(0, end-start+1, batch_shift))
    output_temp = map(lambda x: np.array(output_pd[x]).reshape([-1]), xrange(len(output_pd)))
    output = np.reshape(output_temp, [-1, batch_size, n_features])
    return output
def target_reshape(input_pd, start, end, batch_size, batch_shift, n_step_ahead, m_steps_pred):
    temp_pd = input_pd[start+batch_size+n_step_ahead-2: end+batch_size+n_step_ahead+m_steps_pred-2]
    print temp_pd
    output_pd = map(lambda y: temp_pd[y:y+m_steps_pred], xrange(0, end-start+1, batch_shift))
    output_temp = map(lambda x: np.array(output_pd[x]).reshape([-1]), xrange(len(output_pd)))
    output = np.reshape(output_temp, [-1, 1])
    return output
def lstm(input, n_inputs, n_steps, n_of_layers, scope_name):
    num_layers = n_of_layers
    input = tf.transpose(input, [1, 0, 2])
    input = tf.reshape(input, [-1, n_inputs])
    input = tf.split(0, n_steps, input)
    with tf.variable_scope(scope_name):
        cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_inputs)
        cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)
        output, state = tf.nn.rnn(cell, input, dtype=tf.float32)
    output = output[-1]
    return output
feature_to_input = ['open price', 'highest price', 'lowest price', 'close price','turnover', 'volume','mean price']
feature_to_predict = ['close price']
feature_to_scale = ['volume']
sacling_meathod = 'min-max'
train_start = 1
train_end = 1000
test_start = 1001
test_end = 20000
batch_size = 100
batch_shift = 1
n_step_ahead = 1
m_steps_pred = 1
n_features = len(feature_to_input)
lstm_scope_name = 'lstm_prediction'
n_lstm_layers = 1
n_pred_class = 1
learning_rate = 0.1
EPOCHS = 1000
PRINT_STEP = 100
input_pd = pd.read_csv('./stock_price.csv')
temp_pd = feature_scaling(input_pd[feature_to_scale], sacling_meathod)
input_pd[feature_to_scale] = temp_pd
train_input_temp_pd = input_pd[feature_to_input]
train_input_nparr = input_reshape(train_input_temp_pd,
train_start, train_end, batch_size, batch_shift, n_features)
train_target_temp_pd = input_pd[feature_to_predict]
train_target_nparr = target_reshape(train_target_temp_pd, train_start, train_end, batch_size, batch_shift, n_step_ahead, m_steps_pred)
test_input_temp_pd = input_pd[feature_to_input]
test_input_nparr = input_reshape(test_input_temp_pd, test_start, test_end, batch_size, batch_shift, n_features)
test_target_temp_pd = input_pd[feature_to_predict]
test_target_nparr = target_reshape(test_target_temp_pd, test_start, test_end, batch_size, batch_shift, n_step_ahead, m_steps_pred)
tf.reset_default_graph()
x_ = tf.placeholder(tf.float32, [None, batch_size, n_features])
y_ = tf.placeholder(tf.float32, [None, 1])
lstm_output = lstm(x_, n_features, batch_size, n_lstm_layers, lstm_scope_name)
W = tf.Variable(tf.random_normal([n_features, n_pred_class]))
b = tf.Variable(tf.random_normal([n_pred_class]))
y = tf.matmul(lstm_output, W) + b
cost_func = tf.reduce_mean(tf.square(y - y_))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)
init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    for ii in range(EPOCHS):
        sess.run(train_op, feed_dict={x_: train_input_nparr, y_: train_target_nparr})
        if ii % PRINT_STEP == 0:
            cost = sess.run(cost_func, feed_dict={x_: train_input_nparr, y_: train_target_nparr})
            print 'iteration =', ii, 'training cost:', cost
Very simply, prediction (a.k.a. scoring or inference) comes from running the input through only the forward pass, and collecting the score for each input vector. It's the same process flow as testing. The difference is the four stages of model use:
Train: learn from the training data set; adjust weights as needed.
Test: evaluate the model's performance; if accuracy has converged, stop training.
Validate: evaluate the accuracy of the trained model. If it doesn't meet acceptance criteria, change something and start over with the training.
Predict: you've passed validation -- release the model for use by the intended application.
All four steps follow the same forward logic flow; training includes back-propagation, the others do not. Simply follow the forward-only process and you'll get the results you need (see the sketch after this answer).
I worry about your data partition: only 10% for training, 90% for testing, and none for validation. A more typical split is 50-30-20, or something in that general area.
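As a concrete illustration of "prediction is just the forward pass", reusing the tensors already defined in the question's graph (this is a sketch, not code from the answer):
# Hedged sketch: scoring reuses the forward pass without train_op.
# x_, y and test_input_nparr are the names defined in the question's code;
# run this inside the same tf.Session() block, after training has finished.
test_predictions = sess.run(y, feed_dict={x_: test_input_nparr})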
Q-1: You should change your LSTM parameters to return a sequence of size two, which will be the prediction for that day and the day after.
Q-2: It is clear that your model is underfitting the data, which is unsurprising with your 10% train / 90% test split! You should use a more balanced ratio, as suggested in the previous answer; a minimal sketch of such a split follows.
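A minimal sketch of a more balanced, chronological 50-30-20 split (the proportions come from the first answer, input_pd is the DataFrame from the question's code, and the exact boundaries are an assumption):
# Hedged sketch of a 50/30/20 chronological split.
n = len(input_pd)
train_end_idx = int(n * 0.5)
val_end_idx = int(n * 0.8)
train_pd = input_pd.iloc[:train_end_idx]            # 50% for training
val_pd = input_pd.iloc[train_end_idx:val_end_idx]   # 30% for validation
test_pd = input_pd.iloc[val_end_idx:]               # 20% for final testing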

Tensorflow Grid3LSTMCell visualization

I'm having a difficult time visualizing what this Tensorflow class creates. I want to implement an LSTM RNN that handles 3D data.
class Grid3LSTMCell(GridRNNCell):
  """3D BasicLSTM cell
  This creates a 2D cell which receives input and gives output in the first dimension.
  The first dimension can optionally be non-recurrent if `non_recurrent_fn` is specified.
  The second and third dimensions are LSTM.
  """
  def __init__(self, num_units, tied=False, non_recurrent_fn=None,
               use_peepholes=False, forget_bias=1.0):
    super(Grid3LSTMCell, self).__init__(
        num_units=num_units, num_dims=3,
        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
        non_recurrent_dims=None if non_recurrent_fn is None else 0,
        cell_fn=lambda n, i: rnn_cell.LSTMCell(
            num_units=n, input_size=i, forget_bias=forget_bias,
            use_peepholes=use_peepholes),
        non_recurrent_fn=non_recurrent_fn)
The class is found in `from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell`.
This is difficult to explain, so I've provided a drawing. Here is what I want it to do...
However, the comment sounds like it isn't doing this. The comment makes it sound like the RNN is still a flat RNN, where the first dimension outputs to what is commonly called the outputs variable (see below), the second dimension outputs to the next step in the RNN, and the third dimension outputs to the next hidden layer.
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
If this is the case, what is the point of having the first and second dimensions? Aren't they essentially the same thing? The BasicLSTMCell sends the output for the next step into outputs; in other words, they are one and the same.
Can anyone clarify?
For reference, here is my example code...
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
import numpy as np
#define parameters
learning_rate = 0.01
batch_size = 2
n_input_x = 10
n_input_y = 10
n_input_z = 10
n_hidden = 128
n_classes = 2
n_output = n_input_x * n_classes
x = tf.placeholder("float", [n_input_x, n_input_y, n_input_z])
y = tf.placeholder("float", [n_input_x, n_input_y, n_input_z, n_classes])
weights = {}
biases = {}
for i in xrange(n_input_y * n_input_z):
    weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
    biases[i] = tf.Variable(tf.random_normal([n_output]))
#generate random data
input_data = np.random.rand(n_input_x, n_input_y, n_input_z)
ground_truth = np.random.rand(n_input_x, n_input_y, n_input_z, n_classes)
#build GridLSTM
def GridLSTM_network(x):
    x = tf.reshape(x, [-1, n_input_x])
    x = tf.split(0, n_input_y * n_input_z, x)
    lstm_cell = grid_rnn_cell.Grid3LSTMCell(n_hidden)
    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
    output = []
    for i in xrange(n_input_y * n_input_z):
        output.append(tf.matmul(outputs[i], weights[i]) + biases[i])
    return output
#initialize network, cost, optimizer and all variables
pred = GridLSTM_network(x)
# import pdb
# pdb.set_trace()
pred = tf.pack(pred)
pred = tf.transpose(pred,[1,0,2])
pred= tf.reshape(pred, [-1, n_input_x, n_input_y, n_input_z, n_classes])
temp_pred = tf.reshape(pred, [-1,n_classes])
temp_y = tf.reshape(y,[-1, n_classes])
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 0
    while 1:
        print step
        step = step + 1
        # pdb.set_trace
        sess.run(optimizer, feed_dict={x: input_data, y: ground_truth})
