So this question is about GANs.
I am trying to do a trivial example as a proof of concept for myself; namely, generate images of handwritten digits (MNIST). While most would approach this via deep convolutional GANs (DCGANs), I am just trying to achieve it with 1D arrays (i.e. instead of 28x28 grayscale pixel values, a flat 1D array of 28*28 values).
This git repo features a "vanilla" GAN which treats each MNIST image as a 1D array of 784 values. Its outputs look pretty acceptable, so I wanted to do something similar.
Import statements
from __future__ import print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
import mxnet as mx
from mxnet import nd, gluon, autograd
from mxnet.gluon import nn, utils
import numpy as np
import os
from math import floor
from random import random
import time
from datetime import datetime
import logging
ctx = mx.gpu()
np.random.seed(3)
Hyper parameters
batch_size = 100
epochs = 100
generator_learning_rate = 0.001
discriminator_learning_rate = 0.001
beta1 = 0.5
latent_z_size = 100
Load data
mnist = mx.test_utils.get_mnist()
# convert imgs to arrays
flattened_training_data = mnist["test_data"].reshape(10000, 28*28)
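The training loop further down iterates over data_iter, which is not defined anywhere in this snippet; a minimal definition using mx.io.NDArrayIter (an assumption about what was used, not part of the original code) would be:
# hypothetical iterator over the flattened images, reshuffled each epoch;
# if the generator keeps its tanh output, the pixel values should arguably be
# rescaled from [0, 1] to [-1, 1] first, e.g. flattened_training_data * 2 - 1
data_iter = mx.io.NDArrayIter(data=flattened_training_data,
                              batch_size=batch_size,
                              shuffle=True)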
define models
G = nn.Sequential()
with G.name_scope():
G.add(nn.Dense(300, activation="relu"))
G.add(nn.Dense(28 * 28, activation="tanh"))
D = nn.Sequential()
with D.name_scope():
D.add(nn.Dense(128, activation="relu"))
D.add(nn.Dense(64, activation="relu"))
D.add(nn.Dense(32, activation="relu"))
D.add(nn.Dense(2, activation="tanh"))
loss = gluon.loss.SoftmaxCrossEntropyLoss()
init stuff
G.initialize(mx.init.Normal(0.02), ctx=ctx)
D.initialize(mx.init.Normal(0.02), ctx=ctx)
trainer_G = gluon.Trainer(G.collect_params(), 'adam', {"learning_rate": generator_learning_rate, "beta1": beta1})
trainer_D = gluon.Trainer(D.collect_params(), 'adam', {"learning_rate": discriminator_learning_rate, "beta1": beta1})
metric = mx.metric.Accuracy()
dynamic plot (for jupyter notebook)
import matplotlib.pyplot as plt
import time
def dynamic_line_plt(ax, y_data, colors=['r', 'b', 'g'], labels=['Line1', 'Line2', 'Line3']):
x_data = []
y_max = 0
y_min = 0
x_min = 0
x_max = 0
for y in y_data:
x_data.append(list(range(len(y))))
if max(y) > y_max:
y_max = max(y)
if min(y) < y_min:
y_min = min(y)
if len(y) > x_max:
x_max = len(y)
ax.set_ylim(y_min, y_max)
ax.set_xlim(x_min, x_max)
if ax.lines:
for i, line in enumerate(ax.lines):
line.set_xdata(x_data[i])
line.set_ydata(y_data[i])
else:
for i in range(len(y_data)):
l = ax.plot(x_data[i], y_data[i], colors[i], label=labels[i])
ax.legend()
fig.canvas.draw()
train
stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
logging.basicConfig(level=logging.DEBUG)
# arrays to store data for plotting
loss_D = nd.array([0], ctx=ctx)
loss_G = nd.array([0], ctx=ctx)
acc_d = nd.array([0], ctx=ctx)
labels = ['Discriminator Loss', 'Generator Loss', 'Discriminator Acc.']
%matplotlib notebook
fig, ax = plt.subplots(1, 1)
ax.set_xlabel('Time')
ax.set_ylabel('Loss')
dynamic_line_plt(ax, [loss_D.asnumpy(), loss_G.asnumpy(), acc_d.asnumpy()], labels=labels)
for epoch in range(epochs):
tic = time.time()
data_iter.reset()
for i, batch in enumerate(data_iter):
####################################
# Update Discriminator: maximize log(D(x)) + log(1-D(G(z)))
####################################
# extract batch of real data
data = batch.data[0].as_in_context(ctx)
# add noise
# Produce our noisy input to the generator
latent_z = mx.nd.random_normal(0,1,shape=(batch_size, latent_z_size), ctx=ctx)
# soft and noisy labels
# real_label = mx.nd.ones((batch_size, ), ctx=ctx) * nd.random_uniform(.7, 1.2, shape=(1)).asscalar()
# fake_label = mx.nd.ones((batch_size, ), ctx=ctx) * nd.random_uniform(0, .3, shape=(1)).asscalar()
# real_label = nd.random_uniform(.7, 1.2, shape=(batch_size), ctx=ctx)
# fake_label = nd.random_uniform(0, .3, shape=(batch_size), ctx=ctx)
real_label = mx.nd.ones((batch_size, ), ctx=ctx)
fake_label = mx.nd.zeros((batch_size, ), ctx=ctx)
with autograd.record():
# train with real data
real_output = D(data)
errD_real = loss(real_output, real_label)
# train with fake data
fake = G(latent_z)
fake_output = D(fake.detach())
errD_fake = loss(fake_output, fake_label)
errD = errD_real + errD_fake
errD.backward()
trainer_D.step(batch_size)
metric.update([real_label, ], [real_output,])
metric.update([fake_label, ], [fake_output,])
####################################
# Update Generator: maximize log(D(G(z)))
####################################
with autograd.record():
output = D(fake)
errG = loss(output, real_label)
errG.backward()
trainer_G.step(batch_size)
####
# Plot Loss
####
# append new data to arrays
loss_D = nd.concat(loss_D, nd.mean(errD), dim=0)
loss_G = nd.concat(loss_G, nd.mean(errG), dim=0)
name, acc = metric.get()
acc_d = nd.concat(acc_d, nd.array([acc], ctx=ctx), dim=0)
# plot array
dynamic_line_plt(ax, [loss_D.asnumpy(), loss_G.asnumpy(), acc_d.asnumpy()], labels=labels)
name, acc = metric.get()
metric.reset()
logging.info('Binary training acc at epoch %d: %s=%f' % (epoch, name, acc))
logging.info('time: %f' % (time.time() - tic))
output
img = G(mx.nd.random_normal(0,1,shape=(100, latent_z_size), ctx=ctx))[0].reshape((28, 28))
plt.imshow(img.asnumpy(),cmap='gray')
plt.show()
Now this doesn't get nearly as good as the repo's example above, although it is fairly similar.
So I was wondering if you could take a look and figure out why:
the colors are inverted
the results are subpar
I have been fiddling around with this, trying lots of different things to improve the results (I will list these in a second), but for the MNIST dataset this really shouldn't be needed.
Things I have tried (and I have also tried a host of combinations):
increasing the generator network
increasing the discriminator network
using soft labeling
using noisy labeling
batch norm after every layer in the generator
batch norm of the data
normalizing all values between -1 and 1
leaky relus in the generator
drop out layers in the generator
increased learning rate of discriminator compared to generator
decreased learning rate of the discriminator compared to the generator
Please let me know if you have any ideas.
1) If you look at the original dataset:
training_set = mnist["train_data"].reshape(60000, 28, 28)
plt.imshow(training_set[10,:,:], cmap='gray')
you will notice that the digits are white on a black background. So, technically speaking, your results are not inverted - they match the pattern of the original images you used as real data.
If you want to invert the colors for visualization purposes, you can easily do that by switching to the reversed palette by appending '_r' to its name (this works for all color palettes):
plt.imshow(img.asnumpy(), cmap='gray_r')
You can also play with the range of colors by changing the vmin and vmax parameters. They control which data values map to the two ends of the colormap; by default they are calculated automatically from the provided array.
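For example (a quick sketch, not from the original answer, assuming pixel values in [0, 1]):
# pin the displayed range explicitly; values outside [vmin, vmax] are clipped to the ends of the colormap
plt.imshow(img.asnumpy(), cmap='gray_r', vmin=0.0, vmax=1.0)
plt.show()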
2) "Why the results are sub par" - I think this is exactly the reason why the community started to use dcGANs. To me the results in the git repo you provided are quite noisy. Surely, they are different from what you receive, and you can achieve the same quality just by changing your activation functions from tanh to sigmoid as in the example on github:
G = nn.Sequential()
with G.name_scope():
G.add(nn.Dense(300, activation="relu"))
G.add(nn.Dense(28 * 28, activation="sigmoid"))
D = nn.Sequential()
with D.name_scope():
D.add(nn.Dense(128, activation="relu"))
D.add(nn.Dense(64, activation="relu"))
D.add(nn.Dense(32, activation="relu"))
D.add(nn.Dense(2, activation="sigmoid"))
Sigmoid never goes below zero, so the generator's output matches the [0, 1] range of the MNIST pixel values, and it works better in this scenario. Here is a sample picture I get if I train the updated model for 30 epochs (the rest of the hyperparameters are the same).
If you decide to explore DCGANs to get even better results, take a look here - https://mxnet.incubator.apache.org/tutorials/unsupervised_learning/gan.html It is a well-explained tutorial on how to build a DCGAN with MXNet and Gluon. You will get much better results with a DCGAN than with this.
This question might have been asked before, but I got confused.
I am trying to apply one of the RNN types, e.g. an LSTM, to time-series forecasting. I have inputs and y (stock returns). For each timestamp, I'd like to get a prediction. Q1 - Am I correct in choosing a seq2seq approach?
I also want to use the prediction from the previous timestamp (initializing the initial values with some constant) as an additional input (alongside my existing inputs), in the form of a squared residual, i.e. using
eps_{t-1} = (y_{t-1} - ŷ_{t-1})^2 as an additional input at time t (as well as the previous inputs).
So, how can I do this in TensorFlow or in PyTorch?
I tried to depict what I want in the attached graph. The graph
P.S. Sorry if the question is poorly formulated
Let's say your input is of dimension (32, 10, 1), with a batch_size of 32, time steps of length 10, and a feature dimension of 1. Same for your target (stock returns). This code makes use of the tf.scan function, which is useful when implementing custom recurrent networks (it iterates over the timesteps). It remains for you to use the residual of t-1 at time t wherever you would like.
P.S.: it is a very basic implementation of an LSTM from scratch, without any biases or output activation.
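Since tf.scan may be unfamiliar, here is a tiny standalone illustration (an aside, not part of the model below): it threads an accumulator over the leading axis of its inputs, which is what the step function below exploits to carry the LSTM state and the previous residual across timesteps.
# minimal tf.scan example: a running sum over a 1-D tensor
import tensorflow as tf
x = tf.constant([1., 2., 3., 4.])
running_sum = tf.scan(lambda acc, el: acc + el, x)
with tf.Session() as s:
    print(s.run(running_sum))  # [ 1.  3.  6. 10.]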
import tensorflow as tf
import numpy as np
tf.reset_default_graph()
BS = 32
TS = 10
inputs_dim = 1
target_dim = 1
inputs = tf.placeholder(shape=[BS, TS, inputs_dim], dtype=tf.float32)
stock_returns = tf.placeholder(shape=[BS, TS, target_dim], dtype=tf.float32)
state_size = 16
# initial hidden state
init_state = tf.placeholder(shape=[2, BS, state_size],
dtype=tf.float32, name='initial_state')
# initializer
xav_init = tf.contrib.layers.xavier_initializer
# params
W = tf.get_variable('W', shape=[4, state_size, state_size],
initializer=xav_init())
U = tf.get_variable('U', shape=[4, inputs_dim, state_size],
initializer=xav_init())
W_out = tf.get_variable('W_out', shape=[state_size, target_dim],
initializer=xav_init())
#the function to feed tf.scan with
def step(prev, inputs_):
#unpack all inputs and previous outputs
st_1, ct_1 = prev[0][0], prev[0][1]
x = inputs_[0]
target = inputs_[1]
#get previous squared residual
eps = prev[1]
"""
here do whatever you want with eps_t-1
like x += eps if x is of the same dimension
or include it somewhere in your graph
"""
# lstm gates (add bias if needed)
#
# input gate
i = tf.sigmoid(tf.matmul(x,U[0]) + tf.matmul(st_1,W[0]))
# forget gate
f = tf.sigmoid(tf.matmul(x,U[1]) + tf.matmul(st_1,W[1]))
# output gate
o = tf.sigmoid(tf.matmul(x,U[2]) + tf.matmul(st_1,W[2]))
# gate weights
g = tf.tanh(tf.matmul(x,U[3]) + tf.matmul(st_1,W[3]))
ct = ct_1*f + g*i
st = tf.tanh(ct)*o
"""
make prediction, compute residual in t
and pass it to t+1
Normally, we would compute the prediction outside the scan function,
but as we need it here, we can just keep it and return it back
as an output of the scan function
"""
prediction_t = tf.matmul(st, W_out) # + bias
eps = (target - prediction_t)**2
return [tf.stack((st, ct), axis=0), eps, prediction_t]
states, eps, preds = tf.scan(step, [tf.transpose(inputs, [1,0,2]),
tf.transpose(stock_returns, [1,0,2])], initializer=[init_state,
tf.zeros((32,1), dtype=tf.float32),
tf.zeros((32,1),dtype=tf.float32)])
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
out = sess.run(preds, feed_dict=
{inputs:np.random.rand(BS,TS,inputs_dim),
stock_returns:np.random.rand(BS,TS,target_dim),
init_state:np.zeros((2,BS,state_size))})
out = tf.transpose(out,[1,0,2])
print(out)
And the output:
Tensor("transpose_2:0", shape=(32, 10, 1), dtype=float32)
Base code from here
I'm implementing a simple linear regression with scikit-learn and TensorFlow.
My scikit-learn solution seems fine, but with TensorFlow my evaluation output is showing some crazy numbers.
The problem is basically to try to predict a salary based on years of experience.
I'm not sure what I'm doing wrong in the TensorFlow code.
Thanks!
ScikitLearn solution
import pandas as pd
data = pd.read_csv('Salary_Data.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, 1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
X_single_data = [[4.6]]
y_single_pred = regressor.predict(X_single_data)
print(f'Train score: {regressor.score(X_train, y_train)}')
print(f'Test score: {regressor.score(X_test, y_test)}')
Train score: 0.960775692121653
Test score: 0.9248580247217076
Tensorflow solution
import tensorflow as tf
f_cols = [tf.feature_column.numeric_column(key='X', shape=[1])]
estimator = tf.estimator.LinearRegressor(feature_columns=f_cols)
train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'X': X_train}, y=y_train,shuffle=False)
test_input_fn = tf.estimator.inputs.numpy_input_fn(x={'X': X_test}, y=y_test,shuffle=False)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
({'average_loss': 7675087400.0,
'label/mean': 84588.11,
'loss': 69075790000.0,
'prediction/mean': 5.0796494,
'global_step': 6},
[])
Data
YearsExperience,Salary
1.1,39343.00
1.3,46205.00
1.5,37731.00
2.0,43525.00
2.2,39891.00
2.9,56642.00
3.0,60150.00
3.2,54445.00
3.2,64445.00
3.7,57189.00
3.9,63218.00
4.0,55794.00
4.0,56957.00
4.1,57081.00
4.5,61111.00
4.9,67938.00
5.1,66029.00
5.3,83088.00
5.9,81363.00
6.0,93940.00
6.8,91738.00
7.1,98273.00
7.9,101302.00
8.2,113812.00
8.7,109431.00
9.0,105582.00
9.5,116969.00
9.6,112635.00
10.3,122391.00
10.5,121872.00
Per your code request in the comments: though I used my online curve and surface fitting web site zunzun.com, specifically http://zunzun.com/Equation/2/Sigmoidal/Sigmoid%20B/, for the modeling work on this equation, here is a graphing source code example that uses the scipy differential_evolution genetic algorithm module to estimate initial parameter values. The scipy implementation of Differential Evolution uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, which requires bounds within which to search. In this example those bounds are taken from the data maximum and minimum values, and the fit statistics and parameter values are almost identical to those from the web site.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
xData = numpy.array([ 1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7, 3.9, 4.0, 4.0, 4.1, 4.5, 4.9, 5.1, 5.3, 5.9, 6.0, 6.8, 7.1, 7.9, 8.2, 8.7, 9.0, 9.5, 9.6, 10.3, 10.5])
yData = numpy.array([ 39.343, 46.205, 37.731, 43.525, 39.891, 56.642, 60.15, 54.445, 64.445, 57.189, 63.218, 55.794, 56.957, 57.081, 61.111, 67.938, 66.029, 83.088, 81.363, 93.94, 91.738, 98.273, 101.302, 113.812, 109.431, 105.582, 116.969, 112.635, 122.391, 121.872])
def func(x, a, b, c):
return a / (1.0 + numpy.exp(-(x-b)/c))
# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
val = func(xData, *parameterTuple)
return numpy.sum((yData - val) ** 2.0)
def generate_Initial_Parameters():
# min and max used for bounds
maxX = max(xData)
minX = min(xData)
maxY = max(yData)
minY = min(yData)
parameterBounds = []
parameterBounds.append([minY, maxY]) # search bounds for a
parameterBounds.append([minX, maxX]) # search bounds for b
parameterBounds.append([minX, maxX]) # search bounds for c
# "seed" the numpy random number generator for repeatable results
result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
return result.x
# note: by default, differential_evolution polishes its result with a local minimizer that respects the parameter bounds
geneticParameters = generate_Initial_Parameters()
# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('Years of experience') # X axis data label
axes.set_ylabel('Salary in thousands') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I cannot place an image in a comment, so I place it here. I suspected the relationship might be sigmoidal rather than linear, and found the following sigmoidal equation and fit statistics, using units of thousands for salary: "y = a / (1.0 + exp(-(x-b)/c))" with fitted parameters a = 1.5535069418318591E+02, b = 5.4580059234664899E+00, and c = 3.7724942500630938E+00, giving an R-squared = 0.96 and RMSE = 5.30 (thousand).
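As a quick sanity check (a sketch using the parameters reported above, not part of the original answer), the fitted sigmoid can be evaluated at the 4.6 years of experience used for the single prediction in the scikit-learn snippet:
import numpy as np
# fitted parameters reported above; salary is in thousands
a, b, c = 1.5535069418318591E+02, 5.4580059234664899E+00, 3.7724942500630938E+00
def fitted_salary(years):
    return a / (1.0 + np.exp(-(years - b) / c))
print(fitted_salary(4.6))  # roughly 69 (thousand)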
I just took an ML course and am trying to get better at TensorFlow. To that end, I purchased the book by Nishant Shukla (ML with TensorFlow) and am trying to run the 2-feature example with a different data set.
With the fake dataset in the book, my code runs fine. However, with the data I used in the ML course, the code refuses to converge. With a really small learning rate it does converge, but the learned weights are wrong.
I'm also attaching the plot of the feature data. It should not be a feature scaling issue, as the values of both features vary between 30 and 100 units.
I am really struggling with how opaque TensorFlow is - any help would be appreciated:
""" Solution for simple logistic regression model
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt
# Define paramaters for the model
learning_rate = 0.0001
training_epochs = 300
data = np.loadtxt('ex2data1.txt', delimiter=',')
x1s = np.array(data[:,0]).astype(np.float32)
x2s = np.array(data[:,1]).astype(np.float32)
ys = np.array(data[:,2]).astype(np.float32)
print('Plotting data with + indicating (y = 1) examples and o \n indicating (y = 0) examples.\n')
color = ['red' if l == 0 else 'blue' for l in ys]
myplot = plt.scatter(x1s, x2s, color = color)
# Put some labels
plt.xlabel("Exam 1 score")
plt.ylabel("Exam 2 score")
# Specified in plot order
plt.show()
# Step 2: Create datasets
X1 = tf.placeholder(tf.float32, shape=(None,), name="x1")
X2 = tf.placeholder(tf.float32, shape=(None,), name="x2")
Y = tf.placeholder(tf.float32, shape=(None,), name="y")
w = tf.Variable(np.random.rand(3,1), name='w', dtype='float32',trainable=True)
y_model = tf.sigmoid(w[2]*X2 + w[1]*X1 + w[0])
cost = tf.reduce_mean(-tf.log(y_model*Y + (1-y_model)*(1-Y)))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
writer = tf.summary.FileWriter('./graphs/logreg', tf.get_default_graph())
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
prev_error = 0.0;
for epoch in range(training_epochs):
error, loss = sess.run([cost, train_op], feed_dict={X1:x1s, X2:x2s, Y:ys})
print("epoch = ", epoch, "loss = ", loss)
if abs(prev_error - error) < 0.0001:
break
prev_error = error
w_val = sess.run(w, {X1:x1s, X2:x2s, Y:ys})
print("w learned = ", w_val)
writer.close()
sess.close()
Both X1 and X2 range from roughly 20 to 100. However, once I scaled them, the solution converged just fine.
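For reference, a minimal sketch of that scaling step using the variable names from the question's code (standardization is one option; min-max scaling would also work):
# standardize each feature to zero mean and unit variance before feeding it
x1s_scaled = (x1s - x1s.mean()) / x1s.std()
x2s_scaled = (x2s - x2s.mean()) / x2s.std()
# ...then train with feed_dict={X1: x1s_scaled, X2: x2s_scaled, Y: ys}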
To study deep learning, RNNs, LSTMs and so on, I tried to make an RNN fit an integration function. I fed a random signal between 0 and 1 into the RNN as input, integrated the input biased by -0.5, clamped the integral to the range 0:1, and used that as the RNN target to learn. Blue - random input, orange - integrated input
So I have a time series with only one input (random) and one output (the clamped integral of the input), and I want the RNN to predict the output from the input.
I used PyTorch and tried a vanilla RNN, a GRU cell, different sizes of hidden layers, stacking several RNNs, putting densely connected layers on the RNN output, and different depths of backpropagation through time (rolling gradients back from 2 to 50 steps). And I can't get a good result at all! It works somehow, but I can't find a way to fit the integral function precisely. Here is the best of my results:
green - RNN output. The green line (model output) does not fit the orange line in many cases - that is the problem.
Here is my source code in Jupyter.
My questions: is it possible for an RNN to learn a saturated integral function? Where is my problem? What else can I try to achieve good quality? Ideally I want the RNN output to equal the desired output (the integral function) across the whole time series.
PS:
My code in raw format:
import numpy as np
from scipy.stats import truncnorm
import random
import math
import copy
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.cm as cm
def generate_data(num_of_data):
input_data=[]
output_data=[]
current_input_value=0
current_output_value=0
for i in range(num_of_data):
if (random.random()<0.1):
current_input_value=random.random()
# current_output_value=0
current_input_value=current_input_value+(random.random()-0.5)*0
current_output_value=current_output_value+0.0*(current_input_value-current_output_value)+(current_input_value-0.5)*0.1
if (current_output_value<0):
current_output_value=0
if (current_output_value>1):
current_output_value=1
input_data.append(current_input_value)
output_data.append(current_output_value)
return input_data,output_data
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)  # 'matplotlib' itself is never imported above, so use plt.rcParams
input_data,output_data=generate_data(500)
plt.plot(input_data)
plt.plot(output_data)
plt.show()
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.number_of_layers=1
self.hidden_size = hidden_size
self.gru = nn.GRU(input_size, hidden_size,self.number_of_layers)
self.Dense1 = nn.Linear(hidden_size, hidden_size)
self.Dense1A = nn.ReLU()
self.Dense2 = nn.Linear(hidden_size, output_size)
def forward(self, input, hidden):
gru_output, hidden = self.gru(input, hidden)
Dense1Out=self.Dense1(gru_output)
Dense1OutAct=self.Dense1A(Dense1Out)
output=self.Dense2(Dense1OutAct)
return output, hidden
def initHidden(self):
return Variable(torch.zeros(self.number_of_layers,1,self.hidden_size))
import time
import math
import operator
def timeSince(since):
now = time.time()
s = now - since
m = math.floor(s / 60)
s -= m * 60
return '%dm %ds' % (m, s)
rnn = RNN(1, 50, 1)
n_iters = 250000
print_every = 2000
plot_every = 2000
all_losses = []
total_loss_print = 0
total_loss_plot = 0
criterion=nn.L1Loss()
print("training...\n")
start = time.time()
optimizer = optim.Adam(rnn.parameters(), lr=0.0002)
rnn_hidden = rnn.initHidden()
rnn.zero_grad()
loss = 0
#for gata_q in range(int(n_iters/500)):
# rnn_hidden = rnn.initHidden()
input_data,output_data=generate_data(n_iters)
for data_index in range(len(input_data)):
input_tensor=torch.zeros(1, 1, 1)
input_tensor[0][0][0]=input_data[data_index]
output_tensor=torch.zeros(1, 1, 1)
output_tensor[0][0][0]=output_data[data_index]
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
loss += criterion(rnn_output, Variable(output_tensor))
if data_index%2==0:
loss.backward()
total_loss_print += loss.data[0]
total_loss_plot += loss.data[0]
optimizer.step()
rnn_hidden=Variable(rnn_hidden.data)
rnn.zero_grad()
loss = 0
if data_index % print_every == 0:
print('%s (%d %d%%) tl=%.4f' % (timeSince(start), data_index, data_index / n_iters * 100,total_loss_print/print_every))
total_loss_print = 0
if data_index % plot_every == 0:
all_losses.append(total_loss_plot / plot_every)
total_loss_plot = 0
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.figure()
plt.plot(all_losses)
plt.show()
rnn_hidden = rnn.initHidden()
rnn.zero_grad()
loss = 0
rnn_output_data=[]
input_data,output_data=generate_data(1500)
for data_index in range(len(input_data)):
input_tensor=torch.zeros(1, 1, 1)
input_tensor[0][0][0]=input_data[data_index]
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
rnn_output_data.append(rnn_output.data.numpy()[0][0][0])
plt.plot(input_data)#blue
plt.plot(output_data)#ogange
plt.plot(rnn_output_data)#green
plt.show()
I have found the problem myself. It was a kind of overfitting on the latest data, similar to how, in reinforcement learning, overfitting can occur when exploiting only the latest strategy. Because I was not using mini-batches and applied the optimizer directly after each new data point, and because the data points are similar across runs of 20-50 samples, the optimizer simply fitted the network to the latest points, forgetting how to fit the previous ones. I solved it by accumulating gradients over 50 points in time and only then applying one optimizer step. The network can now learn much better, but it is still not perfect.
Here is the modification of the code that makes it work:
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
loss += criterion(rnn_output, Variable(output_tensor))
if data_index % 2==0:
loss.backward()
total_loss_print += loss.data[0]
rnn_hidden=Variable(rnn_hidden.data)
loss = 0
# torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.01)
if data_index % 50==0:
optimizer.step()
rnn.zero_grad()
And the new result of learning the integral:
pic.
What is an example of how to use a TensorFlow TFRecord with a Keras Model and tf.session.run() while keeping the dataset in tensors w/ queue runners?
Below is a snippet that works but it needs the following improvements:
Use the Model API
specify an Input()
Load a dataset from a TFRecord
Run through a dataset in parallel (such as with a queuerunner)
Here is the snippet, there are several TODO lines indicating what is needed:
from keras.models import Model
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Input
from keras.objectives import categorical_crossentropy
from tensorflow.examples.tutorials.mnist import input_data
sess = tf.Session()
K.set_session(sess)
# Can this be done more efficiently than placeholders w/ TFRecords?
img = tf.placeholder(tf.float32, shape=(None, 784))
labels = tf.placeholder(tf.float32, shape=(None, 10))
# TODO: Use Input()
x = Dense(128, activation='relu')(img)
x = Dense(128, activation='relu')(x)
preds = Dense(10, activation='softmax')(x)
# TODO: Construct model = Model(input=inputs, output=preds)
loss = tf.reduce_mean(categorical_crossentropy(labels, preds))
# TODO: handle TFRecord data, is it the same?
mnist_data = input_data.read_data_sets('MNIST_data', one_hot=True)
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
sess.run(tf.global_variables_initializer())
# TODO remove default, add queuerunner
with sess.as_default():
for i in range(1000):
batch = mnist_data.train.next_batch(50)
train_step.run(feed_dict={img: batch[0],
labels: batch[1]})
print(loss.eval(feed_dict={img: mnist_data.test.images,
labels: mnist_data.test.labels}))
Why is this question relevant?
For high performance training without going back to python
no TFRecord to numpy to tensor conversions
Keras will soon be part of tensorflow
Demonstrate how Keras Model() classes can accept tensors for input data correctly.
Here is some starter information for a semantic segmentation problem example:
an example unet Keras model, unet.py, which happens to be for semantic segmentation
Keras + Tensorflow Blog Post
An attempt at running the unet model in a tf session with TFRecords and a Keras model (not working)
Code to create the TFRecords: tf_records.py
An attempt at running the unet model in a tf session with TFRecords and a Keras model is in densenet_fcn.py (not working)
I don't use the tfrecord dataset format, so I won't argue about its pros and cons, but I got interested in extending Keras to support it.
github.com/indraforyou/keras_tfrecord is the repository. I will briefly explain the main changes.
Dataset creation and loading
data_to_tfrecord and read_and_decode here take care of creating the tfrecord dataset and loading it. Special care must be taken when implementing read_and_decode, otherwise you will face cryptic errors during training.
Initialization and Keras model
Now both tf.train.shuffle_batch and the Keras Input layer return tensors. But the one returned by tf.train.shuffle_batch doesn't have the metadata Keras needs internally. As it turns out, any tensor can easily be turned into a tensor with Keras metadata by calling the Input layer with the tensor parameter.
So this takes care of initialization:
x_train_, y_train_ = ktfr.read_and_decode('train.mnist.tfrecord', one_hot=True, n_class=nb_classes, is_train=True)
x_train_batch, y_train_batch = K.tf.train.shuffle_batch([x_train_, y_train_],
batch_size=batch_size,
capacity=2000,
min_after_dequeue=1000,
num_threads=32) # set the number of threads here
x_train_inp = Input(tensor=x_train_batch)
Now, with x_train_inp, any Keras model can be developed.
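For example, a minimal model on top of it might look like this (an illustrative sketch, not code from the repository; nb_classes is the class count used above):
from keras.layers import Dense
x = Dense(128, activation='relu')(x_train_inp)
train_out = Dense(nb_classes, activation='softmax')(x)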
Training (simple)
Let's say train_out is the output tensor of your Keras model. You can easily write a custom training loop along the lines of:
loss = tf.reduce_mean(categorical_crossentropy(y_train_batch, train_out))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
# sess.run(tf.global_variables_initializer())
sess.run(tf.initialize_all_variables())
with sess.as_default():
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
start_time = time.time()
_, loss_value = sess.run([train_op, loss], feed_dict={K.learning_phase(): 0})
duration = time.time() - start_time
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
coord.request_stop()
coord.join(threads)
sess.close()
Training (keras style)
One of the features of Keras that makes it so attractive is its generalized training mechanism with callback functions.
But to support tfrecord-style training there are several changes needed in the fit function:
running the queue threads
no feeding in batch data through feed_dict
supporting validation becomes tricky, as the validation data will also be coming in through another tensor; a different model needs to be created internally, with shared upper layers and the validation tensor fed in by another tfrecord reader.
But all this can easily be supported by another flag parameter. What makes things messy are the Keras features sample_weight and class_weight, which are used to weight each sample and each class. For these, compile() creates placeholders (here), and placeholders are also implicitly created for the targets (here), which is not needed in our case since the labels are already fed in by the tfrecord readers. These placeholders need to be fed in during the session run, which is unnecessary in our case.
Taking these changes into account, compile_tfrecord (here) and fit_tfrecord (here) are extensions of compile and fit and share, say, 95% of the code.
They can be used in the following way:
import keras_tfrecord as ktfr
train_model = Model(input=x_train_inp, output=train_out)
ktfr.compile_tfrecord(train_model, optimizer='rmsprop', loss='categorical_crossentropy', out_tensor_lst=[y_train_batch], metrics=['accuracy'])
train_model.summary()
ktfr.fit_tfrecord(train_model, X_train.shape[0], batch_size, nb_epoch=3)
train_model.save_weights('saved_wt.h5')
You are welcome to improve on the code and to send pull requests.
Update 2018-08-29: this is now directly supported in Keras; see the following example:
https://github.com/keras-team/keras/blob/master/examples/mnist_tfrecord.py
Original Answer:
TFRecords are supported by using an external loss. Here are the key lines constructing an external loss:
# tf yield ops that supply dataset images and labels
x_train_batch, y_train_batch = read_and_decode_recordinput(...)
# create a basic cnn
x_train_input = Input(tensor=x_train_batch)
x_train_out = cnn_layers(x_train_input)
model = Model(inputs=x_train_input, outputs=x_train_out)
loss = keras.losses.categorical_crossentropy(y_train_batch, x_train_out)
model.add_loss(loss)
model.compile(optimizer='rmsprop', loss=None)
Here is an example for Keras 2. It works after applying the small patch #7060:
'''MNIST dataset with TensorFlow TFRecords.
Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
'''
import os
import copy
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
from keras import backend as K
from keras.models import Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.objectives import categorical_crossentropy
from keras.utils import np_utils
from keras.utils.generic_utils import Progbar
from keras import callbacks as cbks
from keras import optimizers, objectives
from keras import metrics as metrics_module
from keras.datasets import mnist
if K.backend() != 'tensorflow':
raise RuntimeError('This example can only run with the '
'TensorFlow backend for the time being, '
'because it requires TFRecords, which '
'are not supported on other platforms.')
def images_to_tfrecord(images, labels, filename):
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
""" Save data into TFRecord """
if not os.path.isfile(filename):
num_examples = images.shape[0]
rows = images.shape[1]
cols = images.shape[2]
depth = images.shape[3]
print('Writing', filename)
writer = tf.python_io.TFRecordWriter(filename)
for index in range(num_examples):
image_raw = images[index].tostring()
example = tf.train.Example(features=tf.train.Features(feature={
'height': _int64_feature(rows),
'width': _int64_feature(cols),
'depth': _int64_feature(depth),
'label': _int64_feature(int(labels[index])),
'image_raw': _bytes_feature(image_raw)}))
writer.write(example.SerializeToString())
writer.close()
else:
print('tfrecord %s already exists' % filename)
def read_and_decode_recordinput(tf_glob, one_hot=True, classes=None, is_train=None,
batch_shape=[1000, 28, 28, 1], parallelism=1):
""" Return tensor to read from TFRecord """
print('Creating graph for loading %s TFRecords...' % tf_glob)
with tf.variable_scope("TFRecords"):
record_input = data_flow_ops.RecordInput(
tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
records_op = record_input.get_yield_op()
records_op = tf.split(records_op, batch_shape[0], 0)
records_op = [tf.reshape(record, []) for record in records_op]
progbar = Progbar(len(records_op))
images = []
labels = []
for i, serialized_example in enumerate(records_op):
progbar.update(i)
with tf.variable_scope("parse_images", reuse=True):
features = tf.parse_single_example(
serialized_example,
features={
'label': tf.FixedLenFeature([], tf.int64),
'image_raw': tf.FixedLenFeature([], tf.string),
})
img = tf.decode_raw(features['image_raw'], tf.uint8)
img.set_shape(batch_shape[1] * batch_shape[2])
img = tf.reshape(img, [1] + batch_shape[1:])
img = tf.cast(img, tf.float32) * (1. / 255) - 0.5
label = tf.cast(features['label'], tf.int32)
if one_hot and classes:
label = tf.one_hot(label, classes)
images.append(img)
labels.append(label)
images = tf.parallel_stack(images, 0)
labels = tf.parallel_stack(labels, 0)
images = tf.cast(images, tf.float32)
images = tf.reshape(images, shape=batch_shape)
# StagingArea will store tensors
# across multiple steps to
# speed up execution
images_shape = images.get_shape()
labels_shape = labels.get_shape()
copy_stage = data_flow_ops.StagingArea(
[tf.float32, tf.float32],
shapes=[images_shape, labels_shape])
copy_stage_op = copy_stage.put(
[images, labels])
staged_images, staged_labels = copy_stage.get()
return images, labels
def save_mnist_as_tfrecord():
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
images_to_tfrecord(images=X_train, labels=y_train, filename='train.mnist.tfrecord')
images_to_tfrecord(images=X_test, labels=y_test, filename='test.mnist.tfrecord')
def cnn_layers(x_train_input):
x = Conv2D(32, (3, 3), activation='relu', padding='valid')(x_train_input)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x_train_out = Dense(classes,
activation='softmax',
name='x_train_out')(x)
return x_train_out
sess = tf.Session()
K.set_session(sess)
save_mnist_as_tfrecord()
batch_size = 100
batch_shape = [batch_size, 28, 28, 1]
epochs = 3000
classes = 10
parallelism = 10
x_train_batch, y_train_batch = read_and_decode_recordinput(
'train.mnist.tfrecord',
one_hot=True,
classes=classes,
is_train=True,
batch_shape=batch_shape,
parallelism=parallelism)
x_test_batch, y_test_batch = read_and_decode_recordinput(
'test.mnist.tfrecord',
one_hot=True,
classes=classes,
is_train=True,
batch_shape=batch_shape,
parallelism=parallelism)
x_batch_shape = x_train_batch.get_shape().as_list()
y_batch_shape = y_train_batch.get_shape().as_list()
x_train_input = Input(tensor=x_train_batch, batch_shape=x_batch_shape)
x_train_out = cnn_layers(x_train_input)
y_train_in_out = Input(tensor=y_train_batch, batch_shape=y_batch_shape, name='y_labels')
cce = categorical_crossentropy(y_train_batch, x_train_out)
train_model = Model(inputs=[x_train_input], outputs=[x_train_out])
train_model.add_loss(cce)
train_model.compile(optimizer='rmsprop',
loss=None,
metrics=['accuracy'])
train_model.summary()
tensorboard = TensorBoard()
# tensorboard disabled due to Keras bug
train_model.fit(batch_size=batch_size,
epochs=epochs) # callbacks=[tensorboard])
train_model.save_weights('saved_wt.h5')
K.clear_session()
# Second Session, pure Keras
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
x_test_inp = Input(batch_shape=(None,) + (X_test.shape[1:]))
test_out = cnn_layers(x_test_inp)
test_model = Model(inputs=x_test_inp, outputs=test_out)
test_model.load_weights('saved_wt.h5')
test_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
test_model.summary()
loss, acc = test_model.evaluate(X_test, np_utils.to_categorical(y_test), classes)
print('\nTest accuracy: {0}'.format(acc))
I've also been working to improve the support for TFRecords in the following issue and pull request:
#6928 Yield Op support: High Performance Large Datasets via TFRecords, and RecordInput
#7102 Keras Input Tensor API Design Proposal
Finally, it is possible to use tf.contrib.learn.Experiment to train Keras models in TensorFlow.
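A rough sketch of that route (an assumption about how the pieces fit together, not code from this answer; the input functions are hypothetical placeholders, and the contrib API has since been deprecated):
# convert the compiled Keras model into an Estimator, then wrap it in an Experiment
est = tf.keras.estimator.model_to_estimator(keras_model=train_model)
experiment = tf.contrib.learn.Experiment(estimator=est,
                                         train_input_fn=my_train_input_fn,  # hypothetical
                                         eval_input_fn=my_eval_input_fn)    # hypothetical
experiment.train_and_evaluate()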