Updating parameters of a function with Flux.jl

I'm playing around with Flux.jl and I'm having trouble updating the parameters of a custom function.
The function is defined below as objective:
using Distributions
using Flux.Tracker: gradient, param, Params
using Flux.Optimise: Descent, ADAM, update!

D = 2
num_samples = 100

function log_density(params)
    mu, log_sigma = params
    d1 = Normal(0, 1.35)
    d2 = Normal(0, exp(log_sigma))
    d1_density = logpdf(d1, log_sigma)
    d2_density = logpdf(d2, mu)
    return d1_density + d2_density
end

function J(log_std)
    H = 0.5 * D * (1.0 + log(2 * pi)) + sum(log_std)
    return H
end

function objective(mu, log_std; D=2)
    samples = rand(Normal(), num_samples, D) .* sqrt.(log_std) .+ mu
    log_px = mapslices(log_density, samples; dims=2)
    elbo = J(log_std) + mean(log_px)
    return -elbo
end
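(For reference: J(log_std) is the entropy of a diagonal Gaussian, H(q) = (D/2)(1 + log(2*pi)) + sum_i log(sigma_i), so objective returns a Monte Carlo estimate of the negative ELBO, -(H(q) + E_q[log p(z)]).)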
And I attempt to do a single update as follows:
mu = param(reshape([-1, -1], 1, :))
sigma = param(reshape([5, 5], 1, :))
grads = gradient(() -> objective(mu, sigma), Params([mu, sigma]))
opt = Descent(0.001)
for p in (mu, sigma)
    update!(opt, p, grads[p])
end
Produces the error:
ERROR: Can't differentiate `setindex!`
Stacktrace:
[1] error(::String) at ./error.jl:33
[2] setindex!(::TrackedArray{…,Array{Float64,2}}, ::Flux.Tracker.TrackedReal{Float64}, ::CartesianIndex{2}) at /Users/vasya/.julia/packages/Flux/T3PhK/src/tracker/lib/array.jl:63
[3] macro expansion at ./broadcast.jl:838 [inlined]
[4] macro expansion at ./simdloop.jl:73 [inlined]
[5] copyto! at ./broadcast.jl:837 [inlined]
[6] copyto! at ./broadcast.jl:792 [inlined]
[7] materialize! at ./broadcast.jl:751 [inlined]
[8] update!(::Descent, ::TrackedArray{…,Array{Float64,2}}, ::TrackedArray{…,Array{Float64,2}}) at /Users/vasya/.julia/packages/Flux/T3PhK/src/optimise/optimisers.jl:22
[9] top-level scope at ./REPL[23]:2 [inlined]
[10] top-level scope at ./none:0
I have also tried replacing grads[p] with grads[p].data. This produces no error, but does not update the parameters!
Environment details:
- Julia Version 1.0.2
- Flux v0.7.0
- Distributions v0.16.4

Discussion via Slack cleared up the correct usage of the update! functions. The code below makes the module references explicit and produces updated parameters (for Flux v0.7.0):
using Distributions
using Flux

D = 2
num_samples = 100

function log_density(params)
    mu, log_sigma = params
    d1 = Normal(0, 1.35)
    d2 = Normal(0, exp(log_sigma))
    d1_density = logpdf(d1, log_sigma)
    d2_density = logpdf(d2, mu)
    return d1_density + d2_density
end

function J(log_std)
    H = 0.5 * D * (1.0 + log(2 * pi)) + sum(log_std)
    return H
end

function objective(mu, log_std; D=2)
    samples = rand(Normal(), num_samples, D) .* sqrt.(log_std) .+ mu
    log_px = mapslices(log_density, samples; dims=2)
    elbo = J(log_std) + mean(log_px)
    return -elbo
end

mu = Flux.Tracker.param(reshape([-1, -1], 1, :))
sigma = Flux.Tracker.param(reshape([5, 5], 1, :))
grads = Flux.Tracker.gradient(() -> objective(mu, sigma), Flux.Tracker.Params([mu, sigma]))
println(mu, sigma)
opt = Flux.Optimise.Descent(0.01)
for p in (mu, sigma)
    Flux.Tracker.update!(p, Flux.Optimise.update!(opt, p, Flux.data(grads[p])))
end
println(mu, sigma)
This prints:
[-1.0 -1.0] (tracked)[5.0 5.0] (tracked)
[-198.742 -459.423] (tracked)[31.0583 225.657] (tracked)

Related

Precision and recall misunderstanding

In pycocotools, in the cocoeval.py script, there is a COCOeval class, and in this class there is an accumulate function for calculating precision and recall. Does anyone know what this npig variable is? Is it negative-positive, or something else?
I ask because I saw this formula for recall: Recall = (True Positive)/(True Positive + False Negative).
Can I just use the precision and recall variables inside the dictionary self.eval to get the precision and recall of the model I'm testing, and plot a precision-recall curve?
And is the scores variable the F1 score?
I don't really understand what T, R, K, A, M are and what is happening with them.
How can I print precision and recall in the terminal?
def accumulate(self, p = None):
    '''
    Accumulate per image evaluation results and store the result in self.eval
    :param p: input params for evaluation
    :return: None
    '''
    print('Accumulating evaluation results...')
    tic = time.time()
    if not self.evalImgs:
        print('Please run evaluate() first')
    # allows input customized parameters
    if p is None:
        p = self.params
    p.catIds = p.catIds if p.useCats == 1 else [-1]
    T = len(p.iouThrs)
    R = len(p.recThrs)
    K = len(p.catIds) if p.useCats else 1
    A = len(p.areaRng)
    M = len(p.maxDets)
    precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
    recall = -np.ones((T,K,A,M))
    scores = -np.ones((T,R,K,A,M))
    # create dictionary for future indexing
    _pe = self._paramsEval
    catIds = _pe.catIds if _pe.useCats else [-1]
    setK = set(catIds)
    setA = set(map(tuple, _pe.areaRng))
    setM = set(_pe.maxDets)
    setI = set(_pe.imgIds)
    # get inds to evaluate
    k_list = [n for n, k in enumerate(p.catIds) if k in setK]
    m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
    a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
    i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
    I0 = len(_pe.imgIds)
    A0 = len(_pe.areaRng)
    # retrieve E at each category, area range, and max number of detections
    for k, k0 in enumerate(k_list):
        Nk = k0*A0*I0
        for a, a0 in enumerate(a_list):
            Na = a0*I0
            for m, maxDet in enumerate(m_list):
                E = [self.evalImgs[Nk + Na + i] for i in i_list]
                E = [e for e in E if not e is None]
                if len(E) == 0:
                    continue
                dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
                # different sorting method generates slightly different results.
                # mergesort is used to be consistent as Matlab implementation.
                inds = np.argsort(-dtScores, kind='mergesort')
                dtScoresSorted = dtScores[inds]
                dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
                dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
                gtIg = np.concatenate([e['gtIgnore'] for e in E])
                npig = np.count_nonzero(gtIg==0)
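                # npig counts the non-ignored ground truths, i.e. TP + FN,
                # so rc = tp / npig below is Recall = TP / (TP + FN)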
                if npig == 0:
                    continue
                tps = np.logical_and(dtm, np.logical_not(dtIg))
                fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
                tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
                fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
                for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                    tp = np.array(tp)
                    fp = np.array(fp)
                    nd = len(tp)
                    rc = tp / npig
                    pr = tp / (fp+tp+np.spacing(1))
                    q = np.zeros((R,))
                    ss = np.zeros((R,))
                    if nd:
                        recall[t,k,a,m] = rc[-1]
                    else:
                        recall[t,k,a,m] = 0
                    # numpy is slow without cython optimization for accessing elements
                    # use python array gets significant speed improvement
                    pr = pr.tolist(); q = q.tolist()
                    for i in range(nd-1, 0, -1):
                        if pr[i] > pr[i-1]:
                            pr[i-1] = pr[i]
                    inds = np.searchsorted(rc, p.recThrs, side='left')
                    try:
                        for ri, pi in enumerate(inds):
                            q[ri] = pr[pi]
                            ss[ri] = dtScoresSorted[pi]
                    except:
                        pass
                    precision[t,:,k,a,m] = np.array(q)
                    scores[t,:,k,a,m] = np.array(ss)
    self.eval = {
        'params': p,
        'counts': [T, R, K, A, M],
        'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'precision': precision,
        'recall': recall,
        'scores': scores,
    }
    toc = time.time()
    print('DONE (t={:0.2f}s).'.format(toc-tic))
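Reading the code above: npig = np.count_nonzero(gtIg==0) is the number of non-ignored ground-truth objects in the current slice, so rc = tp / npig is exactly Recall = TP / (TP + FN), since every non-ignored ground truth is either matched (TP) or missed (FN). The scores array holds the sorted detection confidences (dtScoresSorted[pi]) sampled at each recall threshold, not an F1 score, and T, R, K, A, M index the IoU thresholds, recall thresholds, categories, area ranges and maxDets values, in that order. A hedged sketch of printing precision and recall after accumulate() (it assumes a COCOeval instance named coco_eval with the default parameters):

import numpy as np
import matplotlib.pyplot as plt

ev = coco_eval.eval            # filled in by accumulate()
prec = ev['precision']         # shape [T, R, K, A, M]
rec = ev['recall']             # shape [T, K, A, M]

# area range 'all' is index 0; the largest maxDets value is the last index
p_all = prec[:, :, :, 0, -1]
r_all = rec[:, :, 0, -1]

# -1 marks absent categories, so mask those entries out before averaging
print('mean AP:', p_all[p_all > -1].mean())
print('mean AR:', r_all[r_all > -1].mean())

# precision-recall curve for category 0 at the first IoU threshold (0.50)
plt.plot(coco_eval.params.recThrs, prec[0, :, 0, 0, -1])
plt.xlabel('recall')
plt.ylabel('precision')
plt.show()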

PyTorch: slicing a tensor causes RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I wrote an RNN with an LSTM cell in PyCharm. The peculiarity of this network is that the output of the RNN is fed into an integration operation, computed with Runge-Kutta.
The integration takes some input and propagates it one step ahead in time. In order to do so I need to slice the feature tensor X along the batch dimension and pass the slices to the Runge-Kutta routine.
class MyLSTM(torch.nn.Module):
    def __init__(self, ni, no, sampling_interval, nh=10, nlayers=1):
        super(MyLSTM, self).__init__()
        self.device = torch.device("cpu")
        self.dtype = torch.float
        self.ni = ni
        self.no = no
        self.nh = nh
        self.nlayers = nlayers
        self.lstms = torch.nn.ModuleList(
            [torch.nn.LSTMCell(self.ni, self.nh)] + [torch.nn.LSTMCell(self.nh, self.nh) for i in range(nlayers - 1)])
        self.out = torch.nn.Linear(self.nh, self.no)
        self.do = torch.nn.Dropout(p=0.2)
        self.actfn = torch.nn.Sigmoid()
        self.sampling_interval = sampling_interval
        self.scaler_states = None

    # Options
    # description of the whole block
    def forward(self, x, h0, train=False, integrate_ode=True):
        x0 = x.clone().requires_grad_(True)
        hs = x  # initiate hidden state
        if h0 is None:
            h = torch.zeros(hs.shape[0], self.nh, device=self.device)
            c = torch.zeros(hs.shape[0], self.nh, device=self.device)
        else:
            (h, c) = h0
        # LSTM cells
        for i in range(self.nlayers):
            h, c = self.lstms[i](hs, (h, c))
            if train:
                hs = self.do(h)
            else:
                hs = h
        # Output layer
        # y = self.actfn(self.out(hs))
        y = self.out(hs)
        if integrate_ode:
            p = y
            y = self.integrate(x0, p)
        return y, (h, c)

    def integrate(self, x0, p):
        # RK4 steps per interval
        M = 4
        DT = self.sampling_interval / M
        X = x0
        # X = self.scaler_features.inverse_transform(x0)
        for b in range(X.shape[0]):
            xx = X[b, :]
            for j in range(M):
                k1 = self.ode(xx, p[b, :])
                k2 = self.ode(xx + DT / 2 * k1, p[b, :])
                k3 = self.ode(xx + DT / 2 * k2, p[b, :])
                k4 = self.ode(xx + DT * k3, p[b, :])
                xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
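            # NOTE: X_all is not defined in this snippet; this in-place write
            # (together with the xx = X[b, :] select) is what the autograd
            # error below points at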
            X_all[b, :] = xx
        return X_all

    def ode(self, x0, y):
        # Here I define the dynamic model
I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 64; expected version 63 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
The problem is in the operations xx = X[b, :] and p[b, :]. I know that because when I choose a batch dimension of 1 I can replace those two expressions with xx = X and p, and then it works. How can I split the tensor without losing the gradient?
I had the same question, and after a lot of searching I added a .detach() call after "h" and "c" in the RNN cell.
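Detaching the hidden state stops gradients from flowing through it, so if that is not acceptable, another option is to avoid the in-place write into X_all altogether: collect the integrated rows in a Python list and stack them at the end. A sketch of integrate rewritten that way (same assumed signature as above):

import torch

def integrate(self, x0, p):
    # RK4 steps per interval
    M = 4
    DT = self.sampling_interval / M
    rows = []
    for b in range(x0.shape[0]):
        xx = x0[b, :]
        for j in range(M):
            k1 = self.ode(xx, p[b, :])
            k2 = self.ode(xx + DT / 2 * k1, p[b, :])
            k3 = self.ode(xx + DT / 2 * k2, p[b, :])
            k4 = self.ode(xx + DT * k3, p[b, :])
            xx = xx + DT / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
        rows.append(xx)
    # torch.stack builds a fresh tensor from the list, so no tensor recorded
    # by autograd is modified in place
    return torch.stack(rows, dim=0)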

Scipy.optimize - minimize not respecting constraints

I am using the code below to understand how SciPy optimization/minimization works. The results are not matching what I am expecting.
"""
Minimize: f = 2*x[0]*x[1] + 2*x[0] - x[0]**2 - 2*x[1]**2
Subject to: -2*x[0] + 2*x[1] <= -2
2*x[0] - 4*x[1] <= 0
x[0]**3 -x[1] == 0
where: 0 <= x[0] <= inf
1 <= x[1] <= inf
"""
import numpy as np
from scipy.optimize import minimize
def objective(x):
return 2.0*x[0]*x[1] + 2.0*x[0] - x[0]**2 - 2.0*x[1]**2
def constraint1(x):
return +2.0*x[0] - 2.0*x[1] - 2.0
def constraint2(x):
return -2.0*x[0] + 4.0*x[1]
def constraint3(x):
sum_eq = x[0]**3.0 -x[1]
return sum_eq
# initial guesses
n = 2
x0 = np.zeros(n)
x0[0] = 10.0
x0[1] = 100.0
# show initial objective
print('Initial SSE Objective: ' + str(objective(x0)))
# optimize
#b = (1.0,None)
bnds = ((0.0,1000.0), (1.0,1000.0))
con1 = {'type': 'ineq', 'fun': constraint1}
con2 = {'type': 'ineq', 'fun': constraint2}
con3 = {'type': 'eq', 'fun': constraint3}
cons = ([con1, con2, con3])
solution = minimize(objective,
x0,
method='SLSQP',
bounds=bnds,
constraints=cons)
x = solution.x
print(solution)
# show final objective
print('Final SSE Objective: ' + str(objective(x)))
# print solution
print('Solution')
print('x1 = ' + str(x[0]))
print('x2 = ' + str(x[1]))
print('\n')
print('x', x)
print('constraint1', constraint1(x))
print('constraint2', constraint2(x))
print('constraint3', constraint3(x))
When I run it, this is what Python prints to the console:
Initial SSE Objective: -18080.0
fun: 2.0
jac: array([ 0.00000000e+00, -2.98023224e-08])
message: 'Optimization terminated successfully.'
nfev: 122
nit: 17
njev: 13
status: 0
success: True
x: array([2., 1.])
Final SSE Objective: 2.0
Solution
x1 = 2.0000000000010196
x2 = 1.0000000000012386
x [2. 1.]
constraint1 -4.3787196091216174e-13
constraint2 2.915001573455811e-12
constraint3 7.000000000010997
Although the optimizer says the result was successful, constraint3 is not respected: its value should be zero. What am I missing?
Your constraints are incompatible. You can eliminate the 3rd constraint by substituting x[1] = x[0]**3 (which also makes your problem simpler in the first place: only a scalar optimization); after this it is a bit easier to see what the problem is. From constraint 3 and the lower bound on the original x[1] it follows that x[0] is not feasible from 0 to 1, so the lower bound in the 1D problem should be 1. It is then easy to see that the first inequality, -2*x[0] + 2*x[1] <= -2, can never be satisfied: after the substitution its left-hand side is -2*x[0] + 2*x[0]**3, which is always non-negative when x[0] is larger than 1.
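A quick numerical check of that substitution argument:

import numpy as np

# grid over the feasible range x[0] >= 1
x0 = np.linspace(1.0, 3.0, 5)
# left-hand side of the first inequality with x[1] = x[0]**3 substituted in;
# every value is >= 0, so it can never be <= -2
print(-2 * x0 + 2 * x0**3)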
When I run your original problem, for me it stops with 'positive directional derivative' (and the rewritten problem with 'Inequality constraints incompatible').
Which SciPy version are you using? For me it is 1.4.1.
In the picture below you can see the objective and the remaining constraints for the 1D problem (the horizontal axis is the original x[0] variable):
"""
Minimize: f = 2*x[0]*x1 + 2*x[0] - x[0]**2 - 2*x1**2
Subject to: -2*x[0] + 2*x[1] <= -2
2*x[0] - 4*x[1] <= 0
x[0]**3 -x[1] == 0
where: 0 <= x[0] <= inf
1 <= x[1] <= inf
"""
import numpy as np
from scipy.optimize import minimize
def objective(x):
return 2*x**4 + 2*x - x**2 - 2*x**6
def constraint1(x):
return x - x**3 - 1
def constraint2(x):
return 2 * x**3 - x
#
# def constraint3(x):
# sum_eq = x[0]**3.0 -x[1]
# return sum_eq
# initial guesses
n = 1
x0 = np.zeros(n)
x0[0] = 2.
# x0[1] = 100.0
# show initial objective
print('Initial SSE Objective: ' + str(objective(x0)))
# optimize
#b = (1.0,None)
bnds = ((1.0,1000.0),)
con1 = {'type': 'ineq', 'fun': constraint1}
con2 = {'type': 'ineq', 'fun': constraint2}
# con3 = {'type': 'eq', 'fun': constraint3}
cons = [
# con1,
con2,
# con3,
]
solution = minimize(objective,
x0,
method='SLSQP',
bounds=bnds,
constraints=cons)
x = solution.x
print(solution)
# show final objective
print('Final SSE Objective: ' + str(objective(x)))
# print solution
print('Solution')
print('x1 = ' + str(x[0]))
# print('x2 = ' + str(x[1]))
print('\n')
print('x', x)
print('constraint1', constraint1(x))
print('constraint2', constraint2(x))
# print('constraint3', constraint3(x))
x_a = np.linspace(1, 2, 200)
f = objective(x_a)
c1 = constraint1(x_a)
c2 = constraint2(x_a)
import matplotlib.pyplot as plt
plt.figure()
plt.plot(x_a, f, label="f")
plt.plot(x_a, c1, label="c1")
plt.plot(x_a, c2, label="c2")
plt.legend()
plt.show()

How to solve logistic regression using gradient descent in Octave?

I am taking the Machine Learning course on Coursera by Andrew Ng. I have written code for logistic regression in Octave, but it is not working. Can someone help me?
I have taken the dataset from the following link:
Titanic survivors
Here is my code:
pkg load io;
[An, Tn, Ra, limits] = xlsread("~/ML/ML Practice/dataset/train_and_test2.csv", "Sheet2", "A2:H1000");

# As per the CSV file we are reading columns 1 to 7. The 8th column is Survived, which is what we are going to predict.
X = [An(:, [1:7])];
Y = [An(:, 8)];
X = horzcat(ones(size(X,1), 1), X);

# Initializing theta values as zero for all
#theta = zeros(size(X,2),1);
theta = [-3;1;1;-3;1;1;1;1];
learningRate = -0.00021;
#learningRate = -0.00011;

# Step 1: Calculate Hypothesis
function g_z = estimateHypothesis(X, theta)
    z = theta' * X';
    z = z';
    e_z = -1 * power(2.72, z);
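    # BUG (see the accepted fix below): this computes -(2.72^z); it should be
    # power(2.72, -z), or better exp(-z)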
    denominator = 1 .+ e_z;
    g_z = 1 ./ denominator;
endfunction

# Step 2: Calculate Cost function
function cost = estimateCostFunction(hypothesis, Y)
    log_1 = log(hypothesis);
    log_2 = log(1 .- hypothesis);
    y1 = Y;
    term_1 = y1 .* log_1;
    y2 = 1 .- Y;
    term_2 = y2 .* log_2;
    cost = term_1 + term_2;
    cost = sum(cost);
    # no. of rows
    m = size(Y, 1);
    cost = -1 * (cost/m);
endfunction

# Step 3: Using gradient descent I am updating theta values
function updatedTheta = updateThetaValues(_X, _Y, _theta, _hypothesis, learningRate)
    #s1 = _X * _theta;
    #s2 = s1 - _Y;
    #s3 = _X' * s2;
    # no. of rows
    #m = size(_Y, 1);
    #s4 = (learningRate * s3)/m;
    #updatedTheta = _theta - s4;
    s1 = _hypothesis - _Y;
    s2 = s1 .* _X;
    s3 = sum(s2);
    # no. of rows
    m = size(_Y, 1);
    s4 = (learningRate * s3)/m;
    updatedTheta = _theta .- s4';
endfunction

costVector = [];
iterationVector = [];
for i = 1:1000
    # Step 1
    hypothesis = estimateHypothesis(X, theta);
    #disp("hypothesis");
    #disp(hypothesis);
    # Step 2
    cost = estimateCostFunction(hypothesis, Y);
    costVector = vertcat(costVector, cost);
    #disp("Cost");
    #disp(cost);
    # Step 3 - Updating theta values
    theta = updateThetaValues(X, Y, theta, hypothesis, learningRate);
    iterationVector = vertcat(iterationVector, i);
endfor

function plotGraph(iterationVector, costVector)
    plot(iterationVector, costVector);
    ylabel('Cost Function');
    xlabel('Iteration');
endfunction

plotGraph(iterationVector, costVector);
This is the graph I get when I plot the cost function against the number of iterations.
I am tired of adjusting theta values and the learning rate. Can someone help me solve this problem?
Thanks.
I had made a mathematical error: I should have used either power(2.72, -z) or exp(-z), but instead I used -1 * power(2.72, z). Now I'm getting a proper curve.
Thanks.
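For reference, a minimal NumPy sketch of the corrected hypothesis plus one gradient-descent step (an illustrative translation with made-up data, using exp(-z) and a positive learning rate):

import numpy as np

def sigmoid(z):
    # correct logistic function: 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))

# made-up data: m samples, 7 features plus a bias column
m = 100
X = np.hstack([np.ones((m, 1)), np.random.randn(m, 7)])
y = (np.random.rand(m, 1) > 0.5).astype(float)
theta = np.zeros((8, 1))
alpha = 0.00021

h = sigmoid(X @ theta)                                    # Step 1: hypothesis
cost = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))  # Step 2: cost
theta = theta - alpha * (X.T @ (h - y)) / m               # Step 3: update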

Gradient in continuous regression using a neural network

I'm trying to implement a regression NN that has 3 layers (1 input, 1 hidden and 1 output layer with a continuous result). As a basis I took a classification NN from the coursera.org class, but changed the cost function and gradient calculation to fit a regression problem (and not a classification one):
My nnCostFunction now is:
function [J grad] = nnCostFunctionLinear(nn_params, ...
                                         input_layer_size, ...
                                         hidden_layer_size, ...
                                         num_labels, ...
                                         X, y, lambda)
    Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                     hidden_layer_size, (input_layer_size + 1));
    Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                     num_labels, (hidden_layer_size + 1));

    m = size(X, 1);
    a1 = X;
    a1 = [ones(m, 1) a1];
    a2 = a1 * Theta1';
    a2 = [ones(m, 1) a2];
    a3 = a2 * Theta2';
    Y = y;

    J = 1/(2*m)*sum(sum((a3 - Y).^2))

    th1 = Theta1;
    th1(:,1) = 0; % set bias = 0 in reg. formula
    th2 = Theta2;
    th2(:,1) = 0;
    t1 = th1.^2;
    t2 = th2.^2;
    th = sum(sum(t1)) + sum(sum(t2));
    th = lambda * th / (2*m);
    J = J + th; % regularization

    del_3 = a3 - Y;
    t1 = del_3'*a2;
    Theta2_grad = 2*(t1)/m + lambda*th2/m;
    t1 = del_3 * Theta2;
    del_2 = t1 .* a2;
    del_2 = del_2(:,2:end);
    t1 = del_2'*a1;
    Theta1_grad = 2*(t1)/m + lambda*th1/m;
    grad = [Theta1_grad(:) ; Theta2_grad(:)];
end
Then I use this function in the fmincg algorithm, but fmincg ends its work within the first few iterations. I think my gradient is wrong, but I can't find the error.
Can anybody help?
If I understand correctly, your first block of code (shown below) -
m = size(X, 1);
a1 = X;
a1 = [ones(m, 1) a1];
a2 = a1 * Theta1';
a2 = [ones(m, 1) a2];
a3 = a2 * Theta2';
Y = y;
is to get the output a(3) at the output layer.
Ng's slides on NNs compute a(3) by applying the activation function at every layer, i.e. a(2) = g(Theta1 * a(1)) and a(3) = g(Theta2 * a(2)), where g is e.g. a sigmoid function. That's different from what your code presents: in the middle/output layer you are not applying the activation function g.
In terms of the cost function J without regularization terms, Ng's slides give the logistic (cross-entropy) cost, J = -(1/m) * sum(sum(Y .* log(a3) + (1 - Y) .* log(1 - a3))). I don't understand why you can compute it using:
J = 1/(2*m)*sum(sum((a3 - Y).^2))
because you are not including the log function at all.
Mikhaill, I've been playing with a NN for continuous regression as well, and had similar issues at some point. The best thing to do here is to test the gradient computation against a numerical calculation before running the model. If that's not correct, fmincg won't be able to train the model. (Btw, I discourage you from using the numerical gradient for training, as the time involved is much bigger.)
Taking into account that you took this idea from Ng's Coursera class, I'll implement a possible solution for you to try, using the same notation for Octave.
% Cost function without regularization.
J = 1/(2*m) * sum((a3 - Y).^2);

% In case it's needed, the regularization term is added (i.e. for training).
if (reg == true)
    J = J + lambda/(2*m) * (sum(sum(Theta1(:, 2:end).^2)) + sum(sum(Theta2(:, 2:end).^2)));
endif

% Derivatives are computed for layers 2 and 3.
d3 = (a3 .- Y);
d2 = d3 * Theta2(:, 2:end);

% Theta grad is computed without regularization.
Theta1_grad = (d2' * a1) ./ m;
Theta2_grad = (d3' * a2) ./ m;

% Regularization is added to the grad computation.
Theta1_grad(:, 2:end) = Theta1_grad(:, 2:end) + (lambda/m) .* Theta1(:, 2:end);
Theta2_grad(:, 2:end) = Theta2_grad(:, 2:end) + (lambda/m) .* Theta2(:, 2:end);

% Unroll gradients.
grad = [Theta1_grad(:) ; Theta2_grad(:)];
Note that, since you have taken out all the sigmoid activations, the derivative calculation is quite simple and results in a simplification of the original code.
Next steps:
1. Check this code to see whether it makes sense for your problem.
2. Use gradient checking to test the gradient calculation (see the sketch after this list).
3. Finally, use fmincg and check that you get different results.
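A minimal NumPy sketch of such a gradient check (illustrative; cost is assumed to be any function that returns the pair (J, grad) for a flat parameter vector):

import numpy as np

def check_gradient(cost, theta, eps=1e-4):
    _, grad = cost(theta)
    num_grad = np.zeros_like(theta)
    for i in range(theta.size):
        step = np.zeros_like(theta)
        step[i] = eps
        # central finite difference for the i-th parameter
        num_grad[i] = (cost(theta + step)[0] - cost(theta - step)[0]) / (2 * eps)
    # should be on the order of 1e-9 when the analytic gradient is correct
    return np.linalg.norm(num_grad - grad) / np.linalg.norm(num_grad + grad)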
Try including the sigmoid function to compute the second-layer (hidden-layer) values, and avoid the sigmoid in calculating the target (output) value.
function [J grad] = nnCostFunction1(nnParams, ...
                                    inputLayerSize, ...
                                    hiddenLayerSize, ...
                                    numLabels, ...
                                    X, y, lambda)
    Theta1 = reshape(nnParams(1:hiddenLayerSize * (inputLayerSize + 1)), ...
                     hiddenLayerSize, (inputLayerSize + 1));
    Theta2 = reshape(nnParams((1 + (hiddenLayerSize * (inputLayerSize + 1))):end), ...
                     numLabels, (hiddenLayerSize + 1));

    Theta1Grad = zeros(size(Theta1));
    Theta2Grad = zeros(size(Theta2));

    m = size(X,1);

    a1 = [ones(m, 1) X]';
    z2 = Theta1 * a1;
    a2 = sigmoid(z2);
    a2 = [ones(1, m); a2];
    z3 = Theta2 * a2;
    a3 = z3;
    Y = y';

    r1 = lambda / (2 * m) * sum(sum(Theta1(:, 2:end) .* Theta1(:, 2:end)));
    r2 = lambda / (2 * m) * sum(sum(Theta2(:, 2:end) .* Theta2(:, 2:end)));
    J = 1 / (2 * m) * (a3 - Y) * (a3 - Y)' + r1 + r2;

    delta3 = a3 - Y;
    delta2 = (Theta2' * delta3) .* sigmoidGradient([ones(1, m); z2]);
    delta2 = delta2(2:end, :);

    Theta2Grad = 1 / m * (delta3 * a2');
    Theta2Grad(:, 2:end) = Theta2Grad(:, 2:end) + lambda / m * Theta2(:, 2:end);
    Theta1Grad = 1 / m * (delta2 * a1');
    Theta1Grad(:, 2:end) = Theta1Grad(:, 2:end) + lambda / m * Theta1(:, 2:end);

    grad = [Theta1Grad(:) ; Theta2Grad(:)];
end
Normalize the inputs before passing them to nnCostFunction.
In accordance with the Week 5 lecture notes guideline, for a linear-system NN you should make the following changes to the initial code:
- Remove num_labels, or set it to 1 (in reshape() as well)
- There is no need to convert y into a logical matrix
- For a2, replace the sigmoid() function with tanh()
- In the d2 calculation, replace sigmoidGradient(z2) with (1 - tanh(z2).^2)
- Remove the sigmoid from the output layer (a3 = z3)
- Replace the cost function in the unregularized portion with a linear one: J = (1/(2*m))*sum((a3-y).^2)
- Create predictLinear(): use the predict() function as a basis; replace the sigmoid with tanh() for the first-layer hypothesis, remove the second sigmoid for the second-layer hypothesis, remove the line with the max() function, and use the output of the hidden-layer hypothesis as the prediction result
- Verify your nnCostFunctionLinear() on the test case from the lecture notes
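A rough NumPy sketch of the forward and backward pass after those changes (illustrative shapes and random data, not the course code):

import numpy as np

m, n_in, n_h = 50, 3, 10
X = np.random.randn(m, n_in)
y = np.random.randn(m, 1)
Theta1 = 0.1 * np.random.randn(n_h, n_in + 1)
Theta2 = 0.1 * np.random.randn(1, n_h + 1)

a1 = np.hstack([np.ones((m, 1)), X])
z2 = a1 @ Theta1.T
a2 = np.hstack([np.ones((m, 1)), np.tanh(z2)])  # tanh instead of sigmoid
a3 = a2 @ Theta2.T                              # no sigmoid on the output
J = (1 / (2 * m)) * np.sum((a3 - y) ** 2)       # linear (squared-error) cost

d3 = a3 - y
d2 = (d3 @ Theta2[:, 1:]) * (1 - np.tanh(z2) ** 2)  # tanh gradient
Theta2_grad = (d3.T @ a2) / m
Theta1_grad = (d2.T @ a1) / m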
