Cost does not converge / converges very slowly in the soft-coded version?

I don't understand this. When I hard-code my script, it converges nicely, but in the soft-coded version, given the same structure and learning rate, it converges very slowly and then simply stops converging at some point.
Here is the soft-coded version:
def BCE_loss(Y_hat, Y):
    m = Y_hat.shape[1]
    cost = (-1 / m) * (np.dot(Y, np.log(Y_hat+1e-5).T) + np.dot(1-Y, np.log(1-Y_hat+1e-5).T))
    cost = np.squeeze(cost)
    return cost

def BCE_loss_backward(Y_hat, Y):
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1-Y, 1-Y_hat))
    return dA_prev

def gradient(dZ, A_prev):
    dW = np.dot(dZ, A_prev.T) * (1 / A_prev.shape[1])
    db = np.sum(dZ, axis=1, keepdims=True) * (1 / A_prev.shape[1])
    return dW, db

def update(W, b, dW, db, learning_rate):
    W -= np.dot(learning_rate, dW)
    b -= np.dot(learning_rate, db)
    return W, b
for i in range(epochs+1):

    ## Forward pass
    for l in range(1, L):
        if l == L-1:
            if out_dim == 1:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = sigmoid(grads_GD['Z'+str(l)])
            else:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = softmax(grads_GD['Z'+str(l)])
        else:
            grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
            grads_GD['A'+str(l)] = relu(grads_GD['Z'+str(l)])

    ## Compute cost
    if out_dim == 1:
        cost_GD = BCE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)
    else:
        cost_GD = CE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)

    ## Print cost
    if i % print_num == 0:
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
    elif cost_GD < cost_lim or i == epochs:
        last_epoch_GD = i
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
        break
    else:
        continue  # the culprit (see below): jumps to the next epoch, skipping the backward pass and update

    ## Backward pass
    if out_dim == 1:
        grads_GD['dA'+str(L-1)] = BCE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = sigmoid_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    else:
        grads_GD['dA'+str(L-1)] = CE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = softmax_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    grads_GD['dW'+str(L-1)], grads_GD['db'+str(L-1)] = gradient(grads_GD['dZ'+str(L-1)], grads_GD['A'+str(L-2)])

    for l in reversed(range(1, L-1)):
        grads_GD['dA'+str(l)] = linear_backward(params_GD['W'+str(l+1)], grads_GD['dZ'+str(l+1)])
        grads_GD['dZ'+str(l)] = relu_backward(grads_GD['dA'+str(l)], grads_GD['Z'+str(l)])
        grads_GD['dW'+str(l)], grads_GD['db'+str(l)] = gradient(grads_GD['dZ'+str(l)], grads_GD['A'+str(l-1)])

    ## Update parameters
    for l in range(1, L):
        params_GD['W'+str(l)], params_GD['b'+str(l)] = update(params_GD['W'+str(l)], params_GD['b'+str(l)], grads_GD['dW'+str(l)], grads_GD['db'+str(l)], learning_rate)
And here is the hardcoded version:
def cost_function(Y, A4, N, epsilon):
    cost = (-1 / N) * np.sum(np.multiply(Y, np.log(A4 + epsilon)) + np.multiply(1 - Y, np.log(1 - A4 + epsilon)))
    return cost

for i in range(epochs):
    # Forward pass
    Z1_GD = np.dot(W1_GD, X) + b1_GD
    A1_GD = np.maximum(0, Z1_GD)
    Z2_GD = np.dot(W2_GD, A1_GD) + b2_GD
    A2_GD = np.maximum(0, Z2_GD)
    Z3_GD = np.dot(W3_GD, A2_GD) + b3_GD
    A3_GD = np.maximum(0, Z3_GD)
    Z4_GD = np.dot(W4_GD, A3_GD) + b4_GD
    A4_GD = class_layer(Z4_GD)

    # Backward pass
    dZ4_GD = A4_GD - Y
    dW4_GD = np.dot(dZ4_GD, A3_GD.T) * (1. / A3_GD.shape[1])
    db4_GD = np.sum(dZ4_GD, axis=1, keepdims=True) * (1. / A3_GD.shape[1])
    dA3_GD = np.dot(W4_GD.T, dZ4_GD)
    dZ3_GD = np.array(dA3_GD, copy=True)
    dZ3_GD[Z3_GD <= 0] = 0
    dW3_GD = np.dot(dZ3_GD, A2_GD.T) * (1. / A2_GD.shape[1])
    db3_GD = np.sum(dZ3_GD, axis=1, keepdims=True) * (1. / A2_GD.shape[1])
    dA2_GD = np.dot(W3_GD.T, dZ3_GD)
    dZ2_GD = np.array(dA2_GD, copy=True)
    dZ2_GD[Z2_GD <= 0] = 0
    dW2_GD = np.dot(dZ2_GD, A1_GD.T) * (1. / A1_GD.shape[1])
    db2_GD = np.sum(dZ2_GD, axis=1, keepdims=True) * (1. / A1_GD.shape[1])
    dA1_GD = np.dot(W2_GD.T, dZ2_GD)
    dZ1_GD = np.array(dA1_GD, copy=True)
    dZ1_GD[Z1_GD <= 0] = 0
    dW1_GD = np.dot(dZ1_GD, X.T) * (1. / X.shape[1])
    db1_GD = np.sum(dZ1_GD, axis=1, keepdims=True) * (1. / X.shape[1])

    # Update parameters
    W1_GD = W1_GD - learning_rate * dW1_GD
    b1_GD = b1_GD - learning_rate * db1_GD
    W2_GD = W2_GD - learning_rate * dW2_GD
    b2_GD = b2_GD - learning_rate * db2_GD
    W3_GD = W3_GD - learning_rate * dW3_GD
    b3_GD = b3_GD - learning_rate * db3_GD
    W4_GD = W4_GD - learning_rate * dW4_GD
    b4_GD = b4_GD - learning_rate * db4_GD

    # Compute and record cost
    cost_GD = cost_function(Y, A4_GD, N, epsilon)
    cost_GD = np.squeeze(cost_GD)
    cost_list_GD.append(cost_GD)
I suppose something went wrong during the soft-coding.

I solved it myself. Apparently, the `else: continue` line in the print-cost section caused the algorithm to skip the backward pass on every epoch where the cost wasn't printed; most of the time it was just looping through the forward pass. Can anyone please explain the reason for this behavior?
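In case it helps others: `continue` jumps straight to the next iteration of the enclosing `for` loop, so everything below the `if/elif/else` block (the backward pass and the parameter update) is skipped whenever the `else` branch runs. A minimal sketch with a hypothetical `print_num = 3`:

print_num = 3  # hypothetical print interval, for illustration only

for i in range(7):
    if i % print_num == 0:
        print(f"epoch {i}: cost printed")   # falls through to the code below
    else:
        continue                            # skips the rest of this iteration
    print(f"epoch {i}: backward pass and update would run here")

Only epochs 0, 3, and 6 reach the last line; dropping the `else: continue` (it does nothing useful there) lets every epoch fall through to the backward pass.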

Related

How to solve logistic regression using gradient descent in Octave?

I am taking Andrew Ng's Machine Learning course on Coursera and have written code for logistic regression in Octave, but it is not working. Can someone help me?
I took the dataset from the following link:
Titanic survivors
Here is my code:
pkg load io;
[An, Tn, Ra, limits] = xlsread("~/ML/ML Practice/dataset/train_and_test2.csv", "Sheet2", "A2:H1000");

# As per CSV file we are reading columns from 1 to 7. 8-th column is Survived, which is what we are going to predict
X = [An(:, [1:7])];
Y = [An(:, 8)];
X = horzcat(ones(size(X,1), 1), X);

# Initializing theta values as zero for all
#theta = zeros(size(X,2),1);
theta = [-3;1;1;-3;1;1;1;1];
learningRate = -0.00021;
#learningRate = -0.00011;

# Step 1: Calculate Hypothesis
function g_z = estimateHypothesis(X, theta)
  z = theta' * X';
  z = z';
  e_z = -1 * power(2.72, z);
  denominator = 1.+e_z;
  g_z = 1./denominator;
endfunction

# Step 2: Calculate Cost function
function cost = estimateCostFunction(hypothesis, Y)
  log_1 = log(hypothesis);
  log_2 = log(1.-hypothesis);
  y1 = Y;
  term_1 = y1.*log_1;
  y2 = 1.-Y;
  term_2 = y2.*log_2;
  cost = term_1 + term_2;
  cost = sum(cost);
  # no.of.rows
  m = size(Y, 1);
  cost = -1 * (cost/m);
endfunction

# Step 3: Using gradient descent I am updating theta values
function updatedTheta = updateThetaValues(_X, _Y, _theta, _hypothesis, learningRate)
  #s1 = _X * _theta;
  #s2 = s1 - _Y;
  #s3 = _X' * s2;
  # no.of.rows
  #m = size(_Y, 1);
  #s4 = (learningRate * s3)/m;
  #updatedTheta = _theta - s4;
  s1 = _hypothesis - _Y;
  s2 = s1 .* _X;
  s3 = sum(s2);
  # no.of.rows
  m = size(_Y, 1);
  s4 = (learningRate * s3)/m;
  updatedTheta = _theta .- s4';
endfunction

costVector = [];
iterationVector = [];
for i = 1:1000
  # Step 1
  hypothesis = estimateHypothesis(X, theta);
  #disp("hypothesis");
  #disp(hypothesis);
  # Step 2
  cost = estimateCostFunction(hypothesis, Y);
  costVector = vertcat(costVector, cost);
  #disp("Cost");
  #disp(cost);
  # Step 3 - Updating theta values
  theta = updateThetaValues(X, Y, theta, hypothesis, learningRate);
  iterationVector = vertcat(iterationVector, i);
endfor

function plotGraph(iterationVector, costVector)
  plot(iterationVector, costVector);
  ylabel('Cost Function');
  xlabel('Iteration');
endfunction

plotGraph(iterationVector, costVector);
This is the graph I get when plotting the cost function against the number of iterations.
I am tired of adjusting theta values and the learning rate. Can someone help me solve this problem?
Thanks.
I had made a mathematical error: I should have used either power(2.72, -z) or exp(-z). Instead I used -1 * power(2.72, z). Now I'm getting a proper curve.
Thanks.
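For reference, the same fix in NumPy form (a sketch; estimate_hypothesis is a hypothetical Python counterpart of the Octave estimateHypothesis, and exp(-z) also avoids approximating e as 2.72):

import numpy as np

def estimate_hypothesis(X, theta):
    # Sigmoid hypothesis: g(z) = 1 / (1 + e^(-z)), with z = X @ theta
    z = X @ theta
    return 1.0 / (1.0 + np.exp(-z))  # note the sign: e^(-z), not -e^(z)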

Gradient Descent cost function explosion

I am writing this code for linear regression and trying gradient descent to minimize the RSS. The cost function seems to explode to infinity within 12 iterations, which I know is not supposed to happen. Maybe I have used the wrong gradient function for the RSS (see the function grad())?
NumberObservations = 100
minVal = 1
maxVal = 20
X = np.random.uniform(minVal, maxVal, (NumberObservations, 1))
e = np.random.normal(0, 1, (NumberObservations, 1))
Y = 10 + 5*X + e
B = np.array([[0], [0]])
sum_y = sum(Y)
sum_x = sum(X)
sum_xy = sum(np.multiply(X, Y))
sum_x2 = sum(X*X)
alpha = 0.00001
iterations = 15

def cost_fun(X, Y, B):
    b0 = B[0]
    b1 = B[1]
    s = (Y - (b0 + (b1*X)))**2
    rss = sum(s)
    return rss

def grad(X, Y, B):
    print("B = " + str(B))
    b0 = B[0]
    b1 = B[1]
    g0 = -2*(Y - b0 - (b1*X))
    g1 = -2*((X*Y) - (b0*X) - (b1*X**2))
    grad = np.concatenate((g0, g1), axis=1)
    return grad

def gradient_descent(X, Y, B, alpha, iterations):
    cost_history = [0] * iterations
    m = len(Y)
    x0 = np.array(np.ones(m))
    x0 = x0.reshape((100, 1))
    X1 = np.concatenate((x0, X), axis=1)
    for iteration in range(iterations):
        h = np.dot(X1, B)
        h = h.reshape((100, 1))
        loss = h - Y
        g = grad(X, Y, B)
        gradient = (np.dot(g.T, loss) / m)
        B = B - alpha * gradient
        cost = cost_fun(X, Y, B)
        cost_history[iteration] = cost
        print("Iteration %d | Cost: %f" % (iteration, cost))
        print("-----------------------------------------------------------------------")
    return B, cost_history

newB, cost_history = gradient_descent(X, Y, B, alpha, iterations)

# New values of B
print(newB)
Please help.
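For comparison, a minimal sketch of a plain RSS gradient step for y ~ b0 + b1*x (the helper rss_gradient_step is hypothetical, not from the post): the per-observation gradients -2*(y - b0 - b1*x) and -2*x*(y - b0 - b1*x) are summed and averaged directly, rather than being multiplied by the residual a second time as np.dot(g.T, loss) does above.

import numpy as np

def rss_gradient_step(X, Y, B, alpha):
    # One gradient-descent step on RSS for y ~ b0 + b1*x.
    # X and Y are (m, 1) arrays; B is [[b0], [b1]], as in the code above.
    b0, b1 = B[0, 0], B[1, 0]
    residual = Y - (b0 + b1 * X)          # (m, 1)
    g0 = -2.0 * np.sum(residual)          # dRSS/db0
    g1 = -2.0 * np.sum(X * residual)      # dRSS/db1
    m = len(Y)
    return B - alpha * np.array([[g0], [g1]]) / m

With this step the update size is proportional to the gradient itself, so the cost should decrease for a small enough alpha instead of blowing up.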

Chainer how to save and load DQN model

I'm learning the deep reinforcement learning framework Chainer.
I've followed a tutorial and ended up with the following code:
def train_dddqn(env):

    class Q_Network(chainer.Chain):

        def __init__(self, input_size, hidden_size, output_size):
            super(Q_Network, self).__init__(
                fc1=L.Linear(input_size, hidden_size),
                fc2=L.Linear(hidden_size, hidden_size),
                fc3=L.Linear(hidden_size, hidden_size // 2),
                fc4=L.Linear(hidden_size, hidden_size // 2),
                state_value=L.Linear(hidden_size // 2, 1),
                advantage_value=L.Linear(hidden_size // 2, output_size)
            )
            self.input_size = input_size
            self.hidden_size = hidden_size
            self.output_size = output_size

        def __call__(self, x):
            h = F.relu(self.fc1(x))
            h = F.relu(self.fc2(h))
            hs = F.relu(self.fc3(h))
            ha = F.relu(self.fc4(h))
            state_value = self.state_value(hs)
            advantage_value = self.advantage_value(ha)
            advantage_mean = (F.sum(advantage_value, axis=1) / float(self.output_size)).reshape(-1, 1)
            q_value = F.concat([state_value for _ in range(self.output_size)], axis=1) + (
                advantage_value - F.concat([advantage_mean for _ in range(self.output_size)], axis=1))
            return q_value

        def reset(self):
            self.cleargrads()

    Q = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    epoch_num = 50
    step_max = len(env.data) - 1
    memory_size = 200
    batch_size = 50
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 5

    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()

    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0

        while not done and step < step_max:
            # select act
            pact = np.random.randint(3)
            if np.random.rand() > epsilon:
                pact = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                pact = np.argmax(pact.data)

            # act
            obs, reward, done = env.step(pact)

            # add memory
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)

            # train or update q
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i + batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)

                        q = Q(b_pobs)
                        indices = np.argmax(q.data, axis=1)
                        maxqs = Q_ast(b_obs).data
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            # Double-DQN target (this loop body was lost when the
                            # snippet was copied; reconstructed from the variables above)
                            target[j, b_pact[j]] = b_reward[j] + gamma * maxqs[j, indices[j]] * (not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()

                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)

            # epsilon
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease

            # next step
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1

        total_rewards.append(total_reward)
        total_losses.append(total_loss)

        if (epoch + 1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch + 1) - show_log_freq):]) / show_log_freq
            log_loss = sum(total_losses[((epoch + 1) - show_log_freq):]) / show_log_freq
            elapsed_time = time.time() - start
            print('\t'.join(map(str, [epoch + 1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
            start = time.time()

    return Q, total_losses, total_rewards

Q, total_losses, total_rewards = train_dddqn(Environment1(train))
My question is: how can I save and load this model once it has been trained? I know Keras has functions like model.save and load_model.
What is the corresponding code for this Chainer model?
You can use the serializers module to save/load a Chainer model's parameters (a Chain instance).
from chainer import serializers
Q = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
Q_ast = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
# --- train Q here... ---
# copy Q's parameters into Q_ast by saving them and loading into Q_ast
serializers.save_npz('my.model', Q)
serializers.load_npz('my.model', Q_ast)
See the official documentation for details:
http://docs.chainer.org/en/stable/guides/serializers.html
Also, you may refer to ChainerRL, a Chainer library for reinforcement learning:
https://github.com/chainer/chainerrl
ChainerRL has a utility function, copy_param, to copy parameters from a network source_link to a target_link:
https://github.com/chainer/chainerrl/blob/master/chainerrl/misc/copy_param.py#L12-L30
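If you also want to resume training later, the optimizer state can be serialized the same way; a sketch, assuming the Adam optimizer from the training code above (the filenames are placeholders):

from chainer import serializers

# Save model and optimizer state (placeholder filenames)
serializers.save_npz('my.model', Q)
serializers.save_npz('my.optimizer', optimizer)

# Restore them before resuming training
serializers.load_npz('my.model', Q)
serializers.load_npz('my.optimizer', optimizer)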

XOR Neural Network does not converge

I've been trying to replicate the [2,2,1] neural network that learns the XOR gate, and I can't get my model to converge. I'm not sure where I'm going wrong, and I would really appreciate some feedback.
Here is my code (originally run in a Jupyter notebook):
# Define activation function and derivative
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dsigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Define loss function and derivative
def mse(targets, predictions):
    return (1 / (2 * len(targets))) * (targets - predictions) ** 2

def dmse(targets, predictions):
    return targets - predictions

# Feedforward function
def feedforward(X):
    z1 = np.dot(l1_weights, X.T) + l1_biases
    a1 = sigmoid(z1)
    z2 = np.dot(l2_weights, a1) + l2_biases
    a2 = sigmoid(z2)
    return z1, a1, z2, a2

# Backpropagation function
def backprop(x, y):
    z1, a1, z2, a2 = feedforward(x)
    delta_l2 = dmse(y.T, a2) * dsigmoid(z2)
    delta_l1 = np.dot(l2_weights.T, delta_l2) * dsigmoid(z1)
    l2_dw = np.dot(delta_l2, a1.T)
    l2_db = delta_l2
    l1_dw = np.dot(delta_l1, x)
    l1_db = delta_l1
    return l1_dw, l1_db, l2_dw, l2_db

# Input data and labels
X = np.array([[1,0],[0,1],[1,1],[0,0]])
Y = np.array([[1],[1],[0],[0]])

# Create the data set
data = [(np.array(x), np.array(y)) for x, y in zip(X, Y)]

# Randomly initialize weights and biases
np.random.seed(1)
l1_weights = np.random.randn(2,2)
l2_weights = np.random.randn(1,2)
l1_biases = np.random.randn(2,1)
l2_biases = np.random.randn(1,1)

# Train the model
epochs = 100000
eta = 0.05
batch_size = 4

# Batch the data
batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

for batch in batches:
    feature_batch = np.array([x[0] for x in batch])
    label_batch = np.array([x[1] for x in batch])

    # Update network weights with stochastic gradient descent
    l1_dw, l1_db, l2_dw, l2_db = backprop(feature_batch, label_batch)
    l1_weights = l1_weights - (eta / batch_size) * l1_dw
    l2_weights = l2_weights - (eta / batch_size) * l2_dw
    l1_biases = l1_biases - (eta / batch_size) * l1_db
    l2_biases = l2_biases - (eta / batch_size) * l2_db
Here is a sample output after training with:
training epochs = 100000
learning rate = 0.5
logging periods = 10
batch size = 4
Error for period 1: 0.135619
Error for period 2: 0.249879
Error for period 3: 0.249941
Error for period 4: 0.249961
Error for period 5: 0.249956
Error for period 6: 0.249963
Error for period 7: 0.249972
Error for period 8: 0.249983
Error for period 9: 0.249986
Error for period 10: 0.249981
# OUTPUT
print(feedforward(X)[-1])
>>> array([[0.99997653, 0.99995257, 0.99997791, 0.99995534]])
Please help!
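One thing to check: with the loss defined as (1/(2n)) * (t - p)^2 in mse above, the derivative with respect to the prediction p is (p - t)/n, while dmse returns t - p. A sketch of the sign-consistent derivative, keeping the same function name:

def dmse(targets, predictions):
    # d/dp of (1/(2n)) * (t - p)^2, summed over the n samples, is (p - t)/n
    return (predictions - targets) / len(targets)

With the sign flipped, the update w - eta*dw steps uphill rather than downhill, which is consistent with the error climbing toward 0.25 in the log above.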

Cost value doesn't converge

I'm trying to code logistic regression, but I'm having trouble getting the cost to converge. Can anyone help me? My code is below. Thank you!
# Input: we have m = 3 training examples, each with n = 4 features
# (sorry, I know it looks weird here). Y is a label matrix.
m = 3
n = 4
X = np.array([[1,2,1],[1,1,0],[1,2,1],[1,0,2]])
Y = np.array([[0,1,0]])
h = 100000                        # iterations
alpha = 0.05                      # learning rate
b = 0                             # scalar bias
W = np.zeros(n).reshape(1,n)      # weights
J = np.zeros(h).reshape(1,h)      # a vector for holding cost values
Yhat = np.zeros(m).reshape(1,m)   # predicted values
def activation(yhat):
    return 1/(1+np.exp(-yhat))

W = W.T
for g in range(h):
    m = X.T.shape[0]
    Y_hat = activation(X.dot(W)+b)
    cost = -1/m * np.sum(Y*np.log(Y_hat)+(1-Y)*np.log(1-Y_hat))
    current_error = Y.T - Y_hat
    dW = 1/m * np.dot(X.T, current_error)
    db = 1/m * np.sum(current_error)
    W = W + alpha * dW
    b = b + alpha * db
    J[0][g] = cost
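For comparison, a minimal self-contained sketch with consistent shapes (my sketch, not the original code): with one example per column, X is (4, 3) and W is (4, 1), so X.dot(W) raises a shape error as posted; the forward pass needs W.T.dot(X), which gives a (1, 3) prediction that lines up with Y, and the update then steps along the negative gradient.

import numpy as np

# Same data: columns of X are the m = 3 examples, each with n = 4 features
X = np.array([[1, 2, 1], [1, 1, 0], [1, 2, 1], [1, 0, 2]])  # (4, 3)
Y = np.array([[0, 1, 0]])                                   # (1, 3)

n, m = X.shape
W = np.zeros((n, 1))
b = 0.0
alpha = 0.05

def activation(z):
    return 1 / (1 + np.exp(-z))

for g in range(100000):
    Y_hat = activation(W.T.dot(X) + b)       # (1, m), same shape as Y
    cost = -np.mean(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat))
    dZ = Y_hat - Y                           # dCost/dZ for sigmoid + BCE
    dW = X.dot(dZ.T) / m                     # (n, 1)
    db = np.mean(dZ)
    W -= alpha * dW                          # descend: subtract the gradient
    b -= alpha * db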