Flux.jl : Customizing optimizer - machine-learning

I'm trying to implement a gradient-free optimizer function to train convolutional neural networks with Julia using Flux.jl. The reference paper is this: https://arxiv.org/abs/2005.05955. This paper proposes RSO, a gradient-free optimization algorithm updates single weight at a time on a sampling bases. The pseudocode of this algorithm is depicted in the picture below.
optimizer_pseudocode
I'm using MNIST dataset.
function train(; kws...)
args = Args(; kws...) # collect options in a stuct for convinience
if CUDA.functional() && args.use_cuda
#info "Training on CUDA GPU"
CUDA.allwoscalar(false)
device = gpu
else
#info "Training on CPU"
device = cpu
end
# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)
# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)
# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters
best_param = ps
if args.optimiser == "SGD"
# Regular training step with SGD
elseif args.optimiser == "RSO"
# Run RSO function and update ps
best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end
And the corresponding RSO function:
function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""
# Normalize input data to have zero mean and unit standard deviation
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)
#println("model = $(typeof(model))")
std_prep = []
σ_d = Float64[]
D = 1
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
W = Flux.params(model)
# Weight update
for _ in 1:C
d = D
while d > 0
for id in 1:length(W[d])
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(w[d][id])
# Randomly sample mini-batch
(x, l) = train_loader[rand(1:length(train_loader))]
# Sample a weight from normal distribution
ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)
loss, acc = loss_and_accuracy(data_loader, model, device)
W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
end
end
d -= 1
end
end
return W
end
The problem here is the second block of the RSO function. I'm trying to evaluate the loss with the change of single weight in three scenarios, which are F(w, l, W+gW), F(w, l, W), F(w, l, W-gW), and choose the weight-set with minimum loss. But how do I do that using Flux.jl? The loss function I'm trying to use is logitcrossentropy(ŷ, y, agg=sum). In order to generate y_hat, we should use model(W), but changing single weight parameter in Zygote.Params() form was already challenging....

Based on the paper you shared, it looks like you need to change the weight arrays per each output neuron per each layer. Unfortunately, this means that the implementation of your optimization routine is going to depend on the layer type, since an "output neuron" for a convolution layer is quite different than a fully-connected layer. In other words, just looping over Flux.params(model) is not going to be sufficient, since this is just a set of all the weight arrays in the model and each weight array is treated differently depending on which layer it comes from.
Fortunately, Julia's multiple dispatch does make this easier to write if you use separate functions instead of a giant loop. I'll summarize the algorithm using the pseudo-code below:
for layer in model
for output_neuron in layer
for weight_element in parameters(output_neuron)
weight_element = sample(N(0, sqrt(2 / num_outputs(layer))))
end
end
sigmas[layer] = stddev(parameters(layer))
end
for c in 1 to C
for layer in reverse(model)
for output_neuron in layer
for weight_element in parameters(output_neuron)
x, y = sample(batches)
dw = N(0, sigmas[layer])
# optimize weights
end
end
end
end
It's the for output_neuron ... portions that we need to isolate into separate functions.
In the first block, we don't actually do anything different to every weight_element, they are all sampled from the same normal distribution. So, we don't actually need to iterate the output neurons, but we do need to know how many there are.
using Statistics: std
# this function will set the weights according to the
# normal distribution and the number of output neurons
# it also returns the standard deviation of the weights
function sample_weight!(layer::Dense)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 1)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
function sample_weight!(layer::Conv)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 4)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
sigmas = map(sample_weights!, model)
Now, for the second block, we will do a similar trick by defining different functions for each layer.
function optimize_layer!(loss, layer::Dense, data, sigma)
for i in 1:size(layer.weight, 1)
for j in 1:size(layer.weight, 2)
wj = layer.weight[i, j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
layer.weight[i, j] = w
losses[k] = loss(x, y)
end
layer.weight[i, j] = ws[argmin(losses)]
end
end
end
function optimize_layer!(loss, layer::Conv, data, sigma)
for i in 1:size(layer.weight, 4)
# we use a view to reference the full kernel
# for this output channel
wid = view(layer.weight, :, :, :, i)
# each index let's us treat wid like a vector
for j in eachindex(wid)
wj = wid[j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
wid[j] = w
losses[k] = loss(x, y)
end
wid[j] = ws[argmin(losses)]
end
end
end
for c in 1:C
for (layer, sigma) in reverse(zip(model, sigmas))
optimize_layer!(layer, data, sigma) do x, y
logitcrossentropy(model(x), y; agg = sum)
end
end
end
Notice that nowhere did I use Flux.params which does not help us here. Also, Flux.params would include both the weight and bias, and the paper doesn't look like it bothers with the bias at all. If you had an optimization method that generically optimized any parameter regardless of layer type the same (i.e. like gradient descent), then you could use for p in Flux.params(model) ....

Thanks #darsnack :)
I found your answer a bit late, so in the meantime I could figure out my own script that works. Mine is indeed a bit hardcoded but could you also give feedback on this?
function RSO(train_loader, test_loader, C,model, batch_size, device, args)
"""
model = convolutional model structure
C = Number of rounds to update parameters (epochs)
batch_size = size of the mini batch that will be used to calculate loss
device = CPU or GPU
"""
# Evaluate initial weight
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
println("Initial Weight:")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
random_batch = []
for (x, l) in train_loader
push!(random_batch, (x,l))
end
# Initialize weights
std_prep = []
σ_d = Float64[]
D = 0
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
if typeof(id) == Array{Float32, 4}
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), size(id)))
elseif typeof(id) == Vector{Float32}
wj = convert(Vector{Float32}, rand(Normal(0, sqrt(2/length(id))), length(id)))
elseif typeof(id) == Matrix{Float32}
wj = convert(Matrix{Float32}, rand(Normal(0, sqrt(2/length(id))), size(id)))
end
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
# Weight update
for c in 1:C
d = D
# First update the weights of the layer closest to the labels
# and then sequentially move closer to the input
while d > 0
Wd = Flux.params(model[d])
for id in Wd
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(id)
# Randomly sample mini-batch
(x, y) = rand(random_batch, 1)[1]
x, y = device(x), device(y)
# Sample a weight from normal distribution
ΔWj = rand(Normal(0, σ_d[d]), 1)[1]
# Weight update with three scenario
## F(x,l, W+ΔWj)
id[j] = id[j]+ΔWj
ŷ = model(x)
ls_pos = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l,W)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_org = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l, W-ΔWj)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_neg = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
# Check weight update that gives minimum loss
min_loss = argmin([ls_org, ls_pos, ls_neg])
# Save weight update with minimum loss
if min_loss == 1
id[j] = id[j] + ΔWj
elseif min_loss == 2
id[j] = id[j] + 2*ΔWj
elseif min_loss == 3
id[j] = id[j]
end
end
end
d -= 1
end
train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
track!(args.tracker, test_acc)
println("RSO Round=$c")
println(" train_loss = $train_loss, train_accuracy = $train_acc")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
end
return Flux.params(model)
end

Related

Large, exploding loss in Pytorch transformer model

I am trying to solve a sequence to sequence problem with a transformer model. The data is derived from a set of crossword puzzles.
The positional encoding and transformer classes are as follows:
class PositionalEncoding(nn.Module):
def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 3000):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
pe = torch.zeros(1, max_len, d_model)
pe[0, :, 0::2] = torch.sin(position * div_term)
pe[0, :, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def debug(self, x):
return x.shape, x.size()
def forward(self, x: Tensor) -> Tensor:
x = x + self.pe[:, :x.size(1), :]
return self.dropout(x)
class Transformer(nn.Module):
def __init__(
self,
num_tokens,
dim_model,
num_heads,
num_encoder_layers,
num_decoder_layers,
batch_first,
dropout_p,
):
super().__init__()
self.model_type = "Transformer"
self.dim_model = dim_model
self.positional_encoder = PositionalEncoding(
d_model=dim_model, dropout=dropout_p, max_len=3000
)
self.embedding = nn.Embedding.from_pretrained(vec_weights, freeze=False)#nn.Embedding(num_tokens, dim_model)
self.transformer = nn.Transformer(
d_model=dim_model,
nhead=num_heads,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
dropout=dropout_p,
batch_first = batch_first
)
self.out = nn.Linear(dim_model, num_tokens)
def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
src = self.embedding(src)*math.sqrt(self.dim_model)
tgt = self.embedding(tgt)*math.sqrt(self.dim_model)
src = self.positional_encoder(src)
tgt = self.positional_encoder(tgt)
transformer_out = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask)
out = self.out(transformer_out)
return out
def get_tgt_mask(self, size) -> torch.tensor:
mask = torch.tril(torch.ones(size, size) == 1)
mask = mask.float()
mask = mask.masked_fill(mask == 0, float('-inf'))
mask = mask.masked_fill(mask == 1, float(0.0))
return mask
def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
return (matrix == pad_token)
The input tensors are a source tensor of size N by S, where N is the batch size and S is the source sequence length, and a target tensor of size N by T, where T is the target sequence length. S is about 10 and T is about 5, while the total number of items is about 160,000-200,000, divided into batch sizes of 512. They are torch.IntTensors, with elements in the range from 0 to V, where V is the vocabulary length.
The first layer is an embedding layer that takes the input from N by S to N by S by E, where E is the embedding dimension (300), or to N by T by E in the case of the target. The second layer adds position encoding without changing the shape. Then both tensors are passed through the transformer layer, which outputs an N by T by E tensor. Finally, we pass this output through a linear layer, which produces an N by T by V output, where V is the size of the vocabulary used in the problem. Here V is about 56,697. The most frequent tokens (words) appear about 50-60 times in the target tensor.
The transformer class also contains the functions for implementing the masking matrices.
Then we create the model and run it (this process is wrapped in a function).
device = "cuda"
src_train, src_test = torch.utils.data.random_split(src_t, [int(0.9*len(src_t)), len(src_t)-int(0.9*len(src_t))])
src_train, src_test = src_train[:512], src_test[:512]
tgt_train, tgt_test = torch.utils.data.random_split(tgt_t, [int(0.9*len(tgt_t)), len(tgt_t)-int(0.9*len(tgt_t))])
tgt_train, tgt_test = tgt_train[:512], tgt_test[:512]
train_data, test_data = list(zip(src_train, tgt_train)), list(zip(src_test, tgt_test))
train, test = torch.utils.data.DataLoader(dataset=train_data), torch.utils.data.DataLoader(dataset=test_data)
model = Transformer(num_tokens=ntokens, dim_model=300, num_heads=2, num_encoder_layers=3, num_decoder_layers=3, batch_first = True, dropout_p=0.1).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0000001)
n_epochs = 50
def train_model(model, optimizer, loss_function, n_epochs):
loss_value=0
for epoch in range(n_epochs):
print(f"Starting epoch {epoch}")
for batch, data in enumerate(train):
x, y = data
if batch%100 == 0:
print(f"Batch is {batch}")
batch += 1
optimizer.zero_grad()
x, y = torch.tensor(x).to(device), torch.tensor(y).to(device)
y_input, y_base = y[:, :-1], y[:, 1:]
y_input, y_base = y_input.to(device), y_base.to(device)
tgt_mask = model.get_tgt_mask(y_input.shape[1]).to(device)
pad_token = vocabulary_table[embeddings.key_to_index["/"]]
src_pad_mask = model.create_pad_mask(x, pad_token).to(device)
tgt_pad_mask = model.create_pad_mask(y_input, pad_token).to(device)
z = model(x, y_input, tgt_mask, src_pad_mask, tgt_pad_mask)
z = z.permute(0, 2, 1).to(device)
y_base = y_base.long().to(device)
loss = loss_function(z, y_base).to(device)
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
optimizer.step()
loss_value += float(loss)
if batch%100 == 0:
print(f"For epoch {epoch}, batch {batch} the cross-entropy loss is {loss_value}")
#Free GPU memory.
del z
del x
del y
del y_input
del y_base
del loss
torch.cuda.empty_cache()
return model.parameters(), loss_value
Basically, we split the data into test and training sets and use an SGD optimizer and cross-entropy loss. We create a masking matrix for the padding for both the target and source tensors, and a masking matrix for unseen elements for the target tensor. We then do the usual gradient update steps. Right now, there is no validation loop, because I cannot even get the training loss to decrease.
The loss is very high, reaching more than 1000 after 100 batches. More concerningly, the loss also increases rapidly during training, rather than decreasing. In the code that I included, I tried clipping the gradients, lowering the learning rate, and using a much smaller sample to debug the code.
What could be causing this behavior?
You are only adding things to your loss, so naturally it can only increase.
loss_value += float(loss)
You're supposed to set it to zero after every epoch. Now you set it to zero once, in the beginning of the training process. There is a training loop template here if you're interested (https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html). This explains the increasing loss. To further troubleshoot (if needed) I'd throw in a validation loop.

Pytorch model stuck at 0.5 though loss decreases consistently

This is using PyTorch
I have been trying to implement UNet model on my images, however, my model accuracy is always exact 0.5. Loss does decrease.
I have also checked for class imbalance. I have also tried playing with learning rate. Learning rate affects loss but not the accuracy.
My architecture below ( from here )
""" `UNet` class is based on https://arxiv.org/abs/1505.04597
The U-Net is a convolutional encoder-decoder neural network.
Contextual spatial information (from the decoding,
expansive pathway) about an input tensor is merged with
information representing the localization of details
(from the encoding, compressive pathway).
Modifications to the original paper:
(1) padding is used in 3x3 convolutions to prevent loss
of border pixels
(2) merging outputs does not require cropping due to (1)
(3) residual connections can be used by specifying
UNet(merge_mode='add')
(4) if non-parametric upsampling is used in the decoder
pathway (specified by upmode='upsample'), then an
additional 1x1 2d convolution occurs after upsampling
to reduce channel dimensionality by a factor of 2.
This channel halving happens with the convolution in
the tranpose convolution (specified by upmode='transpose')
Arguments:
in_channels: int, number of channels in the input tensor.
Default is 3 for RGB images. Our SPARCS dataset is 13 channel.
depth: int, number of MaxPools in the U-Net. During training, input size needs to be
(depth-1) times divisible by 2
start_filts: int, number of convolutional filters for the first conv.
up_mode: string, type of upconvolution. Choices: 'transpose' for transpose convolution
"""
class UNet(nn.Module):
def __init__(self, num_classes, depth, in_channels, start_filts=16, up_mode='transpose', merge_mode='concat'):
super(UNet, self).__init__()
if up_mode in ('transpose', 'upsample'):
self.up_mode = up_mode
else:
raise ValueError("\"{}\" is not a valid mode for upsampling. Only \"transpose\" and \"upsample\" are allowed.".format(up_mode))
if merge_mode in ('concat', 'add'):
self.merge_mode = merge_mode
else:
raise ValueError("\"{}\" is not a valid mode for merging up and down paths.Only \"concat\" and \"add\" are allowed.".format(up_mode))
# NOTE: up_mode 'upsample' is incompatible with merge_mode 'add'
if self.up_mode == 'upsample' and self.merge_mode == 'add':
raise ValueError("up_mode \"upsample\" is incompatible with merge_mode \"add\" at the moment "
"because it doesn't make sense to use nearest neighbour to reduce depth channels (by half).")
self.num_classes = num_classes
self.in_channels = in_channels
self.start_filts = start_filts
self.depth = depth
self.down_convs = []
self.up_convs = []
# create the encoder pathway and add to a list
for i in range(depth):
ins = self.in_channels if i == 0 else outs
outs = self.start_filts*(2**i)
pooling = True if i < depth-1 else False
down_conv = DownConv(ins, outs, pooling=pooling)
self.down_convs.append(down_conv)
# create the decoder pathway and add to a list
# - careful! decoding only requires depth-1 blocks
for i in range(depth-1):
ins = outs
outs = ins // 2
up_conv = UpConv(ins, outs, up_mode=up_mode, merge_mode=merge_mode)
self.up_convs.append(up_conv)
self.conv_final = conv1x1(outs, self.num_classes)
# add the list of modules to current module
self.down_convs = nn.ModuleList(self.down_convs)
self.up_convs = nn.ModuleList(self.up_convs)
self.reset_params()
#staticmethod
def weight_init(m):
if isinstance(m, nn.Conv2d):
#https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/
##Doc: https://pytorch.org/docs/stable/nn.init.html?highlight=xavier#torch.nn.init.xavier_normal_
init.xavier_normal_(m.weight)
init.constant_(m.bias, 0)
def reset_params(self):
for i, m in enumerate(self.modules()):
self.weight_init(m)
def forward(self, x):
encoder_outs = []
# encoder pathway, save outputs for merging
for i, module in enumerate(self.down_convs):
x, before_pool = module(x)
encoder_outs.append(before_pool)
for i, module in enumerate(self.up_convs):
before_pool = encoder_outs[-(i+2)]
x = module(before_pool, x)
# No softmax is used. This means we need to use
# nn.CrossEntropyLoss is your training script,
# as this module includes a softmax already.
x = self.conv_final(x)
return x
Parameters are :
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x,y = train_sequence[0] ; batch_size = x.shape[0]
model = UNet(num_classes = 2, depth=5, in_channels=5, merge_mode='concat').to(device)
optim = torch.optim.Adam(model.parameters(),lr=0.01, weight_decay=1e-3)
criterion = nn.BCEWithLogitsLoss() #has sigmoid internally
epochs = 1000
The function for training is :
import torch.nn.functional as f
def train_model(epoch,train_sequence):
"""Train the model and report validation error with training error
Args:
model: the model to be trained
criterion: loss function
data_train (DataLoader): training dataset
"""
model.train()
for idx in range(len(train_sequence)):
X, y = train_sequence[idx]
images = Variable(torch.from_numpy(X)).to(device) # [batch, channel, H, W]
masks = Variable(torch.from_numpy(y)).to(device)
outputs = model(images)
print(masks.shape, outputs.shape)
loss = criterion(outputs, masks)
optim.zero_grad()
loss.backward()
# Update weights
optim.step()
# total_loss = get_loss_train(model, data_train, criterion)
My function for calculating loss and accuracy is below:
def get_loss_train(model, train_sequence):
"""
Calculate loss over train set
"""
model.eval()
total_acc = 0
total_loss = 0
for idx in range(len(train_sequence)):
with torch.no_grad():
X, y = train_sequence[idx]
images = Variable(torch.from_numpy(X)).to(device) # [batch, channel, H, W]
masks = Variable(torch.from_numpy(y)).to(device)
outputs = model(images)
loss = criterion(outputs, masks)
preds = torch.argmax(outputs, dim=1).float()
acc = accuracy_check_for_batch(masks.cpu(), preds.cpu(), images.size()[0])
total_acc = total_acc + acc
total_loss = total_loss + loss.cpu().item()
return total_acc/(len(train_sequence)), total_loss/(len(train_sequence))
Edit : Code which runs (calls) the functions:
for epoch in range(epochs):
train_model(epoch, train_sequence)
train_acc, train_loss = get_loss_train(model,train_sequence)
print("Train Acc:", train_acc)
print("Train loss:", train_loss)
Can someone help me identify as why is accuracy always exact 0.5?
Edit-2:
As asked accuracy_check_for_batch function is here:
def accuracy_check_for_batch(masks, predictions, batch_size):
total_acc = 0
for index in range(batch_size):
total_acc += accuracy_check(masks[index], predictions[index])
return total_acc/batch_size
and
def accuracy_check(mask, prediction):
ims = [mask, prediction]
np_ims = []
for item in ims:
if 'str' in str(type(item)):
item = np.array(Image.open(item))
elif 'PIL' in str(type(item)):
item = np.array(item)
elif 'torch' in str(type(item)):
item = item.numpy()
np_ims.append(item)
compare = np.equal(np_ims[0], np_ims[1])
accuracy = np.sum(compare)
return accuracy/len(np_ims[0].flatten())
I found the mistake.
model = UNet(num_classes = 2, depth=5, in_channels=5, merge_mode='concat').to(device)
should be
model = UNet(num_classes = 1, depth=5, in_channels=5, merge_mode='concat').to(device)
because I am using BCELosswithLogits.

ReLU activation function outputs HUGE numbers

I have FINALLY been able to implement backpropagation, but there are still some bugs I need to fix. The main is issue the following: My ReLU activation function produces really big dJdW values (derivative of error function wrt weights). When this gets subtracted from the weights, my output becomes a matrix of -int or inf. How do I stop this? As of now, the only solution I have is to make my learning rate scalar variable REALLY small.
import numpy as np
class Neural_Network(object):
def __init__(self, input_, hidden_, output_, numHiddenLayer_, numExamples_):
# Define Hyperparameters
self.inputLayerSize = input_
self.outputLayerSize = output_
self.hiddenLayerSize = hidden_
self.numHiddenLayer = numHiddenLayer_
self.numExamples = numExamples_
self.learningRate = 0.000000001 # LEARNING RATE: Why does ReLU produce such large dJdW values?
self.weightDecay = 0.5
# in -> out
self.weights = [] # stores matrices of each layer of weights
self.z = [] # stores matrices of each layer of weighted sums
self.a = [] # stores matrices of each layer of activity
self.biases = [] # stores all biases
# Biases are matrices that are added to activity matrix
# Dimensions -> numExamples_*hiddenLayerSize or numExamples_*outputLayerSize
for i in range(self.numHiddenLayer):
# Biases for hidden layer
b = [np.random.random() for x in range(self.hiddenLayerSize)];
B = [b for x in range(self.numExamples)];
self.biases.append(np.mat(B))
# Biases for output layer
b = [np.random.random() for x in range(self.outputLayerSize)]
B = [b for x in range(self.numExamples)];
self.biases.append(np.mat(B))
# Weights (Parameters)
# Weight matrix between input and first layer
W = np.random.rand(self.inputLayerSize, self.hiddenLayerSize)
self.weights.append(W)
for i in range(self.numHiddenLayer-1):
# Weight matrices between hidden layers
W = np.random.rand(self.hiddenLayerSize, self.hiddenLayerSize)
self.weights.append(W)
# Weight matric between hiddenlayer and outputlayer
self.weights.append(np.random.rand(self.hiddenLayerSize, self.outputLayerSize))
def setBatchSize(self, numExamples):
# Changes the number of rows (examples) for biases
if (self.numExamples > numExamples):
self.biases = [b[:numExamples] for b in self.biases]
def sigmoid(self, z):
# Apply sigmoid activation function
return 1/(1+np.exp(-z))
def sigmoidPrime(self, z):
# Derivative of sigmoid function
return self.sigmoid(x)*(1-self.sigmoid(z))
def ReLU(self, z):
# Apply activation function
'''
for (i, j), item in np.ndenumerate(z):
if (item < 0):
item *= 0.01
else:
item = item
return z'''
return np.multiply((z < 0), z * 0.01) + np.multiply((z >= 0), z)
def ReLUPrime(self, z):
# Derivative of ReLU activation function\
'''
for (i, j), item in np.ndenumerate(z):
if (item < 0):
item = 0.01
else:
item = 1
return z'''
return (z < 0) * 0.01 + (z >= 0) * 1
def forward(self, X):
# Propagate outputs through network
self.z = []
self.a = []
self.z.append(np.dot(X, self.weights[0]) + self.biases[0])
self.a.append(self.ReLU(self.z[0]))
#viewZ = self.z
#viewA = self.a
for i in range(1, self.numHiddenLayer):
self.z.append(np.dot(self.a[-1], self.weights[i]) + self.biases[i])
self.a.append(self.ReLU(self.z[-1]))
self.z.append(np.dot(self.z[-1], self.weights[-1]) + self.biases[-1])
self.a.append(self.ReLU(self.z[-1]))
yHat = self.ReLU(self.z[-1])
return yHat
def backProp(self, X, y):
# Compute derivative wrt W
# out -> in
dJdW = [] # stores matrices of each dJdW (equal in size to self.weights[])
delta = [] # stores matrices of each backpropagating error
self.yHat = self.forward(X)
# Quantifying Error
J = np.multiply((y-self.yHat),(y-self.yHat)) * 0.5
Javrg = np.dot(J.T, np.mat([1 for x in range(self.numExamples)]).reshape(self.numExamples, 1))
print(Javrg.item(0))
delta.insert(0,np.multiply(-(y-self.yHat), self.ReLUPrime(self.z[-1]))) # delta = (y-yHat)(sigmoidPrime(final layer unactivated))
dJdW.insert(0, np.dot(self.a[-2].T, delta[0]) + (self.weightDecay*self.weights[-1])) # dJdW
for i in range(len(self.weights)-1, 1, -1):
# Iterate from self.weights[-1] -> self.weights[1]
delta.insert(0, np.multiply(np.dot(delta[0], self.weights[i].T), self.ReLUPrime(self.z[i-1])))
dJdW.insert(0, np.dot(self.a[i-2].T, delta[0]) + (self.weightDecay*self.weights[i-1]))
delta.insert(0, np.multiply(np.dot(delta[0], self.weights[1].T), self.ReLUPrime(self.z[0])))
dJdW.insert(0, np.dot(X.T, delta[0]) + (self.weightDecay*self.weights[0]))
return dJdW
def train(self, X, y):
for t in range(60000):
dJdW = self.backProp(X, y)
for i in range(len(dJdW)):
self.weights[i] -= self.learningRate*dJdW[i]
# Instantiating Neural Network
inputs = [int(np.random.randint(0,1000)) for x in range(1000)]
x = np.mat([x for x in inputs]).reshape(1000,1)
y = np.mat([x+1 for x in inputs]).reshape(1000,1)
NN = Neural_Network(1,3,1,1,1000)
# Training
print("INPUT: ", end = '\n')
print(x, end = '\n\n')
print("BEFORE TRAINING", NN.forward(x), sep = '\n', end = '\n\n')
print("ERROR: ")
NN.train(x,y)
print("\nAFTER TRAINING", NN.forward(x), sep = '\n', end = '\n\n')
# Testing
test = np.mat([int(np.random.randint(0,10080)) for x in range(1000)]).reshape(1000,1)
print("TEST INPUT:", test, sep = '\n', end = '\n\n')
print(NN.forward(test), end = '\n\n')
NN.setBatchSize(1) # changing settings to receive one input at a time
while True:
# Give numbers between 0-100 (I need to fix overfitting) and it will get next value
inputs = input()
x = np.mat([int(i) for i in inputs.split(" ")])
print(NN.forward(x))
I first made the ANN using sigmoid but Leaky ReLU is faster.
The code is a bit much so here is a summary:
Neural Network Class
define hyperparameter and stuff (include really small learning rate scalar)
activation functions and their derivatives (ReLU and sigmoid)
Member functions: forward propagation, backpropagation, setBatchSize etc.
Instantiating ANN
setting hyperparameters (topology of ANN)
creating data (one array has values x and the output array has values x+1)
Training
using inputs generated in step 2 to train ANN
Testing
Testing using randomly generated inputs
User can give inputs
Hope that helps you help me. Thanks!
Your ReLU and ReLUPrime are wrong. When you iterate over a collection and mutate items it doesn't change the collection. Also: try to not explicitly iterate over arrays in numpy, but use vectorized operations, because they are way faster. It should be a good exercise to rewrite ReLU and its derivative in vectorized form. If you aren't sure what I mean, check out this answer.
Apart from that sigmoidPrime is wrong, it should be
self.sigmoid(z) * (1-self.sigmoid(z))
PS
This problem isn't really well suited for neural network, at least not for this encoding - I've tried it with exact hyperparameters with scikit-learn MLPRegressor and its output doesn't make much sense.

Neural Network MNIST: Backpropagation is correct, but training/test accuracy very low

I am building a neural network to learn to recognize handwritten digits from MNIST. I have confirmed that backpropagation calculates the gradients perfectly (gradient checking gives error < 10 ^ -10).
It appears that no matter how I train the weights, the cost function always tends towards around 3.24-3.25 (never below that, just approaching from above) and the training/test set accuracy is very low (around 11% for the test set). It appears that the h values in the end are all very close to 0.1 and to each other.
I cannot find why my program cannot produce better results. I was wondering if anyone could maybe take a look at my code and please tell me any reasons for this occurring. Thank you so much for all your help, I really appreciate it!
Here is my Python code:
import numpy as np
import math
from tensorflow.examples.tutorials.mnist import input_data
# Neural network has four layers
# The input layer has 784 nodes
# The two hidden layers each have 5 nodes
# The output layer has 10 nodes
num_layer = 4
num_node = [784,5,5,10]
num_output_node = 10
# 30000 training sets are used
# 10000 test sets are used
# Can be adjusted
Ntrain = 30000
Ntest = 10000
# Sigmoid Function
def g(X):
return 1/(1 + np.exp(-X))
# Forwardpropagation
def h(W,X):
a = X
for l in range(num_layer - 1):
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
return a
# Cost Function
def J(y, W, X, Lambda):
cost = 0
for i in range(Ntrain):
H = h(W,X[i])
for k in range(num_output_node):
cost = cost + y[i][k] * math.log(H[k]) + (1-y[i][k]) * math.log(1-H[k])
regularization = 0
for l in range(num_layer - 1):
for i in range(num_node[l]):
for j in range(num_node[l+1]):
regularization = regularization + W[l][i+1][j] ** 2
return (-1/Ntrain * cost + Lambda / (2*Ntrain) * regularization)
# Backpropagation - confirmed to be correct
# Algorithm based on https://www.coursera.org/learn/machine-learning/lecture/1z9WW/backpropagation-algorithm
# Returns D, the value of the gradient
def BackPropagation(y, W, X, Lambda):
delta = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
delta[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for i in range(Ntrain):
A = np.empty(num_layer-1, dtype = object)
a = X[i]
for l in range(num_layer - 1):
A[l] = a
a = np.insert(a,0,1)
z = np.dot(a,W[l])
a = g(z)
diff = a - y[i]
delta[num_layer-2] = delta[num_layer-2] + np.outer(np.insert(A[num_layer-2],0,1),diff)
for l in range(num_layer-2):
index = num_layer-2-l
diff = np.multiply(np.dot(np.array([W[index][k+1] for k in range(num_node[index])]), diff), np.multiply(A[index], 1-A[index]))
delta[index-1] = delta[index-1] + np.outer(np.insert(A[index-1],0,1),diff)
D = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
D[l] = np.zeros((num_node[l]+1,num_node[l+1]))
for l in range(num_layer-1):
for i in range(num_node[l]+1):
if i == 0:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * delta[l][i][j]
else:
for j in range(num_node[l+1]):
D[l][i][j] = 1/Ntrain * (delta[l][i][j] + Lambda * W[l][i][j])
return D
# Neural network - this is where the learning/adjusting of weights occur
# W is the weights
# learn is the learning rate
# iterations is the number of iterations we pass over the training set
# Lambda is the regularization parameter
def NeuralNetwork(y, X, learn, iterations, Lambda):
W = np.empty(num_layer-1, dtype = object)
for l in range(num_layer - 1):
W[l] = np.random.rand(num_node[l]+1,num_node[l+1])/100
for k in range(iterations):
print(J(y, W, X, Lambda))
D = BackPropagation(y, W, X, Lambda)
for l in range(num_layer-1):
W[l] = W[l] - learn * D[l]
print(J(y, W, X, Lambda))
return W
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Training data, read from MNIST
inputpix = []
output = []
for i in range(Ntrain):
inputpix.append(2 * np.array(mnist.train.images[i]) - 1)
output.append(np.array(mnist.train.labels[i]))
np.savetxt('input.txt', inputpix, delimiter=' ')
np.savetxt('output.txt', output, delimiter=' ')
# Train the weights
finalweights = NeuralNetwork(output, inputpix, 2, 5, 1)
# Test data
inputtestpix = []
outputtest = []
for i in range(Ntest):
inputtestpix.append(2 * np.array(mnist.test.images[i]) - 1)
outputtest.append(np.array(mnist.test.labels[i]))
np.savetxt('inputtest.txt', inputtestpix, delimiter=' ')
np.savetxt('outputtest.txt', outputtest, delimiter=' ')
# Determine the accuracy of the training data
count = 0
for i in range(Ntrain):
H = h(finalweights,inputpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and output[i][j] == 1:
count = count + 1
print(count/Ntrain)
# Determine the accuracy of the test data
count = 0
for i in range(Ntest):
H = h(finalweights,inputtestpix[i])
print(H)
for j in range(num_output_node):
if H[j] == np.amax(H) and outputtest[i][j] == 1:
count = count + 1
print(count/Ntest)
Your network is tiny, 5 neurons make it basically a linear model. Increase it to 256 per layer.
Notice, that trivial linear model has 768 * 10 + 10 (biases) parameters, adding up to 7690 floats. Your neural network on the other hand has 768 * 5 + 5 + 5 * 5 + 5 + 5 * 10 + 10 = 3845 + 30 + 60 = 3935. In other words despite being nonlinear neural network, it is actualy a simpler model than a trivial logistic regression applied to this problem. And logistic regression obtains around 11% error on its own, thus you cannot really expect to beat it. Of course this is not a strict argument, but should give you some intuition for why it should not work.
Second issue is related to other hyperparameters, you seem to be using:
huge learning rate (is it 2?) it should be more of order 0.0001
very little training iterations (are you just executing 5 epochs?)
your regularization parameter is huge (it is set to 1), so your network is heavily penalised for learning anything, again - change it to something order of magnitude smaller
The NN architecture is most likely under-fitting. Maybe, the learning rate is high/low. Or there are most issues with the regularization parameter.

How to explain high AUC-ROC with mediocre precision and recall in unbalanced data?

I have some machine learning results that I am trying to make sense of. The task is to predict/label "Irish" vs. "non-Irish". Python 2.7's output:
1= ir
0= non-ir
Class count:
0 4090942
1 940852
Name: ethnicity_scan, dtype: int64
Accuracy: 0.874921350119
Classification report:
precision recall f1-score support
0 0.89 0.96 0.93 2045610
1 0.74 0.51 0.60 470287
avg / total 0.87 0.87 0.87 2515897
Confusion matrix:
[[1961422 84188]
[ 230497 239790]]
AUC-ir= 0.901238104773
As you can see, the precision and recall are mediocre, but the AUC-ROC is higher (~0.90). And I am trying to figure out why, which I suspect is due to data imbalance (about 1:5). Based on the confusion matrix, and using Irish as the target (+), I calculated the TPR=0.51 and FPR=0.04. If I am considering non-Irish as (+), then TPR=0.96 and FPR=0.49. So how can I get a 0.9 AUC while the TPR can be only 0.5 at FPR=0.04?
Codes:
try:
for i in mass[k]:
df = df_temp # reset df before each loop
#$$
#$$
if 1==1:
###if i == singleEthnic:
count+=1
ethnicity_tar = str(i) # fr, en, ir, sc, others, ab, rus, ch, it, jp
# fn, metis, inuit; algonquian, iroquoian, athapaskan, wakashan, siouan, salish, tsimshian, kootenay
############################################
############################################
def ethnicity_target(row):
try:
if row[ethnicity_var] == ethnicity_tar:
return 1
else:
return 0
except: return None
df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
print '1=', ethnicity_tar
print '0=', 'non-'+ethnicity_tar
# Random sampling a smaller dataframe for debugging
rows = df.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
df = DataFrame(rows)
print 'Class count:'
print df['ethnicity_scan'].value_counts()
# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
try:
full_name = nameString
if len(full_name) > 1: # not accept name with only 1 character
return full_name
else: return '?'
except: return '?'
def feature_full_last_name(nameString):
try:
last_name = nameString.rsplit(None, 1)[-1]
if len(last_name) > 1: # not accept name with only 1 character
return last_name
else: return '?'
except: return '?'
def feature_full_first_name(nameString):
try:
first_name = nameString.rsplit(' ', 1)[0]
if len(first_name) > 1: # not accept name with only 1 character
return first_name
else: return '?'
except: return '?'
# Transform format of X variables, and spit out a numpy array for all features
my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]
all_dict = []
for i in range(0, len(my_dict)):
temp_dict = dict(
my_dict[i].items() + my_dict5[i].items()
)
all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)
# Fitting X and y into model, using training data
classifierUsed2.fit(X_train, y_train)
# Making predictions using trained data
y_train_predictions = classifierUsed2.predict(X_train)
y_test_predictions = classifierUsed2.predict(X_test)
Inserted codes for resampling:
try:
for i in mass[k]:
df = df_temp # reset df before each loop
#$$
#$$
if 1==1:
###if i == singleEthnic:
count+=1
ethnicity_tar = str(i) # fr, en, ir, sc, others, ab, rus, ch, it, jp
# fn, metis, inuit; algonquian, iroquoian, athapaskan, wakashan, siouan, salish, tsimshian, kootenay
############################################
############################################
def ethnicity_target(row):
try:
if row[ethnicity_var] == ethnicity_tar:
return 1
else:
return 0
except: return None
df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
print '1=', ethnicity_tar
print '0=', 'non-'+ethnicity_tar
# Resampled
df_resampled = df.append(df[df.ethnicity_scan==0].sample(len(df)*5, replace=True))
# Random sampling a smaller dataframe for debugging
rows = df_resampled.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
df = DataFrame(rows)
print 'Class count:'
print df['ethnicity_scan'].value_counts()
# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
try:
full_name = nameString
if len(full_name) > 1: # not accept name with only 1 character
return full_name
else: return '?'
except: return '?'
def feature_full_last_name(nameString):
try:
last_name = nameString.rsplit(None, 1)[-1]
if len(last_name) > 1: # not accept name with only 1 character
return last_name
else: return '?'
except: return '?'
def feature_full_first_name(nameString):
try:
first_name = nameString.rsplit(' ', 1)[0]
if len(first_name) > 1: # not accept name with only 1 character
return first_name
else: return '?'
except: return '?'
# Transform format of X variables, and spit out a numpy array for all features
my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]
all_dict = []
for i in range(0, len(my_dict)):
temp_dict = dict(
my_dict[i].items() + my_dict5[i].items()
)
all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)
# Fitting X and y into model, using training data
classifierUsed2.fit(X_train, y_train)
# Making predictions using trained data
y_train_predictions = classifierUsed2.predict(X_train)
y_test_predictions = classifierUsed2.predict(X_test)
Your model outputs a probability P (between 0 and 1) for each row in the test set that it scores. The summary stats (precision, recall, etc) are for a single value of P as a prediction threshold, probably P=0.5, unless you've changed this in your code. However the ROC contains more information, the idea is that you probably won't want to use this default value as your prediction threshold, so the ROC is plotted by calculating the ratio of true positives to false positives, across every prediction threshold betwen 0 and 1.
If you've undersampled your non-Irish people in the data, then you're correct that the AUC and precision will be overestimated; if your dataset is only 5000 rows, then you will have no problem running your model on a larger training set; just rebalance your dataset (by bootstrap sampling to increase your non-Irish people) until your accurately reflect your sample population.

Resources