Loss Not Converging Caffe Regression - machine-learning

I'm doing regression in Caffe. The dataset is 400 RGB images of size 128x128, and the labels are floats in the range (-1, 1).
The only transformation I applied to the dataset was normalization (I divided every RGB pixel value by 255), but the loss doesn't seem to converge at all.
What might be the possible reason for this? Can anyone please advise?
Here is my training log:
Training..
Using solver: solver_hdf5.prototxt
I0929 21:50:21.657784 13779 caffe.cpp:112] Use CPU.
I0929 21:50:21.658033 13779 caffe.cpp:174] Starting Optimization
I0929 21:50:21.658107 13779 solver.cpp:34] Initializing solver from parameters:
test_iter: 100
test_interval: 500
base_lr: 0.0001
display: 25
max_iter: 10000
lr_policy: "inv"
gamma: 0.0001
power: 0.75
momentum: 0.9
weight_decay: 0.0005
snapshot: 5000
snapshot_prefix: "lenet_hdf5"
solver_mode: CPU
net: "train_test_hdf5.prototxt"
I0929 21:50:21.658143 13779 solver.cpp:75] Creating training net from net file: train_test_hdf5.prototxt
I0929 21:50:21.658567 13779 net.cpp:334] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
I0929 21:50:21.658709 13779 net.cpp:46] Initializing net from parameters:
name: "MSE regression"
state {
phase: TRAIN
}
layer {
name: "data"
type: "HDF5Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
hdf5_data_param {
source: "train_hdf5file.txt"
batch_size: 64
shuffle: true
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "dropout1"
type: "Dropout"
bottom: "pool1"
top: "pool1"
dropout_param {
dropout_ratio: 0.1
}
}
layer {
name: "fc1"
type: "InnerProduct"
bottom: "pool1"
top: "fc1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "dropout2"
type: "Dropout"
bottom: "fc1"
top: "fc1"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc2"
type: "InnerProduct"
bottom: "fc1"
top: "fc2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "EuclideanLoss"
bottom: "fc2"
bottom: "label"
top: "loss"
}
I0929 21:50:21.658833 13779 layer_factory.hpp:74] Creating layer data
I0929 21:50:21.658859 13779 net.cpp:96] Creating Layer data
I0929 21:50:21.658871 13779 net.cpp:415] data -> data
I0929 21:50:21.658902 13779 net.cpp:415] data -> label
I0929 21:50:21.658926 13779 net.cpp:160] Setting up data
I0929 21:50:21.658936 13779 hdf5_data_layer.cpp:80] Loading list of HDF5 filenames from: train_hdf5file.txt
I0929 21:50:21.659220 13779 hdf5_data_layer.cpp:94] Number of HDF5 files: 1
I0929 21:50:21.920578 13779 net.cpp:167] Top shape: 64 3 128 128 (3145728)
I0929 21:50:21.920656 13779 net.cpp:167] Top shape: 64 1 (64)
I0929 21:50:21.920686 13779 layer_factory.hpp:74] Creating layer conv1
I0929 21:50:21.920740 13779 net.cpp:96] Creating Layer conv1
I0929 21:50:21.920774 13779 net.cpp:459] conv1 <- data
I0929 21:50:21.920825 13779 net.cpp:415] conv1 -> conv1
I0929 21:50:21.920877 13779 net.cpp:160] Setting up conv1
I0929 21:50:21.921985 13779 net.cpp:167] Top shape: 64 20 124 124 (19681280)
I0929 21:50:21.922050 13779 layer_factory.hpp:74] Creating layer relu1
I0929 21:50:21.922085 13779 net.cpp:96] Creating Layer relu1
I0929 21:50:21.922108 13779 net.cpp:459] relu1 <- conv1
I0929 21:50:21.922137 13779 net.cpp:404] relu1 -> conv1 (in-place)
I0929 21:50:21.922185 13779 net.cpp:160] Setting up relu1
I0929 21:50:21.922227 13779 net.cpp:167] Top shape: 64 20 124 124 (19681280)
I0929 21:50:21.922250 13779 layer_factory.hpp:74] Creating layer pool1
I0929 21:50:21.922277 13779 net.cpp:96] Creating Layer pool1
I0929 21:50:21.922298 13779 net.cpp:459] pool1 <- conv1
I0929 21:50:21.922323 13779 net.cpp:415] pool1 -> pool1
I0929 21:50:21.922418 13779 net.cpp:160] Setting up pool1
I0929 21:50:21.922472 13779 net.cpp:167] Top shape: 64 20 62 62 (4920320)
I0929 21:50:21.922495 13779 layer_factory.hpp:74] Creating layer dropout1
I0929 21:50:21.922534 13779 net.cpp:96] Creating Layer dropout1
I0929 21:50:21.922555 13779 net.cpp:459] dropout1 <- pool1
I0929 21:50:21.922582 13779 net.cpp:404] dropout1 -> pool1 (in-place)
I0929 21:50:21.922613 13779 net.cpp:160] Setting up dropout1
I0929 21:50:21.922652 13779 net.cpp:167] Top shape: 64 20 62 62 (4920320)
I0929 21:50:21.922672 13779 layer_factory.hpp:74] Creating layer fc1
I0929 21:50:21.922709 13779 net.cpp:96] Creating Layer fc1
I0929 21:50:21.922729 13779 net.cpp:459] fc1 <- pool1
I0929 21:50:21.922757 13779 net.cpp:415] fc1 -> fc1
I0929 21:50:21.922801 13779 net.cpp:160] Setting up fc1
I0929 21:50:22.301134 13779 net.cpp:167] Top shape: 64 500 (32000)
I0929 21:50:22.301193 13779 layer_factory.hpp:74] Creating layer dropout2
I0929 21:50:22.301210 13779 net.cpp:96] Creating Layer dropout2
I0929 21:50:22.301218 13779 net.cpp:459] dropout2 <- fc1
I0929 21:50:22.301232 13779 net.cpp:404] dropout2 -> fc1 (in-place)
I0929 21:50:22.301244 13779 net.cpp:160] Setting up dropout2
I0929 21:50:22.301254 13779 net.cpp:167] Top shape: 64 500 (32000)
I0929 21:50:22.301259 13779 layer_factory.hpp:74] Creating layer fc2
I0929 21:50:22.301270 13779 net.cpp:96] Creating Layer fc2
I0929 21:50:22.301275 13779 net.cpp:459] fc2 <- fc1
I0929 21:50:22.301285 13779 net.cpp:415] fc2 -> fc2
I0929 21:50:22.301295 13779 net.cpp:160] Setting up fc2
I0929 21:50:22.301317 13779 net.cpp:167] Top shape: 64 1 (64)
I0929 21:50:22.301328 13779 layer_factory.hpp:74] Creating layer loss
I0929 21:50:22.301338 13779 net.cpp:96] Creating Layer loss
I0929 21:50:22.301343 13779 net.cpp:459] loss <- fc2
I0929 21:50:22.301350 13779 net.cpp:459] loss <- label
I0929 21:50:22.301360 13779 net.cpp:415] loss -> loss
I0929 21:50:22.301374 13779 net.cpp:160] Setting up loss
I0929 21:50:22.301385 13779 net.cpp:167] Top shape: (1)
I0929 21:50:22.301391 13779 net.cpp:169] with loss weight 1
I0929 21:50:22.301419 13779 net.cpp:239] loss needs backward computation.
I0929 21:50:22.301425 13779 net.cpp:239] fc2 needs backward computation.
I0929 21:50:22.301430 13779 net.cpp:239] dropout2 needs backward computation.
I0929 21:50:22.301436 13779 net.cpp:239] fc1 needs backward computation.
I0929 21:50:22.301441 13779 net.cpp:239] dropout1 needs backward computation.
I0929 21:50:22.301446 13779 net.cpp:239] pool1 needs backward computation.
I0929 21:50:22.301452 13779 net.cpp:239] relu1 needs backward computation.
I0929 21:50:22.301457 13779 net.cpp:239] conv1 needs backward computation.
I0929 21:50:22.301463 13779 net.cpp:241] data does not need backward computation.
I0929 21:50:22.301468 13779 net.cpp:282] This network produces output loss
I0929 21:50:22.301482 13779 net.cpp:531] Collecting Learning Rate and Weight Decay.
I0929 21:50:22.301491 13779 net.cpp:294] Network initialization done.
I0929 21:50:22.301496 13779 net.cpp:295] Memory required for data: 209652228
I0929 21:50:22.301908 13779 solver.cpp:159] Creating test net (#0) specified by net file: train_test_hdf5.prototxt
I0929 21:50:22.301935 13779 net.cpp:334] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
I0929 21:50:22.302028 13779 net.cpp:46] Initializing net from parameters:
name: "MSE regression"
state {
phase: TEST
}
layer {
name: "data"
type: "HDF5Data"
top: "data"
top: "label"
include {
phase: TEST
}
hdf5_data_param {
source: "test_hdf5file.txt"
batch_size: 30
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "dropout1"
type: "Dropout"
bottom: "pool1"
top: "pool1"
dropout_param {
dropout_ratio: 0.1
}
}
layer {
name: "fc1"
type: "InnerProduct"
bottom: "pool1"
top: "fc1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "dropout2"
type: "Dropout"
bottom: "fc1"
top: "fc1"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc2"
type: "InnerProduct"
bottom: "fc1"
top: "fc2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "EuclideanLoss"
bottom: "fc2"
bottom: "label"
top: "loss"
}
I0929 21:50:22.302146 13779 layer_factory.hpp:74] Creating layer data
I0929 21:50:22.302158 13779 net.cpp:96] Creating Layer data
I0929 21:50:22.302165 13779 net.cpp:415] data -> data
I0929 21:50:22.302176 13779 net.cpp:415] data -> label
I0929 21:50:22.302186 13779 net.cpp:160] Setting up data
I0929 21:50:22.302191 13779 hdf5_data_layer.cpp:80] Loading list of HDF5 filenames from: test_hdf5file.txt
I0929 21:50:22.302305 13779 hdf5_data_layer.cpp:94] Number of HDF5 files: 1
I0929 21:50:22.434798 13779 net.cpp:167] Top shape: 30 3 128 128 (1474560)
I0929 21:50:22.434849 13779 net.cpp:167] Top shape: 30 1 (30)
I0929 21:50:22.434864 13779 layer_factory.hpp:74] Creating layer conv1
I0929 21:50:22.434895 13779 net.cpp:96] Creating Layer conv1
I0929 21:50:22.434914 13779 net.cpp:459] conv1 <- data
I0929 21:50:22.434944 13779 net.cpp:415] conv1 -> conv1
I0929 21:50:22.434996 13779 net.cpp:160] Setting up conv1
I0929 21:50:22.435084 13779 net.cpp:167] Top shape: 30 20 124 124 (9225600)
I0929 21:50:22.435119 13779 layer_factory.hpp:74] Creating layer relu1
I0929 21:50:22.435205 13779 net.cpp:96] Creating Layer relu1
I0929 21:50:22.435237 13779 net.cpp:459] relu1 <- conv1
I0929 21:50:22.435292 13779 net.cpp:404] relu1 -> conv1 (in-place)
I0929 21:50:22.435328 13779 net.cpp:160] Setting up relu1
I0929 21:50:22.435371 13779 net.cpp:167] Top shape: 30 20 124 124 (9225600)
I0929 21:50:22.435400 13779 layer_factory.hpp:74] Creating layer pool1
I0929 21:50:22.435443 13779 net.cpp:96] Creating Layer pool1
I0929 21:50:22.435470 13779 net.cpp:459] pool1 <- conv1
I0929 21:50:22.435511 13779 net.cpp:415] pool1 -> pool1
I0929 21:50:22.435550 13779 net.cpp:160] Setting up pool1
I0929 21:50:22.435597 13779 net.cpp:167] Top shape: 30 20 62 62 (2306400)
I0929 21:50:22.435626 13779 layer_factory.hpp:74] Creating layer dropout1
I0929 21:50:22.435669 13779 net.cpp:96] Creating Layer dropout1
I0929 21:50:22.435698 13779 net.cpp:459] dropout1 <- pool1
I0929 21:50:22.435739 13779 net.cpp:404] dropout1 -> pool1 (in-place)
I0929 21:50:22.435780 13779 net.cpp:160] Setting up dropout1
I0929 21:50:22.435823 13779 net.cpp:167] Top shape: 30 20 62 62 (2306400)
I0929 21:50:22.435853 13779 layer_factory.hpp:74] Creating layer fc1
I0929 21:50:22.435899 13779 net.cpp:96] Creating Layer fc1
I0929 21:50:22.435926 13779 net.cpp:459] fc1 <- pool1
I0929 21:50:22.435971 13779 net.cpp:415] fc1 -> fc1
I0929 21:50:22.436018 13779 net.cpp:160] Setting up fc1
I0929 21:50:22.816076 13779 net.cpp:167] Top shape: 30 500 (15000)
I0929 21:50:22.816138 13779 layer_factory.hpp:74] Creating layer dropout2
I0929 21:50:22.816154 13779 net.cpp:96] Creating Layer dropout2
I0929 21:50:22.816160 13779 net.cpp:459] dropout2 <- fc1
I0929 21:50:22.816170 13779 net.cpp:404] dropout2 -> fc1 (in-place)
I0929 21:50:22.816182 13779 net.cpp:160] Setting up dropout2
I0929 21:50:22.816192 13779 net.cpp:167] Top shape: 30 500 (15000)
I0929 21:50:22.816197 13779 layer_factory.hpp:74] Creating layer fc2
I0929 21:50:22.816208 13779 net.cpp:96] Creating Layer fc2
I0929 21:50:22.816249 13779 net.cpp:459] fc2 <- fc1
I0929 21:50:22.816262 13779 net.cpp:415] fc2 -> fc2
I0929 21:50:22.816277 13779 net.cpp:160] Setting up fc2
I0929 21:50:22.816301 13779 net.cpp:167] Top shape: 30 1 (30)
I0929 21:50:22.816316 13779 layer_factory.hpp:74] Creating layer loss
I0929 21:50:22.816329 13779 net.cpp:96] Creating Layer loss
I0929 21:50:22.816337 13779 net.cpp:459] loss <- fc2
I0929 21:50:22.816347 13779 net.cpp:459] loss <- label
I0929 21:50:22.816359 13779 net.cpp:415] loss -> loss
I0929 21:50:22.816370 13779 net.cpp:160] Setting up loss
I0929 21:50:22.816381 13779 net.cpp:167] Top shape: (1)
I0929 21:50:22.816388 13779 net.cpp:169] with loss weight 1
I0929 21:50:22.816407 13779 net.cpp:239] loss needs backward computation.
I0929 21:50:22.816416 13779 net.cpp:239] fc2 needs backward computation.
I0929 21:50:22.816426 13779 net.cpp:239] dropout2 needs backward computation.
I0929 21:50:22.816433 13779 net.cpp:239] fc1 needs backward computation.
I0929 21:50:22.816442 13779 net.cpp:239] dropout1 needs backward computation.
I0929 21:50:22.816452 13779 net.cpp:239] pool1 needs backward computation.
I0929 21:50:22.816460 13779 net.cpp:239] relu1 needs backward computation.
I0929 21:50:22.816468 13779 net.cpp:239] conv1 needs backward computation.
I0929 21:50:22.816478 13779 net.cpp:241] data does not need backward computation.
I0929 21:50:22.816486 13779 net.cpp:282] This network produces output loss
I0929 21:50:22.816500 13779 net.cpp:531] Collecting Learning Rate and Weight Decay.
I0929 21:50:22.816510 13779 net.cpp:294] Network initialization done.
I0929 21:50:22.816517 13779 net.cpp:295] Memory required for data: 98274484
I0929 21:50:22.816565 13779 solver.cpp:47] Solver scaffolding done.
I0929 21:50:22.816587 13779 solver.cpp:363] Solving MSE regression
I0929 21:50:22.816596 13779 solver.cpp:364] Learning Rate Policy: inv
I0929 21:50:22.870337 13779 solver.cpp:424] Iteration 0, Testing net (#0)
Update (after lejlot's reply):
Training Images After Changing My Data:

It seems to be learning: the loss goes down. However, there is clearly something wrong with your data. Before any learning (iteration 0) you already have a loss of 0.0006, which is an extremely small loss for a random model, so your data looks very odd. Look at your dependent values: are they really nicely distributed between -1 and 1, or is it more like 99% of them are "0" and just a few other values?

There is nothing wrong with the approach itself; you simply need to do more analysis of your data. Make sure it actually spans the [-1, 1] interval nicely. Once you fix that, there will be plenty of smaller things to play around with, but this is the biggest issue right now: you get far too small an error with a random model, so the problem is the data, not the algorithm/method/parameters. To make things go faster, you can also increase the learning rate from the 0.0001 you are using right now, but as said before, first fix the data.
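One quick way to check this (a minimal sketch, not from the original post, assuming the HDF5 file listed in train_hdf5file.txt stores the targets in a dataset named "label"; the file name train.h5 is hypothetical) is to histogram the labels in Python:
import h5py
import numpy as np

with h5py.File("train.h5", "r") as f:
    labels = np.array(f["label"]).ravel()

print("min/max:", labels.min(), labels.max())
print("mean/std:", labels.mean(), labels.std())

# Histogram over [-1, 1]: a healthy target spreads across the bins,
# while a spike in one bin would explain a near-zero loss for a random model.
hist, edges = np.histogram(labels, bins=20, range=(-1.0, 1.0))
for count, lo, hi in zip(hist, edges[:-1], edges[1:]):
    print("[%+.2f, %+.2f): %d" % (lo, hi, count))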

Related

Does Caffe net surgery require fine-tuning?

I am new to Caffe and I want to use the already trained CaffeNet model with ImageNet. I applied net surgery by removing an intermediate convolutional layer, conv4.
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "relu5-new"
type: "ReLU"
bottom: "conv5-new"
top: "conv5-new"
}
layer {
name: "pool5-new"
type: "Pooling"
bottom: "conv5-new"
top: "pool5-new"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5-new"
top: "fc6"
inner_product_param {
num_output: 4096
}
}
The full prototxt file can be found here.
After saving this new network, the accuracy became 0. Should I fine-tune on the ImageNet validation set, or is there something wrong with my new prototxt file?
Any help will be appreciated!
The original net you started with had conv4 between conv3 and conv5: this means the filters (weights) of conv5 were expecting a certain number of inputs with a certain "order" or "meaning". Once you removed conv4, you had to alter conv5 to accept a different number of inputs, so the new conv5 layer must be trained to accommodate the new inputs it receives.
In this case, since you introduced a new conv5 layer, you should have a weight_filler defined in your prototxt to tell Caffe how to initialize the new weights. Otherwise Caffe will set the weights to zero, and it will be almost impossible to fine-tune in that case.
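For reference, a rough pycaffe sketch (with assumed file names, in the spirit of Caffe's net-surgery example rather than the asker's exact setup) of transplanting the surviving weights into the new architecture, leaving conv5-new to its weight_filler before fine-tuning:
import caffe

old_net = caffe.Net("deploy_caffenet.prototxt",           # original architecture
                    "bvlc_reference_caffenet.caffemodel", # pretrained weights
                    caffe.TEST)
new_net = caffe.Net("deploy_surgery.prototxt", caffe.TEST)  # conv4 removed, conv5-new added

# Copy parameters for every layer whose name and blob shapes are unchanged;
# anything else (e.g. conv5-new) keeps its weight_filler initialization.
for name in new_net.params:
    if name in old_net.params and all(
            o.data.shape == n.data.shape
            for o, n in zip(old_net.params[name], new_net.params[name])):
        for o, n in zip(old_net.params[name], new_net.params[name]):
            n.data[...] = o.data
    else:
        print("left to weight_filler:", name)

new_net.save("caffenet_surgery_init.caffemodel")  # fine-tune starting from this snapshot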

How to prevent weight update in caffe

Some layers of my net are initialized from a pretrained model. I want to fix their parameters and train the other layers.
I followed this page and set lr_mult and decay_mult to 0, propagate_down: false, and even base_lr: 0 and weight_decay: 0 in the solver. However, the test loss (using all test images for each test) still changes slowly at each iteration. After thousands of iterations the accuracy goes to 0 (from 80% when the pretrained model was loaded).
Here is a two-layer example in which I just initialize the weights and set the above parameters to 0. I want to freeze all layers in this example, but when training starts the loss keeps changing...
layer {
name: "data"
type: "ImageData"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
scale: 0.017
mirror: true
crop_size: 32
mean_value: 115
mean_value: 126
mean_value: 130
color: true
contrast: true
brightness: true
}
image_data_param {
source: "/data/zhuhao5/data/cifar100/cifar100_train_replicate.txt"
batch_size: 64
shuffle: true
#pair_size: 3
}
}
layer {
name: "data"
type: "ImageData"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
scale: 0.017
mirror: false
crop_size: 32
mean_value: 115
mean_value: 126
mean_value: 130
}
image_data_param {
source: "/data/zhuhao5/data/cifar100/cifar100_test.txt"
batch_size: 100
shuffle: false
}
}
#-------------- TEACHER --------------------
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
propagate_down: false
top: "conv1"
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 16
bias_term: false
pad: 1
kernel_size: 3
stride: 1
weight_filler {
type: "msra"
}
}
}
layer {
name: "res2_1a_1_bn"
type: "BatchNorm"
bottom: "conv1"
propagate_down: false
top: "res2_1a_1_bn"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
}
layer {
name: "res2_1a_1_scale"
type: "Scale"
bottom: "res2_1a_1_bn"
propagate_down: false
top: "res2_1a_1_bn"
param {
lr_mult: 0
decay_mult: 0
}
scale_param {
bias_term: true
}
}
layer {
name: "res2_1a_1_relu"
type: "ReLU"
bottom: "res2_1a_1_bn"
propagate_down: false
top: "res2_1a_1_bn"
}
layer {
name: "pool_5"
type: "Pooling"
bottom: "res2_1a_1_bn"
propagate_down: false
top: "pool_5"
pooling_param {
pool: AVE
global_pooling: true
}
}
layer {
name: "fc100"
type: "InnerProduct"
bottom: "pool_5"
propagate_down: false
top: "fc100"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
inner_product_param {
num_output: 100
weight_filler {
type: "msra"
}
bias_filler {
type: "constant"
value: 0
}
}
}
#---------------------------------
layer {
name: "tea_soft_loss"
type: "SoftmaxWithLoss"
bottom: "fc100"
bottom: "label"
propagate_down: false
propagate_down: false
top: "tea_soft_loss"
loss_weight: 0
}
##----------- ACCURACY----------------
layer {
name: "teacher_accuracy"
type: "Accuracy"
bottom: "fc100"
bottom: "label"
top: "teacher_accuracy"
accuracy_param {
top_k: 1
}
}
Here is the solver:
test_iter: 100
test_interval: 10
base_lr: 0
momentum: 0
weight_decay: 0
lr_policy: "poly"
power: 1
display: 10000
max_iter: 80000
snapshot: 5000
type: "SGD"
solver_mode: GPU
random_seed: 10086
And the log:
I0829 16:31:39.363433 14986 net.cpp:200] teacher_accuracy does not need backward computation.
I0829 16:31:39.363438 14986 net.cpp:200] tea_soft_loss does not need backward computation.
I0829 16:31:39.363442 14986 net.cpp:200] fc100_fc100_0_split does not need backward computation.
I0829 16:31:39.363446 14986 net.cpp:200] fc100 does not need backward computation.
I0829 16:31:39.363451 14986 net.cpp:200] pool_5 does not need backward computation.
I0829 16:31:39.363454 14986 net.cpp:200] res2_1a_1_relu does not need backward computation.
I0829 16:31:39.363458 14986 net.cpp:200] res2_1a_1_scale does not need backward computation.
I0829 16:31:39.363462 14986 net.cpp:200] res2_1a_1_bn does not need backward computation.
I0829 16:31:39.363466 14986 net.cpp:200] conv1 does not need backward computation.
I0829 16:31:39.363471 14986 net.cpp:200] label_data_1_split does not need backward computation.
I0829 16:31:39.363485 14986 net.cpp:200] data does not need backward computation.
I0829 16:31:39.363490 14986 net.cpp:242] This network produces output tea_soft_loss
I0829 16:31:39.363494 14986 net.cpp:242] This network produces output teacher_accuracy
I0829 16:31:39.363507 14986 net.cpp:255] Network initialization done.
I0829 16:31:39.363559 14986 solver.cpp:56] Solver scaffolding done.
I0829 16:31:39.363852 14986 caffe.cpp:248] Starting Optimization
I0829 16:31:39.363862 14986 solver.cpp:272] Solving WRN_22_12_to_WRN_18_4_v5_net
I0829 16:31:39.363865 14986 solver.cpp:273] Learning Rate Policy: poly
I0829 16:31:39.365981 14986 solver.cpp:330] Iteration 0, Testing net (#0)
I0829 16:31:39.366190 14986 blocking_queue.cpp:49] Waiting for data
I0829 16:31:39.742347 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 85.9064
I0829 16:31:39.742437 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0113
I0829 16:31:39.749806 14986 solver.cpp:218] Iteration 0 (0 iter/s, 0.385886s/10000 iters), loss = 0
I0829 16:31:39.749862 14986 solver.cpp:237] Train net output #0: tea_soft_loss = 4.97483
I0829 16:31:39.749877 14986 solver.cpp:237] Train net output #1: teacher_accuracy = 0
I0829 16:31:39.749908 14986 sgd_solver.cpp:105] Iteration 0, lr = 0
I0829 16:31:39.794306 14986 solver.cpp:330] Iteration 10, Testing net (#0)
I0829 16:31:40.171447 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.9119
I0829 16:31:40.171510 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0115
I0829 16:31:40.219133 14986 solver.cpp:330] Iteration 20, Testing net (#0)
I0829 16:31:40.596911 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91862
I0829 16:31:40.596971 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0116
I0829 16:31:40.645246 14986 solver.cpp:330] Iteration 30, Testing net (#0)
I0829 16:31:41.021711 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.92105
I0829 16:31:41.021772 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:41.069464 14986 solver.cpp:330] Iteration 40, Testing net (#0)
I0829 16:31:41.447345 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91916
I0829 16:31:41.447407 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:41.495157 14986 solver.cpp:330] Iteration 50, Testing net (#0)
I0829 16:31:41.905607 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.9208
I0829 16:31:41.905654 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:41.952659 14986 solver.cpp:330] Iteration 60, Testing net (#0)
I0829 16:31:42.327942 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91936
I0829 16:31:42.328025 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:42.374279 14986 solver.cpp:330] Iteration 70, Testing net (#0)
I0829 16:31:42.761359 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91859
I0829 16:31:42.761430 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:42.807821 14986 solver.cpp:330] Iteration 80, Testing net (#0)
I0829 16:31:43.232321 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91668
I0829 16:31:43.232398 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:43.266436 14986 solver.cpp:330] Iteration 90, Testing net (#0)
I0829 16:31:43.514633 14986 blocking_queue.cpp:49] Waiting for data
I0829 16:31:43.638617 14986 solver.cpp:397] Test net output #0: tea_soft_loss = 4.91836
I0829 16:31:43.638684 14986 solver.cpp:397] Test net output #1: teacher_accuracy = 0.0117
I0829 16:31:43.685451 14986 solver.cpp:330] Iteration 100, Testing net (#0)
I wonder what I missed in the updating process in caffe :(
Found the reason.
The BatchNorm layer uses different use_global_stats settings in the TRAIN and TEST phases.
In my case, I should set use_global_stats: true during training as well.
And don't forget the Scale layer.
The revised layers should be as follows (a small sanity check follows the snippet):
layer {
name: "res2_1a_1_bn"
type: "BatchNorm"
bottom: "conv1"
top: "res2_1a_1_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "res2_1a_1_scale"
type: "Scale"
bottom: "res2_1a_1_bn"
top: "res2_1a_1_bn"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
scale_param {
bias_term: true
}
}
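To confirm the frozen layers really stop updating, one option (a sketch with assumed prototxt and snapshot names, not part of the original answer) is to compare two training snapshots in pycaffe:
import numpy as np
import caffe

net_a = caffe.Net("train_val.prototxt", "snapshot_iter_5000.caffemodel", caffe.TEST)
net_b = caffe.Net("train_val.prototxt", "snapshot_iter_10000.caffemodel", caffe.TEST)

# For every parameterized layer, report the largest absolute change between
# the two snapshots; properly frozen layers should report exactly 0.
for name in net_a.params:
    delta = max(np.abs(a.data - b.data).max()
                for a, b in zip(net_a.params[name], net_b.params[name]))
    print("%s: max |delta| = %.3e" % (name, delta))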

Training error and validation error are the same - zero accuracy during training

I am basically using CaffeNet to do some sort of image classification with 256 classes. I am feeding the network a list of HDF5 files, but my network doesn't seem to learn: I get accuracy 0 all the time, and the training error and validation error are the same. I would think that if the dataset were not big enough to learn from, the training error should be very small and the validation error should be large, right? I also tried different batch sizes and learning rates, with no success. Here are the solver.prototxt and the network prototxt (a CaffeNet architecture). Any suggestion is appreciated.
I1103 12:01:41.822055 108615 solver.cpp:337] Iteration 0, Testing net (#0)
I1103 12:01:41.849742 108615 solver.cpp:404] Test net output #0: accuracy = 0
I1103 12:01:41.849761 108615 solver.cpp:404] Test net output #1: loss = 6.02617 (* 1 = 6.02617 loss)
I1103 12:01:41.869380 108615 solver.cpp:228] Iteration 0, loss = 6.05644
I1103 12:01:41.869398 108615 solver.cpp:244] Train net output #0: loss = 6.05644 (* 1 = 6.05644 loss)
I1103 12:01:41.869413 108615 sgd_solver.cpp:106] Iteration 0, lr = 0.1
I1103 12:01:47.624855 108615 solver.cpp:228] Iteration 500, loss = 87.3365
I1103 12:01:47.624876 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:01:47.624882 108615 sgd_solver.cpp:106] Iteration 500, lr = 0.1
I1103 12:01:53.290213 108615 solver.cpp:337] Iteration 1000, Testing net (#0)
I1103 12:01:53.299310 108615 solver.cpp:404] Test net output #0: accuracy = 0
I1103 12:01:53.299327 108615 solver.cpp:404] Test net output #1: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:01:53.314584 108615 solver.cpp:228] Iteration 1000, loss = 87.3365
I1103 12:01:53.314615 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:01:53.314621 108615 sgd_solver.cpp:106] Iteration 1000, lr = 0.01
I1103 12:01:58.991268 108615 solver.cpp:228] Iteration 1500, loss = 87.3365
I1103 12:01:58.991315 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:01:58.991322 108615 sgd_solver.cpp:106] Iteration 1500, lr = 0.01
I1103 12:02:04.664419 108615 solver.cpp:337] Iteration 2000, Testing net (#0)
I1103 12:02:04.673518 108615 solver.cpp:404] Test net output #0: accuracy = 0
I1103 12:02:04.673537 108615 solver.cpp:404] Test net output #1: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:04.690434 108615 solver.cpp:228] Iteration 2000, loss = 87.3365
I1103 12:02:04.690469 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:04.690481 108615 sgd_solver.cpp:106] Iteration 2000, lr = 0.001
I1103 12:02:10.373788 108615 solver.cpp:228] Iteration 2500, loss = 87.3365
I1103 12:02:10.373852 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:10.373859 108615 sgd_solver.cpp:106] Iteration 2500, lr = 0.001
I1103 12:02:16.047372 108615 solver.cpp:337] Iteration 3000, Testing net (#0)
I1103 12:02:16.056390 108615 solver.cpp:404] Test net output #0: accuracy = 0
I1103 12:02:16.056407 108615 solver.cpp:404] Test net output #1: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:16.070235 108615 solver.cpp:228] Iteration 3000, loss = 87.3365
I1103 12:02:16.070261 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:16.070267 108615 sgd_solver.cpp:106] Iteration 3000, lr = 0.0001
I1103 12:02:21.755348 108615 solver.cpp:228] Iteration 3500, loss = 87.3365
I1103 12:02:21.755369 108615 solver.cpp:244] Train net output #0: loss = 87.3365 (* 1 = 87.3365 loss)
I1103 12:02:21.755375 108615 sgd_solver.cpp:106] Iteration 3500, lr = 0.0001
----------------------------------
net: "/A/B/train.prototxt"
test_iter: 10
test_interval: 1000
base_lr: 0.1
lr_policy: "step"
gamma: 0.1
stepsize: 1000
display: 10
max_iter: 4000
momentum: 0.9
weight_decay: 0.0005
snapshot: 1000
snapshot_prefix: "/A/B/model_"
solver_mode: GPU
--------------------------------------------------
layer {
name: "data"
type: "HDF5Data"
top: "X"
top: "y"
hdf5_data_param{
source:"/Path/to/trainh5list.txt"
batch_size: 1
}
include{phase: TRAIN}
}
layer {
name: "data"
type: "HDF5Data"
top: "X"
top: "y"
hdf5_data_param{
source:"/Path/to/testh5list.txt"
batch_size: 1
}
include{phase: TEST}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "X"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "norm1"
type: "LRN"
bottom: "pool1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
.
.
.
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 256
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "fc8"
bottom: "y"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "y"
top: "loss"
}

Caffe: train network accuracy = 1 constant! Accuracy issue

Right now I am training a network with 2-class data... but the accuracy is a constant 1 after the first iteration!
The input data are grayscale images. Images of both classes were selected randomly when creating the HDF5 data.
Why does that happen? What's wrong, or where is the mistake?
network.prototxt :
name: "brainMRI"
layer {
name: "data"
type: "HDF5Data"
top: "data"
top: "label"
include: {
phase: TRAIN
}
hdf5_data_param {
source: "/home/shivangpatel/caffe/brainMRI1/train_file_location.txt"
batch_size: 10
}
}
layer {
name: "data"
type: "HDF5Data"
top: "data"
top: "label"
include: {
phase: TEST
}
hdf5_data_param {
source: "/home/shivangpatel/caffe/brainMRI1/test_file_location.txt"
batch_size: 10
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool2"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "ip1"
top: "ip1"
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "softmax"
type: "Softmax"
bottom: "ip2"
top: "smip2"
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "smip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
Output:
I0217 17:41:07.912580 2913 net.cpp:270] This network produces output loss
I0217 17:41:07.912607 2913 net.cpp:283] Network initialization done.
I0217 17:41:07.912739 2913 solver.cpp:60] Solver scaffolding done.
I0217 17:41:07.912789 2913 caffe.cpp:212] Starting Optimization
I0217 17:41:07.912813 2913 solver.cpp:288] Solving brainMRI
I0217 17:41:07.912832 2913 solver.cpp:289] Learning Rate Policy: inv
I0217 17:41:07.920737 2913 solver.cpp:341] Iteration 0, Testing net (#0)
I0217 17:41:08.235076 2913 solver.cpp:409] Test net output #0: accuracy = 0.98
I0217 17:41:08.235194 2913 solver.cpp:409] Test net output #1: loss = 0.0560832 (* 1 = 0.0560832 loss)
I0217 17:41:35.831647 2913 solver.cpp:341] Iteration 100, Testing net (#0)
I0217 17:41:36.140849 2913 solver.cpp:409] Test net output #0: accuracy = 1
I0217 17:41:36.140949 2913 solver.cpp:409] Test net output #1: loss = 0.00757247 (* 1 = 0.00757247 loss)
I0217 17:42:05.465395 2913 solver.cpp:341] Iteration 200, Testing net (#0)
I0217 17:42:05.775877 2913 solver.cpp:409] Test net output #0: accuracy = 1
I0217 17:42:05.776000 2913 solver.cpp:409] Test net output #1: loss = 0.0144996 (* 1 = 0.0144996 loss)
.............
.............
Summarizing some information from the comments:
- You run test at intervals of test_interval:100 iterations.
- Each test interval goes over test_iter:5 * batch_size:10 = 50 samples.
- Your train and test sets seem to be very neatly ordered: all the negative samples (label=0) are grouped together before all the positive samples.
Consider your iterative SGD solver: during training you feed it batches of batch_size: 10. Your training set has 14,746 negative samples (that is, 1474 batches) before any positive sample, so for the first 1474 iterations your solver only "sees" negative examples and no positive ones.
What do you expect this solver will learn?
The problem
Your solver only sees negative examples, so it learns that no matter what the input is, it should output "0". Your test set is ordered in the same fashion, so by testing only 50 samples at each test_interval you only test on the negative examples in the test set, resulting in a perfect accuracy of 1.
But as you noted, your net actually learned nothing.
Solution
I suppose you have already guessed what the solution should be by now: you need to shuffle your training set and test your net on your entire test set.
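One simple way to shuffle (a sketch with assumed file names; newer Caffe versions also accept shuffle: true inside hdf5_data_param, as in the first question above) is to permute the data once before writing the HDF5 file:
import h5py
import numpy as np

with h5py.File("train_unshuffled.h5", "r") as f:
    data = np.array(f["data"], dtype=np.float32)    # shape (N, C, H, W)
    label = np.array(f["label"], dtype=np.float32)  # shape (N,) or (N, 1)

# Interleave positives and negatives with a single random permutation.
perm = np.random.permutation(len(label))
with h5py.File("train_shuffled.h5", "w") as f:
    f.create_dataset("data", data=data[perm])
    f.create_dataset("label", data=label[perm])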

Convolution issue in Caffe

I have 96x96-pixel grayscale images stored in HDF5 files. I am trying to do multi-output regression using Caffe, but the convolution is not working. What exactly is the problem here? Why are the convolutions not working?
I0122 17:18:39.474860 5074 net.cpp:67] Creating Layer fkp
I0122 17:18:39.474889 5074 net.cpp:356] fkp -> data
I0122 17:18:39.474930 5074 net.cpp:356] fkp -> label
I0122 17:18:39.474967 5074 net.cpp:96] Setting up fkp
I0122 17:18:39.474987 5074 hdf5_data_layer.cpp:57] Loading filename from train.txt
I0122 17:18:39.475103 5074 hdf5_data_layer.cpp:69] Number of files: 1
I0122 17:18:39.475131 5074 hdf5_data_layer.cpp:29] Loading HDF5 filefacialkp-train.hd5
I0122 17:18:40.337786 5074 hdf5_data_layer.cpp:49] Successully loaded 4934 rows
I0122 17:18:40.337862 5074 hdf5_data_layer.cpp:81] output data size: 100,9216,1,1
I0122 17:18:40.337906 5074 net.cpp:103] Top shape: 100 9216 1 1 (921600)
I0122 17:18:40.337929 5074 net.cpp:103] Top shape: 100 30 1 1 (3000)
I0122 17:18:40.337971 5074 net.cpp:67] Creating Layer conv1
I0122 17:18:40.338001 5074 net.cpp:394] conv1 <- data
I0122 17:18:40.338069 5074 net.cpp:356] conv1 -> conv1
I0122 17:18:40.338109 5074 net.cpp:96] Setting up conv1
F0122 17:18:40.599761 5074 blob.cpp:13] Check failed: height >= 0 (-3 vs. 0)
My prototxt layer file is like this
name: "LogReg"
layers {
top: "data"
top: "label"
name: "fkp"
type: HDF5_DATA
hdf5_data_param {
source: "train.txt"
batch_size: 100
}
include {
phase: TRAIN
}
}
layers {
bottom: "data"
top: "conv1"
name: "conv1"
type: CONVOLUTION
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 64
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
bottom: "conv1"
top: "pool1"
name: "pool1"
type: POOLING
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layers {
bottom: "pool1"
top: "conv2"
name: "conv2"
type: CONVOLUTION
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 256
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
bottom: "conv2"
top: "pool2"
name: "pool2"
type: POOLING
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layers {
bottom: "pool2"
top: "ip1"
name: "ip1"
type: INNER_PRODUCT
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
bottom: "ip1"
top: "ip1"
name: "relu1"
type: RELU
}
layers {
bottom: "ip1"
top: "ip2"
name: "ip2"
type: INNER_PRODUCT
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 30
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
bottom: "ip2"
bottom: "label"
top: "loss"
name: "loss"
type: EUCLIDEAN_LOSS
}
The lines
I0122 17:18:40.337906 5074 net.cpp:103] Top shape: 100 9216 1 1 (921600)
I0122 17:18:40.337929 5074 net.cpp:103] Top shape: 100 30 1 1 (3000)
suggest that your input data is not in the correct shape. For an input batch of 100 grayscale 96x96 images, the shape should be: 100 1 96 96.
Try to change this. (My guess is that the shape convention is N C H W, where N is the number of images in the batch, C the channels, H the height and W the width.)
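A sketch of one way to fix it (reusing the file and dataset names visible in the log, but otherwise an assumption rather than the asker's actual conversion script) is to rewrite the HDF5 data with the 4-D N x C x H x W layout before training:
import h5py
import numpy as np

with h5py.File("facialkp-train.hd5", "r") as f:
    flat = np.array(f["data"], dtype=np.float32)    # currently (N, 9216, 1, 1) or (N, 9216)
    label = np.array(f["label"], dtype=np.float32)  # (N, 30)

# Reshape each flat 9216-vector into a 1-channel 96x96 image: N x C x H x W.
images = flat.reshape(-1, 1, 96, 96)

with h5py.File("facialkp-train-nchw.hd5", "w") as f:
    f.create_dataset("data", data=images)
    f.create_dataset("label", data=label)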
