What can make loss stop going down? - machine-learning

I'm training my network on a very small dataset (size = 2) over and over again. It looks fine at the beginning, but after epoch 34 the loss stops going down.
I thought it could be caused by the dying ReLU problem, so I replaced all my ReLU activations with Softplus. However, the problem is still the same.
I also tried adding some Linear hidden layers combined with Softplus, with no results.
If I try with a dataset with only one example, then the loss goes to 0 as expected.
What can cause this behavior, and how can I avoid it?
Epoch 0
Expected: [243.0, 0.0]; Predicted: [-2367.9, 3.8]; Loss: 6059457.6000
Expected: [178.0, 32.0]; Predicted: [-1731.4, 10.9]; Loss: 3241238.0000
Epoch 1
Expected: [243.0, 0.0]; Predicted: [-1237.8, 8.1]; Loss: 1949257.6000
Expected: [178.0, 32.0]; Predicted: [-883.9, 14.1]; Loss: 1002567.6000
Epoch 2
Expected: [243.0, 0.0]; Predicted: [-602.2, 10.6]; Loss: 635017.6000
Expected: [178.0, 32.0]; Predicted: [-407.1, 15.9]; Loss: 304548.4500
...
Epoch 12
Expected: [243.0, 0.0]; Predicted: [212.6, 13.7]; Loss: 991.0653
Expected: [178.0, 32.0]; Predicted: [203.9, 18.3]; Loss: 764.2527
Epoch 13
Expected: [243.0, 0.0]; Predicted: [213.7, 13.7]; Loss: 930.9330
Expected: [178.0, 32.0]; Predicted: [204.8, 18.3]; Loss: 803.9944
...
Epoch 32
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9812
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9783
Epoch 33
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9806
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9789
Epoch 34
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 35
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 36
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 37
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 38
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 39
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 40
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 41
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 42
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Here's the code:
def _conv_pool_out(size, stages=3, kernel=5, pool=2):
    """Return the length left along one axis after `stages` conv + max-pool steps."""
    for _ in range(stages):
        # An unpadded stride-1 convolution trims kernel-1 entries; max_pool2d
        # floors the division by the pool size.
        size = (size - (kernel - 1)) // pool
    return size


class NN(Module):
    """Predict two values from a token sequence, an image and a 9-feature vector.

    The text and the image are each pushed through three Conv2d + max-pool
    stages and a Linear layer, the 9 extra features are expanded by a Linear
    layer, and the three encodings are concatenated, run through a GRU and
    reduced to 2 outputs.

    Args:
        vocab_size: number of rows of the text embedding (index 0 is padding).
        hidden_size: embedding width and size of each of the three encodings.
        text_max_len: length every token sequence is zero-padded to.
        rnn_num_layers: how many times the single GRU module is applied.
        dropout: dropout argument forwarded to the GRU.
    """

    def __init__(self, vocab_size, *_, hidden_size=200, text_max_len=200, rnn_num_layers=1, dropout=0.2):
        super(NN, self).__init__()
        self.rnn_layer_range = list(range(rnn_num_layers))
        self.hidden_size = hidden_size
        self.text_max_len = text_max_len
        self.softplus = Softplus()
        # text:
        self.text_embed = Embedding(vocab_size, hidden_size, padding_idx=0)
        self.text_conv1 = Conv2d(1, 6, 5)
        self.text_conv2 = Conv2d(6, 12, 5)
        self.text_conv3 = Conv2d(12, 24, 5)
        # The flattened conv output depends on both text_max_len and
        # hidden_size; deriving it keeps the layer valid for any configuration
        # (it evaluates to 4536 for text_max_len=200, hidden_size=100).
        text_flat = 24 * _conv_pool_out(text_max_len) * _conv_pool_out(hidden_size)
        self.text_lin1 = Linear(text_flat, hidden_size)
        # image:
        self.img_conv1 = Conv2d(3, 6, 5)
        self.img_conv2 = Conv2d(6, 16, 5)
        self.img_conv3 = Conv2d(16, 20, 5)
        # 2420 = 20 * 11 * 11, so the image size is fixed by this layer.
        self.img_lin1 = Linear(2420, hidden_size)
        # union:
        self.u_size = 3 * hidden_size
        self.u_linear_augment = Linear(9, hidden_size)
        self.u_gru = GRU(input_size=self.u_size, hidden_size=self.u_size, dropout=dropout)
        self.u_linear_reduce1 = Linear(self.u_size, self.u_size // 2)
        self.u_linear_reduce2 = Linear(self.u_size // 2, 2)

    def initHidden(self):
        """Return zero initial states of shape (1, 1, hidden_size) and (1, 1, u_size)."""
        return Variable(zeros(1, 1, self.hidden_size)), Variable(zeros(1, 1, self.u_size))

    def forward(self, text, img, data, *_, text_hidden=None, u_hidden=None):
        """Encode the three inputs and return the prediction as a (2, 1) tensor.

        Args:
            text: 1-D LongTensor of token ids; shorter inputs are zero-padded
                to text_max_len (longer ones fail at the reshape below).
            img: image batch fed to a 3-channel Conv2d — presumably
                (1, 3, H, W) with H, W chosen so the conv stack leaves
                20 * 11 * 11 values; TODO confirm against the caller.
            data: extra features fed to Linear(9, hidden_size).
            text_hidden: accepted for backward compatibility; unused because
                the text encoder has no recurrent layer.
            u_hidden: optional initial GRU state; zeros when omitted.
        """
        if u_hidden is None:
            u_hidden = self.initHidden()[1]
        # encode text
        max_len = self.text_max_len
        if len(text) < max_len:
            text = cat((text, Variable(LongTensor(max_len - len(text)).zero_())))
        text = self.text_embed(text)
        # Treat the (max_len, hidden_size) embedding matrix as a 1-channel image.
        text = text.view(1, 1, max_len, self.hidden_size)
        text = max_pool2d(self.softplus(self.text_conv1(text)), 2)
        text = max_pool2d(self.softplus(self.text_conv2(text)), 2)
        text = max_pool2d(self.softplus(self.text_conv3(text)), 2)
        text = text.view(1, -1)
        text = self.softplus(self.text_lin1(text))
        text = text.view(-1, 1)
        # encode image
        img = max_pool2d(self.softplus(self.img_conv1(img)), 2)
        img = max_pool2d(self.softplus(self.img_conv2(img)), 2)
        img = max_pool2d(self.softplus(self.img_conv3(img)), 2)
        img = img.view(1, -1)
        img = self.softplus(self.img_lin1(img))
        img = img.view(-1, 1)
        # join
        data = self.softplus(self.u_linear_augment(data))
        vector = cat((data, text, img)).view(1, 1, -1)
        # The same GRU module is re-applied once per requested "layer".
        for _ in self.rnn_layer_range:
            vector, u_hidden = self.u_gru(vector, u_hidden)
        vector = self.softplus(self.u_linear_reduce1(vector))
        vector = self.u_linear_reduce2(vector)
        return vector.view(-1, 1)
and
def train(neuron_network, optimizer, criterion, text, img, data, target, *_, loop_size=5):
    """Run one optimisation step on a single example and return its mean loss.

    The network is evaluated `loop_size` times on the same inputs, the losses
    are summed, and a single backward pass / optimizer step is taken.

    Args:
        neuron_network: callable taking (text, img, data) and returning the prediction.
        optimizer: optimizer holding the network parameters.
        criterion: loss callable taking (output, target).
        text, img, data: inputs forwarded unchanged to the network.
        target: expected output passed to the criterion.
        loop_size: number of forward passes accumulated before the step.

    Returns:
        The accumulated loss divided by `loop_size`, as a Python number.

    Raises:
        ValueError: if `loop_size` is smaller than 1.
    """
    if loop_size < 1:
        # Without a forward pass `loss` would stay the int 0 and .backward() would fail.
        raise ValueError("loop_size must be at least 1")
    optimizer.zero_grad()
    loss = 0
    for _ in range(loop_size):
        output = neuron_network(text, img, data)
        loss += criterion(output, target)
    loss.backward()
    optimizer.step()
    # Scalar losses are 0-dim tensors in current PyTorch, where indexing with [0]
    # raises; fall back to the legacy accessor only when .item() does not exist.
    total = loss.item() if hasattr(loss, "item") else loss.data[0]
    return total / loop_size
And here is how I'm training it:
# Build the network, the loss and the optimiser, then replay the dataset for 500 epochs.
neural = NN(vocab_size=len(letter_dict) + 1, hidden_size=100, rnn_num_layers=1, dropout=0)
criterion = nn.MSELoss()
optimizer = optim.SGD(neural.parameters(), lr=0.01)
for epoch in range(500):
    for item in dataset:
        # `loss` holds the averaged loss that train() returns for this example.
        loss = train(
            neural, optimizer, criterion,
            item["text"], item["img"], item["data"], item["target"],
        )

Related

Validation accuracy always zero for LSTM model for categorical data

I tried to build an LSTM model of a categorical time series. However, the validation accuracy was always zero. I thought my data were problematic, so I replaced my original data with random numbers. The validation accuracy was still zero. Is there anything wrong with the model?
import os
import sys
import pyodbc as pyodbc
import numpy as np
import pandas as pd
import array as arr
from matplotlib import pyplot
from numpy import array, argmax
from time import process_time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, InputLayer, LSTM, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
# 17-Mar-2022: Force run on CPU (a value of '-1' leaves no CUDA device visible).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# CONSTANTS
# Fraction of the framed rows passed to Split_data as the training share.
TRAINING_DATA_RATIO = 0.9
# ODBC connection string used by get_raw_data().
DSN_STRING = 'DSN=M6_local'
# Number of lagged observations passed as n_in to series_to_supervised().
LOOK_BACK_WINDOW = 4
# This is to get source data
def get_raw_data():
    """Read the hashID_123 column of tbl_M6, ordered by DrawID, into a DataFrame.

    Connects through the ODBC data source named by DSN_STRING and always
    closes the connection, even when the query fails.

    Returns:
        pandas.DataFrame holding the query result.
    """
    SQL_EXTRACTION = 'select hashID_123 from tbl_M6 order by DrawID'
    conn = pyodbc.connect(DSN_STRING)
    try:
        return pd.read_sql(SQL_EXTRACTION, conn)
    finally:
        # The original leaked the connection; release it once the frame is built.
        conn.close()
# Random stand-in for the real draws, used as training data
def generate_raw_data(size):
    """Return `size` random whole-number floats drawn uniformly from [0, size)."""
    scaled = np.random.rand(size) * size
    whole = np.trunc(scaled)
    return whole
# Generate 15180 random draws as stand-in data and reshape them into one column.
raw_df = generate_raw_data(15180)
raw_df = raw_df.reshape(-1, 1)
# NOTE(review): All_categories is not defined in the visible code — confirm where it comes from.
oh_encoder = OneHotEncoder(categories=All_categories, sparse=False)
# Fit the encoder on the single column and one-hot encode it in the same call.
encoded_input = oh_encoder.fit_transform(raw_df)
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list, list of rows, or NumPy array
            (1-D or 2-D).
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning, with columns
        named 'var<j>(t-<i>)', 'var<j>(t)' and 'var<j>(t+<i>)'.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself: guessing it from the input
    # type broke on lists of rows (column-count mismatch) and on 1-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values (the shifts leave NaN at the series edges)
    if dropnan:
        agg = agg.dropna()
    return agg
# Split a frame into leading (training) and trailing (testing) rows.
def Split_data(pdf, pratio):
    """Split `pdf` positionally into a training part and a testing part.

    Args:
        pdf: pandas DataFrame (or Series) to split; row order is preserved.
        pratio: fraction of the rows assigned to the training part.

    Returns:
        Tuple (train, test): the first int(len(pdf) * pratio) rows and the rest.
    """
    train_size = int(len(pdf) * pratio)
    return pdf.iloc[:train_size], pdf.iloc[train_size:]
# Frame the encoded series with LOOK_BACK_WINDOW lagged steps as inputs and one step as output.
draw_reframe = series_to_supervised(encoded_input, LOOK_BACK_WINDOW,1)
train, test = Split_data(draw_reframe, TRAINING_DATA_RATIO)
# Total input = all possible One-Hot Encoding outcome * number of look-back samples.
# NOTE(review): POSSIBLE_OUTCOME_COL is not defined in the visible code — presumably the one-hot width; confirm.
ALL_INPUT = POSSIBLE_OUTCOME_COL * LOOK_BACK_WINDOW
# split into input and outputs
# The first ALL_INPUT columns are the lagged inputs; the remaining columns are the targets.
train_X, train_y = train.iloc[:,:ALL_INPUT], train.iloc[:,ALL_INPUT:]
test_X, test_y = test.iloc[:,:ALL_INPUT], test.iloc[:,ALL_INPUT:]
def M6_lstm_model():
    """Build and compile a single-layer LSTM classifier for the one-hot targets.

    Reads the module-level `train_X` and `train_y` to size the input and
    output layers.

    Returns:
        A compiled Sequential model: LSTM(45, relu) followed by a softmax Dense
        layer with one unit per target column, trained with Adam on
        categorical cross-entropy and reporting categorical accuracy.
    """
    # Hyper-parameters
    INPUT_NODES = 45
    LEARNING_RATE = 0.0001
    model = Sequential()
    # NOTE(review): train_X must be 3-D here (shape[2] is read), but the visible
    # split produces a 2-D DataFrame — confirm it is reshaped before this call.
    model.add(LSTM(INPUT_NODES,
                   return_sequences=False,
                   input_shape=(train_X.shape[1], train_X.shape[2]),
                   activation='relu'))
    # Output layer: categorical_crossentropy expects class probabilities by
    # default, so the layer needs a softmax instead of raw linear outputs.
    model.add(Dense(units=train_y.shape[1], activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        metrics=['categorical_accuracy']
    )
    return model
# Build the model and print its layer summary.
lstm_model = M6_lstm_model()
lstm_model.summary()

EPOCHS = 50

# Stop once training categorical accuracy has not improved by 0.001 for 10 epochs.
custom_early_stopping = EarlyStopping(monitor='categorical_accuracy', mode='max',
                                      min_delta=0.001, patience=10)

# Train in the original row order (shuffle disabled), validating on the held-out split.
history = lstm_model.fit(train_X, train_y,
                         validation_data=(test_X, test_y),
                         epochs=EPOCHS, batch_size=16,
                         shuffle=False, verbose=1,
                         callbacks=[custom_early_stopping])
The output is like the following;
Epoch 1/20
854/854 [==============================] - 54s 62ms/step - loss: 11.6208 - categorical_accuracy: 0.0000e+00 - val_loss: 13.1296 - val_categorical_accuracy: 0.0000e+00
Epoch 2/20
854/854 [==============================] - 32s 38ms/step - loss: 12.9591 - categorical_accuracy: 7.3217e-05 - val_loss: 11.5824 - val_categorical_accuracy: 0.0000e+00
Epoch 3/20
854/854 [==============================] - 32s 38ms/step - loss: 12.8105 - categorical_accuracy: 1.4643e-04 - val_loss: 12.4107 - val_categorical_accuracy: 0.0000e+00
Epoch 4/20
854/854 [==============================] - 31s 37ms/step - loss: 12.7316 - categorical_accuracy: 1.4643e-04 - val_loss: 10.9091 - val_categorical_accuracy: 0.0000e+00
Epoch 5/20
854/854 [==============================] - 32s 37ms/step - loss: 13.4749 - categorical_accuracy: 2.1965e-04 - val_loss: 10.9705 - val_categorical_accuracy: 0.0000e+00
Epoch 6/20
854/854 [==============================] - 32s 38ms/step - loss: 13.2239 - categorical_accuracy: 2.9287e-04 - val_loss: 11.6188 - val_categorical_accuracy: 0.0000e+00
Epoch 7/20
854/854 [==============================] - 32s 38ms/step - loss: 13.5012 - categorical_accuracy: 2.9287e-04 - val_loss: 10.6353 - val_categorical_accuracy: 0.0000e+00
Epoch 8/20
854/854 [==============================] - 32s 37ms/step - loss: 13.4562 - categorical_accuracy: 2.9287e-04 - val_loss: 9.8759 - val_categorical_accuracy: 0.0000e+00
Epoch 9/20
854/854 [==============================] - 32s 37ms/step - loss: 13.6172 - categorical_accuracy: 2.1965e-04 - val_loss: 12.6144 - val_categorical_accuracy: 0.0000e+00
Epoch 10/20
854/854 [==============================] - 32s 37ms/step - loss: 13.3903 - categorical_accuracy: 3.6609e-04 - val_loss: 9.6623 - val_categorical_accuracy: 0.0000e+00
Epoch 11/20
854/854 [==============================] - 32s 37ms/step - loss: 12.9621 - categorical_accuracy: 3.6609e-04 - val_loss: 12.8088 - val_categorical_accuracy: 0.0000e+00
Epoch 12/20
854/854 [==============================] - 32s 38ms/step - loss: 13.4995 - categorical_accuracy: 2.1965e-04 - val_loss: 9.7154 - val_categorical_accuracy: 0.0000e+00
Epoch 13/20
854/854 [==============================] - 32s 38ms/step - loss: 13.4103 - categorical_accuracy: 2.1965e-04 - val_loss: 12.4104 - val_categorical_accuracy: 0.0000e+00
Epoch 14/20
854/854 [==============================] - 32s 38ms/step - loss: 13.8077 - categorical_accuracy: 8.0539e-04 - val_loss: 10.1903 - val_categorical_accuracy: 0.0000e+00
Epoch 15/20
854/854 [==============================] - 32s 37ms/step - loss: 13.8100 - categorical_accuracy: 6.5895e-04 - val_loss: 9.7783 - val_categorical_accuracy: 0.0000e+00
Epoch 16/20
854/854 [==============================] - 32s 37ms/step - loss: 13.8371 - categorical_accuracy: 5.8574e-04 - val_loss: 12.1615 - val_categorical_accuracy: 0.0000e+00
Epoch 17/20
854/854 [==============================] - 32s 38ms/step - loss: 14.0756 - categorical_accuracy: 5.1252e-04 - val_loss: 9.9183 - val_categorical_accuracy: 0.0000e+00
Epoch 18/20
854/854 [==============================] - 32s 38ms/step - loss: 14.2117 - categorical_accuracy: 4.3930e-04 - val_loss: 10.1652 - val_categorical_accuracy: 0.0000e+00
Epoch 19/20
854/854 [==============================] - 32s 37ms/step - loss: 14.4263 - categorical_accuracy: 3.6609e-04 - val_loss: 9.9861 - val_categorical_accuracy: 0.0000e+00
Epoch 20/20
854/854 [==============================] - 32s 37ms/step - loss: 14.2520 - categorical_accuracy: 3.6609e-04 - val_loss: 10.3836 - val_categorical_accuracy: 0.0000e+00

Getting Different results on Each Iteration using Long Short Term Memory[LSTM] for text classification

I am using the LSTM deep-learning technique to classify my text. First I split the data into texts and labels using the pandas library, tokenize the texts, and then divide them into training and test data sets. Whenever I run the code, I get different results, with accuracy varying from 80 to 100 percent.
Here is my code,
# Tokenise the texts: keep the MAX_NB_WORDS most frequent words, lower-cased,
# with the listed punctuation characters filtered out.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~',
lower=True)
tokenizer.fit_on_texts(trainDF['texts'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Convert each text to a sequence of word ids, padded to MAX_SEQUENCE_LENGTH.
X = tokenizer.texts_to_sequences(trainDF['texts'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
# One-hot encode the labels, one column per distinct label value.
Y = pd.get_dummies(trainDF['label'])
print('Shape of label tensor:', Y.shape)
# Hold out 10% for testing; random_state pins this split only.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
# NOTE(review): no random seed is set for the model below, so weight
# initialisation and dropout presumably differ between runs — likely the
# source of the run-to-run accuracy spread; confirm by seeding the backend.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): hard-coded class count; presumably must equal Y.shape[1] — verify.
variables_for_classification=6 #change it as per your number of categories
model.add(Dense(variables_for_classification, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
epochs = 5
batch_size = 64
# Train with 10% of the training rows used for validation; stop early when
# val_loss has not improved by 0.0001 for 3 epochs.
history = model.fit(X_train, Y_train, epochs=epochs,
batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3,
min_delta=0.0001)])
# evaluate() returns [loss, accuracy] for the compiled loss and metric.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
Train on 794 samples, validate on 89 samples
Epoch 1/5
794/794 [==============================] - 19s 24ms/step - loss: 1.6401 - accuracy: 0.6297 - val_loss: 0.9098 - val_accuracy: 0.5843
Epoch 2/5
794/794 [==============================] - 16s 20ms/step - loss: 0.8365 - accuracy: 0.7166 - val_loss: 0.7487 - val_accuracy: 0.7753
Epoch 3/5
794/794 [==============================] - 16s 20ms/step - loss: 0.7093 - accuracy: 0.8401 - val_loss: 0.6519 - val_accuracy: 0.8652
Epoch 4/5
794/794 [==============================] - 16s 20ms/step - loss: 0.5857 - accuracy: 0.8829 - val_loss: 0.4935 - val_accuracy: 1.0000
Epoch 5/5
794/794 [==============================] - 16s 20ms/step - loss: 0.4248 - accuracy: 0.9345 - val_loss: 0.3512 - val_accuracy: 0.8652
99/99 [==============================] - 0s 2ms/step
Test set
Loss: 0.348
Accuracy: 0.869
in the last run accuracy was 100 percent.

Model loss remains unchanged

I would like to understand what could be responsible for this model loss behaviour. Training a CNN network, with 6 hidden-layers, the loss shoots up from around 1.8 to above 12 after the first epoch and remains constant for the remaining 99 epochs.
724504/724504 [==============================] - 358s 494us/step - loss: 1.8143 - acc: 0.7557 - val_loss: 16.1181 - val_acc: 0.0000e+00
Epoch 2/100
724504/724504 [==============================] - 355s 490us/step - loss: 12.0886 - acc: 0.2500 - val_loss: 16.1181 - val_acc: 0.0000e+00
Epoch 3/100
724504/724504 [==============================] - 354s 489us/step - loss: 12.0886 - acc: 0.2500 - val_loss: 16.1181 - val_acc: 0.0000e+00
Epoch 4/100
724504/724504 [==============================] - 348s 481us/step - loss: 12.0886 - acc: 0.2500 - val_loss: 16.1181 - val_acc: 0.0000e+00
Epoch 5/100
724504/724504 [==============================] - 355s 490us/step - loss: 12.0886 - acc: 0.2500 - val_loss: 16.1181 - val_acc: 0.0000e+00
I cannot believe this got to do with the dataset I work with, because I tried this with a different, publicly available dataset, the performance is exactly the same (in fact exact figures for loss/accuracy).
I also tested this with a somewhat shallower network having 2 hidden layers; see the performance below:
724504/724504 [==============================] - 41s 56us/step - loss: 0.4974 - acc: 0.8236 - val_loss: 15.5007 - val_acc: 0.0330
Epoch 2/100
724504/724504 [==============================] - 40s 56us/step - loss: 0.5204 - acc: 0.8408 - val_loss: 15.5543 - val_acc: 0.0330
Epoch 3/100
724504/724504 [==============================] - 41s 56us/step - loss: 0.6646 - acc: 0.8439 - val_loss: 15.3904 - val_acc: 0.0330
Epoch 4/100
724504/724504 [==============================] - 41s 57us/step - loss: 8.8982 - acc: 0.4342 - val_loss: 15.5867 - val_acc: 0.0330
Epoch 5/100
724504/724504 [==============================] - 41s 57us/step - loss: 0.5627 - acc: 0.8444 - val_loss: 15.5449 - val_acc: 0.0330
Can someone point out the probable cause of this behaviour? What parameter / configuration needs to be adjusted?
EDIT
Model creation
activ = 'relu'
# Settings shared by every convolution in the stack.
conv_kwargs = dict(strides=(1, 1), padding='same', activation=activ)

model = Sequential()
# First stage: the opening convolution also declares the input shape.
model.add(Conv2D(32, (1, 3), input_shape=(1, n_points, 4), **conv_kwargs))
model.add(Conv2D(32, (1, 3), **conv_kwargs))
model.add(MaxPooling2D(pool_size=(1, 2)))
# Remaining stages: two convolutions followed by a (1, 2) max-pool each.
for n_filters in (64, 128):
    model.add(Conv2D(n_filters, (1, 3), **conv_kwargs))
    model.add(Conv2D(n_filters, (1, 3), **conv_kwargs))
    model.add(MaxPooling2D(pool_size=(1, 2)))
# Dropout is only active after the last convolutional stage.
model.add(Dropout(.5))
model.add(Flatten())
# Size the dense layer to a quarter of the flattened feature count.
A = model.output_shape
model.add(Dense(int(A[1] * 1/4.), activation=activ))
model.add(Dropout(.5))
model.add(Dense(NoClass, activation='softmax'))

optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
# Fit on the resampled data in its stored order, validating on the test split.
model.fit(X_reample, Y_resample,
          validation_data=(Test_X, Test_Y),
          epochs=100, batch_size=64, shuffle=False)
Changing the learning rate to lr=0.0001 here's the result after 100 epochs.
72090/72090 [==============================] - 29s 397us/step - loss: 0.5040 - acc: 0.8347 - val_loss: 4.3529 - val_acc: 0.2072
Epoch 99/100
72090/72090 [==============================] - 28s 395us/step - loss: 0.4958 - acc: 0.8382 - val_loss: 6.3422 - val_acc: 0.1806
Epoch 100/100
72090/72090 [==============================] - 28s 393us/step - loss: 0.5084 - acc: 0.8342 - val_loss: 4.3781 - val_acc: 0.1925
the optimal epoch size: 97, the value of high accuracy 0.20716827656581954
EDIT 2
Apparently, SMOTE isn't good for sampling all but the majority class in a multi-class classification; see the train/test plot below:
Can you please try using BatchNormalization as well, placed just after your pooling layers? It is good to include it.

Keras Image Classification - Prediction accuracy on validation dataset does not match val_acc

I am trying to classify a set of images within two categories: left and right.
I built a CNN using Keras, my classifier seems to work well:
I have 1,939 images used for training (50% left, 50% right)
I have 648 images used for validation (50% left, 50% right)
All images are 115x45, in greyscale
acc is increasing up to 99.53%
val_acc is increasing up to 98.38%
Both loss and val_loss are converging close to 0
Keras verbose looks normal to me:
60/60 [==============================] - 6s 98ms/step - loss: 0.6295 - acc: 0.6393 - val_loss: 0.4877 - val_acc: 0.7641
Epoch 2/32
60/60 [==============================] - 5s 78ms/step - loss: 0.4825 - acc: 0.7734 - val_loss: 0.3403 - val_acc: 0.8799
Epoch 3/32
60/60 [==============================] - 5s 77ms/step - loss: 0.3258 - acc: 0.8663 - val_loss: 0.2314 - val_acc: 0.9042
Epoch 4/32
60/60 [==============================] - 5s 83ms/step - loss: 0.2498 - acc: 0.8942 - val_loss: 0.2329 - val_acc: 0.9042
Epoch 5/32
60/60 [==============================] - 5s 76ms/step - loss: 0.2408 - acc: 0.9002 - val_loss: 0.1426 - val_acc: 0.9432
Epoch 6/32
60/60 [==============================] - 5s 80ms/step - loss: 0.1968 - acc: 0.9260 - val_loss: 0.1484 - val_acc: 0.9367
Epoch 7/32
60/60 [==============================] - 5s 77ms/step - loss: 0.1621 - acc: 0.9319 - val_loss: 0.1141 - val_acc: 0.9578
Epoch 8/32
60/60 [==============================] - 5s 81ms/step - loss: 0.1600 - acc: 0.9361 - val_loss: 0.1229 - val_acc: 0.9513
Epoch 9/32
60/60 [==============================] - 4s 70ms/step - loss: 0.1358 - acc: 0.9462 - val_loss: 0.0884 - val_acc: 0.9692
Epoch 10/32
60/60 [==============================] - 4s 74ms/step - loss: 0.1193 - acc: 0.9542 - val_loss: 0.1232 - val_acc: 0.9529
Epoch 11/32
60/60 [==============================] - 5s 79ms/step - loss: 0.1075 - acc: 0.9595 - val_loss: 0.0865 - val_acc: 0.9724
Epoch 12/32
60/60 [==============================] - 4s 73ms/step - loss: 0.1209 - acc: 0.9531 - val_loss: 0.1067 - val_acc: 0.9497
Epoch 13/32
60/60 [==============================] - 4s 73ms/step - loss: 0.1135 - acc: 0.9609 - val_loss: 0.0860 - val_acc: 0.9838
Epoch 14/32
60/60 [==============================] - 4s 70ms/step - loss: 0.0869 - acc: 0.9682 - val_loss: 0.0907 - val_acc: 0.9675
Epoch 15/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0960 - acc: 0.9637 - val_loss: 0.0996 - val_acc: 0.9643
Epoch 16/32
60/60 [==============================] - 4s 73ms/step - loss: 0.0951 - acc: 0.9625 - val_loss: 0.1223 - val_acc: 0.9481
Epoch 17/32
60/60 [==============================] - 4s 70ms/step - loss: 0.0685 - acc: 0.9729 - val_loss: 0.1220 - val_acc: 0.9513
Epoch 18/32
60/60 [==============================] - 4s 73ms/step - loss: 0.0791 - acc: 0.9715 - val_loss: 0.0959 - val_acc: 0.9692
Epoch 19/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0595 - acc: 0.9802 - val_loss: 0.0648 - val_acc: 0.9773
Epoch 20/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0486 - acc: 0.9844 - val_loss: 0.0691 - val_acc: 0.9838
Epoch 21/32
60/60 [==============================] - 4s 70ms/step - loss: 0.0499 - acc: 0.9812 - val_loss: 0.1166 - val_acc: 0.9627
Epoch 22/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0481 - acc: 0.9844 - val_loss: 0.0875 - val_acc: 0.9734
Epoch 23/32
60/60 [==============================] - 4s 70ms/step - loss: 0.0533 - acc: 0.9814 - val_loss: 0.1094 - val_acc: 0.9724
Epoch 24/32
60/60 [==============================] - 4s 70ms/step - loss: 0.0487 - acc: 0.9812 - val_loss: 0.0722 - val_acc: 0.9740
Epoch 25/32
60/60 [==============================] - 4s 72ms/step - loss: 0.0441 - acc: 0.9828 - val_loss: 0.0992 - val_acc: 0.9773
Epoch 26/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0667 - acc: 0.9726 - val_loss: 0.0964 - val_acc: 0.9643
Epoch 27/32
60/60 [==============================] - 4s 73ms/step - loss: 0.0436 - acc: 0.9835 - val_loss: 0.0771 - val_acc: 0.9708
Epoch 28/32
60/60 [==============================] - 4s 71ms/step - loss: 0.0322 - acc: 0.9896 - val_loss: 0.0872 - val_acc: 0.9756
Epoch 29/32
60/60 [==============================] - 5s 80ms/step - loss: 0.0294 - acc: 0.9943 - val_loss: 0.1414 - val_acc: 0.9578
Epoch 30/32
60/60 [==============================] - 5s 76ms/step - loss: 0.0348 - acc: 0.9870 - val_loss: 0.1102 - val_acc: 0.9659
Epoch 31/32
60/60 [==============================] - 5s 76ms/step - loss: 0.0306 - acc: 0.9922 - val_loss: 0.0794 - val_acc: 0.9659
Epoch 32/32
60/60 [==============================] - 5s 76ms/step - loss: 0.0152 - acc: 0.9953 - val_loss: 0.1051 - val_acc: 0.9724
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_1 (Conv2D) (None, 113, 43, 32) 896
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 56, 21, 32) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 54, 19, 32) 9248
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 27, 9, 32) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 7776) 0
_________________________________________________________________
dense_1 (Dense) (None, 128) 995456
_________________________________________________________________
dense_2 (Dense) (None, 1) 129
=================================================================
Total params: 1,005,729
Trainable params: 1,005,729
Non-trainable params: 0
So everything looks great, but when I tried to predict the category of 2,000 samples I got very strange results, with an accuracy < 70%.
At first I thought this sample might be biased, so I tried, instead, to predict the images in the validation dataset.
I should have a 98.38% accuracy, and a perfect 50-50 split, but instead, once again I got:
170 images predicted right, instead of 324, with an accuracy of 98.8%
478 images predicted left, instead of 324, with an accuracy of 67.3%
Average accuracy: 75.69% and not 98.38%
I guess something is wrong either in my CNN or my prediction script.
CNN classifier code:
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Init CNN
classifier = Sequential()
# Step 1 - Convolution
# NOTE(review): the input is declared with 3 channels although the images are
# described as greyscale; the generators below set no color_mode — confirm.
classifier.add(Conv2D(32, (3, 3), input_shape = (115, 45, 3), activation = 'relu'))
# Step 2 - Pooling
classifier.add(MaxPooling2D(pool_size = (2, 2)))
# Adding a second convolutional layer
classifier.add(Conv2D(32, (3, 3), activation = 'relu'))
classifier.add(MaxPooling2D(pool_size = (2, 2)))
# Step 3 - Flattening
classifier.add(Flatten())
# Step 4 - Full connection
classifier.add(Dense(units = 128, activation = 'relu'))
# Single sigmoid unit: the output is a score between 0 and 1 for the binary label.
classifier.add(Dense(units = 1, activation = 'sigmoid'))
# Compiling the CNN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Part 2 - Fitting the CNN to the images
from keras.preprocessing.image import ImageDataGenerator
import numpy
# Both generators rescale pixel values by 1/255; only the training one augments
# (shear and zoom). Any later prediction input needs the same rescaling.
train_datagen = ImageDataGenerator(rescale = 1./255, shear_range = 0.2, zoom_range = 0.2, horizontal_flip = False)
test_datagen = ImageDataGenerator(rescale = 1./255)
training_set = train_datagen.flow_from_directory('./dataset/training_set',
target_size = (115, 45),
batch_size = 32,
class_mode = 'binary')
test_set = test_datagen.flow_from_directory('./dataset/test_set',
target_size = (115, 45),
batch_size = 32,
class_mode = 'binary')
# NOTE(review): both step counts are fractional under Python 3 true division
# (1939/32, 648/32); under Python 2 they floor to 60 and 20 — confirm intent.
classifier.fit_generator(training_set,
steps_per_epoch = 1939/32, # total samples / batch size
epochs = 32,
validation_data = test_set,
validation_steps = 648/32)
# Evaluate on the test generator (the returned metrics are discarded here),
# print the layer summary, then save the classifier.
classifier.evaluate_generator(generator=test_set)
classifier.summary()
classifier.save('./classifier.h5')
Prediction code:
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
from keras.preprocessing import image
from shutil import copyfile
# Load the trained classifier and sort every image in `folder` into left/right.
classifier = load_model('./classifier.h5')
folder = './small/'
files = os.listdir(folder)
pleft = 0
pright = 0
for f in files:
    test_image = image.load_img(folder+f, target_size = (115, 45))
    test_image = image.img_to_array(test_image)
    # The training and validation generators rescale pixels by 1/255; without
    # the same scaling here the predictions do not match val_acc.
    test_image /= 255.
    test_image = np.expand_dims(test_image, axis = 0)
    result = classifier.predict(test_image)
    # The sigmoid output is a score in [0, 1]; threshold it instead of testing
    # for exact equality with 1.
    # NOTE(review): assumes 'right' is class index 1 — verify with
    # training_set.class_indices.
    if result[0][0] > 0.5:
        pright = pright + 1
        prediction = 'right'
        copyfile(folder+'../'+f, '/found_right/'+f)
    else:
        prediction = 'left'
        copyfile(folder+'../'+f, '/found_left/'+f)
        pleft = pleft + 1
ptot = pleft + pright
# Float arithmetic avoids the integer-division error of pleft / (ptot / 100)
# and the division by zero it caused for fewer than 100 files.
pct_left = 100.0 * pleft / ptot if ptot else 0.0
print('Left = ' + str(pleft) + ' (' + ('%.1f' % pct_left) + '%)')
print('Right = ' + str(pright))
print('Total = ' + str(ptot))
Output:
Left = 478 (79%)
Right = 170
Total = 648
Your help will be much appreciated.
I resolved this issue by doing two things:
As @Matias Valdenegro suggested, I had to rescale the image values before predicting: I added test_image /= 255. before calling predict().
As my val_loss was still a bit high, I added an EarlyStopping callback as well as two Dropout() before my Dense layers.
My prediction results are now consistent with the ones obtained during training/validation.

LSTM labeling all samples as the same class

I'm trying to design an LSTM network using Keras to combine word embeddings and other features in a binary classification setting. My test set contains 250 samples per class.
When I run my model using only the word embedding layers (the "model" layer in the code), I get an average F1 of around 0.67. When I create a new branch with the other features of fixed size that I compute separately ("branch2") and merge these with the word embeddings using "concat", the predictions all revert to a single class (giving perfect recall for that class), and average F1 drops to 0.33.
Am I adding in the features and training/testing incorrectly?
def create_model(embedding_index, sequence_features, optimizer='rmsprop'):
    """Build the two-branch classifier and return it compiled.

    Branch 1 embeds the word sequence and passes it through a 1-D convolution,
    max-pooling, a bidirectional LSTM, dropout and a 2-unit sigmoid layer.
    Branch 2 maps the fixed-size extra features through a 15-unit tanh layer
    with batch normalisation. The branches are concatenated and reduced to a
    2-unit sigmoid output.

    Args:
        embedding_index: word vectors forwarded to create_embedding_matrix.
        sequence_features: 2-D array of extra features; only its column count
            is used, to size the second branch's input.
        optimizer: optimizer passed to compile().

    Returns:
        The compiled merged model.
    """
    # Branch 1: word embeddings
    text_branch = Sequential()
    text_branch.add(create_embedding_matrix(embedding_index, word_index))
    text_branch.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='tanh'))
    text_branch.add(MaxPooling1D(pool_length=2))
    text_branch.add(Bidirectional(LSTM(100)))
    text_branch.add(Dropout(0.2))
    text_branch.add(Dense(2, activation='sigmoid'))

    # Branch 2: other features
    feature_branch = Sequential()
    feature_dim = sequence_features.shape[1]
    feature_branch.add(Dense(15, input_dim=feature_dim, init='normal', activation='tanh'))
    feature_branch.add(BatchNormalization())

    # Merging branches to create final model
    merged = Sequential()
    merged.add(Merge([text_branch, feature_branch], mode='concat'))
    merged.add(Dense(2, init='normal', activation='sigmoid'))
    reported = ['accuracy', 'precision', 'recall', 'fbeta_score', 'fmeasure']
    merged.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=reported)
    return merged
def run(input_train, input_dev, input_test, text_col, label_col, resfile, embedding_index):
    """Train the merged model on the training input and report test-set scores.

    Vectorises the text, extracts the extra sequence features, fits the model
    for 3 epochs while checkpointing the best val_fmeasure weights, then
    prints micro and macro F1 and a classification report for the test set.
    `input_dev` and `resfile` are accepted but not used.
    """
    # Processing text and features
    x_train, y_train, x_test, y_test = vectorize_text(input_train, input_test, text_col, label_col)
    seq_train = get_sequence_features(input_train).as_matrix()
    seq_test = get_sequence_features(input_test).as_matrix()

    # Generating model
    best_weights = ModelCheckpoint(lstm_config.WEIGHTS_PATH, monitor='val_fmeasure',
                                   verbose=1, save_best_only=True, mode='max')
    model = create_model(embedding_index, seq_train)
    # The last 33% of the training rows are held out for validation.
    model.fit([x_train, seq_train], y_train,
              validation_split=0.33, nb_epoch=3, batch_size=100,
              callbacks=[best_weights], verbose=1)

    # Evaluating
    model.evaluate([x_test, seq_test], y_test, verbose=1)
    time.sleep(0.2)
    preds = to_categorical(model.predict_classes([x_test, seq_test]))
    for averaging in ("micro", "macro"):
        print(metrics.f1_score(y_true=y_test, y_pred=preds, average=averaging))
    print(metrics.classification_report(y_test, preds))
Output:
Using Theano backend. Found 2999999 word vectors.
Processing text dataset Found 7165 unique tokens.
Shape of data tensor: (1996, 50)
Shape of label tensor: (1996, 2)
1996 train 500 test
Train on 1337 samples, validate on 659 samples
Epoch 1/3 1300/1337
[============================>.] - ETA: 0s - loss: 0.6767 - acc:
0.6669 - precision: 0.5557 - recall: 0.6815 - fbeta_score: 0.6120 - fmeasure: 0.6120Epoch 00000: val_fmeasure im1337/1337
[==============================] - 10s - loss: 0.6772 - acc: 0.6672 -
precision: 0.5551 - recall: 0.6806 - fbeta_score: 0.6113 - fmeasure:
0.6113 - val_loss: 0.7442 - val_acc: 0 .0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - val_fbeta_score: 0.0000e+00 - val_fmeasure: 0.0000e+00
Epoch 2/3 1300/1337
[============================>.] - ETA: 0s - loss: 0.6634 - acc:
0.7269 - precision: 0.5819 - recall: 0.7292 - fbeta_score: 0.6462 - fmeasure: 0.6462Epoch 00001: val_fmeasure di1337/1337
[==============================] - 9s - loss: 0.6634 - acc: 0.7263 -
precision: 0.5830 - recall: 0.7300 - fbeta_score: 0.6472 - fmeasure:
0.6472 - val_loss: 0.7616 - val_acc: 0. 0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - val_fbeta_score: 0.0000e+00 - val_fmeasure: 0.0000e+00
Epoch 3/3 1300/1337
[============================>.] - ETA: 0s - loss: 0.6542 - acc:
0.7354 - precision: 0.5879 - recall: 0.7308 - fbeta_score: 0.6508 - fmeasure: 0.6508Epoch 00002: val_fmeasure di1337/1337
[==============================] - 8s - loss: 0.6545 - acc: 0.7337 -
precision: 0.5866 - recall: 0.7307 - fbeta_score: 0.6500 - fmeasure:
0.6500 - val_loss: 0.7801 - val_acc: 0. 0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - val_fbeta_score: 0.0000e+00 - val_fmeasure: 0.0000e+00 500/500 [==============================] - 0s
500/500 [==============================] - 1s
0.5 /usr/local/lib/python3.4/dist-packages/sklearn/metrics/classification.py:1074:
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in
labels with no predicted samples. 'precision', 'predicted', average,
warn_for)
0.333333333333 /usr/local/lib/python3.4/dist-packages/sklearn/metrics/classification.py:1074:
UndefinedMetricWarning: Precision and F-score are ill-defined and
being set to 0.0 in labels with no predicted samples.
precision recall f1-score support
0 0.00 0.00 0.00 250
1 0.50 1.00 0.67 250
avg / total 0.25 0.50 0.33 500

Resources