How to implement the DQN algorithm correctly

I'm trying to implement the Deep Q Learning algorithm introduced by DeepMind in this paper:
https://arxiv.org/pdf/1312.5602.pdf
I'm using it to train an agent to play Pong, but it doesn't seem to work (even after 2 hours of training I'm not seeing any improvement). This is the code:
import gym
import universe
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Activation
from keras.models import load_model
import random

env = gym.make('gym-core.Pong-v0')
env.configure(remotes=1)

def num2str(number, obs):
    number = np.argmax(number)
    if number == 0:
        action = [[('KeyEvent', 'ArrowRight', False), ('KeyEvent', 'ArrowLeft', True)] for ob in obs]
    elif number == 1:
        action = [[('KeyEvent', 'ArrowLeft', False), ('KeyEvent', 'ArrowRight', True)] for ob in obs]
    return action

def preprocess(original_obs):
    obs = original_obs
    obs = np.array(obs)[0]['vision']
    obs = np.delete(obs, np.s_[195:769], axis=0)
    obs = np.delete(obs, np.s_[0:35], axis=0)
    obs = np.delete(obs, np.s_[160:1025], axis=1)
    obs = np.mean(obs, axis=2)
    obs = obs[::2, ::2]
    obs = np.reshape(obs, (80, 80, 1))
    return obs

model = Sequential()
model.add(Conv2D(32, kernel_size=(8, 8), strides=(4, 4), border_mode='same', activation='relu', init='uniform', input_shape=(80, 80, 4)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1)))
model.add(Flatten())
model.add(Dense(256, init='uniform', activation='relu'))
model.add(Dense(2, init='uniform', activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

init_observe_time = 500
D = []
e = 1.0
e_threshold = 0.05
e_decay = 0.01
gamma = 0.99
batch_size = 15
frequency = 10
Q_values = np.array([0, 0])

obs = env.reset()
while True:
    obs = env.step(num2str(np.array([random.randint(0, 1) for i in range(0, 2)]), obs))[0]
    if obs != [None]:
        break

x_t1 = preprocess(obs)
s_t1 = np.stack((x_t1, x_t1, x_t1, x_t1), axis=2)
s_t1 = np.reshape(s_t1, (80, 80, 4))

t = 0
while True:
    print("Time since last start: ", t)
    a_t = np.zeros(2)
    if random.random() < e:
        a_index = random.randint(0, 1)
        a_t[a_index] = 1
    else:
        Q_values = model.predict(np.array([s_t1]))[0]
        a_index = np.argmax(Q_values)
        a_t[a_index] = 1
    print("Q Values: ", Q_values)
    print("action taken: ", np.argmax(a_t))
    print("epsilon: ", e)
    if e > e_threshold:
        e -= e_decay
    obs, r_t, done, info = env.step(num2str(a_t, obs))
    if obs == [None]:
        continue
    x_t2 = preprocess(obs)
    print(x_t2.shape, s_t1[:, :, 0:3].shape)
    s_t2 = np.append(x_t2, s_t1[:, :, 0:3], axis=2)
    D.append((s_t1, a_t, r_t, s_t2, done))
    if t > init_observe_time and t % frequency == 0:
        minibatch = random.sample(D, batch_size)
        s1_batch = [i[0] for i in minibatch]
        a_batch = [i[1] for i in minibatch]
        r_batch = [i[2] for i in minibatch]
        s2_batch = [i[3] for i in minibatch]
        q_batch = model.predict(np.array(s2_batch))
        y_batch = np.zeros((batch_size, 2))
        y_batch = model.predict(np.array(s1_batch))
        print("Q batch: ", q_batch)
        print("y batch: ", y_batch)
        for i in range(0, batch_size):
            if minibatch[i][4]:
                y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0]
            else:
                y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0] + gamma * np.max(q_batch[i])
        model.train_on_batch(np.array(s1_batch), y_batch)
    s_t1 = s_t2
    t += 1
    env.render()
Does anyone have any suggestions on how to make it work properly?

Your second and third Conv2D layers appear to be missing their relu activations.
Your epsilon (or e) decays way too quickly. After only 95 time steps it will already be down to 0.05. I can't quickly find what they did in that 2013 paper, but in the 2015 paper they decay it from 1 to 0.1 over 1 million frames.
Those are the two things that immediately jump out at me. I'd recommend starting by fixing those.
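For illustration, here is a minimal sketch of those two fixes; the 1,000,000-step annealing schedule is the one described for the 2015 paper, and the exact constants are assumptions rather than values taken from the code above:

# Sketch: anneal epsilon linearly from 1.0 to 0.1 over 1,000,000 steps
# (the schedule from the 2015 DQN paper), instead of 0.01 per step.
e_start, e_final, anneal_steps = 1.0, 0.1, 1000000
e_decay = (e_start - e_final) / anneal_steps
if e > e_final:
    e -= e_decay

# Sketch: give the second and third Conv2D layers an explicit ReLU activation.
model.add(Conv2D(64, kernel_size=(2, 2), strides=(2, 2), activation='relu'))
model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1), activation='relu'))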

Related

PyTorch linear regression loss increases

I tried to implement a simple demo of polynomial regression, but the linear model's loss fails to decrease.
I am confused about where I went wrong.
If I train the model on one sample at a time (batch size = 1), it works fine, but when I feed the model many samples at a time, the loss increases and goes to inf.
import numpy as np
import torch
import math
from matplotlib import pyplot as plt

def rand_series(size):
    x = np.linspace(-100, 100, size)
    np.random.shuffle(x)
    base_y = 20 * np.sin(2 * math.pi / 200 * x)
    y = base_y + 10 * np.random.rand(size)
    return x, y

def rescale_vec(vector):
    vec_as_tensor = torch.tensor(vector, dtype=torch.float32)
    max_in_vec = torch.max(vec_as_tensor)
    min_in_vec = torch.min(vec_as_tensor)
    if max_in_vec - min_in_vec == 0:
        return torch.ones(vec_as_tensor.size(), dtype=torch.float32)
    else:
        return (vec_as_tensor - min_in_vec) / (max_in_vec - min_in_vec)

def rescale(vectors):
    if len(vectors.shape) == 1:
        return rescale_vec(vectors)
    nor_vecs = torch.empty(vectors.shape)
    for i in range(vectors.shape[0]):
        nor_vecs[i] = rescale_vec(vectors[i])
    return nor_vecs

class LinearRegression(torch.nn.Module):
    def __init__(self, power=4):
        super().__init__()
        self.layer = torch.nn.Linear(power, 1)

    def forward(self, x):
        return self.layer(x)

def regression(x_, y_, learning_rate):
    x = torch.t(torch.tensor(x_, dtype=torch.float32))
    y = torch.tensor(y_, dtype=torch.float32)
    dim_size = x.size()[1]
    print(dim_size, x.size())
    model = LinearRegression(dim_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    loss_func = torch.nn.MSELoss(reduction='sum')
    batch_size = 400
    for round in range(50):
        sample_indices = torch.randint(0, len(x), (batch_size, ))
        x_samples = torch.index_select(x, 0, sample_indices)
        y_samples = torch.index_select(y, 0, sample_indices)
        optimizer.zero_grad()
        y_hat = model(x_samples.view(-1, dim_size))
        loss = loss_func(y_hat, y_samples)
        print(loss.item())
        loss.backward()
        optimizer.step()
    return model

x_one, y = rand_series(1000)
b = np.ones(len(x_one))
x = np.array([b, x_one, x_one ** 2, x_one ** 3, x_one ** 4, x_one ** 5])
model = regression(rescale(x), torch.tensor(y, dtype=torch.float32), 0.002)

nor_x = rescale(x)
y_hat = model(torch.t(torch.tensor(x, dtype=torch.float32)))
plt.scatter(x_one, y)
plt.scatter(x_one, y_hat.data, c='red')
plt.show()
the loss:
4.7375866968775066e+19
1.6979300048622735e+26
6.0214270068868396e+32
inf
inf
inf
You need to use loss_func = torch.nn.MSELoss(reduction='mean') to stop the loss from blowing up. A batch of one or two samples seems to work only because the summed loss is still small enough; with more epochs you would see the loss tend exponentially to infinity.
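A rough sketch of the difference, with illustrative tensors only: under reduction='sum' the loss (and hence the gradient magnitude) scales with the batch size, so a fixed learning rate that works for 1-2 samples can diverge at batch size 400, while reduction='mean' keeps the loss per-sample:

import torch

pred = torch.zeros(400, 1, requires_grad=True)
target = torch.ones(400, 1)

loss_sum = torch.nn.MSELoss(reduction='sum')(pred, target)    # 400.0 -- grows with batch size
loss_mean = torch.nn.MSELoss(reduction='mean')(pred, target)  # 1.0   -- independent of batch size
print(loss_sum.item(), loss_mean.item())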

Wasserstein GAN problem with last discriminator layer and clipping

When I use a linear (or no) activation in the last Discriminator layer with weight clipping, the Discriminator accuracy goes to 1 and the Generator accuracy goes to 0. When I remove weight clipping, the Generator accuracy goes to 1 and the Discriminator accuracy goes to 0 at around 300 iterations. But when I use a sigmoid activation in the last Discriminator layer with clipping, the Generator accuracy goes to 1, and without clipping the generator loss gets stuck while the accuracies stay around 0.5, as they should.
NOTE - in all cases, results are produced and all of them show WARNING:tensorflow:Discrepancy between trainable weights and collected trainable weights, did you set model.trainable without calling model.compile after ?
The code is given below (please don't mind the indentation; it got mangled when copying and pasting):
class WGAN():
    def __init__(self,
                 input_dim,
                 disc_filter,
                 disc_kernel,
                 disc_strides,
                 disc_dropout,
                 disc_lr,
                 gen_filter,
                 gen_kernel,
                 gen_strides,
                 gen_upsample,
                 gen_lr,
                 z_dim,
                 batch_size):
        self.input_dim = input_dim
        self.disc_filter = disc_filter
        self.disc_kernel = disc_kernel
        self.disc_strides = disc_strides
        self.disc_dropout = disc_dropout
        self.disc_lr = disc_lr
        self.gen_filter = gen_filter
        self.gen_kernel = gen_kernel
        self.gen_strides = gen_strides
        self.gen_upsample = gen_upsample
        self.gen_lr = gen_lr
        self.z_dim = z_dim
        self.batch_size = batch_size
        self.weight_init = RandomNormal(mean=0., stddev=0.02)
        self.d_losses = []
        self.g_losses = []
        self.epoch = 0
        self.Discriminator()
        self.Generator()
        self.full_model()

    def wasserstein(self, y_true, y_pred):
        return -K.mean(y_true * y_pred)

    def Discriminator(self):
        disc_input = Input(shape=self.input_dim, name='discriminator_input')
        x = disc_input
        for i in range(len(self.disc_filter)):
            x = Conv2D(filters=self.disc_filter[i], kernel_size=self.disc_kernel[i], strides=self.disc_strides[i], padding='same', name='disc_'+str(i))(x)
            x = LeakyReLU()(x)
            x = Dropout(self.disc_dropout)(x)
            x = BatchNormalization()(x)
        x = Flatten()(x)
        disc_output = Dense(1, activation='sigmoid', kernel_initializer=self.weight_init)(x)
        self.discriminator = Model(disc_input, disc_output)

    def Generator(self):
        gen_input = Input(shape=(self.z_dim,), name='generator_input')
        x = gen_input
        x = Dense(7*7*self.batch_size, kernel_initializer=self.weight_init)(x)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = Reshape(target_shape=(7, 7, self.batch_size))(x)
        for i in range(len(self.gen_filter)):
            if self.gen_upsample[i] == 2:
                x = UpSampling2D(size=self.gen_upsample[i], name='upsample_'+str(i/2))(x)
                x = Conv2D(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
            else:
                x = Conv2DTranspose(filters=self.gen_filter[i], kernel_size=self.gen_kernel[i], strides=self.gen_strides[i], padding='same', name='gen_'+str(i))(x)
            if i < len(self.gen_filter)-1:
                x = BatchNormalization()(x)
                x = LeakyReLU()(x)
            else:
                x = Activation("tanh")(x)
        gen_output = x
        self.generator = Model(gen_input, gen_output)

    def set_trainable(self, model, val):
        model.trainable = val
        for l in model.layers:
            l.trainable = val

    def full_model(self):
        ### COMPILE DISCRIMINATOR
        self.discriminator.compile(optimizer=Adam(self.disc_lr), loss=self.wasserstein, metrics=['accuracy'])
        ### COMPILE THE FULL GAN
        self.set_trainable(self.discriminator, False)
        self.discriminator.compile(optimizer=Adam(self.disc_lr), loss=self.wasserstein, metrics=['accuracy'])
        model_input = Input(shape=(self.z_dim,), name='model_input')
        model_output = self.discriminator(self.generator(model_input))
        self.model = Model(model_input, model_output)
        self.model.compile(optimizer=Adam(self.disc_lr), loss=self.wasserstein, metrics=['accuracy'])
        self.set_trainable(self.discriminator, True)

    def train_generator(self, batch_size):
        valid = np.ones((batch_size, 1))
        noise = np.random.normal(0, 1, (batch_size, self.z_dim))
        return self.model.train_on_batch(noise, valid)

    def train_discriminator(self, x_train, batch_size, using_generator):
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        if using_generator:
            true_imgs = next(x_train)[0]
            if true_imgs.shape[0] != batch_size:
                true_imgs = next(x_train)[0]
        else:
            idx = np.random.randint(0, x_train.shape[0], batch_size)
            true_imgs = x_train[idx]
        noise = np.random.normal(0, 1, (batch_size, self.z_dim))
        gen_imgs = self.generator.predict(noise)
        d_loss_real, d_acc_real = self.discriminator.train_on_batch(true_imgs, valid)
        d_loss_fake, d_acc_fake = self.discriminator.train_on_batch(gen_imgs, fake)
        d_loss = 0.5 * (d_loss_real + d_loss_fake)
        d_acc = 0.5 * (d_acc_real + d_acc_fake)
        for l in self.discriminator.layers:
            weights = l.get_weights()
            weights = [np.clip(w, -0.01, 0.01) for w in weights]
            l.set_weights(weights)
        return [d_loss, d_loss_real, d_loss_fake, d_acc, d_acc_real, d_acc_fake]

    def train(self, x_train, batch_size, epochs, print_every_n_batches=50, using_generator=False):
        for epoch in range(self.epoch, self.epoch + epochs):
            d = self.train_discriminator(x_train, batch_size, using_generator)
            g = self.train_generator(batch_size)
            if self.epoch % print_every_n_batches == 0:
                print("%d [D loss: (%.3f)(R %.3f, F %.3f)] [D acc: (%.3f)(%.3f, %.3f)] [G loss: %.3f] [G acc: %.3f]" % (epoch, d[0], d[1], d[2], d[3], d[4], d[5], g[0], g[1]))
            self.d_losses.append(d)
            self.g_losses.append(g)
            self.epoch += 1

run() got an unexpected keyword argument 'feed'

I'm getting started with TensorFlow and I'm trying to read handwritten digits from MNIST. I get an error in my code, but I don't understand why. I found a post that is similar to this one, but I get the same error with that code. (Link to that topic: TensorFlow Cannot feed value of shape (100, 784) for Tensor 'Placeholder:0')
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

X = tf.placeholder(tf.float32, [None, 28, 28, 1])
W = tf.Variable(tf.zeros([784, 10]))
B = tf.Variable(tf.zeros([10]))
init = tf.global_variables_initializer()

# Model
Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, 784]), W) + B)
# Placeholder for the correct answers
Y_ = tf.placeholder(tf.float32, [None, 10])
# Error calculation
cross_entropy = -tf.reduce_sum(Y_ * tf.log(Y))
# Percentage of correct answers
is_correct = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
# Gradient descent optimizer
optimizer = tf.train.GradientDescentOptimizer(0.003)
train_step = optimizer.minimize(cross_entropy)

# Training process
sess = tf.Session()
sess.run(init)
for i in range(1000):
    # Load the images
    batch_X, batch_Y = mnist.train.next_batch(100)
    batch_X = np.reshape(batch_X, (-1, 28, 28, 1))
    train_data = {X: batch_X, Y_: batch_Y}
    # Train
    sess.run(train_step, feed_dict=train_data)
    # Success on training data?
    a, c = sess.run([accuracy, cross_entropy], feed_dict=train_data)
    # Success on test data?
    test_data = {X: mnist.test.images, Y_: mnist.test.labels}
    a, c = sess.run([accuracy, cross_entropy], feed=test_data)
Change the last lines to:
test_images = np.reshape(mnist.test.images, (-1, 28, 28, 1))
test_data = {X: test_images, Y_: mnist.test.labels}
a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
This reshapes the test images to match the [None, 28, 28, 1] placeholder and passes them with feed_dict, which is the keyword sess.run actually accepts (there is no feed argument).

Classifying sequences with different lengths with error batching

I'm using Keras with the TensorFlow backend. I've just figured out how to train and classify sequences of different lengths without masking, because I can't get masking to work. In the toy example I'm working with, I'm trying to train an LSTM to detect whether a sequence of arbitrary length starts with a 1 or not.
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

def gen_sig(num_samples, seq_len):
    one_indices = np.random.choice(a=num_samples, size=num_samples // 2, replace=False)
    x_val = np.zeros((num_samples, seq_len), dtype=np.bool)
    x_val[one_indices, 0] = 1
    y_val = np.zeros(num_samples, dtype=np.bool)
    y_val[one_indices] = 1
    return x_val, y_val

N_train = 100
N_test = 10
recall_len = 20

X_train, y_train = gen_sig(N_train, recall_len)
X_test, y_test = gen_sig(N_train, recall_len)

print('Build STATEFUL model...')
model = Sequential()
model.add(LSTM(10, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
for epoch in range(15):
    mean_tr_acc = []
    mean_tr_loss = []
    for seq_idx in range(X_train.shape[0]):
        start_val = X_train[seq_idx, 0]
        assert y_train[seq_idx] == start_val
        assert tuple(np.nonzero(X_train[seq_idx, :]))[0].shape[0] == start_val
        y_in = np.array([y_train[seq_idx]], dtype=np.bool)
        for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
            x_in = np.array([[[X_train[seq_idx][j]]]])
            tr_loss, tr_acc = model.train_on_batch(x_in, y_in)
            mean_tr_acc.append(tr_acc)
            mean_tr_loss.append(tr_loss)
        model.reset_states()
    print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
    print('loss training = {}'.format(np.mean(mean_tr_loss)))
    print('___________________________________')

    mean_te_acc = []
    mean_te_loss = []
    for seq_idx in range(X_test.shape[0]):
        start_val = X_test[seq_idx, 0]
        assert y_test[seq_idx] == start_val
        assert tuple(np.nonzero(X_test[seq_idx, :]))[0].shape[0] == start_val
        y_in = np.array([y_test[seq_idx]], dtype=np.bool)
        for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
            te_loss, te_acc = model.test_on_batch(np.array([[[X_test[seq_idx][j]]]], dtype=np.bool), y_in)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
        model.reset_states()
    print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
    print('loss testing = {}'.format(np.mean(mean_te_loss)))
    print('___________________________________')
As seen in the code, my error is being batched over each time-step. This is bad for multiple reasons. How do I train the network in two steps? For example (a rough sketch follows the list):
Run a bunch of values through the network to accumulate the error
Adjust the weights of the network given this accumulated error
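One way to get a single weight update per sequence (rather than per time-step) is to feed the whole variable-length sequence as one batch to a non-stateful LSTM. A minimal sketch, assuming input_shape=(None, 1) so any sequence length is accepted; this is illustrative only, not the setup used above:

from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

seq_model = Sequential()
seq_model.add(LSTM(10, input_shape=(None, 1)))     # None -> accepts any sequence length
seq_model.add(Dense(1, activation='sigmoid'))
seq_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

x_seq = np.random.randint(0, 2, size=(1, 13, 1)).astype('float32')  # one 13-step sequence
y_seq = np.array([x_seq[0, 0, 0]])                                  # label: does it start with a 1?
loss, acc = seq_model.train_on_batch(x_seq, y_seq)                  # one update for the whole sequence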
To do what is described in the original question, the easiest way is to train the original network with masking, but then test with a stateful network so any length input can be classified:
import numpy as np
np.random.seed(1)
import tensorflow as tf
tf.set_random_seed(1)
from keras import models
from keras.layers import Dense, Masking, LSTM
import matplotlib.pyplot as plt

def stateful_model():
    hidden_units = 256
    model = models.Sequential()
    model.add(LSTM(hidden_units, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
    model.add(Dense(1, activation='relu', name='output'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    return model

def train_rnn(x_train, y_train, max_len, mask):
    epochs = 10
    batch_size = 200
    vec_dims = 1
    hidden_units = 256
    in_shape = (max_len, vec_dims)
    model = models.Sequential()
    model.add(Masking(mask, name="in_layer", input_shape=in_shape,))
    model.add(LSTM(hidden_units, return_sequences=False))
    model.add(Dense(1, activation='relu', name='output'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_split=0.05)
    return model

def gen_train_sig_cls_pair(t_stops, num_examples, mask):
    x = []
    y = []
    max_t = int(np.max(t_stops))
    for t_stop in t_stops:
        one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
        sig = np.zeros((num_examples, max_t), dtype=np.int8)
        sig[one_indices, 0] = 1
        sig[:, t_stop:] = mask
        x.append(sig)
        cls = np.zeros(num_examples, dtype=np.bool)
        cls[one_indices] = 1
        y.append(cls)
    return np.concatenate(x, axis=0), np.concatenate(y, axis=0)

def gen_test_sig_cls_pair(t_stops, num_examples):
    x = []
    y = []
    for t_stop in t_stops:
        one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
        sig = np.zeros((num_examples, t_stop), dtype=np.bool)
        sig[one_indices, 0] = 1
        x.extend(list(sig))
        cls = np.zeros((num_examples, t_stop), dtype=np.bool)
        cls[one_indices] = 1
        y.extend(list(cls))
    return x, y

if __name__ == '__main__':
    noise_mag = 0.01
    mask_val = -10
    signal_lengths = (10, 15, 20)
    x_in, y_in = gen_train_sig_cls_pair(signal_lengths, 10, mask_val)
    mod = train_rnn(x_in[:, :, None], y_in, int(np.max(signal_lengths)), mask_val)
    testing_dat, expected = gen_test_sig_cls_pair(signal_lengths, 3)
    state_mod = stateful_model()
    state_mod.set_weights(mod.get_weights())
    res = []
    for s_i in range(len(testing_dat)):
        seq_in = list(testing_dat[s_i])
        seq_len = len(seq_in)
        for t_i in range(seq_len):
            res.extend(state_mod.predict(np.array([[[seq_in[t_i]]]])))
        state_mod.reset_states()
    fig, axes = plt.subplots(2)
    axes[0].plot(np.concatenate(testing_dat), label="input")
    axes[1].plot(res, "ro", label="result", alpha=0.2)
    axes[1].plot(np.concatenate(expected, axis=0), "bo", label="expected", alpha=0.2)
    axes[1].legend(bbox_to_anchor=(1.1, 1))
    plt.show()

Has anyone successfully trained Squeezenet with residual connections?

I have trained the two versions of SqueezeNet, both with success, thanks #forresti !
When training the one with residual connections, I am stuck. Whatever learning policy I use, the one shipped in this repo or a plain step policy, I cannot train it to the results given in the paper. The accuracy is a bit lower than SqueezeNet v1.0...
I know that I should post this in that repo, but I can't find an issues tab there...
Could anyone shed some light on this? Thanks in advance!
====================EDIT=============================
I first adopted the solver hyperparameters shipped with SqueezeNet v1.0. Then I changed the learning policy from poly to step, keeping the remaining parameters untouched, and closely monitored the loss and accuracy; when they became apparently flat, I changed the learning rate by a factor of 0.4. In both cases I got top-5 accuracies of 81.9x% and 79.8x%, lower than the benchmark given in the paper, which seems rather weird...
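As an aside, that "drop the learning rate by a factor of 0.4 whenever the curves flatten" schedule can be expressed in Keras (the framework used in the answer below) with the ReduceLROnPlateau callback; the monitor and patience values here are assumptions, for illustration only:

from keras.callbacks import ReduceLROnPlateau

# Multiply the learning rate by 0.4 whenever the monitored metric stops improving,
# mirroring the manual schedule described above (monitor/patience are illustrative).
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=5, verbose=1)
# model.fit(x_train, y_train, validation_data=(x_val, y_val), callbacks=[lr_schedule])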
You can use the newest SqueezeNet v1.1 from: https://github.com/rcmalli/keras-squeezenet
Model Definition:
from keras import backend as K
from keras.layers import Input, Convolution2D, MaxPooling2D, Activation, concatenate, Dropout
from keras.layers import GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.models import Model
from keras.utils.layer_utils import get_source_inputs  # https://stackoverflow.com/questions/68862735/keras-vggface-no-module-named-keras-engine-topology
from tensorflow.keras.utils import get_file
from keras.utils import layer_utils

sq1x1 = "squeeze1x1"
exp1x1 = "expand1x1"
exp3x3 = "expand3x3"
relu = "relu_"

WEIGHTS_PATH = "https://github.com/rcmalli/keras-squeezenet/releases/download/v1.0/squeezenet_weights_tf_dim_ordering_tf_kernels.h5"
WEIGHTS_PATH_NO_TOP = "https://github.com/rcmalli/keras-squeezenet/releases/download/v1.0/squeezenet_weights_tf_dim_ordering_tf_kernels_notop.h5"

# Modular function for Fire Node
def fire_module(x, fire_id, squeeze=16, expand=64):
    s_id = 'fire' + str(fire_id) + '/'
    if K.image_data_format() == 'channels_first':
        channel_axis = 1
    else:
        channel_axis = 3
    x = Convolution2D(squeeze, (1, 1), padding='valid', name=s_id + sq1x1)(x)
    x = Activation('relu', name=s_id + relu + sq1x1)(x)
    left = Convolution2D(expand, (1, 1), padding='valid', name=s_id + exp1x1)(x)
    left = Activation('relu', name=s_id + relu + exp1x1)(left)
    right = Convolution2D(expand, (3, 3), padding='same', name=s_id + exp3x3)(x)
    right = Activation('relu', name=s_id + relu + exp3x3)(right)
    x = concatenate([left, right], axis=channel_axis, name=s_id + 'concat')
    return x

# Original SqueezeNet from paper.
def SqueezeNet(include_top=True, weights='imagenet',
               input_tensor=None, input_shape=None,
               pooling=None,
               classes=1000):
    """Instantiates the SqueezeNet architecture."""
    if weights not in {'imagenet', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `imagenet` '
                         '(pre-training on ImageNet).')
    input_shape = input_shape
    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    x = Convolution2D(64, (3, 3), strides=(2, 2), padding='valid', name='conv1')(img_input)
    x = Activation('relu', name='relu_conv1')(x)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x)
    x = fire_module(x, fire_id=2, squeeze=16, expand=64)
    x = fire_module(x, fire_id=3, squeeze=16, expand=64)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x)
    x = fire_module(x, fire_id=4, squeeze=32, expand=128)
    x = fire_module(x, fire_id=5, squeeze=32, expand=128)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x)
    x = fire_module(x, fire_id=6, squeeze=48, expand=192)
    x = fire_module(x, fire_id=7, squeeze=48, expand=192)
    x = fire_module(x, fire_id=8, squeeze=64, expand=256)
    x = fire_module(x, fire_id=9, squeeze=64, expand=256)

    if include_top:
        # It's not obvious where to cut the network...
        # Could do the 8th or 9th layer... some work recommends cutting earlier layers.
        x = Dropout(0.5, name='drop9')(x)
        x = Convolution2D(classes, (1, 1), padding='valid', name='conv10')(x)
        x = Activation('relu', name='relu_conv10')(x)
        x = GlobalAveragePooling2D()(x)
        x = Activation('softmax', name='loss')(x)
    else:
        if pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D()(x)
        elif pooling == None:
            pass
        else:
            raise ValueError("Unknown argument for 'pooling'=" + pooling)
    # x = Dense(10, activation='softmax')(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    model = Model(inputs, x, name='squeezenet')

    # load weights
    if weights == 'imagenet':
        if include_top:
            weights_path = get_file('squeezenet_weights_tf_dim_ordering_tf_kernels.h5',
                                    WEIGHTS_PATH,
                                    cache_subdir='models')
        else:
            weights_path = get_file('squeezenet_weights_tf_dim_ordering_tf_kernels_notop.h5',
                                    WEIGHTS_PATH_NO_TOP,
                                    cache_subdir='models')
        model.load_weights(weights_path)
        if K.backend() == 'theano':
            layer_utils.convert_all_kernels_in_model(model)
    return model
Example Usage:
import numpy as np
from keras_squeezenet import SqueezeNet
from keras.applications.imagenet_utils import preprocess_input, decode_predictions
from keras.preprocessing import image
model = SqueezeNet()
img = image.load_img('../images/cat.jpeg', target_size=(227, 227))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
preds = model.predict(x)
print('Predicted:', decode_predictions(preds))
