Has anyone successfully trained Squeezenet with residual connections? - machine-learning

I have trained the two versions of Squeezenet, both with success, thanks #forresti !
When training the one with residual connections, I am stuck. Whatever learning policy I use, whether the one shipped in this repo or a plain step policy, I cannot train it to the results given in the paper. The accuracy is a bit lower than SqueezeNet v1.0....
I know that I should post this in that repo, but I can't find issues tab there....
Anyone could shed me some light? Thanks in advance!
====================EDIT=============================
I firstly adopted the solver hyperparameters shipped with SqueezeNet-v1.0. Then, I changed the learning policy from poly to step, keeping the remaining parameters untouched and closely monitored the loss and accuracy, when they became apparently flat, I changed the learning rate by a factor of 0.4. In both cases, I got top-5 accuracies 81.9x% and 79.8x%, lower than the benchmark provided in the paper, seems rather weird....

You can use the newest SqueezeNet v1.1 version of SqueezeNet from: https://github.com/rcmalli/keras-squeezenet
Model Definition:
from keras import backend as K
from keras.layers import Input, Convolution2D, MaxPooling2D, Activation, concatenate, Dropout
from keras.layers import GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.models import Model
from keras.utils.layer_utils import get_source_inputs #https://stackoverflow.com/questions/68862735/keras-vggface-no-module-named-keras-engine-topology
from tensorflow.keras.utils import get_file
from keras.utils import layer_utils
# Layer-name fragments; fire_module concatenates them after a 'fire<N>/' prefix.
sq1x1 = "squeeze1x1"
exp1x1 = "expand1x1"
exp3x3 = "expand3x3"
relu = "relu_"
# Download URLs for the pretrained weight files: the first is used when the
# classifier head is included, the second ("notop") when it is not.
WEIGHTS_PATH = "https://github.com/rcmalli/keras-squeezenet/releases/download/v1.0/squeezenet_weights_tf_dim_ordering_tf_kernels.h5"
WEIGHTS_PATH_NO_TOP = "https://github.com/rcmalli/keras-squeezenet/releases/download/v1.0/squeezenet_weights_tf_dim_ordering_tf_kernels_notop.h5"
# Modular function for Fire Node
def fire_module(x, fire_id, squeeze=16, expand=64):
    """Apply one Fire module to ``x`` and return the concatenated result.

    A 1x1 "squeeze" convolution with ``squeeze`` filters feeds two parallel
    "expand" convolutions (1x1 and 3x3, ``expand`` filters each); their ReLU
    outputs are concatenated along the channel axis. All layers are named
    with the prefix ``'fire<fire_id>/'``.
    """
    prefix = 'fire' + str(fire_id) + '/'
    # Channel axis depends on the backend's image data format.
    channel_axis = 1 if K.image_data_format() == 'channels_first' else 3

    squeezed = Convolution2D(squeeze, (1, 1), padding='valid', name=prefix + sq1x1)(x)
    squeezed = Activation('relu', name=prefix + relu + sq1x1)(squeezed)

    branch_1x1 = Convolution2D(expand, (1, 1), padding='valid', name=prefix + exp1x1)(squeezed)
    branch_1x1 = Activation('relu', name=prefix + relu + exp1x1)(branch_1x1)

    branch_3x3 = Convolution2D(expand, (3, 3), padding='same', name=prefix + exp3x3)(squeezed)
    branch_3x3 = Activation('relu', name=prefix + relu + exp3x3)(branch_3x3)

    return concatenate([branch_1x1, branch_3x3], axis=channel_axis, name=prefix + 'concat')
# Original SqueezeNet from paper.
def SqueezeNet(include_top=True, weights='imagenet',
               input_tensor=None, input_shape=None,
               pooling=None,
               classes=1000):
    """Instantiates the SqueezeNet architecture.

    Args:
        include_top: if true, append the dropout / 1x1 convolution /
            global-average-pooling / softmax classifier head.
        weights: ``'imagenet'`` to download and load pretrained weights, or
            ``None`` for random initialization.
        input_tensor: optional tensor to use as the model input.
        input_shape: optional shape forwarded to ``Input``.
        pooling: only consulted when ``include_top`` is false; one of
            ``'avg'``, ``'max'`` or ``None`` (no pooling).
        classes: number of filters of the final 1x1 convolution in the head.

    Returns:
        A ``Model`` named ``'squeezenet'``.

    Raises:
        ValueError: for an unknown ``weights`` value, for an unknown
            ``pooling`` value when ``include_top`` is false, or when the
            ImageNet head weights are requested with ``classes != 1000``.
    """
    # Validate every argument up front so that a bad call fails before any
    # layer is created or any file is downloaded.
    if weights not in {'imagenet', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `imagenet` '
                         '(pre-training on ImageNet).')
    if weights == 'imagenet' and include_top and classes != 1000:
        # The pretrained head has 1000 output channels; any other value would
        # only fail later, inside load_weights, with a shape mismatch.
        raise ValueError('If using `weights` as imagenet with `include_top` '
                         'as true, `classes` should be 1000')
    if not include_top and pooling not in (None, 'avg', 'max'):
        # str() keeps the message intact even for non-string values.
        raise ValueError("Unknown argument for 'pooling'=" + str(pooling))

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    elif not K.is_keras_tensor(input_tensor):
        img_input = Input(tensor=input_tensor, shape=input_shape)
    else:
        img_input = input_tensor

    x = Convolution2D(64, (3, 3), strides=(2, 2), padding='valid', name='conv1')(img_input)
    x = Activation('relu', name='relu_conv1')(x)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x)

    x = fire_module(x, fire_id=2, squeeze=16, expand=64)
    x = fire_module(x, fire_id=3, squeeze=16, expand=64)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x)

    x = fire_module(x, fire_id=4, squeeze=32, expand=128)
    x = fire_module(x, fire_id=5, squeeze=32, expand=128)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x)

    x = fire_module(x, fire_id=6, squeeze=48, expand=192)
    x = fire_module(x, fire_id=7, squeeze=48, expand=192)
    x = fire_module(x, fire_id=8, squeeze=64, expand=256)
    x = fire_module(x, fire_id=9, squeeze=64, expand=256)

    if include_top:
        # It's not obvious where to cut the network...
        # Could do the 8th or 9th layer... some work recommends cutting earlier layers.
        x = Dropout(0.5, name='drop9')(x)
        x = Convolution2D(classes, (1, 1), padding='valid', name='conv10')(x)
        x = Activation('relu', name='relu_conv10')(x)
        x = GlobalAveragePooling2D()(x)
        x = Activation('softmax', name='loss')(x)
    elif pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)
    # pooling is None: the last fire module's output is returned unchanged.

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    model = Model(inputs, x, name='squeezenet')

    # load weights
    if weights == 'imagenet':
        if include_top:
            weights_path = get_file('squeezenet_weights_tf_dim_ordering_tf_kernels.h5',
                                    WEIGHTS_PATH,
                                    cache_subdir='models')
        else:
            weights_path = get_file('squeezenet_weights_tf_dim_ordering_tf_kernels_notop.h5',
                                    WEIGHTS_PATH_NO_TOP,
                                    cache_subdir='models')
        model.load_weights(weights_path)
        if K.backend() == 'theano':
            layer_utils.convert_all_kernels_in_model(model)
    return model
Example Usage:
# Example: classify one image file with SqueezeNet and print the decoded predictions.
import numpy as np
from keras_squeezenet import SqueezeNet
from keras.applications.imagenet_utils import preprocess_input, decode_predictions
from keras.preprocessing import image
# Built with the constructor's default arguments.
model = SqueezeNet()
# The image is resized to 227x227 on load.
img = image.load_img('../images/cat.jpeg', target_size=(227, 227))
x = image.img_to_array(img)
# Insert a leading axis so the single image forms a batch of one.
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
preds = model.predict(x)
print('Predicted:', decode_predictions(preds))

Related

Variational Autoencoder's sampling problem

My vae class looks like this:
class Encoder(nn.Module):
    """Convolutional encoder that maps an image batch to latent mean and log-variance.

    The channel width and latent size are read from the module-level names
    ``capacity`` and ``latent_dims`` when the encoder is constructed.
    """

    def __init__(self):
        super().__init__()
        width = capacity
        # The linear heads expect a (2 * width) x 7 x 7 feature map, i.e. what two
        # stride-2 convolutions produce from a 28 x 28 input.
        flat_features = width * 2 * 7 * 7
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=width,
                               kernel_size=4, stride=2, padding=1)  # out: width x 14 x 14
        self.conv2 = nn.Conv2d(in_channels=width, out_channels=width * 2,
                               kernel_size=4, stride=2, padding=1)  # out: 2*width x 7 x 7
        self.fc_mu = nn.Linear(in_features=flat_features, out_features=latent_dims)
        self.fc_logvar = nn.Linear(in_features=flat_features, out_features=latent_dims)

    def forward(self, x):
        """Return ``(mu, logvar)`` for the batch ``x``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        # Flatten each sample's feature maps into a single feature vector.
        flat = features.view(features.size(0), -1)
        return self.fc_mu(flat), self.fc_logvar(flat)
class Decoder(nn.Module):
    """Transposed-convolution decoder that maps latent vectors back to images.

    The channel width and latent size are read from the module-level names
    ``capacity`` and ``latent_dims``.
    """

    def __init__(self):
        super().__init__()
        width = capacity
        self.fc = nn.Linear(in_features=latent_dims, out_features=width * 2 * 7 * 7)
        self.conv2 = nn.ConvTranspose2d(in_channels=width * 2, out_channels=width,
                                        kernel_size=4, stride=2, padding=1)
        self.conv1 = nn.ConvTranspose2d(in_channels=width, out_channels=1,
                                        kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        """Decode a batch of latent vectors ``x`` into single-channel images in [0, 1]."""
        hidden = self.fc(x)
        # Unflatten the feature vectors into (2 * capacity) x 7 x 7 feature maps.
        hidden = hidden.view(hidden.size(0), capacity * 2, 7, 7)
        hidden = F.relu(self.conv2(hidden))
        # Sigmoid output because BCE is used as the reconstruction loss.
        return torch.sigmoid(self.conv1(hidden))
class VariationalAutoencoder(nn.Module):
    """VAE made of an ``Encoder`` and a ``Decoder`` with reparameterized sampling."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encoder(x)
        z = self.latent_sample(mu, logvar)
        return self.decoder(z), mu, logvar

    def latent_sample(self, mu, logvar):
        """Draw a latent sample in training mode; return ``mu`` itself otherwise."""
        if not self.training:
            return mu
        # The reparameterization trick: z = mu + eps * std with eps ~ N(0, I).
        std = logvar.mul(0.5).exp_()
        noise = torch.empty_like(std).normal_()
        return noise.mul(std).add_(mu)
def vae_loss(recon_x, x, mu, logvar):
    """Return summed BCE reconstruction loss plus ``variational_beta`` times the KL term.

    ``recon_x`` holds the per-pixel probabilities of a multivariate Bernoulli,
    so the negative log-likelihood is the pixel-wise binary cross-entropy.
    Both tensors are viewed as rows of 784 values before the loss is taken.

    Summing rather than averaging the cross-entropy is the direct negative
    log-likelihood. Averaging would make the weight of the KL term
    independent of the image resolution, and the choice changes the weight
    needed for the KL term by several orders of magnitude.
    """
    flat_recon = recon_x.view(-1, 784)
    flat_target = x.view(-1, 784)
    recon_loss = F.binary_cross_entropy(flat_recon, flat_target, reduction='sum')

    # Closed-form KL term computed from mu and logvar.
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    kldivergence = -0.5 * torch.sum(kl_inner)

    return recon_loss + variational_beta * kldivergence
I train it on MNIST dataset.
I want to sample it, or generate an array and give it to the decoder and see what the output will be.
The problem is that I don't really understand, what my z array should look like and what shape should it need.
Here is the code for sampling:
# Feed a hand-made latent array to the trained decoder and plot what it generates.
# NOTE(review): the decoder's first layer is nn.Linear(in_features=latent_dims, ...), so z
# presumably needs shape (n_samples, latent_dims); the 5 x 5 gallery requested below
# suggests n_samples = 25 — confirm.
z = ...
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.FloatTensor(z).to(device)
# Switch to eval mode; only the decoder is called below.
vae.eval()
output = vae.decoder(input)
plot_gallery(output.data.cpu().numpy(), 24, 24, n_row=5, n_col=5)

Deep CNN doesn't learn and accuracy just stay in same value

I have a deep CNN based on ResNet and a dataset of shape (10000, 50, 50, 1) for classifying digits. When I start training, the accuracy stalls at some value and gently oscillates (around 0.2). I am wondering whether it is overfitting or whether another issue is involved?
here is the identity block :
def identity_block(X, f, filters, stage, block):
    """Residual block whose shortcut is the unmodified input.

    Three Conv2D + BatchNormalization steps (1x1, f x f, 1x1) are applied,
    with ReLU after the first two. The input is then added back and a final
    ReLU is applied. ``filters`` gives the filter counts of the three
    convolutions; ``stage`` and ``block`` only build the layer names.
    """
    conv_prefix = 'res' + str(stage) + block + '_branch'
    bn_prefix = 'bn' + str(stage) + block + '_branch'
    F1, F2, F3 = filters
    skip = X

    # (filters, kernel size, padding, name suffix, ReLU afterwards?)
    layout = (
        (F1, (1, 1), 'valid', '2a', True),
        (F2, (f, f), 'same', '2b', True),
        (F3, (1, 1), 'valid', '2c', False),
    )
    for n_filters, kernel, pad, suffix, with_relu in layout:
        X = Conv2D(filters=n_filters, kernel_size=kernel, strides=(1, 1), padding=pad,
                   name=conv_prefix + suffix,
                   kernel_initializer=initializers.glorot_uniform(seed=0))(X)
        X = BatchNormalization(axis=3, name=bn_prefix + suffix)(X)
        if with_relu:
            X = Activation('relu')(X)

    # Merge the main path with the shortcut, then apply the last ReLU.
    X = Add()([X, skip])
    return Activation('relu')(X)
and convolutional block :
def conv_block(X, f, filters, stage, block, s=2):
    """Residual block whose shortcut goes through its own 1x1 convolution.

    The main path is Conv2D + BatchNormalization three times (1x1 with stride
    ``s``, f x f, 1x1), with ReLU after the first two. The shortcut applies a
    1x1 convolution with stride ``s`` and ``filters[2]`` filters followed by
    BatchNormalization. The two paths are added and passed through ReLU.
    """
    conv_prefix = 'res' + str(stage) + block + '_branch'
    bn_prefix = 'bn' + str(stage) + block + '_branch'
    F1, F2, F3 = filters
    skip = X

    # (filters, kernel size, strides, padding, name suffix, ReLU afterwards?)
    layout = (
        (F1, (1, 1), (s, s), 'valid', '2a', True),
        (F2, (f, f), (1, 1), 'same', '2b', True),
        (F3, (1, 1), (1, 1), 'valid', '2c', False),
    )
    for n_filters, kernel, stride, pad, suffix, with_relu in layout:
        X = Conv2D(n_filters, kernel_size=kernel, strides=stride, padding=pad,
                   name=conv_prefix + suffix,
                   kernel_initializer=initializers.glorot_uniform(seed=0))(X)
        X = BatchNormalization(axis=3, name=bn_prefix + suffix)(X)
        if with_relu:
            X = Activation('relu')(X)

    # Projection shortcut with the same stride and filter count as the main path.
    skip = Conv2D(F3, kernel_size=(1, 1), strides=(s, s), name=conv_prefix + '1',
                  kernel_initializer=initializers.glorot_uniform(seed=0))(skip)
    skip = BatchNormalization(axis=3, name=bn_prefix + '1')(skip)

    X = Add()([X, skip])
    return Activation('relu')(X)
and finaly the ResNet:
def ResNet(input_shape=(50, 50, 1), classes=10):
    """Build a small ResNet-style classifier.

    Args:
        input_shape: shape of one input sample. It is now actually used for
            the ``Input`` layer; previously the input was hard-coded to
            ``(50, 50, 1)`` and this argument was effectively ignored.
        classes: number of units of the final softmax layer.

    Returns:
        A ``Model`` whose output is a softmax distribution over ``classes``,
        so a loss applied to it must not be configured for raw logits.
    """
    inp = Input(shape=input_shape)

    # zero padding
    X = ZeroPadding2D((3, 3), name='pad0')(inp)

    # Stage 1. No `input_shape=` kwarg here: the Input layer already fixes the
    # shape in the functional API.
    X = Conv2D(32, (5, 5), name='conv1',
               kernel_initializer=initializers.glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3, name='bn1')(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((2, 2), name='pool1')(X)

    # Stages 2-4: one projection block followed by two identity blocks each,
    # all with stride 1, using 32, 64 and 128 filters respectively.
    for stage, width in ((2, 32), (3, 64), (4, 128)):
        stage_filters = [width, width, width]
        X = conv_block(X, 3, filters=stage_filters, stage=stage, block='a', s=1)
        X = identity_block(X, 3, stage_filters, stage=stage, block='b')
        X = identity_block(X, 3, stage_filters, stage=stage, block='c')

    # final pooling
    X = AveragePooling2D((2, 2), padding='same', name='Pool0')(X)

    # fully connected softmax classifier
    X = Flatten(name='D0')(X)
    X = Dense(classes, activation='softmax',
              kernel_initializer=initializers.glorot_uniform(seed=0), name='D2')(X)

    # create model
    model = Model(inputs=inp, outputs=X)
    return model
update 1 : here are the fitting and compile methods :
# The network's last layer already applies softmax, so the loss receives
# probabilities, not logits: from_logits must be False. With from_logits=True
# the loss would treat the probabilities as logits and apply softmax a second time.
# (The model was previously compiled twice with identical arguments; once is enough.)
model.compile(optimizer='adam',
              loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
print("model compiled settings imported successfully")
# Stop when the validation loss has not improved for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model.fit(X_train, Y_train, validation_split=0.2, callbacks=[early_stopping], epochs=10)
test_loss, test_acc = model.evaluate(X_test, Y_test, verbose=2)
First try normalizing the values of the digit image (50x50).
Then also consider how a neural network learns its weights. Convolutional Neural Networks learns by continually adding gradient error vectors that are multiplied by a learning rate computed from backpropagation to various weight matrices throughout the network as training examples are passed through.
The most important thing to consider is the multiplication of the learning rate, because once we didn't scale the training inputs the range of distributions of the feature values will be likely different from each feature, thus the learning rate would cause corrections in each dimension that would differ from one another. This is random, so the machine could be overcompensating a correction in one weight dimension and under compensating in another. Which is very non-ideal as this might result in an oscillation state or a very slow training state.
Oscillating means that the model is unable to locate the center for the better maxima in weights.
Slow training means moving too slow to achieve a better maxima.
This is why it is a common practice to normalize images before using it as an input for Neural Network or any Models that is Gradient-Based.
TF_Support's answer:
Provide few samples of the dataset, loss curve, accuracy plot so we can clearly understand what you're trying to learn, it's more important than the code you provided.
I would guess, you are trying to learn very hard samples, 50by50 grayscale is not much. Is your network overfitting? (We would only figure that out after looking into some plots of the validation metrics) (0.2 is it your training accuracy?)
First do a sanity check on the dataset, by training a very simple CNN. I see you have 10 classes (not sure, just guessing from the function's default value), the randomized accuracy is 10%, so set a baseline first with a simple CNN and then try to improve with ResNet.
Increase the learning rate and see how the accuracy fluctuates. After a few epochs, reduce the learning rate when the accuracy better than the baseline.

How to get prediction and confidence of that prediction using resnet

I have a binary classifier which predicts whether the image is positive or negative. I am using model.predict for getting the detections. So basically what I want is the class index and the confidence value with which it belongs to that class. I am able to get the detections and able to show it on the image, but for background images also it is showing some false predictions so I would like to remove those by setting a threshold for the confidence. For information about the training file and testing file I have asked a question on StackOverflow, please refer the link "Resnet is showing wrong predictions even without any object"
My Resnet code:
# import the necessary packages
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import AveragePooling2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.convolutional import ZeroPadding2D
from keras.layers.core import Activation
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import Input
from keras.models import Model
from keras.layers import add
from keras.regularizers import l2
from keras import backend as K
class ResNet:
    """Builder for a pre-activation bottleneck ResNet (BN => ReLU => CONV ordering)."""

    @staticmethod
    def residual_module(data, K, stride, chanDim, red=False,
                        reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Apply one bottleneck residual module to ``data``.

        Args:
            data: input tensor.
            K: number of filters of the final 1x1 convolution; the first two
                convolutions use ``int(K * 0.25)``. (This parameter shadows the
                backend module imported as ``K``, inside this method only.)
            stride: stride of the 3x3 convolution and of the shortcut convolution.
            chanDim: axis passed to every ``BatchNormalization``.
            red: if true, the shortcut is a strided 1x1 convolution instead of
                the identity.
            reg: L2 regularization strength for all convolutions.
            bnEps: batch-normalization epsilon.
            bnMom: batch-normalization momentum.

        Returns:
            The sum of the last convolution's output and the shortcut.
        """
        # the shortcut branch of the ResNet module should be
        # initialize as the input (identity) data
        shortcut = data

        # the first block of the ResNet module are the 1x1 CONVs
        bn1 = BatchNormalization(axis=chanDim, epsilon=bnEps,
                                 momentum=bnMom)(data)
        act1 = Activation("relu")(bn1)
        conv1 = Conv2D(int(K * 0.25), (1, 1), use_bias=False,
                       kernel_regularizer=l2(reg))(act1)

        # the second block of the ResNet module are the 3x3 CONVs
        bn2 = BatchNormalization(axis=chanDim, epsilon=bnEps,
                                 momentum=bnMom)(conv1)
        act2 = Activation("relu")(bn2)
        conv2 = Conv2D(int(K * 0.25), (3, 3), strides=stride,
                       padding="same", use_bias=False,
                       kernel_regularizer=l2(reg))(act2)

        # the third block of the ResNet module is another set of 1x1
        # CONVs
        bn3 = BatchNormalization(axis=chanDim, epsilon=bnEps,
                                 momentum=bnMom)(conv2)
        act3 = Activation("relu")(bn3)
        conv3 = Conv2D(K, (1, 1), use_bias=False,
                       kernel_regularizer=l2(reg))(act3)

        # if we are to reduce the spatial size, apply a CONV layer to
        # the shortcut
        if red:
            shortcut = Conv2D(K, (1, 1), strides=stride,
                              use_bias=False, kernel_regularizer=l2(reg))(act1)

        # add together the shortcut and the final CONV
        x = add([conv3, shortcut])

        # return the addition as the output of the ResNet module
        return x

    @staticmethod
    def build(width, height, depth, classes, stages, filters,
              reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Assemble the full network and return it as a ``Model`` named ``"resnet"``.

        Args:
            width, height, depth: input dimensions.
            classes: number of units of the final softmax layer.
            stages: number of residual modules in each stage.
            filters: ``filters[0]`` is the stem convolution's filter count and
                ``filters[i + 1]`` is used for stage ``i``.
            reg: L2 regularization strength. It is forwarded to every residual
                module as well; previously the modules silently used their own
                default, so a non-default ``reg`` only affected the stem and
                the classifier.
            bnEps: batch-normalization epsilon.
            bnMom: batch-normalization momentum.
        """
        # initialize the input shape to be "channels last" and the
        # channels dimension itself
        inputShape = (height, width, depth)
        chanDim = -1

        # if we are using "channels first", update the input shape
        # and channels dimension
        if K.image_data_format() == "channels_first":
            inputShape = (depth, height, width)
            chanDim = 1

        # set the input and apply BN
        inputs = Input(shape=inputShape)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
                               momentum=bnMom)(inputs)

        # apply CONV => BN => ACT => POOL to reduce spatial size
        x = Conv2D(filters[0], (5, 5), use_bias=False,
                   padding="same", kernel_regularizer=l2(reg))(x)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
                               momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = ZeroPadding2D((1, 1))(x)
        x = MaxPooling2D((3, 3), strides=(2, 2))(x)

        # loop over the number of stages
        for i in range(0, len(stages)):
            # initialize the stride, then apply a residual module
            # used to reduce the spatial size of the input volume
            stride = (1, 1) if i == 0 else (2, 2)
            x = ResNet.residual_module(x, filters[i + 1], stride,
                                       chanDim, red=True, reg=reg,
                                       bnEps=bnEps, bnMom=bnMom)

            # loop over the number of layers in the stage
            for j in range(0, stages[i] - 1):
                # apply a ResNet module
                x = ResNet.residual_module(x, filters[i + 1],
                                           (1, 1), chanDim, reg=reg,
                                           bnEps=bnEps, bnMom=bnMom)

        # apply BN => ACT => POOL
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
                               momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = AveragePooling2D((8, 8))(x)

        # softmax classifier: the output is a probability distribution over `classes`
        x = Flatten()(x)
        x = Dense(classes, kernel_regularizer=l2(reg))(x)
        x = Activation("softmax")(x)

        # create the model
        model = Model(inputs, x, name="resnet")

        # return the constructed network architecture
        return model
Any kind of suggestion to get rid of my this problem would be really helpful

How can I improve my Neural Network accuracy ( Speaker Recognition - MFCC )

Im working on a speaker recognition Neural Network.
What I am doing is taking wav files [ of the Big Bang Theory first episode :-) ], then converting them to MFCC coeffs, then I use them as input to an open source Neural Network API (MLPClassifier), and as output I define a unique vector for each speaker ( Let's say : [1,0,0,0] - sheldon; [0,1,0,0] - Penny; etc... ). I take 50 random values for testing and the others for fitting ( training )
This is my code. At the beginning I got about random accuracy for the NN, but after some help from an amazing guy I improved it to ~42%, but I want more :) about 70% :
from sklearn.neural_network import MLPClassifier
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
from os.path import isfile, join
from random import shuffle
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import randint
import random
# Speaker-identification experiment: 20 rounds, each training an MLPClassifier on
# flattened MFCC features and checking 50 held-out samples, followed by a plot of the
# running success rate against a random-pair baseline.
# NOTE(review): indentation was lost in this listing, so loop nesting must be inferred.
# NOTE(review): uses Python 2 print statements (e.g. `print names`).
winner = [] # 1 per correct test prediction, 0 otherwise, accumulated over all rounds
random_winner = []
win_len = 0.04 # in seconds
step = win_len / 2
nfft = 2048
for TestNum in tqdm(range(20)): # in every round we build NN with X,Y that out of them we check 50 after we build the NN
X = []
Y = []
onlyfiles = [f for f in listdir("FinalAudios/") if isfile(join("FinalAudios/", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# Speaker name = text before the first "_" in the file name, cut at the first space if any
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
# Copy of the per-file speaker list (with duplicates), used for the random baseline
only_speakers = [] + names
#print only_speakers
names = list(dict.fromkeys(names)) # names of speakers
print names
# Build one one-hot vector per distinct speaker
vector_names = [] # vector for each name
i = 0
vector_for_each_name = [0] * len(names)
for name in names:
vector_for_each_name[i] += 1
vector_names.append(np.array(vector_for_each_name))
vector_for_each_name[i] -= 1
i += 1
for f in onlyfiles:
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
fs, audio = wav.read("FinalAudios/" + f) # read the file
try:
mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len,
winstep=step, nfft=nfft, appendEnergy=False)
# All MFCC frames of the file are flattened into a single feature vector
# NOTE(review): the vector length therefore depends on the number of frames — presumably
# all clips have equal length, otherwise np.asarray(X) below is ragged; confirm.
flat_list = [item for sublist in mfcc_feat for item in sublist]
X.append(np.array(flat_list))
Y.append(np.array(vector_names[names.index(f_speaker)]))
# NOTE(review): an IndexError here is silently ignored and the file is skipped
except IndexError:
pass
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
X = np.asarray(X)
Y = np.asarray(Y)
Y_test = Y[:50] # CHOOSE 50 FOR TEST, OTHERS FOR TRAIN
X_test = X[:50]
X = X[50:]
Y = Y[50:]
print len(X)
clf = MLPClassifier(solver='lbfgs', alpha=3e-2, hidden_layer_sizes=(50, 20), random_state=2) # create the NN
clf.fit(X, Y) # Train it
# NOTE(review): X[0] is a training sample at this point (X was re-sliced above), while the
# next line prints the label of the first *test* sample, so the two do not correspond.
print list(clf.predict_proba([X[0]])[0])
print list(Y_test[0])
for sample in range(len(X_test)): # add 1 to winner array if we correct and 0 if not, than in the end it plot it
arr = list(clf.predict([X_test[sample]])[0])
if arr.index(max(arr)) == list(Y_test[sample]).index(1):
winner.append(1)
else:
winner.append(0)
# Random baseline: draw two files at random and check whether their speakers match
if only_speakers[randint(0, len(only_speakers) - 1)] == only_speakers[randint(0, len(only_speakers) - 1)]:
random_winner.append(1)
else:
random_winner.append(0)
# plot winner: running mean of the hit list after each additional sample
plot_x = []
plot_y = []
for i in range(1, len(winner)):
plot_y.append(sum(winner[0:i])*1.0/len(winner[0:i]))
plot_x.append(i)
plot_random_x = []
plot_random_y = []
for i in range(1, len(random_winner)):
plot_random_y.append(sum(random_winner[0:i])*1.0/len(random_winner[0:i]))
plot_random_x.append(i)
plt.plot(plot_x, plot_y, 'r', label='machine learning')
plt.plot(plot_random_x, plot_random_y, 'b', label='random')
plt.xlabel('Number Of Samples')
# naming the y axis
plt.ylabel('Success Rate')
# giving a title to my graph
plt.title('Success Rate : Random Vs ML!')
# function to show the plot
plt.show()
This is my zip file that contains the code and the audio file : https://ufile.io/eggjm1gw
Does somebody have an idea how I can improve my accuracy?
Edit :
I improved my data set and used a convolutional model and got 60% accuracy, which is OK but still not good enough
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
import os
import shutil
from os.path import isfile, join
from random import shuffle
from matplotlib import pyplot
from tqdm import tqdm
from random import randint
import tensorflow as tf
from ast import literal_eval as str2arr
from tempfile import TemporaryFile
#win_len = 0.04 # in seconds
#step = win_len / 2
#nfft = 2048
# Speaker-identification experiment, second version: every MFCC frame is its own
# 13-value sample, classified by a small Keras CNN; repeated for 40 rounds.
# NOTE(review): indentation was lost in this listing, so loop nesting must be inferred.
win_len = 0.05 # in seconds
step = win_len
nfft = 16384
results = []
outfile_x = None
outfile_y = None
winner = []
for TestNum in tqdm(range(40)): # We check it several times
# First round: extract features and cache them in temporary files; later rounds reload them
if not outfile_x: # no cached features yet, so build them
X = [] # inputs
Y = [] # outputs
onlyfiles = [f for f in listdir("FinalAudios") if isfile(join("FinalAudios", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# Speaker name = text before the first "_" in the file name, cut at the first space if any
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
# Copy of the per-file speaker list (with duplicates), used for the random baseline
only_speakers = [] + names
namesWithoutDuplicate = list(dict.fromkeys(names))
namesWithoutDuplicateCopy = namesWithoutDuplicate[:]
for name in namesWithoutDuplicateCopy: # drop speakers with fewer than 107 files
if names.count(name) < 107:
namesWithoutDuplicate.remove(name)
names = namesWithoutDuplicate
print(names) # print it
# Label of each speaker = its integer index in `names`
vector_names = [] # output for each name
i = 0
for name in names:
vector_for_each_name = i
vector_names.append(np.array(vector_for_each_name))
i += 1
for f in onlyfiles: # for all the files
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
if f_speaker in namesWithoutDuplicate:
# NOTE(review): the path is built with a backslash separator, which only works on Windows
fs, audio = wav.read("FinalAudios\\" + f) # read the file
try:
# compute MFCC
mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len, winstep=step, nfft=nfft, appendEnergy=False)
#flat_list = [item for sublist in mfcc_feat for item in sublist]
# Create output + inputs: one sample per MFCC frame, labelled with the file's speaker.
# NOTE(review): frames of one file are shuffled independently below, so the same
# recording can contribute frames to the train, validation and test sets.
for i in mfcc_feat:
X.append(np.array(i))
Y.append(np.array(vector_names[names.index(f_speaker)]))
# NOTE(review): an IndexError here is silently ignored and the file is skipped
except IndexError:
pass
else:
# Files of dropped speakers are moved out of the input directory
if not os.path.exists("TooLowSamples"): # if path not exist we create it
os.makedirs("TooLowSamples")
shutil.move("FinalAudios\\" + f, "TooLowSamples\\" + f)
outfile_x = TemporaryFile()
np.save(outfile_x, X)
outfile_y = TemporaryFile()
np.save(outfile_y, Y)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
else:
outfile_x.seek(0)
X = np.load(outfile_x)
outfile_y.seek(0)
Y = np.load(outfile_y)
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
lenX = len(X)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
y_test = np.asarray(Y[:4000]) # first 4000 samples are the test set
x_test = np.asarray(X[:4000]) # first 4000 samples are the test set
x_train = np.asarray(X[4000:]) # everything else is for training
y_train = np.asarray(Y[4000:]) # everything else is for training
x_val = x_train[-4000:] # last 4000 training samples become the validation set
y_val = y_train[-4000:] # last 4000 training samples become the validation set
x_train = x_train[:-4000] # the remainder is the actual training set
y_train = y_train[:-4000] # the remainder is the actual training set
x_train = x_train.reshape(np.append(x_train.shape, (1, 1))) # append two singleton axes for Conv2D input
x_test = x_test.reshape(np.append(x_test.shape, (1, 1))) # append two singleton axes for Conv2D input
x_val = x_val.reshape(np.append(x_val.shape, (1, 1))) # append two singleton axes for Conv2D input
features_shape = x_val.shape
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
# NOTE(review): the output layer is fixed at 10 units while the number of speakers is
# computed at run time (len(names)) — presumably 10 speakers remain; confirm.
model = tf.keras.models.Sequential([
tf.keras.layers.Input(name='inputs', shape=(13, 1, 1), dtype='float32'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block1_conv', input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same', name='block1_pool'),
tf.keras.layers.BatchNormalization(name='block1_norm'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block2_conv',
input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block2_pool'),
tf.keras.layers.BatchNormalization(name='block2_norm'),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', strides=1, name='block3_conv',
input_shape=(13, 1, 1)),
tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block3_pool'),
tf.keras.layers.BatchNormalization(name='block3_norm'),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu', name='dense'),
tf.keras.layers.BatchNormalization(name='dense_norm'),
tf.keras.layers.Dropout(0.2, name='dropout'),
tf.keras.layers.Dense(10, activation='softmax', name='pred')
])
# Integer labels + softmax output, hence the sparse categorical cross-entropy loss
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
print("fitting")
history = model.fit(x_train, y_train, epochs=15, validation_data=(x_val, y_val))
print("testing")
# Element [1] of evaluate()'s result is the accuracy metric configured in compile()
results.append(model.evaluate(x_test, y_test)[1])
print(results)
print(sum(results)/len(results))
# Random baseline: how often two randomly drawn files belong to the same speaker
for i in range(10000):
f_1 = only_speakers[randint(0, len(only_speakers) - 1)]
f_2 = only_speakers[randint(0, len(only_speakers) - 1)]
if " " not in f_1.split("_")[0]:
f_speaker_1 = f_1.split("_")[0]
else:
f_speaker_1 =f_1.split("_")[0].split(" ")[0]
if " " not in f_2.split("_")[0]:
f_speaker_2 = f_2.split("_")[0]
else:
f_speaker_2 =f_2.split("_")[0].split(" ")[0]
if f_speaker_2 == f_speaker_1:
winner.append(1)
else:
winner.append(0)
print(sum(winner)/len(winner))
#]
# if onlyfiles[randint(len(onlyfiles) - 1)] == onlyfiles[randint(len(onlyfiles) - 1)]
#pyplot.plot(history.history['loss'], label='train')
#pyplot.plot(history.history['val_loss'], label='test') Q
#pyplot.legend()
#pyplot.show()
Reading your post, these are the things I would suggest you fix/explore:
42% is not that impressive of an accuracy for the task you have at hand, consider the way you are cross-validating e.g. how do you split between a validation, test and training dataset
Your dataset seems very limited. Your task is to identify the speaker. A single episode might not be enough data for this task.
You might want to consider Deep Neural Network libraries such as Keras and Tensorflow. Convolutions is something you can apply directly to the MFC Graph.
If you decide using Tensorflow or Keras consider Triplet-Loss, where you preset a positive and negative example.
Consider reading the current state of the art for your task: https://github.com/grausof/keras-sincnet
Consider reading https://arxiv.org/abs/1503.03832 and adopting it for speech recognition.
The easiest thing you can do to improve your results is adding CNN layers to extract features from the MFCC

How to implement DQN algorithm correctly

I'm trying to implement the Deep Q Learning algorithm introduced by DeepMind in this paper:
https://arxiv.org/pdf/1312.5602.pdf
I'm using it to make an agent that learns to play Pong, however it doesn't seem to work (even after 2 hours of training I'm not seeing any improvement). This is the code,
import gym
import universe
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Activation
from keras.models import load_model
import random
# Create the Pong environment and configure it with remotes=1.
env = gym.make('gym-core.Pong-v0')
env.configure(remotes=1)
def num2str(number, obs):
    """Translate an action vector into key events, one event list per observation.

    Args:
        number: array-like of action scores; only its arg-max index is used.
            Index 0 releases ArrowRight and presses ArrowLeft; index 1 does
            the opposite.
        obs: iterable of observations; only its length matters.

    Returns:
        A list with one ``[release_event, press_event]`` pair per item of ``obs``.

    Raises:
        ValueError: if the arg-max index is neither 0 nor 1. Previously this
            case fell through to ``return action`` with ``action`` unbound.
    """
    index = int(np.argmax(number))
    if index == 0:
        released, pressed = 'ArrowRight', 'ArrowLeft'
    elif index == 1:
        released, pressed = 'ArrowLeft', 'ArrowRight'
    else:
        raise ValueError('unsupported action index: %d' % index)
    return [[('KeyEvent', released, False), ('KeyEvent', pressed, True)] for _ in obs]
def preprocess(original_obs):
    """Turn a raw observation into an 80 x 80 x 1 grayscale frame.

    Takes the ``'vision'`` image of the first observation, crops it to rows
    35..194 and columns 0..159, averages the colour channels, keeps every
    second pixel in both directions and reshapes the result to ``(80, 80, 1)``.

    The crop is a single slice instead of three ``np.delete`` calls, each of
    which copied the whole frame. For frames of at most 769 rows and 1025
    columns the result is identical.
    """
    frame = np.asarray(original_obs[0]['vision'])
    playfield = frame[35:195, :160]
    gray = playfield.mean(axis=2)
    return np.reshape(gray[::2, ::2], (80, 80, 1))
# Q-network: a stack of four 80x80 frames in, one Q-value per action (2 actions) out.
model = Sequential()
model.add(Conv2D(32, kernel_size = (8, 8), strides = (4, 4), border_mode='same', activation='relu', init='uniform', input_shape = (80, 80, 4)))
model.add(MaxPooling2D(pool_size = (2, 2)))
# The second and third convolutions had no activation, which made them purely linear
# layers; give them ReLU like the first one.
model.add(Conv2D(64, kernel_size = (2, 2), strides = (2, 2), activation='relu'))
model.add(Conv2D(64, kernel_size = (3, 3), strides = (1, 1), activation='relu'))
model.add(Flatten())
model.add(Dense(256, init='uniform', activation='relu'))
# Linear output: Q-values are unbounded regression targets trained with MSE.
model.add(Dense(2, init='uniform', activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
# DQN training script: epsilon-greedy interaction with the environment, a replay
# list D, and a minibatch update every `frequency` steps once `init_observe_time`
# steps have passed.
# NOTE(review): indentation was lost in this listing, so loop nesting must be inferred.
init_observe_time = 500
# Replay memory. NOTE(review): entries are appended every step and never removed here.
D = []
e = 1.0
e_threshold = 0.05
# NOTE(review): with a 0.01 decrement per step, e falls from 1.0 to the 0.05 threshold
# in roughly 95 steps, i.e. long before training starts at step 500.
e_decay = 0.01
gamma = 0.99
batch_size = 15
frequency = 10
Q_values = np.array([0, 0])
obs = env.reset()
# Take random actions until the environment returns a real observation
while True:
obs = env.step(num2str(np.array([random.randint(0, 1) for i in range(0, 2)]), obs))[0]
if obs != [None]:
break
# Initial state: the first frame repeated four times along the channel axis
x_t1 = preprocess(obs)
s_t1 = np.stack((x_t1, x_t1, x_t1, x_t1), axis = 2)
s_t1 = np.reshape(s_t1, (80, 80, 4))
t = 0
while True:
print("Time since last start: ", t)
# a_t is a one-hot action vector
a_t = np.zeros(2)
# Epsilon-greedy: random action with probability e, otherwise the arg-max Q action
if random.random() < e:
a_index = random.randint(0, 1)
a_t[a_index] = 1
else:
Q_values = model.predict(np.array([s_t1]))[0]
a_index = np.argmax(Q_values)
a_t[a_index] = 1
print("Q Values: ", Q_values)
print("action taken: ", np.argmax(a_t))
print("epsilon: ", e)
if e > e_threshold:
e -= e_decay
obs, r_t, done, info = env.step(num2str(a_t, obs))
# NOTE(review): presumably this `continue` also skips the `t += 1` and render at the
# bottom of the loop — confirm against the original indentation.
if obs == [None]:
continue
x_t2 = preprocess(obs)
print(x_t2.shape, s_t1[:,:,0:3].shape)
# Next state: newest frame first, followed by the three most recent frames of s_t1
s_t2 = np.append(x_t2, s_t1[:,:,0:3], axis = 2)
D.append((s_t1, a_t, r_t, s_t2, done))
if t > init_observe_time and t%frequency == 0:
minibatch = random.sample(D, batch_size)
s1_batch = [i[0] for i in minibatch]
a_batch = [i[1] for i in minibatch]
r_batch = [i[2] for i in minibatch]
s2_batch = [i[3] for i in minibatch]
# Q-values of the successor states, used for the bootstrap target
q_batch = model.predict(np.array(s2_batch))
# NOTE(review): this zero array is overwritten by the very next statement.
y_batch = np.zeros((batch_size, 2))
# Start from the current predictions so only the taken action's target changes
y_batch = model.predict(np.array(s1_batch))
print("Q batch: ", q_batch)
print("y batch: ", y_batch)
for i in range(0, batch_size):
# Terminal transition: target is the reward alone; otherwise add the discounted max Q
if (minibatch[i][4]):
y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0]
else:
y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0] + gamma * np.max(q_batch[i])
model.train_on_batch(np.array(s1_batch), y_batch)
s_t1 = s_t2
t += 1
env.render()
does anyone have any suggestion on how to make it work properly?
Your second and third Conv2D layers appear to be missing their relu activations.
Your epsilon (or e) decays way too quickly. After only 95 time steps it will already be down to 0.05. I can't quickly find what they did in that 2013 paper, but in the 2015 paper they decay it from 1 to 0.1 over 1 million frames.
Those are the two things that immediately jump out to me. I'd recommend starting out by fixing those.

Resources