How to Batch Multiple Videoframes before run Tensorflow Inference Session - opencv

I made a project that basically uses googles object detection api with tensorflow.
All i am doing is inference with a pre-trained model: Which means realtime object detection where the Input is the Videostream of a webcam or something similar using OpenCV.
Right now i got pretty decent performance results, but i want to further increase the FPS.
Because what i experience is that Tensorflow uses my whole Memory while Inference but the GPU Usage is not maxed out at all (around 40% with a NVIDIA GTX 1050 Laptop, and 6% on a NVIDIA Jetson Tx2).
So my idea was to increase the GPU Usage by increasing the image batch size which is fed in each session run.
So my question is: How can i Batch multiple Frames of the Input-Videostream together before i feed them to sess.run()?
Have a look at my code object_detetection.py on my github repo: (https://github.com/GustavZ/realtime_object_detection).
I would be very thankful if you come up with some Hints or Code Implementations!
import numpy as np
import os
import six.moves.urllib as urllib
import tarfile
import tensorflow as tf
import cv2
# Protobuf Compilation (once necessary)
os.system('protoc object_detection/protos/*.proto --python_out=.')
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
from stuff.helper import FPS2, WebcamVideoStream
# INPUT PARAMS
# Must be OpenCV readable
# 0 = Default Camera
video_input = 0
visualize = True
max_frames = 300 #only used if visualize==False
width = 640
height = 480
fps_interval = 3
bbox_thickness = 8
# Model preparation
# What model to download.
MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = 'models/' + MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
LABEL_MAP = 'mscoco_label_map.pbtxt'
PATH_TO_LABELS = 'object_detection/data/' + LABEL_MAP
NUM_CLASSES = 90
# Download Model
if not os.path.isfile(PATH_TO_CKPT):
print('Model not found. Downloading it now.')
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
file_name = os.path.basename(file.name)
if 'frozen_inference_graph.pb' in file_name:
tar_file.extract(file, os.getcwd())
os.remove('../' + MODEL_FILE)
else:
print('Model found. Proceed.')
# Load a (frozen) Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
# Loading label map
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Start Video Stream
video_stream = WebcamVideoStream(video_input,width,height).start()
cur_frames = 0
# Detection
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
# Definite input and output Tensors for detection_graph
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# fps calculation
fps = FPS2(fps_interval).start()
print ("Press 'q' to Exit")
while video_stream.isActive():
image_np = video_stream.read()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
# Actual detection.
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=bbox_thickness)
if visualize:
cv2.imshow('object_detection', image_np)
# Exit Option
if cv2.waitKey(1) & 0xFF == ord('q'):
break
else:
cur_frames += 1
if cur_frames >= max_frames:
break
# fps calculation
fps.update()
# End everything
fps.stop()
video_stream.stop()
cv2.destroyAllWindows()
print('[INFO] elapsed time (total): {:.2f}'.format(fps.elapsed()))
print('[INFO] approx. FPS: {:.2f}'.format(fps.fps()))

Well, I'd just collect batch_size frames and feed them:
batch_size = 5
while video_stream.isActive():
image_np_list = []
for _ in range(batch_size):
image_np_list.append(video_stream.read())
fps.update()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.asarray(image_np_list)
# Actual detection.
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
for i in range(batch_size):
vis_util.visualize_boxes_and_labels_on_image_array(
image_np_expanded[i],
boxes[i],
classes[i].astype(np.int32),
scores[i],
category_index,
use_normalized_coordinates=True,
line_thickness=bbox_thickness)
if visualize:
cv2.imshow('object_detection', image_np_expanded[i])
# Exit Option
if cv2.waitKey(1) & 0xFF == ord('q'):
break
Of course you'll have to make the relevant changes after that if you're reading the results from the detection, since they will now have batch_size rows.
Be careful though: before tensorflow 1.4 (I think), the object detection API only supports batch size of 1 in image_tensor, so this will not work unless you upgrade your tensorflow.
Also note that your resulting FPS will be an average, but that the frames in a same batch will actually be closer in time than between different batches (since you'll still need to wait for the sess.run() to finish). The average should still be significantly better than your current FPS, although the max time between two consecutive frames should increase.
If you want your frames to all have roughly the same interval between them, I guess you'll need more sophisticated tools like multithreading and queueing: one thread would read the images from the stream and store them in a queue, the other one would take them from the queue and call sess.run() on them asynchronously; it could also tell the 1st thread to hurry up or slow down depending on its own computing capacity. This is trickier to implement.

Related

is binary cross entropy an additive function?

I am trying to train a machine learning model where the loss function is binary cross entropy, because of gpu limitations i can only do batch size of 4 and i'm having lot of spikes in the loss graph. So I'm thinking to back-propagate after some predefined batch size(>4). So it's like i'll do 10 iterations of batch size 4 store the losses, after 10th iteration add the losses and back-propagate. will it be similar to batch size of 40.
TL;DR
f(a+b) = f(a)+f(b) is it true for binary cross entropy?
f(a+b) = f(a) + f(b) doesn't seem to be what you're after. This would imply that BCELoss is additive which it clearly isn't. I think what you really care about is if for some index i
# false
f(x, y) == f(x[:i], y[:i]) + f([i:], y[i:])
is true?
The short answer is no, because you're missing some scale factors. What you probably want is the following identity
# true
f(x, y) == (i / b) * f(x[:i], y[:i]) + (1.0 - i / b) * f(x[i:], y[i:])
where b is the total batch size.
This identity is used as motivation for the gradient accumulation method (see below). Also, this identity applies to any objective function which returns an average loss across each batch element, not just BCE.
Caveat/Pitfall: Keep in mind that batch norm will not behave exactly the same when using this approach since it updates its internal statistics based on batch size during the forward pass.
We can actually do a little better memory-wise than just computing the loss as a sum followed by backpropagation. Instead we can compute the gradient of each component in the equivalent sum individually and allow the gradients to accumulate. To better explain I'll give some examples of equivalent operations
Consider the following model
import torch
import torch.nn as nn
import torch.nn.functional as F
class MyModel(nn.Module):
def __init__(self):
super().__init__()
num_outputs = 5
# assume input shape is 10x10
self.conv_layer = nn.Conv2d(3, 10, 3, 1, 1)
self.fc_layer = nn.Linear(10*5*5, num_outputs)
def forward(self, x):
x = self.conv_layer(x)
x = F.max_pool2d(x, 2, 2, 0, 1, False, False)
x = F.relu(x)
x = self.fc_layer(x.flatten(start_dim=1))
x = torch.sigmoid(x) # or omit this and use BCEWithLogitsLoss instead of BCELoss
return x
# to ensure same results for this example
torch.manual_seed(0)
model = MyModel()
# the examples will work as long as the objective averages across batch elements
objective = nn.BCELoss()
# doesn't matter what type of optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
and lets say our data and targets for a single batch are
torch.manual_seed(1) # to ensure same results for this example
batch_size = 32
input_data = torch.randn((batch_size, 3, 10, 10))
targets = torch.randint(0, 1, (batch_size, 20)).float()
Full batch
The body of our training loop for an entire batch may look something like this
# entire batch
output = model(input_data)
loss = objective(output, targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss_value = loss.item()
print("Loss value: ", loss_value)
print("Model checksum: ", sum([p.sum().item() for p in model.parameters()]))
Weighted sum of loss on sub-batches
We could have computed this using the sum of multiple loss functions using
# This is simpler if the sub-batch size is a factor of batch_size
sub_batch_size = 4
assert (batch_size % sub_batch_size == 0)
# for this to work properly the batch_size must be divisible by sub_batch_size
num_sub_batches = batch_size // sub_batch_size
loss = 0
for sub_batch_idx in range(num_sub_batches):
start_idx = sub_batch_size * sub_batch_idx
end_idx = start_idx + sub_batch_size
sub_input = input_data[start_idx:end_idx]
sub_targets = targets[start_idx:end_idx]
sub_output = model(sub_input)
# add loss component for sub_batch
loss = loss + objective(sub_output, sub_targets) / num_sub_batches
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss_value = loss.item()
print("Loss value: ", loss_value)
print("Model checksum: ", sum([p.sum().item() for p in model.parameters()]))
Gradient accumulation
The problem with the previous approach is that in order to apply back-propagation, pytorch needs to store intermediate results of layers in memory for every sub-batch. This ends up requiring a relatively large amount of memory and you may still run into memory consumption issues.
To alleviate this problem, instead of computing a single loss and performing back-propagation once, we could perform gradient accumulation. This gives equivalent results of the previous version. The difference here is that we instead perform a backward pass on each component of
the loss, only stepping the optimizer once all of them have been backpropagated. This way the computation graph is cleared after each sub-batch which will help with memory usage. Note that this works because .backward() actually accumulates (adds) the newly computed gradients to the existing .grad member of each model parameter. This is why optimizer.zero_grad() must be called only once, before the loop, and not during or after.
# This is simpler if the sub-batch size is a factor of batch_size
sub_batch_size = 4
assert (batch_size % sub_batch_size == 0)
# for this to work properly the batch_size must be divisible by sub_batch_size
num_sub_batches = batch_size // sub_batch_size
# Important! zero the gradients before the loop
optimizer.zero_grad()
loss_value = 0.0
for sub_batch_idx in range(num_sub_batches):
start_idx = sub_batch_size * sub_batch_idx
end_idx = start_idx + sub_batch_size
sub_input = input_data[start_idx:end_idx]
sub_targets = targets[start_idx:end_idx]
sub_output = model(sub_input)
# compute loss component for sub_batch
sub_loss = objective(sub_output, sub_targets) / num_sub_batches
# accumulate gradients
sub_loss.backward()
loss_value += sub_loss.item()
optimizer.step()
print("Loss value: ", loss_value)
print("Model checksum: ", sum([p.sum().item() for p in model.parameters()]))
I think 10 iterations of batch size 4 is same as one iteration of batch size 40, only here the time taken will be more. Across different training examples losses are added before backprop. But that doesn't make the function linear. BCELoss has a log component, and hence it is not a linear function. However what you said is correct. It will be similar to batch size 40.

My speaker recognition neural network doesn’t work well

I have a final project in my first degree and I want to build a Neural Network that gonna take the first 13 mfcc coeffs of a wav file and return who talked in the audio file from a banch of talkers.
I want you to notice that:
My audio files are text independent, therefore they have different length and words
I have trained the machine on about 35 audio files of 10 speaker ( the first speaker had about 15, the second 10, and the third and fourth about 5 each )
I defined :
X=mfcc(sound_voice)
Y=zero_array + 1 in the i_th position ( where i_th position is 0 for the first speaker, 1 for the second, 2 for the third... )
And than trained the machine, and than checked the output of the machine for some files...
So that’s what I did... but unfortunately it’s look like the results are completely random...
Can you help me understand why?
This is my code in python -
from sklearn.neural_network import MLPClassifier
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
from os.path import isfile, join
from random import shuffle
import matplotlib.pyplot as plt
from tqdm import tqdm
winner = [] # this array count how much Bingo we had when we test the NN
for TestNum in tqdm(range(5)): # in every round we build NN with X,Y that out of them we check 50 after we build the NN
X = []
Y = []
onlyfiles = [f for f in listdir("FinalAudios/") if isfile(join("FinalAudios/", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# UNESSECERY TO UNDERSTAND THE CODE
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
names = list(dict.fromkeys(names)) # names of speakers
vector_names = [] # vector for each name
i = 0
vector_for_each_name = [0] * len(names)
for name in names:
vector_for_each_name[i] += 1
vector_names.append(np.array(vector_for_each_name))
vector_for_each_name[i] -= 1
i += 1
for f in onlyfiles:
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
(rate, sig) = wav.read("FinalAudios/" + f) # read the file
try:
mfcc_feat = python_speech_features.mfcc(sig, rate, winlen=0.2, nfft=512) # mfcc coeffs
for index in range(len(mfcc_feat)): # adding each mfcc coeff to X, meaning if there is 50000 coeffs than
# X will be [first coeff, second .... 50000'th coeff] and Y will be [f_speaker_vector] * 50000
X.append(np.array(mfcc_feat[index]))
Y.append(np.array(vector_names[names.index(f_speaker)]))
except IndexError:
pass
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
X = np.asarray(X)
Y = np.asarray(Y)
Y_test = Y[:50] # CHOOSE 50 FOR TEST, OTHERS FOR TRAIN
X_test = X[:50]
X = X[50:]
Y = Y[50:]
clf = MLPClassifier(solver='lbfgs', alpha=1e-2, hidden_layer_sizes=(5, 3), random_state=2) # create the NN
clf.fit(X, Y) # Train it
for sample in range(len(X_test)): # add 1 to winner array if we correct and 0 if not, than in the end it plot it
if list(clf.predict([X[sample]])[0]) == list(Y_test[sample]):
winner.append(1)
else:
winner.append(0)
# plot winner
plot_x = []
plot_y = []
for i in range(1, len(winner)):
plot_y.append(sum(winner[0:i])*1.0/len(winner[0:i]))
plot_x.append(i)
plt.plot(plot_x, plot_y)
plt.xlabel('x - axis')
# naming the y axis
plt.ylabel('y - axis')
# giving a title to my graph
plt.title('My first graph!')
# function to show the plot
plt.show()
This is my zip file that contains the code and the audio file : https://ufile.io/eggjm1gw
You have a number of issues in your code and it will be close to impossible to get it right in one go, but let's give it a try. There are two major issues:
Currently you're trying to teach your neural network with very few training examples, as few as a single one per speaker (!). It's impossible for any machine learning algorithm to learn anything.
To make matters worse, what you do is that you feed to the ANN only MFCC for the first 25 ms of each recording (25 comes from winlen parameter of python_speech_features). In each of these recordings, first 25 ms will be close to identical. Even if you had 10k recordings per speaker, with this approach you'd not get anywhere.
I will give you concrete advise, but won't do all the coding - it's your homework after all.
Use all MFCC, not just first 25 ms. Many of these should be skipped, simply because there's no voice activity. Normally there should be VOD (Voice Activity Detector) telling you which ones to take, but in this exercise I'd skip it for starter (you need to learn basics first).
Don't use dictionaries. Not only it won't fly with more than one MFCC vector per speaker, but also it's very inefficient data structure for your task. Use numpy arrays, they're much faster and memory efficient. There's a ton of tutorials, including scikit-learn that demonstrate how to use numpy in this context. In essence, you create two arrays: one with training data, second with labels. Example: if omersk speaker "produces" 50000 MFCC vectors, you will get (50000, 13) training array. Corresponding label array would be 50000 with single constant value (id) that corresponds to the speaker (say, omersk is 0, lucas is 1 and so on). I'd consider taking longer windows (perhaps 200 ms, experiment!) to reduce the variance.
Don't forget to split your data for training, validation and test. You will have more than enough data. Also, for this exercise I'd watch for not feeding too much of data for any single speaker - ot taking steps to make sure algorithm is not biased.
Later, when you make prediction, you will again compute MFCCs for the speaker. With 10 sec recording, 200 ms window and 100 ms overlap, you'll get 99 MFCC vectors, shape (99, 13). The model should run on each of the 99 vectors, for each producing probability. When you sum it (and normalise, to make it nice) and take top value, you'll get the most likely speaker.
There's a dozen of other things that typically would be taken into account, but in this case (homework) I'd focus on getting the basics right.
EDIT: I decided to take a stab at creating the model with your idea at heart, but basics fixed. It's not exactly clean Python, all because it's adapted from Jupyter Notebook I was running.
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
import glob
import os
from collections import defaultdict
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
audio_files_path = glob.glob('audio/*.wav')
win_len = 0.04 # in seconds
step = win_len / 2
nfft = 2048
mfccs_all_speakers = []
names = []
data = []
for path in audio_files_path:
fs, audio = wav.read(path)
if audio.size > 0:
mfcc = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len,
winstep=step, nfft=nfft, appendEnergy=False)
filename = os.path.splitext(os.path.basename(path))[0]
speaker = filename[:filename.find('_')]
data.append({'filename': filename,
'speaker': speaker,
'samples': mfcc.shape[0],
'mfcc': mfcc})
else:
print(f'Skipping {path} due to 0 file size')
speaker_sample_size = defaultdict(int)
for entry in data:
speaker_sample_size[entry['speaker']] += entry['samples']
person_with_fewest_samples = min(speaker_sample_size, key=speaker_sample_size.get)
print(person_with_fewest_samples)
max_accepted_samples = int(speaker_sample_size[person_with_fewest_samples] * 0.8)
print(max_accepted_samples)
training_idx = []
test_idx = []
accumulated_size = defaultdict(int)
for entry in data:
if entry['speaker'] not in accumulated_size:
training_idx.append(entry['filename'])
accumulated_size[entry['speaker']] += entry['samples']
elif accumulated_size[entry['speaker']] < max_accepted_samples:
accumulated_size[entry['speaker']] += entry['samples']
training_idx.append(entry['filename'])
X_train = []
label_train = []
X_test = []
label_test = []
for entry in data:
if entry['filename'] in training_idx:
X_train.append(entry['mfcc'])
label_train.extend([entry['speaker']] * entry['mfcc'].shape[0])
else:
X_test.append(entry['mfcc'])
label_test.extend([entry['speaker']] * entry['mfcc'].shape[0])
X_train = np.concatenate(X_train, axis=0)
X_test = np.concatenate(X_test, axis=0)
assert (X_train.shape[0] == len(label_train))
assert (X_test.shape[0] == len(label_test))
print(f'Training: {X_train.shape}')
print(f'Testing: {X_test.shape}')
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(label_train)
y_test = le.transform(label_test)
clf = MLPClassifier(solver='lbfgs', alpha=1e-2, hidden_layer_sizes=(5, 3), random_state=42, max_iter=1000)
cv_results = cross_validate(clf, X_train, y_train, cv=4)
print(cv_results)
{'fit_time': array([3.33842635, 4.25872731, 4.73704267, 5.9454329 ]),
'score_time': array([0.00125694, 0.00073504, 0.00074005, 0.00078583]),
'test_score': array([0.40380048, 0.52969121, 0.48448687, 0.46043165])}
The test_score isn't stellar. There's a lot to improve (for starter, choice of algorithm), but the basics are there. Notice for starter how I get the training samples. It's not random, I only consider recordings as whole. You can't put samples from a given recording to both training and test, as test is supposed to be novel.
What was not working in your code? I'd say a lot. You were taking 200ms samples and yet very short fft. python_speech_features likely complained to you that the fft is should be longer than the frame you're processing.
I leave to you testing the model. It won't be good, but it's a starter.

Object detection From videos in openCV (int() argument must be a string, a bytes-like object or a number, not 'NoneType')

I am using python 3.5 in Ubuntu 16.04
https://www.learnopencv.com/install-opencv3-on-ubuntu/
used this link to download opencv3
File "<ipython-input-12-e1defa92c813>", line 1, in <module>
runfile('/home/abhishek/models/research/object_detection/Video_detection.py', wdir='/home/abhishek/models/research/object_detection')
File "/home/abhishek/anaconda3/lib/python3.5/site-packages/spyder/utils/site/sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "/home/abhishek/anaconda3/lib/python3.5/site-packages/spyder/utils/site/sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/home/abhishek/models/research/object_detection/Video_detection.py", line 139, in <module>
feed_dict={image_tensor: image_np_expanded})
File "/home/abhishek/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/home/abhishek/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1093, in _run
np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
File "/home/abhishek/anaconda3/lib/python3.5/site-packages/numpy/core/numeric.py", line 531, in asarray
return array(a, dtype, copy=False, order=order)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Python Programming Video Detection Tutorial #2
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
import cv2
cap = cv2.VideoCapture(0)
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
sys.path.append(sys.executable)
# ## Object detection imports
# Here are the imports from the object detection module.
# In[3]:
from utils import label_map_util
from utils import visualization_utils as vis_util
# # Model preparation
# ## Variables
#
# Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_CKPT` to point to a new .pb file.
#
# By default we use an "SSD with Mobilenet" model here. See the [detection model zoo](https://github.com/tensorflow/models/blob/master/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies.
# In[4]:
# What model to download.
MODEL_NAME = 'ssd_mobilenet_v1_coco_11_06_2017'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')
NUM_CLASSES = 90
print(12)
# ## Download Model
# In[5]:
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
file_name = os.path.basename(file.name)
if 'frozen_inference_graph.pb' in file_name:
tar_file.extract(file, os.getcwd())
print(13)
# ## Load a (frozen) Tensorflow model into memory.
# In[6]:
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
print(14)
# ## Loading label map
# Label maps map indices to category names, so that when our convolution network predicts `5`, we know that this corresponds to `airplane`. Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine
# In[7]:
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
print(15)
# ## Helper code
# In[8]:
def load_image_into_numpy_array(image):
(im_width, im_height) = image.size
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
# # Detection
# In[9]:
# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = 'test_images'
TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 3) ]
# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)
# In[10]:
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
while True:
ret, image_np = cap.read()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Actual detection.
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8)
cv2.imshow('object detection', cv2.resize(image_np, (800,600)))
if cv2.waitKey(25) & 0xFF == ord('q'):
cv2.destroyAllWindows()
break
>
Please help me to proceed.I have already added my code which mostly copied from https://pythonprogramming.net/video-tensorflow-object-detection-api-tutorial/
I had the same problem.
Firstly crosscheck if your opencv has been correctly installed.
and before you take the source for object detection from your webcam try it with stock photos and check if it works.
Later upgrade your opencv to opencv3.
conda install opencv3
If problem still persists then check your webcam for input issuses.
Nonetype is only returned when no frames are being captured by your webcam.
Had same problem, file does'nt not exist when it is picture, maybe could not transform in picture when come from video or cam. check your input

Weights from Conv Layer when applied to image layer gives saturated output

I am visualizing my first layer output with image_layer when applied with trained weight. However, when I try to visualize, I get white images as follows:
Ignore the last four, the filters are of size 7x7 and there are 32 of them.
The model is built on the following architecture (Code Attached):
import numpy as np
import tensorflow as tf
import cv2
from matplotlib import pyplot as plt
% matplotlib inline
model_path = "T_set_4/Model/model.ckpt"
# Define the model parameters
# Convolutional Layer 1.
filter_size1 = 7 # Convolution filters are 7 x 7 pixels.
num_filters1 = 32 # There are 32 of these filters.
# Convolutional Layer 2.
filter_size2 = 7 # Convolution filters are 7 x 7 pixels.
num_filters2 = 64 # There are 64 of these filters.
# Fully-connected layer.
fc_size = 512 # Number of neurons in fully-connected layer.
# Define the data dimensions
# We know that MNIST images are 48 pixels in each dimension.
img_size = 48
# Images are stored in one-dimensional arrays of this length.
img_size_flat = img_size * img_size
# Tuple with height and width of images used to reshape arrays.
img_shape = (img_size, img_size)
# Number of colour channels for the images: 1 channel for gray-scale.
num_channels = 1
# Number of classes, one class for each of 10 digits.
num_classes = 2
def new_weights(shape):
return tf.Variable(tf.truncated_normal(shape, stddev=0.05))
def new_biases(length):
return tf.Variable(tf.constant(0.05, shape=[length]))
def new_conv_layer(input, # The previous layer.
num_input_channels, # Num. channels in prev. layer.
filter_size, # Width and height of each filter.
num_filters, # Number of filters.
use_pooling=True): # Use 2x2 max-pooling.
# Shape of the filter-weights for the convolution.
# This format is determined by the TensorFlow API.
shape = [filter_size, filter_size, num_input_channels, num_filters]
# Create new weights aka. filters with the given shape.
weights = new_weights(shape=shape)
# Create new biases, one for each filter.
biases = new_biases(length=num_filters)
# Create the TensorFlow operation for convolution.
# Note the strides are set to 1 in all dimensions.
# The first and last stride must always be 1,
# because the first is for the image-number and
# the last is for the input-channel.
# But e.g. strides=[1, 2, 2, 1] would mean that the filter
# is moved 2 pixels across the x- and y-axis of the image.
# The padding is set to 'SAME' which means the input image
# is padded with zeroes so the size of the output is the same.
layer = tf.nn.conv2d(input=input,
filter=weights,
strides=[1, 1, 1, 1],
padding='SAME')
# Add the biases to the results of the convolution.
# A bias-value is added to each filter-channel.
layer += biases
# Rectified Linear Unit (ReLU).
# It calculates max(x, 0) for each input pixel x.
# This adds some non-linearity to the formula and allows us
# to learn more complicated functions.
layer = tf.nn.relu(layer)
# Use pooling to down-sample the image resolution?
if use_pooling:
# This is 2x2 max-pooling, which means that we
# consider 2x2 windows and select the largest value
# in each window. Then we move 2 pixels to the next window.
layer = tf.nn.max_pool(value=layer,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME')
# norm1
norm1 = tf.nn.lrn(layer, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
name='norm1')
# Note that ReLU is normally executed before the pooling,
# but since relu(max_pool(x)) == max_pool(relu(x)) we can
# save 75% of the relu-operations by max-pooling first.
# We return both the resulting layer and the filter-weights
# because we will plot the weights later.
return layer, weights
def flatten_layer(layer):
# Get the shape of the input layer.
layer_shape = layer.get_shape()
# The shape of the input layer is assumed to be:
# layer_shape == [num_images, img_height, img_width, num_channels]
# The number of features is: img_height * img_width * num_channels
# We can use a function from TensorFlow to calculate this.
num_features = layer_shape[1:4].num_elements()
# Reshape the layer to [num_images, num_features].
# Note that we just set the size of the second dimension
# to num_features and the size of the first dimension to -1
# which means the size in that dimension is calculated
# so the total size of the tensor is unchanged from the reshaping.
layer_flat = tf.reshape(layer, [-1, num_features])
# The shape of the flattened layer is now:
# [num_images, img_height * img_width * num_channels]
# Return both the flattened layer and the number of features.
return layer_flat, num_features
def new_fc_layer(input, # The previous layer.
num_inputs, # Num. inputs from prev. layer.
num_outputs, # Num. outputs.
use_relu=True): # Use Rectified Linear Unit (ReLU)?
# Create new weights and biases.
weights = new_weights(shape=[num_inputs, num_outputs])
biases = new_biases(length=num_outputs)
# Calculate the layer as the matrix multiplication of
# the input and weights, and then add the bias-values.
layer = tf.matmul(input, weights) + biases
# Use ReLU?
if use_relu:
layer = tf.nn.relu(layer)
return layer
# Create the model
tf.reset_default_graph()
x = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='x')
x_image = tf.reshape(x, [-1, img_size, img_size, num_channels])
y_true = tf.placeholder(tf.float32, shape=[None, num_classes],
name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)
# Create the model footprint
layer_conv1, weights_conv1 = new_conv_layer(input=x_image,
num_input_channels=num_channels,
filter_size=filter_size1,
num_filters=num_filters1,
use_pooling=True)
layer_conv2, weights_conv2 = new_conv_layer(input=layer_conv1,
num_input_channels=num_filters1,
filter_size=filter_size2,
num_filters=num_filters2,
use_pooling=True)
layer_flat, num_features = flatten_layer(layer_conv2)
layer_fc1 = new_fc_layer(input=layer_flat,
num_inputs=num_features,
num_outputs=fc_size,
use_relu=True)
layer_fc2 = new_fc_layer(input=layer_fc1,
num_inputs=fc_size,
num_outputs=num_classes,
use_relu=False)
y_pred = tf.nn.softmax(layer_fc2)
y_pred_cls = tf.argmax(y_pred, dimension=1)
# Restore the model
saver = tf.train.Saver()
session = tf.Session()
saver.restore(session, model_path)
The code I followed to create visualized weights is from the following:
Source code
Can someone tell me is the training or network is too shallow?
This is a perfectly fine visualization of the feature maps (not the weights) produced by the first convolutional layer on the early stages of the training.
The first layers learn to extract simple features, the learning process is somehow slow and thus you first learn to "blur" the input images, but once the network starts to converge you'll see that the first layers will start extracting meaningful low-level features (edges and so on).
Just monitor the training process and let the network training a bit more.
If, instead, you got bad performance (always look at the validation accuracy) your feature maps will always look noisy and you should start tuning the hyperparameters (lowering the learning rate, regularize, ...) in order to extract meaningful features and thus get good results

Labeling Images using Inception Getting ValueError: GraphDef cannot be larger than 2GB

I am using the TensorFlow for Poets code lab to guide me as I retrain the Inceptionv3 CNN to classify a list of images. I have successfully trained the model, and it works when i employ the given code to classify individual images. But when i try and use it on a large batch of images, then i get the GraphDef cannot be larger than 2GB. Please advise.
import pandas as pd
import os, sys
import tensorflow as tf
test_images = pd.read_csv('test_images.csv')
testid = test_images['Id']
listx= list(range(4320))
predlist=[]
output = pd.DataFrame({'Id': listx})
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
for x in listx:
path = 'test/'+str(x+1)+'.jpg'
# change this as you see fit
image_path = path
# Read in the image_data
image_data = tf.gfile.FastGFile(image_path, 'rb').read()
# Loads label file, strips off carriage return
label_lines = [line.rstrip() for line
in tf.gfile.GFile("retrained_labels.txt")]
# Unpersists graph from file
with tf.gfile.FastGFile("retrained_graph.pb", 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')
with tf.Session() as sess:
# Feed the image_data as input to the graph and get first prediction
with tf.Graph().as_default():
softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
predictions = sess.run(softmax_tensor, \
{'DecodeJpeg/contents:0': image_data})
# Sort to show labels of first prediction in order of confidence
top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
# print('the top result is' + label_lines[node_id])
flag = 0
for node_id in top_k:
while flag == 0:
human_string = label_lines[node_id]
score = predictions[0][node_id]
predlist.append(int(human_string[:3]))
print('%s' % (human_string))
flag = 1 # we only want the top prediction
output['Prediction']=predlist
output.to_csv('outputtest.csv')
One way by which this error can e solved is by placing
with tf.Graph().as_default():
after for loop.
This is the piece of code that worked for me while trying to read bulk image:
for filename in os.listdir(image_path):
with tf.Graph().as_default():
# Read in the image_data
image_data = tf.gfile.FastGFile(image_path + filename, 'rb').read()

Resources