How to export bounding boxes as .jpg - opencv

For my project I want to save the bounding boxes found by the TensorFlow Object Detection API as separate .jpg files, so that I can feed them into another CNN for further classification.
Here is my code (derived from EdjeElectronics GitHub):
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = '_model_ssd_v2'
IMAGE_NAME = 'image.jpg'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'_data','label_map.pbtxt')
# Path to image
PATH_TO_IMAGE = os.path.join(CWD_PATH,"_images",IMAGE_NAME)
# Number of classes the object detector can identify
NUM_CLASSES = 6
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Load the image with OpenCV. cv2.imread does not raise on failure; it
# returns None, so fail early with a readable message.
image = cv2.imread(PATH_TO_IMAGE)
if image is None:
    raise IOError("Could not read image: {}".format(PATH_TO_IMAGE))
# OpenCV stores pixels in BGR order. Feed an RGB copy to the detector and keep
# `image` (BGR) untouched for drawing and cv2.imwrite.
# NOTE(review): assumes the frozen graph expects RGB input - confirm for a
# custom-trained model.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Add a leading batch axis so the array has shape [1, height, width, 3]
image_expanded = np.expand_dims(image_rgb, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) = sess.run(
    [detection_boxes, detection_scores, detection_classes, num_detections],
    feed_dict={image_tensor: image_expanded})
# Draw the results of the detection (aka 'visulaize the results')
vis_util.visualize_boxes_and_labels_on_image_array(
image,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8,
min_score_thresh=0.3)
# All the results have been drawn on image. Now display the image.
# cv2.imshow('Object detector', cv2.resize(image, (int(2592/2),int(1944/2))))
# # Press any key to close the image
# cv2.waitKey(0)
# # Clean up
# cv2.destroyAllWindows()
cv2.imwrite("C:/tensorflow/models/research/object_detection/_images/test1.jpg", image)
A similar question was asked here, but I don't know how to apply it to the TensorFlow Object Detection API.
Thank You!

I've found the function draw_bounding_boxes_on_image in the vis_util. Try this:
#create a white back ground image with the same shape as image
white_bg_img = 255*np.ones(image.shape, np.uint8)
vis_util.draw_bounding_boxes_on_image(
white_bg_img ,
np.squeeze(boxes),
color='red',
thickness=4)
cv2.imwrite("bounding_boxes.jpg", white_bg_img )
To save the image regions inside the bounding boxes:
# Crop every detected box out of `image` and save each crop as its own file.
# The detector output is used with normalized coordinates, so every
# [ymin, xmin, ymax, xmax] row is scaled to pixel units before slicing.
height, width = image.shape[:2]
# Drop the batch axis; reshape keeps a 2-D (n, 4) layout even for one box.
boxes = np.squeeze(boxes).reshape(-1, 4)
for i, (ymin, xmin, ymax, xmax) in enumerate(boxes):
    top, bottom = int(ymin * height), int(ymax * height)
    left, right = int(xmin * width), int(xmax * width)
    roi = image[top:bottom, left:right].copy()
    if roi.size == 0:
        # Zero-area boxes produce an empty crop, which cv2.imwrite cannot encode.
        continue
    cv2.imwrite("box_{}.jpg".format(i), roi)
The saved files will be named box_0.jpg, box_1.jpg, ...

I followed this link and it worked. Add the following code:
# Save a crop for every detection whose score exceeds the threshold.
min_score_thresh = 0.60
# Pixel dimensions of the source image, needed to de-normalize the boxes.
height, width = image.shape[:2]
# boxes[0] / scores[0] select the single image in the batch.
true_boxes = boxes[0][scores[0] > min_score_thresh]
for i, (ymin, xmin, ymax, xmax) in enumerate(true_boxes):
    ymin = int(ymin * height)
    xmin = int(xmin * width)
    ymax = int(ymax * height)
    xmax = int(xmax * width)
    roi = image[ymin:ymax, xmin:xmax].copy()
    cv2.imwrite("box_{}.jpg".format(i), roi)
Make sure you define true height and width of image.

This will work:
# Convert each normalized detection box to pixel coordinates and crop it.
box = np.squeeze(boxes)
height, width = image.shape[:2]
# Iterate over the squeezed array: `boxes` still carries the batch axis, so
# len(boxes) would be 1 and only the first box would be processed.
for i in range(len(box)):
    ymin = int(box[i, 0] * height)
    xmin = int(box[i, 1] * width)
    ymax = int(box[i, 2] * height)
    xmax = int(box[i, 3] * width)
    print(ymin, xmin, ymax, xmax)
    # `roi` holds the crop for the current box; save or collect it here if
    # every crop is needed, since it is overwritten on the next iteration.
    roi = image[ymin:ymax, xmin:xmax].copy()

Related

Is it common for sift.compute to eliminate almost all key points generated by sift.detect?

I am trying to align multispectral drone images using OpenCV. When I try to use a homography to align the images, I get an error stating that I need at least 4 matching points. I went back, split the sift.detectAndCompute call into two separate calls, and printed the number of detected points after each one. After sift.detect I had over 100,000 points; after sift.compute that number dropped to just two. Is there a way to make it less restrictive?
Ive included my code below in case that helps.
import cv2
import numpy as np
import os
def final_align(file_paths):
    """Translate each image by the offset between its optical center and its geometric center.

    The 5th character from the end of every path selects the entry of the
    calibrated optical-center table (e.g. ``..._0011.TIF`` -> ``"1"``).

    Args:
        file_paths: iterable of image file paths.

    Returns:
        List of translated images, in the same order as ``file_paths``.

    Raises:
        KeyError: if the id character has no calibrated optical center.
        IOError: if an image cannot be read.
    """
    # Calibrated optical centers (x, y), keyed by the id character in the file name
    calibrated_optical_centers = {
        "1": (834.056702, 643.766418),
        "2": (836.952271, 631.696899),
        "3": (832.183411, 642.485901),
        "4": (795.311279, 680.615906),
        "5": (807.490295, 685.338379),
    }
    aligned_images = []
    for file_path in file_paths:
        # Get the 5th from last character in the file path
        image_id = file_path[-5]
        optical_x, optical_y = calibrated_optical_centers[image_id]
        # Each file is read exactly once. With default flags cv2.imread
        # returns an 8-bit, 3-channel BGR image.
        # NOTE(review): if the TIFs are 16-bit, this drops precision - confirm.
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising
            raise IOError("Could not read image: {}".format(file_path))
        height, width = image.shape[:2]
        # Offset of the optical center from the geometric center of the image
        shift_x = float(optical_x - width // 2)
        shift_y = float(optical_y - height // 2)
        # NOTE(review): this matrix moves the content by +shift; confirm that is
        # the intended direction (centering the optical axis would need -shift).
        M = np.float32([[1, 0, shift_x], [0, 1, shift_y]])
        aligned_images.append(cv2.warpAffine(image, M, (width, height)))
    return aligned_images
# Input images, one per spectral band. Raw strings keep the Windows
# backslashes literal without relying on invalid escape sequences such as
# "\S" or "\M", which Python only tolerates with a warning.
file_paths = [
    r"G:\Shared\Mulitband\flights\flight_1\2611_DJI_0011.TIF",
    r"G:\Shared\Mulitband\flights\flight_1\2612_DJI_0012.TIF",
    r"G:\Shared\Mulitband\flights\flight_1\2613_DJI_0013.TIF",
    r"G:\Shared\Mulitband\flights\flight_1\2614_DJI_0014.TIF",
    r"G:\Shared\Mulitband\flights\flight_1\2615_DJI_0015.TIF",
]
# Call the final_align function
final_aligned_images = final_align(file_paths)
# Get the center of the first image
height, width = final_aligned_images[0].shape[:2]
center_y = height // 2
center_x = width // 2
# Specify the crop size in the y and x direction
crop_y = 1220
crop_x = 1520
#crop function
def crop_images(final_aligned_images, center_y, center_x, crop_y, crop_x):
    """Cut the same window, centered on (center_y, center_x), out of every image.

    The window spans ``crop // 2`` pixels on each side of the center plus the
    center pixel itself, i.e. ``2 * (crop // 2) + 1`` pixels per axis where the
    image is large enough.

    Args:
        final_aligned_images: iterable of arrays indexed as [row, col, ...].
        center_y, center_x: window center in pixel coordinates.
        crop_y, crop_x: nominal window size along y and x.

    Returns:
        List of cropped views, one per input image.
    """
    # The bounds are identical for every image, so compute them once.
    # Clamp at 0: a negative start would be read by NumPy as "count from the
    # end" and silently yield an empty or wrong window.
    start_y = max(0, center_y - crop_y // 2)
    end_y = max(0, center_y + crop_y // 2 + 1)
    start_x = max(0, center_x - crop_x // 2)
    end_x = max(0, center_x + crop_x // 2 + 1)
    return [image[start_y:end_y, start_x:end_x] for image in final_aligned_images]
cropped_images = crop_images(final_aligned_images, center_y, center_x, crop_y, crop_x)
#print(cropped_images)
# Create the Results/aligned directory once, before writing any file
aligned_dir = r"G:\Shared\Mulitband\Results\aligned"
os.makedirs(aligned_dir, exist_ok=True)
for i, final_complete_image in enumerate(cropped_images):
    # Construct the file path for the aligned image
    final_aligned_image_path = r"{}\aligned_{}.tif".format(aligned_dir, i)
    # Save the final aligned image to the file path
    cv2.imwrite(final_aligned_image_path, final_complete_image)
"""
# TEST OF FUNCTION
img = cropped_images[1]
# Call the sift_align function
sift = cv2.xfeatures2d.SIFT_create()
kp = sift.detect(cropped_images[1], None)
img=cv2.drawKeypoints(cropped_images[1] ,
kp ,
img,
flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv2.imwrite('G:\Shared\Mulitband\Results\\aligned\image-with-keypoints.jpg', img)"""
#Create the SIFT Function
def sift_align(cropped_images, max_matches=50):
    """Register every image onto the first one using SIFT features and a homography.

    Args:
        cropped_images: sequence of images; element 0 is the reference.
        max_matches: number of best (lowest-distance) matches used to estimate
            each homography.

    Returns:
        Array stacking the warped non-reference images along a new last axis.
        The reference image itself is not included.

    Raises:
        ValueError: if fewer than two images are given, if an image yields no
            descriptors, if fewer than 4 matches are found, or if no
            homography can be estimated.
    """
    if len(cropped_images) < 2:
        raise ValueError("sift_align needs a reference image and at least one image to align")

    sift = cv2.SIFT_create()
    # BFMatcher defaults to the L2 norm, which is the one to use with SIFT descriptors
    matcher = cv2.BFMatcher()

    reference_image = cropped_images[0]
    # NOTE(review): SIFT expects 8-bit input; if the crops are 16-bit, rescale
    # them first (e.g. cv2.convertScaleAbs(img, alpha=255.0 / 65535.0)) - confirm.
    # detectAndCompute returns (keypoints, descriptors). Taking len() of that
    # 2-tuple is what made the keypoint count look like it had dropped to 2.
    reference_keypoints, reference_descriptors = sift.detectAndCompute(reference_image, None)
    print("Number of keypoints in reference image:", len(reference_keypoints))
    if reference_descriptors is None:
        raise ValueError("No SIFT descriptors found in the reference image")

    output_size = (reference_image.shape[1], reference_image.shape[0])
    aligned_images = []
    for i, image in enumerate(cropped_images[1:], start=1):
        image_keypoints, image_descriptors = sift.detectAndCompute(image, None)
        if image_descriptors is None:
            raise ValueError("No SIFT descriptors found in image {}".format(i))

        # query = reference, train = current image
        matches = matcher.match(reference_descriptors, image_descriptors)
        matches = sorted(matches, key=lambda m: m.distance)[:max_matches]
        if len(matches) < 4:
            # A homography has 8 degrees of freedom: 4 point pairs minimum
            raise ValueError(
                "Only {} matches between the reference and image {}; need at least 4".format(len(matches), i))

        reference_pts = np.float32(
            [reference_keypoints[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
        image_pts = np.float32(
            [image_keypoints[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)

        # warpPerspective needs the transform from the current image INTO the
        # reference frame, so the current image's points are the source.
        homography, mask = cv2.findHomography(image_pts, reference_pts, cv2.RANSAC, 5.0)
        if homography is None:
            raise ValueError("Could not estimate a homography for image {}".format(i))

        aligned_images.append(cv2.warpPerspective(image, homography, output_size))

    # Stack the aligned images along a new last dimension
    return np.stack(aligned_images, axis=-1)
final_complete_images = sift_align(cropped_images)
"""# Save the final aligned images to the Results/aligned directory
for i, final_complete_image in enumerate(final_complete_images):
# Create the Results/aligned directory if it doesn't exist
os.makedirs("G:\Shared\Mulitband\Results\\aligned", exist_ok=True)
# Construct the file path for the aligned image
final_aligned_image_path = "G:\Shared\Mulitband\Results\\aligned\\aligned_{}.tif".format(i)
# Save the final aligned image to the file path
cv2.imwrite(final_aligned_image_path, final_complete_image)"""

Why my feature map seems incorrect when the prediction of the class is correct

from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)])
image_path = './data/test_images/anemone.jpg'
image = Image.open(image_path).convert('RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)
# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
resnet50_model.eval()
# Creating feature extractor (Detailed example here: https://pytorch.org/blog/FX-feature-extraction-torchvision/)
feature_extractor = create_feature_extractor(resnet50_model,
return_nodes=['layer4.2.conv3', 'fc'])
# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])
# Move the channel axis last: (C, H, W) -> (H, W, C). This must be an axis
# permutation; a reshape keeps the memory order and would scramble channels
# with spatial positions.
last_conv_output = torch.squeeze(out['layer4.2.conv3'], 0)
last_conv_output = last_conv_output.permute(1, 2, 0).contiguous()
# Keep the activations as floats: casting to uint8 truncates fractions and
# wraps negative values, which destroys the feature map.
last_conv_output = last_conv_output.detach().numpy()
# Getting the shape of the last conv output (rows, columns, channels)
last_conv_h, last_conv_w, n_channels = last_conv_output.shape
# Calculating the upscale factors for last conv output.
# PIL's image.size is (width, height).
width_factor = int(image.size[0] / last_conv_w)
height_factor = int(image.size[1] / last_conv_h)
# Size of the upscaled feature map
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor
# Upscaling the last_conv_output so that it could be "masked" with original image.
# OpenCV arrays are limited to 512 channels, so resize in chunks of 512.
upsampled_chunks = []
for x in range(0, n_channels, 512):
    chunk = np.ascontiguousarray(last_conv_output[:, :, x:x + 512])
    # interpolation must be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, not the interpolation flag.
    upsampled_chunks.append(
        cv2.resize(chunk, (upscaled_w, upscaled_h), interpolation=cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_chunks, axis=2)
# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]
# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, n_channels))
heat_map = np.dot(upsampled_last_conv_output,
                  last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
# Plotting the results
fig, ax = plt.subplots()
ax.imshow(image)
ax.imshow(heat_map, cmap='jet', alpha=0.5)
ax.set_title(prediction)
I have followed the tutorial from here: https://www.youtube.com/watch?v=GiyldmoYe_M&t=665s&ab_channel=DigitalSreeni
The main problem with this is that I get the feature map that looks like this:
As you see it looks like the model reacts to multiple areas on the image and no matter what image I use it always has the biggest reaction in the middle.
PS. If you think this question should be posted on the AI stack exchange please notify me
I have found an error I made. It was that after creating a
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the input image, the generated heat map also needed rescaling: I had to min-max normalize it to the range [0, 1] before plotting it over the original image.

keras neural network predicts the same number for every handwritten digit

I am new to machine learning so as a first project I've tried to built a handwritten digit recognition neural network based on the mnist dataset and when I test it with the test images provided by the data set itself it seems to work pretty well (that's what the function test_predict is for). Now I would like to step it up and have the network recognise some actual handwritten digits that I've taken photos of.
The function partial_img_rec takes on an image containing multiple digits and it will be called by multiple_digits. I know it might seem weird that I use recursion here and I'm sure there are some more efficient ways to do this but that's not the matter. In order to test partial_img_rec I provide some photos of individual digits that are stored in the folder .\individual_test and they all look something like this:
The problem is: My neural network's prediction for every single one of my test images is "5". The probability is always about 22% no matter the actual digit displayed. I totally get why the results are not as great as those achieved with the mnist dataset's test images but I certainly didn't expect this. Do you have any idea why this is happening? Any advise is welcome.
Thank you in advance.
Here's my code (edited, now working):
# import keras and the MNIST dataset
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import np_utils
# numpy is necessary since keras uses numpy arrays
import numpy as np
# imports for pictures
from PIL import Image
from PIL import ImageOps
# imports for tests
import random
import os
class mnist_network():
    """Dense MNIST digit classifier plus helpers for testing it on custom photos."""

    def __init__(self):
        """Load the MNIST data, then create and train the model."""
        # Imported here because `keras.utils.np_utils` is not available in
        # recent Keras releases; `to_categorical` is the supported equivalent.
        from tensorflow.keras.utils import to_categorical

        # load data
        (X_train, y_train), (X_test, y_test) = mnist.load_data()
        # flatten 28*28 images to a 784 vector for each image
        num_pixels = X_train.shape[1] * X_train.shape[2]
        X_train = X_train.reshape((X_train.shape[0], num_pixels)).astype('float32')
        X_test = X_test.reshape((X_test.shape[0], num_pixels)).astype('float32')
        # normalize inputs from 0-255 to 0-1
        X_train = X_train / 255
        X_test = X_test / 255
        # one hot encode outputs
        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)
        num_classes = y_test.shape[1]
        # create model: one hidden layer as wide as the input, softmax output
        self.model = Sequential()
        self.model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
        self.model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
        # Compile model
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        # train the model
        self.model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=200, verbose=2)
        self.train_img = X_train
        self.train_res = y_train
        self.test_img = X_test
        self.test_res = y_test

    def test_all(self):
        """Evaluate the model on all the test data and print the error rate."""
        scores = self.model.evaluate(self.test_img, self.test_res, verbose=0)
        print("Baseline Error: %.2f%%" % (100-scores[1]*100))

    def predict_result(self, img, num_pixels=None, show=False):
        """Predict the digit shown in a flattened 28x28 picture.

        Args:
            img: numpy vector of shape (784,).
            num_pixels: ignored; kept for backward compatibility.
            show: ignored; kept for backward compatibility.

        Returns:
            Tuple (predicted digit, list of class probabilities).
        """
        assert isinstance(img, np.ndarray) and img.shape == (784,)
        num_pixels = img.shape[0]
        # Run the network once and derive both results from the same output
        res_probabilities = self.model.predict(img.reshape(-1, num_pixels))
        res_number = np.argmax(res_probabilities, axis=1)
        # the batch holds a single picture, so only the first row is needed
        return (res_number[0], res_probabilities.tolist()[0])

    def test_predict(self, amount_test=100):
        """Test some random pictures from the test part of the data set."""
        assert isinstance(amount_test, int) and amount_test <= 10000
        cnt_right = 0
        n_available = len(self.test_img)
        for _ in range(amount_test):
            ind = random.randrange(0, n_available)
            # The label is one-hot encoded, so the position of its maximum is
            # the digit.
            correct_res = int(np.argmax(self.test_res[ind]))
            predicted_res = self.predict_result(self.test_img[ind])[0]
            if correct_res != predicted_res:
                print("Error in predict ! index = ", ind, " predicted result = ", predicted_res, " correct result = ", correct_res)
            else:
                cnt_right += 1
        # Guard the rate against amount_test == 0
        success_rate = (cnt_right/amount_test)*100 if amount_test else 0.0
        print("The machine predicted correctly ",cnt_right," out of ",amount_test," examples. That is a success rate of ", success_rate,"%.")

    def partial_img_rec(self, image, upper_left, lower_right, results=None):
        """Recognise digits in a window that slides from left to right over `image`.

        The window given by `upper_left` / `lower_right` is classified, then
        the method calls itself with the window moved to the right until the
        window passes the right edge of the image.

        Args:
            image: PIL image (greyscale).
            upper_left: (x, y) of the window's upper-left corner.
            lower_right: (x, y) of the window's lower-right corner.
            results: list collecting the recognised digits. A fresh list is
                created when omitted; a passed list is extended in place.

        Returns:
            The list of recognised digits.
        """
        # A `results=[]` default would be shared between independent calls
        if results is None:
            results = []
        left_x, left_y = upper_left
        right_x, right_y = lower_right
        print("current test part: ", upper_left, lower_right)
        print("results: ", results)
        # condition to stop recursion: we've reached the full width of the picture
        width, height = image.size
        if right_x > width:
            return results
        partial = image.crop((left_x, left_y, right_x, right_y))
        # rescale image to 28 * 28 dimension (LANCZOS is the filter formerly
        # exposed as Image.ANTIALIAS, which Pillow 10 removed)
        partial = partial.resize((28, 28), Image.LANCZOS)
        partial.show()
        # transform to vector; invert because the network was trained on
        # bright digits on a black (0-valued) background
        partial = ImageOps.invert(partial)
        partial = np.asarray(partial, "float32")
        partial = partial / 255.
        # suppress faint background noise
        partial[partial < 0.5] = 0.
        # flatten image to 28*28 = 784 vector
        num_pixels = partial.shape[0] * partial.shape[1]
        partial = partial.reshape(num_pixels)
        # is there a number in this part of the image?
        res, prop = self.predict_result(partial)
        print("result: ", res, ". probabilities: ", prop)
        # only count this result if the network is >= 50% sure
        if prop[res] >= 0.5:
            results.append(res)
            # step is 80% of the partial image's size (which is equivalent to the original image's height)
            step = int(height * 0.8)
            print("found valid result")
        else:
            # if there is no number found we take smaller steps
            step = height // 20
        # Always advance by at least one pixel, otherwise very small images
        # would recurse forever on the same window
        step = max(1, step)
        print("step: ", step)
        # recursive call with modified positions ( move on step variables )
        return self.partial_img_rec(image, (left_x+step, left_y), (right_x+step, right_y), results=results)

    def test_individual_digits(self):
        """Test partial_img_rec with individual digits (square shaped images).

        The images are read from the folder 'individual_test' and must follow
        the naming pattern 'number_digit.jpg': the first character of the file
        name is the expected digit.
        """
        cnt_right, cnt_wrong = 0, 0
        folder = ".\\individual_test"
        for imageName in os.listdir(folder):
            # image file must be a jpg or png
            assert imageName.endswith((".jpg", ".png"))
            correct_res = int(imageName[0])
            image = Image.open(folder + "\\" + imageName).convert("L")
            # only square images in this test
            if image.size[0] != image.size[1]:
                print(imageName, " has the wrong proportions: ", image.size,". It has to be a square.")
                continue
            predicted_res = self.partial_img_rec(image, (0,0), (image.size[0], image.size[1]), results=[])
            if predicted_res == []:
                print("No prediction possible for ", imageName)
            else:
                predicted_res = predicted_res[0]
            # an image without any prediction is counted as wrong
            if predicted_res != correct_res:
                print("error in partial_img-rec! Predicted ", predicted_res, ". The correct result would have been ", correct_res)
                cnt_wrong += 1
            else:
                cnt_right += 1
                print("correctly predicted ",imageName)
        total = cnt_right + cnt_wrong
        if total == 0:
            # avoid dividing by zero when the folder holds no usable image
            print("No test images were evaluated.")
            return
        print(cnt_right, " out of ", total," digits were correctly recognised. The success rate is therefore ", (cnt_right / total) * 100," %.")

    def multiple_digits(self, img):
        """Recognise several digits in one image and return them as a string.

        Takes as input an image without unnecessary whitespace surrounding the
        digits.
        """
        width, height = img.size
        # start with the first quadratic part of the image
        res_list = self.partial_img_rec(img, (0,0), (height, height))
        return "".join(str(elem) for elem in res_list)
network = mnist_network()
network.test_individual_digits()
EDIT
@Geecode's answer was very helpful, and the network now correctly predicts some of the pictures, including the one shown above. However, the overall success rate is still lower than 50%. Do you have any ideas on how to improve this?
Examples for images returning bad results:
Nothing wrong with your image in itself, your model can correctly classify it.
The issue is that you made a Floor Division on your partial:
partial = partial // 255
which always results in 0. So you always get a black image.
You have to do a "normal" division and some preparation, because your model was trained on black i.e. 0. valued pixel backgrounded negative images:
# transform to vector
partial = ImageOps.invert(partial)
partial = np.asarray(partial, "float32")
partial = partial / 255.
partial[partial < 0.5] = 0.
After then your model will classify correctly:
Out:
result: 1 . probabilities: [0.000431705528171733, 0.7594985961914062, 0.0011404436081647873, 0.00018972357793245465, 0.03162384033203125, 0.008697531186044216, 0.0014472954208031297, 0.18429973721504211, 0.006838776171207428, 0.005832481198012829]
found valid result
Note, that of course you can play on the image preparation yet, that was not the purpose of this answer.
Update:
For my detailed answer on how to achieve better performance in this task, see here.

How to count vehicles using opencv in python?

I am working on a VCS (vehicle counting system) project. The scope of the project is to classify and count vehicles. I have built a custom model using Faster R-CNN with the TensorFlow Object Detection API. The model contains only 7 classes, such as car, motorbike and bicycle. The model works perfectly, but the problem is counting: it is very hard to count vehicles in a video frame. I researched this online and tried a lot, but I could not find any useful information. There are some projects on GitHub that use tracking methods.
What I want is the following: draw a horizontal line in the frame and count a vehicle when it touches that line. I don't know the algorithm behind this. I have heard that centroid tracking would help.
So my question is: how do I count vehicles when they touch the horizontal line? I have linked a sample image below.
Sample_Image
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
VIDEO_NAME = 'Video_105.mp4'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'training','labelmap.pbtxt')
# Path to video
PATH_TO_VIDEO = os.path.join(CWD_PATH,VIDEO_NAME)
# Number of classes the object detector can identify
NUM_CLASSES = 7
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Open video file
video = cv2.VideoCapture(PATH_TO_VIDEO)
# Process the video frame by frame until it ends or 'q' is pressed
while video.isOpened():
    ret, frame = video.read()
    if not ret:
        # End of stream or read error: `frame` is None, so stop instead of
        # passing it on to the detector.
        break
    # OpenCV delivers BGR frames; run the detector on an RGB copy with a
    # leading batch axis and keep `frame` (BGR) for drawing and display.
    # NOTE(review): assumes the frozen graph expects RGB input - confirm.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_expanded = np.expand_dims(frame_rgb, axis=0)
    # Perform the actual detection by running the model with the image as input
    (boxes, scores, classes, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: frame_expanded})
    # Draw the results of the detection (aka 'visualize the results')
    vis_util.visualize_boxes_and_labels_on_image_array(
        frame,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8,
        min_score_thresh=0.90)
    final_score = np.squeeze(scores)
    count = 0
    cv2.line(frame, (1144, 568), (1723, 664), (0, 0, 255), 2)  # Line
    # Collect the confident boxes once per frame. The boolean mask already
    # selects every detection above the threshold, so no per-index loop is
    # needed.
    min_score_thresh = 0.90
    bboxes = boxes[scores > min_score_thresh]
    im_width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
    im_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
    final_box = []
    for ymin, xmin, ymax, xmax in bboxes:
        # Boxes are normalized: y values scale with the frame height and
        # x values with the frame width.
        print("Ymin:{}:Xmin:{}:Ymax:{}Xmax{}".format(
            ymin * im_height, xmin * im_width, ymax * im_height, xmax * im_width))
        final_box.append([xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height])
    # All the results have been drawn on the frame, so it's time to display it.
    cv2.imshow('Object detector', frame)
    # Press 'q' to quit
    if cv2.waitKey(1) == ord('q'):
        break
# Clean up
video.release()
cv2.destroyAllWindows()
# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
import argparse
import imutils
import time
import cv2
tracker = cv2.TrackerCSRT_create()
vs = cv2.VideoCapture("Video.mp4")
initBB = None
detec = []
def pega_centro(x, y, w, h):
x1 = int(w / 2)
y1 = int(h / 2)
cx = x + x1
cy = y + y1
return cx,cy
roi = 480
counter = 0
offset = 6
# loop over frames from the video stream
# loop over frames from the video stream
while vs.isOpened():
    ret, frame = vs.read()
    if not ret:
        # End of stream or read error: `frame` is None, nothing left to process
        break
    # counting line (blue while idle)
    cv2.line(frame, (769, roi), (1298, roi), (255, 0, 0), 3)
    # check to see if we are currently tracking an object
    if initBB is not None:
        # grab the new bounding box coordinates of the object
        (success, box) = tracker.update(frame)
        # check to see if the tracking was a success
        if success:
            (x, y, w, h) = [int(v) for v in box]
            cv2.rectangle(frame, (x, y), (x + w, y + h),
                          (0, 255, 0), 2)
            cX = int((x + x + w) / 2.0)
            cY = int((y + y + h) / 2.0)
            cv2.circle(frame, (cX, cY), 3, (0, 0, 255), -1)
            detec.append(pega_centro(x, y, w, h))
    # Count every stored centroid that lies within `offset` pixels of the
    # line. Build a new list of the uncounted ones instead of removing
    # entries while iterating, which would skip elements.
    pending = []
    for (x, y) in detec:
        if (roi - offset) < y < (roi + offset):
            counter += 1
            print(counter)
            # flash the line red when a centroid is counted
            cv2.line(frame, (769, roi), (1298, roi), (0, 0, 255), 3)
        else:
            pending.append((x, y))
    detec = pending
    # show the output frame
    cv2.imshow("Frame", frame)
    key = cv2.waitKey(1) & 0xFF
    if key == ord("s"):
        # select the bounding box of the object we want to track (make
        # sure you press ENTER or SPACE after selecting the ROI)
        initBB = cv2.selectROI("Frame", frame, fromCenter=False,
                               showCrosshair=True)
        # start OpenCV object tracker using the supplied bounding box
        # coordinates, then start the FPS throughput estimator as well
        tracker.init(frame, initBB)
        fps = FPS().start()
    # if the `q` key was pressed, break from the loop
    elif key == ord("q"):
        break
# Release the capture on every exit path; a `while ... else` would skip this
# when the loop is left through `break`.
vs.release()
cv2.destroyAllWindows()

Optimize performance of real-time object detection from camera with TensorFlow GPU and OpenCV

I am trying to recognize objects in real time using the TensorFlow Object Detection API and OpenCV, with the ssd_mobilenet_v1_coco_11_06_2017 model running on a GPU.
# What model to archieve.
MODEL_NAME = 'ssd_mobilenet_v1_coco_11_06_2017'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')
NUM_CLASSES = 90
# ## Loading label map
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
def detect_objects(image_np, sess, detection_graph):
    """Run one detection pass on a single image and collect drawing data.

    Args:
        image_np: Image array for one frame. A leading batch axis is added
            here because the model expects input shaped [1, None, None, 3].
        sess: Session used to execute the graph.
        detection_graph: Graph that provides the ``image_tensor`` input and
            the ``detection_*`` / ``num_detections`` output tensors.

    Returns:
        A dict with the keys ``rect_points``, ``class_names`` and
        ``class_colors`` as produced by ``draw_boxes_and_labels`` for
        detections scoring at least 0.5.
    """
    # Add the batch dimension the model input requires.
    batch = np.expand_dims(image_np, axis=0)
    input_tensor = detection_graph.get_tensor_by_name('image_tensor:0')

    # Output tensors, in order: box coordinates, confidence scores,
    # class ids, and the number of detections.
    output_names = (
        'detection_boxes:0',
        'detection_scores:0',
        'detection_classes:0',
        'num_detections:0',
    )
    fetches = [detection_graph.get_tensor_by_name(tensor_name)
               for tensor_name in output_names]

    # Actual detection.
    raw_boxes, raw_scores, raw_classes, _ = sess.run(
        fetches, feed_dict={input_tensor: batch})

    # Turn the raw outputs into rectangles, labels and colours for drawing.
    rect_points, class_names, class_colors = draw_boxes_and_labels(
        boxes=np.squeeze(raw_boxes),
        classes=np.squeeze(raw_classes).astype(np.int32),
        scores=np.squeeze(raw_scores),
        category_index=category_index,
        min_score_thresh=.5
    )
    return {
        'rect_points': rect_points,
        'class_names': class_names,
        'class_colors': class_colors,
    }
# Detection worker: loads the frozen model once, then serves frames forever.
def initiateModel(input_queue, output_queue):
    """Load the frozen TensorFlow model and run detection on queued frames.

    Intended to be used as a worker thread target. Frames are taken from
    ``input_queue``, converted from BGR to RGB, passed to ``detect_objects``
    and the resulting dict is put on ``output_queue``.

    Args:
        input_queue: Queue of BGR frames to process. Putting ``None`` on the
            queue asks the worker to stop (optional; callers that never send
            ``None`` get the original run-forever behaviour).
        output_queue: Queue that receives one ``detect_objects`` result per
            processed frame.
    """
    # Cap this process at 80% of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=.8)

    # Load the (frozen) TensorFlow model into its own graph.
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options,
                                  log_device_placement=True),
            graph=detection_graph)

    # The original placed sess.close() after an unconditional `while True:`,
    # so it could never run. try/finally releases the session both on the
    # stop sentinel and when processing a frame raises.
    try:
        while True:
            frame = input_queue.get()
            if frame is None:
                break
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            output_queue.put(detect_objects(frame_rgb, sess, detection_graph))
    finally:
        sess.close()
if __name__ == '__main__':
    # Capture loop: feed webcam frames to the detection worker and draw the
    # most recent detection result, if one is ready, on the current frame.
    #
    # NOTE(review): `args` (video_source/width/height) and `fps` are used
    # below but are not created in this excerpt; presumably an argparse
    # section and `fps = FPS().start()` precede this block - confirm.

    # Bounded input queue so the capture loop cannot run far ahead of the
    # detector; results are collected on an unbounded queue.
    input_queue = Queue(5)
    output_queue = Queue()
    for _ in range(1):
        t = Thread(target=initiateModel, args=(input_queue, output_queue))
        t.daemon = True
        t.start()

    video_capture = WebcamVideoStream(src=args.video_source,
                                      width=args.width,
                                      height=args.height).start()

    # Loop invariants: create the window and pick the font once.
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.namedWindow("Video", cv2.WINDOW_NORMAL)

    frameRate = 0
    start_time = time.time()
    while True:
        frame = video_capture.read()
        frame = cv2.resize(frame, (640, 480))
        height, width, channels = frame.shape
        input_queue.put(frame)

        if not output_queue.empty():
            # Fixed: the original read from the undefined name `output_q`.
            data = output_queue.get()
            rec_points = data['rect_points']
            class_names = data['class_names']
            class_colors = data['class_colors']
            for point, name, color in zip(rec_points, class_names, class_colors):
                # Box coordinates are fractions of the image size, so scale
                # them by the size of the frame being drawn on (the resized
                # frame), not by the requested capture size in `args`.
                left = int(point['xmin'] * width)
                top = int(point['ymin'] * height)
                right = int(point['xmax'] * width)
                bottom = int(point['ymax'] * height)
                # Object outline.
                cv2.rectangle(frame, (left, top), (right, bottom), color, 3)
                # Filled background strip for the label text.
                cv2.rectangle(frame, (left, top),
                              (left + len(name[0]) * 6, top - 10),
                              color, -1, cv2.LINE_AA)
                cv2.putText(frame, name[0], (left, top), font,
                            0.3, (0, 0, 0), 1)
            cv2.imshow('Video', frame)

        # Print and reset the frame counter roughly once per second.
        if (time.time() >= start_time+1):
            print ("Frame Rate : ", frameRate)
            start_time = time.time()
            frameRate = 0
        else:
            frameRate = frameRate + 1

        fps.update()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    fps.stop()
    video_capture.stop()
    cv2.destroyAllWindows()
I am getting around 17–18 FPS on a single GPU (NVIDIA 1070, 6 GB).
The GPU uses most of its dedicated memory, but overall GPU utilization is only around 30–32%.
How can I increase GPU utilization to improve performance?
I need to reach around 60 FPS with a custom object detector that only has to detect objects with a particular pattern.

Resources