Is it common for sift.compute to eliminate almost all keypoints generated by sift.detect? - opencv

I am trying to align multispectral drone images using OpenCV, and when I try to use a homography to align the images I get an error stating that I need at least 4 matching points. I went back, broke up the sift.detectAndCompute call into two separate lines, and printed the number of detected points after each. After sift.detect I had over 100,000 points; when I ran sift.compute, that number dropped to just two. Is there a way to make it less restrictive?
I've included my code below in case that helps.
import cv2
import numpy as np
import os
def final_align(file_paths):
    """Translate each image by the offset between its optical and geometric center.

    For every path the image is loaded, the offset between its calibrated
    optical center and the geometric center of the frame is computed, and
    the image is translated by that offset.

    Args:
        file_paths: Iterable of image paths. The 5th-from-last character of
            each path is used as the key ("1".."5") into the calibration
            table, e.g. the digit just before a 4-character extension such
            as ".tif".

    Returns:
        A list with one translated image per input path, in input order.

    Raises:
        KeyError: If the 5th-from-last character of a path is not a known id.
        OSError: If an image cannot be read.
    """
    # Calibrated optical centers as (x, y) pixel coordinates, keyed by image id.
    calibrated_optical_centers = {
        "1": (834.056702, 643.766418),
        "2": (836.952271, 631.696899),
        "3": (832.183411, 642.485901),
        "4": (795.311279, 680.615906),
        "5": (807.490295, 685.338379),
    }

    aligned_images = []
    for file_path in file_paths:
        # The 5th-from-last character of the path selects the calibration entry.
        image_id = file_path[-5]
        optical_x, optical_y = calibrated_optical_centers[image_id]

        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError("Could not read image: {}".format(file_path))

        height, width = image.shape[:2]
        center_x = width // 2
        center_y = height // 2

        # NOTE(review): the content is moved by (optical center - image
        # center); confirm this sign is the intended direction.
        shift_x = float(optical_x - center_x)
        shift_y = float(optical_y - center_y)

        # 2x3 affine matrix describing a pure translation.
        M = np.float32([[1, 0, shift_x], [0, 1, shift_y]])
        aligned_image = cv2.warpAffine(image, M, (width, height))
        aligned_images.append(aligned_image)
    return aligned_images
file_paths = [
# Call the final_align function
final_aligned_images = final_align(file_paths)
# Get the center of the first image
height, width = final_aligned_images[0].shape[:2]
center_y = height // 2
center_x = width // 2
# Specify the crop size in the y and x direction
crop_y = 1220
crop_x = 1520
#crop function
def crop_images(final_aligned_images, center_y, center_x, crop_y, crop_x):
    """Crop the same window, centered on (center_y, center_x), out of every image.

    The window spans ``crop // 2`` pixels on each side of the center plus
    the center pixel itself, so it is ``2 * (crop // 2) + 1`` pixels wide
    along each axis unless it is clipped by the image border.

    Args:
        final_aligned_images: Iterable of array-like images indexed as
            ``image[y, x]``.
        center_y: Row index of the window center.
        center_x: Column index of the window center.
        crop_y: Nominal window height in pixels.
        crop_x: Nominal window width in pixels.

    Returns:
        A list with one cropped view per input image, in input order.
    """
    # Clamp the start at 0: a negative slice start would wrap around to the
    # far edge of the image instead of clipping at the border.
    start_y = max(center_y - crop_y // 2, 0)
    end_y = center_y + crop_y // 2 + 1
    start_x = max(center_x - crop_x // 2, 0)
    end_x = center_x + crop_x // 2 + 1
    return [image[start_y:end_y, start_x:end_x] for image in final_aligned_images]
cropped_images = crop_images(final_aligned_images, center_y, center_x, crop_y, crop_x)
for i, final_complete_image in enumerate(cropped_images):
# Create the Results/aligned directory if it doesn't exist
os.makedirs("G:\Shared\Mulitband\Results\\aligned", exist_ok=True)
# Construct the file path for the aligned image
final_aligned_image_path = "G:\Shared\Mulitband\Results\\aligned\\aligned_{}.tif".format(i)
# Save the final aligned image to the file path
cv2.imwrite(final_aligned_image_path, final_complete_image)
img = cropped_images[1]
# Call the sift_align function
sift = cv2.xfeatures2d.SIFT_create()
kp = sift.detect(cropped_images[1], None)
img=cv2.drawKeypoints(cropped_images[1] ,
kp ,
cv2.imwrite('G:\Shared\Mulitband\Results\\aligned\image-with-keypoints.jpg', img)"""
#Create the SIFT Function
def sift_align(cropped_images):
    """Align every image to the first one using SIFT matches and a homography.

    Args:
        cropped_images: Sequence of images; the first is the reference and
            the remaining ones are warped into its frame.

    Returns:
        The aligned (non-reference) images stacked along a new last axis.

    Raises:
        ValueError: If an image yields no descriptors, fewer than 4 matches
            are found, or no homography can be estimated. ``np.stack`` also
            raises ValueError when there is nothing to align.
    """
    sift = cv2.SIFT_create()
    reference_image = cropped_images[0]

    # detectAndCompute (like compute) returns a (keypoints, descriptors)
    # pair. Binding that pair to one name and calling len() on it always
    # gives 2, which looks as if almost every keypoint had been discarded.
    reference_keypoints, reference_descriptors = sift.detectAndCompute(
        reference_image, None
    )
    print("Number of keypoints in reference image:", len(reference_keypoints))
    if reference_descriptors is None:
        raise ValueError("No SIFT descriptors found in the reference image")

    bf = cv2.BFMatcher()
    output_size = (reference_image.shape[1], reference_image.shape[0])
    aligned_images = []
    for image in cropped_images[1:]:
        image_keypoints, image_descriptors = sift.detectAndCompute(image, None)
        if image_descriptors is None:
            raise ValueError("No SIFT descriptors found in an image to align")

        # Query = reference, train = current image, so queryIdx indexes
        # reference_keypoints and trainIdx indexes image_keypoints.
        matches = bf.match(reference_descriptors, image_descriptors)
        best_matches = sorted(matches, key=lambda m: m.distance)[:50]
        if len(best_matches) < 4:
            raise ValueError(
                "Need at least 4 matches to estimate a homography, got {}".format(
                    len(best_matches)
                )
            )

        reference_pts = np.float32(
            [reference_keypoints[m.queryIdx].pt for m in best_matches]
        ).reshape(-1, 1, 2)
        image_pts = np.float32(
            [image_keypoints[m.trainIdx].pt for m in best_matches]
        ).reshape(-1, 1, 2)

        # warpPerspective expects a matrix mapping source (current image)
        # coordinates to destination (reference) coordinates, so the
        # homography is estimated from image points to reference points.
        homography, mask = cv2.findHomography(
            image_pts, reference_pts, cv2.RANSAC, 5.0
        )
        if homography is None:
            raise ValueError("Homography estimation failed")

        aligned_images.append(cv2.warpPerspective(image, homography, output_size))

    # Stack the aligned images along the last dimension.
    return np.stack(aligned_images, axis=-1)
final_complete_images = sift_align(cropped_images)
"""# Save the final aligned images to the Results/aligned directory
for i, final_complete_image in enumerate(final_complete_images):
# Create the Results/aligned directory if it doesn't exist
os.makedirs("G:\Shared\Mulitband\Results\\aligned", exist_ok=True)
# Construct the file path for the aligned image
final_aligned_image_path = "G:\Shared\Mulitband\Results\\aligned\\aligned_{}.tif".format(i)
# Save the final aligned image to the file path
cv2.imwrite(final_aligned_image_path, final_complete_image)"""


Why does my feature map seem incorrect when the class prediction is correct?

from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
transforms.Resize((224, 224)),
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
image_path = './data/test_images/anemone.jpg'
image ='RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)
# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
# Creating feature extractor (Detailed example here:
feature_extractor = create_feature_extractor(resnet50_model,
return_nodes=['layer4.2.conv3', 'fc'])
# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])
# Transforming last conv output to numpy and reshaping it so that the channels would be last
last_conv_output = torch.squeeze(out['layer4.2.conv3'])
last_conv_output = torch.reshape(last_conv_output, (7, 7, -1))
last_conv_output = last_conv_output.detach().numpy()
last_conv_output = last_conv_output.astype(np.uint8)
# Calculating the upscale factors for last conv output
width_factor = int(image.size[0] / last_conv_output.shape[0])
height_factor = int(image.size[1] / last_conv_output.shape[1])
# Getting the shapes of the last conv output
last_conv_w, last_conv_h, n_channels = last_conv_output.shape
# Calculate the upscaled height and width
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor
# Upscaling the last_conv_output so that it could be "masked" with original image
upsampled_last_conv_output = np.zeros((upscaled_h, upscaled_w, n_channels))
upsampled_last_conv_output = []
for x in range(0, n_channels, 512):
upsampled_last_conv_output.append(cv2.resize(last_conv_output[:, :, x:x+512], (upscaled_w, upscaled_h), cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_last_conv_output, axis=2)
# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]
# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, 2048))
heat_map =,
last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
# Plotting the results
fig, ax = plt.subplots()
ax.imshow(heat_map, cmap='jet', alpha=0.5)
I have followed the tutorial from here:
The main problem with this is that I get the feature map that looks like this:
As you see it looks like the model reacts to multiple areas on the image and no matter what image I use it always has the biggest reaction in the middle.
PS. If you think this question should be posted on the AI stack exchange please notify me
I found the error I made. After computing the heat map with
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the image, the generated heatmap was also normalized, so I needed to "denormalize" it back to its original values.

Why does Tesseract fail to recognize 6 out of 26 of my alphabetic keyboard keys even with several parameter tunings?

TL;DR I'm using:
adaptive thresholding
segmenting by keys (width/height ratio) - see green boxes in image result
psm 10 to treat each key as a character
but it fails to recognize some keys, falsely identifies others or identifies 2 for 1 char (see the L character in the image result, it's an L and P), etc.
Note: I cropped the image and re-ran the results to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc).
I just want it to recognize the alphabet keys. Ultimately I will want it to work for realtime video.
'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
I've tried scaling the image differently, scaling the individual key segments, using opening/closing/etc but it doesn't recognize all the keys.
original image
image result
Update: new results if I make the image straighter (bird's eye) and remove the whitelisting, it manages to detect all for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this and how could I make this adaptive enough for a dynamic video when it is so sensitive to these conditions?
import pytesseract
import numpy as np
from PIL import Image
except ImportError:
import Image
import cv2
from tqdm import tqdm
from collections import defaultdict
def get_missing_chars(dict):
    """Return the capital letters A-Z that are not contained in ``dict``.

    Args:
        dict: Any container supporting ``in`` (e.g. a dict keyed by detected
            characters). The name shadows the builtin but is kept so that
            existing callers are unaffected.

    Returns:
        The missing capital letters, in alphabetical order.
    """
    # Code points 65..90 are "A".."Z".
    return [letter for letter in map(chr, range(65, 91)) if letter not in dict]
def draw_box_and_char(img, contour_dims, c, box_col, text_col):
    """Return a copy of ``img`` with a rectangle and the label ``c`` drawn on it.

    Args:
        img: Image to annotate; it is copied, not modified.
        contour_dims: ``(x, y, w, h)`` of the rectangle to draw.
        c: Text to draw for this rectangle.
        box_col: Color of the rectangle outline.
        text_col: Color of the text.

    Returns:
        The annotated copy of ``img``.
    """
    x, y, w, h = contour_dims
    top_left = (x, y)
    bot_right = (x + w, y + h)
    font_offset = 3
    # NOTE(review): the horizontal text offset is derived from the box
    # height (h), not its width -- confirm this is intended.
    text_pos = (x + h // 2 + 12, y + h - font_offset)
    img_copy = img.copy()
    cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
    cv2.putText(
        img_copy,
        c,
        text_pos,
        cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=.5,
        color=text_col,
        thickness=1,
        lineType=cv2.LINE_AA,
    )
    return img_copy
def detect_keys(img):
# Downscales `img`, adaptive-thresholds it, finds external contours, runs
# Tesseract (psm 10, capital-letter whitelist) on each candidate segment,
# draws a box and label for every recognized letter, prints which capital
# letters were not found, and returns the annotated thresholded image.
# NOTE(review): this listing has lost its indentation and apparently some
# lines; the notes below mark the places that look incomplete.
scaling = .25
img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
print("img shape", img.shape)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Minimum height/width ratio and minimum contour area for a key candidate.
ratio_min = 0.7
area_min = 1000
# Neighbourhood size and constant passed to the adaptive threshold.
nbrhood_size = 1001
bias = 2
# adapt to different lighting
bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
cv2.THRESH_BINARY_INV, nbrhood_size, bias)
items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Pick the contour list whether findContours returned 2 or 3 values.
contours = items[0] if len(items) == 2 else items[1]
key_contours = []
for c in contours:
x, y, w, h = cv2.boundingRect(c)
ratio = h/w
area = cv2.contourArea(c)
# square-like ratio, try to get character
# NOTE(review): the body of this `if` appears to be missing -- nothing in the
# visible code ever adds to key_contours; presumably `key_contours.append(c)`
# belongs here. TODO confirm against the original post.
if ratio > ratio_min and area > area_min:
detected = defaultdict(int)
n_kept = 0
# Color copy of the binary image, used as the canvas for the final boxes.
img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
# Maps each recognized character to its stored 'conf' value and box.
let_to_contour = {}
n_contours = len(key_contours)
# offset to get smaller square within the key segment for easier char recognition
offset = 10
show_each_char = False
for _, c in tqdm(enumerate(key_contours), total=n_contours):
x, y, w, h = cv2.boundingRect(c)
# ratio and area are recomputed here but not used in the visible code.
ratio = h/w
area = cv2.contourArea(c)
base = np.zeros(bin_img.shape, dtype=np.uint8)
n_kept += 1
new_y = y+offset
new_x = x+offset
new_h = h-2*offset
new_w = w-2*offset
# Copy only the inner part of the key onto a blank canvas, then invert it.
base[new_y:new_y+new_h, new_x:new_x+new_w] = bin_img[new_y:new_y+new_h, new_x:new_x+new_w]
segment = cv2.bitwise_not(base)
# try scaling up individual keys
# scaling = 2
# segment = cv2.resize(segment, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_CUBIC)
# psm 10: treats the segment as a single character
custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
# NOTE(review): `conf` is the whole 'conf' entry of the result, presumably a
# list rather than a single number, and is compared with `>` below -- confirm
# that this comparison is what was intended.
conf = d['conf']
# The last 'text' entry is taken as the recognized string for this segment.
c = d['text'][-1]
if c:
# sometimes recognizes multiple keys even though there is only 1
for sub_c in c:
# save character and contour to draw on image and show bounds/detection
if sub_c not in let_to_contour or (sub_c in let_to_contour and conf > let_to_contour[sub_c]['conf']):
let_to_contour[sub_c] = {'conf': conf, 'cont': (new_x, new_y, new_w, new_h)}
# NOTE(review): the next two lines presumably belong to a lost `else:` branch
# (nothing recognized); as listed they would run unconditionally. TODO confirm.
c = "?"
text_col = (0, 0, 255)
if show_each_char:
contour_dims = (new_x, new_y, new_w, new_h)
box_col = (0, 255, 0)
text_col = (0, 0, 0)
segment_with_boxes = draw_box_and_char(segment, contour_dims, c, box_col, text_col)
# NOTE(review): no cv2.waitKey call follows this imshow in the visible code.
cv2.imshow('segment', segment_with_boxes)
# draw boxes around recognized keys
for c, data in let_to_contour.items():
box_col = (0, 255, 0)
text_col = (0, 0, 0)
img_copy = draw_box_and_char(img_copy, data['cont'], c, box_col, text_col)
# Rebuilds `detected` from the recognized characters (replacing the
# defaultdict created above) so that the missing letters can be listed.
detected = {k: 1 for k in let_to_contour}
for det in let_to_contour:
print(det, let_to_contour[det])
print("total detected: ", let_to_contour.keys())
missing = get_missing_chars(detected)
print(f"n_missing: {len(missing)}")
print(f"chars missing: {missing}")
return img_copy
if __name__ == "__main__":
img_file = "keyboard.jpg"
img = cv2.imread(img_file)
img_with_detected_keys = detect_keys(img)
cv2.imshow("detected", img_with_detected_keys)

error: (-215:Assertion failed) !ssize.empty() in function 'cv::resize' OpenCV

I have this old code that used to run fine in Python 2.7 a while ago. I just updated the code to run in Python 3.8, but when I try to execute it in Python 3.8 with OpenCV 3.4 I get a resize error and a warning (below)!
Here is the link to the two tif images that are required to run this code.
It's worth noting that both tif images are in the same folder as the Python code
import cv2
import matplotlib.pyplot as plt
import numpy as np
## Code for C_preferred Mask and C_images##
## There are three outputs to this code:
## Change the image name here
filename_image = '2.tif'
filename_mask = '1.tif'
## OpenCV verison Checking
#print 'OpenCV version used', cv2.__version__
filename = open("Output_C.txt","w")
filename.write("Processing Image : " + str(filename_image) + '\n\n')
## Function to sort the contours : Parameters that you can tune : tolerance_factor and size 0f the image.Here, I have used a fix size of
## (800,800)
def get_contour_precedence(contour, cols, tolerance_factor=10):
    """Return a sort key that orders contours row by row, then left to right.

    The top edge of the contour's bounding rectangle is rounded down to a
    multiple of ``tolerance_factor`` so that contours whose tops differ by
    only a few pixels fall into the same row band. The band is weighted by
    ``cols`` and the left edge is added to break ties within a band.

    Args:
        contour: Contour accepted by ``cv2.boundingRect``.
        cols: Weight applied to the row band; it should exceed every
            possible x coordinate so that bands never interleave.
        tolerance_factor: Height in pixels of one row band.

    Returns:
        A number usable as a ``key`` for sorting contours.
    """
    left, top = cv2.boundingRect(contour)[:2]
    row_band = (top // tolerance_factor) * tolerance_factor
    return row_band * cols + left
## Loading the colored mask, resizing it to (800,800) and converting it from RGB to HSV space, so that the color values are emphasized
p_mask_c = cv2.cvtColor(cv2.resize(cv2.imread(filename_mask),(800,800)),cv2.COLOR_RGB2HSV);
# Loading the original Image
b_image_1 = cv2.resize(cv2.imread(filename_image),(800,800));
# convert the target color to HSV, As our target mask portion to be considered is green. So I have chosen target color to be green
b = 0;
g = 255;
r = 0;
# Converting target color to HSV space
target_color = np.uint8([[[b, g, r]]])
target_color_hsv = cv2.cvtColor(target_color, cv2.COLOR_BGR2HSV)
# boundaries for Hue define the proper color boundaries, saturation and values can vary a lot
target_color_h = target_color_hsv[0,0,0]
tolerance = 20
lower_hsv = np.array([max(0, target_color_h - tolerance), 10, 10])
upper_hsv = np.array([min(179, target_color_h + tolerance), 250, 250])
# apply threshold on hsv image
mask = cv2.inRange(p_mask_c, lower_hsv, upper_hsv)
# Eroding the binary mask, such that every white portion (grids) are seperated from each other, to avoid overlapping and mixing of
# adjacent grids
b_mask = mask;
kernel = np.ones((5,5))
#kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3))
sharp = cv2.erode(b_mask,kernel, iterations=2)
# Finding all the grids (from binary image)
contours, hierarchy = cv2.findContours(sharp,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
print (' Number of contours', len(contours))
# Sorting contours
contours.sort(key=lambda x:get_contour_precedence(x, np.shape(b_mask)[0]))
#cv2.drawContours(b_image_1, contours, -1, (0,255,0), 1)
# Label variable for each grid/panel
label = 1;
b_image = b_image_1.copy();
temp =np.zeros(np.shape(b_image_1),np.uint8)
print (' size of temp',np.shape(temp), np.shape(b_image))
out_img = b_image_1.copy()
# Processing in each contour/label one by one
for cnt in contours:
cv2.drawContours(b_image_1,[cnt],0,(255,255,0), 1)
## Just to draw labels in the center of each grid
((x, y), r) = cv2.minEnclosingCircle(cnt)
x = int(x)
y = int(y)
r = int(r)
cv2.putText(b_image_1, "#{}".format(label), (int(x) - 10, int(y)),cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
cv2.drawContours(temp,[cnt],0,(255,255,255), -1)
#crop_img = np.bitwise_and(b_image,temp)
r = cv2.boundingRect(cnt)
crop_img = b_image[r[1]:r[1]+r[3], r[0]:r[0]+r[2]]
mean = cv2.mean(crop_img);
mean = np.array(mean).reshape(-1,1)
print (' Mean color', mean, np.shape(mean))
if mean[1] < 50:
cv2.putText(out_img, "M", (int(x) - 10, int(y)),cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 1)
filename.write("Block number #"+ str(label)+ ' is : ' + 'Magenta'+'\n');
cv2.putText(out_img, "G", (int(x) - 10, int(y)),cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 1)
filename.write("Block number #"+ str(label)+ ' is : ' +'Gray'+'\n');
label = label+1;
[ WARN:0] global C:\projects\opencv-python\opencv\modules\imgcodecs\src\grfmt_tiff.cpp (449) cv::TiffDecoder::readData OpenCV TIFF: TIFFRGBAImageOK: Sorry, can not handle images with IEEE floating-point samples
Traceback (most recent call last):
File "", line 32, in
p_mask_c = cv2.cvtColor(cv2.resize(cv2.imread(filename_mask),(800,800)),cv2.COLOR_RGB2HSV);
cv2.error: OpenCV(4.2.0) C:\projects\opencv-python\opencv\modules\imgproc\src\resize.cpp:4045: error: (-215:Assertion failed) !ssize.empty() in function 'cv::resize'
When you read in the image pass the cv::IMREAD_ANYDEPTH = 2 parameter as the second parameter in cv2.imread().
Changing your lines to
p_mask_c = cv2.cvtColor(cv2.resize(cv2.imread(filename_mask, 2),(800,800)),cv2.COLOR_RGB2HSV);
b_image_1 = cv2.resize(cv2.imread(filename_image, 2),(800,800));
removes the resize error you're seeing.
But you get another error when changing the color space, since your TIFF image apparently has only one channel, so cv2.COLOR_RGB2HSV won't work.
You could also use multiple flags like cv::IMREAD_COLOR = 1,
p_mask_c = cv2.cvtColor(cv2.resize(cv2.imread(filename_mask, 2 | 1),(800,800)),cv2.COLOR_BGR2HSV);
to read in a color image. But you get a different error. Perhaps you understand this image better than I do and can solve the problem from here on out.

How to count vehicles using opencv in python?

I am working on a VCS (vehicle counting system) project. The scope of the project is to classify and count vehicles. I have built a custom model using Faster-RCNN in the TensorFlow Object Detection API. This model contains only 7 classes, such as car, motorbike, bicycle, etc. The model works perfectly, but the problem is counting: it is very hard to count vehicles in a video frame. I did some preliminary research on the internet and tried a lot, but I could not find any useful information. There are some projects on GitHub, but they use tracking methods.
I want the following: I want to draw a horizontal line in the frame, and when a vehicle touches it, the count should be incremented. How can I do this? I don't know the algorithm behind it. I have heard that centroid tracking would help me.
My question is: how do I count vehicles when they touch the horizontal line? I have linked a sample image below.
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
VIDEO_NAME = 'Video_105.mp4'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'training','labelmap.pbtxt')
# Path to video
# Number of classes the object detector can identify
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Open video file
video = cv2.VideoCapture(PATH_TO_VIDEO)
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
ret, frame =
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) =
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visulaize the results')
# All the results have been drawn on the frame, so it's time to display it.
final_score = np.squeeze(scores)
count = 0
cv2.line(frame, (1144, 568), (1723,664), (0,0,255), 2) #Line
for i in range(100):
if scores is None or final_score[i] > 0.90:
min_score_thresh = 0.90
bboxes = boxes[scores > min_score_thresh]
im_width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
im_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
final_box = []
for box in bboxes:
ymin, xmin, ymax, xmax = box
final_box.append([xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height])
cv2.imshow('Object detector', frame)
# Press 'q' to quit
if cv2.waitKey(1) == ord('q'):
# Clean up
# import the necessary packages
from import VideoStream
from import FPS
import argparse
import imutils
import time
import cv2
tracker = cv2.TrackerCSRT_create()
vs = cv2.VideoCapture("Video.mp4")
initBB = None
detec = []
def pega_centro(x, y, w, h):
    """Return the center ``(cx, cy)`` of the box with top-left ``(x, y)`` and size ``(w, h)``.

    Half the width and half the height are truncated toward zero before
    being added to the top-left corner.
    """
    cx = x + int(w / 2)
    cy = y + int(h / 2)
    return cx, cy
roi = 480
counter = 0
offset = 6
# loop over frames from the video stream
while vs.isOpened():
ret,frame =
cv2.line(frame, (769 , roi), (1298 , roi), (255,0,0), 3)
# check to see if we are currently tracking an object
if initBB is not None:
# grab the new bounding box coordinates of the object
(success, box) = tracker.update(frame)
# check to see if the tracking was a success
if success:
(x, y, w, h) = [int(v) for v in box]
cv2.rectangle(frame, (x, y), (x + w, y + h),
(0, 255, 0), 2)
cX = int((x + x+w) / 2.0)
cY = int((y + y+h) / 2.0), (cX, cY), 3, (0, 0, 255), -1)
c=pega_centro(x, y, w, h)
for (x,y) in detec:
if y<(roi+offset) and y>(roi-offset):
cv2.line(frame, (769 , roi), (1298 , roi), (0,0,255), 3)
# show the output frame
cv2.imshow("Frame", frame)
key = cv2.waitKey(1) & 0xFF
if key == ord("s"):
# select the bounding box of the object we want to track (make
# sure you press ENTER or SPACE after selecting the ROI)
initBB = cv2.selectROI("Frame", frame, fromCenter=False,
# start OpenCV object tracker using the supplied bounding box
# coordinates, then start the FPS throughput estimator as well
tracker.init(frame, initBB)
fps = FPS().start()
# if the `q` key was pressed, break from the loop
elif key == ord("q"):

How to export bounding boxes as .jpg

for my project I want to save the Bounding Boxes found by the Object Detection API as .jpg for feeding in another CNN for further classification.
Here is my code (derived from EdjeElectronics GitHub):
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = '_model_ssd_v2'
IMAGE_NAME = 'image.jpg'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'_data','label_map.pbtxt')
# Path to image
PATH_TO_IMAGE = os.path.join(CWD_PATH,"_images",IMAGE_NAME)
# Number of classes the object detector can identify
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Load image using OpenCV and
# expand image dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
image = cv2.imread(PATH_TO_IMAGE)
image_expanded = np.expand_dims(image, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) =
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: image_expanded})
# Draw the results of the detection (aka 'visulaize the results')
# All the results have been drawn on image. Now display the image.
# cv2.imshow('Object detector', cv2.resize(image, (int(2592/2),int(1944/2))))
# # Press any key to close the image
# cv2.waitKey(0)
# # Clean up
# cv2.destroyAllWindows()
cv2.imwrite("C:/tensorflow/models/research/object_detection/_images/test1.jpg", image)
A similar question was asked here, but I don't know how to apply it with the TensorFlow Object Detection API.
Thank You!
I've found the function draw_bounding_boxes_on_image in the vis_util. Try this:
#create a white back ground image with the same shape as image
white_bg_img = 255*np.ones(image.shape, np.uint8)
white_bg_img ,
cv2.imwrite("bounding_boxes.jpg", white_bg_img )
To draw the image within the bounding boxes.
# Save the image region inside every detected bounding box as its own file.
# reshape(-1, 4) keeps one row per box even when only one box was returned.
boxes = np.asarray(boxes).reshape(-1, 4)
# Boxes are normalized [ymin, xmin, ymax, xmax]; scale them to pixel
# coordinates, because array slices require integer indices.
height, width = image.shape[:2]
for i, (ymin, xmin, ymax, xmax) in enumerate(boxes):
    ymin = int(ymin * height)
    xmin = int(xmin * width)
    ymax = int(ymax * height)
    xmax = int(xmax * width)
    roi = image[ymin:ymax, xmin:xmax].copy()
    if roi.size == 0:
        # Skip empty boxes; an empty image cannot be written.
        continue
    cv2.imwrite("box_{}.jpg".format(str(i)), roi)
Saved files will be named box_0.jpg, box_1.jpg, ...
I followed this link and it worked. Add the following code:
true_boxes = boxes[0][scores[0] > min_score_thresh]
for i in range(true_boxes.shape[0]):
ymin = int(true_boxes[i,0]*height)
xmin = int(true_boxes[i,1]*width)
ymax = int(true_boxes[i,2]*height)
xmax = int(true_boxes[i,3]*width)
roi = image[ymin:ymax,xmin:xmax].copy()
cv2.imwrite("box_{}.jpg".format(str(i)), roi)
Make sure you define true height and width of image.
this will work
enter code here
box = np.squeeze(boxes)
for i in range(len(boxes)):
ymin = (int(box[i,0]*height))
xmin = (int(box[i,1]*width))
ymax = (int(box[i,2]*height))
xmax = (int(box[i,3]*width))
roi =image[ymin:ymax,xmin:xmax].copy()
