darknet_images.py doesn't detect any objects (Darknet YOLOv4, OpenCV)
So I'm running the default darknet_images.py script (the one from AlexeyAB's GitHub repo) in PyCharm, and when I give it an image path, the image shows but the bounding boxes don't. I tried to solve the problem but couldn't find a solution. I found that the detections variable returned in the main() function is empty:
image, detections = image_detection(image_name, network, class_names, class_colors, args.thresh)
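A minimal standalone sanity check (a sketch only -- the .cfg/.data/.weights/image paths below are placeholders for your own files) is to call the same darknet functions directly with a very low threshold and print the raw detections:

import cv2
import darknet

# Sketch: bypass the script and query the network directly.
# All paths are placeholders -- substitute your own files.
network, class_names, class_colors = darknet.load_network(
    "./cfg/yolov4.cfg", "./cfg/coco.data", "yolov4.weights", batch_size=1)

width = darknet.network_width(network)
height = darknet.network_height(network)
darknet_image = darknet.make_image(width, height, 3)

image = cv2.imread("data/dog.jpg")  # placeholder image path
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_resized = cv2.resize(image_rgb, (width, height), interpolation=cv2.INTER_LINEAR)
darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())

# Deliberately low threshold: if this still prints an empty list, the issue
# is the weights/config pairing or the compiled library, not the threshold.
print(darknet.detect_image(network, class_names, darknet_image, thresh=0.05))
darknet.free_image(darknet_image)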
Just in case I am missing something, I'll paste the darknet.py and darknet_images.py code here.
darknet.py:
#!python3
"""
Python 3 wrapper for identifying objects in images
Requires DLL compilation
Both the GPU and no-GPU version should be compiled; the no-GPU version should be renamed "yolo_cpp_dll_nogpu.dll".
On a GPU system, you can force CPU evaluation by any of:
- Set global variable DARKNET_FORCE_CPU to True
- Set environment variable CUDA_VISIBLE_DEVICES to -1
- Set environment variable "FORCE_CPU" to "true"
- Set environment variable "DARKNET_PATH" to path darknet lib .so (for Linux)
Directly viewing or returning bounding-boxed images requires scikit-image to be installed (`pip install scikit-image`)
Original *nix 2.7: https://github.com/pjreddie/darknet/blob/0f110834f4e18b30d5f101bf8f1724c34b7b83db/python/darknet.py
Windows Python 2.7 version: https://github.com/AlexeyAB/darknet/blob/fc496d52bf22a0bb257300d3c79be9cd80e722cb/build/darknet/x64/darknet.py
#author: Philip Kahn
#date: 20180503
"""
from ctypes import *
import math
import random
import os
class BOX(Structure):
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]

class DETECTION(Structure):
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int),
                ("uc", POINTER(c_float)),
                ("points", c_int),
                ("embeddings", POINTER(c_float)),
                ("embedding_size", c_int),
                ("sim", c_float),
                ("track_id", c_int)]

class DETNUMPAIR(Structure):
    _fields_ = [("num", c_int),
                ("dets", POINTER(DETECTION))]

class IMAGE(Structure):
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]

class METADATA(Structure):
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]
def network_width(net):
    return lib.network_width(net)

def network_height(net):
    return lib.network_height(net)

def bbox2points(bbox):
    """
    From bounding box yolo format
    to corner points cv2 rectangle
    """
    x, y, w, h = bbox
    xmin = int(round(x - (w / 2)))
    xmax = int(round(x + (w / 2)))
    ymin = int(round(y - (h / 2)))
    ymax = int(round(y + (h / 2)))
    return xmin, ymin, xmax, ymax

def class_colors(names):
    """
    Create a dict with one random BGR color for each
    class name
    """
    return {name: (
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(0, 255)) for name in names}
def load_network(config_file, data_file, weights, batch_size=1):
    """
    load model description and weights from config files
    args:
        config_file (str): path to .cfg model file
        data_file (str): path to .data model file
        weights (str): path to weights
    returns:
        network: trained model
        class_names
        class_colors
    """
    network = load_net_custom(
        config_file.encode("ascii"),
        weights.encode("ascii"), 0, batch_size)
    metadata = load_meta(data_file.encode("ascii"))
    class_names = [metadata.names[i].decode("ascii") for i in range(metadata.classes)]
    colors = class_colors(class_names)
    return network, class_names, colors

def print_detections(detections, coordinates=False):
    print("\nObjects:")
    for label, confidence, bbox in detections:
        x, y, w, h = bbox
        if coordinates:
            print("{}: {}% (left_x: {:.0f} top_y: {:.0f} width: {:.0f} height: {:.0f})".format(label, confidence, x, y, w, h))
        else:
            print("{}: {}%".format(label, confidence))

def draw_boxes(detections, image, colors):
    import cv2
    for label, confidence, bbox in detections:
        left, top, right, bottom = bbox2points(bbox)
        cv2.rectangle(image, (left, top), (right, bottom), colors[label], 1)
        cv2.putText(image, "{} [{:.2f}]".format(label, float(confidence)),
                    (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    colors[label], 2)
    return image

def decode_detection(detections):
    decoded = []
    for label, confidence, bbox in detections:
        confidence = str(round(confidence * 100, 2))
        decoded.append((str(label), confidence, bbox))
    return decoded

def remove_negatives(detections, class_names, num):
    """
    Remove all classes with 0% confidence within the detection
    """
    predictions = []
    for j in range(num):
        for idx, name in enumerate(class_names):
            if detections[j].prob[idx] > 0:
                bbox = detections[j].bbox
                bbox = (bbox.x, bbox.y, bbox.w, bbox.h)
                predictions.append((name, detections[j].prob[idx], (bbox)))
    return predictions

def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45):
    """
    Returns a list with highest confidence class and their bbox
    """
    pnum = pointer(c_int(0))
    predict_image(network, image)
    detections = get_network_boxes(network, image.w, image.h,
                                   thresh, hier_thresh, None, 0, pnum, 0)
    num = pnum[0]
    if nms:
        do_nms_sort(detections, num, len(class_names), nms)
    predictions = remove_negatives(detections, class_names, num)
    predictions = decode_detection(predictions)
    free_detections(detections, num)
    return sorted(predictions, key=lambda x: x[1])
# lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
# lib = CDLL("libdarknet.so", RTLD_GLOBAL)
hasGPU = True
if os.name == "nt":
cwd = os.path.dirname(__file__)
os.environ['PATH'] = cwd + ';' + os.environ['PATH']
winGPUdll = os.path.join(cwd, "yolo_cpp_dll.dll")
winNoGPUdll = os.path.join(cwd, "yolo_cpp_dll_nogpu.dll")
envKeys = list()
for k, v in os.environ.items():
envKeys.append(k)
try:
try:
tmp = os.environ["FORCE_CPU"].lower()
if tmp in ["1", "true", "yes", "on"]:
raise ValueError("ForceCPU")
else:
print("Flag value {} not forcing CPU mode".format(tmp))
except KeyError:
# We never set the flag
if 'CUDA_VISIBLE_DEVICES' in envKeys:
if int(os.environ['CUDA_VISIBLE_DEVICES']) < 0:
raise ValueError("ForceCPU")
try:
global DARKNET_FORCE_CPU
if DARKNET_FORCE_CPU:
raise ValueError("ForceCPU")
except NameError as cpu_error:
print(cpu_error)
if not os.path.exists(winGPUdll):
raise ValueError("NoDLL")
lib = CDLL(winGPUdll, RTLD_GLOBAL)
except (KeyError, ValueError):
hasGPU = False
if os.path.exists(winNoGPUdll):
lib = CDLL(winNoGPUdll, RTLD_GLOBAL)
print("Notice: CPU-only mode")
else:
# Try the other way, in case no_gpu was compile but not renamed
lib = CDLL(winGPUdll, RTLD_GLOBAL)
print("Environment variables indicated a CPU run, but we didn't find {}. Trying a GPU run anyway.".format(winNoGPUdll))
else:
lib = CDLL(os.path.join(
os.environ.get('DARKNET_PATH', './'),
"libdarknet.so"), RTLD_GLOBAL)
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int
copy_image_from_bytes = lib.copy_image_from_bytes
copy_image_from_bytes.argtypes = [IMAGE,c_char_p]
predict = lib.network_predict_ptr
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)
if hasGPU:
    set_gpu = lib.cuda_set_device
    set_gpu.argtypes = [c_int]
init_cpu = lib.init_cpu
make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE
get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int), c_int]
get_network_boxes.restype = POINTER(DETECTION)
make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)
free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]
free_batch_detections = lib.free_batch_detections
free_batch_detections.argtypes = [POINTER(DETNUMPAIR), c_int]
free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]
network_predict = lib.network_predict_ptr
network_predict.argtypes = [c_void_p, POINTER(c_float)]
reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]
load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p
load_net_custom = lib.load_network_custom
load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int]
load_net_custom.restype = c_void_p
free_network_ptr = lib.free_network_ptr
free_network_ptr.argtypes = [c_void_p]
free_network_ptr.restype = c_void_p
do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
free_image = lib.free_image
free_image.argtypes = [IMAGE]
letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE
load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE
rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]
predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)
predict_image_letterbox = lib.network_predict_image_letterbox
predict_image_letterbox.argtypes = [c_void_p, IMAGE]
predict_image_letterbox.restype = POINTER(c_float)
network_predict_batch = lib.network_predict_batch
network_predict_batch.argtypes = [c_void_p, IMAGE, c_int, c_int, c_int,
                                  c_float, c_float, POINTER(c_int), c_int, c_int]
network_predict_batch.restype = POINTER(DETNUMPAIR)
darknet_images.py:
import argparse
import os
import glob
import random
import time

import cv2
import numpy as np

import darknet
def parser():
    parser = argparse.ArgumentParser(description="YOLO Object Detection")
    parser.add_argument("--input", type=str, default="",
                        help="image source. It can be a single image, a "
                             "txt file with paths to them, or a folder. Valid "
                             "image formats are jpg, jpeg or png. "
                             "If no input is given, image paths are requested interactively.")
    parser.add_argument("--batch_size", default=1, type=int,
                        help="number of images to be processed at the same time")
    parser.add_argument("--weights", default="yolov4.weights",
                        help="yolo weights path")
    parser.add_argument("--dont_show", action='store_true',
                        help="don't open the inference display window. For headless systems")
    parser.add_argument("--ext_output", action='store_true',
                        help="display bbox coordinates of detected objects")
    parser.add_argument("--save_labels", action='store_true',
                        help="save detections bbox for each image in yolo format")
    parser.add_argument("--config_file", default="./cfg/yolov4.cfg",
                        help="path to config file")
    parser.add_argument("--data_file", default="./cfg/coco.data",
                        help="path to data file")
    parser.add_argument("--thresh", type=float, default=.25,
                        help="remove detections with lower confidence")
    return parser.parse_args()
def check_arguments_errors(args):
    assert 0 < args.thresh < 1, "Threshold should be a float between zero and one (non-inclusive)"
    if not os.path.exists(args.config_file):
        raise(ValueError("Invalid config path {}".format(os.path.abspath(args.config_file))))
    if not os.path.exists(args.weights):
        raise(ValueError("Invalid weight path {}".format(os.path.abspath(args.weights))))
    if not os.path.exists(args.data_file):
        raise(ValueError("Invalid data file path {}".format(os.path.abspath(args.data_file))))
    if args.input and not os.path.exists(args.input):
        raise(ValueError("Invalid image path {}".format(os.path.abspath(args.input))))

def check_batch_shape(images, batch_size):
    """
    Image sizes should be the same width and height
    """
    shapes = [image.shape for image in images]
    if len(set(shapes)) > 1:
        raise ValueError("Images don't have same shape")
    if len(shapes) > batch_size:
        raise ValueError("Batch size higher than number of images")
    return shapes[0]
def load_images(images_path):
    """
    If image path is given, return it directly
    For txt file, read it and return each line as image path
    In other case, it's a folder, return a list with names of each
    jpg, jpeg and png file
    """
    input_path_extension = images_path.split('.')[-1]
    if input_path_extension in ['jpg', 'jpeg', 'png']:
        return [images_path]
    elif input_path_extension == "txt":
        with open(images_path, "r") as f:
            return f.read().splitlines()
    else:
        return glob.glob(
            os.path.join(images_path, "*.jpg")) + \
            glob.glob(os.path.join(images_path, "*.png")) + \
            glob.glob(os.path.join(images_path, "*.jpeg"))
def prepare_batch(images, network, channels=3):
    width = darknet.network_width(network)
    height = darknet.network_height(network)

    darknet_images = []
    for image in images:
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_resized = cv2.resize(image_rgb, (width, height),
                                   interpolation=cv2.INTER_LINEAR)
        custom_image = image_resized.transpose(2, 0, 1)
        darknet_images.append(custom_image)

    batch_array = np.concatenate(darknet_images, axis=0)
    batch_array = np.ascontiguousarray(batch_array.flat, dtype=np.float32)/255.0
    darknet_images = batch_array.ctypes.data_as(darknet.POINTER(darknet.c_float))
    return darknet.IMAGE(width, height, channels, darknet_images)
def image_detection(image_path, network, class_names, class_colors, thresh):
    # Darknet doesn't accept numpy images.
    # Create one with image we reuse for each detect
    width = darknet.network_width(network)
    height = darknet.network_height(network)
    darknet_image = darknet.make_image(width, height, 3)

    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (width, height),
                               interpolation=cv2.INTER_LINEAR)

    darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
    detections = darknet.detect_image(network, class_names, darknet_image, thresh=thresh)
    darknet.free_image(darknet_image)
    image = darknet.draw_boxes(detections, image_resized, class_colors)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB), detections
def batch_detection(network, images, class_names, class_colors,
                    thresh=0.25, hier_thresh=.5, nms=.45, batch_size=4):
    image_height, image_width, _ = check_batch_shape(images, batch_size)
    darknet_images = prepare_batch(images, network)
    batch_detections = darknet.network_predict_batch(network, darknet_images, batch_size, image_width,
                                                     image_height, thresh, hier_thresh, None, 0, 0)
    batch_predictions = []
    for idx in range(batch_size):
        num = batch_detections[idx].num
        detections = batch_detections[idx].dets
        if nms:
            darknet.do_nms_obj(detections, num, len(class_names), nms)
        predictions = darknet.remove_negatives(detections, class_names, num)
        images[idx] = darknet.draw_boxes(predictions, images[idx], class_colors)
        batch_predictions.append(predictions)
    darknet.free_batch_detections(batch_detections, batch_size)
    return images, batch_predictions
def image_classification(image, network, class_names):
    width = darknet.network_width(network)
    height = darknet.network_height(network)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (width, height),
                               interpolation=cv2.INTER_LINEAR)
    darknet_image = darknet.make_image(width, height, 3)
    darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
    detections = darknet.predict_image(network, darknet_image)
    predictions = [(name, detections[idx]) for idx, name in enumerate(class_names)]
    darknet.free_image(darknet_image)
    return sorted(predictions, key=lambda x: -x[1])
def convert2relative(image, bbox):
    """
    YOLO format uses relative coordinates for annotations
    """
    x, y, w, h = bbox
    height, width, _ = image.shape
    return x/width, y/height, w/width, h/height

def save_annotations(name, image, detections, class_names):
    """
    Files saved with image_name.txt and relative coordinates
    """
    file_name = os.path.splitext(name)[0] + ".txt"
    with open(file_name, "w") as f:
        for label, confidence, bbox in detections:
            x, y, w, h = convert2relative(image, bbox)
            label = class_names.index(label)
            f.write("{} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}\n".format(label, x, y, w, h, float(confidence)))
def batch_detection_example():
    args = parser()
    check_arguments_errors(args)
    batch_size = 3
    random.seed(3)  # deterministic bbox colors
    network, class_names, class_colors = darknet.load_network(
        args.config_file,
        args.data_file,
        args.weights,
        batch_size=batch_size
    )
    image_names = ['data/horses.jpg', 'data/horses.jpg', 'data/eagle.jpg']
    images = [cv2.imread(image) for image in image_names]
    images, detections = batch_detection(network, images, class_names,
                                         class_colors, batch_size=batch_size)
    for name, image in zip(image_names, images):
        cv2.imwrite(name.replace("data/", ""), image)
    print(detections)
def main():
    args = parser()
    check_arguments_errors(args)

    random.seed(3)  # deterministic bbox colors
    network, class_names, class_colors = darknet.load_network(
        args.config_file,
        args.data_file,
        args.weights,
        batch_size=args.batch_size
    )

    images = load_images(args.input)

    index = 0
    while True:
        # loop asking for new image paths if no list is given
        if args.input:
            if index >= len(images):
                break
            image_name = images[index]
        else:
            image_name = input("Enter Image Path: ")
        prev_time = time.time()
        image, detections = image_detection(
            image_name, network, class_names, class_colors, args.thresh
        )
        if args.save_labels:
            save_annotations(image_name, image, detections, class_names)
        darknet.print_detections(detections, args.ext_output)
        fps = int(1/(time.time() - prev_time))
        print("FPS: {}".format(fps))
        if not args.dont_show:
            cv2.imshow('Inference', image)
            if cv2.waitKey() & 0xFF == ord('q'):
                break
        index += 1
if __name__ == "__main__":
# unconmment next line for an example of batch processing
# batch_detection_example()
main()
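For reference, a typical invocation of this script with the defaults above (file locations assumed to match a standard darknet checkout) looks like:

python darknet_images.py --input data/dog.jpg --weights yolov4.weights --config_file ./cfg/yolov4.cfg --data_file ./cfg/coco.data --thresh 0.25 --ext_output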
Related
Why does Tesseract fail to recognize 6 out of 26 of my alphabetic keyboard keys even with several parameter tunings?
TL;DR I'm using:
- adaptive thresholding
- segmenting by keys (width/height ratio) - see green boxes in the image result
- psm 10 to treat each key as a character

but it fails to recognize some keys, falsely identifies others, or identifies 2 characters for 1 (see the L character in the image result: it reads both an L and a P), etc.

Note: I cropped the image and re-ran the results to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc.). I just want it to recognize the alphabet keys. Ultimately I will want it to work for realtime video.

config: '-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'

I've tried scaling the image differently, scaling the individual key segments, and using opening/closing/etc., but it doesn't recognize all the keys.

[original image and image result omitted]

Update: new results. If I make the image straighter (bird's eye) and remove the whitelisting, it manages to detect all of them for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this, and how could I make this adaptive enough for dynamic video when it is so sensitive to these conditions?

Code:

import pytesseract
import numpy as np
try:
    from PIL import Image
except ImportError:
    import Image
import cv2
from tqdm import tqdm
from collections import defaultdict

def get_missing_chars(dict):
    capital_alphabet = [chr(ascii) for ascii in range(65, 91)]
    return [let for let in capital_alphabet if let not in dict]

def draw_box_and_char(img, contour_dims, c, box_col, text_col):
    x, y, w, h = contour_dims
    top_left = (x, y)
    bot_right = (x + w, y+h)
    font_offset = 3
    text_pos = (x+h//2+12, y+h-font_offset)
    img_copy = img.copy()
    cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
    cv2.putText(img_copy, c, text_pos, cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=.5, color=text_col, thickness=1, lineType=cv2.LINE_AA)
    return img_copy

def detect_keys(img):
    scaling = .25
    img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
    print("img shape", img.shape)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ratio_min = 0.7
    area_min = 1000
    nbrhood_size = 1001
    bias = 2
    # adapt to different lighting
    bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, nbrhood_size, bias)
    items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = items[0] if len(items) == 2 else items[1]
    key_contours = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        ratio = h/w
        area = cv2.contourArea(c)
        # square-like ratio, try to get character
        if ratio > ratio_min and area > area_min:
            key_contours.append(c)
    detected = defaultdict(int)
    n_kept = 0
    img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
    let_to_contour = {}
    n_contours = len(key_contours)
    # offset to get smaller square within the key segment for easier char recognition
    offset = 10
    show_each_char = False
    for _, c in tqdm(enumerate(key_contours), total=n_contours):
        x, y, w, h = cv2.boundingRect(c)
        ratio = h/w
        area = cv2.contourArea(c)
        base = np.zeros(bin_img.shape, dtype=np.uint8)
        base.fill(255)
        n_kept += 1
        new_y = y+offset
        new_x = x+offset
        new_h = h-2*offset
        new_w = w-2*offset
        base[new_y:new_y+new_h, new_x:new_x+new_w] = bin_img[new_y:new_y+new_h, new_x:new_x+new_w]
        segment = cv2.bitwise_not(base)
        # try scaling up individual keys
        # scaling = 2
        # segment = cv2.resize(segment, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_CUBIC)
        # psm 10: treats the segment as a single character
        custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
        d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
        conf = d['conf']
        c = d['text'][-1]
        if c:
            # sometimes recognizes multiple keys even though there is only 1
            for sub_c in c:
                # save character and contour to draw on image and show bounds/detection
                if sub_c not in let_to_contour or (sub_c in let_to_contour and conf > let_to_contour[sub_c]['conf']):
                    let_to_contour[sub_c] = {'conf': conf, 'cont': (new_x, new_y, new_w, new_h)}
        else:
            c = "?"
            text_col = (0, 0, 255)
        if show_each_char:
            contour_dims = (new_x, new_y, new_w, new_h)
            box_col = (0, 255, 0)
            text_col = (0, 0, 0)
            segment_with_boxes = draw_box_and_char(segment, contour_dims, c, box_col, text_col)
            cv2.imshow('segment', segment_with_boxes)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
    # draw boxes around recognized keys
    for c, data in let_to_contour.items():
        box_col = (0, 255, 0)
        text_col = (0, 0, 0)
        img_copy = draw_box_and_char(img_copy, data['cont'], c, box_col, text_col)
    detected = {k: 1 for k in let_to_contour}
    for det in let_to_contour:
        print(det, let_to_contour[det])
    print("total detected: ", let_to_contour.keys())
    missing = get_missing_chars(detected)
    print(f"n_missing: {len(missing)}")
    print(f"chars missing: {missing}")
    return img_copy

if __name__ == "__main__":
    img_file = "keyboard.jpg"
    img = cv2.imread(img_file)
    img_with_detected_keys = detect_keys(img)
    cv2.imshow("detected", img_with_detected_keys)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>
I am trying to do an object detection problem and have been working with the aquarium dataset from Roboflow. I have been trying to create bounding boxes for the fishes, but I keep getting the error:

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>

I also tried to see which images are corrupted and ran this code:

import PIL
from pathlib import Path
from PIL import UnidentifiedImageError

count = 0
path = Path("/content/drive/MyDrive/archive/Aquarium Combined").rglob("*.jpg")
for img_p in path:
    try:
        img = PIL.Image.open(img_p)
    except PIL.UnidentifiedImageError:
        print(img_p)
        count += 1
print(count)

It has given me a count of 651 images, but my dataset has 662 images. I guess PIL doesn't know how to decode them, or I don't know what the problem is. A sample image file name:

/content/drive/MyDrive/archive/Aquarium Combined/test/IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg

Full traceback:

UnidentifiedImageError                    Traceback (most recent call last)
<ipython-input-31-2785d562a97e> in <module>()
      4     sample[1]['boxes'][:, [1, 0, 3, 2]],
      5     [classes[i] for i in sample[1]['labels']],
----> 6     width=4).permute(1, 2, 0)
      7 )

3 frames
/usr/local/lib/python3.7/dist-packages/PIL/Image.py in open(fp, mode)
   2894     if mode == "P":
   2895         from . import ImagePalette
-> 2896
   2897         im.palette = ImagePalette.ImagePalette("RGB", im.im.getpalette("RGB"))
   2898         im.readonly = 1

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>

Also I am providing the class definition:

class AquariumDetection(datasets.VisionDataset):
    def __init__(
        self,
        root: str,
        split="train",
        transform=None,
        target_transform=None,
        transforms=None,
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        self.split = split
        self.coco = COCO(os.path.join(root, split, "_annotations.coco.json"))
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.ids = [id for id in self.ids if (len(self._load_target(id)) > 0)]

    def _load_image(self, id: int) -> Image.Image:
        path = self.coco.loadImgs(id)[0]["file_name"]
        image = cv2.imread(os.path.join(self.root, self.split, path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return image

    def _load_target(self, id: int):
        return self.coco.loadAnns(self.coco.getAnnIds(id))

    def __getitem__(self, index: int):
        id = self.ids[index]
        image = self._load_image(id)
        target = copy.deepcopy(self._load_target(id))

        boxes = [t['bbox'] + [t['category_id']] for t in target]
        if self.transforms is not None:
            transformed = self.transforms(image=image, bboxes=boxes)
            image = transformed['image']
            boxes = transformed['bboxes']

        new_boxes = []
        for box in boxes:
            xmin = box[0]
            ymin = box[1]
            xmax = xmin + box[2]
            ymax = ymin + box[3]
            new_boxes.append([ymin, xmin, ymax, xmax])
        boxes = torch.tensor(new_boxes, dtype=torch.float32)

        _, h, w = image.shape
        targ = {}
        targ["boxes"] = boxes
        targ["labels"] = torch.tensor([t["category_id"] for t in target], dtype=torch.int64)
        targ["image_id"] = torch.tensor([t["image_id"] for t in target])
        targ["area"] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        targ["iscrowd"] = torch.tensor([t["iscrowd"] for t in target], dtype=torch.int64)
        targ["img_scale"] = torch.tensor([1.0])
        targ['img_size'] = (h, w)
        image = image.div(255)
        normalize = T.Compose([T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        return normalize(image), targ, index

    def __len__(self) -> int:
        return len(self.ids)
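Since PIL.Image.open is lazy (it only reads the header), a file can pass open() and still fail later when the pixels are decoded. A slightly stricter scan (a sketch; the path is the one from the question, and verify() is standard Pillow) catches truncated files too:

import os
from pathlib import Path
from PIL import Image, UnidentifiedImageError

# Sketch: verify() forces Pillow to actually parse the file, so
# truncated/corrupt images that survive a lazy open() are caught as well.
bad = []
for img_p in Path("/content/drive/MyDrive/archive/Aquarium Combined").rglob("*"):
    if img_p.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
        continue  # the 651 vs 662 gap may simply be files with other extensions
    try:
        with Image.open(img_p) as img:
            img.verify()
    except (UnidentifiedImageError, OSError):
        bad.append(img_p)
        print(img_p, os.path.getsize(img_p), "bytes")  # 0-byte files are a common culprit
print(len(bad), "unreadable files")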
Struggling with Normalization values when training model
I am trying to train an adversarial patch located at the bottom left corner of the image to cause a misclassification. Currently, I am using these parameters to normalize the CIFAR10 dataset:

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.4914,0.4822,0.4465),(0.2023,0.1994,0.201))])

This results in the images having maximum and minimum values of around 2.55 and -2.55 respectively. However, I'm not sure how to work with this range when training my patch. I struggle with converting the patch from a range of (0,1) to (-2.55,2.55). Any help is appreciated!

My code for training is below (I don't think it's training properly for now):

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import matplotlib.pyplot as plt
import numpy as np
from torch.autograd import Variable
import torchattacks
import random

dictionary = {
    '0': 'airplane',
    '1': 'automobile',
    '2': 'bird',
    '3': 'cat',
    '4': 'deer',
    '5': 'dog',
    '6': 'frog',
    '7': 'horse',
    '8': 'ship',
    '9': 'truck',
}

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.4914,0.4822,0.4465),(0.2023,0.1994,0.201))])
#transform1 = transforms.Compose([transforms.ToTensor()])
normalize = transforms.Normalize((0.4914,0.4822,0.4465),(0.2023,0.1994,0.201))
mean = (0.4914,0.4822,0.4465)
std = (0.2023,0.1994,0.201)
inv_normalize = transforms.Normalize(
    mean=[-0.4914/0.2023, -0.4822/0.1994, -0.4465/0.201],
    std=[1/0.2023, 1/0.1994, 1/0.201])

batch_size = 1
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=True, num_workers=2)

model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_resnet20", pretrained=True)
model = model.cuda()

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

patch = np.random.rand(3,32,32)
model.eval()

def mask_generation(mask_type='rectangle', patch=patch, image_size=(3, 7, 7)):
    applied_patch = np.zeros(image_size)  # 0,1
    #patch = torch.tensor(patch)
    #padding = (3,3,3,3)
    #patch = F.pad(patch, padding)
    if mask_type == 'rectangle':
        rotation_angle = 0
        for i in range(patch.shape[0]):
            patch[i] = np.rot90(patch[i], rotation_angle)
        x_location, y_location = 25, 0
        for i in range(patch.shape[0]):
            applied_patch[:, x_location:x_location + patch.shape[1], y_location:y_location + patch.shape[2]] = patch
    mask = applied_patch.copy()
    mask[mask != 0] = 1.0
    return patch, applied_patch, mask, x_location, y_location, rotation_angle

def patch_attack(image, applied_patch, mask, target, probability_threshold, model, lr, max_iteration):
    applied_patch = torch.from_numpy(applied_patch)
    mask = torch.from_numpy(mask)
    image = inv_normalize(image)
    target_probability, count = 0, 0
    perturbated_image = torch.mul(mask.type(torch.FloatTensor), applied_patch.type(torch.FloatTensor)) + torch.mul((1 - mask.type(torch.FloatTensor)), image.type(torch.FloatTensor))
    perturbated_image = normalize(perturbated_image)
    while target_probability < probability_threshold and count < max_iteration:
        count += 1
        # Optimize the patch
        perturbated_image = Variable(perturbated_image.data, requires_grad=True)
        per_image = perturbated_image.cuda()
        output = model(per_image)
        target_log_softmax = torch.nn.functional.log_softmax(output, dim=1)[0][target]
        target_log_softmax.backward()
        patch_grad = perturbated_image.grad.clone().cpu()
        applied_patch = (lr * patch_grad) + applied_patch.type(torch.FloatTensor)
        applied_patch = torch.clamp(applied_patch, 0, 1)
        perturbated_image.grad.data.zero_()
        # Test the patch
        perturbated_image = torch.mul(mask.type(torch.FloatTensor), applied_patch.type(torch.FloatTensor)) + torch.mul((1-mask.type(torch.FloatTensor)), image.type(torch.FloatTensor))
        perturbated_image = normalize(perturbated_image)
        perturbated_image = perturbated_image.cuda()
        output = model(perturbated_image)
        target_probability = torch.nn.functional.softmax(output, dim=1).data[0][target]
    perturbated_image = perturbated_image.detach().cpu().numpy()
    applied_patch = applied_patch.cpu().numpy()
    return perturbated_image, applied_patch

def test_patch(patch_type, target, patch, test_loader, model):
    test_total, test_actual_total, test_success = 0, 0, 0
    for (image, label) in test_loader:
        test_total += label.shape[0]
        assert image.shape[0] == 1, 'Only one picture should be loaded each time.'
        image = image.cuda()  # -3,3
        label = label.cuda()
        output = model(image)
        _, predicted = torch.max(output.data, 1)
        if predicted[0] != label and predicted[0].data.cpu().numpy() != target:
            test_actual_total += 1
            patch, applied_patch, mask, x_location, y_location, rotation_angle = mask_generation('rectangle', patch, (3, 32, 32))
            applied_patch = torch.from_numpy(applied_patch)
            mask = torch.from_numpy(mask)
            mask = normalize(mask)
            applied_patch = normalize(applied_patch)
            perturbated_image = torch.mul(mask.type(torch.FloatTensor), applied_patch.type(torch.FloatTensor)) + torch.mul((1 - mask.type(torch.FloatTensor)), image.type(torch.FloatTensor))
            perturbated_image = perturbated_image.cuda()  # -3,3
            output = model(perturbated_image)
            _, predicted = torch.max(output.data, 1)
            if predicted[0].data.cpu().numpy() == target:
                test_success += 1
    return test_success / test_actual_total

# training parameters
epochs = 1
target = 0
probability_threshold = 0.99
lr = 1/255
max_iteration = 1
runs = 0

for epoch in range(epochs):
    train_total, train_actual_total, train_success = 0, 0, 0
    for (image, label) in trainloader:
        runs += 1
        assert image.shape[0] == 1
        image = image.cuda()
        label = label.cuda()
        train_total += label.shape[0]
        output = model(image)
        _, predicted = torch.max(output.data, 1)
        if predicted[0] != label or predicted[0].data.cpu().numpy() != target:
            train_actual_total += 1
            patch, applied_patch, mask, x_location, y_location, rotation_angle = mask_generation('rectangle', patch, (3, 32, 32))
            perturbated_image, applied_patch = patch_attack(image, applied_patch, mask, target, probability_threshold, model, lr, max_iteration)
            perturbated_image = torch.from_numpy(perturbated_image).cuda()
            output = model(perturbated_image)
            _, predicted = torch.max(output.data, 1)
            if predicted[0].data.cpu().numpy() == target:
                train_success += 1
            patch = applied_patch[0][:, x_location:x_location + patch.shape[1], y_location:y_location + patch.shape[2]]
            patch = np.array(patch)
To convert a number x in the range [0,1] to the range [-2.55,2.55]:
- Multiply by (size of final range) / (size of original range), in this case 5.1/1.0.
- Add (min of final range) - (scaled min of starting range) to the result, in this case -2.55 - 0 = -2.55.

So overall: y = 5.1*x - 2.55.
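As a quick sketch of the arithmetic (plain Python; the function name is just for illustration):

def to_model_range(x, old_min=0.0, old_max=1.0, new_min=-2.55, new_max=2.55):
    # scale by the ratio of range sizes, then shift so old_min lands on new_min
    scale = (new_max - new_min) / (old_max - old_min)
    return x * scale + (new_min - old_min * scale)

print(to_model_range(0.0), to_model_range(0.5), to_model_range(1.0))  # -2.55 0.0 2.55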
How to use multiple GPUs effectively when training deep networks?
I am using a machine with 2 Titan Black GPUs to train my deep learning model, which has 3 convolutional layers (3x3, 3x3 and 5x5). Training runs pretty well, but when I watch nvidia-smi (refreshing every 1 s), I see that my program uses only one GPU for computation; the second one stays at 0% even when the first one reaches 100%.

I tried to use tf.device to assign specific tasks to each of them, but then they run one-by-one, not in parallel, and the total time even increased rather than decreased (I guess because the 2 GPUs had to exchange values with each other).

Below is my program. It is quite messy; maybe you just need to pay attention to the graph where I use tf.device. Thank you so much!

import tensorflow as tf
import numpy as np
from six.moves import cPickle as pickle
import matplotlib.pyplot as plt
from os import listdir, sys
from os.path import isfile, join
from time import gmtime, strftime
import time

def validatePath(path):
    path = path.replace("\\", "/")
    if (path[len(path)-1] != "/"):
        path = path + "/"
    return path

hidden_size_default = np.array([16, 32, 64, 32])
cnn1_default = 3
cnn2_default = 3
cnn3_default = 5
SIZE_BATCH_VALID = 200
input_path = 'ARCHIVES-sub-dataset'
output_path = 'ARCHIVES-model'
log_address = "trainlog.txt"

tf.app.flags.DEFINE_integer('h0', hidden_size_default[0], 'Size of hidden layer 0th')
tf.app.flags.DEFINE_integer('h1', hidden_size_default[1], 'Size of hidden layer 1st')
tf.app.flags.DEFINE_integer('h2', hidden_size_default[2], 'Size of hidden layer 2nd')
tf.app.flags.DEFINE_integer('h3', hidden_size_default[3], 'Size of hidden layer 3rd')
tf.app.flags.DEFINE_integer('k1', cnn1_default, 'Size of kernel 1st')
tf.app.flags.DEFINE_integer('k2', cnn2_default, 'Size of kernel 2nd')
tf.app.flags.DEFINE_integer('k3', cnn3_default, 'Size of kernel 3rd')
tf.app.flags.DEFINE_string('input_path', input_path, 'The parent directory which contains 2 directories: dataset and label')
tf.app.flags.DEFINE_string('output_path', output_path, 'The directory which will store models (you have to create)')
tf.app.flags.DEFINE_string('log_address', log_address, 'The file name which will store the log')

FLAGS = tf.app.flags.FLAGS
load_path = FLAGS.input_path
save_model_path = FLAGS.output_path
log_addr = FLAGS.log_address
load_path = validatePath(load_path)
save_model_path = validatePath(save_model_path)
cnn1 = FLAGS.k1
cnn2 = FLAGS.k2
cnn3 = FLAGS.k3
hidden_size = np.array([FLAGS.h0, FLAGS.h1, FLAGS.h2, FLAGS.h3])

# Shuffle the dataset and its label
def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation,:]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

def writemyfile(mystring):
    with open(log_addr, "a") as myfile:
        myfile.write(str(mystring + "\n"))

num_labels = 5

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def DivideSets(input_set):
    length_set = input_set.shape[0]
    index_70 = int(length_set*0.7)
    index_90 = int(length_set*0.9)
    set_train = input_set[0:index_70]
    set_valid = input_set[index_70:index_90]
    set_test = input_set[index_90:length_set]
    return np.float32(set_train), np.float32(set_valid), np.float32(set_test)

# from 1-value labels to 5 values of (0 and 1)
def LabelReconstruct(label_set):
    label_set = label_set.astype(int)
    new_label_set = np.zeros(shape=(len(label_set), num_labels))
    for i in range(len(label_set)):
        new_label_set[i][label_set[i]] = 1
    return new_label_set.astype(int)

def LoadDataSet(load_path):
    list_data = [f for f in listdir(load_path + "dataset/") if isfile(join(load_path + "dataset/", f))]
    list_label = [f for f in listdir(load_path + "label/") if isfile(join(load_path + "dataset/", f))]
    if list_data.sort() == list_label.sort():
        return list_data
    else:
        print("data and labels are not suitable")
        return 0

# load, randomize, normalize images and reconstruct labels
def PrepareData(*arg):
    filename = arg[0]
    loaded_dataset = pickle.load(open(load_path + "dataset/" + filename, "rb"))
    loaded_labels = pickle.load(open(load_path + "label/" + filename, "rb"))
    if len(arg) == 1:
        datasize = len(loaded_labels)
    elif len(arg) == 2:
        datasize = int(arg[1])
    else:
        print("not more than 2 arguments please!")
    dataset_full, labels_full = randomize(loaded_dataset[0:datasize], loaded_labels[0:datasize])
    return NormalizeData(dataset_full), LabelReconstruct(labels_full)

def NormalizeData(dataset):
    dataset = dataset - (dataset.mean())
    dataset = dataset / (dataset.std())
    return dataset

### LOAD DATA
listfiles = LoadDataSet(load_path)
# divide
listfiles_train = listfiles[0:15]
listfiles_valid = listfiles[15:25]
listfiles_test = listfiles[25:len(listfiles)]

graphCNN = tf.Graph()
with graphCNN.as_default():
    with tf.device('/gpu:0'):
        x = tf.placeholder(tf.float32, shape=(None, 224,224,3))  # X
        y_ = tf.placeholder(tf.float32, shape=(None, num_labels))  # Y_
        dropout = tf.placeholder(tf.float32)
        if dropout == 1.0:
            keep_prob = tf.constant([0.2, 0.3, 0.5], dtype=tf.float32)
        else:
            keep_prob = tf.constant([1.0, 1.0, 1.0], dtype=tf.float32)

        weights_1 = weight_variable([cnn1,cnn1,3, hidden_size[0]])
        biases_1 = bias_variable([hidden_size[0]])
        weights_2 = weight_variable([cnn2,cnn2,hidden_size[0], hidden_size[1]])
        biases_2 = bias_variable([hidden_size[1]])
        weights_3 = weight_variable([cnn3,cnn3,hidden_size[1], hidden_size[2]])
        biases_3 = bias_variable([hidden_size[2]])
        weights_4 = weight_variable([56 * 56 * hidden_size[2], hidden_size[3]])
        biases_4 = bias_variable([hidden_size[3]])
        weights_5 = weight_variable([hidden_size[3], num_labels])
        biases_5 = bias_variable([num_labels])

        def model(data):
            with tf.device('/gpu:1'):
                train_hidden_1 = tf.nn.relu(conv2d(data, weights_1) + biases_1)
                train_hidden_2 = max_pool_2x2(tf.nn.relu(conv2d(train_hidden_1, weights_2) + biases_2))
                train_hidden_2_drop = tf.nn.dropout(train_hidden_2, keep_prob[0])
                train_hidden_3 = max_pool_2x2(tf.nn.relu(conv2d(train_hidden_2_drop, weights_3) + biases_3))
                train_hidden_3_drop = tf.nn.dropout(train_hidden_3, keep_prob[1])
                train_hidden_3_drop = tf.reshape(train_hidden_3_drop, [-1, 56 * 56 * hidden_size[2]])
                train_hidden_4 = tf.nn.relu(tf.matmul(train_hidden_3_drop, weights_4) + biases_4)
                train_hidden_4_drop = tf.nn.dropout(train_hidden_4, keep_prob[2])
                logits = tf.matmul(train_hidden_4_drop, weights_5) + biases_5
                return logits

        t_train_labels = tf.argmax(y_, 1)  # From one-hot (one and zeros) vectors to values
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model(x), labels=t_train_labels))
        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
        y = tf.nn.softmax(model(x))

### RUNNING
print("log address: %s" % (log_addr))
#num_steps = 10001
times_repeat = 20  # number of epochs
batch_size = 100

with tf.Session(graph=graphCNN, config=tf.ConfigProto(log_device_placement=True)) as session:
    tf.initialize_all_variables().run()
    saver = tf.train.Saver(max_to_keep=0)
    writemyfile("---ARCHIVES_M1----")
    mytime = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    writemyfile(str("\nTime: %s \nLayers: %d,%d,%d \epochs: %d" % (mytime,cnn1,cnn2,cnn3,times_repeat)))
    writemyfile("Train files:" + str(listfiles_train))
    writemyfile("Valid files:" + str(listfiles_valid))
    writemyfile("Test files:" + str(listfiles_test))
    print("Model will be saved in file: %s" % save_model_path)
    writemyfile(str("Model will be saved in file: %s" % save_model_path))

    ### TRAINING & VALIDATION
    valid_accuracies_epochs = np.array([])
    for time_repeat in range(times_repeat):
        print("- time_repeat:", time_repeat)
        writemyfile("- time_repeat:" + str(time_repeat))
        for file_train in listfiles_train:
            file_train_id = int(file_train[0:len(file_train)-4])
            time_start_this_file = time.time()
            # LOAD DATA
            print("- - file:", file_train_id, end=' ')
            writemyfile("- - file:" + str(file_train_id))
            Data_train, Label_train = PrepareData(file_train)
            for step in range(0, len(Data_train)-batch_size, batch_size):
                batch_data = Data_train[step:step+batch_size]
                batch_labels = Label_train[step:step+batch_size]
                feed_dict = {x: batch_data, y_: batch_labels, dropout: 1.0}
                opti, l, predictions = session.run([optimizer, loss, y], feed_dict=feed_dict)
            train_accuracies = np.array([])
            for index_tr_accu in range(0, len(Data_train)-SIZE_BATCH_VALID, SIZE_BATCH_VALID):
                current_predictions = y.eval(feed_dict={x: Data_train[index_tr_accu:index_tr_accu+SIZE_BATCH_VALID], dropout: 0.0})
                current_accuracy = accuracy(current_predictions, Label_train[index_tr_accu:index_tr_accu+SIZE_BATCH_VALID])
                train_accuracies = np.r_[train_accuracies, current_accuracy]
            train_accuracy = train_accuracies.mean()
            print("batch accu: %.2f%%" % (train_accuracy), end=" | ")
            writemyfile("batch accu: %.2f%%" % (train_accuracy))
            time_done_this_file = time.time() - time_start_this_file
            print("time: %.2fs" % (time_done_this_file))
            writemyfile("time: %.2fs" % (time_done_this_file))
        # save model
        model_addr = save_model_path + "model335" + "-epoch-" + str(time_repeat) + ".ckpt"
        save_path = saver.save(session, model_addr,)  # max_to_keep default was 5
        mytime = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("epoch finished at %s \n model address: %s" % (mytime, model_addr))
        writemyfile("epoch finished at %s \n model address: %s" % (mytime, model_addr))
        # validation
        valid_accuracies = np.array([])
        for file_valid in listfiles_valid:
            file_valid_id = int(file_valid[0:len(file_valid)-4])
            Data_valid, Label_valid = PrepareData(file_valid)
            for index_vl_accu in range(0, len(Data_valid)-SIZE_BATCH_VALID, SIZE_BATCH_VALID):
                current_predictions = y.eval(feed_dict={x: Data_valid[index_vl_accu:index_vl_accu+SIZE_BATCH_VALID], dropout: 0.0})
                current_accuracy = accuracy(current_predictions, Label_valid[index_vl_accu:index_vl_accu+SIZE_BATCH_VALID])
                valid_accuracies = np.r_[valid_accuracies, current_accuracy]
            valid_accuracy = valid_accuracies.mean()
            print("epoch %d - valid accu: %.2f%%" % (time_repeat, valid_accuracy))
            writemyfile("epoch %d - valid accu: %.2f%%" % (time_repeat, valid_accuracy))
            valid_accuracies_epochs = np.hstack([valid_accuracies_epochs, valid_accuracy])
    print('Done!!')
    writemyfile(str('Done!!'))
    session.close()

Update: I found cifar10_multi_gpu_train.py, which seems to be a good example for training with multiple GPUs, but honestly I don't know how to apply it to my case.
I think you need to change

def model(data):
    with tf.device('/gpu:1'):

to:

def model(data):
    for d in ['/gpu:0', '/gpu:1']:
        with tf.device(d):

and ditch the line with tf.device('/gpu:0'):, since in the first with tf.device... you are only doing initialization of variables, and then you are resetting your devices with the next with tf.device. Let me know if this works, since I can't test it.
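For reference, the pattern cifar10_multi_gpu_train.py (which the asker mentions) uses is tower-style data parallelism: split each batch across GPUs, build one model copy per GPU with shared variables, and average the per-tower gradients before applying them once. A minimal structural sketch under those assumptions (TF1 API as in the question; tower_loss is a hypothetical stand-in for the asker's model(); a real session would also need allow_soft_placement=True):

import tensorflow as tf

def tower_loss(x_slice, y_slice):
    # hypothetical stand-in for the asker's model(); tf.AUTO_REUSE makes the
    # two towers share one set of weights instead of creating duplicates
    logits = tf.layers.dense(x_slice, 5, name="net", reuse=tf.AUTO_REUSE)
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_slice, logits=logits))

x = tf.placeholder(tf.float32, shape=(None, 10))
y_ = tf.placeholder(tf.float32, shape=(None, 5))
opt = tf.train.AdamOptimizer(0.01)

tower_grads = []
x_splits = tf.split(x, 2)   # one slice of the batch per GPU
y_splits = tf.split(y_, 2)
for i, d in enumerate(['/gpu:0', '/gpu:1']):
    with tf.device(d):
        loss = tower_loss(x_splits[i], y_splits[i])
        tower_grads.append(opt.compute_gradients(loss))

# average gradients across towers on the CPU, then apply them once
with tf.device('/cpu:0'):
    avg_grads = []
    for grads_and_vars in zip(*tower_grads):
        grads = [g for g, _ in grads_and_vars]
        avg_grads.append((tf.reduce_mean(tf.stack(grads), 0), grads_and_vars[0][1]))
    train_op = opt.apply_gradients(avg_grads)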
Cropping A Detected Object On A Video With Tensorflow Api And Opencv
- Python 3.6
- Tensorflow 1.11 with GPU support
- Opencv 3.4.2

I am working with the Tensorflow API, and I have already trained my dataset. It works fine. But I have to crop the detected object and do some preprocessing on it. It seems easy, because Tensorflow draws the detected object with a green box as well. But when I try to find the coordinates of the object, it gives me numbers in the range 0 to 1. When I put the coordinates into the OpenCV crop, I have to multiply them by the picture's height and width, but it works wrong.

Tensorflow.org says that I can use the "tf.image.crop_and_resize" function, but I can't get it to run in my own code. This is my run_inference_for_single_image function, which returns output_dict:

def run_inference_for_single_image(image, graph):
    with graph.as_default():
        #with tf.Session() as sess:
        # Get handles to input and output tensors
        ops = tf.get_default_graph().get_operations()
        all_tensor_names = {output.name for op in ops for output in op.outputs}
        tensor_dict = {}
        for key in [
            'num_detections', 'detection_boxes', 'detection_scores',
            'detection_classes', 'detection_masks'
        ]:
            tensor_name = key + ':0'
            if tensor_name in all_tensor_names:
                tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(tensor_name)
        if 'detection_masks' in tensor_dict:
            # The following processing is only for single image
            detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
            detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
            # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
            real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
            detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
            detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
            detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                detection_masks, detection_boxes, image.shape[0], image.shape[1])
            detection_masks_reframed = tf.cast(
                tf.greater(detection_masks_reframed, 0.5), tf.uint8)
            # Follow the convention by adding back the batch dimension
            tensor_dict['detection_masks'] = tf.expand_dims(detection_masks_reframed, 0)
        image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
        # Run inference
        output_dict = sess.run(tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})
        # all outputs are float32 numpy arrays, so convert types as appropriate
        output_dict['num_detections'] = int(output_dict['num_detections'][0])
        output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
        output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
        output_dict['detection_scores'] = output_dict['detection_scores'][0]
        if 'detection_masks' in output_dict:
            output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict

This is my video capture function. It crops the wrong coordinates:

video = cv2.VideoCapture(0)
ret = video.set(3, 1080)
ret = video.set(4, 720)

while(True):
    # Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
    # i.e. a single-column array, where each item in the column has the pixel RGB value
    ret, frame = video.read()
    frame = cv2.flip(frame, 1)
    frame_expanded = np.expand_dims(frame, axis=0)
    # Perform the actual detection by running the model with the image as input
    (boxes, scores, classes, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: frame_expanded})
    vis_util.visualize_boxes_and_labels_on_image_array(
        frame,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8,
        min_score_thresh=0.50)
    # Draw the results of the detection (aka 'visulaize the results')
    output_dict = run_inference_for_single_image(frame, detection_graph)
    max_boxes_to_draw = output_dict['detection_boxes'].shape[0]
    for i in range(min(max_boxes_to_draw, output_dict['detection_boxes'].shape[0])):
        if output_dict['detection_scores'][i] > 0.95:
            if output_dict['detection_classes'][i] in category_index.keys():
                class_name = category_index[output_dict['detection_classes'][i]]['name']
                print(output_dict['detection_boxes'][i])
                crop_img = frame[int((output_dict['detection_boxes'][i][0]) * 720):int((output_dict['detection_boxes'][i][2]) * 720),
                                 int((output_dict['detection_boxes'][i][1]) * 1080):int((output_dict['detection_boxes'][i][3]) * 1080)]
                cv2.imshow("asdasd", crop_img)
                print(class_name)
    cv2.imshow('Object detector', frame)
    # Press 'q' to quit
    if cv2.waitKey(1) == ord('q'):
        break

It might be about output_dict. This line gives me the name of the class, and it works well:

class_name = category_index[output_dict['detection_classes'][i]]['name']
I found an answer for my question. This is the solution code:

while(True):
    # Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
    # i.e. a single-column array, where each item in the column has the pixel RGB value
    ret, frame = video.read()
    frame = cv2.flip(frame, 1)
    frame_expanded = np.expand_dims(frame, axis=0)
    # Perform the actual detection by running the model with the image as input
    (boxes, scores, classes, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: frame_expanded})
    vis_util.visualize_boxes_and_labels_on_image_array(
        frame,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8,
        min_score_thresh=0.50)
    # Draw the results of the detection (aka 'visulaize the results')
    output_dict = run_inference_for_single_image(frame, detection_graph)
    max_boxes_to_draw = output_dict['detection_boxes'].shape[0]
    for i in range(min(max_boxes_to_draw, output_dict['detection_boxes'].shape[0])):
        if output_dict['detection_scores'][i] > 0.80:
            if output_dict['detection_classes'][i] in category_index.keys():
                class_name = category_index[output_dict['detection_classes'][i]]['name']
                #print(output_dict['detection_boxes'][i])
                ymin = boxes[0, i, 0]
                xmin = boxes[0, i, 1]
                ymax = boxes[0, i, 2]
                xmax = boxes[0, i, 3]
                im_width = 1280
                im_height = 720
                (xminn, xmaxx, yminn, ymaxx) = (xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height)
                crop_img = tf.image.crop_to_bounding_box(frame, int(yminn), int(xminn), int(ymaxx-yminn), int(xmaxx-xminn))
                crop_img = frame[int(yminn):int(ymaxx), int(xminn):int(xmaxx)]
                # print(session.run(file))
                """crop_img = frame[int((output_dict['detection_boxes'][i][0]) * 720):int(
                    (output_dict['detection_boxes'][i][2]) * 720), int((output_dict['detection_boxes'][i][1]) * 1080):int(
                    (output_dict['detection_boxes'][i][3]) * 1080)]"""
                cv2.imshow("asdsda", crop_img)
                #print(class_name)
    cv2.imshow('Object detector', frame)
    # Press 'q' to quit
    if cv2.waitKey(1) == ord('q'):
        break

im_width = 1280 means nothing to me, but it works on my project. Thanks for the help.
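A note on the hardcoded im_width = 1280 / im_height = 720: the detection boxes are normalized relative to the actual frame size, so a sketch that derives the size from the frame itself (same logic, no magic numbers) would be:

# frame.shape is (height, width, channels) for an OpenCV BGR image
im_height, im_width = frame.shape[:2]
ymin, xmin, ymax, xmax = boxes[0, i]
crop_img = frame[int(ymin * im_height):int(ymax * im_height),
                 int(xmin * im_width):int(xmax * im_width)]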