Frame generation for video in python - image-processing

I am using the Wand library to generate images, and then I create a video from those frames. Each frame contains a set of animated objects: text, shapes, and embedded images. Creating a one-minute sequence takes my machine about three minutes. How can I use resources more efficiently and speed up execution in Python?
from wand.color import Color
from wand.image import Image
from wand.drawing import Drawing
from wand.font import Font
from pathlib import Path

resolution = "1080, 1920".split(",")
duration = "2"
framerate = "24"
width, height = int(resolution[0]), int(resolution[1])
amount = int(duration) * int(framerate)
temp_dir = Path().absolute() / "generator" / "temp_2"
font_path = "C:\\Windows\\Fonts\\arial.ttf"
src_path = Path().absolute() / "generator" / "src"


def framegen(width, height, amount) -> list:
    frames = []
    x_position = int(width / 2)
    y_position = int(height / 2)
    font_size = 80
    text = "Hello world!"
    background_pattern = Image(filename=src_path / "pattern.png")
    for i in range(amount + 1):
        frames.append(Image(width=width, height=height,
                            background=Color("white")))
        with Drawing() as draw:
            # Draw pattern image
            draw.composite(operator='over', left=0, top=0, width=background_pattern.width,
                           height=background_pattern.height, image=background_pattern)
            draw.fill_color = Color("blue")
            draw.rectangle(left=x_position - 100, top=y_position - 100,
                           right=x_position + 100, bottom=y_position + 100)
            draw.font = font_path
            draw.font_size = font_size
            draw.fill_color = Color("black")
            draw.text_alignment = "center"
            draw.text(x_position, y_position, text)
            draw(frames[i])
        background_pattern.rotate(1)
        y_position -= 5
        font_size += 2
    return frames


def save_frames(frames, temp_dir):
    paths = []
    for i, frame in enumerate(frames):
        frame.save(filename=f"{temp_dir}/frame_{i}.png")
        paths.append(f"{temp_dir}/frame_{i}.png")
    return paths


if __name__ == "__main__":
    frames = framegen(width, height, amount)
    save_frames(frames, temp_dir)
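One way to cut the wall-clock time is to render frames in parallel: the ImageMagick work is CPU-bound, so rendering and saving each frame from its own worker process usually scales close to the number of cores. Below is a minimal sketch under that assumption; it derives each frame's state from the frame index, reuses the sizes and font path from the code above, and omits the rotating background pattern for brevity (it is not a drop-in replacement).

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

from wand.color import Color
from wand.drawing import Drawing
from wand.image import Image

WIDTH, HEIGHT, AMOUNT = 1080, 1920, 48           # 2 s at 24 fps, as above
OUT_DIR = Path("generator") / "temp_2"
FONT_PATH = "C:\\Windows\\Fonts\\arial.ttf"      # adjust for your system


def render_frame(i: int) -> str:
    """Render frame i from scratch and save it; all state is derived from i."""
    x, y = WIDTH // 2, HEIGHT // 2 - 5 * i       # same motion as the loop above
    font_size = 80 + 2 * i
    with Image(width=WIDTH, height=HEIGHT, background=Color("white")) as frame:
        with Drawing() as draw:
            draw.fill_color = Color("blue")
            draw.rectangle(left=x - 100, top=y - 100, right=x + 100, bottom=y + 100)
            draw.font = FONT_PATH
            draw.font_size = font_size
            draw.fill_color = Color("black")
            draw.text_alignment = "center"
            draw.text(x, y, "Hello world!")
            draw(frame)
        path = OUT_DIR / f"frame_{i}.png"
        frame.save(filename=str(path))
    return str(path)


if __name__ == "__main__":
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    with ProcessPoolExecutor() as pool:          # one worker per CPU core by default
        paths = list(pool.map(render_frame, range(AMOUNT + 1)))
    print(f"wrote {len(paths)} frames to {OUT_DIR}")

Saving every frame as a PNG is itself expensive; if the frames only ever feed a video encoder, piping raw frame bytes straight into ffmpeg instead of writing image files is another place to save time.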

Related

Adding dynamic plots to video

I'm trying to append dynamic graph plots to an existing video.
Currently I create a pyplot figure for each frame and concatenate the video frame with the figure (using np.frombuffer to extract each figure's pixels), which is very slow. In the end I use OpenCV's VideoWriter to export the concatenated frames into a video file.
I tried to use the FuncAnimation class, but I couldn't figure out how to adapt it to my pipeline.
Below is my current code. Is there any way I could speed things up?
from abc import ABC, abstractmethod

import cv2
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


class DynamicGraph(ABC):
    def __init__(self, data, height, filters=(), dpi=128):
        self._data = data.copy()
        self.width, self.height = int(1.5 * height), height
        self.filters = filters
        self.dpi = dpi
        self.data = data
        nona = data[~np.isnan(data)]
        self.ylim = (np.min(nona), np.max(nona))

    def to_numpy(self, fig):
        fig.tight_layout()
        fig.canvas.draw()
        frame = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        frame = cv2.cvtColor(frame.reshape(fig.canvas.get_width_height()[::-1] + (3,)), cv2.COLOR_RGB2BGR)
        plt.close(fig)
        return frame

    def __call__(self, i):
        return self.plot(i)

    @abstractmethod
    def plot(self, i):
        pass


class DynamicSignal(DynamicGraph):
    def __init__(self, title, data, legend, xlabel, ylabel, window_size, height, filters=(), dpi=128):
        self.title, self.legend, self.xlabel, self.ylabel = title, legend, xlabel, ylabel
        self.window_size = window_size
        self.radius = self.window_size // 2
        self.ticks = 50
        pad = np.zeros((self.radius, data.shape[1]))
        super().__init__(data=np.concatenate((pad, data, pad), axis=0), height=height, filters=filters, dpi=dpi)

    def plot(self, i):
        fig, ax = plt.subplots(figsize=(self.width / self.dpi, self.height / self.dpi), dpi=self.dpi)
        x = np.arange(i - self.radius, i + self.radius + 1)
        y = self.data[i:i + 2 * self.radius + 1]
        ax.plot(x, y)
        mn, mx = x.min(), x.max()
        xticks = np.linspace(*(np.round(np.array([mn, mx]) / self.ticks) * self.ticks), (mx - mn) // self.ticks + 1)
        ax.set(title=self.title, xlabel=self.xlabel, xlim=(mn, mx), xticks=xticks, ylabel=self.ylabel, ylim=self.ylim)
        ax.axvline(x=i, color='r', linestyle='dotted')
        ax.grid()
        ax.legend(self.legend, loc='upper right')
        return self.to_numpy(fig)


class VideoCreator:
    def __init__(self, painters=(), graphs=()):
        self.painters = painters
        self.graphs = graphs

    def process_frame(self, frame, i):
        out_frame = frame.copy()
        if any(self.graphs):
            graphs = [graph(i) for graph in self.graphs]
            out_frame = np.concatenate((out_frame, np.concatenate(graphs, axis=0)), axis=1)
        return out_frame

    def create_video(self, video_path, video_data, out_path, start=None, end=None):
        fps, length, (width, height) = video_data['fps'], video_data['frame_count'], video_data['resolution']
        start, end = int(0 if start is None else start), int(length if end is None else end)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(out_path, fourcc, fps,
                              (width + (self.graphs[0].width if any(self.graphs) else 0), height))
        cap = cv2.VideoCapture(video_path)
        cap.set(cv2.CAP_PROP_POS_FRAMES, start)
        for i in tqdm(range(start, end), desc="Writing video result"):
            ret, frame = cap.read()
            frame = self.process_frame(frame, i)
            out.write(frame)
        cap.release()
        out.release()
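Most of the cost here is building and tearing down a whole figure per frame. A sketch of one alternative, assuming the same pipeline: create the figure and line artists once, then only swap their data each frame and re-render the canvas. The FastSignal class and its constructor arguments below are illustrative, not a drop-in replacement for DynamicSignal, and the data is assumed to be pre-padded exactly as above.

import cv2
import matplotlib.pyplot as plt
import numpy as np


class FastSignal:
    """Keeps one figure alive and only swaps the plotted data per frame."""

    def __init__(self, data, window_size, height=360, dpi=128):
        self.data = data                     # assumed already padded, as above
        self.radius = window_size // 2
        self.fig, self.ax = plt.subplots(
            figsize=(1.5 * height / dpi, height / dpi), dpi=dpi)
        # create the line artists once; later frames only change their data
        self.lines = self.ax.plot(np.zeros((2 * self.radius + 1, data.shape[1])))
        nona = data[~np.isnan(data)]
        self.ax.set_ylim(np.min(nona), np.max(nona))
        self.fig.tight_layout()

    def plot(self, i):
        x = np.arange(i - self.radius, i + self.radius + 1)
        y = self.data[i:i + 2 * self.radius + 1]
        for line, col in zip(self.lines, y.T):
            line.set_data(x, col)            # no new artists, no new figure
        self.ax.set_xlim(x.min(), x.max())
        self.fig.canvas.draw()
        frame = np.frombuffer(self.fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = self.fig.canvas.get_width_height()
        return cv2.cvtColor(frame.reshape(h, w, 3), cv2.COLOR_RGB2BGR)

If the graphs still dominate the runtime after that, rendering them in one or more worker processes while the main process reads and writes video frames is the next step.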

Why does Tesseract fail to recognize 6 out of 26 of my alphabetic keyboard keys even with several parameter tunings?

TL;DR I'm using:

- adaptive thresholding
- segmenting by keys (width/height ratio) - see the green boxes in the image result
- psm 10 to treat each key as a single character

but it fails to recognize some keys, falsely identifies others, or reads 2 characters for 1 key (see the L character in the image result: it comes back as both L and P), etc.
Note: I cropped the image and re-ran the code to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc.).
I just want it to recognize the alphabet keys. Ultimately I will want it to work on real-time video.
config:
'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
I've tried scaling the whole image differently, scaling the individual key segments, and applying opening/closing and similar morphology, but it still doesn't recognize all the keys.
original image
image result
Update: new results - if I make the image straighter (bird's eye view) and remove the whitelisting, it manages to detect everything for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this, and how could I make it adaptive enough for dynamic video when it is so sensitive to these conditions?
Code:
import pytesseract
import numpy as np
try:
    from PIL import Image
except ImportError:
    import Image
import cv2
from tqdm import tqdm
from collections import defaultdict


def get_missing_chars(dict):
    capital_alphabet = [chr(ascii) for ascii in range(65, 91)]
    return [let for let in capital_alphabet if let not in dict]


def draw_box_and_char(img, contour_dims, c, box_col, text_col):
    x, y, w, h = contour_dims
    top_left = (x, y)
    bot_right = (x + w, y + h)
    font_offset = 3
    text_pos = (x + h//2 + 12, y + h - font_offset)
    img_copy = img.copy()
    cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
    cv2.putText(img_copy, c, text_pos, cv2.FONT_HERSHEY_SIMPLEX, fontScale=.5, color=text_col, thickness=1, lineType=cv2.LINE_AA)
    return img_copy


def detect_keys(img):
    scaling = .25
    img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
    print("img shape", img.shape)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    ratio_min = 0.7
    area_min = 1000
    nbrhood_size = 1001
    bias = 2
    # adapt to different lighting
    bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, nbrhood_size, bias)

    items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = items[0] if len(items) == 2 else items[1]

    key_contours = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        ratio = h/w
        area = cv2.contourArea(c)
        # square-like ratio, try to get character
        if ratio > ratio_min and area > area_min:
            key_contours.append(c)

    detected = defaultdict(int)
    n_kept = 0
    img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
    let_to_contour = {}
    n_contours = len(key_contours)
    # offset to get smaller square within the key segment for easier char recognition
    offset = 10
    show_each_char = False

    for _, c in tqdm(enumerate(key_contours), total=n_contours):
        x, y, w, h = cv2.boundingRect(c)
        ratio = h/w
        area = cv2.contourArea(c)
        base = np.zeros(bin_img.shape, dtype=np.uint8)
        base.fill(255)
        n_kept += 1
        new_y = y + offset
        new_x = x + offset
        new_h = h - 2*offset
        new_w = w - 2*offset
        base[new_y:new_y+new_h, new_x:new_x+new_w] = bin_img[new_y:new_y+new_h, new_x:new_x+new_w]
        segment = cv2.bitwise_not(base)

        # try scaling up individual keys
        # scaling = 2
        # segment = cv2.resize(segment, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_CUBIC)

        # psm 10: treats the segment as a single character
        custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
        d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
        conf = d['conf']
        c = d['text'][-1]
        if c:
            # sometimes recognizes multiple keys even though there is only 1
            for sub_c in c:
                # save character and contour to draw on image and show bounds/detection
                if sub_c not in let_to_contour or (sub_c in let_to_contour and conf > let_to_contour[sub_c]['conf']):
                    let_to_contour[sub_c] = {'conf': conf, 'cont': (new_x, new_y, new_w, new_h)}
        else:
            c = "?"
            text_col = (0, 0, 255)

        if show_each_char:
            contour_dims = (new_x, new_y, new_w, new_h)
            box_col = (0, 255, 0)
            text_col = (0, 0, 0)
            segment_with_boxes = draw_box_and_char(segment, contour_dims, c, box_col, text_col)
            cv2.imshow('segment', segment_with_boxes)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

    # draw boxes around recognized keys
    for c, data in let_to_contour.items():
        box_col = (0, 255, 0)
        text_col = (0, 0, 0)
        img_copy = draw_box_and_char(img_copy, data['cont'], c, box_col, text_col)

    detected = {k: 1 for k in let_to_contour}
    for det in let_to_contour:
        print(det, let_to_contour[det])
    print("total detected: ", let_to_contour.keys())
    missing = get_missing_chars(detected)
    print(f"n_missing: {len(missing)}")
    print(f"chars missing: {missing}")
    return img_copy


if __name__ == "__main__":
    img_file = "keyboard.jpg"
    img = cv2.imread(img_file)
    img_with_detected_keys = detect_keys(img)
    cv2.imshow("detected", img_with_detected_keys)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
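The update above suggests that most of the failures come from perspective distortion rather than from Tesseract itself. One option, sketched below, is to warp the keyboard to a fronto-parallel ("bird's eye") view before thresholding and OCR; the four corner coordinates here are placeholders and would need to be detected or tracked per frame for video (for example from the keyboard's outer contour).

import cv2
import numpy as np


def birds_eye(img, corners, out_w=1200, out_h=400):
    """Warp the quadrilateral given by corners (TL, TR, BR, BL) to a flat rectangle."""
    src = np.float32(corners)
    dst = np.float32([[0, 0], [out_w, 0], [out_w, out_h], [0, out_h]])
    matrix = cv2.getPerspectiveTransform(src, dst)
    return cv2.warpPerspective(img, matrix, (out_w, out_h))


# hypothetical corner coordinates, for illustration only
img = cv2.imread("keyboard.jpg")
warped = birds_eye(img, [(110, 220), (1180, 190), (1230, 640), (80, 680)])
cv2.imwrite("keyboard_birds_eye.jpg", warped)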

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>

I am working on an object detection problem with the aquarium dataset from Roboflow. I have been trying to create bounding boxes for the fish, but I keep getting the error:
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>
I also tried to find out which images are corrupted and ran this code:
import PIL
from pathlib import Path
from PIL import UnidentifiedImageError

count = 0
path = Path("/content/drive/MyDrive/archive/Aquarium Combined").rglob("*.jpg")
for img_p in path:
    try:
        img = PIL.Image.open(img_p)
    except PIL.UnidentifiedImageError:
        print(img_p)
        count += 1
print(count)
It gives me a count of 651 images, but my dataset has 662 images. I guess PIL doesn't know how to decode them, or I don't know what the problem is. I will attach a sample image file name:
/content/drive/MyDrive/archive/Aquarium Combined/test/IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg
Full traceback:
UnidentifiedImageError Traceback (most recent call last)
<ipython-input-31-2785d562a97e> in <module>()
4 sample[1]['boxes'][:, [1, 0, 3, 2]],
5 [classes[i] for i in sample[1]['labels']],
----> 6 width=4).permute(1, 2, 0)
7 )
3 frames
/usr/local/lib/python3.7/dist-packages/PIL/Image.py in open(fp, mode)
2894 if mode == "P":
2895 from . import ImagePalette
-> 2896
2897 im.palette = ImagePalette.ImagePalette("RGB", im.im.getpalette("RGB"))
2898 im.readonly = 1
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fd42c66d3b0>
Also, I am providing the dataset class:
class AquariumDetection(datasets.VisionDataset):
    def __init__(
        self,
        root: str,
        split="train",
        transform=None,
        target_transform=None,
        transforms=None,
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        self.split = split
        self.coco = COCO(os.path.join(root, split, "_annotations.coco.json"))
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.ids = [id for id in self.ids if (len(self._load_target(id)) > 0)]

    def _load_image(self, id: int) -> Image.Image:
        path = self.coco.loadImgs(id)[0]["file_name"]
        image = cv2.imread(os.path.join(self.root, self.split, path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return image

    def _load_target(self, id: int):
        return self.coco.loadAnns(self.coco.getAnnIds(id))

    def __getitem__(self, index: int):
        id = self.ids[index]
        image = self._load_image(id)
        target = copy.deepcopy(self._load_target(id))

        boxes = [t['bbox'] + [t['category_id']] for t in target]
        if self.transforms is not None:
            transformed = self.transforms(image=image, bboxes=boxes)
            image = transformed['image']
            boxes = transformed['bboxes']

        new_boxes = []
        for box in boxes:
            xmin = box[0]
            ymin = box[1]
            xmax = xmin + box[2]
            ymax = ymin + box[3]
            new_boxes.append([ymin, xmin, ymax, xmax])
        boxes = torch.tensor(new_boxes, dtype=torch.float32)

        _, h, w = image.shape
        targ = {}
        targ["boxes"] = boxes
        targ["labels"] = torch.tensor([t["category_id"] for t in target], dtype=torch.int64)
        targ["image_id"] = torch.tensor([t["image_id"] for t in target])
        targ["area"] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        targ["iscrowd"] = torch.tensor([t["iscrowd"] for t in target], dtype=torch.int64)
        targ["img_scale"] = torch.tensor([1.0])
        targ['img_size'] = (h, w)
        image = image.div(255)
        normalize = T.Compose([T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        return normalize(image), targ, index

    def __len__(self) -> int:
        return len(self.ids)
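A quick way to narrow down the mismatch between the 662 expected images and the ones PIL can read is sketched below (it assumes the same folder layout as above). Image.open() only parses the header, so also calling verify() flags files whose header looks fine but whose data is truncated; the bad list can then be excluded before building the dataset.

from pathlib import Path

from PIL import Image, UnidentifiedImageError


def split_valid_images(root):
    """Return (readable, unreadable) .jpg paths under root."""
    good, bad = [], []
    for p in Path(root).rglob("*.jpg"):
        try:
            with Image.open(p) as im:
                im.verify()                  # raises on corrupt/truncated data
            good.append(p)
        except (UnidentifiedImageError, OSError):
            bad.append(p)
    return good, bad


good, bad = split_valid_images("/content/drive/MyDrive/archive/Aquarium Combined")
print(len(good), "readable,", len(bad), "unreadable")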

darknet_images.py doesn't detect any objects. Darknet YOLOv4

So I'm running the default darknet_images.py script in PyCharm (the one from AlexeyAB's GitHub repo), and when I give it the image path, the image shows but the bounding boxes don't. I tried to solve the problem but couldn't find a solution. I found that my detections variable is empty (in the main() function):
image, detections = image_detection(image_name, network, class_names, class_colors, args.thresh )
Just in case I am missing something, I'll include the darknet.py and darknet_images.py code here.
darknet.py:
#!python3
"""
Python 3 wrapper for identifying objects in images
Requires DLL compilation
Both the GPU and no-GPU version should be compiled; the no-GPU version should be renamed "yolo_cpp_dll_nogpu.dll".
On a GPU system, you can force CPU evaluation by any of:
- Set global variable DARKNET_FORCE_CPU to True
- Set environment variable CUDA_VISIBLE_DEVICES to -1
- Set environment variable "FORCE_CPU" to "true"
- Set environment variable "DARKNET_PATH" to path darknet lib .so (for Linux)
Directly viewing or returning bounding-boxed images requires scikit-image to be installed (`pip install scikit-image`)
Original *nix 2.7: https://github.com/pjreddie/darknet/blob/0f110834f4e18b30d5f101bf8f1724c34b7b83db/python/darknet.py
Windows Python 2.7 version: https://github.com/AlexeyAB/darknet/blob/fc496d52bf22a0bb257300d3c79be9cd80e722cb/build/darknet/x64/darknet.py
#author: Philip Kahn
#date: 20180503
"""
from ctypes import *
import math
import random
import os
class BOX(Structure):
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]


class DETECTION(Structure):
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int),
                ("uc", POINTER(c_float)),
                ("points", c_int),
                ("embeddings", POINTER(c_float)),
                ("embedding_size", c_int),
                ("sim", c_float),
                ("track_id", c_int)]


class DETNUMPAIR(Structure):
    _fields_ = [("num", c_int),
                ("dets", POINTER(DETECTION))]


class IMAGE(Structure):
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]


class METADATA(Structure):
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]
def network_width(net):
    return lib.network_width(net)


def network_height(net):
    return lib.network_height(net)


def bbox2points(bbox):
    """
    From bounding box yolo format
    to corner points cv2 rectangle
    """
    x, y, w, h = bbox
    xmin = int(round(x - (w / 2)))
    xmax = int(round(x + (w / 2)))
    ymin = int(round(y - (h / 2)))
    ymax = int(round(y + (h / 2)))
    return xmin, ymin, xmax, ymax


def class_colors(names):
    """
    Create a dict with one random BGR color for each
    class name
    """
    return {name: (
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(0, 255)) for name in names}


def load_network(config_file, data_file, weights, batch_size=1):
    """
    load model description and weights from config files
    args:
        config_file (str): path to .cfg model file
        data_file (str): path to .data model file
        weights (str): path to weights
    returns:
        network: trained model
        class_names
        class_colors
    """
    network = load_net_custom(
        config_file.encode("ascii"),
        weights.encode("ascii"), 0, batch_size)
    metadata = load_meta(data_file.encode("ascii"))
    class_names = [metadata.names[i].decode("ascii") for i in range(metadata.classes)]
    colors = class_colors(class_names)
    return network, class_names, colors


def print_detections(detections, coordinates=False):
    print("\nObjects:")
    for label, confidence, bbox in detections:
        x, y, w, h = bbox
        if coordinates:
            print("{}: {}% (left_x: {:.0f} top_y: {:.0f} width: {:.0f} height: {:.0f})".format(label, confidence, x, y, w, h))
        else:
            print("{}: {}%".format(label, confidence))


def draw_boxes(detections, image, colors):
    import cv2
    for label, confidence, bbox in detections:
        left, top, right, bottom = bbox2points(bbox)
        cv2.rectangle(image, (left, top), (right, bottom), colors[label], 1)
        cv2.putText(image, "{} [{:.2f}]".format(label, float(confidence)),
                    (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    colors[label], 2)
    return image


def decode_detection(detections):
    decoded = []
    for label, confidence, bbox in detections:
        confidence = str(round(confidence * 100, 2))
        decoded.append((str(label), confidence, bbox))
    return decoded


def remove_negatives(detections, class_names, num):
    """
    Remove all classes with 0% confidence within the detection
    """
    predictions = []
    for j in range(num):
        for idx, name in enumerate(class_names):
            if detections[j].prob[idx] > 0:
                bbox = detections[j].bbox
                bbox = (bbox.x, bbox.y, bbox.w, bbox.h)
                predictions.append((name, detections[j].prob[idx], (bbox)))
    return predictions


def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45):
    """
    Returns a list with highest confidence class and their bbox
    """
    pnum = pointer(c_int(0))
    predict_image(network, image)
    detections = get_network_boxes(network, image.w, image.h,
                                   thresh, hier_thresh, None, 0, pnum, 0)
    num = pnum[0]
    if nms:
        do_nms_sort(detections, num, len(class_names), nms)
    predictions = remove_negatives(detections, class_names, num)
    predictions = decode_detection(predictions)
    free_detections(detections, num)
    return sorted(predictions, key=lambda x: x[1])
# lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
# lib = CDLL("libdarknet.so", RTLD_GLOBAL)
hasGPU = True
if os.name == "nt":
    cwd = os.path.dirname(__file__)
    os.environ['PATH'] = cwd + ';' + os.environ['PATH']
    winGPUdll = os.path.join(cwd, "yolo_cpp_dll.dll")
    winNoGPUdll = os.path.join(cwd, "yolo_cpp_dll_nogpu.dll")
    envKeys = list()
    for k, v in os.environ.items():
        envKeys.append(k)
    try:
        try:
            tmp = os.environ["FORCE_CPU"].lower()
            if tmp in ["1", "true", "yes", "on"]:
                raise ValueError("ForceCPU")
            else:
                print("Flag value {} not forcing CPU mode".format(tmp))
        except KeyError:
            # We never set the flag
            if 'CUDA_VISIBLE_DEVICES' in envKeys:
                if int(os.environ['CUDA_VISIBLE_DEVICES']) < 0:
                    raise ValueError("ForceCPU")
            try:
                global DARKNET_FORCE_CPU
                if DARKNET_FORCE_CPU:
                    raise ValueError("ForceCPU")
            except NameError as cpu_error:
                print(cpu_error)
        if not os.path.exists(winGPUdll):
            raise ValueError("NoDLL")
        lib = CDLL(winGPUdll, RTLD_GLOBAL)
    except (KeyError, ValueError):
        hasGPU = False
        if os.path.exists(winNoGPUdll):
            lib = CDLL(winNoGPUdll, RTLD_GLOBAL)
            print("Notice: CPU-only mode")
        else:
            # Try the other way, in case no_gpu was compile but not renamed
            lib = CDLL(winGPUdll, RTLD_GLOBAL)
            print("Environment variables indicated a CPU run, but we didn't find {}. Trying a GPU run anyway.".format(winNoGPUdll))
else:
    lib = CDLL(os.path.join(
        os.environ.get('DARKNET_PATH', './'),
        "libdarknet.so"), RTLD_GLOBAL)
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int
copy_image_from_bytes = lib.copy_image_from_bytes
copy_image_from_bytes.argtypes = [IMAGE,c_char_p]
predict = lib.network_predict_ptr
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)
if hasGPU:
    set_gpu = lib.cuda_set_device
    set_gpu.argtypes = [c_int]
init_cpu = lib.init_cpu
make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE
get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int), c_int]
get_network_boxes.restype = POINTER(DETECTION)
make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)
free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]
free_batch_detections = lib.free_batch_detections
free_batch_detections.argtypes = [POINTER(DETNUMPAIR), c_int]
free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]
network_predict = lib.network_predict_ptr
network_predict.argtypes = [c_void_p, POINTER(c_float)]
reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]
load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p
load_net_custom = lib.load_network_custom
load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int]
load_net_custom.restype = c_void_p
free_network_ptr = lib.free_network_ptr
free_network_ptr.argtypes = [c_void_p]
free_network_ptr.restype = c_void_p
do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
free_image = lib.free_image
free_image.argtypes = [IMAGE]
letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE
load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE
rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]
predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)
predict_image_letterbox = lib.network_predict_image_letterbox
predict_image_letterbox.argtypes = [c_void_p, IMAGE]
predict_image_letterbox.restype = POINTER(c_float)
network_predict_batch = lib.network_predict_batch
network_predict_batch.argtypes = [c_void_p, IMAGE, c_int, c_int, c_int,
                                  c_float, c_float, POINTER(c_int), c_int, c_int]
network_predict_batch.restype = POINTER(DETNUMPAIR)
darknet_images.py:
import argparse
import os
import glob
import random
import darknet
import time
import cv2
import numpy as np


def parser():
    parser = argparse.ArgumentParser(description="YOLO Object Detection")
    parser.add_argument("--input", type=str, default="",
                        help="image source. It can be a single image, a"
                        "txt with paths to them, or a folder. Image valid"
                        " formats are jpg, jpeg or png."
                        "If no input is given, ")
    parser.add_argument("--batch_size", default=1, type=int,
                        help="number of images to be processed at the same time")
    parser.add_argument("--weights", default="yolov4.weights",
                        help="yolo weights path")
    parser.add_argument("--dont_show", action='store_true',
                        help="windown inference display. For headless systems")
    parser.add_argument("--ext_output", action='store_true',
                        help="display bbox coordinates of detected objects")
    parser.add_argument("--save_labels", action='store_true',
                        help="save detections bbox for each image in yolo format")
    parser.add_argument("--config_file", default="./cfg/yolov4.cfg",
                        help="path to config file")
    parser.add_argument("--data_file", default="./cfg/coco.data",
                        help="path to data file")
    parser.add_argument("--thresh", type=float, default=.25,
                        help="remove detections with lower confidence")
    return parser.parse_args()


def check_arguments_errors(args):
    assert 0 < args.thresh < 1, "Threshold should be a float between zero and one (non-inclusive)"
    if not os.path.exists(args.config_file):
        raise(ValueError("Invalid config path {}".format(os.path.abspath(args.config_file))))
    if not os.path.exists(args.weights):
        raise(ValueError("Invalid weight path {}".format(os.path.abspath(args.weights))))
    if not os.path.exists(args.data_file):
        raise(ValueError("Invalid data file path {}".format(os.path.abspath(args.data_file))))
    if args.input and not os.path.exists(args.input):
        raise(ValueError("Invalid image path {}".format(os.path.abspath(args.input))))


def check_batch_shape(images, batch_size):
    """
    Image sizes should be the same width and height
    """
    shapes = [image.shape for image in images]
    if len(set(shapes)) > 1:
        raise ValueError("Images don't have same shape")
    if len(shapes) > batch_size:
        raise ValueError("Batch size higher than number of images")
    return shapes[0]


def load_images(images_path):
    """
    If image path is given, return it directly
    For txt file, read it and return each line as image path
    In other case, it's a folder, return a list with names of each
    jpg, jpeg and png file
    """
    input_path_extension = images_path.split('.')[-1]
    if input_path_extension in ['jpg', 'jpeg', 'png']:
        return [images_path]
    elif input_path_extension == "txt":
        with open(images_path, "r") as f:
            return f.read().splitlines()
    else:
        return glob.glob(
            os.path.join(images_path, "*.jpg")) + \
            glob.glob(os.path.join(images_path, "*.png")) + \
            glob.glob(os.path.join(images_path, "*.jpeg"))


def prepare_batch(images, network, channels=3):
    width = darknet.network_width(network)
    height = darknet.network_height(network)

    darknet_images = []
    for image in images:
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_resized = cv2.resize(image_rgb, (width, height),
                                   interpolation=cv2.INTER_LINEAR)
        custom_image = image_resized.transpose(2, 0, 1)
        darknet_images.append(custom_image)

    batch_array = np.concatenate(darknet_images, axis=0)
    batch_array = np.ascontiguousarray(batch_array.flat, dtype=np.float32)/255.0
    darknet_images = batch_array.ctypes.data_as(darknet.POINTER(darknet.c_float))
    return darknet.IMAGE(width, height, channels, darknet_images)


def image_detection(image_path, network, class_names, class_colors, thresh):
    # Darknet doesn't accept numpy images.
    # Create one with image we reuse for each detect
    width = darknet.network_width(network)
    height = darknet.network_height(network)
    darknet_image = darknet.make_image(width, height, 3)

    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (width, height),
                               interpolation=cv2.INTER_LINEAR)

    darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
    detections = darknet.detect_image(network, class_names, darknet_image, thresh=thresh)
    darknet.free_image(darknet_image)
    image = darknet.draw_boxes(detections, image_resized, class_colors)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB), detections


def batch_detection(network, images, class_names, class_colors,
                    thresh=0.25, hier_thresh=.5, nms=.45, batch_size=4):
    image_height, image_width, _ = check_batch_shape(images, batch_size)
    darknet_images = prepare_batch(images, network)
    batch_detections = darknet.network_predict_batch(network, darknet_images, batch_size, image_width,
                                                     image_height, thresh, hier_thresh, None, 0, 0)
    batch_predictions = []
    for idx in range(batch_size):
        num = batch_detections[idx].num
        detections = batch_detections[idx].dets
        if nms:
            darknet.do_nms_obj(detections, num, len(class_names), nms)
        predictions = darknet.remove_negatives(detections, class_names, num)
        images[idx] = darknet.draw_boxes(predictions, images[idx], class_colors)
        batch_predictions.append(predictions)
    darknet.free_batch_detections(batch_detections, batch_size)
    return images, batch_predictions


def image_classification(image, network, class_names):
    width = darknet.network_width(network)
    height = darknet.network_height(network)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (width, height),
                               interpolation=cv2.INTER_LINEAR)
    darknet_image = darknet.make_image(width, height, 3)
    darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
    detections = darknet.predict_image(network, darknet_image)
    predictions = [(name, detections[idx]) for idx, name in enumerate(class_names)]
    darknet.free_image(darknet_image)
    return sorted(predictions, key=lambda x: -x[1])


def convert2relative(image, bbox):
    """
    YOLO format use relative coordinates for annotation
    """
    x, y, w, h = bbox
    height, width, _ = image.shape
    return x/width, y/height, w/width, h/height


def save_annotations(name, image, detections, class_names):
    """
    Files saved with image_name.txt and relative coordinates
    """
    file_name = os.path.splitext(name)[0] + ".txt"
    with open(file_name, "w") as f:
        for label, confidence, bbox in detections:
            x, y, w, h = convert2relative(image, bbox)
            label = class_names.index(label)
            f.write("{} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}\n".format(label, x, y, w, h, float(confidence)))


def batch_detection_example():
    args = parser()
    check_arguments_errors(args)
    batch_size = 3
    random.seed(3)  # deterministic bbox colors
    network, class_names, class_colors = darknet.load_network(
        args.config_file,
        args.data_file,
        args.weights,
        batch_size=batch_size
    )
    image_names = ['data/horses.jpg', 'data/horses.jpg', 'data/eagle.jpg']
    images = [cv2.imread(image) for image in image_names]
    images, detections, = batch_detection(network, images, class_names,
                                          class_colors, batch_size=batch_size)
    for name, image in zip(image_names, images):
        cv2.imwrite(name.replace("data/", ""), image)
    print(detections)


def main():
    args = parser()
    check_arguments_errors(args)

    random.seed(3)  # deterministic bbox colors
    network, class_names, class_colors = darknet.load_network(
        args.config_file,
        args.data_file,
        args.weights,
        batch_size=args.batch_size
    )

    images = load_images(args.input)

    index = 0
    while True:
        # loop asking for new image paths if no list is given
        if args.input:
            if index >= len(images):
                break
            image_name = images[index]
        else:
            image_name = input("Enter Image Path: ")
        prev_time = time.time()
        image, detections = image_detection(
            image_name, network, class_names, class_colors, args.thresh
        )
        if args.save_labels:
            save_annotations(image_name, image, detections, class_names)
        darknet.print_detections(detections, args.ext_output)
        fps = int(1/(time.time() - prev_time))
        print("FPS: {}".format(fps))
        if not args.dont_show:
            cv2.imshow('Inference', image)
            if cv2.waitKey() & 0xFF == ord('q'):
                break
        index += 1


if __name__ == "__main__":
    # uncomment next line for an example of batch processing
    # batch_detection_example()
    main()
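A minimal sanity check is sketched below (assuming the standard repo layout with the pretrained COCO weights and the sample data/dog.jpg image): load the network once and run image_detection with a threshold far below the default 0.25. If even this prints an empty list, the weights/config pair or the compiled darknet library is the first thing to re-check rather than the script itself.

import random

import darknet
from darknet_images import image_detection

random.seed(3)
network, class_names, class_colors = darknet.load_network(
    "./cfg/yolov4.cfg", "./cfg/coco.data", "yolov4.weights", batch_size=1)

# a very low threshold, so even weak detections show up
_, detections = image_detection(
    "data/dog.jpg", network, class_names, class_colors, thresh=0.05)
print(len(detections), "detections")
darknet.print_detections(detections, coordinates=True)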

How to convert a pygame image to an OpenCV image?

I am currently getting a real-time RGB video from a Kinect v2 camera using pygame and pykinect2. I want to convert it into an OpenCV image so that it is easier to use in my further computations.
import pykinect2
import pygame
import cv2
import ctypes
from pykinect2 import PyKinectV2
from pykinect2 import PyKinectRuntime

kinectcam = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Color)


def draw_color_frame(frame, target_surface):
    target_surface.lock()
    address = kinectcam.surface_as_array(target_surface.get_buffer())
    ctypes.memmove(address, frame.ctypes.data, frame.size)
    del address
    target_surface.unlock()


pygame.init()
frame_surface = pygame.Surface((kinectcam.color_frame_desc.Width, kinectcam.color_frame_desc.Height), 0, 32)
clock = pygame.time.Clock()
pygame.display.set_caption("Kinect View")
infoObject = pygame.display.Info()
screen = pygame.display.set_mode((infoObject.current_w >> 1, infoObject.current_h >> 1),
                                 pygame.HWSURFACE | pygame.DOUBLEBUF | pygame.RESIZABLE, 32)
clock = pygame.time.Clock()
done = False

while not done:
    for event in pygame.event.get():  # User did something
        if event.type == pygame.QUIT:  # If user clicked close
            done = True  # Flag that we are done so we exit this loop
        elif event.type == pygame.VIDEORESIZE:  # window resized
            screen = pygame.display.set_mode(event.dict['size'],
                                             pygame.HWSURFACE | pygame.DOUBLEBUF | pygame.RESIZABLE, 32)

    if kinectcam.has_new_color_frame():
        frame = kinectcam.get_last_color_frame()
        draw_color_frame(frame, frame_surface)
        frame = None

    h_to_w = float(frame_surface.get_height()) / frame_surface.get_width()
    target_height = int(h_to_w * screen.get_width())
    surface_to_draw = pygame.transform.scale(frame_surface, (screen.get_width(), target_height))
    screen.blit(surface_to_draw, (0, 0))
    surface_to_draw = None
    pygame.display.update()
    pygame.display.flip()

    clock.tick(60)

pygame.quit()
kinectcam.close()
I assume you are trying to convert the image you are blitting (surface_to_draw). To convert a pygame.Surface object to an OpenCV image:
# create a copy of the surface
view = pygame.surfarray.array3d(surface_to_draw)
# convert from (width, height, channel) to (height, width, channel)
view = view.transpose([1, 0, 2])
# convert from rgb to bgr
img_bgr = cv2.cvtColor(view, cv2.COLOR_RGB2BGR)
Update: I also assumed your pygame image is a color image.
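A self-contained example of the same conversion, using a plain red Surface as a stand-in for surface_to_draw, so you can see the channel order flip from RGB to BGR:

import cv2
import pygame

pygame.init()
surface = pygame.Surface((320, 240))
surface.fill((255, 0, 0))                      # pure red, RGB in pygame

view = pygame.surfarray.array3d(surface)       # shape (width, height, 3), RGB
view = view.transpose([1, 0, 2])               # -> (height, width, 3)
img_bgr = cv2.cvtColor(view, cv2.COLOR_RGB2BGR)

print(img_bgr.shape, img_bgr[0, 0])            # (240, 320, 3) [  0   0 255]
pygame.quit()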
Note: I'm answering this assuming you do not want to use the pygame library at all, but OpenCV instead.
Using the pykinect2 library, you can create an acquisitionKinect.py that defines the methods and properties required for Kinect frame processing. Then, from your main script (Run.py), call get_color_frame(), which does the conversion, and display the converted color frame with OpenCV as follows:
Run.py
import cv2
from pykinect2 import PyKinectV2
from pykinect2.PyKinectV2 import *
from pykinect2 import PyKinectRuntime

from acquisitionKinect import AcquisitionKinect
from frame import Frame

if __name__ == '__main__':

    kinect = AcquisitionKinect()
    frame = Frame()

    while True:
        kinect.get_frame(frame)
        kinect.get_color_frame()
        image = kinect._frameRGB
        # OpenCV uses RGB image, kinect returns type RGBA, remove extra dim.
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        if not image is None:
            cv2.imshow("Output-Keypoints", image)
            cv2.waitKey(30)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
acquisitionKinect.py
import ctypes
import _ctypes
import sys
import time

import cv2
import numpy as np
from pykinect2 import PyKinectV2
from pykinect2 import PyKinectRuntime

if sys.hexversion >= 0x03000000:
    import _thread as thread
else:
    import thread


class AcquisitionKinect():
    # Create a constructor to initialize different types of array and frame objects
    def __init__(self, resolution_mode=1.0):
        self.resolution_mode = resolution_mode
        self._done = False

        # Kinect runtime object, we want only color and body frames
        self._kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Color | PyKinectV2.FrameSourceTypes_Body | PyKinectV2.FrameSourceTypes_Depth)

        # here we will store skeleton data
        self._bodies = None
        self.body_tracked = False
        self.joint_points = np.array([])
        self.joint_points3D = np.array([])
        self.joint_points_RGB = np.array([])
        self.joint_state = np.array([])

        self._frameRGB = None
        self._frameDepth = None
        self._frameDepthQuantized = None
        self._frameSkeleton = None
        self.frameNum = 0

    def get_frame(self, frame):
        self.acquireFrame()
        frame.ts = int(round(time.time() * 1000))
        self.frameNum += 1
        frame.frameRGB = self._frameRGB
        frame.frameDepth = self._frameDepth
        frame.frameDepthQuantized = self._frameDepthQuantized
        frame.frameSkeleton = self._frameSkeleton

    # Get a color frame object
    def get_color_frame(self):
        self._frameRGB = self._kinect.get_last_color_frame()
        self._frameRGB = self._frameRGB.reshape((1080, 1920, -1)).astype(np.uint8)
        self._frameRGB = cv2.resize(self._frameRGB, (0, 0), fx=1/self.resolution_mode, fy=1/self.resolution_mode)

    # Acquire the type of frame required
    def acquireFrame(self):
        if self._kinect.has_new_color_frame():
            self.get_color_frame()

    def close(self):
        self._kinect.close()
        self._frameDepth = None
        self._frameRGB = None
        self._frameSkeleton = None
Frame.py
class Frame():
    frameRGB = None
    frameDepth = None
    frameDepthQuantized = None
    frameSkeleton = None
    frame_num = 0
    shoulder_orientation_euler = None
    shoulder_orientation_quat = None
