I am currently getting a real-time RGB video stream from a Kinect2 camera using Pygame and pykinect2. I want to convert each frame into an OpenCV image so that I can use it in further computations.
import pykinect2
import pygame
import cv2
import ctypes
from pykinect2 import PyKinectV2
from pykinect2 import PyKinectRuntime
kinectcam = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Color)


def draw_color_frame(frame, target_surface):
    """Copy a raw Kinect color frame into a pygame surface's pixel buffer.

    Args:
        frame: Array returned by ``kinectcam.get_last_color_frame()``; its
            bytes are copied verbatim (``frame.size`` bytes).
        target_surface: ``pygame.Surface`` whose buffer receives the pixels.
            NOTE(review): presumably this must be a 32-bit surface with the
            color frame's dimensions so the buffer is large enough -- confirm.
    """
    address = kinectcam.surface_as_array(target_surface.get_buffer())
    ctypes.memmove(address, frame.ctypes.data, frame.size)
    # Drop the reference to the surface buffer as soon as the copy is done.
    del address
# The display module must be initialised before Info(), set_caption() or
# set_mode() can be used.
pygame.init()

# Off-screen 32-bit surface with the size of the Kinect color frame; every new
# frame is copied into it before being scaled to the window.
frame_surface = pygame.Surface(
    (kinectcam.color_frame_desc.Width, kinectcam.color_frame_desc.Height), 0, 32)

pygame.display.set_caption("Kinect View")
infoObject = pygame.display.Info()
# Open the window at half the desktop resolution.
screen = pygame.display.set_mode(
    (infoObject.current_w >> 1, infoObject.current_h >> 1),
    pygame.HWSURFACE | pygame.DOUBLEBUF | pygame.RESIZABLE, 32)
clock = pygame.time.Clock()

done = False
while not done:
    for event in pygame.event.get():  # User did something
        if event.type == pygame.QUIT:  # If user clicked close
            done = True  # Flag that we are done so we exit this loop
        elif event.type == pygame.VIDEORESIZE:  # window resized
            screen = pygame.display.set_mode(
                event.dict['size'],
                pygame.HWSURFACE | pygame.DOUBLEBUF | pygame.RESIZABLE, 32)

    if kinectcam.has_new_color_frame():
        frame = kinectcam.get_last_color_frame()
        draw_color_frame(frame, frame_surface)
        frame = None

    # Scale the frame to the window width while keeping its aspect ratio.
    h_to_w = float(frame_surface.get_height()) / frame_surface.get_width()
    target_height = int(h_to_w * screen.get_width())
    surface_to_draw = pygame.transform.scale(
        frame_surface, (screen.get_width(), target_height))
    screen.blit(surface_to_draw, (0, 0))
    surface_to_draw = None

    # With DOUBLEBUF the blit only reaches the screen after a flip.
    pygame.display.flip()
    # Cap the loop at 60 iterations per second instead of spinning the CPU.
    clock.tick(60)

# Release the sensor and shut pygame down once the window is closed.
kinectcam.close()
pygame.quit()
I assume you are trying to convert the image you are blitting (surface_to_draw). To convert a pygame.Surface object to an OpenCV image:
# array3d copies the surface pixels into a new (width, height, channel) array
view = pygame.surfarray.array3d(surface_to_draw)
# OpenCV images are indexed (height, width, channel): swap the first two axes
view = view.swapaxes(0, 1)
# pygame delivers RGB, OpenCV expects BGR
img_bgr = cv2.cvtColor(view, cv2.COLOR_RGB2BGR)
Update: I also assumed your pygame image is color image.
Using the pyKinect2 library, you can create an acquisitionKinect.py that defines the methods and properties required for Kinect frame processing. Its get_color_frame() method does the conversion; import the class from your main script (Run.py) and display the converted color frame with OpenCV as follows:
Note: I'm answering this assuming you do not want to use the pyGame library but OpenCV instead.
import cv2
from pykinect2 import PyKinectV2
from pykinect2.PyKinectV2 import *
from pykinect2 import PyKinectRuntime
from acquisitionKinect import AcquisitionKinect
from frame import Frame
if __name__ == '__main__':
    kinect = AcquisitionKinect()
    frame = Frame()
    while True:
        # Ask the acquisition object to pick up a new color frame, if any.
        kinect.acquireFrame()
        image = kinect._frameRGB
        # Until the first frame arrives the cached image is None, and
        # cvtColor cannot be called on None.
        if image is not None:
            # The Kinect color frame has four channels; this conversion drops
            # the fourth one and leaves the order of the other three as is.
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
            cv2.imshow('Kinect color frame', image)
        # waitKey also services the HighGUI window; 'q' leaves the loop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    kinect.close()
    cv2.destroyAllWindows()
import ctypes
import _ctypes
import sys
if sys.hexversion >= 0x03000000:
    # Python 3 renamed the low-level threading module to _thread.
    import _thread as thread
else:
    import thread
class AcquisitionKinect():
    """Wrap a Kinect v2 runtime and cache the most recent frames.

    The color frame is stored in ``_frameRGB``; the depth, quantized-depth and
    skeleton slots are initialised to ``None`` and only copied around by the
    methods below.
    """

    # Create a constructor to initialize different types of array and frame objects
    def __init__(self, resolution_mode=1.0):
        """Open the sensor and reset all cached state.

        Args:
            resolution_mode: Divisor applied to the color frame size in
                ``get_color_frame`` (1.0 keeps the native size).
        """
        self.resolution_mode = resolution_mode

        self._done = False

        # Kinect runtime object; color, body and depth sources are requested
        self._kinect = PyKinectRuntime.PyKinectRuntime(
            PyKinectV2.FrameSourceTypes_Color
            | PyKinectV2.FrameSourceTypes_Body
            | PyKinectV2.FrameSourceTypes_Depth)

        # here we will store skeleton data
        self._bodies = None
        self.body_tracked = False
        self.joint_points = np.array([])
        self.joint_points3D = np.array([])
        self.joint_points_RGB = np.array([])
        self.joint_state = np.array([])

        self._frameRGB = None
        self._frameDepth = None
        self._frameDepthQuantized = None
        self._frameSkeleton = None
        self.frameNum = 0

    def get_frame(self, frame):
        """Copy the cached frames into ``frame`` and timestamp it.

        Args:
            frame: Object that receives ``ts`` (milliseconds since the epoch),
                ``frameRGB``, ``frameDepth``, ``frameDepthQuantized`` and
                ``frameSkeleton`` attributes.
        """
        # Imported here because the module-level imports do not provide time.
        import time

        frame.ts = int(round(time.time() * 1000))
        self.frameNum += 1

        frame.frameRGB = self._frameRGB
        frame.frameDepth = self._frameDepth
        frame.frameDepthQuantized = self._frameDepthQuantized
        frame.frameSkeleton = self._frameSkeleton

    # Get a color frame object
    def get_color_frame(self):
        """Fetch the latest color frame and store it, resized, in ``_frameRGB``."""
        raw = self._kinect.get_last_color_frame()
        # The runtime returns a flat buffer; give it image shape
        # (1080 rows x 1920 columns, channel count inferred).
        image = raw.reshape((1080, 1920, -1)).astype(np.uint8)
        scale = 1 / self.resolution_mode
        self._frameRGB = cv2.resize(image, (0, 0), fx=scale, fy=scale)

    # Acquire the type of frame required
    def acquireFrame(self):
        """Refresh ``_frameRGB`` when the sensor reports a new color frame."""
        if self._kinect.has_new_color_frame():
            self.get_color_frame()

    def close(self):
        """Release the sensor and drop the cached frames."""
        if self._kinect is not None:
            self._kinect.close()
        self._frameDepth = None
        self._frameRGB = None
        self._frameSkeleton = None
class Frame():
    """Plain container for one set of sensor frames.

    All fields are class-level defaults; assign on an instance to override.
    """

    # Timestamp slot; AcquisitionKinect.get_frame stores milliseconds here.
    ts = None
    frameRGB = None
    frameDepth = None
    frameDepthQuantized = None
    frameSkeleton = None
    frame_num = 0
    shoulder_orientation_euler = None
    shoulder_orientation_quat = None
TL;DR I'm using:
adaptive thresholding
segmenting by keys (width/height ratio) - see green boxes in image result
psm 10 to treat each key as a character
but it fails to recognize some keys, falsely identifies others or identifies 2 for 1 char (see the L character in the image result, it's an L and P), etc.
Note: I cropped the image and re-ran the results to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc).
I just want it to recognize the alphabet keys. Ultimately I will want it to work for realtime video.
'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
I've tried scaling the image differently, scaling the individual key segments, using opening/closing/etc but it doesn't recognize all the keys.
original image
image result
Update: new results if I make the image straighter (bird's eye) and remove the whitelisting, it manages to detect all for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this and how could I make this adaptive enough for a dynamic video when it is so sensitive to these conditions?
import pytesseract
import numpy as np
try:
    from PIL import Image
except ImportError:
    # Fallback for installs that expose the Image module at top level.
    import Image
import cv2
from tqdm import tqdm
from collections import defaultdict
def get_missing_chars(dict):
    """Return the capital letters A-Z that are not contained in ``dict``.

    Args:
        dict: Any container supporting ``in`` (typically a mapping keyed by
            single characters). The name shadows the builtin but is kept so
            keyword callers continue to work.

    Returns:
        List of missing letters in alphabetical order.
    """
    return [
        chr(code)
        for code in range(ord("A"), ord("Z") + 1)
        if chr(code) not in dict
    ]
def draw_box_and_char(img, contour_dims, c, box_col, text_col):
    """Return a copy of ``img`` with a rectangle and a text label drawn on it.

    Args:
        img: Image array; it is copied, so the caller's array is not modified.
        contour_dims: ``(x, y, w, h)`` of the rectangle to draw.
        c: Text to draw near the bottom of the rectangle.
        box_col: Colour tuple for the rectangle outline.
        text_col: Colour tuple for the text.

    Returns:
        The annotated copy of ``img``.
    """
    x, y, w, h = contour_dims
    top_left = (x, y)
    bot_right = (x + w, y + h)
    font_offset = 3
    # NOTE(review): the horizontal offset is derived from the height (h);
    # confirm that w was not intended.
    text_pos = (x + h // 2 + 12, y + h - font_offset)
    img_copy = img.copy()
    cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
    cv2.putText(img_copy, c, text_pos, cv2.FONT_HERSHEY_SIMPLEX, fontScale=.5,
                color=text_col, thickness=1, lineType=cv2.LINE_AA)
    return img_copy
def detect_keys(img):
    """Find keyboard keys in a BGR image and OCR the letter on each of them.

    The image is downscaled, binarised with an adaptive threshold and split
    into external contours. Contours that are tall enough relative to their
    width and large enough in area are treated as keys; the inner part of each
    key is passed to Tesseract in single-character mode (``--psm 10``).
    A summary of detected and missing letters is printed.

    Args:
        img: BGR image, e.g. as returned by ``cv2.imread``.

    Returns:
        RGB copy of the binarised image with a green box and the recognised
        letter drawn on every key that produced a result.
    """
    scaling = .25
    img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
    print("img shape", img.shape)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    ratio_min = 0.7
    area_min = 1000
    nbrhood_size = 1001
    bias = 2
    # adapt to different lighting
    bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, nbrhood_size, bias)

    # findContours returns 2 or 3 values depending on the OpenCV version.
    items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = items[0] if len(items) == 2 else items[1]

    key_contours = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # square-like ratio and big enough: keep it as a key candidate
        if h / w > ratio_min and cv2.contourArea(contour) > area_min:
            key_contours.append(contour)

    img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
    let_to_contour = {}
    # offset to get smaller square within the key segment for easier char recognition
    offset = 10
    show_each_char = False
    # psm 10: treats the segment as a single character
    custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'

    for contour in tqdm(key_contours, total=len(key_contours)):
        x, y, w, h = cv2.boundingRect(contour)
        new_x, new_y = x + offset, y + offset
        new_w, new_h = w - 2 * offset, h - 2 * offset
        # A key narrower than twice the inset has no interior left to OCR.
        if new_w <= 0 or new_h <= 0:
            continue

        # Blank canvas with only the key interior copied in, then inverted.
        base = np.zeros(bin_img.shape, dtype=np.uint8)
        base[new_y:new_y + new_h, new_x:new_x + new_w] = \
            bin_img[new_y:new_y + new_h, new_x:new_x + new_w]
        segment = cv2.bitwise_not(base)

        d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
        text = d['text'][-1] if d['text'] else ""
        # Use the confidence that belongs to the same (last) entry as the
        # text, as a number, so that candidates compare meaningfully.
        conf = float(d['conf'][-1]) if d['conf'] else -1.0

        if text:
            # sometimes recognizes multiple keys even though there is only 1
            for sub_c in text:
                best = let_to_contour.get(sub_c)
                # keep the most confident box for every character
                if best is None or conf > best['conf']:
                    let_to_contour[sub_c] = {'conf': conf,
                                             'cont': (new_x, new_y, new_w, new_h)}
        else:
            text = "?"

        if show_each_char:
            contour_dims = (new_x, new_y, new_w, new_h)
            box_col = (0, 255, 0)
            text_col = (0, 0, 0)
            segment_with_boxes = draw_box_and_char(segment, contour_dims, text, box_col, text_col)
            cv2.imshow('segment', segment_with_boxes)
            # imshow only renders once the HighGUI event loop runs.
            cv2.waitKey(0)

    # draw boxes around recognized keys
    for char, data in let_to_contour.items():
        box_col = (0, 255, 0)
        text_col = (0, 0, 0)
        img_copy = draw_box_and_char(img_copy, data['cont'], char, box_col, text_col)

    for det in let_to_contour:
        print(det, let_to_contour[det])
    print("total detected: ", let_to_contour.keys())
    missing = get_missing_chars(let_to_contour)
    print(f"n_missing: {len(missing)}")
    print(f"chars missing: {missing}")
    return img_copy
if __name__ == "__main__":
    img_file = "keyboard.jpg"
    img = cv2.imread(img_file)
    # imread reports an unreadable file by returning None instead of raising.
    if img is None:
        raise FileNotFoundError(f"could not read image: {img_file}")
    img_with_detected_keys = detect_keys(img)
    cv2.imshow("detected", img_with_detected_keys)
    # Keep the window open until a key is pressed; without this the script
    # exits before the image is ever rendered.
    cv2.waitKey(0)
    cv2.destroyAllWindows()
I am using the Wand library to generate images, and then I create a video from these frames. Each frame contains a set of animated objects: text, shapes and embedded images. Rendering a one-minute sequence takes about three minutes on my machine. How can I use resources more efficiently and speed up execution in Python?
from wand.color import Color
from wand.image import Image
from wand.drawing import Drawing
from wand.color import Color
from wand.font import Font
from pathlib import Path
# Render settings are kept as strings and parsed right below.
duration, framerate = "2", "24"
resolution = "1080, 1920".split(",")
width, height = (int(side) for side in resolution)
# Number of animation steps: seconds times frames per second.
amount = int(framerate) * int(duration)

font_path = "C:\\Windows\\Fonts\\arial.ttf"
# Input and scratch directories live under ./generator of the working dir.
src_path = Path().absolute().joinpath("generator", "src")
temp_dir = Path().absolute().joinpath("generator", "temp_2")
# Build the list of wand images for the animation and return it.
#
# width/height give the frame size in pixels and also determine the starting
# position (the centre of the frame); amount is the number of animation steps,
# and the loop runs amount + 1 times.
# For every step a new Image is appended and a Drawing is filled with the
# background pattern, a blue 200x200 square centred on the current position and
# the text drawn in black at that position.
# NOTE(review): this listing appears truncated -- the Image(...) call in the
# loop is never closed and nothing applies `draw` to the new frame. Indentation
# is also lost, so whether the position/font-size updates run once per frame is
# an assumption. Confirm against the complete file.
def framegen(width, height, amount) -> list:
frames = []
# start in the centre of the frame
x_position = int(width/2)
y_position = int(height/2)
font_size = 80
text = "Hello world!"
# the pattern image is loaded once, before the loop
background_pattern = Image(filename=src_path / "pattern.png")
for i in range(amount+1):
frames.append(Image(width=width, height=height,
with Drawing() as draw:
# Draw pattern image
draw.composite(operator='over', left=0, top=0, width=background_pattern.width,
height=background_pattern.height, image=background_pattern)
draw.fill_color = Color("blue")
draw.rectangle(left=x_position-100, top=y_position -
100, right=x_position+100, bottom=y_position+100)
draw.font = font_path
draw.font_size = font_size
draw.fill_color = Color("black")
draw.text_alignment = "center"
draw.text(x_position, y_position, text)
# presumably per frame: move the drawing position up and enlarge the text
y_position -= 5
font_size += 2
return frames
# Iterate over the generated frames and return the list `paths`.
# NOTE(review): the loop body is missing from this listing, so nothing is
# written and `paths` is never filled here; presumably each frame is saved
# under temp_dir and its path appended -- confirm against the complete file.
def save_frames(frames, temp_dir):
paths = []
for i, frame in enumerate(frames):
return paths
if __name__ == "__main__":
    # Render every frame first, then hand the whole list to the writer.
    frames = framegen(width, height, amount)
    save_frames(frames, temp_dir)
I would be glad if you help.
Subject: PyQt5
In Pyqt5, I can open the webcam image using opencv.
What I want to do is draw text at a chosen position on the camera image, but I have not managed to do it. The code I wrote is below.
Note: I don't want to add the text with the OpenCV or PIL libraries. OpenCV cannot render Turkish characters, and PIL is very slow.
Thanks in advance.
from PyQt5 import QtGui
from PyQt5.QtWidgets import QWidget, QApplication, QLabel, QVBoxLayout
from PyQt5.QtGui import QPixmap
import sys
import cv2
from PyQt5.QtCore import pyqtSignal, pyqtSlot, Qt, QThread
import numpy as np
class VideoThread(QThread):
    """Background thread that reads camera frames and emits them as arrays."""

    # Emitted once per captured frame with the resized BGR image.
    change_pixmap_signal = pyqtSignal(np.ndarray)

    def __init__(self):
        # QThread must be initialised before the object is used as a thread.
        super().__init__()
        self._run_flag = True

    def run(self):
        """Capture frames until ``stop`` clears the run flag."""
        # capture from analog camera
        cap = cv2.VideoCapture(0)
        try:
            while self._run_flag:
                ret, cv_img = cap.read()
                if ret:
                    cv_img = cv2.resize(cv_img, (1024, 768))
                    self.change_pixmap_signal.emit(cv_img)
        finally:
            # shut down capture system
            cap.release()

    def stop(self):
        """Ask the capture loop to end and block until the thread finishes."""
        # stop capture
        self._run_flag = False
        self.wait()
class App(QWidget):
    """Main window: shows the live camera image above a text label."""

    def __init__(self):
        super().__init__()
        self.disply_width = 1024
        self.display_height = 768
        # create the label that holds the image
        self.image_label = QLabel(self)
        self.image_label.resize(self.disply_width, self.display_height)
        # create a text label
        self.textLabel = QLabel('XXXX')

        # create a vertical box layout and add the two labels
        vbox = QVBoxLayout()
        vbox.addWidget(self.image_label)
        vbox.addWidget(self.textLabel)
        # set the vbox layout as the widgets layout
        self.setLayout(vbox)

        # create the video capture thread
        self.thread = VideoThread()
        # connect its signal to the update_image slot
        self.thread.change_pixmap_signal.connect(self.update_image)
        # start the thread
        self.thread.start()

    def closeEvent(self, event):
        """Stop the capture thread before the window is closed."""
        self.thread.stop()
        event.accept()

    @pyqtSlot(np.ndarray)
    def update_image(self, cv_img):
        """Updates the image_label with a new opencv image"""
        qt_img = self.convert_cv_qt(cv_img)
        self.image_label.setPixmap(qt_img)

    def convert_cv_qt(self, cv_img):
        """Convert from an opencv image to QPixmap"""
        rgb_image = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)
        h, w, ch = rgb_image.shape
        bytes_per_line = ch * w
        convert_to_Qt_format = QtGui.QImage(rgb_image.data, w, h, bytes_per_line,
                                            QtGui.QImage.Format_RGB888)
        p = convert_to_Qt_format.scaled(self.disply_width, self.display_height,
                                        Qt.KeepAspectRatio)
        return QPixmap.fromImage(p)
if __name__ == "__main__":
    app = QApplication(sys.argv)
    a = App()
    # A widget stays invisible until show() is called, and nothing is
    # processed until the event loop runs.
    a.show()
    sys.exit(app.exec_())
I am trying to deploy a PyTorch classifier on a webcam feed, but I keep getting errors, mostly "AttributeError: 'collections.OrderedDict' object has no attribute 'load_state_dict'". The model is a binary classifier, and I saved it as a .pt file.
I would appreciate your help in resolving the issue.
Here is the code I am using:
import numpy as np
import torch
import torch.nn
import torchvision
from torch.autograd import Variable
from torchvision import transforms
import PIL
import cv2
# Class index -> label shown on screen
Labels = {
    0: 'Perfect',
    1: 'Defected',
}

# Let's preprocess the inputted frame
data_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(224, 224)),
    # Normalize works on tensors, so the PIL image has to be converted first.
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Device that will run the computation
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load("defect_classifier.pt", map_location=device)
if isinstance(model, dict):
    # A file written with torch.save(model.state_dict(), ...) holds only the
    # weights; fail with an actionable message instead of an AttributeError.
    raise TypeError(
        "defect_classifier.pt contains a state_dict, not a model object; "
        "instantiate the model class and call load_state_dict() on it")
model = model.to(device)  # set where to run the model and matrix calculation
model.eval()  # switch the model to evaluation mode for inference
#Set the Webcam
# NOTE(review): the body of this function is missing from this listing;
# judging by the name it presumably sets the capture size to 1280x720 --
# confirm against the complete file before use.
def Webcam_720p():
def argmax(prediction):
    """Translate raw model output into a label and a formatted score.

    Args:
        prediction: Model output tensor; assumes shape (1, n_classes) --
            TODO confirm.

    Returns:
        Tuple ``(label, score)``: ``label`` is the ``Labels`` entry for the
        highest-scoring class of the first row, and ``score`` is the largest
        value in ``prediction`` formatted as a string with six decimals.
    """
    prediction = prediction.cpu()
    prediction = prediction.detach().numpy()
    top_1 = np.argmax(prediction, axis=1)
    score = np.amax(prediction)
    score = '{:6f}'.format(score)
    prediction = top_1[0]
    result = Labels[prediction]
    return result, score
def preprocess(image):
    """Turn a webcam frame into a batched float tensor on ``device``.

    Args:
        image: Frame as a numpy array (webcam frames arrive in that format).

    Returns:
        The transformed image with a leading batch dimension added.
    """
    # data_transforms is fed a PIL image, so wrap the numpy frame first.
    # NOTE(review): OpenCV frames are BGR while the normalisation constants
    # look like the usual RGB ones -- confirm whether a colour conversion is
    # needed here.
    image = PIL.Image.fromarray(image)
    image = data_transforms(image)
    image = image.float()
    # Follow the globally selected device instead of assuming CUDA exists.
    image = image.to(device)
    # The model takes a batch, so add a leading dimension of size 1.
    image = image.unsqueeze(0)
    return image
# Let's start the real-time classification process!
cap = cv2.VideoCapture(0)  # Set the webcam

fps = 0
show_score = 0
show_res = 'Nothing'
sequence = 0

while True:
    ret, frame = cap.read()  # Capture each frame
    if not ret:
        # No frame delivered (camera missing or stream ended).
        break

    # Classify only every 4th frame to keep the display responsive.
    if fps == 4:
        image = frame[100:450, 150:570]
        image_data = preprocess(image)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            prediction = model(image_data)
        result, score = argmax(prediction)
        fps = 0
        # argmax returns the score as a formatted string; the threshold test
        # and the '%.5f' format below both need a number.
        score = float(score)
        if score >= 0.5:
            show_res = result
        else:
            show_res = "Nothing"
        show_score = score

    fps += 1
    cv2.putText(frame, '%s' % (show_res), (950, 250), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3)
    cv2.putText(frame, '(score = %.5f)' % (show_score), (950, 300), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    cv2.rectangle(frame, (400, 150), (900, 550), (250, 0, 0), 2)
    cv2.imshow("ASL SIGN DETECTER", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyWindow("ASL SIGN DETECTER")
Don't use this line:
model = torch.load("defect_classifier.pt")
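Instead, create the model from your model class, model = Your_model_class(), and then load the saved weights into it with model.load_state_dict(torch.load("defect_classifier.pt")). The .pt file contains only a state dict (an OrderedDict of weights), not the model object itself, which is why calling model methods on the result of torch.load fails.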
I'm working on a self-driving project. I want to label the frames that I receive from the Jetson Nano's camera, but I get the error shown in the title:
temp_array = roi.reshape(1, int(height / 2) * width).astype(np.float32)
ValueError: cannot reshape array of size 230400 into shape (1,153600)
Can you take a look at my code?
import socket
import numpy as np
import cv2
import pygame
HOST = ''
PORT = 9999
def recvall(sock, count):
    """Read exactly ``count`` bytes from ``sock``.

    Args:
        sock: Connected socket object providing ``recv``.
        count: Number of bytes to read.

    Returns:
        A ``bytes`` object of length ``count``, or ``None`` if the peer
        closed the connection before that many bytes arrived.
    """
    # Collect the pieces and join once; repeated bytes += copies the whole
    # buffer on every chunk.
    chunks = []
    remaining = count
    while remaining:
        chunk = sock.recv(remaining)
        if not chunk:
            return None
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)
client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client_socket.connect((HOST, PORT))

try:
    while True:
        # Every message is a 16-byte length header followed by the payload.
        length = recvall(client_socket, 16)
        if length is None:
            break  # server closed the connection
        stringData = recvall(client_socket, int(length))
        if stringData is None:
            break
        data = np.frombuffer(stringData, dtype='uint8')
        decimg = cv2.imdecode(data, cv2.IMREAD_COLOR)
        if decimg is None:
            # imdecode returns None when the payload is not a valid image.
            continue
        print(type(decimg))  # class numpy.ndarray
        print(decimg.shape)  # 480,640,3
        height, width, _ = decimg.shape
        roi = decimg[120:240, :]  # 120,640,3
        cv2.imshow('Client', decimg)
        # Flatten the region into one row; -1 lets numpy derive the length
        # from the actual slice, so the row count and the colour channels can
        # no longer disagree with a hand-computed size.
        temp_array = roi.reshape(1, -1).astype(np.float32)
        # print(temp_array)
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break
finally:
    client_socket.close()
    cv2.destroyAllWindows()
The problem is with the array sizes: roi holds 120 x 640 x 3 = 230400 values, but you are trying to reshape it into (1, 153600), i.e. (height / 2) x width = 240 x 640. Could you explain exactly what you want to do?