How to run an ONNX model for real-time semantic segmentation using the OpenCV Deep Neural Network (dnn) module

I have a problem with my code, which is shown below:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
labels = ['Background', 'Korosi', 'Tanah', 'Tanaman']
COLORS = np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")
net = cv2.dnn.readNetFromONNX('anomali_model1.onnx')
layer_names = net.getLayerNames()
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers)]
capture = cv2.VideoCapture(0)
while True: re, img =
#img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
#height, width, channels = img.shape
#blob = cv2.dnn.blobFromImage(img, 0.00392, (256, 256),
#swapRB=True, crop=False)
blob = cv2.dnn.blobFromImage(img, swapRB=True, crop=False)
outs = net.forward(output_layers)
class_ids = []
confidences = []
boxes = []
for out in outs:
for detection in out:
scores = detection[5:]
class_id = np.argmax(scores)
confidence = scores[class_id]
if confidence > 0.5:
# Object detected
center_x = int(detection[0] * width)
center_y = int(detection[1] * height)
w = int(detection[2] * width)
h = int(detection[3] * height)
# Rectangle coordinates
x = int(center_x - w / 2)
y = int(center_y - h / 2)
boxes.append([x, y, w, h])
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
colors = np.random.uniform(0, 255, size=(len(classes), 3))
for i in range(len(boxes)):
if i in indexes:
x, y, w, h = boxes[i]
label = str(classes[class_ids[i]])
color = colors[class_ids[i]]
cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
cv2.putText(img, label, (x, y + 30), font, 2, color, 3)
cv2.imshow("Image",cv2.resize(img, (800,600)))
if cv2.waitKey(1) & 0xFF == ord('q'):
And I get an error like this:
error Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_10644\ in <module>
9 blob = cv2.dnn.blobFromImage(img, swapRB=True, crop=False)
10 net.setInput(blob)
---> 11 outs = net.forward(output_layers)
13 class_ids = []
error: OpenCV(3.4.17) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\layers\convolution_layer.cpp:331: error: (-2:Unspecified error) Number of input channels should be multiple of 3 but got 640 in function 'cv::dnn::ConvolutionLayerImpl::getMemoryShapes'
Can you help me solve this problem? I have looked into various sources but did not find a solution.
library version for this code
python version 3.7
tensorflow version 2.0
opencv version 3.4.17
I hope someone can solve this problem and share the solution with me.


I want to change this code so that, instead of tracking cars, it tracks a circle inside a circle

This is the code, but it didn't work with the sign:
import cv2
import numpy as np
from object_detection import ObjectDetection
import math
# Initialize Object Detection
od = ObjectDetection()
cap = cv2.VideoCapture("Sign black.mp4")
#Initalize count
count = 0
center_points_prev_frame = []
tracking_objects = {}
track_id = 0
while True:
ret, frame =
count += 1
if not ret:
# Point current frame
center_points_cur_frame = []
# Detect objects on frame
(class_ids, scores, boxes) = od.detect(frame)
for box in boxes:
(x, y, w, h) = box
cx = int((x + x + w) / 2)
cy = int((y + y + h) / 2)
center_points_cur_frame.append((cx, cy))
#print("FRAME N* ", count, " ", x, y, w, h), (cx,cy), 5, (0, 0, 255), -1)
cv2.rectangle(frame, (x,y), (x + w, y + h), (0, 255, 0), 2)
# Only at the beginning we compare previouse and current frame
if count <= 2:
for pt in center_points_cur_frame:
for pt2 in center_points_prev_frame:
distance = math.hypot(pt2[0] - pt[0], pt2[1] - pt[1])
if distance < 20:
tracking_objects[track_id] = pt
track_id += 1
tracking_objects_copy = tracking_objects.copy()
for object_id, pt2 in tracking_objects_copy.items():
object_exists = False
for pt in center_points_cur_frame:
distance = math.hypot(pt2[0] - pt[0], pt2[1] - pt[1])
# Update object position
if distance < 20:
tracking_objects[object_id] = pt
object_exists = True
# Remove IDs lost
if not object_exists:
for object_id, pt in tracking_objects.items():, pt, 5, (0, 0, 255), -1)
cv2.putText(frame, str(object_id), (pt[0], pt[1] - 7), 0, 1, (0, 0, 255), 2)
print("Tracking objects")
#, pt, 5, (0, 0, 255), -1)
#print("PREV FRAME")
cv2.imshow("Frame", frame)
# Make a copy of the points
center_points_prev_frame = center_points_cur_frame.copy()
Key = cv2.waitKey(1)
if Key == 27:
And this is the `object_detection` module that the script above imports:
import numpy as np
class ObjectDetection:
    """Thin wrapper around an OpenCV DNN detection model built from YOLOv4 files.

    Attributes set by ``__init__``:
        nmsThreshold: NMS threshold forwarded to ``model.detect``.
        confThreshold: Confidence threshold forwarded to ``model.detect``.
        image_size: Side length of the square network input.
        model: The ``cv2.dnn_DetectionModel`` wrapping the loaded network.
        classes: List of class names; empty until ``load_class_names`` is called.
        colors: Array of shape (80, 3) with random values in [0, 255).
    """

    def __init__(self, weights_path="dnn_model/yolov4.weights", cfg_path="dnn_model/yolov4.cfg"):
        """Load the network and configure the detection model.

        Args:
            weights_path: Path to the network weights file.
            cfg_path: Path to the network configuration file.
        """
        print("Loading Object Detection")
        print("Running opencv dnn with YOLOv4")
        self.nmsThreshold = 0.4
        self.confThreshold = 0.5
        self.image_size = 608

        # Load Network
        net = cv2.dnn.readNet(weights_path, cfg_path)

        # NOTE(review): the original comment here read "Enable GPU CUDA", but no
        # backend/target is selected in the visible code, so OpenCV's default
        # backend is used. Configure it on `net` here if a CUDA build is available.
        self.model = cv2.dnn_DetectionModel(net)

        self.classes = []
        self.colors = np.random.uniform(0, 255, size=(80, 3))

        self.model.setInputParams(size=(self.image_size, self.image_size), scale=1/255)

    def load_class_names(self, classes_path="dnn_model/classes.txt"):
        """Read one class name per line from *classes_path* into ``self.classes``.

        The list is rebuilt on every call, so calling the method twice does not
        duplicate entries. ``self.colors`` is re-randomized as a side effect.

        Args:
            classes_path: Text file with one class name per line.

        Returns:
            The list of class names, with surrounding whitespace stripped.
        """
        with open(classes_path, "r") as file_object:
            # Line order is kept as-is; callers index this list by class id.
            self.classes = [class_name.strip() for class_name in file_object]

        self.colors = np.random.uniform(0, 255, size=(80, 3))
        return self.classes

    def detect(self, frame):
        """Run the detection model on *frame* with the configured thresholds.

        Returns:
            Whatever ``self.model.detect`` returns, unmodified.
        """
        return self.model.detect(frame, nmsThreshold=self.nmsThreshold, confThreshold=self.confThreshold)

Cannot open display in WSL 2, py-qt5

How can I display the application window on Windows?
Code for Reference:
from tkinter import N
import numpy as np
from keras.preprocessing.image import img_to_array
import cv2
import imutils
from keras.models import load_model
import numpy as np
# parameters for loading data and images
detection_model_path = 'ER_Project//haar-cascade-files-master/haarcascade_frontalface_default.xml'
emotion_model_path = 'ER_Project/_mini_XCEPTION.102-0.66.hdf5'
# hyper-parameters for bounding boxes shape
# loading models
face_detection = cv2.CascadeClassifier(detection_model_path)
emotion_classifier = load_model(emotion_model_path, compile=False)
EMOTIONS = ["angry", "disgust", "scared", "happy", "sad", "surprised",
#feelings_faces = []
# for index, emotion in enumerate(EMOTIONS):
# feelings_faces.append(cv2.imread('emojis/' + emotion + '.png', -1))
# starting video streaming
camera = cv2.VideoCapture(0)
while True:
frame =[1]
# reading the frame
frame = imutils.resize(frame, width=300)
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
faces = face_detection.detectMultiScale(
gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE)
canvas = np.zeros((250, 300, 3), dtype="uint8")
frameClone = frame.copy()
if len(faces) > 0:
faces = sorted(faces, reverse=True,
key=lambda x: (x[2] - x[0]) * (x[3] - x[1]))[0]
(fX, fY, fW, fH) = faces
# Extract the ROI of the face from the grayscale image, resize it to a fixed 28x28 pixels, and then prepare
# the ROI for classification via the CNN
roi = gray[fY:fY + fH, fX:fX + fW]
roi = cv2.resize(roi, (64, 64))
roi = roi.astype("float") / 255.0
roi = img_to_array(roi)
roi = np.expand_dims(roi, axis=0)
preds = emotion_classifier.predict(roi)[0]
emotion_probability = np.max(preds)
label = EMOTIONS[preds.argmax()]
for (i, (emotion, prob)) in enumerate(zip(EMOTIONS, preds)):
# construct the label text
text = "{}: {:.2f}%".format(emotion, prob * 100)
# draw the label + probability bar on the canvas
# emoji_face = feelings_faces[np.argmax(preds)]
w = int(prob * 300)
cv2.rectangle(canvas, (7, (i * 35) + 5),
(w, (i * 35) + 35), (0, 0, 255), -1)
cv2.putText(canvas, text, (10, (i * 35) + 23),
(255, 255, 255), 2)
cv2.putText(frameClone, label, (fX, fY - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 255), 2)
cv2.rectangle(frameClone, (fX, fY), (fX + fW, fY + fH),
(0, 0, 255), 2)
# for c in range(0, 3):
# frame[200:320, 10:130, c] = emoji_face[:, :, c] * \
# (emoji_face[:, :, 3] / 255.0) + frame[200:320,
# 10:130, c] * (1.0 - emoji_face[:, :, 3] / 255.0)
cv2.imshow('your_face', frameClone)
cv2.imshow("Probabilities", canvas)
if cv2.waitKey(1) & 0xFF == ord('q'):
2022-04-20 04:36:21.181568: I tensorflow/stream_executor/cuda/] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-04-20 04:36:21.181664: I tensorflow/core/common_runtime/gpu/] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3951 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5
I need to run this OpenCV GUI app on Windows.

Real-time OCR video streaming lags when text recognition is enabled

I'm done installing the packages and debugging the script. This code comes from an external source (the link is missing here). When I run it, the camera/video stream lags or is delayed on my end.
Here's the code of what I executed through CLI.
# coding: utf-8
# =====================================================================
# Filename:
# py Ver: python 3.6 or later
# Description: Recognizes regions of text in a given video or through the webcam feed
# Usage: python --east frozen_east_text_detection.pb
# or
# python --east frozen_east_text_detection.pb --video test.avi
# Note: Requires opencv 3.4.2 or later
# For more in-script documentation, look at
# Author: Ankit Saxena (
# =====================================================================
import argparse
import time

import cv2
import imutils
import numpy as np
import pytesseract
from imutils.object_detection import non_max_suppression
from imutils.video import FPS
from imutils.video import VideoStream
# setting up tesseract path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def box_extractor(scores, geometry, min_confidence):
    """Convert the detector's score and geometry maps into boxes and confidences.

    Args:
        scores: 4-D array; ``scores[0, 0, y, x]`` is the confidence at cell (y, x).
        geometry: 4-D array; ``geometry[0, 0..3, y, x]`` are four distances used to
            size the box and ``geometry[0, 4, y, x]`` is the rotation angle
            (radians, as consumed by ``np.cos``/``np.sin``).
        min_confidence: Cells scoring strictly below this value are skipped.

    Returns:
        ``(rectangles, confidences)``: ``rectangles`` is a list of
        ``(start_x, start_y, end_x, end_y)`` integer tuples and ``confidences``
        holds the matching score for each rectangle, in the same order.
    """
    num_rows, num_cols = scores.shape[2:4]
    rectangles = []
    confidences = []

    for y in range(num_rows):
        scores_data = scores[0, 0, y]
        x_data0 = geometry[0, 0, y]
        x_data1 = geometry[0, 1, y]
        x_data2 = geometry[0, 2, y]
        x_data3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        for x in range(num_cols):
            # Skip weak cells; without this guard every cell would emit a box.
            if scores_data[x] < min_confidence:
                continue

            # Map the cell index back to input-image pixels.
            # NOTE(review): the factor 4.0 presumably reflects maps that are 4x
            # smaller than the network input -- confirm against the model.
            offset_x, offset_y = x * 4.0, y * 4.0

            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            box_h = x_data0[x] + x_data2[x]
            box_w = x_data1[x] + x_data3[x]

            end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
            end_y = int(offset_y + (cos * x_data2[x]) - (sin * x_data1[x]))
            start_x = int(end_x - box_w)
            start_y = int(end_y - box_h)

            rectangles.append((start_x, start_y, end_x, end_y))
            # Keep confidences aligned with rectangles for non-max suppression.
            confidences.append(scores_data[x])

    return rectangles, confidences
def get_arguments(argv=None):
    """Parse the command-line options for the text-detection script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            ``argparse`` reads ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        A dict with the keys ``video``, ``east``, ``min_confidence``, ``width``,
        ``height`` and ``padding``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-v', '--video', type=str,
                    help='path to optional video file')
    ap.add_argument('-east', '--east', type=str, required=True,
                    help='path to EAST text detection model')
    ap.add_argument('-c', '--min_confidence', type=float, default=0.5,
                    help='minimum confidence to process a region')
    ap.add_argument('-w', '--width', type=int, default=320,
                    help='resized image width (multiple of 32)')
    ap.add_argument('-e', '--height', type=int, default=320,
                    help='resized image height (multiple of 32)')
    ap.add_argument('-p', '--padding', type=float, default=0.0,
                    help='padding on each ROI border')
    return vars(ap.parse_args(argv))
if __name__ == '__main__':
    # Script entry point: read frames from the webcam (default) or from the
    # file given with --video, detect text regions with the EAST network,
    # run OCR on each region and display the annotated frame until 'q' is pressed.
    args = get_arguments()

    w, h = None, None
    new_w, new_h = args['width'], args['height']
    ratio_w, ratio_h = None, None

    # Output layers requested from the network: scores first, then geometry.
    layer_names = ['feature_fusion/Conv_7/Sigmoid', 'feature_fusion/concat_3']

    print("[INFO] loading EAST text detector...")
    net = cv2.dnn.readNet(args["east"])

    use_webcam = not args.get('video', False)
    if use_webcam:
        print("[INFO] starting video stream...")
        vs = VideoStream(src=0).start()
        # Give the camera a moment to warm up so the first read is not None,
        # which would end the loop immediately.
        time.sleep(1.0)
    else:
        vs = cv2.VideoCapture(args['video'])

    fps = FPS().start()

    try:
        while True:
            frame = vs.read()
            # cv2.VideoCapture.read() returns (ok, frame); VideoStream.read()
            # returns the frame directly.
            frame = frame if use_webcam else frame[1]

            if frame is None:
                break

            frame = imutils.resize(frame, width=500)
            orig = frame.copy()
            orig_h, orig_w = orig.shape[:2]

            # Scale factors from network-input coordinates back to the display
            # frame; computed once from the first frame.
            if w is None or h is None:
                h, w = frame.shape[:2]
                ratio_w = w / float(new_w)
                ratio_h = h / float(new_h)

            frame = cv2.resize(frame, (new_w, new_h))

            blob = cv2.dnn.blobFromImage(frame, 1.0, (new_w, new_h), (123.68, 116.78, 103.94),
                                         swapRB=True, crop=False)
            # The blob must be set as the network input before the forward pass.
            net.setInput(blob)
            scores, geometry = net.forward(layer_names)

            rectangles, confidences = box_extractor(scores, geometry, min_confidence=args['min_confidence'])
            boxes = non_max_suppression(np.array(rectangles), probs=confidences)

            for (start_x, start_y, end_x, end_y) in boxes:
                start_x = int(start_x * ratio_w)
                start_y = int(start_y * ratio_h)
                end_x = int(end_x * ratio_w)
                end_y = int(end_y * ratio_h)

                dx = int((end_x - start_x) * args['padding'])
                dy = int((end_y - start_y) * args['padding'])

                start_x = max(0, start_x - dx)
                start_y = max(0, start_y - dy)
                end_x = min(orig_w, end_x + (dx * 2))
                end_y = min(orig_h, end_y + (dy * 2))

                # ROI to be recognized
                roi = orig[start_y:end_y, start_x:end_x]
                if roi.size == 0:
                    # Degenerate box after clamping; nothing to recognize.
                    continue

                # recognizing text
                # NOTE(review): OCR runs once per box on every frame and is
                # likely the dominant per-frame cost -- confirm by profiling.
                config = '-l eng --oem 1 --psm 7'
                text = pytesseract.image_to_string(roi, config=config)

                cv2.rectangle(orig, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
                cv2.putText(orig, text, (start_x, start_y - 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)

            fps.update()

            cv2.imshow("Detection", orig)
            key = cv2.waitKey(1) & 0xFF

            if key == ord('q'):
                break

        fps.stop()
        print(f"[INFO] elapsed time {round(fps.elapsed(), 2)}")
        print(f"[INFO] approx. FPS : {round(fps.fps(), 2)}")
    finally:
        # Release the capture source and windows even if a frame fails.
        if use_webcam:
            vs.stop()
        else:
            vs.release()
        cv2.destroyAllWindows()
Is there an easy way to make the video streaming smoother with this code?

Playing-card detection with a custom YOLO model in OpenCV: how to determine the inputs and outputs from the custom YOLO .cfg file

I want to detect playing cards and found a .cfg and a .weights file for it. The classes file has the names of the 52 cards. The following code gives an index-out-of-range error. I couldn't understand the outputs of YOLO or how to get the detected labels. I am new to this and have been trying to understand it. Can someone please help?
import cv2
import numpy as np
# Load Yolo
net = cv2.dnn.readNet("yolocards_608.weights", "yolocards.cfg")
classes = []
with open("cards.names", "r") as f:
classes = [line.strip() for line in f.readlines()]
layer_names = net.getLayerNames()
output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
colors = np.random.uniform(0, 255, size=(len(classes), 3))
# Loading image
img = cv2.imread("playing_cards_image.jpg")
img = cv2.resize(img, None, fx=0.4, fy=0.4)
height, width, channels = img.shape
# Detecting objects
blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
outs = net.forward(output_layers)
# Showing informations on the screen
class_ids = []
confidences = []
boxes = []
for out in outs:
for detection in out:
scores = detection[:]
class_id = np.argmax(scores)
confidence = scores[class_id]
if confidence > 0.5:
# Object detected
center_x = int(detection[0] * width)
center_y = int(detection[1] * height)
w = int(detection[2] * width)
h = int(detection[3] * height)
# Rectangle coordinates
x = int(center_x - w / 2)
y = int(center_y - h / 2)
boxes.append([x, y, w, h])
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
for j in range(len(boxes)):
if i in indexes:
x, y, w, h = boxes[i]
label = str(classes[class_ids[i]])
color = colors[i]
cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
cv2.putText(img, label, (x, y + 30), font, 3, color, 3)
IndexError Traceback (most recent call last)
<ipython-input-46-adaf82305ab8> in <module>
6 label = str(classes[class_ids[i]])
7 print(label)
----> 8 color = colors[i]
9 cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
10 cv2.putText(img, label, (x, y + 30), font, 3, color, 3)
IndexError: index 52 is out of bounds for axis 0 with size 52

How to get the minimum-area contour of a color with HSV?

I'm working on an image-processing task. I need to get the maximum- and minimum-area contours inside the loop `for pic, contour in enumerate(contours):`, after filtering with `if (area > 2000):`.
I can get the max and min contour outside the for loop; the problem is that I need the smallest contour whose area is greater than 2000 in this code.
my full code:
import cv2
import numpy as np
from import FPS
import time
cap = cv2.VideoCapture(0)
width = cap.get(3) # float
height = cap.get(4) # float
print width, height
fps = FPS().start()
while (1):
_, img =
if _ is True:
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
blue_lower = np.array([86,0,90], np.uint8)
blue_upper = np.array([163, 64, 145], np.uint8)
blue = cv2.inRange(hsv, blue_lower, blue_upper)
kernal = np.ones((9, 9), "uint8")
blue = cv2.dilate(blue, kernal)
res_blue = cv2.bitwise_and(img, img, mask=blue)
(_, contours, hierarchy) = cv2.findContours(blue, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
for pic, contour in enumerate(contours):
area = cv2.contourArea(contour)
if (area > 2000):
print area
x, y, w, h = cv2.boundingRect(contour)
img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
cv2.putText(img, "Blue Colour", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0))
if len(contours) > 0:
c = max(contours, key=cv2.contourArea)
x, y, w, h = cv2.boundingRect(c)
img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), 5)
cv2.putText(img, "Blue Colour", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0))
cv2.imshow("Color Tracking", img)
if cv2.waitKey(10) & 0xFF == ord('q'):
Any ideas or suggestions will be appreciated
