Camera calibration with OpenCV-python for autonomous car doesn't work well - opencv

Problems
doesn't work well
When I use my code with my Image, it doesn't work well.
I only edited 'wc' and 'hc' from OpenCV DOC
import glob
import cv2 as cv
import numpy as np
wc = 7
hc = 4
# termination criteria
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# prepare object points, like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0)
objp = np.zeros((wc * hc, 3), np.float32)
objp[:, :2] = np.mgrid[0:hc, 0:wc].T.reshape(-1, 2)
# Arrays to store object points and image points from all the images.
objpoints = [] # 3d point in real world space
imgpoints = [] # 2d points in image plane.
images = glob. Glob('1.jpg')
for fname in images:
img = cv.imread(fname)
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
# Find the chess board corners
ret, corners = cv.findChessboardCorners(gray, (hc, wc), None)
# If found, add object points, image points (after refining them)
print(ret, wc, hc)
if True:
objpoints.append(objp)
corners2 = cv.cornerSubPix(gray, corners, (20, 20), (-1, -1),
criteria) # image, corners, winSize, zeroZone, criteria
imgpoints.append(corners2)
# Draw and display the corners
cv.drawChessboardCorners(img, (hc, wc), corners2, ret)
cv.imwrite('ChessboardCorners.png', img)
cv.waitKey(0)
ret, mtx, dist, rvecs, tvecs = cv.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)
img = cv.imread('1.jpg')
print(img.shape[:2])
h, w = img.shape[:2]
newcameramtx, roi = cv.getOptimalNewCameraMatrix(mtx, dist, (w, h), 1, (w, h))
# undistort
dst = cv.undistort(img, mtx, dist, None, newcameramtx)
# crop the image
x, y, w, h = roi
dst = dst[y:y + h, x:x + w]
cv.imwrite('calibresult.png', dst)
cv.waitKey(0)
mean_error = 0
for i in range(len(objpoints)):
imgpoints2, _ = cv.projectPoints(objpoints[i], rvecs[i], tvecs[i], mtx, dist)
error = cv.norm(imgpoints[i], imgpoints2, cv.NORM_L2) / len(imgpoints2)
mean_error += error
print("total error: {}".format(mean_error / len(objpoints)))
print("\n\n", fname, "claer")
cv.destroyAllWindows()
exit(0)
original image - not well
ChessboardCorners - (I'm not sure that this is not well)
calibresult image - not well
works well with other images
But, when I use my code with the Image which was in the example in OpenCV DOC, it works well.
import glob
import cv2 as cv
import numpy as np
wc = 6
hc = 7
# termination criteria
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# prepare object points, like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0)
objp = np.zeros((wc * hc, 3), np.float32)
objp[:, :2] = np.mgrid[0:hc, 0:wc].T.reshape(-1, 2)
# Arrays to store object points and image points from all the images.
objpoints = [] # 3d point in real world space
imgpoints = [] # 2d points in image plane.
images = glob. Glob('img.png')
for fname in images:
img = cv.imread(fname)
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
# Find the chess board corners
ret, corners = cv.findChessboardCorners(gray, (hc, wc), None)
# If found, add object points, image points (after refining them)
print(ret, wc, hc)
if True:
objpoints.append(objp)
corners2 = cv.cornerSubPix(gray, corners, (20, 20), (-1, -1),
criteria) # image, corners, winSize, zeroZone, criteria
imgpoints.append(corners2)
# Draw and display the corners
cv.drawChessboardCorners(img, (hc, wc), corners2, ret)
cv.imwrite('ChessboardCorners.png', img)
cv.waitKey(0)
ret, mtx, dist, rvecs, tvecs = cv.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)
img = cv.imread('img.png')
print(img.shape[:2])
h, w = img.shape[:2]
newcameramtx, roi = cv.getOptimalNewCameraMatrix(mtx, dist, (w, h), 1, (w, h))
# undistort
dst = cv.undistort(img, mtx, dist, None, newcameramtx)
# crop the image
x, y, w, h = roi
dst = dst[y:y + h, x:x + w]
cv.imwrite('calibresult.png', dst)
cv.waitKey(0)
mean_error = 0
for i in range(len(objpoints)):
imgpoints2, _ = cv.projectPoints(objpoints[i], rvecs[i], tvecs[i], mtx, dist)
error = cv.norm(imgpoints[i], imgpoints2, cv.NORM_L2) / len(imgpoints2)
mean_error += error
print("total error: {}".format(mean_error / len(objpoints)))
print("\n\n", fname, "claer")
cv.destroyAllWindows()
exit(0)
I removed the images because "Your question appears to be spam."
Please see the images on OpenCV DOC
Please, give me the solution to this problem.
Do I need to modify the parameters, or what should I do?
Is my chessboard wrong?
Below is what I have tried.
First, I tried to find correct numbers of 'wc' and 'hc'
I used this code to find.
import glob
import cv2 as cv
import numpy as np
for i in range(3, 50):
for j in range(i + 1, 50): # I used this code becuase I found that the order of the variables does not matter last time.
wc = i
hc = j
# termination criteria
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# prepare object points, like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0)
objp = np.zeros((wc * hc, 3), np.float32)
objp[:, :2] = np.mgrid[0:hc, 0:wc].T.reshape(-1, 2)
# Arrays to store object points and image points from all the images.
objpoints = [] # 3d point in real world space
imgpoints = [] # 2d points in image plane.
images = glob.glob('1.jpg')
for fname in images:
img = cv.imread(fname)
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
# Find the chess board corners
ret, corners = cv.findChessboardCorners(gray, (hc, wc), None)
# If found, add object points, image points (after refining them)
print(ret, wc, hc)
And the result here:
False 3 4
False 3 5
False 3 6
···
False 4 5
False 4 6
**True 4 7**
False 4 8
False 4 9
···
False 47 48
False 47 49
False 48 49
Process finished with exit code 0
I also found that the Image which was in the example in OpenCV DOC has another 'wc' and 'hc', (4, 4).
And result here:
ChessboardCorners by (4, 4
calibresult by (4, 4)
So, I'm expecting that the 'wc'and 'hc' of my Image (4, 7) might be small.
Should I increase the max and do a brute-force search again?

I can't tell for sure, but it looks like you are only using a single input image for calibration. If that's true, try increasing to at least 5 images (more would be better) and see if that helps. The images should be at different angles and distances.
I notice that your lens has significant distortion. A few suggestions for getting that to work well:
Use the rational model for distortion - the basic kappa 1, kappa 2 model won't do well.
Your data set will need to include image points from all parts of the image, including near the edges and corners of the image. This can be difficult/impossible to achieve using the normal chessboard pattern (because the entire pattern must be visible in the image) - I suggest using the ChAruco calibration pattern/functions. This uses a modified chessboard pattern that includes Aruco markers embedded in the white squares, which allows for partial patterns to be used.
Note that the wc and hc parameters you are searching for are used to describe the chessboard pattern width and height. This should be known to you ahead of time and you shouldn't need to search for it.

Related

Pixel per mm calculation of an image using camera calibration matrix and object distance is not same as pixel dimension of object in MS Paint

I want to calculate number of pixels per grid (i.e. pixels per 11 mm) of the checkerboard. I am doing this to validate that mm/pixel calculation I obtain using calibration matrix and formula (below) is same as what I will see when I open the image in MS Paint.
For the checkerboard image (1920x1080 resolution): 11 grid x 7 grid in size with each grid as 11x 11 mm, at a distance of 500 mm (picture below).
I compute a calibration matrix using code:
import cv2
import numpy as np
import pathlib
#from utils import *
import glob
from argparse import ArgumentParser
topview_image_path = 'checkerboard_top\*.png'
camera_orientation = 'topview'
if camera_orientation == 'topview':
image_path = topview_image_path
def calibrate_chessboard(folder):
# Defining the dimensions of checkerboard
CHECKERBOARD = (6,9)
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# Creating vector to store vectors of 3D points for each checkerboard image
objpoints = []
# Creating vector to store vectors of 2D points for each checkerboard image
imgpoints = []
# Defining the world coordinates for 3D points
objp = np.zeros((1, CHECKERBOARD[0] * CHECKERBOARD[1], 3), np.float32)
objp[0,:,:2] = np.mgrid[0:CHECKERBOARD[0], 0:CHECKERBOARD[1]].T.reshape(-1, 2)
prev_img_shape = None
# Extracting path of individual image stored in a given directory
print("image path:", image_path)
images = glob.glob(image_path)
#images = glob.glob(f'{folder}/*.png')
# if len(images) == 0:
# images = glob.glob(f'{folder}/*.jpg')
# print(images)
for fname in images:
img = cv2.imread(fname)
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
# Find the chess board corners
# If desired number of corners are found in the image then ret = true
ret, corners = cv2.findChessboardCorners(gray, CHECKERBOARD, cv2.CALIB_CB_ADAPTIVE_THRESH + cv2.CALIB_CB_FAST_CHECK + cv2.CALIB_CB_NORMALIZE_IMAGE)
"""
If desired number of corner are detected,
we refine the pixel coordinates and display
them on the images of checker board
"""
if ret == True:
objpoints.append(objp)
# refining pixel coordinates for given 2d points.
corners2 = cv2.cornerSubPix(gray, corners, (11,11),(-1,-1), criteria)
imgpoints.append(corners2)
# Draw and display the corners
img = cv2.drawChessboardCorners(img, CHECKERBOARD, corners2, ret)
cv2.imshow('img',img)
cv2.waitKey(0)
cv2.destroyAllWindows()
h,w = img.shape[:2]
"""
Performing camera calibration by
passing the value of known 3D points (objpoints)
and corresponding pixel coordinates of the
detected corners (imgpoints)
"""
ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)
return [ret, mtx, dist, rvecs, tvecs]
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--type', dest='type',type=str, default='topview',help='is the image topview or sideview?')
parser.add_argument('--folder', dest='folder',type=str, default='cal_images/checkerboard_topview',help='is the image topview or sideview?')
args = parser.parse_args()
WIDTH = 6
HEIGHT = 9
# Calibrate
ret, mtx, dist, rvecs, tvecs = calibrate_chessboard(args.folder)
print(mtx)
print(dist)
mtx_list = ["calibration matrix:\n", str(mtx),
"\ndistortion matrix:", str(dist)]
txt_file = "matrix_" + camera_orientation +".txt"
with open(txt_file, mode='wt', encoding='utf-8') as myfile:
myfile.write('\n'.join(mtx_list))
and get the calibration matrix as:
M = [[2.86276094e+03 0.00000000e+00 8.23315889e+02]
[0.00000000e+00 2.86846709e+03 5.80987675e+02]
[0.00000000e+00 0.00000000e+00 1.00000000e+00]]
This gives me the focal length (f) in pixel units (M[0][0]) i.e. 2862.
I then calculate size in pixels (X_sizepx) of the checkerboard grid 11 mm (X_sizemm) object at distance Z (in my case 500mm) using formula:
X_sizepx = (f/Z) * X_sizemm
Substituting all the values: f = 2862, Z =500, X_sizemm = 11, I get 62.94. So as per opencv calibration and the formula, 11mm should ~63 pixels.
I then open the checkerboard image in MS paint to see the pixel dimension of the square grid in pixels and it says 41 pixels (image below).
This is a big difference if I were to use camera calibration matrix and the the formula. Is there something I am doing wrong?
Note: Z can never be more than 530 mm if I were to assume that Z might be slightly off.

Adjusting pytesseract parameters

Note: I am migrating this question from Data Science Stack Exchange, where it received little exposure.
I am trying to implement an OCR solution to identify the numbers read from the picture of a screen.
I am adapting this pyimagesearch tutorial to my problem.
Because I am dealing with a dark background, I first invert the image, before converting it to grayscale and thresholding it:
inverted_cropped_image = cv2.bitwise_not(cropped_image)
gray = get_grayscale(inverted_cropped_image)
thresholded_image = cv2.threshold(gray, 100, 255, cv2.THRESH_BINARY)[1]
Then I call pytesseract's image_to_data function to output a dictionary containing the different text regions and their confidence intervals:
from pytesseract import Output
results = pytesseract.image_to_data(thresholded_image, output_type=Output.DICT)
Finally I iterate over results and plot them when their confidence exceeds a user defined threshold (70%). What bothers me, is that my script identifies everything in the image except the number that I would like to recognize (1227.938).
My first guess is that the image_to_data parameters are not set properly.
Checking this website, I selected a page segmentation mode (psm) of 11 (sparse text) and tried whitelisting numbers only (tessedit_char_whitelist=0123456789m.'):
results = pytesseract.image_to_data(thresholded_image, config='--psm 11 --oem 3 -c tessedit_char_whitelist=0123456789m.', output_type=Output.DICT)
Alas, this is even worse, and the script now identifies nothing at all!
Do you have any suggestion? Am I missing something obvious here?
EDIT #1:
At Ann Zen's request, here's the code used to obtain the first image:
import imutils
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
from pytesseract import Output
def get_grayscale(image):
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
filename = "IMAGE.JPG"
cropped_image = cv2.imread(filename)
inverted_cropped_image = cv2.bitwise_not(cropped_image)
gray = get_grayscale(inverted_cropped_image)
thresholded_image = cv2.threshold(gray, 100, 255, cv2.THRESH_BINARY)[1]
results = pytesseract.image_to_data(thresholded_image, config='--psm 11 --oem 3 -c tessedit_char_whitelist=0123456789m.', output_type=Output.DICT)
color = (255, 255, 255)
for i in range(0, len(results["text"])):
x = results["left"][i]
y = results["top"][i]
w = results["width"][i]
h = results["height"][i]
text = results["text"][i]
conf = int(results["conf"][i])
print("Confidence: {}".format(conf))
if conf > 70:
print("Confidence: {}".format(conf))
print("Text: {}".format(text))
print("")
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
cv2.rectangle(cropped_image, (x, y), (x + w, y + h), color, 2)
cv2.putText(cropped_image, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,1.2, color, 3)
cv2.imshow('Image', cropped_image)
cv2.waitKey(0)
EDIT #2:
Rarely have I spent reputation points so well! All three replies posted so far helped me refine my algorithm.
First, I wrote a Tkinter program allowing me to manually crop the image around the number of interest (modifying the one found in this SO post)
Then I used Ann Zen's idea of narrowing down the search area around the fractional part. I am using her nifty process function to prepare my grayscale image for contour extraction: contours, _ = cv2.findContours(process(img_gray), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE). I am using RETR_EXTERNAL to avoid dealing with overlapping bounding rectangles.
I then sorted my contours from left to right. Bounding rectangles exceeding a user-defined threshold are associated with the integral part (white rectangles); otherwise they are associated with the fractional part (black rectangles).
I then extracted the characters using Esraa's approach i.e. applying a Gaussian blur prior to calling Tesseract. I used a much larger kernel (15x15 vs 3x3) to achieve this.
I am not out of the woods yet, but hopefully I will get better results by using Ahx's adaptive thresholding.
The Concept
As you have probably heard, pytesseract is not good at detecting text of different sizes on the same line as one piece of text. In your case, you want to detect the 1227.938, where the 1227 is much larger than the .938.
One way to go about solving this is to have the program estimate where the .938 is, and enlarge that part of the image. After that, pytesseract will have no problem in returning the text.
The Code
import cv2
import numpy as np
import pytesseract
def process(img):
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(img_gray, 200, 255, cv2.THRESH_BINARY)
img_canny = cv2.Canny(thresh, 100, 100)
kernel = np.ones((3, 3))
img_dilate = cv2.dilate(img_canny, kernel, iterations=2)
return cv2.erode(img_dilate, kernel, iterations=2)
img = cv2.imread("image.png")
img_copy = img.copy()
hh = 50
contours, _ = cv2.findContours(process(img), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
for cnt in contours:
if 20 * hh < cv2.contourArea(cnt) < 30 * hh:
x, y, w, h = cv2.boundingRect(cnt)
ww = int(hh / h * w)
src_seg = img[y: y + h, x: x + w]
dst_seg = img_copy[y: y + hh, x: x + ww]
h_seg, w_seg = dst_seg.shape[:2]
dst_seg[:] = cv2.resize(src_seg, (ww, hh))[:h_seg, :w_seg]
gray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)
results = pytesseract.image_to_data(thresh)
for b in map(str.split, results.splitlines()[1:]):
if len(b) == 12:
x, y, w, h = map(int, b[6: 10])
cv2.putText(img, b[11], (x, y + h + 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, 0)
cv2.imshow("Result", img)
cv2.waitKey(0)
The Output
Here is the input image:
And here is the output image:
As you have said in your post, the only part you need the the decimal 1227.938. If you want to filter out the rest of the detected text, you can try tweaking some parameters. For example, replacing the 180 from _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY) with 230 will result in the output image:
The Explanation
Import the necessary libraries:
import cv2
import numpy as np
import pytesseract
Define a function, process(), that will take in an image array, and return a binary image array that is the processed version of the image that will allow proper contour detection:
def process(img):
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(img_gray, 200, 255, cv2.THRESH_BINARY)
img_canny = cv2.Canny(thresh, 100, 100)
kernel = np.ones((3, 3))
img_dilate = cv2.dilate(img_canny, kernel, iterations=2)
return cv2.erode(img_dilate, kernel, iterations=2)
I'm sure that you don't have to do this, but due to a problem in my environment, I have to add pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' before I can call the pytesseract.image_to_data() method, or it throws an error:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
Read in the original image, make a copy of it, and define the rough height of the large part of the decimal:
img = cv2.imread("image.png")
img_copy = img.copy()
hh = 50
Detect the contours of the processed version of the image, and add a filter that roughly filters out the contours so that the small text remains:
contours, _ = cv2.findContours(process(img), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
for cnt in contours:
if 20 * hh < cv2.contourArea(cnt) < 30 * hh:
Define the bounding box of each contour that didn't get filtered out, and use the properties to enlarge those parts of the image to the height defined for the large text (making sure to also scale the width accordingly):
x, y, w, h = cv2.boundingRect(cnt)
ww = int(hh / h * w)
src_seg = img[y: y + h, x: x + w]
dst_seg = img_copy[y: y + hh, x: x + ww]
h_seg, w_seg = dst_seg.shape[:2]
dst_seg[:] = cv2.resize(src_seg, (ww, hh))[:h_seg, :w_seg]
Finally, we can use the pytesseract.image_to_data() method to detect the text. Of course, we'll need to threshold the image again:
gray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)
results = pytesseract.image_to_data(thresh)
for b in map(str.split, results.splitlines()[1:]):
if len(b) == 12:
x, y, w, h = map(int, b[6: 10])
cv2.putText(img, b[11], (x, y + h + 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, 0)
cv2.imshow("Result", img)
cv2.waitKey(0)
I have been working with Tesseract for quite some time, so let me clarify something for you. Tesseract is extremely helpful if you're trying to recognize text in documents more than any other computer vision projects. It usually needs a binarized image to get a good output. Therefore, you will always need some image pre-processing.
However, after several trials in the past with all page segmentation modes, I realized that it fails when font size differs on the same line without having a space. Sometimes PSM 6 is helpful if the difference is low, but in your condition, you may try an alternative. If you don't care about the decimals, you may try the following solution:
img = cv2.imread(r'E:\Downloads\Iwzrg.png')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_blur = cv2.GaussianBlur(gray, (3,3),0)
_,thresh = cv2.threshold(img_blur,200,255,cv2.THRESH_BINARY_INV)
# If using a fixed camera
new_img = thresh[0:100, 80:320]
text = pytesseract.image_to_string(new_img, lang='eng', config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789')
OUTPUT: 1227
I would like to recommend applying another image processing method.
Because I am dealing with a dark background, I first invert the image, before converting it to grayscale and thresholding it:
You applied global thresholding and couldn't achieve the desired result.
Then you can apply either adaptive-thresholding or inRange
For the given image, if we apply the inRange threshold:
To be able to recognize the image as accurately as possible we can add a border to the top of the image and resize the image (Optional)
In the OCR section, check if the detected region contains a digit
if text.isdigit():
Then display on the image:
The result is nearly the desired value. Now you can try with the other suggested methods to find the exact value.
The problem is .938 recognized as 235, maybe resizing using different values might improve the result.
Code:
from cv2 import imread, cvtColor, COLOR_BGR2HSV as HSV, inRange, getStructuringElement, resize
from cv2 import imshow, waitKey, MORPH_RECT, dilate, bitwise_and, rectangle, putText
from cv2 import copyMakeBorder as addBorder, BORDER_CONSTANT as CONSTANT, FONT_HERSHEY_SIMPLEX
from numpy import array
from pytesseract import image_to_data, Output
bgr = imread("Iwzrg.png")
resized = resize(bgr, (800, 600), fx=0.75, fy=0.75)
bordered = addBorder(resized, 200, 0, 0, 0, CONSTANT, value=0)
hsv = cvtColor(bordered, HSV)
mask = inRange(hsv, array([0, 0, 250]), array([179, 255, 255]))
kernel = getStructuringElement(MORPH_RECT, (50, 30))
dilated = dilate(mask, kernel, iterations=1)
thresh = 255 - bitwise_and(dilated, mask)
data = image_to_data(thresh, output_type=Output.DICT)
for i in range(0, len(data["text"])):
x = data["left"][i]
y = data["top"][i]
w = data["width"][i]
h = data["height"][i]
text = data["text"][i]
if text.isdigit():
print("Text: {}".format(text))
print("")
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
rectangle(thresh, (x, y), (x + w, y + h), (0, 255, 0), 2)
putText(thresh, text, (x, y - 10), FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)
imshow("", thresh)
waitKey(0)

Extract face rectangle from ID card

I’m researching the subject of extracting the information from ID cards and have found a suitable algorithm to locate the face on the front. As it is, OpenCV has Haar cascades for that, but I’m unsure what can be used to extract the full rectangle that person is in instead of just the face (as is done in https://github.com/deepc94/photo-id-ocr). The few ideas that I’m yet to test are:
Find second largest rectangle that’s inside the card containing the face rect
Do “explode” of the face rectangle until it hits the boundary
Play around with filters to see what can be seen
What can be recommended to try here as well? Any thoughts, ideas or even existing examples are fine.
Normal approach:
import cv2
import numpy as np
import matplotlib.pyplot as plt
image = cv2.imread("a.jpg")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_,thresh = cv2.threshold(gray,128,255,cv2.THRESH_BINARY)
cv2.imshow("thresh",thresh)
thresh = cv2.bitwise_not(thresh)
element = cv2.getStructuringElement(shape=cv2.MORPH_RECT, ksize=(7, 7))
dilate = cv2.dilate(thresh,element,6)
cv2.imshow("dilate",dilate)
erode = cv2.erode(dilate,element,6)
cv2.imshow("erode",erode)
morph_img = thresh.copy()
cv2.morphologyEx(src=erode, op=cv2.MORPH_CLOSE, kernel=element, dst=morph_img)
cv2.imshow("morph_img",morph_img)
_,contours,_ = cv2.findContours(morph_img,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
areas = [cv2.contourArea(c) for c in contours]
sorted_areas = np.sort(areas)
cnt=contours[areas.index(sorted_areas[-3])] #the third biggest contour is the face
r = cv2.boundingRect(cnt)
cv2.rectangle(image,(r[0],r[1]),(r[0]+r[2],r[1]+r[3]),(0,0,255),2)
cv2.imshow("img",image)
cv2.waitKey(0)
cv2.destroyAllWindows()
I found the first two biggest contours are the boundary, the third biggest contour is the face. Result:
There is also another way to investigate the image, using sum of pixel values by axises:
x_hist = np.sum(morph_img,axis=0).tolist()
plt.plot(x_hist)
plt.ylabel('sum of pixel values by X-axis')
plt.show()
y_hist = np.sum(morph_img,axis=1).tolist()
plt.plot(y_hist)
plt.ylabel('sum of pixel values by Y-axis')
plt.show()
Base on those pixel sums over 2 asixes, you can crop the region you want by setting thresholds for it.
Haarcascades approach (The most simple)
# Using cascade Classifiers
import numpy as np
import cv2
# We point OpenCV's CascadeClassifier function to where our
# classifier (XML file format) is stored
face_classifier = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
# Load our image then convert it to grayscale
image = cv2.imread('./your/image/path.jpg')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
cv2.imshow('Original image', image)
# Our classifier returns the ROI of the detected face as a tuple
# It stores the top left coordinate and the bottom right coordiantes
faces = face_classifier.detectMultiScale(gray, 1.3, 5)
# When no faces detected, face_classifier returns and empty tuple
if faces is ():
print("No faces found")
# We iterate through our faces array and draw a rectangle
# over each face in faces
for (x, y, w, h) in faces:
x = x - 25 # Padding trick to take the whole face not just Haarcascades points
y = y - 40 # Same here...
cv2.rectangle(image, (x, y), (x + w + 50, y + h + 70), (27, 200, 10), 2)
cv2.imshow('Face Detection', image)
cv2.waitKey(0)
cv2.destroyAllWindows()
Link to the haarcascade_frontalface_default file
update to #Sanix darker code,
# Using cascade Classifiers
import numpy as np
import cv2
img = cv2.imread('link_to_your_image')
face_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
scale_percent = 60 # percent of original size
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
# resize image
image = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# face classifier
faces = face_classifier.detectMultiScale(gray, 1.3, 5)
# When no faces detected, face_classifier returns and empty tuple
if faces is ():
print("No faces found")
# We iterate through our faces array and draw a rectangle
# over each face in faces
for (x, y, w, h) in faces:
x = x - 25 # Padding trick to take the whole face not just Haarcascades points
y = y - 40 # Same here...
cv2.rectangle(image, (x, y), (x + w + 50, y + h + 70), (27, 200, 10), 2)
cv2.imshow('Face Detection', image)
cv2.waitKey(0)
cv2.destroyAllWindows()
# if you want to crop the face use below code
for (x, y, width, height) in faces:
roi = image[y:y+height, x:x+width]
cv2.imwrite("face.png", roi)

Segmenting products on the shelf

I am trying to detect edges from the products on a shelf using histogram projections. But I am stuck at 2 levels.
The challenges that I m facing are:
How to get the longest non shelf segment from the image i.e Detect the width of the widest product on the shelf from the available one.
How to achieve morphological reconstruction using custom markers.To eliminate
all small horizontal segments, I am generating 2 markers which can be seen in 'markers.png' (Attached). With them, I am calculating the minimum of the reconstruction outputs from both the markers.
Need assistance on this.
Thanks a lot
Below is my python code for the same.
Below is my python code
********************************************************************************
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt
import math
# Read the input image
img = cv.imread('C:\\Users\\672059\\Desktop\\p2.png')
# Converting from BGR to RGB. Default is BGR.
# img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
# Resize the image to 150,150
img_resize = cv.resize(img, (150, 150))
# Get the dimensions of the image
img_h, img_w, img_c = img_resize.shape
# Split the image on channels
red = img[:, :, 0]
green = img[:, :, 1]
blue = img[:, :, 2]
# Defining a vse for erosion
vse = np.ones((img_h, img_w), dtype=np.uint8)
# Morphological Erosion for red channel
red_erode = cv.erode(red, vse, iterations=1)
grad_red = cv.subtract(red, red_erode)
# Morphological Erosion for green channel
green_erode = cv.erode(green, vse, iterations=1)
grad_green = cv.subtract(green, green_erode)
# Morphological Erosion for blue channel
blue_erode = cv.erode(blue, vse, iterations=1)
grad_blue = cv.subtract(blue, blue_erode)
# Stacking the individual channels into one processed image
grad = [grad_red, grad_green, grad_blue]
retrieved_img = np.stack(grad, axis=-1)
retrieved_img = retrieved_img.astype(np.uint8)
retrieved_img_gray = cv.cvtColor(retrieved_img, cv.COLOR_RGB2GRAY)
plt.title('Figure 1')
plt.imshow(cv.bitwise_not(retrieved_img_gray), cmap=plt.get_cmap('gray'))
plt.show()
# Hough Transform of the image to get the longest non shelf boundary from the image!
edges = cv.Canny(retrieved_img_gray, 127, 255)
minLineLength = img_w
maxLineGap = 10
lines = cv.HoughLinesP(edges, 1, np.pi/180, 127, minLineLength=1, maxLineGap=1)
temp = img.copy()
l = []
for x in range(0, len(lines)):
for x1, y1, x2, y2 in lines[x]:
cv.line(temp, (x1, y1), (x2, y2), (0, 255, 0), 2)
d = math.sqrt((x2-x1)**2 + (y2-y1)**2)
l.append(d)
# Defining a hse for erosion
hse = np.ones((1, 7), dtype=np.uint8)
opening = cv.morphologyEx(retrieved_img_gray, cv.MORPH_OPEN, hse)
plt.title('Figure 2')
plt.subplot(1, 2, 1), plt.imshow(img)
plt.subplot(1, 2, 2), plt.imshow(cv.bitwise_not(opening), 'gray')
plt.show()
# Dilation with disk shaped structuring element
horizontal_size = 7
horizontalstructure = cv.getStructuringElement(cv.MORPH_ELLIPSE, (horizontal_size, 1))
dilation = cv.dilate(opening, horizontalstructure)
plt.title('Figure 3')
plt.imshow(cv.bitwise_not(dilation), 'gray')
plt.show()
# Doing canny edge on dilated image
edge = cv.Canny(dilation, 127, 255)
plt.title('Figure 4')
plt.imshow(edges, cmap='gray')
plt.show()
h_projection = edge.sum(axis=1)
print(h_projection)
plt.title('Projection')
plt.plot(h_projection)
plt.show()
listing = []
for i in range(1, len(h_projection)-1):
if h_projection[i-1] == 0 and h_projection[i] == 0:
listing.append(dilation[i])
listing.append(dilation[i-1])
a = np.array([np.array(b) for b in l])
h = len(l)
_, contours, _ = cv.findContours(a, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
x, y, w, h = cv.boundingRect(contours[0])
y = y + i - h
cv.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
l.clear()
plt.imshow(img)
plt.show()
# Generating a mask
black_bg = np.ones([img_h, img_w], dtype=np.uint8)
# Clone the black bgd image
left = black_bg.copy()
right = black_bg.copy()
# Taking 10% of the image width
ten = int(0.1 * img_w)
left[:, 0:ten+1] = 0
right[:, img_w-ten:img_w+1] = 0
plt.title('Figure 4')
plt.subplot(121), plt.imshow(left, 'gray')
plt.subplot(122), plt.imshow(right, 'gray')
plt.show()
# Marker = left and right. Mask = dilation
mask = dilation
marker_left = left
marker_right = right
********************************************************************************
markers.png link: https://i.stack.imgur.com/45WJ6.png
********************************************************************************
Based on you input image, I would :
take a picture of an empty fridge
then compare the current image with the empty one.
play with morphological operations
get connected components > size N
If you can't take a empty fridge image:
segment the shelves (threshold white parts)
undo do the rotation of the image by using image moments of the shelves
for each shelve:
Threshold on saturation
Do a vertical projection
Count maxima.
Tresholded:
Erode-dilate:
Connected componens (width > 10 * height + > minsize):
And you have shelves.
Now take the average Y form each shelf and cut the original image in pieces:
Dither to 8 colors:
and threshold:
Connected components (h>1.5*w, minsize... this is hard here, I played with it :)

How to detect test strips with OpenCV?

I'm a newbie to computer vision, and I'm trying to detect all the test strips in this image:
The result I'm trying to get:
I assume it should be very easy, because all the target objects are in rectangular shape and have a fixed aspect ratio. But I have no idea which algorithm or function should I use.
I've tried edge detection and the 2D feature detection example in OpenCV, but the result is not ideal. How should I detect these similar objects but with small differences?
Update:
The test strips can vary in colors, and of course, the shade of the result lines. But they all have the same references lines, as showing in the picture:
I don't know how should I describe these simple features for object detection, as most examples I found online are for complex objects like a building or a face.
The solution is not exact, but it provides a good starting point. You have to play with the parameters though. It would greatly help you if you partition the strips using some threshold method and then apply hough lines individually as #api55 mentioned.
Here are the results I got.
Code.
import cv2
import numpy as np
# read image
img = cv2.imread('KbxN6.jpg')
# filter it
img = cv2.GaussianBlur(img, (11, 11), 0)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# get edges using laplacian
laplacian_val = cv2.Laplacian(gray_img, cv2.CV_32F)
# lap_img = np.zeros_like(laplacian_val, dtype=np.float32)
# cv2.normalize(laplacian_val, lap_img, 1, 255, cv2.NORM_MINMAX)
# cv2.imwrite('laplacian_val.jpg', lap_img)
# apply threshold to edges
ret, laplacian_th = cv2.threshold(laplacian_val, thresh=2, maxval=255, type=cv2.THRESH_BINARY)
# filter out salt and pepper noise
laplacian_med = cv2.medianBlur(laplacian_th, 5)
# cv2.imwrite('laplacian_blur.jpg', laplacian_med)
laplacian_fin = np.array(laplacian_med, dtype=np.uint8)
# get lines in the filtered laplacian using Hough lines
lines = cv2.HoughLines(laplacian_fin,1,np.pi/180,480)
for rho,theta in lines[0]:
a = np.cos(theta)
b = np.sin(theta)
x0 = a*rho
y0 = b*rho
x1 = int(x0 + 1000*(-b))
y1 = int(y0 + 1000*(a))
x2 = int(x0 - 1000*(-b))
y2 = int(y0 - 1000*(a))
# overlay line on original image
cv2.line(img,(x1,y1),(x2,y2),(0,255,0),2)
# cv2.imwrite('processed.jpg', img)
# cv2.imshow('Window', img)
# cv2.waitKey(0)
This is an alternative solution by using the function findCountours in combination with canny edge detection. The code is based very slightly on this tutorial
import cv2
import numpy as np
import imutils
image = cv2.imread('test.jpg')
resized = imutils.resize(image, width=300)
ratio = image.shape[0] / float(resized.shape[0])
# convert the resized image to grayscale, blur it slightly,
# and threshold it
gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(resized,100,200)
cv2.imshow('dsd2', edges)
cv2.waitKey(0)
cnts = cv2.findContours(edges.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_NONE)
cnts = cnts[0] if imutils.is_cv2() else cnts[1]
sd = ShapeDetector()
# loop over the contours
for c in cnts:
# compute the center of the contour, then detect the name of the
# shape using only the contour
M = cv2.moments(c)
cX = int((M["m10"] / M["m00"]) * ratio)
cY = int((M["m01"] / M["m00"]) * ratio)
# multiply the contour (x, y)-coordinates by the resize ratio,
# then draw the contours and the name of the shape on the image
c = c.astype("float")
c *= ratio
c = c.astype("int")
cv2.drawContours(image, [c], -1, (0, 255, 0), 2)
#show the output image
#cv2.imshow("Image", image)
#cv2.waitKey(0)
cv2.imwrite("erg.jpg",image)
Result:
I guess it can be improved by tuning following parameters:
image resizing width
CHAIN_APPROX_NONE (findContour Docs)
It is maybe also usefull to filter small contours or merge contours which are close to each other.

Resources