Character segmentation in python - machine-learning

I am working on detecting handwritten symbols using computer vision in python. I trained a cnn on a dataset of individual characters, but now I want to be able to extract characters from an image in order to make predictions on the individual characters. What is the best way to do this? The handwritten text that I will be working with will not be cursive and there will be an obvious separation between the characters.

In the below snippet,the boxes variable has dimensions for each character in the image.
import cv2
import pytesseract
file = '/content/Captchas/image22.jpg'
img = cv2.imread(file)
h, w, _ = img.shape
boxes = pytesseract.image_to_boxes(img)
for b in boxes.splitlines():
b = b.split(' ')
img = cv2.rectangle(img, (int(b[1]), h - int(b[2])), (int(b[3]), h - int(b[4])), (0, 255, 0), 2)
cv2_imshow(img)
print(boxes)

you can use find contours and bound them with a box.
image = cv2.imread("filename")
image = cv2.fastNlMeansDenoisingColored(image,None,10,10,7,21)
gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
res,thresh = cv2.threshold(gray,150,255,cv2.THRESH_BINARY_INV) #threshold
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3))
dilated = cv2.dilate(thresh,kernel,iterations = 5)
val,contours, hierarchy =
cv2.findContours(dilated,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
coord = []
for contour in contours:
[x,y,w,h] = cv2.boundingRect(contour)
if h>300 and w>300:
continue
if h<40 or w<40:
continue
coord.append((x,y,w,h))
coord.sort(key=lambda tup:tup[0]) # if the image has only one sentence sort in one axis
count = 0
for cor in coord:
[x,y,w,h] = cor
t = image[y:y+h,x:x+w,:]
cv2.imwrite(str(count)+".png",t)
print("number of char in image:", count)

Related

OCR with grouped text based on solid rectangles

I can read text from an image using OCR. However, it works line by line.
I want to now group text based on solid lines surrounding the text.
For example, consider I have below rectangle banners. I can read text line by line. Fine! Now I want to group them by Board A,B,C and hold them in some data structure, so that I can identify, which lines belong to which board. It is given that images would be diagrams like this with solid lines around each block of text.
Please guide me on the right approach.
As mentioned in the comments by Yunus, you need to crop sub-images and feed them to an OCR module individually. An additional step could be ordering of the contours.
Approach:
Obtain binary image and invert it
Find contours
Crop sub-images based on the bounding rectangle for each contour
Feed each sub-image to OCR module (I used easyocr for demonstration)
Store text for each board in a dictionary
Code:
# Libraries import
import cv2
from easyocr import Reader
reader = Reader(['en'])
img = cv2.imread('board_text.jpg',1)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# inverse binary
th = cv2.threshold(gray,127,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)[1]
# find contours and sort them from left to right
contours, hierarchy = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
contours = sorted(contours, key=lambda x: [cv2.boundingRect(x)[0], cv2.boundingRect(x)[1]])
#initialize dictionary
board_dictionary = {}
# iterate each contour and crop bounding box
for i, c in enumerate(cnts):
x,y,w,h = cv2.boundingRect(c)
crop_img = img[y:y+h, x:x+w]
# feed cropped image to easyOCR module
results = reader.readtext(crop_img)
# result is output per line
# create a list to append all lines in cropped image to it
board_text = []
for (bbox, text, prob) in results:
board_text.append(text)
# convert list of words to single string
board_para = ' '.join(board_text)
#print(board_para)
# store string within a dictionary
board_dictionary[str(i)] = board_para
Dictionary Output:
board_dictionary
{'0': 'Board A Board A contains Some Text, That goes Here Some spaces and then text again', '1': 'Board B Board B has some text too but sparse.', '2': 'Board €C Board C is wide and contains text with white spaces '}
Drawing each contour
img2 = img.copy()
for i, c in enumerate(cnts):
x,y,w,h = cv2.boundingRect(c)
img2 = cv2.rectangle(img2, (x, y), (x + w, y + h), (0,255,0), 3)
Note:
While working on different images make sure the ordering is correct.
Choice of OCR module is yours pytesseract and easyocr are the options I know.
This can be done by performing following steps:
Find the shapes.
Compute the shape centers.
Find the text boxes.
Compute the text boxes centers.
Associate the textboxes with shapes based on distance.
The code is as follows:
import cv2
from easyocr import Reader
import math
shape_number = 2
image = cv2.imread("./ueUco.jpg")
deep_copy = image.copy()
image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
ret, thresh = cv2.threshold(image_gray, 150, 255, cv2.THRESH_BINARY)
thresh = 255 - thresh
shapes, hierarchy = cv2.findContours(image=thresh, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)
cv2.drawContours(image=deep_copy, contours=shapes, contourIdx=-1, color=(0, 255, 0), thickness=2, lineType=cv2.LINE_AA)
shape_centers = []
for shape in shapes:
row = int((shape[0][0][0] + shape[3][0][0])/2)
column = int((shape[3][0][1] + shape[2][0][1])/2)
center = (row, column, shape)
shape_centers.append(center)
# cv2.imshow('Shapes', deep_copy)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
languages = ['en']
reader = Reader(languages, gpu = True)
results = reader.readtext(image)
def cleanup_text(text):
return "".join([c if ord(c) < 128 else "" for c in text]).strip()
for (bbox, text, prob) in results:
text = cleanup_text(text)
(tl, tr, br, bl) = bbox
tl = (int(tl[0]), int(tl[1]))
tr = (int(tr[0]), int(tr[1]))
br = (int(br[0]), int(br[1]))
bl = (int(bl[0]), int(bl[1]))
column = int((tl[0] + tr[0])/2)
row = int((tr[1] + br[1])/2)
center = (row, column, bbox)
distances = []
for iteration, shape_center in enumerate(shape_centers):
shape_row = shape_center[0]
shape_column = shape_center[1]
dist = int(math.dist([column, row], [shape_row, shape_column]))
distances.append(dist)
min_value = min(distances)
min_index = distances.index(min_value)
if min_index == shape_number:
cv2.rectangle(image, tl, br, (0, 255, 0), 2)
cv2.putText(image, text, (tl[0], tl[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
cv2.imshow("Image", image)
cv2.waitKey(0)
cv2.imwrite(f"image_{shape_number}.jpg", image)
cv2.destroyAllWindows()
The output looks like this.
Please note that this solution is almost complete. You just have to compute the text embodied in each shape and put it in your desired data structure.
Note: shape_number represents the shape that you want to consider.
There is another solution that I would like you to work on.
Find all the text boxes.
Compute the centers for text boxes.
Run k-means clustering on the centers.
I would prefer the second solution but for the time being, I implemented the first one.

Adjusting pytesseract parameters

Note: I am migrating this question from Data Science Stack Exchange, where it received little exposure.
I am trying to implement an OCR solution to identify the numbers read from the picture of a screen.
I am adapting this pyimagesearch tutorial to my problem.
Because I am dealing with a dark background, I first invert the image, before converting it to grayscale and thresholding it:
inverted_cropped_image = cv2.bitwise_not(cropped_image)
gray = get_grayscale(inverted_cropped_image)
thresholded_image = cv2.threshold(gray, 100, 255, cv2.THRESH_BINARY)[1]
Then I call pytesseract's image_to_data function to output a dictionary containing the different text regions and their confidence intervals:
from pytesseract import Output
results = pytesseract.image_to_data(thresholded_image, output_type=Output.DICT)
Finally I iterate over results and plot them when their confidence exceeds a user defined threshold (70%). What bothers me, is that my script identifies everything in the image except the number that I would like to recognize (1227.938).
My first guess is that the image_to_data parameters are not set properly.
Checking this website, I selected a page segmentation mode (psm) of 11 (sparse text) and tried whitelisting numbers only (tessedit_char_whitelist=0123456789m.'):
results = pytesseract.image_to_data(thresholded_image, config='--psm 11 --oem 3 -c tessedit_char_whitelist=0123456789m.', output_type=Output.DICT)
Alas, this is even worse, and the script now identifies nothing at all!
Do you have any suggestion? Am I missing something obvious here?
EDIT #1:
At Ann Zen's request, here's the code used to obtain the first image:
import imutils
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
from pytesseract import Output
def get_grayscale(image):
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
filename = "IMAGE.JPG"
cropped_image = cv2.imread(filename)
inverted_cropped_image = cv2.bitwise_not(cropped_image)
gray = get_grayscale(inverted_cropped_image)
thresholded_image = cv2.threshold(gray, 100, 255, cv2.THRESH_BINARY)[1]
results = pytesseract.image_to_data(thresholded_image, config='--psm 11 --oem 3 -c tessedit_char_whitelist=0123456789m.', output_type=Output.DICT)
color = (255, 255, 255)
for i in range(0, len(results["text"])):
x = results["left"][i]
y = results["top"][i]
w = results["width"][i]
h = results["height"][i]
text = results["text"][i]
conf = int(results["conf"][i])
print("Confidence: {}".format(conf))
if conf > 70:
print("Confidence: {}".format(conf))
print("Text: {}".format(text))
print("")
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
cv2.rectangle(cropped_image, (x, y), (x + w, y + h), color, 2)
cv2.putText(cropped_image, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,1.2, color, 3)
cv2.imshow('Image', cropped_image)
cv2.waitKey(0)
EDIT #2:
Rarely have I spent reputation points so well! All three replies posted so far helped me refine my algorithm.
First, I wrote a Tkinter program allowing me to manually crop the image around the number of interest (modifying the one found in this SO post)
Then I used Ann Zen's idea of narrowing down the search area around the fractional part. I am using her nifty process function to prepare my grayscale image for contour extraction: contours, _ = cv2.findContours(process(img_gray), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE). I am using RETR_EXTERNAL to avoid dealing with overlapping bounding rectangles.
I then sorted my contours from left to right. Bounding rectangles exceeding a user-defined threshold are associated with the integral part (white rectangles); otherwise they are associated with the fractional part (black rectangles).
I then extracted the characters using Esraa's approach i.e. applying a Gaussian blur prior to calling Tesseract. I used a much larger kernel (15x15 vs 3x3) to achieve this.
I am not out of the woods yet, but hopefully I will get better results by using Ahx's adaptive thresholding.
The Concept
As you have probably heard, pytesseract is not good at detecting text of different sizes on the same line as one piece of text. In your case, you want to detect the 1227.938, where the 1227 is much larger than the .938.
One way to go about solving this is to have the program estimate where the .938 is, and enlarge that part of the image. After that, pytesseract will have no problem in returning the text.
The Code
import cv2
import numpy as np
import pytesseract
def process(img):
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(img_gray, 200, 255, cv2.THRESH_BINARY)
img_canny = cv2.Canny(thresh, 100, 100)
kernel = np.ones((3, 3))
img_dilate = cv2.dilate(img_canny, kernel, iterations=2)
return cv2.erode(img_dilate, kernel, iterations=2)
img = cv2.imread("image.png")
img_copy = img.copy()
hh = 50
contours, _ = cv2.findContours(process(img), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
for cnt in contours:
if 20 * hh < cv2.contourArea(cnt) < 30 * hh:
x, y, w, h = cv2.boundingRect(cnt)
ww = int(hh / h * w)
src_seg = img[y: y + h, x: x + w]
dst_seg = img_copy[y: y + hh, x: x + ww]
h_seg, w_seg = dst_seg.shape[:2]
dst_seg[:] = cv2.resize(src_seg, (ww, hh))[:h_seg, :w_seg]
gray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)
results = pytesseract.image_to_data(thresh)
for b in map(str.split, results.splitlines()[1:]):
if len(b) == 12:
x, y, w, h = map(int, b[6: 10])
cv2.putText(img, b[11], (x, y + h + 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, 0)
cv2.imshow("Result", img)
cv2.waitKey(0)
The Output
Here is the input image:
And here is the output image:
As you have said in your post, the only part you need the the decimal 1227.938. If you want to filter out the rest of the detected text, you can try tweaking some parameters. For example, replacing the 180 from _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY) with 230 will result in the output image:
The Explanation
Import the necessary libraries:
import cv2
import numpy as np
import pytesseract
Define a function, process(), that will take in an image array, and return a binary image array that is the processed version of the image that will allow proper contour detection:
def process(img):
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(img_gray, 200, 255, cv2.THRESH_BINARY)
img_canny = cv2.Canny(thresh, 100, 100)
kernel = np.ones((3, 3))
img_dilate = cv2.dilate(img_canny, kernel, iterations=2)
return cv2.erode(img_dilate, kernel, iterations=2)
I'm sure that you don't have to do this, but due to a problem in my environment, I have to add pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' before I can call the pytesseract.image_to_data() method, or it throws an error:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
Read in the original image, make a copy of it, and define the rough height of the large part of the decimal:
img = cv2.imread("image.png")
img_copy = img.copy()
hh = 50
Detect the contours of the processed version of the image, and add a filter that roughly filters out the contours so that the small text remains:
contours, _ = cv2.findContours(process(img), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
for cnt in contours:
if 20 * hh < cv2.contourArea(cnt) < 30 * hh:
Define the bounding box of each contour that didn't get filtered out, and use the properties to enlarge those parts of the image to the height defined for the large text (making sure to also scale the width accordingly):
x, y, w, h = cv2.boundingRect(cnt)
ww = int(hh / h * w)
src_seg = img[y: y + h, x: x + w]
dst_seg = img_copy[y: y + hh, x: x + ww]
h_seg, w_seg = dst_seg.shape[:2]
dst_seg[:] = cv2.resize(src_seg, (ww, hh))[:h_seg, :w_seg]
Finally, we can use the pytesseract.image_to_data() method to detect the text. Of course, we'll need to threshold the image again:
gray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)
results = pytesseract.image_to_data(thresh)
for b in map(str.split, results.splitlines()[1:]):
if len(b) == 12:
x, y, w, h = map(int, b[6: 10])
cv2.putText(img, b[11], (x, y + h + 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, 0)
cv2.imshow("Result", img)
cv2.waitKey(0)
I have been working with Tesseract for quite some time, so let me clarify something for you. Tesseract is extremely helpful if you're trying to recognize text in documents more than any other computer vision projects. It usually needs a binarized image to get a good output. Therefore, you will always need some image pre-processing.
However, after several trials in the past with all page segmentation modes, I realized that it fails when font size differs on the same line without having a space. Sometimes PSM 6 is helpful if the difference is low, but in your condition, you may try an alternative. If you don't care about the decimals, you may try the following solution:
img = cv2.imread(r'E:\Downloads\Iwzrg.png')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_blur = cv2.GaussianBlur(gray, (3,3),0)
_,thresh = cv2.threshold(img_blur,200,255,cv2.THRESH_BINARY_INV)
# If using a fixed camera
new_img = thresh[0:100, 80:320]
text = pytesseract.image_to_string(new_img, lang='eng', config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789')
OUTPUT: 1227
I would like to recommend applying another image processing method.
Because I am dealing with a dark background, I first invert the image, before converting it to grayscale and thresholding it:
You applied global thresholding and couldn't achieve the desired result.
Then you can apply either adaptive-thresholding or inRange
For the given image, if we apply the inRange threshold:
To be able to recognize the image as accurately as possible we can add a border to the top of the image and resize the image (Optional)
In the OCR section, check if the detected region contains a digit
if text.isdigit():
Then display on the image:
The result is nearly the desired value. Now you can try with the other suggested methods to find the exact value.
The problem is .938 recognized as 235, maybe resizing using different values might improve the result.
Code:
from cv2 import imread, cvtColor, COLOR_BGR2HSV as HSV, inRange, getStructuringElement, resize
from cv2 import imshow, waitKey, MORPH_RECT, dilate, bitwise_and, rectangle, putText
from cv2 import copyMakeBorder as addBorder, BORDER_CONSTANT as CONSTANT, FONT_HERSHEY_SIMPLEX
from numpy import array
from pytesseract import image_to_data, Output
bgr = imread("Iwzrg.png")
resized = resize(bgr, (800, 600), fx=0.75, fy=0.75)
bordered = addBorder(resized, 200, 0, 0, 0, CONSTANT, value=0)
hsv = cvtColor(bordered, HSV)
mask = inRange(hsv, array([0, 0, 250]), array([179, 255, 255]))
kernel = getStructuringElement(MORPH_RECT, (50, 30))
dilated = dilate(mask, kernel, iterations=1)
thresh = 255 - bitwise_and(dilated, mask)
data = image_to_data(thresh, output_type=Output.DICT)
for i in range(0, len(data["text"])):
x = data["left"][i]
y = data["top"][i]
w = data["width"][i]
h = data["height"][i]
text = data["text"][i]
if text.isdigit():
print("Text: {}".format(text))
print("")
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
rectangle(thresh, (x, y), (x + w, y + h), (0, 255, 0), 2)
putText(thresh, text, (x, y - 10), FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)
imshow("", thresh)
waitKey(0)

Image Classification using openCV, feature extraction and model building

I am currently working on a project, where the problem statement is to detect handwritten text from a image of a particular form. As a pre-processing step I have extracted texts in the form of bounding boxes, and I have around 1500 images of texts extracted from the image form, out of which 50 of them are handwritten.
The problem is how do I now use these extracted images to train a classifier model which will classify the images as printed or handwritten text. I have no prior knowledge of Deep learning. Any help will be appreciated. I am uploading the image and the extracted images, as well as the code to extract the texts from the images.
im_ns = cv.imread('~/Image processing/IMG_20180921_111952.png')
gray = cv.cvtColor(im_ns,cv.COLOR_BGR2GRAY)
blurred_g = cv.GaussianBlur(gray,(11,11),0)
ret, th1 = cv.threshold(blurred_g,127,255,cv.THRESH_BINARY)
th2 = cv.adaptiveThreshold(blurred_g,255,cv.ADAPTIVE_THRESH_MEAN_C,cv.THRESH_BINARY,11,2)
th3 = cv.adaptiveThreshold(blurred_g,255,cv.ADAPTIVE_THRESH_GAUSSIAN_C,cv.THRESH_BINARY,11,2)
##Detecting horizontal Lines and removing them
th3_di1 = th3_di.copy()
hor = int(round(th3_di1.shape[1]/30,0))
hor_struc = cv.getStructuringElement(cv.MORPH_RECT,(hor,1))
bw_hor_er = cv.erode(th3_di1,hor_struc,iterations=1)
bw_hor_di = cv.dilate(th3_di1,hor_struc,iterations=1)
for i in range(0,bw_hor_di.shape[0]):
for j in range(0,bw_hor_di.shape[1]):
if bw_hor_di[i,j] == 0:
th3_di1[i,j] = 255
else:
th3_di1[i,j] = th3_di1[i,j]
plt.figure(figsize=(20,25))
plt.imshow(th3_di1,'gray')
# perform a connected component analysis on the thresholded
# image, then initialize a mask to store only the "large"
# components
labels = measure.label(th3_di1, neighbors=4, background=255)
mask = np.zeros(th3_di1.shape, dtype="uint8")
plt.figure(figsize=(30,25))
plt.imshow(labels)
# loop over the unique components
for lab in np.unique(labels):
# if this is the background label, ignore it
if lab == 0:
continue
# otherwise, construct the label mask and count the
# number of pixels
labelMask = np.zeros(th3_di.shape, dtype="uint8")
labelMask[labels == lab] = 255
numPixels = cv.countNonZero(labelMask)
# if the number of pixels in the component is sufficiently
# large, then add it to our mask of "large blobs"
if numPixels > 8:
mask = cv.add(mask, labelMask)
plt.figure(figsize=(30,24))
plt.imshow(mask,'gray')
# find the contours in the mask, then sort them from left to
# right
cnts = cv.findContours(mask.copy(), cv.RETR_EXTERNAL,
cv.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if imutils.is_cv2() else cnts[1]
cnts = contours.sort_contours(cnts)[0]
# loop over the contours to make rectangles for the th3 image with gassian thresholding
for (i, c) in enumerate(cnts):
# draw the bright spot on the image
(x,y,w,h) = cv.boundingRect(c)
#((cX, cY), radius) = cv.minEnclosingCircle(c)
cv.rectangle(th3,(x,y),(x+w,y+h),(0,255),2)
cv.putText(th3, "",(x+w+10,y+h),0,0.3,(0,255,0))
# show the output image
cv.imshow("Image", th3)
cv.waitKey(10000)
cv.destroyAllWindows()
##Extracting the bounding boxes
idx=0
for (i, c) in enumerate(cnts):
# draw the bright spot on the image
idx += 1
x,y,w,h = cv.boundingRect(c)
roi = im_ns[y:y+h,x:x+w]
#((cX, cY), radius) = cv.minEnclosingCircle(c)
#cv.rectangle(im_ns,(x,y),(x+w,y+h),(0,255),2)
cv.imwrite(str(idx)+'.jpg',roi)
Images:

Difficult time trying to do shape recognition for 3D objects

I am trying to make a shape recognition classifier in which if you give an individual picture of an object (from a scene), it would be able to classify (after machine learning) the shape of an object (cylinder, cube, sphere, etc).
Original scene:
Individual objects it will classify:
I attempted to do this using cv2.approxPolyDB with an attempt to classify a cylinder. However, either my implementation isn't good or this wasn't a good choice of an algorithm to choose in the first place, the objects in the shape of cylinders were assigned a approxPolyDB value of 3 or 4.
Perhaps I can threshold and, in general, if given a value of 3 or 4, assume the object is a cylinder, but I feel like it's not the most reliable method for 3D shape classification. I feel like there is a better way to implement this and a better method as opposed to just hardcoding values. I feel like that with this method, it can easily confuse a cylinder with a cube.
Is there any way I can improve my 3D shape recognition program?
Code:
import cv2
import numpy as np
from pyimagesearch import imutils
from PIL import Image
from time import time
def invert_img(img):
img = (255-img)
return img
def threshold(im):
imgray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)
imgray = cv2.medianBlur(imgray,9)
imgray = cv2.Canny(imgray,75,200)
return imgray
def view_all_contours(im, size_min, size_max):
main = np.array([[]])
cnt_target = im.copy()
for c in cnts:
epsilon = 0.1*cv2.arcLength(c,True)
approx = cv2.approxPolyDP(c,epsilon,True)
area = cv2.contourArea(c)
print 'area: ', area
test = im.copy()
# To weed out contours that are too small or big
if area > size_min and area < size_max:
print c[0,0]
print 'approx: ', len(approx)
max_pos = c.max(axis=0)
max_x = max_pos[0,0]
max_y = max_pos[0,1]
min_pos = c.min(axis=0)
min_x = min_pos[0,0]
min_y = min_pos[0,1]
# Load each contour onto image
cv2.drawContours(cnt_target, c, -1,(0,0,255),2)
print 'Found object'
frame_f = test[min_y:max_y , min_x:max_x]
main = np.append(main, approx[None,:][None,:])
thresh = frame_f.copy()
thresh = threshold(thresh)
contours_small, hierarchy = cv2.findContours(thresh.copy(),cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
cnts_small = sorted(contours_small, key = cv2.contourArea, reverse = True)
cv2.drawContours(frame_f, cnts_small, -1,(0,0,255),2)
cv2.imshow('Thresh', thresh)
cv2.imshow('Show Ya', frame_f)
cv2.waitKey(0)
# Uncomment in order to show all rectangles in image
print '---------------------------------------------'
#cv2.drawContours(cnt_target, cnts, -1,(0,255,0),2)
print main.shape
print main
return cnt_target
time_1 = time()
roi = cv2.imread('images/beach_trash_3.jpg')
hsv = cv2.cvtColor(roi,cv2.COLOR_BGR2HSV)
target = cv2.imread('images/beach_trash_3.jpg')
target = imutils.resize(target, height = 400)
hsvt = cv2.cvtColor(target,cv2.COLOR_BGR2HSV)
img_height = target.shape[0]
img_width = target.shape[1]
# calculating object histogram
roihist = cv2.calcHist([hsv],[0, 1], None, [180, 256], [0, 180, 0, 256] )
# normalize histogram and apply backprojection
cv2.normalize(roihist,roihist,0,255,cv2.NORM_MINMAX)
dst = cv2.calcBackProject([hsvt],[0,1],roihist,[0,180,0,256],1)
# Now convolute with circular disc
disc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(5,5))
cv2.filter2D(dst,-1,disc,dst)
# threshold and binary AND
ret,thresh = cv2.threshold(dst,50,255,0)
thresh_one = thresh.copy()
thresh = cv2.merge((thresh,thresh,thresh))
res = cv2.bitwise_and(target,thresh)
# Implementing morphological erosion & dilation
kernel = np.ones((9,9),np.uint8) # (6,6) to get more contours (9,9) to reduce noise
thresh_one = cv2.erode(thresh_one, kernel, iterations = 3)
thresh_one = cv2.dilate(thresh_one, kernel, iterations=2)
# Invert the image
thresh_one = invert_img(thresh_one)
# To show prev img
#res = np.vstack((target,thresh,res))
#cv2.imwrite('res.jpg',res)
#cv2.waitKey(0)
#cv2.imshow('Before contours', thresh_one)
cnt_target = target.copy()
cnt_full = target.copy()
# Code to draw the contours
contours, hierarchy = cv2.findContours(thresh_one.copy(),cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
cnts = sorted(contours, key = cv2.contourArea, reverse = True)
print time() - time_1
size_min = 200
size_max = 5000
cnt_target = view_all_contours(target, size_min, size_max)
cv2.drawContours(cnt_full, cnts, -1,(0,0,255),2)
res = imutils.resize(thresh_one, height = 700)
cv2.imshow('Original image', target)
cv2.imshow('Preprocessed', thresh_one)
cv2.imshow('All contours', cnt_full)
cv2.imshow('Filtered contours', cnt_target)
cv2.waitKey(0)

Impulse, gaussian and salt and pepper noise with OpenCV

I'm studying Image Processing on the famous Gonzales "Digital Image Processing" and talking about image restoration a lot of examples are done with computer-generated noise (gaussian, salt and pepper, etc). In MATLAB there are some built-in functions to do it. What about OpenCV?
As far as I know there are no convenient built in functions like in Matlab. But with only a few lines of code you can create those images yourself.
For example additive gaussian noise:
Mat gaussian_noise = img.clone();
randn(gaussian_noise,128,30);
Salt and pepper noise:
Mat saltpepper_noise = Mat::zeros(img.rows, img.cols,CV_8U);
randu(saltpepper_noise,0,255);
Mat black = saltpepper_noise < 30;
Mat white = saltpepper_noise > 225;
Mat saltpepper_img = img.clone();
saltpepper_img.setTo(255,white);
saltpepper_img.setTo(0,black);
There is function random_noise() from the scikit-image package. It has several builtin noise patterns, such as gaussian, s&p (for salt and pepper noise), possion and speckle.
Below I show an example of how to use this method
from PIL import Image
import numpy as np
from skimage.util import random_noise
im = Image.open("test.jpg")
# convert PIL Image to ndarray
im_arr = np.asarray(im)
# random_noise() method will convert image in [0, 255] to [0, 1.0],
# inherently it use np.random.normal() to create normal distribution
# and adds the generated noised back to image
noise_img = random_noise(im_arr, mode='gaussian', var=0.05**2)
noise_img = (255*noise_img).astype(np.uint8)
img = Image.fromarray(noise_img)
img.show()
There is also a package called imgaug which are dedicated to augment images in various ways. It provides gaussian, poissan and salt&pepper noise augmenter. Here is how you can use it to add noise to image:
from PIL import Image
import numpy as np
from imgaug import augmenters as iaa
def main():
im = Image.open("bg_img.jpg")
im_arr = np.asarray(im)
# gaussian noise
# aug = iaa.AdditiveGaussianNoise(loc=0, scale=0.1*255)
# poisson noise
# aug = iaa.AdditivePoissonNoise(lam=10.0, per_channel=True)
# salt and pepper noise
aug = iaa.SaltAndPepper(p=0.05)
im_arr = aug.augment_image(im_arr)
im = Image.fromarray(im_arr).convert('RGB')
im.show()
if __name__ == "__main__":
main()
Simple Function to add Gaussian, Salt-pepper speckle and poisson noise to an image
Parameters
----------
image : ndarray
Input image data. Will be converted to float.
mode : str
One of the following strings, selecting the type of noise to add:
'gauss' Gaussian-distributed additive noise.
'poisson' Poisson-distributed noise generated from the data.
's&p' Replaces random pixels with 0 or 1.
'speckle' Multiplicative noise using out = image + n*image,where
n,is uniform noise with specified mean & variance.
import numpy as np
import os
import cv2
def noisy(noise_typ,image):
if noise_typ == "gauss":
row,col,ch= image.shape
mean = 0
#var = 0.1
#sigma = var**0.5
gauss = np.random.normal(mean,1,(row,col,ch))
gauss = gauss.reshape(row,col,ch)
noisy = image + gauss
return noisy
elif noise_typ == "s&p":
row,col,ch = image.shape
s_vs_p = 0.5
amount = 0.004
out = image
# Salt mode
num_salt = np.ceil(amount * image.size * s_vs_p)
coords = [np.random.randint(0, i - 1, int(num_salt))
for i in image.shape]
out[coords] = 1
# Pepper mode
num_pepper = np.ceil(amount* image.size * (1. - s_vs_p))
coords = [np.random.randint(0, i - 1, int(num_pepper))
for i in image.shape]
out[coords] = 0
return out
elif noise_typ == "poisson":
vals = len(np.unique(image))
vals = 2 ** np.ceil(np.log2(vals))
noisy = np.random.poisson(image * vals) / float(vals)
return noisy
elif noise_typ =="speckle":
row,col,ch = image.shape
gauss = np.random.randn(row,col,ch)
gauss = gauss.reshape(row,col,ch)
noisy = image + image * gauss
return noisy
"Salt & Pepper" noise can be added in a quite simple fashion using NumPy matrix operations.
def add_salt_and_pepper(gb, prob):
'''Adds "Salt & Pepper" noise to an image.
gb: should be one-channel image with pixels in [0, 1] range
prob: probability (threshold) that controls level of noise'''
rnd = np.random.rand(gb.shape[0], gb.shape[1])
noisy = gb.copy()
noisy[rnd < prob] = 0
noisy[rnd > 1 - prob] = 1
return noisy
# Adding noise to the image
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
img = cv2.imread('./fruit.png',0)
im = np.zeros(img.shape, np.uint8) # do not use original image it overwrites the image
mean = 0
sigma = 10
cv2.randn(im,mean,sigma) # create the random distribution
Fruit_Noise = cv2.add(img, im) # add the noise to the original image
plt.imshow(Fruit_Noise, cmap='gray')
The values of mean and sigma can be altered to bring about a specific change in noise like gaussian or pepper-salt noise etc.
You can use either randn or randu according to the need. Have a look at the documentation: https://docs.opencv.org/2.4/modules/core/doc/operations_on_arrays.html#cv2.randu
I made some change of #Shubham Pachori 's code. When reading a image into numpy arrary, the default dtype is uint8, which can cause wrapping when adding noise onto the image.
import numpy as np
from PIL import Image
"""
image: read through PIL.Image.open('path')
sigma: variance of gaussian noise
factor: the bigger this value is, the more noisy is the poisson_noised image
##IMPORTANT: when reading a image into numpy arrary, the default dtype is uint8,
which can cause wrapping when adding noise onto the image.
E.g, example = np.array([128,240,255], dtype='uint8')
example + 50 = np.array([178,44,49], dtype='uint8')
Transfer np.array to dtype='int16' can solve this problem.
"""
def gaussian_noise(image, sigma):
img = np.array(image)
noise = np.random.randn(img.shape[0], img.shape[1], img.shape[2])
img = img.astype('int16')
img_noise = img + noise * sigma
img_noise = np.clip(img_noise, 0, 255)
img_noise = img_noise.astype('uint8')
return Image.fromarray(img_noise)
def poisson_noise(image, factor):
factor = 1 / factor
img = np.array(image)
img = img.astype('int16')
img_noise = np.random.poisson(img * factor) / float(factor)
np.clip(img_noise, 0, 255, img_noise)
img_noise = img_noise.astype('uint8')
return Image.fromarray(img_noise)
http://scikit-image.org/docs/dev/api/skimage.util.html#skimage.util.random_noise
skimage.util.random_noise(image, mode='gaussian', seed=None, clip=True, **kwargs)
#Adding noise
[m,n]=img.shape
saltpepper_noise=zeros((m, n));
saltpepper_noise=rand(m,n); #creates a uniform random variable from 0 to 1
for i in range(0,m):
for j in range(0,n):
if saltpepper_noise[i,j]<=0.5:
saltpepper_noise[i,j]=0
else:
saltpepper_noise[i,j]=255
def add_salt_noise(src, ratio: float = 0.05, noise: list = [0, 0, 0]):
dst = src.copy()
import random
shuffle_dict = {}
i = 0
while i < (int(dst.shape[0]*dst.shape[1] * ratio)):
x, y = random.randint(0, dst.shape[0] - 1), random.randint(0, dst.shape[1] - 1)
if (x, y) in shuffle_dict:
continue
else:
dst[x, y] = noise
shuffle_dict[(x, y)] = 0
i += 1
return dst
although there is no built-in functions like in matlab
imnoise(image,noiseType,NoiseLevel) but we can easily add required amount random
valued impulse noise or salt and pepper into an image manually.
to add random valued impulse noise.
import random as r
def addRvinGray(image,n): # add random valued impulse noise in grayscale
'''parameters:
image: type=numpy array. input image in which you want add noise.
n: noise level (in percentage)'''
k=0 # counter variable
ih=image.shape[0]
iw=image.shape[1]
noisypixels=(ih*iw*n)/100 # here we calculate the number of pixels to be altered.
for i in range(ih*iw):
if k<noisypixels:
image[r.randrange(0,ih)][r.randrange(0,iw)]=r.randrange(0,256) #access random pixel in the image gives random intensity (0-255)
k+=1
else:
break
return image
to add salt and pepper noise
def addSaltGray(image,n): #add salt-&-pepper noise in grayscale image
k=0
salt=True
ih=image.shape[0]
iw=image.shape[1]
noisypixels=(ih*iw*n)/100
for i in range(ih*iw):
if k<noisypixels: #keep track of noise level
if salt==True:
image[r.randrange(0,ih)][r.randrange(0,iw)]=255
salt=False
else:
image[r.randrange(0,ih)][r.randrange(0,iw)]=0
salt=True
k+=1
else:
break
return image
Note: for color images: first split image in to three or four channels depending on the input image using opencv function:
(B, G, R) = cv2.split(image)
(B, G, R, A) = cv2.split(image)
after spliting perform the same operations on all channels.
at the end merge all the channels:
merged = cv2.merge([B, G, R])
return merged

Resources