Related
TL;DR I'm using:
adaptive thresholding
segmenting by keys (width/height ratio) - see green boxes in image result
psm 10 to treat each key as a character
but it fails to recognize some keys, falsely identifies others or identifies 2 for 1 char (see the L character in the image result, it's an L and P), etc.
Note: I cropped the image and re-ran the results to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc).
I just want it to recognize the alphabet keys. Ultimately I will want it to work for realtime video.
config:
'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
I've tried scaling the image differently, scaling the individual key segments, using opening/closing/etc but it doesn't recognize all the keys.
original image
image result
Update: new results if I make the image straighter (bird's eye) and remove the whitelisting, it manages to detect all for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this and how could I make this adaptive enough for a dynamic video when it is so sensitive to these conditions?
Code:
import pytesseract
import numpy as np
try:
from PIL import Image
except ImportError:
import Image
import cv2
from tqdm import tqdm
from collections import defaultdict
def get_missing_chars(dict):
capital_alphabet = [chr(ascii) for ascii in range(65, 91)]
return [let for let in capital_alphabet if let not in dict]
def draw_box_and_char(img, contour_dims, c, box_col, text_col):
x, y, w, h = contour_dims
top_left = (x, y)
bot_right = (x + w, y+h)
font_offset = 3
text_pos = (x+h//2+12, y+h-font_offset)
img_copy = img.copy()
cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
cv2.putText(img_copy, c, text_pos, cv2.FONT_HERSHEY_SIMPLEX, fontScale=.5, color=text_col, thickness=1, lineType=cv2.LINE_AA)
return img_copy
def detect_keys(img):
scaling = .25
img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
print("img shape", img.shape)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
ratio_min = 0.7
area_min = 1000
nbrhood_size = 1001
bias = 2
# adapt to different lighting
bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
cv2.THRESH_BINARY_INV, nbrhood_size, bias)
items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = items[0] if len(items) == 2 else items[1]
key_contours = []
for c in contours:
x, y, w, h = cv2.boundingRect(c)
ratio = h/w
area = cv2.contourArea(c)
# square-like ratio, try to get character
if ratio > ratio_min and area > area_min:
key_contours.append(c)
detected = defaultdict(int)
n_kept = 0
img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
let_to_contour = {}
n_contours = len(key_contours)
# offset to get smaller square within the key segment for easier char recognition
offset = 10
show_each_char = False
for _, c in tqdm(enumerate(key_contours), total=n_contours):
x, y, w, h = cv2.boundingRect(c)
ratio = h/w
area = cv2.contourArea(c)
base = np.zeros(bin_img.shape, dtype=np.uint8)
base.fill(255)
n_kept += 1
new_y = y+offset
new_x = x+offset
new_h = h-2*offset
new_w = w-2*offset
base[new_y:new_y+new_h, new_x:new_x+new_w] = bin_img[new_y:new_y+new_h, new_x:new_x+new_w]
segment = cv2.bitwise_not(base)
# try scaling up individual keys
# scaling = 2
# segment = cv2.resize(segment, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_CUBIC)
# psm 10: treats the segment as a single character
custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
conf = d['conf']
c = d['text'][-1]
if c:
# sometimes recognizes multiple keys even though there is only 1
for sub_c in c:
# save character and contour to draw on image and show bounds/detection
if sub_c not in let_to_contour or (sub_c in let_to_contour and conf > let_to_contour[sub_c]['conf']):
let_to_contour[sub_c] = {'conf': conf, 'cont': (new_x, new_y, new_w, new_h)}
else:
c = "?"
text_col = (0, 0, 255)
if show_each_char:
contour_dims = (new_x, new_y, new_w, new_h)
box_col = (0, 255, 0)
text_col = (0, 0, 0)
segment_with_boxes = draw_box_and_char(segment, contour_dims, c, box_col, text_col)
cv2.imshow('segment', segment_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()
# draw boxes around recognized keys
for c, data in let_to_contour.items():
box_col = (0, 255, 0)
text_col = (0, 0, 0)
img_copy = draw_box_and_char(img_copy, data['cont'], c, box_col, text_col)
detected = {k: 1 for k in let_to_contour}
for det in let_to_contour:
print(det, let_to_contour[det])
print("total detected: ", let_to_contour.keys())
missing = get_missing_chars(detected)
print(f"n_missing: {len(missing)}")
print(f"chars missing: {missing}")
return img_copy
if __name__ == "__main__":
img_file = "keyboard.jpg"
img = cv2.imread(img_file)
img_with_detected_keys = detect_keys(img)
cv2.imshow("detected", img_with_detected_keys)
cv2.waitKey(0)
cv2.destroyAllWindows()
I am having a problem with "Eye blinking detection" using Python, OpenCV, and dlib. I am using Jupyter notebook.
The following code uses the shape_predictor_68_face_landmarks.dat library which plot 68 predefine points on a face.
import cv2
#import numpy as np
import dlib
from math import hypot
cap = cv2.VideoCapture(0)
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("C:\\Users\\Asirajdin\\dev\\shape_predictor_68_face_landmarks
\\shape_predictor_68_face_landmarks.dat")
def midpoint (p1, p2):
return int ((p1.x + p2.x)/2), int ((p1.y + p2.y)/2)
font = cv2.FONT_HERSHEY_SIMPLEX
def get_blinking_ratio (eye_points, facial_landmarks):
left_point = (facial_landmarks.part(eye_points[0]).x, facial_landmarks.part(eye_points[0]).y)
right_point = (facial_landmarks.part(eye_points[3]).x, facial_landmarks.part(eye_points[3]).y)
centre_top = midpoint (facial_landmarks.part(eye_points[1]), facial_landmarks.part(eye_points[2]))
centre_bottom = midpoint (facial_landmarks.part(eye_points[5]),
facial_landmarks.part(eye_points[4]))
hor_line = cv2.line (frame, left_point, right_point, (0, 255,0), 2)
ver_line = cv2.line (frame, centre_top, centre_bottom, (0, 255,0), 2)
hor_line_length = hypot ((left_point[0] - right_point[0]), (left_point[1] - right_point[1]))
ver_line_length = hypot((centre_top[0] - centre_bottom[0]), (centre_top[1] - centre_bottom[1]))
ratio = hor_line_length/ver_line_length
while(True):
_, frame = cap.read()
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
faces = detector(gray)
for face in faces:
#x, y = face.left(), face.top()
#x1, y1 = face.right(), face.bottom()
#cv2.rectangle(frame,(x, y), (x1, y1), (0, 255,0), 2)
landmarks = predictor(gray, face)
left_eye_ratio = get_blinking_ratio([36, 37, 38, 39, 40, 41], landmarks)
right_eye_ratio = get_blinking_ratio([42, 43, 44, 45, 46, 47], landmarks)
blinking_ratio = ((left_eye_ratio + right_eye_ratio) / 2)
if blinking_ratio > 5.7:
cv2.putText(frame, "BLINKING", (50, 150), font, 5, (255, 0, 0))
cv2.imshow("Frame", frame)
key = cv2.waitKey(1)
if key == 27:
break
# when everything is done then release the capture
cap.release()
cv2.destroyAllWindows()
Error:
TypeError Traceback (most recent call last)
<ipython-input-2-ca92d06b8ea8> in <module>
45 left_eye_ratio = get_blinking_ratio([36, 37, 38, 39, 40, 41], landmarks)
46 right_eye_ratio = get_blinking_ratio([42, 43, 44, 45, 46, 47], landmarks)
---> 47 blinking_ratio = ((left_eye_ratio + right_eye_ratio) / 2)
48
49 if blinking_ratio > 5.7:
TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'
I have found the solution to my post. I realised that the hor_line_length and the ver_line_length returns NoneType, then I tried to convert the ratio to integer. I returned the ratio under get_blinking_ratio as int i.e
ratio = hor_line_length/ver_line_length
return int (ratio)
I am novice in python, though the code is working but this might not be the best answer. Thanks.
your function get_blinking_ratio does not return anything. in python that means it implicitly returns None.
that's how the None values show up in that division.
after your function there are some lines that seem to deal with ratios. did you intend for those to go inside the function? then you need to indent them. they still don't contain a return statement, which is needed for a function to return anything.
I would like to implement a 2D LSTM as in this paper, specifically I would like to do so dynamically, so using tf.while. In brief this network works as follows.
order the pixels in an image so that pixel i, j -> i * width + j
run a 2D-LSTM over this sequence
The difference between a 2D and regular LSTM is we have a recurrent connection between the previous element in the sequence and the pixel directly above the current pixel, so at pixel i,j are connections to i - 1, j and i, j - 1.
What I have done
I have tried to do this using tf.while where in each iteration of the loop I accumulate the activations and cell states into a tensor whose shape I allow to vary. This is what the following block of code tries to do.
def single_lstm_layer(inputs, height, width, units, direction = 'tl'):
with tf.variable_scope(direction) as scope:
#Get 2D lstm cell
cell = lstm_cell
#position in sequence
row, col = tf.to_int32(0), tf.to_int32(0)
#use for when i - 1 < 0 or j - 1 < 0
zero_state = tf.fill([1, units], 0.0)
#get first activation and cell_state
output, state = cell(inputs.read(row * width + col), zero_state, zero_state, zero_state, zero_state)
#these are currently of shape (1, units) will ultimately be of shape
#(height * width, untis)
activations = output
cell_states = state
col += 1
with tf.variable_scope(direction, reuse = True) as scope:
def loop_fn(activations, cell_states, row, col):
#Read next input in sequence
i = inputs.read(row * width + col)
#if we are not in the first row then we want to get the activation/cell_state
#above us. Otherwise use zero state.
hidden_state_t = tf.cond(tf.greater_equal(row - 1, 0),
lambda:tf.gather(activations, [(row - 1) * (width) + col]),
lambda:tf.identity(zero_state))
cell_state_t = tf.cond(tf.greater_equal(row - 1, 0),
lambda:tf.gather(cell_states, [(row - 1) * (width) + col]),
lambda:tf.identity(zero_state))
#if we are not in the first col then we want to get the activation/cell_state
#left of us. Otherwise use zero state.
hidden_state_l = tf.cond(tf.greater_equal(col - 1, 0),
lambda:tf.gather(activations, [row * (width) + col - 1]),
lambda:tf.identity(zero_state))
cell_state_l = tf.cond(tf.greater_equal(col - 1, 0),
lambda:tf.gather(cell_states, [row * (width) + col - 1]),
lambda:tf.identity(zero_state))
#Using previous activations/cell_states get current activation/cell_state
output, state = cell(i, hidden_state_l, hidden_state_t, cell_state_l, cell_state_t)
#Append to bottom, will increase number of rows by 1
activations = tf.concat(0, [activations, output])
cell_states = tf.concat(0, [cell_states, state])
#move to next item in sequence
col = tf.cond(tf.equal(col, width - 1), lambda:tf.mul(col, 0), lambda:tf.add(col, 1))
row = tf.cond(tf.equal(col, 0), lambda:tf.add(row, 1), lambda:tf.identity(row))
return activations, cell_states, row, col,
row, col = tf.to_int32(0), tf.constant(1)
activations, cell_states, _, _ = tf.while_loop(
cond = lambda activations, cell_states, row, col: tf.logical_and(tf.less_equal(row , (height - 1)), tf.less_equal(col, width -1)) ,
body = loop_fn,
loop_vars = (activations,
cell_states,
row,
col),
shape_invariants = (tf.TensorShape((None, units)),
tf.TensorShape((None, units)),
tf.TensorShape([]),
tf.TensorShape([]),
),
)
#Return activations with shape [height, width, units]
return tf.pack(tf.split(0, height, activations))
This works, at least in the forward direction. That is to say if I look at what is returned in a session then I get what I want which is a 3D tensor, call it T, of shape [height, width, units] where T[i,j,:] contains the activation of the LSTM cell at input i, j.
I then would like to classify each pixel and for this purpose I conv2D across T then reshape the result into [height * width, num_labels] and construct the cross entropy loss.
T = tf.nn.conv2d(T, W, strides = [1, 1, 1, 1], padding = 'VALID')
T = tf.reshape(T, [height * width, num_labels])
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels = tf.reshape(labels, [height * width, num_labels]),
logits = T)
)
optimizer = tf.train.AdagradOptimizer(0.01).minimize(loss)
The problem
However now when I try with an image which 28 x 28 and 32 units
sess.run(optimizer, feed_dict = feed_dict)
I get the following error
File "Assignment2/train_model.py", line 52, in <module>
train_models()
File "/Assignment2/train_model.py", line 12, in train_models
image, out, labels, optomizer, accuracy, prediction, ac = build_graph(28, 28)
File "/Assignment2/multidimensional.py", line 101, in build_graph
optimizer = tf.train.AdagradOptimizer(0.01).minimize(loss)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 196, in minimize
grad_loss=grad_loss)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 253, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py", line 491, in gradients
in_grad.set_shape(t_in.get_shape())
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 408, in set_shape
self._shape = self._shape.merge_with(shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/tensor_shape.py", line 579, in merge_with
(self, other))
ValueError: Shapes (784, 32) and (1, 32) are not compatible
I think this is a problem with calculating the gradients resulting from the tf.while loop but I am pretty lost at this point.
Please help me to rectify the errors. This is an Opencv feature extraction code.
from __future__ import division
import numpy as np
import cv2
ESC=27
camera = cv2.VideoCapture(0)
orb = cv2.ORB()
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
imgTrainColor=cv2.imread('train.jpg')
imgTrainGray = cv2.cvtColor(imgTrainColor, cv2.COLOR_BGR2GRAY)
kpTrain = orb.detect(imgTrainGray,None)
kpTrain, desTrain = orb.compute(imgTrainGray, kpTrain)
firsttime=True
while True:
ret, imgCamColor = camera.read()
imgCamGray = cv2.cvtColor(imgCamColor, cv2.COLOR_BGR2GRAY)
kpCam = orb.detect(imgCamGray,None)
kpCam, desCam = orb.compute(imgCamGray, kpCam)
matches = bf.match(desCam,desTrain)
dist = [m.distance for m in matches]
thres_dist = (sum(dist) / len(dist)) * 0.5
matches = [m for m in matches if m.distance < thres_dist]
if firsttime==True:
h1, w1 = imgCamColor.shape[:2]
h2, w2 = imgTrainColor.shape[:2]
nWidth = w1+w2
nHeight = max(h1, h2)
hdif = (h1-h2)/2
firsttime=False
result = np.zeros((nHeight, nWidth, 3), np.uint8)
result[hdif:hdif+h2, :w2] = imgTrainColor
result[:h1, w2:w1+w2] = imgCamColor
for i in range(len(matches)):
pt_a=(int(kpTrain[matches[i].trainIdx].pt[0]), int(kpTrain[matches[i].trainIdx].pt[1]+hdif))
pt_b=(int(kpCam[matches[i].queryIdx].pt[0]+w2), int(kpCam[matches[i].queryIdx].pt[1]))
cv2.line(result, pt_a, pt_b, (255, 0, 0))
cv2.imshow('Camara', result)
key = cv2.waitKey(30)
if key == ESC:
break
cv2.destroyAllWindows()
camera.release()
ERRORS APPEARING:
Traceback (most recent call last):
File "sift.py", line 39, in
result[hdif:hdif+h2, :w2] = imgTrainColor
ValueError: could not broadcast input array from shape (700,227,3) into shape (0,227,3)
Without digging through your code in detail
result[hdif:hdif+h2, :w2] = imgTrainColor
... from shape (700,227,3) into shape (0,227,3)
I duduce that imgTrainColor is 3d with shape (700,227,3).
result must has (3,) last dimension; the :w2 must be slicing 227 vales. But the hdif:hdif+h2 is slicing 0, probably because h2 is 0.
In other words, you are trying to put the imgTrainColor values into a block of result that is too small.
Can I leave to you to figure out why h2 is wrong? Another possibility is the hdif is too large (>700). You may need to print those indexing values just before this error.
Oh, and clean up the indentation.
I'm trying to use Pybrain to predict sequences of characters belonging to the Reber grammar.
Concretely what I'm doing is generating strings using the Reber grammar graph (you can check it here : http://www.felixgers.de/papers/phd.pdf page 22). An example of such string could be BPVVE. I want my neural network to learn the underlying rules of the grammar. For each of these string I create a sequence that would typically look like this :
[B, T, S, X, P, V, E,] , [B, T, S, X, P, V, E,]
B -> value = [1, 0, 0, 0, 0, 0, 0,] , target = [0, 0, 0, 0, 1, 0, 0,]
P -> value = [0, 0, 0, 0, 1, 0, 0,] , target = [0, 0, 0, 0, 0, 1, 0,]
V -> value = [0, 0, 0, 0, 0, 1, 0,] , target = [0, 0, 0, 0, 0, 1, 0,]
V -> value = [0, 0, 0, 0, 0, 1, 0,] , target = [0, 0, 0, 0, 0, 0, 1,]
E -> E is ignored for now because it marks the end
as you can see the value is just a 7-d vector representing the current letter and the target is the next letter in the Reber word.
Here is the code I'm trying to run :
#!/usr/bin/python
import reberGrammar as reber
import random as rnd
from pylab import *
from pybrain.supervised import RPropMinusTrainer
from pybrain.supervised import BackpropTrainer
from pybrain.datasets import SequenceClassificationDataSet
from pybrain.structure.modules import LSTMLayer, SoftmaxLayer
from pybrain.tools.validation import testOnSequenceData
from pybrain.tools.shortcuts import buildNetwork
def reberToListInt(word): #e.g. "BPVVE" -> [0,4,3,3,5]
out = [None]*len(word)
for i,l in enumerate(word):
if l == 'B':
out[i] = 0
elif l == 'T':
out[i] = 1
elif l == 'S':
out[i] = 2
elif l == 'V':
out[i] = 3
elif l == 'P':
out[i] = 4
elif l == 'E':
out[i] = 5
else :
out[i] = 6
return out
def buildReberDataSet(numSample):
"""Generate a 7 class dataset"""
reberLexicon = reber.ReberGrammarLexicon(numSample)
DS = SequenceClassificationDataSet(7, 7, nb_classes=7)
for rw in reberLexicon.lexicon:
DS.newSequence()
rw2 = reberToListInt(rw)
for i in range(len(rw2)-1): #inserting one letter at a time
inpt = outpt = [0.0]*7
inpt[rw2[i]]=1.0
outpt[rw2[i+1]]=1.0
DS.addSample(inpt,outpt)
return DS
def printDataSet(DS, numLines): #just to print some stat
print "\t############"
print "Number of sequences: ",DS.getNumSequences()
print "Input and output dimensions: ", DS.indim,"\t", DS.outdim
print "\n"
for i in range(numLines):
for inp, target in DS.getSequenceIterator(i):
print inp,
print "\n"
print "\t#############"
'''Dataset creation / split into training and test sets'''
fullDS = buildReberDataSet(700)
tstdata, trndata = fullDS.splitWithProportion( 0.25 )
trndata._convertToOneOfMany( bounds=[0.,1.])
tstdata._convertToOneOfMany( bounds=[0.,1.])
#printDataSet(trndata,2)
'''Network setup / training'''
rnn = buildNetwork( trndata.indim, 7, trndata.outdim, hiddenclass=LSTMLayer, outclass=SoftmaxLayer, outputbias=False, recurrent=True)
trainer = RPropMinusTrainer( rnn, dataset=trndata, verbose=True )
#trainer = BackpropTrainer( rnn, dataset=trndata, verbose=True, momentum=0.9, learningrate=0.5 )
trainError=[]
testError =[]
#errors = trainer.trainUntilConvergence()
for i in range(9):
trainer.trainEpochs( 2 )
trainError.append(100. * (1.0-testOnSequenceData(rnn, trndata)))
testError.append(100. * (1.0-testOnSequenceData(rnn, tstdata)))
print "train error: %5.2f%%" % trainError[i], ", test error: %5.2f%%" % testError[i]
plot(trainError)
hold(True)
plot(testError)
show()
I fail to train this net. The errors are fluctuating a lot and there is no real convergence. I would really appreciate some advises on this.
Here is the code I'm using to generate Reber strings :
#!/usr/bin/python
import random as rnd
class ReberGrammarLexicon(object):
lexicon = set() #contain Reber words
graph = [ [(1,'T'), (5,'P')], \
[(1, 'S'), (2, 'X')], \
[(3,'S') ,(5, 'X')], \
[(6, 'E')], \
[(3, 'V'),(2, 'P')], \
[(4, 'V'), (5, 'T')] ] #store the graph
def __init__(self, num, maxSize = 1000): #fill Lexicon with num words
self.maxSize = maxSize
if maxSize < 5:
raise NameError('maxSize too small, require maxSize > 4')
while len(self.lexicon) < num:
word = self.generateWord()
if word != None:
self.lexicon.add(word)
def generateWord(self): #generate one word
c = 2
currentEdge = 0
word = 'B'
while c <= self.maxSize:
inc = rnd.randint(0,len(self.graph[currentEdge])-1)
nextEdge = self.graph[currentEdge][inc][0]
word += self.graph[currentEdge][inc][1]
currentEdge = nextEdge
if currentEdge == 6 :
break
c+=1
if c > self.maxSize :
return None
return word
Thanks,
Best