Emgu CV - Anisotropic Diffusion - opencv

Can anybody guide me to some existing implementations of anisotropic diffusion, preferably the perona-malik diffusion?

translate the following MATLAB code :
% pm2.m - Anisotropic Diffusion routines
function ZN = pm2(ZN,K,iterate);
[m,n] = size(ZN);
% lambda = 0.250;
lambda = .025;
%K=16;
rowC = [1:m]; rowN = [1 1:m-1]; rowS = [2:m m];
colC = [1:n]; colE = [2:n n]; colW = [1 1:n-1];
result_save=0;
for i = 1:iterate,
%i;
% result=PSNR(Z,ZN);
% if result>result_save
% result_save=result;
% else
% break;
% end
deltaN = ZN(rowN,colC) - ZN(rowC,colC);
deltaS = ZN(rowS,colC) - ZN(rowC,colC);
deltaE = ZN(rowC,colE) - ZN(rowC,colC);
deltaW = ZN(rowC,colW) - ZN(rowC,colC);
% deltaN = deltaN .*abs(deltaN<K);
% deltaS = deltaS .*abs(deltaS<K);
% deltaE = deltaE .*abs(deltaE<K);
% deltaW = deltaW .*abs(deltaW<K);
fluxN = deltaN .* exp(-((abs(deltaN) ./ K).^2) );
fluxS = deltaS .* exp(-((abs(deltaS) ./ K).^2) );
fluxE = deltaE .* exp(-((abs(deltaE) ./ K).^2) );
fluxW = deltaW .* exp(-((abs(deltaW) ./ K).^2) );
ZN = ZN + lambda*(fluxN +fluxS + fluxE + fluxW);
%ZN=max(0,ZN);ZN=min(255,ZN);
end
the code is not mine and has been taken from: http://www.csee.wvu.edu/~xinl/code/pm2.m

OpenCV Implementation (It needs 3 channel image):
from cv2.ximgproc import anisotropicDiffusion
ultrasound_ad_cv2 = anisotropicDiffusion(im,0.075 ,80, 100)
Juxtapose comparison
From scratch in Python: (For grayscale image only)
import scipy.ndimage.filters as flt
import numpy as np
import warnings
def anisodiff(img,niter=1,kappa=50,gamma=0.1,step=(1.,1.),sigma=0, option=1,ploton=False):
"""
Anisotropic diffusion.
Usage:
imgout = anisodiff(im, niter, kappa, gamma, option)
Arguments:
img - input image
niter - number of iterations
kappa - conduction coefficient 20-100 ?
gamma - max value of .25 for stability
step - tuple, the distance between adjacent pixels in (y,x)
option - 1 Perona Malik diffusion equation No 1
2 Perona Malik diffusion equation No 2
ploton - if True, the image will be plotted on every iteration
Returns:
imgout - diffused image.
kappa controls conduction as a function of gradient. If kappa is low
small intensity gradients are able to block conduction and hence diffusion
across step edges. A large value reduces the influence of intensity
gradients on conduction.
gamma controls speed of diffusion (you usually want it at a maximum of
0.25)
step is used to scale the gradients in case the spacing between adjacent
pixels differs in the x and y axes
Diffusion equation 1 favours high contrast edges over low contrast ones.
Diffusion equation 2 favours wide regions over smaller ones.
"""
# ...you could always diffuse each color channel independently if you
# really want
if img.ndim == 3:
warnings.warn("Only grayscale images allowed, converting to 2D matrix")
img = img.mean(2)
# initialize output array
img = img.astype('float32')
imgout = img.copy()
# initialize some internal variables
deltaS = np.zeros_like(imgout)
deltaE = deltaS.copy()
NS = deltaS.copy()
EW = deltaS.copy()
gS = np.ones_like(imgout)
gE = gS.copy()
# create the plot figure, if requested
if ploton:
import pylab as pl
from time import sleep
fig = pl.figure(figsize=(20,5.5),num="Anisotropic diffusion")
ax1,ax2 = fig.add_subplot(1,2,1),fig.add_subplot(1,2,2)
ax1.imshow(img,interpolation='nearest')
ih = ax2.imshow(imgout,interpolation='nearest',animated=True)
ax1.set_title("Original image")
ax2.set_title("Iteration 0")
fig.canvas.draw()
for ii in np.arange(1,niter):
# calculate the diffs
deltaS[:-1,: ] = np.diff(imgout,axis=0)
deltaE[: ,:-1] = np.diff(imgout,axis=1)
if 0<sigma:
deltaSf=flt.gaussian_filter(deltaS,sigma);
deltaEf=flt.gaussian_filter(deltaE,sigma);
else:
deltaSf=deltaS;
deltaEf=deltaE;
# conduction gradients (only need to compute one per dim!)
if option == 1:
gS = np.exp(-(deltaSf/kappa)**2.)/step[0]
gE = np.exp(-(deltaEf/kappa)**2.)/step[1]
elif option == 2:
gS = 1./(1.+(deltaSf/kappa)**2.)/step[0]
gE = 1./(1.+(deltaEf/kappa)**2.)/step[1]
# update matrices
E = gE*deltaE
S = gS*deltaS
# subtract a copy that has been shifted 'North/West' by one
# pixel. don't as questions. just do it. trust me.
NS[:] = S
EW[:] = E
NS[1:,:] -= S[:-1,:]
EW[:,1:] -= E[:,:-1]
# update the image
imgout += gamma*(NS+EW)
if ploton:
iterstring = "Iteration %i" %(ii+1)
ih.set_data(imgout)
ax2.set_title(iterstring)
fig.canvas.draw()
# sleep(0.01)
return imgout
Usage
:
#anisodiff(img,niter=1,kappa=50,gamma=0.1,step=(1.,1.),sigma=0, option=1,ploton=False)
us_im_ad = anisodiff(ultrasound,100,80,0.075,(1,1),2.5,1)
Source
Juxtapose comparison

Related

Linefitting how to deal with continuous values?

I'm trying to fit a line using quadratic poly, but because the fit results in continuous values, the integer conversion (for CartesianIndex) rounds it off, and I loose data at that pixel.
I tried the method
here. So I get new y values as
using Images, Polynomials, Plots,ImageView
img = load("jTjYb.png")
img = Gray.(img)
img = img[end:-1:1, :]
nodes = findall(img.>0)
xdata = map(p->p[2], nodes)
ydata = map(p->p[1], nodes)
f = fit(xdata, ydata, 2)
ydata_new .= round.(Int, f.(xdata)
new_line_fitted_img=zeros(size(img))
new_line_fitted_img[xdata,ydata_new].=1
imshow(new_line_fitted_img)
which results in chopped line as below
whereas I was expecting it to be continuous line as it was in pre-processing
Do you expect the following:
Raw Image
Fitted Polynomial
Superposition
enter image description here
enter image description here
enter image description here
Code:
using Images, Polynomials
img = load("img.png");
img = Gray.(img)
fx(data, dCoef, cCoef, bCoef, aCoef) = #. data^3 *aCoef + data^2 *bCoef + data*cCoef + dCoef;
function fit_poly(img::Array{<:Gray, 2})
img = img[end:-1:1, :]
nodes = findall(img.>0)
xdata = map(p->p[2], nodes)
ydata = map(p->p[1], nodes)
f = fit(xdata, ydata, 3)
xdt = unique(xdata)
xdt, fx(xdt, f.coeffs...)
end;
function draw_poly!(X, y)
the_min = minimum(y)
if the_min<0
y .-= the_min - 1
end
initialized_img = Gray.(zeros(maximum(X), maximum(y)))
initialized_img[CartesianIndex.(X, y)] .= 1
dif = diff(y)
for i in eachindex(dif)
the_dif = dif[i]
if abs(the_dif) >= 2
segment = the_dif รท 2
initialized_img[i, y[i]:y[i]+segment] .= 1
initialized_img[i+1, y[i]+segment+1:y[i+1]-1] .= 1
end
end
rotl90(initialized_img)
end;
X, y = fit_poly(img);
y = convert(Vector{Int64}, round.(y));
draw_poly!(X, y)

Why my feature map seems incorrect when the prediction of the class is correct

from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)])
image_path = './data/test_images/anemone.jpg'
image = Image.open(image_path).convert('RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)
# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
resnet50_model.eval()
# Creating feature extractor (Detailed example here: https://pytorch.org/blog/FX-feature-extraction-torchvision/)
feature_extractor = create_feature_extractor(resnet50_model,
return_nodes=['layer4.2.conv3', 'fc'])
# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])
# Transforming last conv output to numpy and reshaping it so that the channels would be last
last_conv_output = torch.squeeze(out['layer4.2.conv3'])
last_conv_output = torch.reshape(last_conv_output, (7, 7, -1))
last_conv_output = last_conv_output.detach().numpy()
last_conv_output = last_conv_output.astype(np.uint8)
Calculating the upscale factors for last conv output
width_factor = int(image.size[0] / last_conv_output.shape[0])
height_factor = int(image.size[1] / last_conv_output.shape[1])
# Getting the shapes of the last conv output
last_conv_w, last_conv_h, n_channels = last_conv_output.shape
# Calculate the
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor
# Upscaling the last_conv_output so that it could be "masked" with original image
upsampled_last_conv_output = np.zeros((upscaled_h, upscaled_w, n_channels))
upsampled_last_conv_output = []
for x in range(0, n_channels, 512):
upsampled_last_conv_output.append(cv2.resize(last_conv_output[:, :, x:x+512], (upscaled_w, upscaled_h), cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_last_conv_output, axis=2)
# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]
# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, 2048))
heat_map = np.dot(upsampled_last_conv_output,
last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
# Plotting the results
fig, ax = plt.subplots()
ax.imshow(image)
ax.imshow(heat_map, cmap='jet', alpha=0.5)
ax.set_title(prediction)
I have followed the tutorial from here: https://www.youtube.com/watch?v=GiyldmoYe_M&t=665s&ab_channel=DigitalSreeni
The main problem with this is that I get the feature map that looks like this:
As you see it looks like the model reacts to multiple areas on the image and no matter what image I use it always has the biggest reaction in the middle.
PS. If you think this question should be posted on the AI stack exchange please notify me
I have found an error I made. It was that after creating a
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the image, the generated heatmap was also normalized, so I needed to "denormalize" it back to it's original values.

Why does Tesseract fail to recognize 6 out of 26 of my alphabetic keyboard keys even with several parameter tunings?

TL;DR I'm using:
adaptive thresholding
segmenting by keys (width/height ratio) - see green boxes in image result
psm 10 to treat each key as a character
but it fails to recognize some keys, falsely identifies others or identifies 2 for 1 char (see the L character in the image result, it's an L and P), etc.
Note: I cropped the image and re-ran the results to get it to fit on this site, but before cropping it did slightly better (recognized more keys, fewer false positives, etc).
I just want it to recognize the alphabet keys. Ultimately I will want it to work for realtime video.
config:
'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
I've tried scaling the image differently, scaling the individual key segments, using opening/closing/etc but it doesn't recognize all the keys.
original image
image result
Update: new results if I make the image straighter (bird's eye) and remove the whitelisting, it manages to detect all for the most part (although it thinks the O is a 0 and the I is a |, which is understandable). Why is this and how could I make this adaptive enough for a dynamic video when it is so sensitive to these conditions?
Code:
import pytesseract
import numpy as np
try:
from PIL import Image
except ImportError:
import Image
import cv2
from tqdm import tqdm
from collections import defaultdict
def get_missing_chars(dict):
capital_alphabet = [chr(ascii) for ascii in range(65, 91)]
return [let for let in capital_alphabet if let not in dict]
def draw_box_and_char(img, contour_dims, c, box_col, text_col):
x, y, w, h = contour_dims
top_left = (x, y)
bot_right = (x + w, y+h)
font_offset = 3
text_pos = (x+h//2+12, y+h-font_offset)
img_copy = img.copy()
cv2.rectangle(img_copy, top_left, bot_right, box_col, 2)
cv2.putText(img_copy, c, text_pos, cv2.FONT_HERSHEY_SIMPLEX, fontScale=.5, color=text_col, thickness=1, lineType=cv2.LINE_AA)
return img_copy
def detect_keys(img):
scaling = .25
img = cv2.resize(img, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_AREA)
print("img shape", img.shape)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
ratio_min = 0.7
area_min = 1000
nbrhood_size = 1001
bias = 2
# adapt to different lighting
bin_img = cv2.adaptiveThreshold(gray_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
cv2.THRESH_BINARY_INV, nbrhood_size, bias)
items = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = items[0] if len(items) == 2 else items[1]
key_contours = []
for c in contours:
x, y, w, h = cv2.boundingRect(c)
ratio = h/w
area = cv2.contourArea(c)
# square-like ratio, try to get character
if ratio > ratio_min and area > area_min:
key_contours.append(c)
detected = defaultdict(int)
n_kept = 0
img_copy = cv2.cvtColor(bin_img, cv2.COLOR_GRAY2RGB)
let_to_contour = {}
n_contours = len(key_contours)
# offset to get smaller square within the key segment for easier char recognition
offset = 10
show_each_char = False
for _, c in tqdm(enumerate(key_contours), total=n_contours):
x, y, w, h = cv2.boundingRect(c)
ratio = h/w
area = cv2.contourArea(c)
base = np.zeros(bin_img.shape, dtype=np.uint8)
base.fill(255)
n_kept += 1
new_y = y+offset
new_x = x+offset
new_h = h-2*offset
new_w = w-2*offset
base[new_y:new_y+new_h, new_x:new_x+new_w] = bin_img[new_y:new_y+new_h, new_x:new_x+new_w]
segment = cv2.bitwise_not(base)
# try scaling up individual keys
# scaling = 2
# segment = cv2.resize(segment, None, fx=scaling, fy=scaling, interpolation=cv2.INTER_CUBIC)
# psm 10: treats the segment as a single character
custom_config = r'-l eng --oem 1 --psm 10 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
d = pytesseract.image_to_data(segment, config=custom_config, output_type='dict')
conf = d['conf']
c = d['text'][-1]
if c:
# sometimes recognizes multiple keys even though there is only 1
for sub_c in c:
# save character and contour to draw on image and show bounds/detection
if sub_c not in let_to_contour or (sub_c in let_to_contour and conf > let_to_contour[sub_c]['conf']):
let_to_contour[sub_c] = {'conf': conf, 'cont': (new_x, new_y, new_w, new_h)}
else:
c = "?"
text_col = (0, 0, 255)
if show_each_char:
contour_dims = (new_x, new_y, new_w, new_h)
box_col = (0, 255, 0)
text_col = (0, 0, 0)
segment_with_boxes = draw_box_and_char(segment, contour_dims, c, box_col, text_col)
cv2.imshow('segment', segment_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()
# draw boxes around recognized keys
for c, data in let_to_contour.items():
box_col = (0, 255, 0)
text_col = (0, 0, 0)
img_copy = draw_box_and_char(img_copy, data['cont'], c, box_col, text_col)
detected = {k: 1 for k in let_to_contour}
for det in let_to_contour:
print(det, let_to_contour[det])
print("total detected: ", let_to_contour.keys())
missing = get_missing_chars(detected)
print(f"n_missing: {len(missing)}")
print(f"chars missing: {missing}")
return img_copy
if __name__ == "__main__":
img_file = "keyboard.jpg"
img = cv2.imread(img_file)
img_with_detected_keys = detect_keys(img)
cv2.imshow("detected", img_with_detected_keys)
cv2.waitKey(0)
cv2.destroyAllWindows()

How to write an optimiser for StyleGAN2 interpolation?

I would like to interpolate two images using StyleGAN2-ADA-PyTorch from NVLabs. For the sake of simplicity, it can be said that with two images of different persons I want to create a third image depicting a third person, with a body from the first image, and their head from the second. I also have corresponding w-vectors for the two images ready at hand.
# G is a generative model in line with StyleGAN2, trained to output 512x512 images.
# Latents shape is [1, 16, 512]
G = G.eval().requires_grad_(False).to(device) # type: ignore
num_ws = G.mapping.num_ws # 16
w_dim = G.mapping.w_dim # 512
# Segmentation network is used to extract important parts from images
segmentation_dnn = segmentation_dnn.to(device)
# Source images are represented as latent vectors. I use G to generate actual images:
image_body = image_from_output(G.synthesis(w_body, noise_mode='const'))
image_head = image_from_output(G.synthesis(w_head, noise_mode='const'))
# Custom function is applied to source images, creating masked images.
# In masked images, only head or body is present (and the rest is filled with white pixels)
image_body_masked = apply_segmentation_mask(image_body, segmentation_dnn, select='body')
image_head_masked = apply_segmentation_mask(image_head, segmentation_dnn, select='head')
In order to compare similarity of any two images, I use VGGLos
# VGG16 is used as a feature extractor to evaluate image similarity
url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
with dnnlib.util.open_url(url) as f:
vgg16 = torch.jit.load(f).eval().to(device)
class VGGLoss(nn.Module):
def __init__(self, device, vgg):
super().__init__()
for param in self.parameters():
param.requires_grad = False
self.criterion = nn.L1Loss().to(device)
def forward(self, source, target):
loss = 0
source_features = self.vgg(source, resize_images=False, return_lpips=True)
target_features = self.vgg(target, resize_images=False, return_lpips=True)
loss += self.criterion(source, target)
return loss
vgg_loss = VGGLoss(device, vgg=vgg16)
Now, I want to interpolate image_body and image_head, creating image_target.
To do this, I need to find latent representation of image_target in the latent space of StyleGAN2
Crudely, we can use optimize for a coefficient query_opt to partially include latents from image_body and image_head: w_target = w_body + (query_opt * (w_head - w_person))
query_opt = torch.randn([1, num_ws, 1], dtype=torch.float32, device=device, requires_grad=True)
optimizer = torch.optim.Adam(query_opt, betas=(0.9, 0.999), lr=initial_learning_rate)
w_out = []
for step in num_steps:
# Learning rate schedule.
t = step / num_steps
lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length)
lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length)
lr = initial_learning_rate * lr_ramp
for param_group in optimizer.param_groups:
param_group['lr'] = lr
# Synth image from w_target using query_opt.
# This interpolation formula is an important step, and I think my math might be out of order up here
w_target = w_body + (query_opt * (w_head - w_person))
image_target = image_from_output(G.synthesis(ws, noise_mode='const'))
image_target_body_masked = apply_segmentation_mask(image_target, segmentation_dnn, select='body')
image_target_head_masked = apply_segmentation_mask(image_target, segmentation_dnn, select='head')
loss = vgg_loss(image_body_masked, image_target_body_masked) + vgg_loss(image_head_masked, image_target_head_masked)
# Step
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
logprint(f'step {step+1:>4d}/{num_steps}: loss {float(loss):<5.2f}')
# Save current w_target
w_out[step] = w_target.detach()
I can't figure out how to make my optimizer actually target query_opt in such a way that combined VGGLoss is actually optimized for. I must be missing something in my PyTorch code, or maybe even in the main interpolation formula.

Variational bayes method - Unreasonable result from posterior distribution

I'm trying to implement the basic example from https://en.wikipedia.org/wiki/Variational_Bayesian_methods#A_basic_example in order to find the posterior distribution over the mean and variance for the input data(which I generate in the beginning).
It is my understanding that the approximated posterior should be given by the product of q(mu)*q(tau) so I thought I could get it by simple multiplying each distribution for each point in the grid and then plot it. Although I can't see any error with my distributions, the gamma distribution produces extremely small values while the Gaussian distributions only has one non-zero element. My guess is that there is something wrong in the end where I multiply the two distributions for each point in the grid produced by meshgrid but I just wrote both distributions straight from Wikipedia. Why are my posterior probabilities so small/nan and what can I do to fix it?
Here is my code:
# First Exact solution
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.special import factorial
# Produce N data points from known gaussian distribution
real_mu = 2
real_tau = 1
x = np.arange(-3,7,0.1)
N = len(x)
nrv = stats.norm.pdf(x, real_mu, real_tau)
## Approximate posterior distribution over mean and covariance ##
x = nrv # N data points from "unknown distribution"
N = len(x) #
# The Algorithm
#hyperparameters - can be set arbitrarily but smaller positive values indicates larger prior distributions over mu and tau
lambda_0 = 0.05
mu_0 = 0.05
a_0 = 0.05
b_0 = 0.05
##
x_sum = np.sum(x)
x_ave = np.sum(x)/N
x_squared = np.dot(x, x)
mu_n = (lambda_0*mu_0 + N*x_ave)/(lambda_0 + N)
a_N = a_0 + (N + 1)/2
lambda_N = 1 # Initialize lambda to some arbitrary value
difference = 9999999
while difference > 0.0000001:
b_N = b_0 + 0.5*((lambda_0 + N)*((1/lambda_N) + mu_n**2) - 2*(lambda_0*mu_0 + x_sum)*mu_n + (x_squared) + lambda_0*mu_0*mu_0)
new_lambda_N = (lambda_0 + N)*a_N/b_N
difference_1 = new_lambda_N - lambda_N
lambda_N = new_lambda_N
difference = np.absolute(difference_1)
# Calulate the approximated posterior from these parameters: q(mu, tau) = q(mu)q(tau)
t = np.arange(-2,2,0.01)
#qmu = stats.norm.pdf(t, mu_n, 1/lambda_N)
#qtau = gamma.pdf(t, a_N, loc=0, scale=b_N) #scale=1/b_N)
def gaussian(x):
return (1/(np.sqrt(2*np.pi*sigma*sigma)))*np.exp(-(x-mu_n)*(x-mu_n)/(2*sigma*sigma))
def gamma(x):
return ((b_N**a_N)*(x**(a_N-1))*np.exp(-x*b_N))/(factorial(a_N-1))
sigma = 1/lambda_N
xx, yy = np.meshgrid(t, t)
# First part in zz is from Gaussian distribution over mu and second a gamma distribution over tau
# Same as the two defined functions above
zz = ((1/(np.sqrt(2*np.pi*sigma*sigma)))*np.exp(-(xx-mu_n)*(xx-mu_n)/(2*sigma*sigma)))*((b_N**a_N)*(yy**(a_N-1))*np.exp(-yy*b_N))/(factorial(a_N-1))
plt.xlabel("mu")
plt.ylabel("tau")
plt.contourf(t,t,zz)
plt.show()

Resources