I'm implementing the sobel filter according to the following pseudocode taken from Wikipedia:
% Sobel edge detector: multiplies each 3x3 window of A with two kernels,
% combines the two responses into a gradient magnitude and thresholds it.
function sobel(A : as two dimensional image array)
% Gx and Gy are the two 3x3 kernels multiplied element-wise with each window below
Gx=[-1 0 1; -2 0 2; -1 0 1]
Gy=[-1 -2 -1; 0 0 0; 1 2 1]
rows = size(A,1)
columns = size(A,2)
mag=zeros(A)
% slide a 3x3 window over A; the outermost rows/columns of mag are never written
for i=1:rows-2
for j=1:columns-2
S1=sum(sum(Gx.*A(i:i+2,j:j+2)))
S2=sum(sum(Gy.*A(i:i+2,j:j+2)))
% the magnitude is stored at the window centre; it is not rescaled, so the
% kernel weights (absolute sum 8 each) let it exceed the input value range
mag(i+1,j+1)=sqrt(S1.^2+S2.^2)
end for
end for
threshold = 70 %varies for application [0 255]
% values below the threshold are raised to it, then every value equal to the
% (rounded) threshold is set to 0
output_image = max(mag,threshold)
output_image(output_image==round(threshold))=0;
return output_image
end function
However, upon applying this algorithm, I'm getting many output_image values above 255, and that makes sense considering how Gx and Gy are defined. How can I modify this algorithm such that the values don't go above 255 and finally that the results look more like this?:
--- Edit ---
There was some error in my filter implementation and I think that's why the values were above 255. After fixing the error, the values range between 0 - 16. Since now all values are below 70, applying a threshold of 70 sends everything to 0. So I set a lower threshold, 5, and multiplied the rest of the values by 10 (to enhance the edges since they are in the 5-16 range) and got the following result:
I also tried the normalization method mentioned in the comments but got a similar noisy image.
--- Edit 2 ---
Since the actual code was requested, I'm posting the code, which is written in Halide.
// Loads an image, runs a 3x3 Sobel filter over it with Halide and writes the
// thresholded, amplified gradient magnitude to a PNG.
int main(int argc, char **argv) {
Var x, y, k, c;
Buffer<uint8_t> left_buffer = load_image("images/stereo/bike.jpg");
// Clamp the coordinates so reads just outside the image reuse the border pixel.
Expr clamped_x = clamp(x, 0, left_buffer.width() - 1);
Expr clamped_y = clamp(y, 0, left_buffer.height() - 1);
Func left_original("left_original");
left_original(x, y) = left_buffer(clamped_x, clamped_y);
left_original.compute_root();
// 3x3 sobel filter
// sobel_1 holds the difference taps, sobel_2 the smoothing taps.
// NOTE(review): the element type is uint8_t, so the -1 below is stored as 255.
// NOTE(review): image and kernels are all 8-bit here, so presumably the
// products and sums are evaluated in 8 bits too — confirm Halide's type rules.
Buffer<uint8_t> sobel_1(3);
sobel_1(0) = -1;
sobel_1(1) = 0;
sobel_1(2) = 1;
Buffer<uint8_t> sobel_2(3);
sobel_2(0) = 1;
sobel_2(1) = 2;
sobel_2(2) = 1;
// NOTE(review): Halide's RDom takes (min, extent), so (-1, 2) visits only -1
// and 0 and the third kernel tap is never reached; (-1, 3) would cover
// -1..1 — confirm against the RDom documentation.
RDom conv_x(-1, 2);
RDom conv_y(-1, 2);
// Horizontal gradient as two 1-D passes: difference along x, then smoothing along y.
Func output_x_inter("output_x_inter");
output_x_inter(x, y) = sum(left_original(x - conv_x, y) * sobel_1(conv_x + 1));
output_x_inter.compute_root();
Func output_x("output_x");
output_x(x, y) = sum(output_x_inter(x, y - conv_y) * sobel_2(conv_y + 1));
output_x.compute_root();
// Vertical gradient as one nested reduction: smoothing along x, difference along y.
Func output_y("output_y");
output_y(x, y) = sum(conv_y, sum(conv_x, left_original(x - conv_x, y - conv_y) * sobel_2(conv_x + 1)) * sobel_1(conv_y + 1));
output_y.compute_root();
// Gradient magnitude.
Func output("output");
output(x, y) = sqrt(output_x(x, y) * output_x(x, y) + output_y(x, y) * output_y(x, y));
output.compute_root();
output.trace_stores();
// max(0) / min(0) hold the largest / smallest magnitude over the whole image.
// Neither Func is read again in this function.
RDom img(0, left_buffer.width(), 0, left_buffer.height());
Func max("max");
max(k) = f32(0);
max(0) = maximum(output(img.x, img.y));
max.compute_root();
Func min("min");
min(k) = f32(0);
min(0) = minimum(output(img.x, img.y));
min.compute_root();
Func output_u8("output_u8");
// The following line sends all the values of output <= 5 to zero, and multiplies the resulting values by 10 to enhance the intensity of the edges.
// NOTE(review): magnitudes above 25.5 exceed 255 after the *10 and before the u8 cast — confirm the intended saturation.
output_u8(x, y) = u8(select(output(x, y) <= 5, 0, output(x, y))*10);
output_u8.compute_root();
output_u8.trace_stores();
Buffer<uint8_t> output_buff = output_u8.realize(left_buffer.width(), left_buffer.height());
save_image(output_buff, "images/stereo/sobel/out.png");
}
--- Edit 3 ---
As one answer suggested, I changed all types to float except the last one, which must be unsigned 8-bit type. Here's the code, and the result that I'm getting.
// Float version of the Sobel pipeline: the kernels and intermediate Funcs are
// float, the magnitude is rescaled to 0..255 using its global min/max, values
// at or below 70 are zeroed and the result is written as an 8-bit PNG.
int main(int argc, char **argv) {
Var x, y, k, c;
Buffer<uint8_t> left_buffer = load_image("images/stereo/bike.jpg");
// Clamp the coordinates so reads just outside the image reuse the border pixel.
Expr clamped_x = clamp(x, 0, left_buffer.width() - 1);
Expr clamped_y = clamp(y, 0, left_buffer.height() - 1);
Func left_original("left_original");
left_original(x, y) = left_buffer(clamped_x, clamped_y);
left_original.compute_root();
// 3x3 sobel filter
// sobel_1 holds the difference taps, sobel_2 the smoothing taps.
Buffer<float_t> sobel_1(3);
sobel_1(0) = -1;
sobel_1(1) = 0;
sobel_1(2) = 1;
Buffer<float_t> sobel_2(3);
sobel_2(0) = 1;
sobel_2(1) = 2;
sobel_2(2) = 1;
// NOTE(review): Halide's RDom takes (min, extent), so (-1, 2) visits only -1
// and 0 and the third kernel tap is never reached; (-1, 3) would cover
// -1..1 — confirm against the RDom documentation.
RDom conv_x(-1, 2);
RDom conv_y(-1, 2);
// Horizontal gradient as two 1-D passes: difference along x, then smoothing along y.
Func output_x_inter("output_x_inter");
output_x_inter(x, y) = f32(sum(left_original(x - conv_x, y) * sobel_1(conv_x + 1)));
output_x_inter.compute_root();
Func output_x("output_x");
output_x(x, y) = f32(sum(output_x_inter(x, y - conv_y) * sobel_2(conv_y + 1)));
output_x.compute_root();
// Reduction domain covering every pixel; used for the global min/max below.
RDom img(0, left_buffer.width(), 0, left_buffer.height());
// Vertical gradient as one nested reduction: smoothing along x, difference along y.
Func output_y("output_y");
output_y(x, y) = f32(sum(conv_y, sum(conv_x, left_original(x - conv_x, y - conv_y) * sobel_2(conv_x + 1)) * sobel_1(conv_y + 1)));
output_y.compute_root();
// Gradient magnitude.
Func output("output");
output(x, y) = sqrt(output_x(x, y) * output_x(x, y) + output_y(x, y) * output_y(x, y));
output.compute_root();
// max(0) / min(0) hold the largest / smallest magnitude over the whole image.
Func max("max");
max(k) = f32(0);
max(0) = maximum(output(img.x, img.y));
max.compute_root();
Func min("min");
min(k) = f32(0);
min(0) = minimum(output(img.x, img.y));
min.compute_root();
// output_inter for scaling
// Maps [min, max] linearly onto [0, 255]; the denominator is zero when the
// magnitude is constant over the image.
Func output_inter("output_inter");
output_inter(x, y) = f32((output(x, y) - min(0)) * 255 / (max(0) - min(0)));
output_inter.compute_root();
// Threshold on the rescaled value, then narrow to 8 bits.
Func output_u8("output_u8");
output_u8(x, y) = u8(select(output_inter(x, y) <= 70, 0, output_inter(x, y)));
output_u8.compute_root();
output_u8.trace_stores();
Buffer<uint8_t> output_buff = output_u8.realize(left_buffer.width(), left_buffer.height());
save_image(output_buff, "images/stereo/sobel/out.png");
}
--- Edit 4 ---
As @CrisLuengo suggested, I simplified my code and output the result of the following:
// Gradient magnitude capped at 255 and narrowed to 8 bits, without any rescaling.
output(x, y) = u8(min(sqrt(output_x(x, y) * output_x(x, y) + output_y(x, y) * output_y(x, y)), 255));
Since many values are way above 255, these many values are clamped to 255 and thus we get a "washed out" image:
I don't know the Halide syntax, I've just learned it exists. But I can point out one clear problem:
// The element type is unsigned 8-bit, so the -1 assigned below is stored as 255.
Buffer<uint8_t> sobel_1(3);
sobel_1(0) = -1;
You are assigning -1 to a uint8 type. That doesn't work as intended. Make the kernel a float, and do all computations as floats, then scale the result and store it in your uint8 output image.
When computing using small integer types, one has to be very careful with overflow and underflow. The Sobel computations could likely be done in the (signed) int16 type, but in my experience there is no advantage in that over using the float type, then scaling (or clamping) and casting the result to the output image's type.
I figured it out finally, but I'm not sure why Halide is behaving this way.
When I do this:
// Reduction-based form of the horizontal Sobel response.
// NOTE(review): Halide's RDom takes (min, extent), so (-1, 2) visits only -1
// and 0; the tap at +1 (sobel_1(2) / sobel_2(2)) is never reached. (-1, 3)
// would cover -1..1 — confirm against the RDom documentation.
RDom conv_x(-1, 2);
RDom conv_y(-1, 2);
Func output_x_inter("output_x_inter");
output_x_inter(x, y) = f32(sum(left_original(x - conv_x, y) * sobel_1(conv_x + 1)));
Func output_x("output_x");
output_x(x, y) = f32(sum(output_x_inter(x, y - conv_y) * sobel_2(conv_y + 1)));
Things don't work. But when I "unroll" the sum function things work:
// Hand-unrolled form of the horizontal Sobel response: each line spells out
// all three taps (offsets +1, 0 and -1) explicitly instead of using a reduction.
Func output_x_inter("output_x_inter");
output_x_inter(x, y) = f32(left_original(x + 1, y) * sobel_1(0) + left_original(x, y) * sobel_1(1) + left_original(x - 1, y) * sobel_1(2));
Func output_x("output_x");
output_x(x, y) = f32(output_x_inter(x, y + 1) * sobel_2(0) + output_x_inter(x, y) * sobel_2(1) + output_x_inter(x, y - 1) * sobel_2(2));
Related
I have implemented separable Gaussian blur. Horizontal pass was relatively easy to optimize with SIMD processing. However, I am not sure how to optimize vertical pass.
Accessing elements is not very cache friendly, and filling a SIMD lane would mean reading many different pixels. I was thinking about transposing the image, running the horizontal pass and then transposing the image back; however, I am not sure whether that would gain anything because of the two transpose operations.
I have quite large images (16k resolution) and the kernel size is 19, so the gain from vectorizing the vertical pass was only about 15%.
My vertical pass is as follows (it is inside a generic class templated on T, which can be uint8_t or float):
// One pass of a separable convolution. For every output pixel the `kn` taps
// in kernelDataX are multiplied with `kn` consecutive input samples (starting
// kernelHalfSize samples before the pixel), the sum is scaled by weightX and
// stored to `output`. Four neighbouring output pixels are produced per SIMD
// iteration; leftover pixels of a row go through the scalar loop at the end.
// T is the input sample type, V the output sample type.
int yStart = kernelHalfSize;
int xStart = kernelHalfSize;
// NOTE(review): y is multiplied by GetWidth() below (so it indexes rows), yet
// its bound comes from GetWidth() and the x bound from GetHeigh(). That looks
// swapped for non-square images — confirm against the caller.
int yEnd = input.GetWidth() - kernelHalfSize;
int xEnd = input.GetHeigh() - kernelHalfSize;

const T * inData = input.GetData().data();
V * outData = output.GetData().data();

int kn = kernelHalfSize * 2 + 1; // total number of kernel taps
int kn4 = kn - kn % 4;           // taps consumed four at a time by the unrolled loop

for (int y = yStart; y < yEnd; y++)
{
    size_t yW = size_t(y) * output.GetWidth();
    size_t outX = size_t(xStart) + yW;
    // Offset of this row in the input; invariant for the whole row.
    const size_t inYW = size_t(y) * input.GetWidth();

    // Keep the SIMD bound signed. When the image is smaller than the kernel
    // the bound is negative; as an unsigned value it would wrap to a huge
    // number and the loop below would run far out of bounds.
    int len = xEnd - xStart;
    len = len - len % 4;
    const int xEndSimd = xStart + len;

    for (int x = xStart; x < xEndSimd; x += 4)
    {
        // x0..x3: input indices of the first tap for the window start and the
        // three following sample positions.
        size_t x0 = ((x + 0) - kernelHalfSize) + inYW;
        size_t x1 = x0 + 1;
        size_t x2 = x0 + 2;
        size_t x3 = x0 + 3;

        __m128 sumDot = _mm_setzero_ps();

        int i = 0;
        for (; i < kn4; i += 4)
        {
            // Broadcast four consecutive kernel taps.
            __m128 kx = _mm_set_ps1(kernelDataX[i + 0]);
            __m128 ky = _mm_set_ps1(kernelDataX[i + 1]);
            __m128 kz = _mm_set_ps1(kernelDataX[i + 2]);
            __m128 kw = _mm_set_ps1(kernelDataX[i + 3]);

            __m128 dx, dy, dz, dw;

            if constexpr (std::is_same<T, uint8_t>::value)
            {
                //we need co convert uint8_t inputs to float
                // NOTE(review): each load reads 16 bytes although only the low
                // 4 are used; near the end of the buffer this may read past
                // the allocation — confirm the image has padding.
                __m128i u8_0 = _mm_loadu_si128((const __m128i*)(inData + x0));
                __m128i u8_1 = _mm_loadu_si128((const __m128i*)(inData + x1));
                __m128i u8_2 = _mm_loadu_si128((const __m128i*)(inData + x2));
                __m128i u8_3 = _mm_loadu_si128((const __m128i*)(inData + x3));

                // Zero-extend the low four bytes: u8 -> u16 -> u32.
                __m128i u32_0 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_0, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_1 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_1, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_2 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_2, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_3 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_3, _mm_setzero_si128()),
                    _mm_setzero_si128());

                dx = _mm_cvtepi32_ps(u32_0);
                dy = _mm_cvtepi32_ps(u32_1);
                dz = _mm_cvtepi32_ps(u32_2);
                dw = _mm_cvtepi32_ps(u32_3);
            }
            else
            {
                /*
                //load 8 consecutive values
                auto dd = _mm256_loadu_ps(inData + x0);
                //extract parts by shifting and casting to 4 values float
                dx = _mm256_castps256_ps128(dd);
                dy = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 4, 3, 2, 1)));
                dz = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 3, 2)));
                dw = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 6, 5, 4, 3)));
                */
                dx = _mm_loadu_ps(inData + x0);
                dy = _mm_loadu_ps(inData + x1);
                dz = _mm_loadu_ps(inData + x2);
                dw = _mm_loadu_ps(inData + x3);
            }

            //calculate 4 dots at once
            //[dx, dy, dz, dw] <dot> [kx, ky, kz, kw]
            auto mx = _mm_mul_ps(dx, kx); //dx * kx
            auto my = _mm_fmadd_ps(dy, ky, mx); //mx + dy * ky
            auto mz = _mm_fmadd_ps(dz, kz, my); //my + dz * kz
            auto res = _mm_fmadd_ps(dw, kw, mz); //mz + dw * kw
            sumDot = _mm_add_ps(sumDot, res);

            x0 += 4;
            x1 += 4;
            x2 += 4;
            x3 += 4;
        }

        // Remaining taps (kn is odd, so there is always at least one).
        for (; i < kn; i++)
        {
            auto v = _mm_set_ps1(kernelDataX[i]);
            auto v2 = _mm_set_ps(
                *(inData + x3), *(inData + x2),
                *(inData + x1), *(inData + x0)
            );
            sumDot = _mm_add_ps(sumDot, _mm_mul_ps(v, v2));

            x0++;
            x1++;
            x2++;
            x3++;
        }

        sumDot = _mm_mul_ps(sumDot, _mm_set_ps1(weightX));

        if constexpr (std::is_same<V, uint8_t>::value)
        {
            // Convert to int32, then saturate down to four bytes and store
            // them with a single 32-bit write.
            __m128i asInt = _mm_cvtps_epi32(sumDot);
            asInt = _mm_packus_epi32(asInt, asInt);
            asInt = _mm_packus_epi16(asInt, asInt);
            uint32_t res = _mm_cvtsi128_si32(asInt);
            ((uint32_t *)(outData + outX))[0] = res;
            outX += 4;
        }
        else
        {
            // A plain float[4] is not guaranteed to be 16-byte aligned, which
            // _mm_store_ps requires; use the unaligned store.
            float tmpRes[4];
            _mm_storeu_ps(tmpRes, sumDot);
            outData[outX + 0] = tmpRes[0];
            outData[outX + 1] = tmpRes[1];
            outData[outX + 2] = tmpRes[2];
            outData[outX + 3] = tmpRes[3];
            outX += 4;
        }
    }

    // Scalar path for the (up to three) pixels left over in this row.
    for (int x = xEndSimd; x < xEnd; x++)
    {
        const T * v = input.GetPixelStart(x - kernelHalfSize, y);
        float tmp = 0;
        for (int i = 0; i < kn; i++)
        {
            tmp += kernelDataX[i] * v[i];
        }
        tmp *= weightX;
        outData[outX] = ImageUtils::clamp_cast<V>(tmp);
        outX++;
    }
}
There’s a well-known trick for that.
While you compute both passes, read them sequentially, use SIMD to compute, but write out the result into another buffer, transposed, using scalar stores. Protip: SSE 4.1 has _mm_extract_ps just don’t forget to cast your destination image pointer from float* into int*. Another thing about these stores, I would recommend using _mm_stream_si32 for that as you want maximum cache space used by your input data. When you’ll be computing the second pass, you’ll be reading sequential memory addresses again, the prefetcher hardware will deal with the latency.
This way both passes will be identical, I usually call same function twice, with different buffers.
Two transposes caused by your 2 passes cancel each other. Here’s an HLSL version, BTW.
There’s more. If your kernel size is only 19, that fits in 3 AVX registers. I think shuffle/permute/blend instructions are still faster than even L1 cache loads, i.e. it might be better to load the kernel outside the loop.
The XYZ color space encompasses all possible colors, not just those which can be generated by a particular device like a monitor. Not all XYZ triplets represent a color that is physically possible. Is there a way, given an XYZ triplet, to determine if it represents a real color?
I wanted to generate a CIE 1931 chromaticity diagram (seen below) for myself, but wasn't sure how to go about it. It's easy to, for example, take all combinations of sRGB triplets and then transform them into the xy coordinates of the chromaticity diagram and then plot them. You cannot use this same approach in the XYZ color space though, since not all combinations are valid colors. So far the best I have come up with is a stochastic approach, where I generate a random spectral distribution by summing a random number of random Gaussians, then converting it to XYZ using the standard observer functions.
Having thought about it a little more I felt the obvious solution is to generate a list of xy points around the edge of spectral locus, corresponding to pure monochromatic colors. It seems to me that this can be done by directly inputting the visible frequencies (~380-780nm) into the CIE XYZ standard observer color matching functions. Treating these points like a convex polygon you could determine if a point is within the spectral locus using one algorithm or another. In my case, since what I really wanted to do is simply generate the chromaticity diagram, I simply input these points into a graphics library's polygon drawing routine and then for each pixel of the polygon I can transform it into sRGB.
I believe this solution is similar to the one used by the library that Kel linked in a comment. I'm not entirely sure, as I am not familiar with Python.
/**
 * Converts CIE XYZ tristimulus values to RGB by applying a fixed 3x3 matrix.
 * No clamping or gamma encoding is applied here.
 * @param {number} X
 * @param {number} Y
 * @param {number} Z
 * @returns {number[]} [R, G, B]
 */
function RGBfromXYZ(X, Y, Z) {
  // One row of coefficients per output channel, in R, G, B order.
  const matrix = [
    [3.2404542, -1.5371385, -0.4985314],
    [-0.969266, 1.8760108, 0.0415560],
    [0.0556434, -0.2040259, 1.0572252],
  ]
  return matrix.map(([cx, cy, cz]) => cx * X + cy * Y + cz * Z)
}
/**
 * Converts a colour given as luminance Y plus chromaticity (x, y) to XYZ.
 * y is used as a divisor, so y === 0 yields non-finite components.
 * @param {number} Y luminance
 * @param {number} x chromaticity x
 * @param {number} y chromaticity y
 * @returns {number[]} [X, Y, Z]
 */
function XYZfromYxy(Y, x, y) {
  const scale = Y / y
  const z = 1 - x - y
  return [scale * x, Y, scale * z]
}
/**
 * Applies the sRGB transfer curve to a linear channel value: a linear
 * segment (slope 12.92) at or below 0.0031308, a 1/2.4 power curve above.
 * @param {number} x linear channel value
 * @returns {number} encoded channel value
 */
function srgb_from_linear(x) {
  return x <= 0.0031308
    ? x * 12.92
    : 1.055 * Math.pow(x, 1/2.4) - 0.055
}
// Analytic Approximations to the CIE XYZ Color Matching Functions
// from Sloan http://jcgt.org/published/0002/02/01/paper.pdf
/**
 * Analytic fit of the CIE 1931 x-bar colour matching function: a weighted
 * sum of three piecewise Gaussian lobes (the third has a negative weight).
 * @param {number} x input value; the lobe centres are 442, 599.8 and 501.1
 * @returns {number}
 */
function xFit_1931(x) {
  // Each lobe uses one width factor below its centre and another above it.
  const lobe = (centre, below, above) => {
    const t = (x - centre) * (x < centre ? below : above)
    return Math.exp(-0.5 * t * t)
  }
  return 0.362 * lobe(442, 0.0624, 0.0374) + 1.056 * lobe(599.8, 0.0264, 0.0323) - 0.065 * lobe(501.1, 0.0490, 0.0382)
}
/**
 * Analytic fit of the CIE 1931 y-bar colour matching function: a weighted
 * sum of two piecewise Gaussian lobes.
 * @param {number} x input value; the lobe centres are 568.8 and 530.9
 * @returns {number}
 */
function yFit_1931(x) {
  // Each lobe uses one width factor below its centre and another above it.
  const lobe = (centre, below, above) => {
    const t = (x - centre) * (x < centre ? below : above)
    return Math.exp(-0.5 * t * t)
  }
  return 0.821 * lobe(568.8, 0.0213, 0.0247) + 0.286 * lobe(530.9, 0.0613, 0.0322)
}
/**
 * Analytic fit of the CIE 1931 z-bar colour matching function: a weighted
 * sum of two piecewise Gaussian lobes.
 * @param {number} x input value; the lobe centres are 437 and 459
 * @returns {number}
 */
function zFit_1931(x) {
  // Each lobe uses one width factor below its centre and another above it.
  const lobe = (centre, below, above) => {
    const t = (x - centre) * (x < centre ? below : above)
    return Math.exp(-0.5 * t * t)
  }
  return 1.217 * lobe(437, 0.0845, 0.0278) + 0.681 * lobe(459, 0.0385, 0.0725)
}
// Creates a 512x512 canvas, fills the polygon traced by the fitted colour
// matching functions, then recolours every covered pixel from its chromaticity.
const canvas = document.createElement("canvas")
document.body.append(canvas)
canvas.width = canvas.height = 512
const ctx = canvas.getContext("2d")

// Chromaticity (x, y) for each sampled input 440..649, scaled to canvas pixels.
const locus_points = []
for (let wavelength = 440; wavelength < 650; ++wavelength) {
  const X = xFit_1931(wavelength)
  const Y = yFit_1931(wavelength)
  const Z = zFit_1931(wavelength)
  const total = X + Y + Z
  locus_points.push([(X / total) * canvas.width, (Y / total) * canvas.height])
}

// Fill the closed polygon; the fill is only used as a coverage mask below.
ctx.beginPath()
ctx.moveTo(...locus_points[0])
for (const point of locus_points.slice(1)) {
  ctx.lineTo(...point)
}
ctx.closePath()
ctx.fill()

const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
for (let row = 0; row < canvas.height; ++row) {
  for (let col = 0; col < canvas.width; ++col) {
    const base = (row * canvas.width + col) * 4
    // Pixels the polygon fill did not touch still have alpha 0: skip them.
    if (!(imageData.data[base + 3] > 0)) continue

    const linear = RGBfromXYZ(...XYZfromYxy(1, col / canvas.width, row / canvas.height))
    // Normalise by the vector length before encoding each channel.
    const norm = Math.sqrt(linear[0]**2 + linear[1]**2 + linear[2]**2)
    linear.forEach((channel, i) => {
      imageData.data[base + i] = Math.round(srgb_from_linear(channel / norm) * 255)
    })
  }
}
ctx.putImageData(imageData, 0, 0)
A few days ago I started learning RenderScript. I managed to create some simple image processing filters, e.g. grayscale and color change.
Now I'm working on a Canny edge filter, with no success.
Question: Why does the ImageView display a black image, and how can I solve it?
I'm using the implementation of the Canny edge filter made by arekolek on GitHub.
Optional: Can I compute it faster?
I ended up with all the code written in one method, "runEdgeFilter(...)", which runs when I click the image on my device, to make sure I'm not messing with the imageView in another place. This is the code that I use so far.
import android.content.Context;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.support.v8.renderscript.*;
import android.support.v7.app.AppCompatActivity;
import android.os.Bundle;
import android.view.View;
import android.widget.ImageView;
public class MainActivity extends AppCompatActivity {
private static final float THRESHOLD_MULT_LOW = 0.66f * 0.00390625f;
private static final float THRESHOLD_MULT_HIGH = 1.33f * 0.00390625f;
private ImageView imageView;
private Bitmap img;
private boolean setThresholds = true;
#Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
imageView = (ImageView) findViewById(R.id.imageView);
img = BitmapFactory.decodeResource(getResources(), R.drawable.test_img_no_dpi2);
imageView.setImageBitmap(img);
}
public void imageClicked(View view) {
runEdgeFilter(img, this);
}
private void runEdgeFilter(Bitmap image, Context context) {
int width = image.getWidth();
int height = image.getHeight();
RenderScript rs = RenderScript.create(context);
Allocation allocationIn = Allocation.createFromBitmap(rs, image);
Type.Builder tb;
tb = new Type.Builder(rs, Element.F32(rs)).setX(width).setY(height);
Allocation allocationBlurred = Allocation.createTyped(rs, tb.create());
Allocation allocationMagnitude = Allocation.createTyped(rs, tb.create());
tb = new Type.Builder(rs, Element.I32(rs)).setX(width).setY(height);
Allocation allocationDirection = Allocation.createTyped(rs, tb.create());
Allocation allocationEdge = Allocation.createTyped(rs, tb.create());
tb = new Type.Builder(rs, Element.I32(rs)).setX(256);
Allocation allocationHistogram = Allocation.createTyped(rs, tb.create());
tb = new Type.Builder(rs, Element.RGBA_8888(rs)).setX(width).setY(height);
Allocation allocationOut = Allocation.createTyped(rs, tb.create());
ScriptC_edge edgeFilter = new ScriptC_edge(rs);
ScriptIntrinsicHistogram histogram = ScriptIntrinsicHistogram.create(rs, Element.U8(rs));
histogram.setOutput(allocationHistogram);
edgeFilter.invoke_set_histogram(allocationHistogram);
edgeFilter.invoke_set_blur_input(allocationIn);
edgeFilter.invoke_set_compute_gradient_input(allocationBlurred);
edgeFilter.invoke_set_suppress_input(allocationMagnitude, allocationDirection);
edgeFilter.invoke_set_hysteresis_input(allocationEdge);
edgeFilter.invoke_set_thresholds(0.2f, 0.6f);
histogram.forEach_Dot(allocationIn);
int[] histogramOutput = new int[256];
allocationHistogram.copyTo(histogramOutput);
if(setThresholds) {
int median = width * height / 2;
for (int i = 0; i < 256; ++i) {
median -= histogramOutput[i];
if (median < 1) {
edgeFilter.invoke_set_thresholds(i * THRESHOLD_MULT_LOW, i * THRESHOLD_MULT_HIGH);
break;
}
}
}
edgeFilter.forEach_blur(allocationBlurred);
edgeFilter.forEach_compute_gradient(allocationMagnitude);
edgeFilter.forEach_suppress(allocationEdge);
edgeFilter.forEach_hysteresis(allocationOut);
allocationOut.copyTo(image);
allocationIn.destroy();
allocationMagnitude.destroy();
allocationBlurred.destroy();
allocationDirection.destroy();
allocationEdge.destroy();
allocationHistogram.destroy();
allocationOut.destroy();
histogram.destroy();
edgeFilter.destroy();
rs.destroy();
imageView.setImageBitmap(image);
}
}
renderscript edge.rs:
#pragma version(1)
#pragma rs java_package_name(com.lukasz.edgeexamplers)
#pragma rs_fp_relaxed
#include "rs_debug.rsh"
// Allocations shared by the kernels below; each one is assigned by one of the
// set_*_input functions.
static rs_allocation raw, magnitude, blurred, direction, candidates;
// Thresholds used by getEdgeType; assigned by set_thresholds.
static float low, high;
// Not referenced anywhere in the visible part of this script.
static const uint32_t zero = 0;
// Stores the allocation that blur() and blend() read as `raw`.
// It is read element-wise with rsGetElementAt_uchar.
void set_blur_input(rs_allocation u8_buf) {
raw = u8_buf;
}
// Stores the float allocation that compute_gradient() reads as `blurred`.
void set_compute_gradient_input(rs_allocation f_buf) {
blurred = f_buf;
}
// Stores the gradient magnitude (float) and direction (int) allocations.
// compute_gradient() writes `direction`; suppress() and getEdgeType() read them.
void set_suppress_input(rs_allocation f_buf, rs_allocation i_buf) {
magnitude = f_buf;
direction = i_buf;
}
// Stores the int allocation of edge candidates that getEdgeType() reads.
void set_hysteresis_input(rs_allocation i_buf) {
candidates = i_buf;
}
// Sets the low and high magnitude thresholds that getEdgeType() compares against.
void set_thresholds(float l, float h) {
low = l;
high = h;
}
// Reads the uchar at (x, y) of `a` and scales it by 1/255 into a float.
inline static float getElementAt_uchar_to_float(rs_allocation a, uint32_t x,
        uint32_t y) {
    const uchar sample = rsGetElementAt_uchar(a, x, y);
    return sample / 255.0f;
}
// Histogram bins read by addhisto() with rsGetElementAt_int.
static rs_allocation histogram;
// Stores the allocation used as `histogram`.
void set_histogram(rs_allocation h) {
histogram = h;
}
// Expands a grey input pixel to an opaque RGBA pixel and overlays a
// log-scaled bar chart of `histogram`. Columns map to bins via (x - 100) / 2
// and bar heights are measured upwards from row 400; pixels inside a bar, or
// on the row where 400 - y equals -1, become white.
uchar4 __attribute__((kernel)) addhisto(uchar in, uint32_t x, uint32_t y) {
    const int bin = (x - 100) / 2;
    if (bin > -1 && bin < 256) {
        const int barHeight = log((float) rsGetElementAt_int(histogram, (uint32_t) bin)) * 30;
        const int rowFromBase = (400 - y);
        const int insideBar = rowFromBase > -1 && barHeight > rowFromBase;
        if (insideBar || rowFromBase == -1) {
            in = 255;
        }
    }
    return (uchar4){ in, in, in, 255 };
}
// Replicates a grey value into the R, G and B channels of an opaque pixel.
uchar4 __attribute__((kernel)) copy(uchar in) {
    return (uchar4){ in, in, in, 255 };
}
// Per-channel maximum of the incoming pixel and the grey value of `raw` at
// the same position (expanded to an opaque grey pixel).
uchar4 __attribute__((kernel)) blend(uchar4 in, uint32_t x, uint32_t y) {
    const uchar grey = rsGetElementAt_uchar(raw, x, y);
    const uchar4 greyPixel = { grey, grey, grey, 255 };
    return max(greyPixel, in);
}
// 5x5 weighted average of `raw` around (x, y), read as floats in 0..1.
// Neighbours up to two pixels away are read without any bounds check.
float __attribute__((kernel)) blur(uint32_t x, uint32_t y) {
    // Integer weights, rows from y - 2 to y + 2; they sum to 159.
    const int weights[5][5] = {
        { 2,  4,  5,  4, 2 },
        { 4,  9, 12,  9, 4 },
        { 5, 12, 15, 12, 5 },
        { 4,  9, 12,  9, 4 },
        { 2,  4,  5,  4, 2 },
    };
    float acc = 0;
    // Row-major order, so the terms are accumulated top-left to bottom-right.
    for (int row = 0; row < 5; ++row) {
        for (int col = 0; col < 5; ++col) {
            acc += weights[row][col]
                    * getElementAt_uchar_to_float(raw, x + col - 2, y + row - 2);
        }
    }
    acc /= 159;
    return acc;
}
// 3x3 Sobel gradient of `blurred` at (x, y). Returns the gradient magnitude
// and, as a side effect, writes the direction quantised to 0..3 into
// `direction`. Neighbours one pixel away are read without any bounds check.
float __attribute__((kernel)) compute_gradient(uint32_t x, uint32_t y) {
    // The eight neighbours, named by compass position (north = y - 1).
    const float nw = rsGetElementAt_float(blurred, x - 1, y - 1);
    const float n  = rsGetElementAt_float(blurred, x,     y - 1);
    const float ne = rsGetElementAt_float(blurred, x + 1, y - 1);
    const float w  = rsGetElementAt_float(blurred, x - 1, y);
    const float e  = rsGetElementAt_float(blurred, x + 1, y);
    const float sw = rsGetElementAt_float(blurred, x - 1, y + 1);
    const float s  = rsGetElementAt_float(blurred, x,     y + 1);
    const float se = rsGetElementAt_float(blurred, x + 1, y + 1);

    // Right column minus left column / top row minus bottom row, with the
    // centre sample of each side weighted twice.
    const float gx = 0 - nw - w * 2 - sw + ne + e * 2 + se;
    const float gy = 0 + nw + n * 2 + ne - sw - s * 2 - se;

    // atan2pi returns the angle in units of pi; *4 and rounding gives steps
    // of 45 degrees, folded into 0..3.
    const int quantised = ((int) round(atan2pi(gy, gx) * 4.0f) + 4) % 4;
    rsSetElementAt_int(direction, quantised, x, y);
    return hypot(gx, gy);
}
// Non-maximum suppression: returns 1 when the magnitude at (x, y) is strictly
// greater than both neighbours along the stored direction, otherwise 0.
int __attribute__((kernel)) suppress(uint32_t x, uint32_t y) {
    const int d = rsGetElementAt_int(direction, x, y);
    const float g = rsGetElementAt_float(magnitude, x, y);

    // Offset from the first neighbour to the centre; the second neighbour
    // lies at the mirrored offset.
    int dx, dy;
    switch (d) {
    case 0:  dx = 1;  dy = 0; break; // horizontal, check left and right
    case 2:  dx = 0;  dy = 1; break; // vertical, check above and below
    case 1:  dx = 1;  dy = 1; break; // NW-SE
    default: dx = -1; dy = 1; break; // NE-SW
    }

    const float a = rsGetElementAt_float(magnitude, x - dx, y - dy);
    const float b = rsGetElementAt_float(magnitude, x + dx, y + dy);
    return (a < g && b < g) ? 1 : 0;
}
// Edge classes returned by getEdgeType. Each non-zero class is a distinct
// bit, so hysteresis() can OR the classes of several pixels together and
// test for a class with a mask.
static const int NON_EDGE = 0b000;
static const int LOW_EDGE = 0b001;
static const int MED_EDGE = 0b010;
static const int HIG_EDGE = 0b100;
// Classifies the pixel at (x, y): NON_EDGE unless its entry in `candidates`
// is 1; otherwise LOW_EDGE below `low`, HIG_EDGE above `high`, and MED_EDGE
// in between.
inline static int getEdgeType(uint32_t x, uint32_t y) {
    const int candidate = rsGetElementAt_int(candidates, x, y);
    const float strength = rsGetElementAt_float(magnitude, x, y);
    if (candidate != 1) {
        return NON_EDGE;
    }
    if (strength < low) {
        return LOW_EDGE;
    }
    return strength > high ? HIG_EDGE : MED_EDGE;
}
uchar4 __attribute__((kernel)) hysteresis(uint32_t x, uint32_t y) {
uchar4 white = { 255, 255, 255, 255 };
uchar4 red = { 255, 0, 0, 255 };
uchar4 black = { 0, 0, 0, 255 };
int type = getEdgeType(x, y);
if (type) {
if (type & LOW_EDGE) {
return black;
}
if (type & HIG_EDGE) {
//rsDebug("wh : x=", x);
//rsDebug("wh : y=", y);
return white;
}
// it's medium, check nearest neighbours
type = getEdgeType(x - 1, y - 1);
type |= getEdgeType(x, y - 1);
type |= getEdgeType(x + 1, y - 1);
type |= getEdgeType(x - 1, y);
type |= getEdgeType(x + 1, y);
type |= getEdgeType(x - 1, y + 1);
type |= getEdgeType(x, y + 1);
type |= getEdgeType(x + 1, y + 1);
if (type & HIG_EDGE) {
//rsDebug("wh : x=", x);
//rsDebug("wh : y=", y);
return white;
}
if (type & MED_EDGE) {
// check further
type = getEdgeType(x - 2, y - 2);
type |= getEdgeType(x - 1, y - 2);
type |= getEdgeType(x, y - 2);
type |= getEdgeType(x + 1, y - 2);
type |= getEdgeType(x + 2, y - 2);
type |= getEdgeType(x - 2, y - 1);
type |= getEdgeType(x + 2, y - 1);
type |= getEdgeType(x - 2, y);
type |= getEdgeType(x + 2, y);
type |= getEdgeType(x - 2, y + 1);
type |= getEdgeType(x + 2, y + 1);
type |= getEdgeType(x - 2, y + 2);
type |= getEdgeType(x - 1, y + 2);
type |= getEdgeType(x, y + 2);
type |= getEdgeType(x + 1, y + 2);
type |= getEdgeType(x + 2, y + 2);
if (type & HIG_EDGE) {
//rsDebug("wh : x=", x);
//rsDebug("wh : y=", y);
return white;
}
}
}
return black;
}
After some debugging I found that:
uchar4 __attribute__((kernel)) hysteresis(uint32_t x, uint32_t y) {...}
returns white and black pixels so renderscript works properly I think.
Output is the same type as my previous renderscript filters (uchar4) which I assign to Bitmap with success.
I have no idea what I've done wrong.
Also my logcat prints:
V/RenderScript_jni: RS compat mode
V/RenderScript_jni: Unable to load libRSSupportIO.so, USAGE_IO not supported
V/RenderScript_jni: Unable to load BLAS lib, ONLY BNNM will be supported: java.lang.UnsatisfiedLinkError: Couldn't load blasV8 from loader dalvik.system.PathClassLoader[dexPath=/data/app/com.lukasz.edgeexamplers-20.apk,libraryPath=/data/app-lib/com.lukasz.edgeexamplers-20]: findLibrary returned null
E/RenderScript: Couldn't load libRSSupportIO.so
in every program which uses RenderScript, but the other programs work even with these warnings.
Update #1
As @Stephen Hines mentioned, there was an issue with reading out of bounds. I think I fixed it for now (without touching the RenderScript code) by changing these lines:
// Original launches: no launch options are passed here, unlike the
// replacement version that restricts the range.
edgeFilter.forEach_blur(allocationBlurred);
edgeFilter.forEach_compute_gradient(allocationMagnitude);
edgeFilter.forEach_suppress(allocationEdge);
edgeFilter.forEach_hysteresis(allocationOut);
into:
// Restrict every kernel launch to an inner rectangle starting 2 pixels in
// from the left/top, so the neighbourhood reads (up to 2 pixels away) stay
// inside the allocations.
// NOTE(review): if the end argument of setX/setY is exclusive, width - 3 and
// height - 3 skip one more column/row than the 2-pixel margin needs — confirm
// against the Script.LaunchOptions documentation.
Script.LaunchOptions sLaunchOpt = new Script.LaunchOptions();
sLaunchOpt.setX(2, width - 3);
sLaunchOpt.setY(2, height - 3);
edgeFilter.forEach_blur(allocationBlurred, sLaunchOpt);
edgeFilter.forEach_compute_gradient(allocationMagnitude, sLaunchOpt);
edgeFilter.forEach_suppress(allocationEdge, sLaunchOpt);
edgeFilter.forEach_hysteresis(allocationOut, sLaunchOpt);
But my problem is still not solved. Output is black as earlier.
I want to extract the red ball from one picture and get the detected ellipse matrix in picture.
Here is my example:
I threshold the picture, find the contour of red ball by using findContour() function and use fitEllipse() to fit an ellipse.
But what I want is to get the coefficients of this ellipse. Because fitEllipse() returns a rotated rectangle (RotatedRect), I think I need to rewrite this function.
An ellipse can be expressed as Ax^2 + By^2 + Cxy + Dx + Ey + F = 0, so I want to get u=(A,B,C,D,E,F), or u=(A,B,C,D,E) if F is 1 (to construct an ellipse matrix).
I read the source code of fitEllipse(). There are three SVD solves in total, and I think I can get the above coefficients from their results. But I am quite confused about what the result (the variable cv::Mat x) of each SVD solve represents, and why there are three of them.
Here is this function:
// Fits an ellipse to a 2-D point set and returns it as a RotatedRect
// (center, full axis lengths, angle in degrees). Works in three linear
// least-squares steps, all solved via SVD: (1) fit five general-form
// parameters on mean-centered points, (2) solve a 2x2 system for the ellipse
// center, (3) refit the three quadratic parameters about that center.
// Requires at least 5 points of depth CV_32F or CV_32S.
cv::RotatedRect cv::fitEllipse( InputArray _points )
{
Mat points = _points.getMat();
int i, n = points.checkVector(2);
int depth = points.depth();
CV_Assert( n >= 0 && (depth == CV_32F || depth == CV_32S));
RotatedRect box;
if( n < 5 )
CV_Error( CV_StsBadSize, "There should be at least 5 points to fit the ellipse" );
// New fitellipse algorithm, contributed by Dr. Daniel Weiss
Point2f c(0,0);
// gfp: general-form parameters (solution of steps 1 and 3).
// rp: result parameters — [0],[1] center offset, [2],[3] semi-axes, [4] angle in radians.
double gfp[5], rp[5], t;
const double min_eps = 1e-8;
bool is_float = depth == CV_32F;
// Two views of the same data; which one is read depends on is_float.
const Point* ptsi = points.ptr<Point>();
const Point2f* ptsf = points.ptr<Point2f>();
// Scratch storage for the system matrix and right-hand side; the same
// buffers are reused for all three systems below.
AutoBuffer<double> _Ad(n*5), _bd(n);
double *Ad = _Ad, *bd = _bd;
// first fit for parameters A - E
Mat A( n, 5, CV_64F, Ad );
Mat b( n, 1, CV_64F, bd );
Mat x( 5, 1, CV_64F, gfp );
// c becomes the mean of the input points.
for( i = 0; i < n; i++ )
{
Point2f p = is_float ? ptsf[i] : Point2f((float)ptsi[i].x, (float)ptsi[i].y);
c += p;
}
c.x /= n;
c.y /= n;
// One row per point: [-x^2, -y^2, -xy, x, y] * gfp = 10000, using
// coordinates relative to the mean c.
for( i = 0; i < n; i++ )
{
Point2f p = is_float ? ptsf[i] : Point2f((float)ptsi[i].x, (float)ptsi[i].y);
p -= c;
bd[i] = 10000.0; // 1.0?
Ad[i*5] = -(double)p.x * p.x; // A - C signs inverted as proposed by APP
Ad[i*5 + 1] = -(double)p.y * p.y;
Ad[i*5 + 2] = -(double)p.x * p.y;
Ad[i*5 + 3] = p.x;
Ad[i*5 + 4] = p.y;
}
// Solve 1: overdetermined n x 5 system; x aliases gfp, so the five fitted
// parameters land in gfp[0..4].
solve(A, b, x, DECOMP_SVD);
// now use general-form parameters A - E to find the ellipse center:
// differentiate general form wrt x/y to get two equations for cx and cy
A = Mat( 2, 2, CV_64F, Ad );
b = Mat( 2, 1, CV_64F, bd );
x = Mat( 2, 1, CV_64F, rp );
Ad[0] = 2 * gfp[0];
Ad[1] = Ad[2] = gfp[2];
Ad[3] = 2 * gfp[1];
bd[0] = gfp[3];
bd[1] = gfp[4];
// Solve 2: 2x2 system; rp[0], rp[1] receive the center relative to the mean c.
solve( A, b, x, DECOMP_SVD );
// re-fit for parameters A - C with those center coordinates
A = Mat( n, 3, CV_64F, Ad );
b = Mat( n, 1, CV_64F, bd );
x = Mat( 3, 1, CV_64F, gfp );
// One row per point: [dx^2, dy^2, dx*dy] * gfp = 1, where (dx, dy) is the
// point relative to the ellipse center found above.
for( i = 0; i < n; i++ )
{
Point2f p = is_float ? ptsf[i] : Point2f((float)ptsi[i].x, (float)ptsi[i].y);
p -= c;
bd[i] = 1.0;
Ad[i * 3] = (p.x - rp[0]) * (p.x - rp[0]);
Ad[i * 3 + 1] = (p.y - rp[1]) * (p.y - rp[1]);
Ad[i * 3 + 2] = (p.x - rp[0]) * (p.y - rp[1]);
}
// Solve 3: overdetermined n x 3 system; gfp[0..2] now hold the centered
// quadratic coefficients for x^2, y^2 and xy.
solve(A, b, x, DECOMP_SVD);
// store angle and radii
rp[4] = -0.5 * atan2(gfp[2], gfp[1] - gfp[0]); // convert from APP angle usage
if( fabs(gfp[2]) > min_eps )
t = gfp[2]/sin(-2.0 * rp[4]);
else // ellipse is rotated by an integer multiple of pi/2
t = gfp[1] - gfp[0];
// Semi-axes: sqrt(2 / |A + B -+ t|); left unconverted when the term is <= min_eps.
rp[2] = fabs(gfp[0] + gfp[1] - t);
if( rp[2] > min_eps )
rp[2] = std::sqrt(2.0 / rp[2]);
rp[3] = fabs(gfp[0] + gfp[1] + t);
if( rp[3] > min_eps )
rp[3] = std::sqrt(2.0 / rp[3]);
// Translate the center back by the mean and store full axis lengths.
box.center.x = (float)rp[0] + c.x;
box.center.y = (float)rp[1] + c.y;
box.size.width = (float)(rp[2]*2);
box.size.height = (float)(rp[3]*2);
// Ensure width <= height by swapping, which adds 90 degrees to the angle.
// NOTE(review): box.angle is assigned only inside this branch; otherwise it
// keeps whatever RotatedRect's default construction left — confirm against
// the upstream source.
if( box.size.width > box.size.height )
{
float tmp;
CV_SWAP( box.size.width, box.size.height, tmp );
box.angle = (float)(90 + rp[4]*180/CV_PI);
}
// Wrap the angle back towards the [-180, 360] range.
if( box.angle < -180 )
box.angle += 360;
if( box.angle > 360 )
box.angle -= 360;
return box;
}
The source code link: https://github.com/Itseez/opencv/blob/master/modules/imgproc/src/shapedescr.cpp
The function fitEllipse returns a RotatedRect that contains all the parameters of the ellipse.
An ellipse is defined by 5 parameters:
xc : x coordinate of the center
yc : y coordinate of the center
a : major semi-axis
b : minor semi-axis
theta : rotation angle
You can obtain these parameters like:
// Read the ellipse parameters back out of the RotatedRect returned by fitEllipse.
RotatedRect e = fitEllipse(points);
float xc = e.center.x;
float yc = e.center.y;
float a = e.size.width / 2; // NOTE(review): fitEllipse as listed above swaps so that width <= height
float b = e.size.height / 2;
float theta = e.angle; // in degrees
You can draw an ellipse with the function ellipse using the RotatedRect:
ellipse(image, e, Scalar(0,255,0));
or, equivalently using the ellipse parameters:
ellipse(res, Point(xc, yc), Size(a, b), theta, 0.0, 360.0, Scalar(0,255,0));
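If you need the coefficients of the implicit equation $Ax^2 + Bxy + Cy^2 + Dx + Ey + F = 0$, you can compute them from these parameters as follows (from Wikipedia), with $\theta$ converted to radians:
$A = a^2 \sin^2\theta + b^2 \cos^2\theta$
$B = 2\,(b^2 - a^2)\,\sin\theta\,\cos\theta$
$C = a^2 \cos^2\theta + b^2 \sin^2\theta$
$D = -2 A x_c - B y_c$
$E = -B x_c - 2 C y_c$
$F = A x_c^2 + B x_c y_c + C y_c^2 - a^2 b^2$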
So, you can get the parameters you need from the RotatedRect, and you don't need to change the function fitEllipse.
The solve function is used to solve linear systems or least-squares problems. Using the SVD decomposition method the system can be over-defined and/or the matrix src1 can be singular.
For more details on the algorithm, you can see the paper of Fitzgibbon that proposed this fit ellipse method.
Here is some code that worked for me which I based on the other responses on this thread.
def getConicCoeffFromEllipse(e):
    """Convert an ellipse description into normalised conic coefficients.

    ``e`` is indexed as ``((xc, yc), (width, height), angle)``: the centre,
    the two full axis lengths (halved here to get the semi-axes) and the
    rotation angle in degrees (converted to radians here).

    Returns a NumPy array ``[A, B, C, D, E, F] / F`` for the implicit
    equation ``Ax^2 + Bxy + Cy^2 + Dx + Ey + F = 0``, so the last entry is 1.
    See https://en.wikipedia.org/wiki/Ellipse for the formulas.
    """
    centre = e[0]
    axes = e[1]
    xc = centre[0]
    yc = centre[1]
    semi_a = axes[0] / 2
    semi_b = axes[1] / 2
    angle = math.radians(e[2])

    sin_t = math.sin(angle)
    cos_t = math.cos(angle)
    sin_sq = sin_t ** 2
    cos_sq = cos_t ** 2
    a_sq = semi_a * semi_a
    b_sq = semi_b * semi_b

    # Quadratic part of the conic.
    quad_xx = a_sq * sin_sq + b_sq * cos_sq
    quad_xy = 2 * (b_sq - a_sq) * sin_t * cos_t
    quad_yy = a_sq * cos_sq + b_sq * sin_sq

    # Linear and constant parts follow from moving the centre to (xc, yc).
    lin_x = -2 * quad_xx * xc - quad_xy * yc
    lin_y = -quad_xy * xc - 2 * quad_yy * yc
    const = quad_xx * xc * xc + quad_xy * xc * yc + quad_yy * yc * yc - a_sq * semi_b * semi_b

    # Scale so that the constant term becomes 1.
    return np.array([quad_xx, quad_xy, quad_yy, lin_x, lin_y, const]) / const
def getConicMatrixFromCoeff(c):
    """Build the symmetric 3x3 conic matrix from six coefficients.

    ``c`` is indexed as ``[a, b, c, d, e, f]``; the result is::

        [[ a,  b/2, d/2],
         [b/2,  c,  e/2],
         [d/2, e/2,  f ]]
    """
    coef_a, coef_b, coef_c = c[0], c[1], c[2]
    coef_d, coef_e, coef_f = c[3], c[4], c[5]

    # Off-diagonal entries are half of the mixed/linear coefficients.
    half_b = coef_b / 2
    half_d = coef_d / 2
    half_e = coef_e / 2

    rows = [
        [coef_a, half_b, half_d],
        [half_b, coef_c, half_e],
        [half_d, half_e, coef_f],
    ]
    return np.array(rows)
I have ported the code from this answer to the new C++ API. However, while I understand most of the code, I cannot grasp the idea behind the calc_shift() function or how the pixel shift is extracted. If someone could provide an explanation I would be really grateful. The function is as follows:
// Bisection search on [x1, x2] for a point where
//     f(x) = x + k * (x - cx)^3
// is within +/-1 of zero.
//
// x1, x2: current search interval.
// cx:     centre coordinate used in f.
// k:      coefficient of the cubic term.
// Returns the left endpoint x1 once |f(x1)| < 1. If the interval can no longer
// shrink (or its midpoint is NaN) before that happens - which is what occurs
// for k < 0, where f stays positive on the interval - the current x1 is
// returned instead of recursing forever.
// Writes one trace line to std::cerr per call.
float calc_shift(float x1,float x2,float cx,float k)
{
    const float thresh = 1;
    const float x3 = x1+(x2-x1)*0.5;   // midpoint of the interval
    const float res1 = x1+((x1-cx)*k*((x1-cx)*(x1-cx)));   // f(x1)
    const float res3 = x3+((x3-cx)*k*((x3-cx)*(x3-cx)));   // f(x3)
    std::cerr<<"x1: "<<x1<<" - "<<res1<<" x3: "<<x3<<" - "<<res3<<'\n';

    // Converged: f at the left endpoint is close enough to zero.
    if(res1>-thresh && res1<thresh)
        return x1;

    // f(x3) < 0 keeps the right half, otherwise the left half.
    const bool keep_right = res3<0;
    const float next_lo = keep_right ? x3 : x1;
    const float next_hi = keep_right ? x2 : x3;

    // Termination guard: a NaN midpoint (x3 != x3) or an unchanged interval
    // means another call would repeat this one exactly and never return.
    if(x3 != x3 || (next_lo == x1 && next_hi == x2))
        return x1;

    return calc_shift(next_lo,next_hi,cx,k);
}
and the way that the above function is called can be seen below:
// Image extent in pixels.
int w = src.cols;
int h = src.rows;
// Horizontal shifts: one computed for the centre Cx, one for the mirrored
// centre measured from the opposite side (w - Cx).
xShift = calc_shift(0, Cx - 1, Cx, k);
float newCenterX = w - Cx;
float xShift2 = calc_shift(0, newCenterX - 1, newCenterX, k);
// Vertical shifts: same scheme along y. The mirrored centre is measured
// against the image height (the previous `w - Cy` mixed up the axes).
yShift = calc_shift(0, Cy - 1, Cy, k);
float newCenterY = h - Cy;
float yShift2 = calc_shift(0, newCenterY - 1, newCenterY, k);
// Fraction of each dimension that remains after removing both shifts.
xScale = (w - xShift - xShift2) / w;
yScale = (h - yShift - yShift2) / h;
I would like to understand the above code because I also want to use it for the pincushion distortion case, where k < 0. If I use it as it is now, the code falls into an infinite loop when given a value of k < 0.