Kernel Function - iOS

I am drawing squares around the points I passed to the shader:
override func computedraw(computeCommandEncoder: MTLComputeCommandEncoder) {
    computeCommandEncoder.setComputePipelineState(pipelineState)
    computeCommandEncoder.setTexture(self.texture, index: 0)
    if pointsArray.count > 0 {
        var count: Int = 4
        computeCommandEncoder.setBytes(&pointsArray, length: MemoryLayout<float2>.stride, index: 0)
        computeCommandEncoder.setBytes(&count, length: MemoryLayout<Int>.stride, index: 1)
        let threadGroupCount = MTLSizeMake(2, 2, 1)
        let threadGroups = MTLSizeMake((self.texture?.width)! / threadGroupCount.width,
                                       (self.texture?.height)! / threadGroupCount.height,
                                       1)
        computeCommandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupCount)
    }
}
The points array:
var pointsArray: [float2] = [float2(0.40, -0.5), float2(0.20, -0.5), float2(0.0, 0.0), float2(0.56, -0.4)]
In the kernel function:
float2 touchPointF(float2 tap) {
    float deviceWidth = 2732;
    float deviceHeight = 2048;
    float2 touchPoint = float2(0, 0);
    touchPoint.x = ((tap.x + 1) * deviceWidth) / 2;
    touchPoint.y = ((-1 * (tap.y - 1)) * deviceHeight) / 2;
    return touchPoint;
}
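That is, it maps Metal's normalized device coordinates, where x and y run from -1 to 1 with +y up, to pixel coordinates with the origin at the top left (W = 2732 and H = 2048 are hard-coded for the device):

$$x_{px} = \frac{(x_{ndc} + 1)\,W}{2}, \qquad y_{px} = \frac{(1 - y_{ndc})\,H}{2}$$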
kernel void computeTool(constant float2 *point [[buffer(0)]],
                        constant int &pointCount [[buffer(1)]],
                        texture2d<float, access::read_write> des [[texture(0)]],
                        // texture2d<float, access::read> star [[texture(1)]],
                        uint2 gid [[thread_position_in_grid]])
{
    for (int i = 0; i < pointCount; ++i) {
        float2 x = touchPointF(point[i]);
        if ((gid.x > (uint(x.x) - 40) && (gid.x < (uint(x.x) + 40))) &&
            (gid.y > (uint(x.y) - 40) && gid.y < (uint(x.y) + 40))) {
            des.write(float4(float(pointCount) / 10, 0.0, 0.0, 1.0), gid);
        }
    }
}
I pass 4 points to the shader, but it only draws two squares on the screen. Is this a problem with the kernel function or with the kernel threading?

computeCommandEncoder.setBytes(&pointsArray, length: MemoryLayout<float2>.stride, index: 0)
Here, the length needs to be multiplied by the count of pointsArray: MemoryLayout<float2>.stride is the size of a single float2, so as written only the first point is guaranteed to be copied, and whatever the kernel reads past it is undefined.
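The corrected call (also deriving count from the array rather than hard-coding 4, which is an assumption about the surrounding code):

var count: Int = pointsArray.count
computeCommandEncoder.setBytes(&pointsArray,
                               length: MemoryLayout<float2>.stride * pointsArray.count,
                               index: 0)
computeCommandEncoder.setBytes(&count, length: MemoryLayout<Int>.stride, index: 1)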


How can I calculate the mean and variance value of an image with 16 channels using Metal Shading Language

How can I calculate the mean and variance of an image with 16 channels using Metal?
I want to calculate the mean and variance of each channel separately, e.g.:
kernel void meanandvariance(texture2d_array<float, access::read> in [[texture(0)]],
                            texture2d_array<float, access::write> out [[texture(1)]],
                            ushort3 gid [[thread_position_in_grid]],
                            ushort tid [[thread_index_in_threadgroup]],
                            ushort3 tg_size [[threads_per_threadgroup]]) {
}
There's probably a way to do this by creating a sequence of texture views on the input and output texture arrays and encoding an MPSImageStatisticsMeanAndVariance kernel invocation for each slice, as sketched below.
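A minimal sketch of that view-based approach (untested; sourceArray and destArray are hypothetical 16-slice rgba texture arrays, and the textures may need the appropriate usage flags for view creation):

import MetalPerformanceShaders

let meanAndVariance = MPSImageStatisticsMeanAndVariance(device: device)
for slice in 0..<sourceArray.arrayLength {
    // View one slice of each array as a plain 2D texture.
    let srcView = sourceArray.makeTextureView(pixelFormat: sourceArray.pixelFormat,
                                              textureType: .type2D,
                                              levels: 0..<1,
                                              slices: slice..<(slice + 1))!
    // Each destination slice is a 2x1 texture: MPS writes the mean to (0, 0)
    // and the variance to (1, 0).
    let dstView = destArray.makeTextureView(pixelFormat: destArray.pixelFormat,
                                            textureType: .type2D,
                                            levels: 0..<1,
                                            slices: slice..<(slice + 1))!
    meanAndVariance.encode(commandBuffer: commandBuffer,
                           sourceTexture: srcView,
                           destinationTexture: dstView)
}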
But let's take a look at how to do it ourselves. There are many different possible approaches, so I chose one that was simple and used some interesting results from statistics.
Essentially, we'll do the following:
1. Write a kernel that can produce a subset mean and variance for a single row of the image.
2. Write a kernel that can produce an overall mean and variance from the partial results from step 1.
Here are the kernels:
kernel void compute_row_mean_variance_array(texture2d_array<float, access::read> inTexture [[texture(0)]],
                                            texture2d_array<float, access::write> outTexture [[texture(1)]],
                                            uint3 tpig [[thread_position_in_grid]])
{
    uint row = tpig.x;
    uint slice = tpig.y;
    uint width = inTexture.get_width();
    if (row >= inTexture.get_height() || slice >= inTexture.get_array_size()) { return; }

    float4 mean(0.0f);
    float4 var(0.0f);
    for (uint col = 0; col < width; ++col) {
        float4 rgba = inTexture.read(ushort2(col, row), slice);
        // http://datagenetics.com/blog/november22017/index.html
        float weight = 1.0f / (col + 1);
        float4 oldMean = mean;
        mean = mean + (rgba - mean) * weight;
        var = var + (rgba - oldMean) * (rgba - mean);
    }
    var = var / width;

    outTexture.write(mean, ushort2(row, 0), slice);
    outTexture.write(var, ushort2(row, 1), slice);
}
kernel void reduce_mean_variance_array(texture2d_array<float, access::read> inTexture [[texture(0)]],
                                       texture2d_array<float, access::write> outTexture [[texture(1)]],
                                       uint3 tpig [[thread_position_in_grid]])
{
    uint width = inTexture.get_width();
    uint slice = tpig.x;
    // https://arxiv.org/pdf/1007.1012.pdf
    float4 mean(0.0f);
    float4 meanOfVar(0.0f);
    float4 varOfMean(0.0f);
    for (uint col = 0; col < width; ++col) {
        float weight = 1.0f / (col + 1);
        float4 oldMean = mean;
        float4 submean = inTexture.read(ushort2(col, 0), slice);
        mean = mean + (submean - mean) * weight;
        float4 subvar = inTexture.read(ushort2(col, 1), slice);
        meanOfVar = meanOfVar + (subvar - meanOfVar) * weight;
        varOfMean = varOfMean + (submean - oldMean) * (submean - mean);
    }
    float4 var = meanOfVar + varOfMean / width;

    outTexture.write(mean, ushort2(0, 0), slice);
    outTexture.write(var, ushort2(1, 0), slice);
}
In summary, to achieve step 1, we use an "online" (incremental) algorithm to calculate the partial mean/variance of the row in a way that's more numerically stable than just adding all the pixel values and dividing by the width. My reference for writing this kernel was the post linked in the kernel comment. Each thread in the grid writes its row's statistics to the appropriate column and slice of an intermediate texture array.
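Concretely, the update inside the loop is Welford's recurrence, with k the number of pixels seen so far (weight = 1/(col + 1) in the kernel):

$$\mu_k = \mu_{k-1} + \frac{x_k - \mu_{k-1}}{k}, \qquad M_k = M_{k-1} + (x_k - \mu_{k-1})(x_k - \mu_k)$$

and the population variance of the row is $M_n / n$, which is the final division by width.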
To achieve step 2, we need to find a statistically-sound way of computing the overall statistics from the partial results. This is quite simple in the case of finding the mean: the mean of the population is the mean of the means of the subsets (this holds when the sample size of each subset is the same; in the general case, the overall mean is a weighted sum of the subset means). The variance is trickier, but it turns out that the variance of the population is the sum of the mean of the variances of the subsets and the variance of the means of the subsets (the same caveat about equally-sized subsets applies here). This is a convenient fact that we can combine with our incremental approach above to produce the final mean and variance of each slice, which is written to the corresponding slice of the output texture.
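In symbols, with the image split into equally-sized rows, this is the law of total variance:

$$\operatorname{Var}(X) = \operatorname{E}\left[\operatorname{Var}(X \mid \text{row})\right] + \operatorname{Var}\left(\operatorname{E}[X \mid \text{row}]\right)$$

which is exactly the var = meanOfVar + varOfMean / width line in the reduce kernel (varOfMean accumulates the unnormalized sum of squared deviations of the row means, hence the division by width).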
For completeness, here's the Swift code I used to drive these kernels:
let library = device.makeDefaultLibrary()!
let meanVarKernelFunction = library.makeFunction(name: "compute_row_mean_variance_array")!
let meanVarComputePipelineState = try! device.makeComputePipelineState(function: meanVarKernelFunction)
let reduceKernelFunction = library.makeFunction(name: "reduce_mean_variance_array")!
let reduceComputePipelineState = try! device.makeComputePipelineState(function: reduceKernelFunction)

let width = sourceTexture.width
let height = sourceTexture.height
let arrayLength = sourceTexture.arrayLength

// Intermediate texture: one column per source row, mean in row 0, variance in row 1
let textureDescriptor = MTLTextureDescriptor.texture2DDescriptor(pixelFormat: .rgba32Float, width: width, height: height, mipmapped: false)
textureDescriptor.textureType = .type2DArray
textureDescriptor.arrayLength = arrayLength
textureDescriptor.width = height
textureDescriptor.height = 2
textureDescriptor.usage = [.shaderRead, .shaderWrite]
let partialResultsTexture = device.makeTexture(descriptor: textureDescriptor)!

// Final texture: mean at (0, 0), variance at (1, 0) in each slice
textureDescriptor.width = 2
textureDescriptor.height = 1
textureDescriptor.usage = .shaderWrite
let destTexture = device.makeTexture(descriptor: textureDescriptor)!

let commandBuffer = commandQueue.makeCommandBuffer()!
let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder()!

// Pass 1: one thread per (row, slice)
computeCommandEncoder.setComputePipelineState(meanVarComputePipelineState)
computeCommandEncoder.setTexture(sourceTexture, index: 0)
computeCommandEncoder.setTexture(partialResultsTexture, index: 1)
let meanVarGridSize = MTLSize(width: sourceTexture.height, height: sourceTexture.arrayLength, depth: 1)
let meanVarThreadgroupSize = MTLSizeMake(meanVarComputePipelineState.threadExecutionWidth, 1, 1)
let meanVarThreadgroupCount = MTLSizeMake((meanVarGridSize.width + meanVarThreadgroupSize.width - 1) / meanVarThreadgroupSize.width,
                                          (meanVarGridSize.height + meanVarThreadgroupSize.height - 1) / meanVarThreadgroupSize.height,
                                          1)
computeCommandEncoder.dispatchThreadgroups(meanVarThreadgroupCount, threadsPerThreadgroup: meanVarThreadgroupSize)

// Pass 2: one thread per slice
computeCommandEncoder.setComputePipelineState(reduceComputePipelineState)
computeCommandEncoder.setTexture(partialResultsTexture, index: 0)
computeCommandEncoder.setTexture(destTexture, index: 1)
let reduceThreadgroupSize = MTLSizeMake(1, 1, 1)
let reduceThreadgroupCount = MTLSizeMake(arrayLength, 1, 1)
computeCommandEncoder.dispatchThreadgroups(reduceThreadgroupCount, threadsPerThreadgroup: reduceThreadgroupSize)
computeCommandEncoder.endEncoding()

// Comparison path against MPS (meanVarKernel and sourceTexture2D are defined elsewhere)
let destTexture2DDesc = MTLTextureDescriptor.texture2DDescriptor(pixelFormat: .rgba32Float, width: 2, height: 1, mipmapped: false)
destTexture2DDesc.usage = .shaderWrite
let destTexture2D = device.makeTexture(descriptor: destTexture2DDesc)!
meanVarKernel.encode(commandBuffer: commandBuffer, sourceTexture: sourceTexture2D, destinationTexture: destTexture2D)

#if os(macOS)
let blitCommandEncoder = commandBuffer.makeBlitCommandEncoder()!
blitCommandEncoder.synchronize(resource: destTexture)
blitCommandEncoder.synchronize(resource: destTexture2D)
blitCommandEncoder.endEncoding()
#endif

commandBuffer.commit()
commandBuffer.waitUntilCompleted()
In my experiments, this program produced the same results as MPSImageStatisticsMeanAndVariance, give or take some differences on the order of 1e-7. It was also 2.5x slower than MPS on my Mac, probably due in part to failure to exploit latency hiding with granular parallelism.
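To inspect the per-slice results on the CPU (after waitUntilCompleted(), and on macOS after the blit synchronization above), a minimal read-back sketch, assuming the rgba32Float destTexture from the driver code:

var stats = [Float](repeating: 0, count: 2 * 4) // two rgba32Float texels: mean, then variance
for slice in 0..<arrayLength {
    destTexture.getBytes(&stats,
                         bytesPerRow: 2 * 4 * MemoryLayout<Float>.stride,
                         bytesPerImage: 0,
                         from: MTLRegionMake2D(0, 0, 2, 1),
                         mipmapLevel: 0,
                         slice: slice)
    print("slice \(slice): mean = \(Array(stats[0..<4])), variance = \(Array(stats[4..<8]))")
}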
For reference, here is the instance normalization kernel from Bender (linked below), which computes per-slice mean and variance with a threadgroup reduction:
#include <metal_stdlib>
using namespace metal;
kernel void instance_norm(constant float4* scale [[buffer(0)]],
                          constant float4* shift [[buffer(1)]],
                          texture2d_array<float, access::read> in [[texture(0)]],
                          texture2d_array<float, access::write> out [[texture(1)]],
                          ushort3 gid [[thread_position_in_grid]],
                          ushort tid [[thread_index_in_threadgroup]],
                          ushort3 tg_size [[threads_per_threadgroup]]) {
    ushort width = in.get_width();
    ushort height = in.get_height();
    const ushort thread_count = tg_size.x * tg_size.y;
    threadgroup float4 shared_mem [256];

    float4 sum = 0;
    for (ushort xIndex = gid.x; xIndex < width; xIndex += tg_size.x) {
        for (ushort yIndex = gid.y; yIndex < height; yIndex += tg_size.y) {
            sum += in.read(ushort2(xIndex, yIndex), gid.z);
        }
    }
    shared_mem[tid] = sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Reduce to 32 values
    sum = 0;
    if (tid < 32) {
        for (ushort i = tid + 32; i < thread_count; i += 32) {
            sum += shared_mem[i];
        }
    }
    shared_mem[tid] += sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Calculate mean
    sum = 0;
    if (tid == 0) {
        ushort top = min(ushort(32), thread_count);
        for (ushort i = 0; i < top; i += 1) {
            sum += shared_mem[i];
        }
        shared_mem[0] = sum / (width * height);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    const float4 mean = shared_mem[0];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Variance
    sum = 0;
    for (ushort xIndex = gid.x; xIndex < width; xIndex += tg_size.x) {
        for (ushort yIndex = gid.y; yIndex < height; yIndex += tg_size.y) {
            sum += pow(in.read(ushort2(xIndex, yIndex), gid.z) - mean, 2);
        }
    }
    shared_mem[tid] = sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Reduce to 32 values
    sum = 0;
    if (tid < 32) {
        for (ushort i = tid + 32; i < thread_count; i += 32) {
            sum += shared_mem[i];
        }
    }
    shared_mem[tid] += sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Calculate variance
    sum = 0;
    if (tid == 0) {
        ushort top = min(ushort(32), thread_count);
        for (ushort i = 0; i < top; i += 1) {
            sum += shared_mem[i];
        }
        shared_mem[0] = sum / (width * height);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    const float4 sigma = sqrt(shared_mem[0] + float4(1e-4));
    float4 multiplier = scale[gid.z] / sigma;
    for (ushort xIndex = gid.x; xIndex < width; xIndex += tg_size.x) {
        for (ushort yIndex = gid.y; yIndex < height; yIndex += tg_size.y) {
            float4 val = in.read(ushort2(xIndex, yIndex), gid.z);
            out.write(clamp((val - mean) * multiplier + shift[gid.z], -10.0, 10.0), ushort2(xIndex, yIndex), gid.z);
        }
    }
}
This is how Bender implements it, but I don't think it's correct. Can anybody verify?
https://github.com/xmartlabs/Bender/blob/master/Sources/Metal/instanceNorm.metal

HLSL alpha blending in geometry shader

I am rather new to HLSL and I am struggling with implementing a grass shader.
In the geometry shader I create quads which display the grass blades. However, when I try blending in the pixel shader, things get weird: sometimes everything behind a quad is ignored. I'm assuming it's a problem with the depth stencil.
Here is my shader:
//************
// VARIABLES *
//************
cbuffer cbPerObject
{
    float4x4 m_MatrixWorldViewProj : WORLDVIEWPROJECTION;
    float4x4 m_MatrixWorld : WORLD;
    float4x4 gMatrixViewInverse : VIEWINVERSE;
    float3 m_LightDir = { 2.0f, -5.0f, 0.0f };
}

RasterizerState FrontCulling
{
    CullMode = NONE;
};

SamplerState samLinear
{
    Filter = MIN_MAG_MIP_LINEAR;
    AddressU = Wrap; // or Mirror or Clamp or Border
    AddressV = Wrap; // or Mirror or Clamp or Border
};

BlendState EnableBlending
{
    BlendEnable[0] = TRUE;
    SrcBlend = SRC_ALPHA;
    DestBlend = INV_SRC_ALPHA;
    BlendOp = ADD;
    SrcBlendAlpha = ZERO;
    DestBlendAlpha = ZERO;
    BlendOpAlpha = ADD;
    RenderTargetWriteMask[0] = 0x0F;
};

DepthStencilState EnableDepth
{
    // Depth test parameters
    DepthEnable = true;
    DepthWriteMask = all;
    DepthFunc = less;
    StencilEnable = false;
};

Texture2D m_TextureDiffuse<
    string UIName = "Diffuse Texture";
    string UIWidget = "Texture";
    string ResourceName = "Grass.dds";
>;

Texture2D m_TextureDiffuseBlade<
    string UIName = "Diffuse Texture Blade";
    string UIWidget = "Texture";
    string ResourceName = "GrassBladeDiffuse.dds";
>;

Texture2D m_PerlinNoise<
    string UIName = "Perlin Texture";
    string UIWidget = "Texture";
    string ResourceName = "Perlin.dds";
>;

float gGrassHeight
<
    string UIName = "Grass Height";
    string UIWidget = "slider";
    float UIMin = 0;
    float UIMax = 10.0f;
    float UIStep = 0.01;
> = 0.6f;

float gGrassHeightRandom
<
    string UIName = "Grass Height Random";
    string UIWidget = "slider";
    float UIMin = 0;
    float UIMax = 1.0f;
    float UIStep = 0.01;
> = 1.0f;

float gGrassBend
<
    string UIName = "Grass Bend";
    string UIWidget = "slider";
    float UIMin = 0;
    float UIMax = 1.0f;
    float UIStep = 0.01;
> = 1.0f;

int gGrassBlades
<
    string UIName = "Grass Blades";
    string UIWidget = "slider";
    int UIMin = 1;
    int UIMax = 5.0f;
    int UIStep = 1;
> = 5;

float gGrassBladesSize
<
    string UIName = "Grass Blades Size";
    string UIWidget = "slider";
    float UIMin = 0;
    float UIMax = 1.0f;
    float UIStep = 0.01;
> = 0.2f;

float gGrassSpread<
    string UIName = "Grass Spread";
> = 5.0f;

float gTime;
//**********
// STRUCTS *
//**********
struct VS_DATA
{
    float3 Position : POSITION;
    float3 Normal : NORMAL;
    float2 TexCoord : TEXCOORD;
};

struct GS_DATA
{
    float4 Position : SV_POSITION;
    float3 Normal : NORMAL;
    float2 TexCoord : TEXCOORD0;
    bool Blade : FALSE;
};
//****************
// VERTEX SHADER *
//****************
VS_DATA MainVS(VS_DATA vsData)
{
    return vsData;
}

//******************
// GEOMETRY SHADER *
//******************
void CreateVertex(inout TriangleStream<GS_DATA> triStream, float3 pos, float3 normal, float2 texCoord, bool blade = true)
{
    //Step 1. Create a GS_DATA object
    GS_DATA temp = (GS_DATA)0;
    //Step 2. Transform the position using the WVP Matrix and assign it to (GS_DATA object).Position (Keep in mind: float3 -> float4)
    temp.Position = mul(float4(pos, 1), m_MatrixWorldViewProj);
    //Step 3. Transform the normal using the World Matrix and assign it to (GS_DATA object).Normal (Only Rotation, No translation!)
    temp.Normal = mul(normal, (float3x3)m_MatrixWorld);
    //Step 4. Assign texCoord to (GS_DATA object).TexCoord
    temp.TexCoord = texCoord;
    //Set if blade or not
    temp.Blade = blade;
    //Step 5. Append (GS_DATA object) to the TriangleStream parameter (TriangleStream::Append(...))
    triStream.Append(temp);
}

float3x3 AngleAxis3x3(float angle, float3 axis)
{
    float c, s;
    sincos(angle, s, c);
    float t = 1 - c;
    float x = axis.x;
    float y = axis.y;
    float z = axis.z;
    return float3x3(
        t * x * x + c,     t * x * y - s * z, t * x * z + s * y,
        t * x * y + s * z, t * y * y + c,     t * y * z - s * x,
        t * x * z - s * y, t * y * z + s * x, t * z * z + c
    );
}
[maxvertexcount(5*6*3 + 3)]
//[instance(16)]
void GrassGenerator(triangle VS_DATA vertices[3], inout TriangleStream<GS_DATA> triStream)//, uint InstanceID : SV_GSInstanceID)
{
    float3 basePoint, top;
    //Step 1. Calculate the basePoint
    basePoint = (vertices[0].Position + vertices[1].Position + vertices[2].Position) / 3;
    //Step 2. Calculate the normal of the basePoint
    float3 normal = normalize((vertices[0].Normal + vertices[1].Normal + vertices[2].Normal) / 3);
    //Original vertices
    CreateVertex(triStream, vertices[0].Position, vertices[0].Normal, vertices[0].TexCoord, false);
    CreateVertex(triStream, vertices[1].Position, vertices[1].Normal, vertices[1].TexCoord, false);
    CreateVertex(triStream, vertices[2].Position, vertices[2].Normal, vertices[2].TexCoord, false);
    triStream.RestartStrip();

    float3 left, right, grassnormal;
    for (int j = 0; j < gGrassBlades; j++)
    {
        float3 position = basePoint + float3(m_PerlinNoise.SampleLevel(samLinear, vertices[j].TexCoord, 0).y - 0.5f,
                                             m_PerlinNoise.SampleLevel(samLinear, vertices[j].TexCoord, 0).z - 0.5f,
                                             0) * gGrassSpread;
        top = position + (gGrassHeight * normal);
        float3 grassDirection = float3(1, 0, 0) * gGrassBladesSize;
        float xAngle = 0.0f;
        for (int i = 0; i < 3; i++)
        {
            float3x3 rotation = AngleAxis3x3(xAngle, normal);
            grassDirection = mul(grassDirection, rotation);
            //Step 5. Calculate the normal of the grass
            float3 leftEdge, rightEdge;
            leftEdge = (position - grassDirection) - top;
            rightEdge = (position + grassDirection) - top;
            grassnormal = normalize(cross(leftEdge, rightEdge));
            //Create spike geometry
            CreateVertex(triStream, top - grassDirection, grassnormal, float2(0, 0));
            CreateVertex(triStream, position - grassDirection, grassnormal, float2(0, 1));
            CreateVertex(triStream, position + grassDirection, grassnormal, float2(1, 1));
            triStream.RestartStrip();
            CreateVertex(triStream, top + grassDirection, grassnormal, float2(1, 0));
            CreateVertex(triStream, position + grassDirection, grassnormal, float2(1, 1));
            CreateVertex(triStream, top - grassDirection, grassnormal, float2(0, 0));
            triStream.RestartStrip();
            static const float PI = 3.14159265f;
            xAngle = 2 * PI / 3;
        }
    }
}
//***************
// PIXEL SHADER *
//***************
float4 MainPS(GS_DATA input) : SV_TARGET
{
    input.Normal = -normalize(input.Normal);
    float alpha;
    float3 color;
    if (input.Blade) {
        alpha = m_TextureDiffuseBlade.Sample(samLinear, input.TexCoord).a;
        color = m_TextureDiffuseBlade.Sample(samLinear, input.TexCoord).rgb;
    }
    else {
        alpha = m_TextureDiffuse.Sample(samLinear, input.TexCoord).a;
        color = m_TextureDiffuse.Sample(samLinear, input.TexCoord).rgb;
    }
    float s = max(dot(m_LightDir, input.Normal), 0.4f);
    return float4(color * s, alpha);
}
//*************
// TECHNIQUES *
//*************
technique10 DefaultTechnique
{
    pass p0 {
        SetDepthStencilState(EnableDepth, 0);
        SetBlendState(EnableBlending, float4(0.0f, 0.0f, 0.0f, 0.0f), 0xFFFFFFFF);
        SetRasterizerState(FrontCulling);
        SetVertexShader(CompileShader(vs_4_0, MainVS()));
        SetGeometryShader(CompileShader(gs_5_0, GrassGenerator()));
        SetPixelShader(CompileShader(ps_4_0, MainPS()));
    }
}

Function in Metal

I have the following function:
float4 blur(float rad, texture2d<float> tex2D, sampler sampler2D, float2 textureCoordinate) {
    float width = tex2D.get_width();
    float height = tex2D.get_height();
    float weight = 1 / ((2 * rad + 1) * (2 * rad + 1));
    float4 blured_color = float4(0, 0, 0, 0);
    for (int i = -1 * rad; i <= rad; i++) {
        for (int j = -1 * rad; j <= rad; j++) {
            blured_color += tex2D.sample(sampler2D, textureCoordinate + float2(i / width, j / height)) * weight;
        }
    }
    return blured_color;
}
It blurs the given fragment.
My problem is that when I call this function, it doesn't work properly: it just makes the picture darker. But when I write the same code without wrapping it in a function, it works fine:
fragment float4 blured_background_fragment(VertexOut interpolated [[ stage_in ]],
                                           texture2d<float> tex2D [[ texture(0) ]],
                                           sampler sampler2D [[ sampler(0) ]])
{
    float4 color = tex2D.sample(sampler2D, interpolated.textureCoordinate);
    float3 color3 = float3(color[0], color[1], color[2]);
    if (is_skin(color3) && !(interpolated.color[0] == 1 && interpolated.color[1] == 1 && interpolated.color[2] == 1)) {
        float width = tex2D.get_width();
        float height = tex2D.get_height();
        float rad = 13;
        float weight = 1 / ((2 * rad + 1) * (2 * rad + 1));
        float4 blured_color = float4(0, 0, 0, 0);
        for (int i = -1 * rad; i <= rad; i++) {
            for (int j = -1 * rad; j <= rad; j++) {
                blured_color += tex2D.sample(sampler2D, interpolated.textureCoordinate + float2(i / width, j / height)) * weight;
            }
        }
        // Here I try to call this blur function
        // float4 blured_color = blur(13, tex2D, sampler2D, interpolated.textureCoordinate);
        return blured_color * 0.43 + color * 0.57;
    }
    else {
        return tex2D.sample(sampler2D, interpolated.textureCoordinate);
    }
}

Computing gradient orientation in C++ using OpenCV functions

Can anyone help me out with this?
I am trying to calculate the gradient orientation using the Sobel operator in OpenCV, for the gradient in the x and y directions. I am using the atan2 function to compute the angle in radians, which I later convert to degrees, but all the angles I am getting are between 0 and 90 degrees.
My expectation is to get angles between 0 and 360 degrees. The image I am using is grayscale. The code segment is below.
Mat PeripheralArea;
Mat grad_x, grad_y; // matrices for the gradients in the x and y directions
int off_set_y = 0, off_set_x = 0;
int scale = 1, num_bins = 8, bin = 0;
int delta = -1;
int ddepth = CV_16S;

GaussianBlur(PeripheralArea, PeripheralArea, Size(3, 3), 0, 0, BORDER_DEFAULT);
Sobel(PeripheralArea, grad_y, ddepth, 0, 1, 3, scale, delta, BORDER_DEFAULT);
Sobel(PeripheralArea, grad_x, ddepth, 1, 0, 3, scale, delta, BORDER_DEFAULT);

for (int row_y1 = 0, row_y2 = 0; row_y1 < grad_y.rows / 5, row_y2 < grad_x.rows / 5; row_y1++, row_y2++) {
    for (int col_x1 = 0, col_x2 = 0; col_x1 < grad_y.cols / 5, col_x2 < grad_x.cols / 5; col_x1++, col_x2++) {
        gradient_direction_radians = (double) atan2((double) grad_y.at<uchar>(row_y1 + off_set_y, col_x1 + off_set_x),
                                                    (double) grad_x.at<uchar>(row_y2 + off_set_y, col_x2 + off_set_x));
        gradient_direction_degrees = (int) (180 * gradient_direction_radians / 3.1415);
        gradient_direction_degrees = gradient_direction_degrees < 0
                                         ? gradient_direction_degrees + 360
                                         : gradient_direction_degrees;
    }
}
Note: the off_set_x and off_set_y variables are not part of the computation; they offset to the different square blocks for which I eventually want to compute a histogram feature vector.
You have specified that the destination depth of Sobel() is CV_16S.
Yet, when you access grad_x and grad_y, you use .at<uchar>(), implying that their elements are 8-bit unsigned quantities, when in fact they are 16-bit signed. You could use .at<short>() instead, but to me it looks like there are a number of issues with your code, not the least of which is that there is an OpenCV function that does exactly what you want.
Use cv::phase(), and replace your for loops with
cv::Mat gradient_angle_degrees;
bool angleInDegrees = true;
cv::phase(grad_x, grad_y, gradient_angle_degrees, angleInDegrees);
(Note that cv::phase() expects floating-point inputs, so the CV_16S gradients should first be converted, e.g. with convertTo().)
I ran into this same need when I dove into doing some edge detection in C++.
For the orientation of the gradient I use atan2(); this standard API defines its +y and +x the same way we usually traverse a 2D image.
I plotted it to show my understanding:
///////////////////////////////
// Quadrants of image:
// 3(-dx,-dy) | 4(+dx,-dy) [-pi,0]
// ------------------------->+x
// 2(-dx,+dy) | 1(+dx,+dy) [0,pi]
// v
// +y
///////////////////////////////
// Definition of arctan2():
// -135(-dx,-dy) | -45(+dx,-dy)
// ------------------------->+x
// 135(-dx,+dy) | +45(+dx,+dy)
// v
// +y
///////////////////////////////
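In other words, atan2() returns an angle in [-pi, +pi]; to map it to [0, 360) degrees and then into a byte, the code below uses:

$$\theta = \operatorname{atan2}(g_y, g_x) \cdot \frac{180}{\pi}, \qquad
\theta_{360} = \begin{cases} \theta, & \theta \ge 0 \\ \theta + 360, & \theta < 0 \end{cases}, \qquad
\theta_{byte} = \theta_{360} \cdot \frac{255}{360}$$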
Here is how I compute the gradient:
bool gradient(double*& magnitude, double*& orientation, double* src, int width, int height, string file) {
    if (src == NULL)
        return false;
    if (width <= 0 || height <= 0)
        return false;

    double gradient_x_correlation[3*3] = {-0.5, 0.0, 0.5,
                                          -0.5, 0.0, 0.5,
                                          -0.5, 0.0, 0.5};
    double gradient_y_correlation[3*3] = {-0.5,-0.5,-0.5,
                                           0.0, 0.0, 0.0,
                                           0.5, 0.5, 0.5};

    double *Gx = NULL;
    double *Gy = NULL;
    this->correlation(Gx, src, gradient_x_correlation, width, height, 3);
    this->correlation(Gy, src, gradient_y_correlation, width, height, 3);
    if (Gx == NULL || Gy == NULL)
        return false;

    //magnitude
    magnitude = new double[sizeof(double)*width*height];
    if (magnitude == NULL)
        return false;
    memset(magnitude, 0, sizeof(double)*width*height);

    double gx = 0.0;
    double gy = 0.0;
    double gm = 0.0;
    for (int j=0; j<height; j++) {
        for (int i=0; i<width; i++) {
            gx = pow(Gx[i+j*width],2);
            gy = pow(Gy[i+j*width],2);
            gm = sqrt(pow(Gx[i+j*width],2)+pow(Gy[i+j*width],2));
            if (gm >= 255.0) {
                return false;
            }
            magnitude[i+j*width] = gm;
        }
    }

    //orientation
    orientation = new double[sizeof(double)*width*height];
    if (orientation == NULL)
        return false;
    memset(orientation, 0, sizeof(double)*width*height);

    double ori = 0.0;
    double dtmp = 0.0;
    double ori_normalized = 0.0;
    for (int j=0; j<height; j++) {
        for (int i=0; i<width; i++) {
            gx = (Gx[i+j*width]);
            gy = (Gy[i+j*width]);
            ori = atan2(Gy[i+j*width], Gx[i+j*width])/PI*(180.0); //[-pi,+pi]
            if (gx >= 0 && gy >= 0) { //[Quadrant 1]:[0,90] to be [0,63]
                if (ori < 0) {
                    printf("[Err1QUA]ori:%.1f\n", ori);
                    return false;
                }
                ori_normalized = (ori)*255.0/360.0;
                if (ori != 0.0 && dtmp != ori) {
                    printf("[Quadrant 1]orientation: %.1f to be %.1f(%d)\n", ori, ori_normalized, (uint8_t)ori_normalized);
                    dtmp = ori;
                }
            }
            else if (gx >= 0 && gy < 0) { //[Quadrant 4]:[270,360) equal to [-90, 0) to be [191,255]
                if (ori > 0) {
                    printf("[Err4QUA]orientation:%.1f\n", ori);
                    return false;
                }
                ori_normalized = (360.0+ori)*255.0/360.0;
                if (ori != 0.0 && dtmp != ori) {
                    printf("[Quadrant 4]orientation:%.1f to be %.1f(%d)\n", ori, ori_normalized, (uint8_t)ori_normalized);
                    dtmp = ori;
                }
            }
            else if (gx < 0 && gy >= 0) { //[Quadrant 2]:(90,180] to be [64,127]
                if (ori < 0) {
                    printf("[Err2QUA]orientation:%.1f\n", ori);
                    return false;
                }
                ori_normalized = (ori)*255.0/360.0;
                if (ori != 0.0 && dtmp != ori) {
                    printf("[Quadrant 2]orientation: %.1f to be %.1f(%d)\n", ori, ori_normalized, (uint8_t)ori_normalized);
                    dtmp = ori;
                }
            }
            else if (gx < 0 && gy < 0) { //[Quadrant 3]:(180,270) equal to (-180, -90) to be [128,190]
                if (ori > 0) {
                    printf("[Err3QUA]orientation:%.1f\n", ori);
                    return false;
                }
                ori_normalized = (360.0+ori)*255.0/360.0;
                if (ori != 0.0 && dtmp != ori) {
                    printf("[Quadrant 3]orientation:%.1f to be %.1f(%d)\n", ori, ori_normalized, (uint8_t)ori_normalized);
                    dtmp = ori;
                }
            }
            else {
                printf("[EXCEPTION]orientation:%.1f\n", ori);
                return false;
            }
            orientation[i+j*width] = ori_normalized;
        }
    }
    return true;
}
Here is how I compute the cross-correlation:
bool correlation(double*& dst, double* src, double* kernel, int width, int height, int window) {
    if (src == NULL || kernel == NULL)
        return false;
    if (width <= 0 || height <= 0 || width < window || height < window)
        return false;

    dst = new double[sizeof(double)*width*height];
    if (dst == NULL)
        return false;
    memset(dst, 0, sizeof(double)*width*height);

    int ii = 0;
    int jj = 0;
    int nn = 0;
    int mm = 0;
    double max = std::numeric_limits<double>::min();
    double min = std::numeric_limits<double>::max();
    double range = std::numeric_limits<double>::max();
    for (int j=0; j<height; j++) {
        for (int i=0; i<width; i++) {
            for (int m=0; m<window; m++) {
                for (int n=0; n<window; n++) {
                    ii = i+(n-window/2);
                    jj = j+(m-window/2);
                    nn = n;
                    mm = m;
                    if (ii >= 0 && ii < width && jj >= 0 && jj < height) {
                        dst[i+j*width] += src[ii+jj*width]*kernel[nn+mm*window];
                    }
                    else {
                        dst[i+j*width] += 0;
                    }
                }
            }
            if (dst[i+j*width] > max)
                max = dst[i+j*width];
            else if (dst[i+j*width] < min)
                min = dst[i+j*width];
        }
    }

    //normalize the double matrix to fit in a uint8_t matrix
    range = max - min;
    double norm = 0.0;
    printf("correlated matrix max:%.1f, min:%.1f, range:%.1f\n", max, min, range);
    for (int j=0; j<height; j++) {
        for (int i=0; i<width; i++) {
            norm = dst[i+j*width];
            norm = 255.0*norm/range;
            dst[i+j*width] = norm;
        }
    }
    return true;
}
For testing I use an image of a hollow rectangle; you can download it from my sample.
The orientation of the gradient along the hollow rectangle in my sample image moves from 0 to 360 degrees clockwise (quadrant 1 to 2 to 3 to 4).
Here is a print which traces the orientation:
[Quadrant 1]orientation: 45.0 to be 31.9(31)
[Quadrant 1]orientation: 90.0 to be 63.8(63)
[Quadrant 2]orientation: 135.0 to be 95.6(95)
[Quadrant 2]orientation: 180.0 to be 127.5(127)
[Quadrant 3]orientation:-135.0 to be 159.4(159)
[Quadrant 3]orientation:-116.6 to be 172.4(172)
[Quadrant 4]orientation:-90.0 to be 191.2(191)
[Quadrant 4]orientation:-63.4 to be 210.1(210)
[Quadrant 4]orientation:-45.0 to be 223.1(223)
You can see more source code about digital image processing on my GitHub :)

Pixel shader performance on Xbox

I've got a pixel shader (below) that I'm using with XNA. On my laptop (with a fairly weak graphics card) it runs a little jerky, but OK. I've just tried running it on the Xbox, and it's horrible!
There's nothing else to the game (it's just a fractal renderer), so it's got to be the pixel shader causing the issues. I also think it's the PS code because lowering the iteration count makes it OK. I've also checked, and the GC delta is zero.
Are there any HLSL functions that are no-nos on the Xbox? I must be doing something wrong here; performance can't be that bad!
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
float i;
float oldBailoutTest = 0;
float bailoutTest = 0;
for(i = 0; i < Iterations; i++)
{
z = ComPow(z, ZPower) + c;
bailoutTest = z.x * z.x + z.y * z.y;
if(bailoutTest >= ZPower * ZPower)
{
break;
}
oldBailoutTest = bailoutTest;
}
float normalisedIterations = i / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (i >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
Below is FractalBase.fxh:
float4x4 MatrixTransform : register(vs, c0);
float2 Pan;
float Zoom;
float Aspect;

void SpriteVertexShader(inout float4 Colour : COLOR0,
                        inout float2 texCoord : TEXCOORD0,
                        inout float4 position : SV_Position)
{
    position = mul(position, MatrixTransform);
    // Convert the position from screen space into complex coordinates
    texCoord = (position) * Zoom * float2(1, Aspect) - float2(Pan.x, -Pan.y);
}
EDIT: I did try removing the conditional by using lots of lerps; however, when I did that I got loads of artifacts (and not the kind that "belong in a museum"!). I changed things around and fixed a few logic errors; however, the key was to multiply the GreaterThan result by 1 + epsilon, to account for rounding errors that turn 0.9999 into 0 when truncated to an integer. See the fixed code below:
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float GreaterThan(float x, float y)
{
return ((x - y) / (2 * abs(x - y)) + 0.5) * 1.001;
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
int i;
float oldBailoutTest = 0;
float bailoutTest = 0;
int KeepGoing = 1;
int DoneIterations = Iterations;
int Bailout = 0;
for(i = 0; i < Iterations; i++)
{
z = lerp(z, ComPow(z, ZPower) + c, KeepGoing);
bailoutTest = lerp(bailoutTest, z.x * z.x + z.y * z.y, KeepGoing);
Bailout = lerp(Bailout, GreaterThan(bailoutTest, ZPower * ZPower), -abs(Bailout) + 1);
KeepGoing = lerp(KeepGoing, 0.0, Bailout);
DoneIterations = lerp(DoneIterations, min(i, DoneIterations), Bailout);
oldBailoutTest = lerp(oldBailoutTest, bailoutTest, KeepGoing);
}
float normalisedIterations = DoneIterations / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (DoneIterations >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
The Xbox GPU has a pretty large branch block size (pixels are shaded in large groups that must all take the same path), so dynamic branching on the Xbox isn't always a win. Also, the compiler isn't always the most effective at emitting dynamic branches, which your code seems to rely on.
Look into the [branch] attribute: http://msdn.microsoft.com/en-us/library/bb313972%28v=xnagamestudio.31%29.aspx
Also, if you remove the early bailout, does the PC's performance become more similar to the Xbox's?
Keep in mind that modern graphics cards are actually quite a bit faster than the Xenon GPU by now.
