DX11 Compute Shader / SharpDX deferred tiled lighting, point light problems - sharpdx

I have just finished porting my engine from XNA to SharpDX (DX11).
Everything has gone really well and I have solved most of my issues without having to ask for help, but now I'm really stuck. Maybe I just need another set of eyes to look over my code, so here it is.
I'm implementing tile-based lighting (point lights only for now), and I'm basing my code on the Intel sample because it's not as messy as the ATI one.
My problem is that the lights move with the camera. I have looked all over the place for a fix and I have tried everything (am I crazy?).
I have made sure all my normal and light vectors are in view space and normalized (still the same result).
I have tried the inverse view matrix, the inverse projection matrix, a mix of the two, and a few other bits from around the net, but I can't fix it.
So here is my CPU code:
Dim viewSpaceLPos As Vector3 = Vector3.Transform(New Vector3(pointlight.PosRad.X, pointlight.PosRad.Y, pointlight.PosRad.Z), Engine.Camera.EyeTransform)
Dim lightMatrix As Matrix = Matrix.Scaling(pointlight.PosRad.W) * Matrix.Translation(New Vector3(pointlight.PosRad.X, pointlight.PosRad.Y, pointlight.PosRad.Z))
Here is my CS shader code:
[numthreads(GROUP_WIDTH, GROUP_HEIGHT, GROUP_DEPTH)]
void TileLightingCS(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID)
{
int2 globalCoords = dispatchThreadID.xy;
uint groupIndex = GroupThreadID.y * GROUP_WIDTH + GroupThreadID.x;
float minZSample = FrameBufferCamNearFar.x;
float maxZSample = FrameBufferCamNearFar.y;
float2 gbufferDim;
DepthBuffer.GetDimensions(gbufferDim.x, gbufferDim.y);
float2 screenPixelOffset = float2(2.0f, -2.0f) / gbufferDim;
float2 positionScreen = (float2(globalCoords)+0.5f) * screenPixelOffset.xy + float2(-1.0f, 1.0f);
float depthValue = DepthBuffer[globalCoords].r;
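// Projection._43 / (depthValue - Projection._33) recovers linear view-space Z,
// assuming Projection is uploaded untransposed so _33/_43 are the z-scale and
// z-bias terms of the perspective projection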
float3 positionView = ComputePositionViewFromZ(positionScreen, Projection._43 / (depthValue - Projection._33));
// Avoid shading skybox/background or otherwise invalid pixels
float viewSpaceZ = positionView.z;
bool validPixel = viewSpaceZ >= FrameBufferCamNearFar.x && viewSpaceZ < FrameBufferCamNearFar.y;
[flatten] if (validPixel)
{
minZSample = min(minZSample, viewSpaceZ);
maxZSample = max(maxZSample, viewSpaceZ);
}
// How many total lights?
uint totalLights, dummy;
InputBuffer.GetDimensions(totalLights, dummy);
// Initialize shared memory light list and Z bounds
if (groupIndex == 0)
{
sTileNumLights = 0;
sMinZ = 0x7F7FFFFF; // Max float
sMaxZ = 0;
}
GroupMemoryBarrierWithGroupSync();
if (maxZSample >= minZSample) {
InterlockedMin(sMinZ, asuint(minZSample));
InterlockedMax(sMaxZ, asuint(maxZSample));
}
GroupMemoryBarrierWithGroupSync();
float minTileZ = asfloat(sMinZ);
float maxTileZ = asfloat(sMaxZ);
// Work out scale/bias from [0, 1]
float2 tileScale = float2(FrameBufferCamNearFar.zw) * rcp(float(2 * GROUP_WIDTH));
float2 tileBias = tileScale - float2(GroupID.xy);
// Now work out composite projection matrix
// Relevant matrix columns for this tile frusta
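// c1, c2 and c4 are the columns of a projection matrix restricted to this tile;
// the side planes below fall out of the usual clip-plane extraction (c4 +/- c1, c4 +/- c2)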
float4 c1 = float4(Projection._11 * tileScale.x, 0.0f, tileBias.x, 0.0f);
float4 c2 = float4(0.0f, -Projection._22 * tileScale.y, tileBias.y, 0.0f);
float4 c4 = float4(0.0f, 0.0f, 1.0f, 0.0f);
// Derive frustum planes
float4 frustumPlanes[6];
// Sides
frustumPlanes[0] = c4 - c1;
frustumPlanes[1] = c4 + c1;
frustumPlanes[2] = c4 - c2;
frustumPlanes[3] = c4 + c2;
// Near/far
frustumPlanes[4] = float4(0.0f, 0.0f, 1.0f, -minTileZ);
frustumPlanes[5] = float4(0.0f, 0.0f, -1.0f, maxTileZ);
// Normalize frustum planes (near/far already normalized)
[unroll] for (uint i = 0; i < 4; ++i)
{
frustumPlanes[i] *= rcp(length(frustumPlanes[i].xyz));
}
// Cull lights for this tile
for (uint lightIndex = groupIndex; lightIndex < totalLights; lightIndex += (GROUP_WIDTH * GROUP_HEIGHT))
{
PointLight light = InputBuffer[lightIndex];
float3 lightVS = light.PosRad.xyz;// mul(float4(light.Pos.xyz, 1), View);
// Cull: point light sphere vs tile frustum
bool inFrustum = true;
[unroll]
for (uint i = 0; i < 6; ++i)
{
float d = dot(frustumPlanes[i], float4(lightVS, 1.0f));
inFrustum = inFrustum && (d >= -light.PosRad.w);
}
[branch]
if (inFrustum)
{
uint listIndex;
InterlockedAdd(sTileNumLights, 1, listIndex);
sTileLightIndices[listIndex] = lightIndex;
}
}
GroupMemoryBarrierWithGroupSync();
uint numLights = sTileNumLights;
if (all(globalCoords < FrameBufferCamNearFar.zw))
{
float4 NormalMap = NormalBuffer[globalCoords];
float3 normal = DecodeNormal(NormalMap);
if (numLights > 0)
{
float3 lit = float3(0.0f, 0.0f, 0.0f);
for (uint tileLightIndex = 0; tileLightIndex < numLights; ++tileLightIndex)
{
PointLight light = InputBuffer[sTileLightIndices[tileLightIndex]];
float3 lDir = light.PosRad.xyz - positionView;
lDir = normalize(lDir);
float3 nl = saturate(dot(lDir, normal));
lit += ((light.Color.xyz * light.Color.a) * nl) * 0.1f;
}
PointLightColor[globalCoords] = float4(lit, 1);
}
else
{
PointLightColor[globalCoords] = 0;
}
}
GroupMemoryBarrierWithGroupSync();
}
So I know the culling works because lights are drawn; they just move with the camera.
Could it be a handedness issue?
Am I setting my CPU light code up right?
Have I messed my spaces up?
What am I missing?
Am I reconstructing my position from depth wrong? (I don't think it's this, because the culling works.)
P.S. I write depth out like this:
VS shader:
float4 viewSpacePos = mul(float4(input.Position,1), WV);
output.Depth=viewSpacePos.z ;
PS shader:
-input.Depth.x / FarClip
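One thing worth double-checking, given the commented-out mul in the culling loop above: the shader compares lightVS directly against positionView, which is in view space, so the positions in InputBuffer have to be in view space as well (or be transformed per light inside the shader). Below is a hedged sketch of the in-shader variant; it assumes InputBuffer holds world-space positions and that a View matrix is bound in the compute shader's constant buffer, and it is meant as a space-consistency check rather than the confirmed fix.
// Hedged sketch: keep light and pixel positions in the same (view) space.
float3 LightPositionToView(float3 lightPosWS, float4x4 view)
{
    return mul(float4(lightPosWS, 1.0f), view).xyz;
}
// ...inside the culling loop, replacing "float3 lightVS = light.PosRad.xyz;":
// float3 lightVS = LightPositionToView(light.PosRad.xyz, View);
Whichever space is chosen, the CPU side has to match it: if the viewSpaceLPos computed above is what actually ends up in the light buffer, it must be rebuilt from the current camera every frame; if the raw world-space PosRad is uploaded instead, the transform belongs in the shader.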

Related

HLSL: Which DDX DDY are expected for TextureCube.SampleGrad()

I am wondering which DDX DDY values the SampleGrad() function expects for a TextureCube object.
I know that it's the change in UV coordinates for 2D textures, so I thought it would be the change in the direction vector in this case. However, this does not seem to be the case.
I get different results if I try to use the Sample function vs. SampleGrad:
Sample:
// calculate reflected ray
float3 reflRay = reflect(-viewDir, normal);
// reflection map lookup
return reflectionMap.Sample(linearSampler, reflRay);
SampleGrad:
// calculate reflected ray
float3 reflRay = reflect(-viewDir, normal);
// reflection map lookup
float3 dxr = ddx(reflRay);
float3 dyr = ddy(reflRay);
return reflectionMap.SampleGrad(linearSampler, reflRay, dxr, dyr);
I still don't know which values for DDX and DDY are required, but I found an acceptable workaround that computes the level of detail from my gradients. Unfortunately, the quality of this solution is not as good as a real Sample function with anisotropic filtering.
In case anyone needs it:
The computation is described in: https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#LODCalculation
My HLSL implementation:
// calculate reflected ray
float3 reflRay = reflect(-viewDir, normal);
// reflection map lookup
float3 dxr = ddx(reflRay);
float3 dyr = ddy(reflRay);
// cubemap size for lod computation
float reflWidth, reflHeight;
reflectionMap.GetDimensions(reflWidth, reflHeight);
// calculate lod based on raydiffs
float lod = calcLod(getCubeDiff(reflRay, dxr).xy * reflWidth, getCubeDiff(reflRay, dyr).xy * reflHeight);
return reflectionMap.SampleLevel(linearSampler, reflRay, lod).rgb;
Helper functions:
float pow2(float x) {
return x * x;
}
// calculates texture coordinates [-1, 1] for the view direction (xy values must be divided by axisMajorValue for proper [-1, 1] range).
// z coordinate is the faceId
float3 getCubeCoord(float3 viewDir, out float axisMajorValue)
{
// according to dx spec: https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#PointSampling
// Choose the largest magnitude component of the input vector. Call this magnitude of this value AxisMajor. In the case of a tie, the following precedence should occur: Z, Y, X.
int axisMajor = 0;
int axisFlip = 0;
axisMajorValue = 0.0;
[unroll] for (int i = 0; i < 3; ++i)
{
if (abs(viewDir[i]) >= axisMajorValue)
{
axisMajor = i;
axisFlip = viewDir[i] < 0.0f ? 1 : 0;
axisMajorValue = abs(viewDir[i]);
}
}
int faceId = axisMajor * 2 + axisFlip;
// Select and mirror the minor axes as defined by the TextureCube coordinate space. Call this new 2d coordinate Position.
int axisMinor1 = axisMajor == 0 ? 2 : 0; // first coord is x or z
int axisMinor2 = 3 - axisMajor - axisMinor1;
// Project the coordinate onto the cube by dividing the components Position by AxisMajor.
//float u = viewDir[axisMinor1] / axisMajorValue;
//float v = -viewDir[axisMinor2] / axisMajorValue;
// don't project for getCubeDiff function!
float u = viewDir[axisMinor1];
float v = -viewDir[axisMinor2];
switch (faceId)
{
case 0:
case 5:
u *= -1.0f;
break;
case 2:
v *= -1.0f;
break;
}
return float3(u, v, float(faceId));
}
float3 getCubeDiff(float3 ray, float3 diff)
{
// from: https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#LODCalculation
// Using TC, determine which component is of the largest magnitude, as when calculating the texel location. If any of the components are equivalent, precedence is as follows: Z, Y, X. The absolute value of this will be referred to as AxisMajor.
// select and mirror the minor axes of TC as defined by the TextureCube coordinate space to generate TC'.uv
float axisMajor;
float3 tuv = getCubeCoord(ray, axisMajor);
// select and mirror the minor axes of the partial derivative vectors as defined by the TextureCube coordinate space, generating 2 new partial derivative vectors dX'.uv & dY'.uv.
float derivateMajor;
float3 duv = getCubeCoord(diff, derivateMajor);
// Calculate 2 new dX and dY vectors for future calculations as follows:
// dX.uv = (AxisMajor*dX'.uv - TC'.uv*DerivativeMajorX)/(AxisMajor*AxisMajor)
float3 res;
res.z = 0.0;
res.xy = (axisMajor * duv.xy - tuv.xy * derivateMajor) / (axisMajor * axisMajor);
return res * 0.5;
}
// dx, dy in pixel coordinates
float calcLod(float2 dX, float2 dY)
{
// from: https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#LODCalculation
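// A, B, C and F below are the coefficients of the ellipse swept out by the two
// gradient vectors in texel space; lengthX/lengthY are its axis lengths and the
// LOD is log2 of the longer one, following the spec section linked above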
float A = pow2(dX.y) + pow2(dY.y);
float B = -2.0 * (dX.x * dX.y + dY.x * dY.y);
float C = pow2(dX.x) + pow2(dY.x);
float F = pow2(dX.x * dY.y - dY.x * dX.y);
float p = A - C;
float q = A + C;
float t = sqrt(pow2(p) + pow2(B));
float lengthX = sqrt(abs(F * (t+p) / ( t * (q+t))) + abs(F * (t-p) / ( t * (q+t))));
float lengthY = sqrt(abs(F * (t-p) / ( t * (q-t))) + abs(F * (t+p) / ( t * (q-t))));
return log2(max(lengthX,lengthY));
}

HLSL alpha blending in geometry shader

I am rather new to HLSL and I am struggling with implementing a grass shader.
In the geometry shader I create quads which will display the grass blades. However, when I try blending in the pixel shader things get weird: sometimes everything behind the quad is ignored. I'm assuming it's a problem with the depth-stencil state.
This is the result:
Here is my shader:
//************
// VARIABLES *
//************
cbuffer cbPerObject
{
float4x4 m_MatrixWorldViewProj : WORLDVIEWPROJECTION;
float4x4 m_MatrixWorld : WORLD;
float4x4 gMatrixViewInverse : VIEWINVERSE;
float3 m_LightDir = { 2.0f,-5.0f,0.0f };
}
RasterizerState FrontCulling
{
CullMode = NONE;
};
SamplerState samLinear
{
Filter = MIN_MAG_MIP_LINEAR;
AddressU = Wrap;// or Mirror or Clamp or Border
AddressV = Wrap;// or Mirror or Clamp or Border
};
BlendState EnableBlending
{
BlendEnable[0] = TRUE;
SrcBlend = SRC_ALPHA;
DestBlend = INV_SRC_ALPHA;
BlendOp = ADD;
SrcBlendAlpha = ZERO;
DestBlendAlpha = ZERO;
BlendOpAlpha = ADD;
RenderTargetWriteMask[0] = 0x0F;
};
DepthStencilState EnableDepth
{
// Depth test parameters
DepthEnable = true;
DepthWriteMask = all;
DepthFunc = less;
StencilEnable = false;
};
Texture2D m_TextureDiffuse<
string UIName = "Diffuse Texture";
string UIWidget = "Texture";
string ResourceName = "Grass.dds";
>;
Texture2D m_TextureDiffuseBlade<
string UIName = "Diffuse Texture Blade";
string UIWidget = "Texture";
string ResourceName = "GrassBladeDiffuse.dds";
>;
Texture2D m_PerlinNoise<
string UIName = "Perlin Texture";
string UIWidget = "Texture";
string ResourceName = "Perlin.dds";
>;
float gGrassHeight
<
string UIName = "Grass Height";
string UIWidget = "slider";
float UIMin = 0;
float UIMax = 10.0f;
float UIStep = 0.01;
> = 0.6f;
float gGrassHeightRandom
<
string UIName = "Grass Height Random";
string UIWidget = "slider";
float UIMin = 0;
float UIMax = 1.0f;
float UIStep = 0.01;
> = 1.0f;
float gGrassBend
<
string UIName = "Grass Bend";
string UIWidget = "slider";
float UIMin = 0;
float UIMax = 1.0f;
float UIStep = 0.01;
> = 1.0f;
int gGrassBlades
<
string UIName = "Grass Blades";
string UIWidget = "slider";
int UIMin = 1;
int UIMax = 5.0f;
int UIStep = 1;
> = 5;
float gGrassBladesSize
<
string UIName = "Grass Blades Size";
string UIWidget = "slider";
float UIMin = 0;
float UIMax = 1.0f;
float UIStep = 0.01;
> = 0.2f;
float gGrassSpread<
string UIName = "Grass Spread";
> = 5.0f;
float gTime;
//**********
// STRUCTS *
//**********
struct VS_DATA
{
float3 Position : POSITION;
float3 Normal : NORMAL;
float2 TexCoord : TEXCOORD;
};
struct GS_DATA
{
float4 Position : SV_POSITION;
float3 Normal : NORMAL;
float2 TexCoord : TEXCOORD0;
bool Blade : FALSE;
};
//****************
// VERTEX SHADER *
//****************
VS_DATA MainVS(VS_DATA vsData)
{
return vsData;
}
//******************
// GEOMETRY SHADER *
//******************
void CreateVertex(inout TriangleStream<GS_DATA> triStream, float3 pos, float3 normal, float2 texCoord, bool blade = true)
{
//Step 1. Create a GS_DATA object
GS_DATA temp = (GS_DATA)0;
//Step 2. Transform the position using the WVP Matrix and assign it to (GS_DATA object).Position (Keep in mind: float3 -> float4)
temp.Position = mul(float4(pos, 1), m_MatrixWorldViewProj);
//Step 3. Transform the normal using the World Matrix and assign it to (GS_DATA object).Normal (Only Rotation, No translation!)
temp.Normal = mul(normal, (float3x3)m_MatrixWorld);
//Step 4. Assign texCoord to (GS_DATA object).TexCoord
temp.TexCoord = texCoord;
//set if blade or not
temp.Blade = blade;
//Step 5. Append (GS_DATA object) to the TriangleStream parameter (TriangleStream::Append(...))
triStream.Append(temp);
}
float3x3 AngleAxis3x3(float angle, float3 axis)
{
float c, s;
sincos(angle, s, c);
float t = 1 - c;
float x = axis.x;
float y = axis.y;
float z = axis.z;
return float3x3(
t * x * x + c, t * x * y - s * z, t * x * z + s * y,
t * x * y + s * z, t * y * y + c, t * y * z - s * x,
t * x * z - s * y, t * y * z + s * x, t * z * z + c
);
}
[maxvertexcount(5*6*3 +3)]
//[instance(16)]
void GrassGenerator(triangle VS_DATA vertices[3], inout TriangleStream<GS_DATA> triStream)//, uint InstanceID : SV_GSInstanceID)
{
float3 basePoint, top;
//Step 1. Calculate The basePoint
basePoint = (vertices[0].Position + vertices[1].Position + vertices[2].Position) / 3;
//Step 2. Calculate The normal of the basePoint
float3 normal = normalize((vertices[0].Normal + vertices[1].Normal + vertices[2].Normal) / 3);
// original vertices
CreateVertex(triStream, vertices[0].Position, vertices[0].Normal, vertices[0].TexCoord, false);
CreateVertex(triStream, vertices[1].Position, vertices[1].Normal, vertices[1].TexCoord, false);
CreateVertex(triStream, vertices[2].Position, vertices[2].Normal, vertices[2].TexCoord, false);
triStream.RestartStrip();
float3 left, right, grassnormal;
for (int j = 0; j < gGrassBlades; j++)
{
float3 position = basePoint + float3(m_PerlinNoise.SampleLevel(samLinear, vertices[j].TexCoord, 0).y - 0.5f, m_PerlinNoise.SampleLevel(samLinear, vertices[j].TexCoord, 0).z - 0.5f, 0)*gGrassSpread;
top = position + (gGrassHeight * normal);
float3 grassDirection = float3(1, 0, 0) * gGrassBladesSize;
float xAngle = 0.0f;
for (int i = 0; i < 3; i++)
{
float3x3 rotation = AngleAxis3x3(xAngle, normal);
grassDirection = mul(grassDirection, rotation);
//Step 5. Calculate The Normal of the grass
float3 leftEdge, rightEdge;
leftEdge = (position - grassDirection) - top;
rightEdge = (position + grassDirection) - top;
grassnormal = normalize(cross(leftEdge, rightEdge));
//Create Spike Geometry
CreateVertex(triStream, top - grassDirection, grassnormal, float2(0, 0));
CreateVertex(triStream, position - grassDirection, grassnormal, float2(0, 1));
CreateVertex(triStream, position + grassDirection, grassnormal, float2(1, 1));
triStream.RestartStrip();
CreateVertex(triStream, top + grassDirection, grassnormal, float2(1, 0));
CreateVertex(triStream, position + grassDirection, grassnormal, float2(1, 1));
CreateVertex(triStream, top - grassDirection, grassnormal, float2(0, 0));
triStream.RestartStrip();
static const float PI = 3.14159265f;
xAngle = 2 * PI / 3;
}
}
}
//***************
// PIXEL SHADER *
//***************
float4 MainPS(GS_DATA input) : SV_TARGET
{
input.Normal = -normalize(input.Normal);
float alpha;
float3 color;
if (input.Blade) {
alpha = m_TextureDiffuseBlade.Sample(samLinear,input.TexCoord).a;
color = m_TextureDiffuseBlade.Sample(samLinear,input.TexCoord).rgb;
}
else {
alpha = m_TextureDiffuse.Sample(samLinear,input.TexCoord).a;
color = m_TextureDiffuse.Sample(samLinear,input.TexCoord).rgb;
}
float s = max(dot(m_LightDir, input.Normal), 0.4f);
return float4(color*s,alpha);
}
//*************
// TECHNIQUES *
//*************
technique10 DefaultTechnique
{
pass p0 {
SetDepthStencilState(EnableDepth, 0);
SetBlendState(EnableBlending, float4(0.0f, 0.0f, 0.0f, 0.0f), 0xFFFFFFFF);
SetRasterizerState(FrontCulling);
SetVertexShader(CompileShader(vs_4_0, MainVS()));
SetGeometryShader(CompileShader(gs_5_0, GrassGenerator()));
SetPixelShader(CompileShader(ps_4_0, MainPS()));
}
}
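Regarding the depth-stencil suspicion in the question: with DepthWriteMask = all, even fully transparent parts of each blended quad still write depth, so they can hide geometry rasterized after them, which matches the "ignores everything behind the quad" symptom. A common pattern, shown here as a hedged sketch rather than the confirmed fix for this effect, is to clip nearly transparent texels in the pixel shader (or to disable depth writes for the blended pass). MainPS_Clipped and the 0.1f cutoff are illustrative names/values only:
// Hedged sketch: discard (nearly) transparent texels so they neither write
// depth nor occlude the geometry behind the grass quad.
float4 MainPS_Clipped(GS_DATA input) : SV_TARGET
{
    input.Normal = -normalize(input.Normal);
    float4 tex = input.Blade
        ? m_TextureDiffuseBlade.Sample(samLinear, input.TexCoord)
        : m_TextureDiffuse.Sample(samLinear, input.TexCoord);
    clip(tex.a - 0.1f); // arbitrary example threshold
    float s = max(dot(m_LightDir, input.Normal), 0.4f);
    return float4(tex.rgb * s, tex.a);
}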

First two fragment shader outputs are different

I'm currently trying to get this bokeh shader to work with GPUImage: http://blenderartists.org/forum/showthread.php?237488-GLSL-depth-of-field-with-bokeh-v2-4-(update)
This is what I've got at the moment:
precision mediump float;
varying highp vec2 textureCoordinate;
varying highp vec2 textureCoordinate2;
uniform sampler2D inputImageTexture;
uniform sampler2D inputImageTexture2;
uniform float inputImageTextureWidth;
uniform float inputImageTextureHeight;
#define PI 3.14159265
float width = inputImageTextureWidth; //texture width
float height = inputImageTextureHeight; //texture height
vec2 texel = vec2(1.0/width,1.0/height);
//uniform variables from external script
uniform float focalDepth; //focal distance value in meters, but you may use autofocus option below
uniform float focalLength; //focal length in mm
uniform float fstop; //f-stop value
bool showFocus = false; //show debug focus point and focal range (red = focal point, green = focal range)
float znear = 0.1; //camera clipping start
float zfar = 5.0; //camera clipping end
//------------------------------------------
//user variables
int samples = 3; //samples on the first ring
int rings = 3; //ring count
bool manualdof = false; //manual dof calculation
float ndofstart = 1.0; //near dof blur start
float ndofdist = 2.0; //near dof blur falloff distance
float fdofstart = 1.0; //far dof blur start
float fdofdist = 3.0; //far dof blur falloff distance
float CoC = 0.03;//circle of confusion size in mm (35mm film = 0.03mm)
bool vignetting = false; //use optical lens vignetting?
float vignout = 1.3; //vignetting outer border
float vignin = 0.0; //vignetting inner border
float vignfade = 22.0; //f-stops till vignete fades
bool autofocus = false; //use autofocus in shader? disable if you use external focalDepth value
vec2 focus = vec2(0.5, 0.5); // autofocus point on screen (0.0,0.0 - left lower corner, 1.0,1.0 - upper right)
float maxblur = 1.0; //clamp value of max blur (0.0 = no blur,1.0 default)
float threshold = 0.5; //highlight threshold;
float gain = 2.0; //highlight gain;
float bias = 0.5; //bokeh edge bias
float fringe = 0.7; //bokeh chromatic aberration/fringing
bool noise = false; //use noise instead of pattern for sample dithering
float namount = 0.0001; //dither amount
bool depthblur = false; //blur the depth buffer?
float dbsize = 1.25; //depthblursize
/*
next part is experimental
not looking good with small sample and ring count
looks okay starting from samples = 4, rings = 4
*/
bool pentagon = false; //use pentagon as bokeh shape?
float feather = 0.4; //pentagon shape feather
//------------------------------------------
float penta(vec2 coords) //pentagonal shape
{
float scale = float(rings) - 1.3;
vec4 HS0 = vec4( 1.0, 0.0, 0.0, 1.0);
vec4 HS1 = vec4( 0.309016994, 0.951056516, 0.0, 1.0);
vec4 HS2 = vec4(-0.809016994, 0.587785252, 0.0, 1.0);
vec4 HS3 = vec4(-0.809016994,-0.587785252, 0.0, 1.0);
vec4 HS4 = vec4( 0.309016994,-0.951056516, 0.0, 1.0);
vec4 HS5 = vec4( 0.0 ,0.0 , 1.0, 1.0);
vec4 one = vec4( 1.0 );
vec4 P = vec4((coords),vec2(scale, scale));
vec4 dist = vec4(0.0);
float inorout = -4.0;
dist.x = dot( P, HS0 );
dist.y = dot( P, HS1 );
dist.z = dot( P, HS2 );
dist.w = dot( P, HS3 );
dist = smoothstep( -feather, feather, dist );
inorout += dot( dist, one );
dist.x = dot( P, HS4 );
dist.y = HS5.w - abs( P.z );
dist = smoothstep( -feather, feather, dist );
inorout += dist.x;
return clamp( inorout, 0.0, 1.0 );
}
float bdepth(vec2 coords) //blurring depth
{
float d = 0.0;
float kernel[9];
vec2 offset[9];
vec2 wh = vec2(texel.x, texel.y) * dbsize;
offset[0] = vec2(-wh.x,-wh.y);
offset[1] = vec2( 0.0, -wh.y);
offset[2] = vec2( wh.x, -wh.y);
offset[3] = vec2(-wh.x, 0.0);
offset[4] = vec2( 0.0, 0.0);
offset[5] = vec2( wh.x, 0.0);
offset[6] = vec2(-wh.x, wh.y);
offset[7] = vec2( 0.0, wh.y);
offset[8] = vec2( wh.x, wh.y);
kernel[0] = 1.0/16.0; kernel[1] = 2.0/16.0; kernel[2] = 1.0/16.0;
kernel[3] = 2.0/16.0; kernel[4] = 4.0/16.0; kernel[5] = 2.0/16.0;
kernel[6] = 1.0/16.0; kernel[7] = 2.0/16.0; kernel[8] = 1.0/16.0;
for( int i=0; i<9; i++ )
{
float tmp = texture2D(inputImageTexture2, coords + offset[i]).r;
d += tmp * kernel[i];
}
return d;
}
vec3 color(vec2 coords,float blur) //processing the sample
{
vec3 col = vec3(0.0);
col.r = texture2D(inputImageTexture, coords + vec2(0.0,1.0)*texel*fringe*blur).r;
col.g = texture2D(inputImageTexture, coords + vec2(-0.866,-0.5)*texel*fringe*blur).g;
col.b = texture2D(inputImageTexture, coords + vec2(0.866,-0.5)*texel*fringe*blur).b;
vec3 lumcoeff = vec3(0.299,0.587,0.114);
float lum = dot(col.rgb, lumcoeff);
float thresh = max((lum-threshold)*gain, 0.0);
return col+mix(vec3(0.0),col,thresh*blur);
}
vec2 rand(vec2 coord) //generating noise/pattern texture for dithering
{
float noiseX = ((fract(1.0-coord.s*(width/2.0))*0.25)+(fract(coord.t*(height/2.0))*0.75))*2.0-1.0;
float noiseY = ((fract(1.0-coord.s*(width/2.0))*0.75)+(fract(coord.t*(height/2.0))*0.25))*2.0-1.0;
if (noise)
{
noiseX = clamp(fract(sin(dot(coord ,vec2(12.9898,78.233))) * 43758.5453),0.0,1.0)*2.0-1.0;
noiseY = clamp(fract(sin(dot(coord ,vec2(12.9898,78.233)*2.0)) * 43758.5453),0.0,1.0)*2.0-1.0;
}
return vec2(noiseX,noiseY);
}
vec3 debugFocus(vec3 col, float blur, float depth)
{
float edge = 0.002*depth; //distance based edge smoothing
float m = clamp(smoothstep(0.0,edge,blur),0.0,1.0);
float e = clamp(smoothstep(1.0-edge,1.0,blur),0.0,1.0);
col = mix(col,vec3(1.0,1.0,0.0),(1.0-m)*0.6);
col = mix(col,vec3(0.0,1.0,1.0),((1.0-e)-(1.0-m))*0.2);
return col;
}
float linearize(float depth)
{
return -zfar * znear / (depth * (zfar - znear) - zfar);
}
float vignette()
{
float dist = distance(textureCoordinate.xy, vec2(0.5,0.5));
dist = smoothstep(vignout+(fstop/vignfade), vignin+(fstop/vignfade), dist);
return clamp(dist,0.0,1.0);
}
void main()
{
//scene depth calculation
float depth = linearize(texture2D(inputImageTexture2, textureCoordinate2.xy).x);
if (depthblur)
{
depth = linearize(bdepth(textureCoordinate2.xy));
}
//focal plane calculation
float fDepth = focalDepth;
if (autofocus)
{
fDepth = linearize(texture2D(inputImageTexture2, focus).x);
}
//dof blur factor calculation
float blur = 0.0;
if (manualdof)
{
float a = depth-fDepth; //focal plane
float b = (a-fdofstart)/fdofdist; //far DoF
float c = (-a-ndofstart)/ndofdist; //near Dof
blur = (a>0.0)?b:c;
}
else
{
float f = focalLength; //focal length in mm
float d = fDepth*1000.0; //focal plane in mm
float o = depth*1000.0; //depth in mm
float a = (o*f)/(o-f);
float b = (d*f)/(d-f);
float c = (d-f)/(d*fstop*CoC);
blur = abs(a-b)*c;
}
blur = clamp(blur,0.0,1.0);
// calculation of pattern for ditering
vec2 noise = rand(textureCoordinate.xy)*namount*blur;
// getting blur x and y step factor
float w = (1.0/width)*blur*maxblur+noise.x;
float h = (1.0/height)*blur*maxblur+noise.y;
// calculation of final color
vec3 col = vec3(0.0);
if(blur < 0.05) //some optimization thingy
{
col = texture2D(inputImageTexture, textureCoordinate.xy).rgb;
}
else
{
col = texture2D(inputImageTexture, textureCoordinate.xy).rgb;
float s = 1.0;
int ringsamples;
for (int i = 1; i <= rings; i += 1)
{
ringsamples = i * samples;
for (int j = 0 ; j < ringsamples ; j += 1)
{
float step = PI*2.0 / float(ringsamples);
float pw = (cos(float(j)*step)*float(i));
float ph = (sin(float(j)*step)*float(i));
float p = 1.0;
if (pentagon)
{
p = penta(vec2(pw,ph));
}
col += color(textureCoordinate.xy + vec2(pw*w,ph*h),blur)*mix(1.0,(float(i))/(float(rings)),bias)*p;
s += 1.0*mix(1.0,(float(i))/(float(rings)),bias)*p;
}
}
col /= s; //divide by sample count
}
if (showFocus)
{
col = debugFocus(col, blur, depth);
}
if (vignetting)
{
col *= vignette();
}
gl_FragColor.rgb = col;
gl_FragColor.a = 1.0;
}
This is my bokeh filter, a subclass of GPUImageTwoInputFilter:
@implementation GPUImageBokehFilter
- (id)init;
{
NSString *fragmentShaderPathname = [[NSBundle mainBundle] pathForResource:@"BokehShader" ofType:@"fsh"];
NSString *fragmentShaderString = [NSString stringWithContentsOfFile:fragmentShaderPathname encoding:NSUTF8StringEncoding error:nil];
if (!(self = [super initWithFragmentShaderFromString:fragmentShaderString]))
{
return nil;
}
focalDepthUniform = [filterProgram uniformIndex:@"focalDepth"];
focalLengthUniform = [filterProgram uniformIndex:@"focalLength"];
fStopUniform = [filterProgram uniformIndex:@"fstop"];
[self setFocalDepth:1.0];
[self setFocalLength:35.0];
[self setFStop:2.2];
return self;
}
#pragma mark -
#pragma mark Accessors
- (void)setFocalDepth:(float)focalDepth {
_focalDepth = focalDepth;
[self setFloat:_focalDepth forUniform:focalDepthUniform program:filterProgram];
}
- (void)setFocalLength:(float)focalLength {
_focalLength = focalLength;
[self setFloat:_focalLength forUniform:focalLengthUniform program:filterProgram];
}
- (void)setFStop:(CGFloat)fStop {
_fStop = fStop;
[self setFloat:_fStop forUniform:fStopUniform program:filterProgram];
}
@end
And finally, this is how I use said filter:
@implementation ViewController {
GPUImageBokehFilter *bokehFilter;
GPUImagePicture *bokehMap;
UIImage *inputImage;
}
- (void)viewDidLoad
{
[super viewDidLoad];
inputImage = [UIImage imageNamed:@"stones"];
bokehMap = [[GPUImagePicture alloc] initWithImage:[UIImage imageNamed:@"bokehmask"]];
_backgroundImage.image = inputImage;
bokehFilter = [[GPUImageBokehFilter alloc] init];
[self processImage];
}
- (IBAction)dataInputUpdated:(id)sender {
[self processImage];
}
- (void *)processImage {
dispatch_async(dispatch_get_global_queue( DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
GPUImagePicture *gpuPicture = [[GPUImagePicture alloc] initWithImage:inputImage];
[gpuPicture addTarget:bokehFilter];
[gpuPicture processImage];
[bokehMap addTarget:bokehFilter];
[bokehMap processImage];
[bokehFilter useNextFrameForImageCapture];
[bokehFilter setFloat:inputImage.size.width forUniformName:@"inputImageTextureWidth"];
[bokehFilter setFloat:inputImage.size.height forUniformName:@"inputImageTextureHeight"];
UIImage *blurredImage = [bokehFilter imageFromCurrentFramebuffer];
dispatch_async(dispatch_get_main_queue(), ^{
[self displayNewImage:blurredImage];
});
});
}
- (void)displayNewImage:(UIImage*)newImage {
[UIView transitionWithView:_backgroundImage
duration:.6f
options:UIViewAnimationOptionTransitionCrossDissolve
animations:^{
_backgroundImage.image = newImage;
} completion:nil];
}
...
The first image is the one I'm trying to blur, the second one is a random gradient to test the shader's depth map thingy:
When I start the app on my iPhone, I get this:
After moving the slider (which triggers the dataInputUpdated: method), I get this:
While that admittedly looks much better than the first image, I still have some problems with this:
There's a diagonal noisy line (inside the red lines I put on the picture) that appears to be unblurred.
The top left of the image is blurry, even though it shouldn't be.
Why do I get this weird behavior? Shouldn't the shader output be the same every time?
Also, how do I get it to respect the depth map? My GLSL shader knowledge is very limited, so please be patient.
The diagonal artifact appears to be caused by your test gradient. You can see that it occurs at about the same place as where your gradient goes to completely white. Try spreading out the gradient so it only reaches 1.0 or 0.0 at the very corners of the image.
It's a pretty big question, and I can't make a full answer because I would really need to test the thing out.
But a few points: The final image that you put up is hard to work with. Because the image has been upscaled so much, I can't tell if it's actually blurred or if it just appears blurry because of the resolution. Regardless, the amount of blur that you're getting (when compared to the original link that you provided) suggests that something isn't working with the shader.
Another thing that concerns me is the //some optimization thingy comment that you've got in there. This is the sort of thing that's going to be responsible for an ugly line in your final output. Saying that you won't have any blur when blur < 0.05 isn't necessarily something you can do! I would expect a nasty artifact where the shader transitions from the blur path into the 'optimized' part.
Hope that sheds some light, and good luck!
Have you tried enabling showFocus? This should show the focal point in red and the focal range in green which should help with debugging. You could also try enabling autofocus to ensure that the centre of the image is in focus, because at the moment it's not obvious which distance should be in focus, due to the linearize function changing coordinate systems. After that try tweaking fstop to get the desired amount of blur. You will probably also find that you will need greater than samples = 3 and rings = 3 to produce a smooth bokeh effect.
Your answers helped me get on the right track, and after a few hours of fiddling around with my code and the shader, I managed to get all bugs fixed. Here's what caused them and how I fixed them:
The ugly diagonal line was caused by the linearize() method, so I removed it and made the shader use the RGB values (or, to be more precise: only the R value) from the depth map without processing them first.
The blue-ish image I got from the shader was caused by my own incompetence. These two lines had to be put before the calls to processImage:
[bokehFilter setFloat:inputImage.size.width forUniformName:@"inputImageTextureWidth"];
[bokehFilter setFloat:inputImage.size.height forUniformName:@"inputImageTextureHeight"];
In hindsight, it's obvious why I only got results the second time I used the shader. After fixing those bugs, I went on to optimize it a bit to keep the execution time as low as possible, and now I can tell it to render 8 samples/4 rings and it does so in less than a second. Here's what that looks like:
Thanks for the answers, everyone, I probably wouldn't have gotten those bugs fixed without you.

Shader code optimization

I have this code snippet (for cubemap PCF filtering). I would like to optimize it for shader model 2. I tried eliminating the branches with permutation matrices stored in uniforms, but that requires too many uniforms (2 × 24).
float3 l = normalize(ldir);
float3 al = abs(l);
float3 off2, off3, off4;
if( al.x < al.y )
{
if( al.y < al.z )
{
// z is dominant
off2 = CubeOffset(l.zxy, float2(0, 1), texelsize).yzx;
off3 = CubeOffset(l.zxy, float2(1, 0), texelsize).yzx;
off4 = CubeOffset(l.zxy, float2(1, 1), texelsize).yzx;
}
else
{
// y is dominant
off2 = CubeOffset(l.yxz, float2(0, 1), texelsize).yxz;
off3 = CubeOffset(l.yxz, float2(1, 0), texelsize).yxz;
off4 = CubeOffset(l.yxz, float2(1, 1), texelsize).yxz;
}
}
else
{
if( al.x < al.z )
{
// z is dominant
off2 = CubeOffset(l.zxy, float2(0, 1), texelsize).yzx;
off3 = CubeOffset(l.zxy, float2(1, 0), texelsize).yzx;
off4 = CubeOffset(l.zxy, float2(1, 1), texelsize).yzx;
}
else
{
// x is dominant
off2 = CubeOffset(l, float2(0, 1), texelsize);
off3 = CubeOffset(l, float2(1, 0), texelsize);
off4 = CubeOffset(l, float2(1, 1), texelsize);
}
}
Perhaps a mathematical relation can be found between the comparisons (al.xyy < al.yzz) and the swizzles.
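Following that idea, here is one hedged, untested sketch: since the three cases only permute components, the permutation can be selected with step() and a weighted sum instead of nested branches. The zDom/yDom/xDom names are mine, the tie-breaking differs slightly from the branches above (which should not matter for a filter offset), and it may still exceed the ps_2_0 instruction limits. CubeOffset is the helper defined in the update below.
float3 l  = normalize(ldir);
float3 al = abs(l);
// Select the dominant axis without branching
float zDom = step(al.x, al.z) * step(al.y, al.z);   // 1 when z is the largest component
float yDom = (1.0f - zDom) * step(al.x, al.y);      // 1 when y is the largest and z is not
float xDom = 1.0f - zDom - yDom;                    // otherwise x
// Rotate the dominant component into x before calling CubeOffset
float3 swiz = zDom * l.zxy + yDom * l.yxz + xDom * l.xyz;
float3 r2 = CubeOffset(swiz, float2(0, 1), texelsize);
float3 r3 = CubeOffset(swiz, float2(1, 0), texelsize);
float3 r4 = CubeOffset(swiz, float2(1, 1), texelsize);
// Rotate the results back into xyz order (matching the .yzx / .yxz swizzles above)
float3 off2 = zDom * r2.yzx + yDom * r2.yxz + xDom * r2.xyz;
float3 off3 = zDom * r3.yzx + yDom * r3.yxz + xDom * r3.xyz;
float3 off4 = zDom * r4.yzx + yDom * r4.yxz + xDom * r4.xyz;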
UPDATE: definition of CubeOffset:
float3 CubeOffset(float3 swiz, float2 off, float2 texelsize)
{
float3 ret;
ret.yz = swiz.yz + 2.0f * off * texelsize;
ret.x = sqrt(1.0f - dot(ret.yz, ret.yz));
if( swiz.x < 0 )
ret.x *= -1.0f;
return ret;
}
And the HLSL errors when compiling for SM 2.0:
error X5608: Compiled shader code uses too many arithmetic instruction slots (107).
Max. allowed by the target (ps_2_0) is 64.
error X5609: Compiled shader code uses too many instruction slots (111).
Max. allowed by the target (ps_2_0) is 96.
GLSL handles it fine. The goal is backward compatibility.
(btw. the algorithm is faulty, but that's not an issue right now)
Not really an optimization, but consider testing this.
The obvious, not always desirable, and rarely best solution in such cases is to move the code that doesn't fit (due to the instruction count, for example) to the CPU. In the case of branching you can:
move the branch condition check to the CPU
split the branch bodies into separate shaders
bind the appropriate shader based on the result of the condition check
This is the simplest thing you can do, and there is no need to poke around in assembler. The problem is when your condition is calculated inside the shader.
Hope it helps somehow.
While I don't know whether it can be solved with SM 2.0, considering the advances in GPU power I'll provide an SM 3.0 solution.
Please keep in mind, that this code is a snippet from my own shader language (but similar to HLSL):
template <int samples>
float PCFIrregularCUBE(sampler shadowmap, sampler noisetex, float3 ldir, float2 sloc, float2 texelsize)
{
const float kernelradius = 2.0f;
float3 l = normalize(ldir);
float3 al = abs(l);
float2 noise;
float2 rotated;
float sd, t, s;
float d = length(ldir);
noise = tex2D(noisetex, sloc);
noise = normalize(noise * 2.0f - 1.0f);
float2 rotmat0 = float2(noise.x, noise.y);
float2 rotmat1 = float2(-noise.y, noise.x);
float3 off;
s = 0;
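// irreg_kernel is assumed to be a uniform array of <samples> 2D offsets spread
// over a disk; the per-pixel random rotation built above de-correlates the
// sampling pattern between neighbouring pixels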
for( int i = 0; i < samples; ++i ) {
rotated.x = dot(irreg_kernel[i], rotmat0) * kernelradius;
rotated.y = dot(irreg_kernel[i], rotmat1) * kernelradius;
if( al.x < al.y ) {
if( al.y < al.z )
off = CubeOffsetZXY(l, rotated, texelsize);
else
off = CubeOffsetYXZ(l, rotated, texelsize);
} else {
if( al.x < al.z )
off = CubeOffsetZXY(l, rotated, texelsize);
else
off = CubeOffsetXYZ(l, rotated, texelsize);
}
sd = texCUBE(shadowmap, off).r;
t = ((d > sd) ? 0.0f : 1.0f);
s += ((sd < 0.001f) ? 1.0f : t);
}
return s * (1.0f / samples);
}
With the CubeOffsetXXX helpers looking like this:
float3 CubeOffsetZXY(float3 swiz, float2 off, float2 texelsize)
{
float3 ret;
ret.xy = swiz.xy + 2.0f * off * texelsize * swiz.z;
ret.z = sqrt(1.0f - dot(ret.xy, ret.xy));
if( swiz.z < 0 )
ret.z *= -1.0f;
return ret;
}
For more details you should google for irregular PCF. The result in its worst (as in "the camera is close") is:
Notice the "salt and pepper" noise caused by irregular PCF. From a distance it is perfectly acceptable (Crysis 1 method).

Pixel shader performance on Xbox

I've got a pixel shader (below) that I'm using with XNA. On my laptop (crappy graphics card) it runs a little jerky, but OK. I've just tried running it on the Xbox and it's horrible!
There's nothing to the game (it's just a fractal renderer), so it's got to be the pixel shader causing the issues. I also think it's the PS code because I've lowered the iterations and it's OK. I've also checked, and the GC delta is zero.
Are there any HLSL functions that are no-nos on the Xbox? I must be doing something wrong here; performance can't be that bad!
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
float i;
float oldBailoutTest = 0;
float bailoutTest = 0;
for(i = 0; i < Iterations; i++)
{
z = ComPow(z, ZPower) + c;
bailoutTest = z.x * z.x + z.y * z.y;
if(bailoutTest >= ZPower * ZPower)
{
break;
}
oldBailoutTest = bailoutTest;
}
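// Smooth (fractional iteration) colouring: factor below interpolates where the
// bailout radius falls between the last two |z|^2 values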
float normalisedIterations = i / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (i >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
Below is FractalBase.fxh:
float4x4 MatrixTransform : register(vs, c0);
float2 Pan;
float Zoom;
float Aspect;
void SpriteVertexShader(inout float4 Colour : COLOR0,
inout float2 texCoord : TEXCOORD0,
inout float4 position : SV_Position)
{
position = mul(position, MatrixTransform);
// Convert the position from screen space into complex coordinates
texCoord = (position) * Zoom * float2(1, Aspect) - float2(Pan.x, -Pan.y);
}
EDIT: I did try removing the conditional by using lots of lerps; however, when I did that I got loads of artifacts (and not the kind that "belong in a museum"!). I changed things around and fixed a few logic errors, but the key was to multiply the GreaterThan result by 1 + epsilon, to account for rounding errors turning 0.9999 into 0 when truncated to an integer. See the fixed code below:
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float GreaterThan(float x, float y)
{
return ((x - y) / (2 * abs(x - y)) + 0.5) * 1.001;
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
int i;
float oldBailoutTest = 0;
float bailoutTest = 0;
int KeepGoing = 1;
int DoneIterations = Iterations;
int Bailout = 0;
for(i = 0; i < Iterations; i++)
{
z = lerp(z, ComPow(z, ZPower) + c, KeepGoing);
bailoutTest = lerp(bailoutTest, z.x * z.x + z.y * z.y, KeepGoing);
Bailout = lerp(Bailout, GreaterThan(bailoutTest, ZPower * ZPower), -abs(Bailout) + 1);
KeepGoing = lerp(KeepGoing, 0.0, Bailout);
DoneIterations = lerp(DoneIterations, min(i, DoneIterations), Bailout);
oldBailoutTest = lerp(oldBailoutTest, bailoutTest, KeepGoing);
}
float normalisedIterations = DoneIterations / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (DoneIterations >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
The Xbox has a pretty large block size, so branching on the Xbox isn't always so great. Also, the compiler isn't always the most effective at emitting the dynamic branches your code seems to use.
Look into the branch attribute: http://msdn.microsoft.com/en-us/library/bb313972%28v=xnagamestudio.31%29.aspx
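For reference, the attribute from that link goes directly on the conditional. A hedged sketch based on the bailout loop in the first version of the shader above (all variables as declared there):
for (i = 0; i < Iterations; i++)
{
    z = ComPow(z, ZPower) + c;
    bailoutTest = z.x * z.x + z.y * z.y;
    [branch]                          // explicitly request a dynamic branch
    if (bailoutTest >= ZPower * ZPower)
    {
        break;
    }
    oldBailoutTest = bailoutTest;
}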
Also, if you remove the early bailout, does the PC become more similar to the Xbox?
Keep in mind that modern graphics cards are by now actually quite a bit faster than the Xenon unit.
