DirectX Intel HD and NVIDIA different behavior Geometry Shader - directx

My code uses a geometry shader to produce thick lines using this: https://forum.libcinder.org/topic/smooth-thick-lines-using-geometry-shader
(Uses geometry shader approach)
I get it work on my local machine using an Intel HD Graphics card.
However, if I use the same settings on my destination machine the lines that are drawn with weird gaps.
I don't understand why, because on different Intel HD it works.
Note that my target is a NVS 300 that is fairly old, but supports FL10_1 and Geometry shader I guess. The Intel devices I try it with might be a bit newer.
Since I force the feature level on my development on device creation to be 10_1 I expect no difference.
I don't see any error codes that could hint a arbitrary behavior in the output that would explain it, even if I set up native code debugging or remote debugging.
Does anyone have a clue why this behaves differently?
I could add images, but you basically see a thick sine curve on my local machine and a thick ,but fractioned with gaps, on the target.
Thanks in advance for any clues.
cbuffer constBuffer
{
float THICKNESS;
float2 WIN_SCALE;
};
struct PSInput
{
float4 Position : SV_POSITION;
};
float2 toScreenSpace(float4 vertex)
{
//float2 WIN_SCALE = { 100.0f, 100.0f };
return float2(vertex.xy) * WIN_SCALE;
}
[maxvertexcount(7)]
void main(lineadj float4 vertices[4] : SV_POSITION, inout TriangleStream<PSInput> triStream)
{
//float2 WIN_SCALE = { 100.0f, 100.0f };
float2 p0 = toScreenSpace(vertices[0]); // start of previous segment
float2 p1 = toScreenSpace(vertices[1]); // end of previous segment, start of current segment
float2 p2 = toScreenSpace(vertices[2]); // end of current segment, start of next segment
float2 p3 = toScreenSpace(vertices[3]); // end of next segment
// perform naive culling
float2 area = WIN_SCALE * 1.2;
if (p1.x < -area.x || p1.x > area.x)
return;
if (p1.y < -area.y || p1.y > area.y)
return;
if (p2.x < -area.x || p2.x > area.x)
return;
if (p2.y < -area.y || p2.y > area.y)
return;
float2 v0 = normalize(p1 - p0);
float2 v1 = normalize(p2 - p1);
float2 v2 = normalize(p3 - p2);
// determine the normal of each of the 3 segments (previous, current, next)
float2 n0 = { -v0.y, v0.x};
float2 n1 = { -v1.y, v1.x};
float2 n2 = { -v2.y, v2.x};
// determine miter lines by averaging the normals of the 2 segments
float2 miter_a = normalize(n0 + n1); // miter at start of current segment
float2 miter_b = normalize(n1 + n2); // miter at end of current segment
// determine the length of the miter by projecting it onto normal and then inverse it
//float THICKNESS = 10;
float length_a = THICKNESS / dot(miter_a, n1);
float length_b = THICKNESS / dot(miter_b, n1);
float MITER_LIMIT = -1;
//float MITER_LIMIT = -1;
//float MITER_LIMIT = 1;
PSInput v;
float2 temp;
//// prevent excessively long miters at sharp corners
if (dot(v0, v1) < -MITER_LIMIT)
{
miter_a = n1;
length_a = THICKNESS;
// close the gap
if (dot(v0, n1) > 0)
{
temp = (p1 + THICKNESS * n0) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
temp = (p1 + THICKNESS * n1) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
v.Position = float4(p1 / WIN_SCALE, 0, 1.0);
triStream.Append(v);
triStream.RestartStrip();
}
else
{
temp = (p1 - THICKNESS * n1) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
temp = (p1 - THICKNESS * n0) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
v.Position = float4(p1 / WIN_SCALE, 0, 1.0);
triStream.Append(v);
triStream.RestartStrip();
}
}
if (dot(v1, v2) < -MITER_LIMIT)
{
miter_b = n1;
length_b = THICKNESS;
}
// generate the triangle strip
temp = (p1 + length_a * miter_a) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
temp = (p1 - length_a * miter_a) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
temp = (p2 + length_b * miter_b) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
temp = (p2 - length_b * miter_b) / WIN_SCALE;
v.Position = float4(temp, 0, 1.0);
triStream.Append(v);
triStream.RestartStrip();
}

Related

Fragment shader and Vertex Shader in Metal

Fragment Shader
vertex VertexOutBezier bezier_vertex(constant BezierParameters *allParams[[buffer(0)]],
// constant GlobalParameters& globalParams[[buffer(1)]],
uint vertexId [[vertex_id]],
uint instanceId [[instance_id]])
{
float t = (float) vertexId / 300;
rint(t);
BezierParameters params = allParams[instanceId];
float lineWidth = (1 - (((float) (vertexId % 2)) * 2.0)) * params.lineThickness;
float2 a = params.a;
float2 b = params.b;
float nt = 1.0f - t;
float nt_2 = nt * nt;
float nt_3 = nt_2 * nt;
float t_2 = t * t;
float t_3 = t_2 * t;
float2 point = a * nt_3 + params.p1 * nt_2 * t + params.p2 * nt * t_2 + b * t_3;
float2 tangent = -3.0 * a * nt_2 + params.p1 * (1.0 - 4.0 * t + 3.0 * t_2) + params.p2 * (2.0 * t - 3.0 * t_2) + 3 * b * t_2;
tangent = normalize(float2(-tangent.y, tangent.x));
VertexOutBezier vo;
vo.pos.xy = point + (tangent * (lineWidth / 3.0f));
vo.pos.zw = float2(0, 1);
vo.color = params.color ;
return vo;
}
My Fragment shader is
fragment float4 bezier_fragment(VertexOutBezier params[[stage_in]],
texture2d<float> texture [[texture(0)]]
)
{
constexpr sampler defaultSampler;
float4 canvasColor = texture.sample(defaultSampler, params.pos.xy);
return canvasColor;
}
Here i expect to get the pixel color of the texture. But here it is only getting single color. It is not getting the color of the texture according to its position.
Even when I do this in fragment I am getting the single color it is not varying with coordinates
fragment float4 bezier_fragment(VertexOutBezier params[[stage_in]],
texture2d<float> texture [[texture(0)]]
)
{
constexpr sampler defaultSampler;
float4 canvasColor = params.color * params.pos.x;
return canvasColor;
}
If I do this in Vertex Shader I got color varying according position of x
vo.pos.xy = point + (tangent * (lineWidth / 3.0f));
vo.pos.zw = float2(0, 1);
vo.color = params.color * vo.pos.x;
What is the Issue in fragment Shader. I cannot get the coordinates from Vertex Shader
Please make sure the VertexOutBezier.pos.xy value is normalization ( 0 ~ 1.0) ,due to the defaultSampler only receive normalization position value, if always return a single may be the position is beyond the bounds.

HLSL Geometry Shader Thick Lines DirectX

I try to draw a thick lined Sinus curve using this approach : https://forum.libcinder.org/topic/smooth-thick-lines-using-geometry-shader
I tried to port it to HLSL geometry shader:
Setting the dimension to fix 500/500 atm
THICKNESS seems not to change the issue
struct PSInput
{
float4 Position : SV_POSITION;
};
float2 toScreenSpace(float4 vertex)
{
float2 WIN_SCALE = { 500.0f, 500.0f };
return float2(vertex.xy / vertex.w) * WIN_SCALE;
}
[maxvertexcount(7)]
void main(lineadj float4 vertices[4] : SV_POSITION, inout TriangleStream<PSInput> triStream)
{
float2 WIN_SCALE = { 500.0f, 500.0f };
float2 p0 = toScreenSpace(vertices[0]); // start of previous segment
float2 p1 = toScreenSpace(vertices[1]); // end of previous segment, start of current segment
float2 p2 = toScreenSpace(vertices[2]); // end of current segment, start of next segment
float2 p3 = toScreenSpace(vertices[3]); // end of next segment
// perform naive culling
float2 area = WIN_SCALE * 1.2;
if (p1.x < -area.x || p1.x > area.x)
return;
if (p1.y < -area.y || p1.y > area.y)
return;
if (p2.x < -area.x || p2.x > area.x)
return;
if (p2.y < -area.y || p2.y > area.y)
return;
float2 v0 = normalize(p1 - p0);
float2 v1 = normalize(p2 - p1);
float2 v2 = normalize(p3 - p2);
// determine the normal of each of the 3 segments (previous, current, next)
float2 n0 = { -v0.y, v0.x};
float2 n1 = { -v1.y, v1.x};
float2 n2 = { -v2.y, v2.x};
// determine miter lines by averaging the normals of the 2 segments
float2 miter_a = normalize(n0 + n1); // miter at start of current segment
float2 miter_b = normalize(n1 + n2); // miter at end of current segment
// determine the length of the miter by projecting it onto normal and then inverse it
float THICKNESS = 1;
float length_a = THICKNESS / dot(miter_a, n1);
float length_b = THICKNESS / dot(miter_b, n1);
float MITER_LIMIT = 0.75;
//float MITER_LIMIT = -1;
//float MITER_LIMIT = 0.1;
PSInput v;
float2 temp;
//// prevent excessively long miters at sharp corners
if (dot(v0, v1) < -MITER_LIMIT)
{
miter_a = n1;
length_a = THICKNESS;
// close the gap
if (dot(v0, n1) > 0)
{
temp = (p1 + THICKNESS * n0) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
temp = (p1 + THICKNESS * n1) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
v.Position = float4(p1 / WIN_SCALE, 1.0, 1.0);
triStream.Append(v);
//triStream.RestartStrip();
}
else
{
temp = (p1 - THICKNESS * n1) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
temp = (p1 - THICKNESS * n0) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
v.Position = float4(p1 / WIN_SCALE, 1.0, 1.0);
triStream.Append(v);
//triStream.RestartStrip();
}
}
if (dot(v1, v2) < -MITER_LIMIT)
{
miter_b = n1;
length_b = THICKNESS;
}
// generate the triangle strip
temp = (p1 + length_a * miter_a) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
temp = (p1 - length_a * miter_a) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
temp = (p2 + length_b * miter_b) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
temp = (p2 - length_b * miter_b) / WIN_SCALE;
v.Position = float4(temp, 1.0, 1.0);
triStream.Append(v);
//triStream.RestartStrip();
}
The thing is, that it won't output anything of the curve I give as points to it. It already works very well without thick lines, but I want to have that. I can add Noise to my Sinus signal and if I do that, (y-values increase by a little random %) suddenly the curve shows up and is thicker and seems to work, but I want it to be thick and showing without noise too.
If I up the frequency, without noise, point appear scattered-plot style, but not as a connected Sinus.
If I debug with the VS Graphics Debugger, I can see that in the frame with no Noise it says the Pixel-Shader-Stage is not run (?).
The frame with noise shows it run and working.
Maybe my porting is incorrect and I forget something, I am very new to programming shaders. Maybe I missunderstand the approach at all.
Any help appreciated.
I needed to set the default behavior of the culling to CullMode.None to get all the triangle drawn.

how do I port this Shadertoy shader with an fwidth() call to a Metal shader?

I've been porting Shadertoy shaders to Metal in order to learn how to write Metal shaders. I don't think I'm doing it correctly as I have been writing every one of my shaders as a compute shader, rather than vertex/fragment shaders. This has worked for quite a few shaders I've ported, almost 20. However some ports are extremely slow, and others include functions that aren't available.
Here is one of the shaders that is tripping me up:
https://www.shadertoy.com/view/4t2SRh
The fwidth() call in render() and mainImage() is not allowed within a metal compute shader. Metal Shader Language does however have fwidth(), but it can only be called within a fragment shader.
Here is my attempt at porting to a compute shader:
#include <metal_stdlib>
using namespace metal;
float float_mod(float f1, float f2) {
return f1-f2 * floor(f1/f2);
}
float sdfCircle(float2 center, float radius, float2 coord )
{
float2 offset = coord - center;
return sqrt((offset.x * offset.x) + (offset.y * offset.y)) - radius;
}
float sdfEllipse(float2 center, float a, float b, float2 coord)
{
float a2 = a * a;
float b2 = b * b;
return (b2 * (coord.x - center.x) * (coord.x - center.x) +
a2 * (coord.y - center.y) * (coord.y - center.y) - a2 * b2)/(a2 * b2);
}
float sdfLine(float2 p0, float2 p1, float width, float2 coord)
{
float2 dir0 = p1 - p0;
float2 dir1 = coord - p0;
float h = clamp(dot(dir0, dir1)/dot(dir0, dir0), 0.0, 1.0);
return (length(dir1 - dir0 * h) - width * 0.5);
}
float sdfUnion( const float a, const float b )
{
return min(a, b);
}
float sdfDifference( const float a, const float b)
{
return max(a, -b);
}
float sdfIntersection( const float a, const float b )
{
return max(a, b);
}
float anti(float d) {
return fwidth(d) * 1.0;
}
float4 render(float d, float3 color, float stroke)
{
//stroke = fwidth(d) * 2.0;
float anti = fwidth(d) * 1.0;
float4 strokeLayer = float4(float3(0.05), 1.0-smoothstep(-anti, anti, d - stroke));
float4 colorLayer = float4(color, 1.0-smoothstep(-anti, anti, d));
if (stroke < 0.000001) {
return colorLayer;
}
return float4(mix(strokeLayer.rgb, colorLayer.rgb, colorLayer.a), strokeLayer.a);
}
kernel void compute(texture2d<float, access::write> output [[texture(0)]],
texture2d<float, access::sample> input [[texture(1)]],
constant float &timer [[buffer(0)]],
uint2 gid [[thread_position_in_grid]])
{
float4 fragColor;
int width = output.get_width();
int height = output.get_height();
float2 resolution = float2(width,height);
float2 uv = float2(gid) / resolution;
float size = min(resolution.x, resolution.y);
float pixSize = 1.0 / size;
float stroke = pixSize * 1.5;
float2 center = float2(0.5, 0.5 * resolution.y/resolution.x);
float a = sdfEllipse(float2(0.5, center.y*2.0-0.34), 0.25, 0.25, uv);
float b = sdfEllipse(float2(0.5, center.y*2.0+0.03), 0.8, 0.35, uv);
b = sdfIntersection(a, b);
float4 layer1 = render(b, float3(0.32, 0.56, 0.53), fwidth(b) * 2.0);
// Draw strips
float4 layer2 = layer1;
float t, r0, r1, r2, e, f;
float2 sinuv = float2(uv.x, (sin(uv.x*40.0)*0.02 + 1.0)*uv.y);
for (float i = 0.0; i < 10.0; i++) {
t = float_mod(timer + 0.3 * i, 3.0) * 0.2;
r0 = (t - 0.15) / 0.2 * 0.9 + 0.1;
r1 = (t - 0.15) / 0.2 * 0.1 + 0.9;
r2 = (t - 0.15) / 0.2 * 0.15 + 0.85;
e = sdfEllipse(float2(0.5, center.y*2.0+0.37-t*r2), 0.7*r0, 0.35*r1, sinuv);
f = sdfEllipse(float2(0.5, center.y*2.0+0.41-t), 0.7*r0, 0.35*r1, sinuv);
f = sdfDifference(e, f);
f = sdfIntersection(f, b);
float4 layer = render(f, float3(1.0, 0.81, 0.27), 0.0);
layer2 = mix(layer2, layer, layer.a);
}
// Draw the handle
float bottom = 0.08;
float handleWidth = 0.01;
float handleRadius = 0.04;
float d = sdfCircle(float2(0.5-handleRadius+0.5*handleWidth, bottom), handleRadius, uv);
float c = sdfCircle(float2(0.5-handleRadius+0.5*handleWidth, bottom), handleRadius-handleWidth, uv);
d = sdfDifference(d, c);
c = uv.y - bottom;
d = sdfIntersection(d, c);
c = sdfLine(float2(0.5, center.y*2.0-0.05), float2(0.5, bottom), handleWidth, uv);
d = sdfUnion(d, c);
c = sdfCircle(float2(0.5, center.y*2.0-0.05), 0.01, uv);
d = sdfUnion(c, d);
c = sdfCircle(float2(0.5-handleRadius*2.0+handleWidth, bottom), handleWidth*0.5, uv);
d = sdfUnion(c, d);
float4 layer0 = render(d, float3(0.404, 0.298, 0.278), stroke);
float2 p = (2.0*float2(gid).xy-resolution.xy)/min(resolution.y,resolution.x);
float3 bcol = float3(1.0,0.8,0.7-0.07*p.y)*(1.0-0.25*length(p));
fragColor = float4(bcol, 1.0);
fragColor.rgb = mix(fragColor.rgb, layer0.rgb, layer0.a);
fragColor.rgb = mix(fragColor.rgb, layer1.rgb, layer1.a);
fragColor.rgb = mix(fragColor.rgb, layer2.rgb, layer2.a);
fragColor.rgb = pow(fragColor.rgb, float3(1.0/2.2));
output.write(fragColor,gid);
}
This doesn't compile, as fwidth() is not available. However, if I do get rid of fwidth(), it will compile... but of course not draw the right thing.
I was wondering if there is a better way to port this to a fragment/vertex shader, so that I can use MSL's fwidth() ? Or is writing it as a compute shader fine, and I should find a different way around using fwidth() ?

DX 11 Compute Shader\SharpDX Deferrerd Tiled lighting, Point light problems

I have just finished porting my engine from XNA to SharpDX(DX11).
Everything is going really well and I have conquered most of my issues without having to ask for help until now and I'm really stuck, maybe I just need another set of eye to look over my code idk but here it is.
I'm implementing tile based lighting (point lights only for now), I'm basing my code off the Intel sample because it's not as messy as the ATI one.
So my problem is that the lights move with the camera, I have looked all over the place to find a fix and I have tried everything (am I crazy?).
I just made sure all my normal and light vectors are in view space and normalized (still the same).
I have tried with the inverse View, inverse Projection, a mix of the both and a few other bits from over the net but I can't fix it.
So here is my CPU code:
Dim viewSpaceLPos As Vector3 = Vector3.Transform(New Vector3(pointlight.PosRad.X, pointlight.PosRad.Y, pointlight.PosRad.Z), Engine.Camera.EyeTransform)
Dim lightMatrix As Matrix = Matrix.Scaling(pointlight.PosRad.W) * Matrix.Translation(New Vector3(pointlight.PosRad.X, pointlight.PosRad.Y, pointlight.PosRad.Z))
Here is my CS shader code:
[numthreads(GROUP_WIDTH, GROUP_HEIGHT, GROUP_DEPTH)]
void TileLightingCS(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID)
{
int2 globalCoords = dispatchThreadID.xy;
uint groupIndex = GroupThreadID.y * GROUP_WIDTH + GroupThreadID.x;
float minZSample = FrameBufferCamNearFar.x;
float maxZSample = FrameBufferCamNearFar.y;
float2 gbufferDim;
DepthBuffer.GetDimensions(gbufferDim.x, gbufferDim.y);
float2 screenPixelOffset = float2(2.0f, -2.0f) / gbufferDim;
float2 positionScreen = (float2(globalCoords)+0.5f) * screenPixelOffset.xy + float2(-1.0f, 1.0f);
float depthValue = DepthBuffer[globalCoords].r;
float3 positionView = ComputePositionViewFromZ(positionScreen, Projection._43 / (depthValue - Projection._33));
// Avoid shading skybox/background or otherwise invalid pixels
float viewSpaceZ = positionView.z;
bool validPixel = viewSpaceZ >= FrameBufferCamNearFar.x && viewSpaceZ < FrameBufferCamNearFar.y;
[flatten] if (validPixel)
{
minZSample = min(minZSample, viewSpaceZ);
maxZSample = max(maxZSample, viewSpaceZ);
}
// How many total lights?
uint totalLights, dummy;
InputBuffer.GetDimensions(totalLights, dummy);
// Initialize shared memory light list and Z bounds
if (groupIndex == 0)
{
sTileNumLights = 0;
sMinZ = 0x7F7FFFFF; // Max float
sMaxZ = 0;
}
GroupMemoryBarrierWithGroupSync();
if (maxZSample >= minZSample) {
InterlockedMin(sMinZ, asuint(minZSample));
InterlockedMax(sMaxZ, asuint(maxZSample));
}
GroupMemoryBarrierWithGroupSync();
float minTileZ = asfloat(sMinZ);
float maxTileZ = asfloat(sMaxZ);
// Work out scale/bias from [0, 1]
float2 tileScale = float2(FrameBufferCamNearFar.zw) * rcp(float(2 * GROUP_WIDTH));
float2 tileBias = tileScale - float2(GroupID.xy);
// Now work out composite projection matrix
// Relevant matrix columns for this tile frusta
float4 c1 = float4(Projection._11 * tileScale.x, 0.0f, tileBias.x, 0.0f);
float4 c2 = float4(0.0f, -Projection._22 * tileScale.y, tileBias.y, 0.0f);
float4 c4 = float4(0.0f, 0.0f, 1.0f, 0.0f);
// Derive frustum planes
float4 frustumPlanes[6];
// Sides
frustumPlanes[0] = c4 - c1;
frustumPlanes[1] = c4 + c1;
frustumPlanes[2] = c4 - c2;
frustumPlanes[3] = c4 + c2;
// Near/far
frustumPlanes[4] = float4(0.0f, 0.0f, 1.0f, -minTileZ);
frustumPlanes[5] = float4(0.0f, 0.0f, -1.0f, maxTileZ);
// Normalize frustum planes (near/far already normalized)
[unroll] for (uint i = 0; i < 4; ++i)
{
frustumPlanes[i] *= rcp(length(frustumPlanes[i].xyz));
}
// Cull lights for this tile
for (uint lightIndex = groupIndex; lightIndex < totalLights; lightIndex += (GROUP_WIDTH * GROUP_HEIGHT))
{
PointLight light = InputBuffer[lightIndex];
float3 lightVS = light.PosRad.xyz;// mul(float4(light.Pos.xyz, 1), View);
// Cull: point light sphere vs tile frustum
bool inFrustum = true;
[unroll]
for (uint i = 0; i < 6; ++i)
{
float d = dot(frustumPlanes[i], float4(lightVS, 1.0f));
inFrustum = inFrustum && (d >= -light.PosRad.w);
}
[branch]
if (inFrustum)
{
uint listIndex;
InterlockedAdd(sTileNumLights, 1, listIndex);
sTileLightIndices[listIndex] = lightIndex;
}
}
GroupMemoryBarrierWithGroupSync();
uint numLights = sTileNumLights;
if (all(globalCoords < FrameBufferCamNearFar.zw))
{
float4 NormalMap = NormalBuffer[globalCoords];
float3 normal = DecodeNormal(NormalMap);
if (numLights > 0)
{
float3 lit = float3(0.0f, 0.0f, 0.0f);
for (uint tileLightIndex = 0; tileLightIndex < numLights; ++tileLightIndex)
{
PointLight light = InputBuffer[sTileLightIndices[tileLightIndex]];
float3 lDir = light.PosRad.xyz - positionView;
lDir = normalize(lDir);
float3 nl = saturate(dot(lDir, normal));
lit += ((light.Color.xyz * light.Color.a) * nl) * 0.1f;
}
PointLightColor[globalCoords] = float4(lit, 1);
}
else
{
PointLightColor[globalCoords] = 0;
}
}
GroupMemoryBarrierWithGroupSync();
}
So I know the culling works because there are lights drawn, they just move with the camera.
Could it be a handedness issue?
Am I setting my CPU light code up right?
Have I messed my spaces up?
What am I missing?
Am I reconstructing my position from depth wrong? (don't think it's this because the culling works)
ps. I write depth out like this:
VS shader
float4 viewSpacePos = mul(float4(input.Position,1), WV);
output.Depth=viewSpacePos.z ;
PS Shader
-input.Depth.x / FarClip

Pixel Shader performance on xbox

I've got a pixelshader (below) that i'm using with XNA. On my laptop (crappy graphics card) it runs a little jerky, but ok. I've just tried running it on the xbox and it's horrible!
There's nothing to the game (it's just a fractal renderer) so it's got to be the pixel shader causing the issues. I also think it's the PS code because i've lowered the iterations and it's ok. I've also checked, and the GC delta is zero.
Are there any HLSL functions that are no-no's on the xbox?? I must be doing something wrong here, performance can't be that bad!
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
float i;
float oldBailoutTest = 0;
float bailoutTest = 0;
for(i = 0; i < Iterations; i++)
{
z = ComPow(z, ZPower) + c;
bailoutTest = z.x * z.x + z.y * z.y;
if(bailoutTest >= ZPower * ZPower)
{
break;
}
oldBailoutTest = bailoutTest;
}
float normalisedIterations = i / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (i >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
Below is FractalBase.fxh:
float4x4 MatrixTransform : register(vs, c0);
float2 Pan;
float Zoom;
float Aspect;
void SpriteVertexShader(inout float4 Colour : COLOR0,
inout float2 texCoord : TEXCOORD0,
inout float4 position : SV_Position)
{
position = mul(position, MatrixTransform);
// Convert the position into from screen space into complex coordinates
texCoord = (position) * Zoom * float2(1, Aspect) - float2(Pan.x, -Pan.y);
}
EDIT I did try removing the conditional by using lots of lerps, however when i did that i got loads of artifacts (and not the kind that "belong in a museum"!). I changed things around, and fixed a few logic errors, however the key was to multiply the GreaterThan result by 1 + epsilon, to account for rounding errors just making 0.9999 = 0 (integer). See the fixed code below:
#include "FractalBase.fxh"
float ZPower;
float3 Colour;
float3 ColourScale;
float ComAbs(float2 Arg)
{
return sqrt(Arg.x * Arg.x + Arg.y * Arg.y);
}
float2 ComPow(float2 Arg, float Power)
{
float Mod = pow(Arg.x * Arg.x + Arg.y * Arg.y, Power / 2);
float Ang = atan2(Arg.y, Arg.x) * Power;
return float2(Mod * cos(Ang), Mod * sin(Ang));
}
float GreaterThan(float x, float y)
{
return ((x - y) / (2 * abs(x - y)) + 0.5) * 1.001;
}
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
int i;
float oldBailoutTest = 0;
float bailoutTest = 0;
int KeepGoing = 1;
int DoneIterations = Iterations;
int Bailout = 0;
for(i = 0; i < Iterations; i++)
{
z = lerp(z, ComPow(z, ZPower) + c, KeepGoing);
bailoutTest = lerp(bailoutTest, z.x * z.x + z.y * z.y, KeepGoing);
Bailout = lerp(Bailout, GreaterThan(bailoutTest, ZPower * ZPower), -abs(Bailout) + 1);
KeepGoing = lerp(KeepGoing, 0.0, Bailout);
DoneIterations = lerp(DoneIterations, min(i, DoneIterations), Bailout);
oldBailoutTest = lerp(oldBailoutTest, bailoutTest, KeepGoing);
}
float normalisedIterations = DoneIterations / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
Result = (DoneIterations >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
The xbox has a pretty large block size, so branching on the xbox isn't always so great. Also the compiler isn't always the most effective at emitting dynamic branches which your code seems to use.
Look into the branch attribute: http://msdn.microsoft.com/en-us/library/bb313972%28v=xnagamestudio.31%29.aspx
Also, if you move the early bailout, does the PC become more more similar to the Xbox?
Keep in mind that modern graphic cards are actually quite a bit faster then the Xenon unit by now.

Resources