Optimization in Shaders - ios

I got the following message while profiling in Xcode. It suggests that I am using 32-bit floating point. I have no problem using 16-bit floating-point precision instead, but if I do that, it shows an error.
How can I improve the performance? The shader functions are shown below. Can we use half4 for texture sampling?
// Per-vertex output of bezier_calculated_vertex_texture; it is the [[stage_in]]
// argument of bezier_fragment_calculated_texture.
struct VertexOutCalculatedTextureBezier {
// Clip-space position of the quad corner.
float4 pos[[position]];
// NOTE(review): not read by the fragment function in this file — confirm whether it is still needed.
float4 color;
// Texture coordinate of the quad corner (the vertex function emits 0 or 1 per component).
float2 textureCoordinates;
// Selects which of the bound textures the fragment function samples.
int8_t index;
};
// Per-instance parameters read from buffer(0) and indexed by instance_id.
// NOTE(review): the host-side struct that fills this buffer must use the same
// layout — confirm against the CPU code, which is not visible here.
struct BezierCalParameters
{
// Centre of the quad; every corner is this point plus a rotated offset.
float2 a;
// Rotation angle; it is fed to sinpi/cospi, so it is expressed in units of pi radians.
float angle;
// Scale applied to the sin/cos corner offsets.
float size;
// Texture selector forwarded unchanged to the fragment stage.
int8_t index;
};
Vertex shader function which is suggested while profiling
// Emits one corner of a rotated quad centred on the instance's point `a`.
//
// allParams  - per-instance parameters, indexed by instanceId.
// vertexId   - vertex index; only (vertexId % 4) matters and selects the corner.
// instanceId - index of the quad being drawn.
//
// Corner layout (s = size * sinpi(angle), c = size * cospi(angle)):
//   0: a + ( s, -c), uv (0,0)      1: a + ( c,  s), uv (0,1)
//   2: a + (-c, -s), uv (1,0)      3: a + (-s,  c), uv (1,1)
vertex VertexOutCalculatedTextureBezier bezier_calculated_vertex_texture(constant BezierCalParameters *allParams[[buffer(0)]],
                                                                         uint vertexId [[vertex_id]],
                                                                         uint instanceId [[instance_id]])
{
    // Reference the constant buffer entry instead of copying the whole struct.
    constant BezierCalParameters &params = allParams[instanceId];

    // Evaluate the rotation once; previously sinpi/cospi were repeated in every branch.
    const float s = params.size * sinpi(params.angle);
    const float c = params.size * cospi(params.angle);

    float2 offset;
    float2 uv;
    switch (vertexId & 3u) {  // identical to vertexId % 4 for an unsigned value
        case 0u:
            offset = float2(s, -c);
            uv = float2(0.0f, 0.0f);
            break;
        case 1u:
            offset = float2(c, s);
            uv = float2(0.0f, 1.0f);
            break;
        case 2u:
            offset = float2(-c, -s);
            uv = float2(1.0f, 0.0f);
            break;
        default:
            offset = float2(-s, c);
            uv = float2(1.0f, 1.0f);
            break;
    }

    VertexOutCalculatedTextureBezier vo;
    vo.pos = float4(params.a + offset, 0.0f, 1.0f);
    // The struct field was previously returned uninitialised; give it a defined value.
    vo.color = float4(0.0f);
    vo.textureCoordinates = uv;
    vo.index = params.index;
    return vo;
}
The fragment function corresponding to this function
// Samples the texture selected by `params.index` and tints it with a fixed
// red colour at 68% alpha.
//
// index 1 -> texture2, index 2 -> texture3, anything else -> texture
// (the original's index 0 branch and fallback branch were identical).
//
// The textures are declared as texture2d<half> so sampling and the tint
// multiply run at 16-bit precision, matching the half4 return type. The
// texture bindings (indices 0-2) are unchanged for the host code.
fragment half4 bezier_fragment_calculated_texture(VertexOutCalculatedTextureBezier params[[stage_in]],
                                                  texture2d<half, access::sample> texture [[texture(0)]],
                                                  texture2d<half, access::sample> texture2 [[texture(1)]],
                                                  texture2d<half, access::sample> texture3 [[texture(2)]])
{
    constexpr sampler defaultSampler;
    const half4 tint = half4(1.0h, 0.0h, 0.0h, 0.68h);

    half4 texel;
    if (params.index == 1) {
        texel = texture2.sample(defaultSampler, params.textureCoordinates);
    } else if (params.index == 2) {
        texel = texture3.sample(defaultSampler, params.textureCoordinates);
    } else {
        texel = texture.sample(defaultSampler, params.textureCoordinates);
    }
    return texel * tint;
}

Related

Kernel Function

I am drawing squares around the points i passed to the Shader
/// Encodes the compute pass that marks every entry of `pointsArray` on `texture`.
///
/// The pipeline state and texture are always bound; the dispatch is skipped when
/// there are no points or no texture.
///
/// - Parameter computeCommandEncoder: Encoder the pass is recorded into.
override func computedraw(computeCommandEncoder: MTLComputeCommandEncoder) {
    computeCommandEncoder.setComputePipelineState(pipelineState)
    computeCommandEncoder.setTexture(self.texture, index: 0)

    guard !pointsArray.isEmpty, let texture = self.texture else { return }

    // The kernel declares `constant int &pointCount`, i.e. a 32-bit integer.
    var pointCount = Int32(pointsArray.count)

    // `length` is the byte size of the WHOLE array. Passing a single stride uploaded
    // only the first point, so the kernel read garbage for the remaining ones.
    // setBytes is intended for small payloads; use an MTLBuffer for large arrays.
    computeCommandEncoder.setBytes(&pointsArray,
                                   length: MemoryLayout<float2>.stride * pointsArray.count,
                                   index: 0)
    computeCommandEncoder.setBytes(&pointCount,
                                   length: MemoryLayout<Int32>.stride,
                                   index: 1)

    let threadGroupCount = MTLSizeMake(2, 2, 1)
    let threadGroups = MTLSizeMake(texture.width / threadGroupCount.width,
                                   texture.height / threadGroupCount.height,
                                   1)
    computeCommandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupCount)
}
The PointsArray
/// Points uploaded to the compute kernel by `computedraw`.
var pointsArray: [float2] = [
    float2(0.40, -0.5),
    float2(0.20, -0.5),
    float2(0.0, 0.0),
    float2(0.56, -0.4),
]
In Kernel Function
// Converts a point from the [-1, 1] range to pixel coordinates on a
// 2732 x 2048 surface. x = -1 maps to 0 and x = 1 to 2732; the y axis is
// flipped, so y = 1 maps to 0 and y = -1 to 2048.
//
// NOTE(review): the surface size is hard-coded — confirm it matches the
// texture the kernel writes to.
float2 touchPointF(float2 tap) {
    const float2 surfaceSize = float2(2732.0f, 2048.0f);
    // Build the vector directly. The original initialised its result with `(0,0)`,
    // which is the comma operator (evaluates to a single 0), not a float2 literal.
    const float2 shifted = float2(tap.x + 1.0f, -1.0f * (tap.y - 1.0f));
    return (shifted * surfaceSize) / 2.0f;
}
// Paints a square around every input point: a pixel is written when it lies
// strictly less than 40 px from a point on both axes.
//
// point      - input points, converted to pixel space by touchPointF.
// pointCount - number of entries in `point`.
// des        - destination texture.
// gid        - pixel handled by this thread.
kernel void computeTool(
    constant float2 *point [[buffer(0)]],
    constant int &pointCount [[buffer(1)]],
    texture2d<float,access::read_write> des [[texture(0)]],
    // texture2d<float,access::read> star [[texture(1)]],
    uint2 gid [[thread_position_in_grid]])
{
    // Threads outside the texture have nothing to write.
    if (gid.x >= des.get_width() || gid.y >= des.get_height()) {
        return;
    }

    const int halfExtent = 40;
    const int2 pixel = int2(gid);

    for (int i = 0; i < pointCount; ++i) {
        // Compare in signed integers: `uint(x) - 40` wrapped around to a huge value
        // for points closer than 40 px to the left/top edge, hiding their squares.
        const int2 centre = int2(touchPointF(point[i]));
        const int2 delta = abs(pixel - centre);
        if (delta.x < halfExtent && delta.y < halfExtent) {
            des.write(float4(float(pointCount)/10,0.0,0.0,1.0), gid);
            // Every point writes the same colour, so one hit is enough.
            return;
        }
    }
}
I pass 4 points to the shader, but it only draws two of them on the screen. Is this a problem with the kernel function itself, or with how the kernel threads are dispatched?
computeCommandEncoder.setBytes(&pointsArray, length:MemoryLayout<float2>.stride, index: 0)
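Here the length needs to be multiplied by the number of elements in pointsArray, i.e. MemoryLayout<float2>.stride * pointsArray.count; otherwise only the first point is uploaded.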

Unity3D shader bug on ios device

I use a water shader, but it misbehaves when built for an iOS device (it still works fine on Android, in the editor, and on Mac). Normal:
Bug on iOS:
Bug: the plane that uses the shader is always drawn on top of everything else on screen, and the plane is rendered upside down.
I have already debugged on the device, but no errors are reported.
Here is shader code
// Mobile water shader: Gerstner-wave vertex displacement, two scrolling normal
// maps used to distort a grabbed background texture, plus a planar or cube-map
// reflection (selected with the cubeMap_on / cubeMap_off keywords).
//
// Precision: `fixed` is only guaranteed to cover roughly [-2, 2] with low
// precision on mobile GPUs. Positions, world-space maths, texture coordinates
// and uniforms whose values exceed that range (e.g. _Distortion = 500,
// _GSpeed = -3) therefore use `float`; `fixed` is kept for colours only.
Shader "RealisticWater/WaterMobile" {
    Properties {
        _Color ("Main Color", Color) = (1,1,1,1)
        _ReflectionColor ("Reflection Color", Color) = (1,1,1,1)
        _Wave1 ("Wave1 Distortion Texture", 2D) = "bump" {}
        _Wave2 ("Wave2 Distortion Texture", 2D) = "bump" {}
        _Cube("Reflection Map", Cube) = "" {}
        _Direction ("Waves Direction 1 & 2", Vector) = (1.0 ,1.0, -1.0, -1.0)
        _FPOW("FPOW Fresnel", float) = 5.0
        _R0("R0 Fresnel", float) = 0.05
        _OffsetFresnel("Offset Fresnel", float) = 0.1
        _Distortion ("Distortion", float) = 500
        _DistortionVert ("Per Vertex Distortion", Float) = 1
        _GAmplitude ("Wave Amplitude", Vector) = (0.1 ,0.3, 0.2, 0.15)
        _GFrequency ("Wave Frequency", Vector) = (0.6, 0.5, 0.5, 1.8)
        _GSteepness ("Wave Steepness", Vector) = (1.0, 2.0, 1.5, 1.0)
        _GSpeed ("Wave Speed", Vector) = (-0.23, -1.25, -3.0, 1.5)
        _GDirectionAB ("Wave Direction", Vector) = (0.3 ,0.5, 0.85, 0.25)
        _GDirectionCD ("Wave Direction", Vector) = (0.1 ,0.9, 0.5, 0.5)
        _WaveScale("Waves Scale", float) = 1
        _TexturesScale("Textures Scale", Float) = 1
    }
    Category {
        Tags { "Queue"="Transparent+1" "RenderType"="Transparent" }
        ZWrite Off
        Cull Off
        SubShader {
            Tags { "Queue"="Transparent+1" "RenderType"="Transparent" }
            Pass {
                CGPROGRAM
                #pragma vertex vert
                #pragma fragment frag
                #pragma fragmentoption ARB_precision_hint_fastest
                #pragma glsl_no_auto_normalization
                #pragma target 3.0
                #pragma multi_compile cubeMap_on cubeMap_off
                //#pragma glsl
                #include "UnityCG.cginc"

                // Uniforms used by the vertex stage are full precision (see header note).
                float _WaveScale;
                float _TexturesScale;
                fixed4 _Color;
                fixed4 _GlareColor;
                fixed4 _ReflectionColor;
                sampler2D _GrabTextureMobile;
                fixed4 _GrabTextureMobile_TexelSize;
                sampler2D _Wave1;
                sampler2D _Wave2;
                samplerCUBE _Cube;
                float4 _Direction;
                fixed _FPOW;
                fixed _R0;
                fixed _OffsetFresnel;
                float _Distortion;
                float _DistortionVert;
                sampler2D _ReflectionTex;
                fixed _Bias;
                fixed _Scale;
                fixed _Power;
                fixed _Shininess;
                fixed4 _LightColor0;
                float4 _GAmplitude;
                float4 _GFrequency;
                float4 _GSteepness;
                float4 _GSpeed;
                float4 _GDirectionAB;
                float4 _GDirectionCD;
                float4 _Wave1_ST;
                float4 _Wave2_ST;
                fixed4 _Wave3_ST;
                sampler2D _WaterDisplacementTexture;

                // Mesh input. Object-space data must be full precision.
                struct appdata_t {
                    float4 vertex : POSITION;
                    float2 texcoord: TEXCOORD0;
                    float3 normal : NORMAL;
                };

                // Vertex-to-fragment interpolants.
                struct v2f {
                    float4 vertex : SV_POSITION;   // clip-space position
                    float4 uvgrab : TEXCOORD0;     // projective coords for the grab/reflection textures
                    float4 uvWave12 : TEXCOORD1;   // xy: wave 1 UV, zw: wave 2 UV
                    float4 offset : TEXCOORD2;     // xy: cubic distortion scale, zw: linear distortion scale
                    #if cubeMap_on
                    float3 reflectionDir : TEXCOORD3;
                    #endif
                };

                // Displaces the vertex with four Gerstner waves and prepares the
                // scrolling wave UVs and the projective grab coordinates.
                v2f vert (appdata_t v)
                {
                    v2f o;
                    float2 time1 = float2(fmod(_Time.x*_Direction.x, 1), fmod(_Time.x*_Direction.y, 1));
                    float2 time2 = float2(fmod(_Time.x*_Direction.z, 1), fmod(_Time.x*_Direction.w, 1));
                    float3 posWorld = mul(unity_ObjectToWorld, v.vertex).xyz;
                    float2 scaleeUv = -posWorld.xz / _TexturesScale;
                    o.uvWave12.xy = scaleeUv * _Wave1_ST.xy + _Wave1_ST.w + time1;
                    o.uvWave12.zw = scaleeUv * _Wave2_ST.xy + _Wave2_ST.w + time2;

                    //--------------------Gerstner waves-------------
                    float2 vtxForAni = posWorld.xz / _WaveScale;
                    float3 offsets;
                    float4 AB = _GSteepness.xxyy * _GAmplitude.xxyy * _GDirectionAB.xyzw;
                    float4 CD = _GSteepness.zzww * _GAmplitude.zzww * _GDirectionCD.xyzw;
                    float4 dotABCD = _GFrequency.xyzw * float4(dot(_GDirectionAB.xy, vtxForAni), dot(_GDirectionAB.zw, vtxForAni), dot(_GDirectionCD.xy, vtxForAni), dot(_GDirectionCD.zw, vtxForAni));
                    float4 TIME = fmod(_Time.y * _GSpeed, 6.2831);
                    float4 COS = cos (dotABCD + TIME);
                    float4 SIN = sin (dotABCD + TIME);
                    offsets.x = dot(COS, float4(AB.xz, CD.xz));
                    offsets.z = dot(COS, float4(AB.yw, CD.yw));
                    offsets.y = dot(SIN, _GAmplitude);
                    //------------------------------------------------

                    v.vertex.xyz += offsets;
                    float4 oPos = UnityObjectToClipPos(v.vertex);

                    // NOTE(review): this flips the grab V coordinate whenever the platform's
                    // UVs start at the top. If the water still appears upside down on a given
                    // device, verify how _GrabTextureMobile / _ReflectionTex are rendered there.
                    #if UNITY_UV_STARTS_AT_TOP
                    float scale = -1.0;
                    #else
                    float scale = 1.0;
                    #endif
                    o.uvgrab.xy = (float2(oPos.x, oPos.y*scale) + oPos.w) * 0.5;
                    o.uvgrab.zw = oPos.zw;
                    o.uvgrab.xy += (offsets.xz + offsets.y*offsets.y)/_DistortionVert;

                    #if cubeMap_on
                    float3 normalDir = normalize(mul(float4(v.normal, 0.0), unity_WorldToObject).xyz);
                    o.reflectionDir = reflect(posWorld - _WorldSpaceCameraPos, normalDir);
                    #endif

                    o.offset.xy = _Distortion*float2(0.001, 0.001);
                    o.offset.zw = o.offset.xy/30;
                    o.offset.xy = o.offset.xy*o.offset.xy*o.offset.xy;

                    // Reuse the clip position computed above instead of transforming twice.
                    o.vertex = oPos;
                    return o;
                }

                // Distorts the grab coordinates with the two wave normal maps and
                // combines the tinted background with the reflection.
                fixed4 frag( v2f i ) : SV_Target
                {
                    fixed2 normal1 = UnpackNormal(tex2D(_Wave1, i.uvWave12.xy)).rg;
                    fixed2 normal2 = UnpackNormal(tex2D(_Wave2, i.uvWave12.zw + normal1)).rg;
                    fixed2 offset = normal2 * normal2 * normal2 * i.offset.xy + normal2 * i.offset.zw;
                    i.uvgrab.xy = offset * i.uvgrab.z + i.uvgrab.xy;
                    #if UNITY_VERSION >= 460
                    float4 coord = UNITY_PROJ_COORD(i.uvgrab);
                    #else
                    float4 coord = i.uvgrab;
                    #endif
                    fixed3 reflection;
                    #if cubeMap_off
                    reflection = tex2Dproj(_ReflectionTex, coord).rgb * _ReflectionColor.rgb;
                    #endif
                    #if cubeMap_on
                    reflection = texCUBE(_Cube, i.reflectionDir).rgb * _ReflectionColor.rgb;
                    #endif
                    fixed3 grab = tex2Dproj(_GrabTextureMobile, coord).rgb;
                    fixed3 col = grab * _Color.rgb + reflection + reflection * offset.x * 10;
                    return fixed4(col, 1);
                }
                ENDCG
            }
        }
        // ------------------------------------------------------------------
        // Fallback for older cards and Unity non-Pro
        SubShader {
            Blend DstColor Zero
            Pass {
                Name "BASE"
                SetTexture [_MainTex] { combine texture }
            }
        }
    }
}
Sorry if my english not good. Thanks you guys !

Function in Metal

I have next function:
// Box blur: averages the (2r+1) x (2r+1) texels centred on `textureCoordinate`,
// where r is `rad` truncated to an integer (negative values are treated as 0).
//
// rad               - blur radius in texels.
// tex2D             - texture to sample.
// sampler2D         - sampler used for every tap.
// textureCoordinate - normalised coordinate of the centre texel.
//
// Returns the average colour of the sampled window.
float4 blur(float rad, texture2d<float> tex2D, sampler sampler2D, float2 textureCoordinate){
    // Truncate once so the loop bounds and the weight agree. Previously the weight
    // came from the float radius but the tap count from its integer truncation, so
    // a fractional radius made the weights sum to less than 1 (a darker image).
    const int radius = max(int(rad), 0);
    const int tapsPerSide = 2 * radius + 1;
    const float weight = 1.0f / float(tapsPerSide * tapsPerSide);

    // Size of one texel in normalised coordinates (loop-invariant).
    const float2 texelSize = 1.0f / float2(tex2D.get_width(), tex2D.get_height());

    float4 blured_color = float4(0.0f);
    for (int i = -radius; i <= radius; i++) {
        for (int j = -radius; j <= radius; j++) {
            const float2 tap = textureCoordinate + float2(float(i), float(j)) * texelSize;
            blured_color += tex2D.sample(sampler2D, tap) * weight;
        }
    }
    return blured_color;
}
It blurs the given fragment.
My problem is that when I call this function, it doesn't work properly — it just makes the picture darker. But when I write the same code inline, without wrapping it in a function, it works fine:
// Blurs skin-coloured fragments and passes every other fragment through.
//
// A fragment is blurred when is_skin() accepts its colour and the interpolated
// vertex colour is not pure white (1, 1, 1). The blurred result is blended
// 43% blur / 57% original.
fragment float4 blured_background_fragment(VertexOut interpolated [[ stage_in ]],
                                           texture2d<float> tex2D [[ texture(0) ]],
                                           sampler sampler2D [[ sampler(0) ]])
{
    const float4 color = tex2D.sample(sampler2D, interpolated.textureCoordinate);
    const float3 color3 = float3(color[0] , color[1] , color[2]);
    const bool isWhiteVertex = interpolated.color[0] == 1 && interpolated.color[1] == 1 && interpolated.color[2] == 1;

    if (!is_skin(color3) || isWhiteVertex) {
        // Same texel as already sampled above; no need to fetch it a second time.
        return color;
    }

    const float width = tex2D.get_width();
    const float height = tex2D.get_height();
    // Integer radius so the loop bounds are not converted from float on every iteration.
    const int rad = 13;
    const float weight = 1.0f / float((2 * rad + 1) * (2 * rad + 1));

    float4 blured_color = float4(0,0,0,0);
    for (int i = -rad; i <= rad; i++) {
        for (int j = -rad; j <= rad; j++) {
            blured_color += tex2D.sample(sampler2D, interpolated.textureCoordinate + float2(i/width, j/height)) * weight;
        }
    }
    // Here I try to call this blur function
    // float4 blured_color = blur(13, tex2D, sampler2D, interpolated.textureCoordinate);
    return blured_color * 0.43 + color * 0.57;
}

Nondeterministic artifacts in HLSL shader

I have a HLSL shader (compiled using fxc and ps_2_b) that is giving me flickering artifacts. First I need to establish that all the variables seem to be constant, and their values with the following shader:
// Source rectangle from constant register c0, unpacked below as
// (x, z) = minimum corner and (y, w) = maximum corner.
float4 src_rect : register(c0);

// Probe shader: colour-codes pixels so the incoming values can be read off
// the rendered image.
//   uv.x == 639 -> yellow, uv.y == 359 -> green, uv.y == 0 -> blue,
//   uv.x == 0 -> cyan, otherwise red when src_rect is exactly
//   min (0, 0) / max (640, 360), and black in every other case.
float4 main(float2 uv : TEXCOORD) : COLOR {
    const float2 rectMin = float2(src_rect.x, src_rect.z);
    const float2 rectMax = float2(src_rect.y, src_rect.w);
    // Computed as in the original; not used by this probe.
    const float2 rectUv = (uv - rectMin) / (rectMax - rectMin);

    float4 result = float4(0, 0, 0, 1);
    if (uv.x == 639) {
        result.rg = 1;
    } else if (uv.y == 359) {
        result.g = 1;
    } else if (uv.y == 0) {
        result.b = 1;
    } else if (uv.x == 0) {
        result.gb = 1;
    } else if (rectMin.x == 0 && rectMin.y == 0 && rectMax.x == 640 && rectMax.y == 360) {
        result.rgb = float3(1, 0, 0);
    }
    return result;
}
Here is the output of this shader:
Image
So uvmin = {0, 0} and uvmax = {640, 360}. The top-left pixel uv is (0, 0) and the bottom-right pixel uv is (639, 359). Here's a shader reproducing my issue:
// Source rectangle from constant register c0; main() unpacks it as
// (x, z) = minimum corner and (y, w) = maximum corner.
float4 src_rect : register(c0);
// Reproduction shader: border pixels are colour-coded by exact uv matches and
// every other pixel shows frac(floor(uv1.x*1.001)*9.999), where uv1 is uv
// rescaled into the source rectangle.
// NOTE(review): presumably the artifact depends on how the division by
// (uvmax - uvmin) is evaluated at run time, so the statements are kept as written.
float4 main(float2 uv : TEXCOORD) : COLOR {
float2 uvmin = {src_rect.x, src_rect.z};
float2 uvmax = {src_rect.y, src_rect.w};
// uv normalised to the source rectangle.
float2 uv1 = (uv - uvmin)/(uvmax - uvmin);
// Start from opaque black.
float4 c = 1;
c.rgb = 0;
if (uv.x == 639)
c.rg = 1;
else if (uv.y == 359)
c.g = 1;
else if (uv.y == 0)
c.b = 1;
else if (uv.x == 0)
c.gb = 1;
else {
// floor() yields 0 while uv1.x*1.001 stays below 1; any value that reaches 1
// turns the pixel almost white (frac(9.999)).
c.rgb = frac(floor(uv1.x*1.001)*9.999);
}
return c;
}
The output of this shader produces the output below, but with the white parts flickering black and white:
Image
If I plug the values for uvmin and uvmax in as constants, the problem no longer occurs. Here is the shader:
// Still declared, but not read by this variant of main().
float4 src_rect : register(c0);
// Same reproduction shader with the rectangle bounds written as literals
// instead of being read from src_rect.
float4 main(float2 uv : TEXCOORD) : COLOR {
float2 uvmin = {0, 0};
float2 uvmax = {640, 360};
// uv normalised to the literal 640 x 360 rectangle.
float2 uv1 = (uv - uvmin)/(uvmax - uvmin);
// Start from opaque black.
float4 c = 1;
c.rgb = 0;
if (uv.x == 639)
c.rg = 1;
else if (uv.y == 359)
c.g = 1;
else if (uv.y == 0)
c.b = 1;
else if (uv.x == 0)
c.gb = 1;
else {
c.rgb = frac(floor(uv1.x*1.001)*9.999);
}
return c;
}
StackOVerflow won't let me include a link to the output, but it is the same as the initial image but with black instead of red.

Pixel Shader performance on xbox

I've got a pixel shader (below) that I'm using with XNA. On my laptop (which has a weak graphics card) it runs a little jerkily, but acceptably. I've just tried running it on the Xbox and it's horrible!
There's nothing else to the game (it's just a fractal renderer), so it has to be the pixel shader causing the issues. I also think it's the pixel shader code because when I lower the iteration count it runs fine. I've also checked, and the GC delta is zero.
Are there any HLSL functions that should be avoided on the Xbox? I must be doing something wrong here; performance can't be that bad!
#include "FractalBase.fxh"
// Exponent applied to z on every iteration (passed to ComPow); ZPower * ZPower
// is also used as the bailout threshold.
float ZPower;
// Per-channel multiplier applied to the shade of escaped pixels.
float3 Colour;
// Second per-channel multiplier, applied together with Colour.
float3 ColourScale;
// Modulus of the complex number Arg.x + i * Arg.y.
float ComAbs(float2 Arg)
{
    return length(Arg);
}
// Raises the complex number Arg (x = real, y = imaginary) to a real power
// using polar form: |Arg|^Power * (cos(Power * theta), sin(Power * theta)).
float2 ComPow(float2 Arg, float Power)
{
    // |Arg|^Power computed from the squared modulus, which avoids a square root.
    float Mod = pow(dot(Arg, Arg), Power * 0.5);
    float Ang = atan2(Arg.y, Arg.x) * Power;

    // One sincos call instead of separate sin and cos evaluations of the same angle.
    float SinAng;
    float CosAng;
    sincos(Ang, SinAng, CosAng);
    return Mod * float2(CosAng, SinAng);
}
// Escape-time fractal: iterates z = z^ZPower + c, with c taken from texCoord,
// until |z|^2 reaches ZPower^2 or `Iterations` steps have run.
// Pixels that reach (or almost reach) the iteration limit are opaque black;
// the rest are shaded by iteration count and tinted with Colour * ColourScale.
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
    float2 c = texCoord.xy;
    float2 z = 0;
    float previousMagSq = 0;
    float magSq = 0;
    float i;
    for (i = 0; i < Iterations; i++)
    {
        z = ComPow(z, ZPower) + c;
        magSq = z.x * z.x + z.y * z.y;
        if (magSq >= ZPower * ZPower)
        {
            break;
        }
        previousMagSq = magSq;
    }

    // Iteration count plus a correction derived from the last two magnitudes.
    float normalisedIterations = i / Iterations;
    float factor = (magSq - previousMagSq) / (ZPower * ZPower - previousMagSq);
    float shade = normalisedIterations + (1 / factor / Iterations);

    float4 Result = float4(0.0, 0.0, 0.0, 1.0);
    if (i < Iterations - 1)
    {
        Result.rgb = shade * Colour * ColourScale;
    }
    return Result;
}

// Single pass: sprite vertex shader plus the fractal pixel shader compiled
// with a fixed budget of 128 iterations.
technique Technique1
{
    pass
    {
        VertexShader = compile vs_3_0 SpriteVertexShader();
        PixelShader = compile ps_3_0 FractalPixelShader(128);
    }
}
Below is FractalBase.fxh:
// Transform applied to the incoming sprite position (vertex-shader register c0).
float4x4 MatrixTransform : register(vs, c0);
// Offset subtracted from the scaled position; its y component is negated.
float2 Pan;
// Uniform scale applied to the transformed position.
float Zoom;
// Additional scale applied to the y axis only.
float Aspect;

// Transforms the sprite vertex and derives the complex-plane coordinate
// handed to the pixel shader through texCoord.
void SpriteVertexShader(inout float4 Colour : COLOR0,
                        inout float2 texCoord : TEXCOORD0,
                        inout float4 position : SV_Position)
{
    position = mul(position, MatrixTransform);
    // Convert the position from screen space into complex coordinates.
    // Swizzle explicitly: the original relied on an implicit float4 -> float2 truncation.
    texCoord = position.xy * Zoom * float2(1, Aspect) - float2(Pan.x, -Pan.y);
}
EDIT I did try removing the conditional by using lots of lerps, however when i did that i got loads of artifacts (and not the kind that "belong in a museum"!). I changed things around, and fixed a few logic errors, however the key was to multiply the GreaterThan result by 1 + epsilon, to account for rounding errors just making 0.9999 = 0 (integer). See the fixed code below:
#include "FractalBase.fxh"
// Exponent applied to z on every iteration (passed to ComPow); ZPower * ZPower
// is also the threshold handed to GreaterThan for the bailout test.
float ZPower;
// Per-channel multiplier applied to the shade of escaped pixels.
float3 Colour;
// Second per-channel multiplier, applied together with Colour.
float3 ColourScale;
// Modulus of the complex number Arg.x + i * Arg.y.
float ComAbs(float2 Arg)
{
    return length(Arg);
}
// Raises the complex number Arg (x = real, y = imaginary) to a real power
// using polar form: |Arg|^Power * (cos(Power * theta), sin(Power * theta)).
float2 ComPow(float2 Arg, float Power)
{
    // |Arg|^Power computed from the squared modulus, which avoids a square root.
    float Mod = pow(dot(Arg, Arg), Power * 0.5);
    float Ang = atan2(Arg.y, Arg.x) * Power;

    // One sincos call instead of separate sin and cos evaluations of the same angle.
    float SinAng;
    float CosAng;
    sincos(Ang, SinAng, CosAng);
    return Mod * float2(CosAng, SinAng);
}
// Returns 1.001 when x > y and 0 otherwise.
//
// The result is deliberately slightly above 1 so that callers which store it in
// an int still get 1 after truncation. The previous arithmetic form,
// (x - y) / (2 * abs(x - y)), divided 0 by 0 when x == y; a select has no such
// case and needs no division.
float GreaterThan(float x, float y)
{
    return (x > y) ? 1.001 : 0.0;
}
// Branch-free variant of the escape-time fractal: the loop always runs the full
// `Iterations` count and uses lerp() with 0/1 flags instead of `break`.
// NOTE(review): several lerp results are stored in int variables, so the code
// relies on float-to-int truncation; statement order matters and is left as is.
float4 FractalPixelShader(float2 texCoord : TEXCOORD0, uniform float Iterations) : COLOR0
{
float2 c = texCoord.xy;
float2 z = 0;
int i;
float oldBailoutTest = 0;
float bailoutTest = 0;
// 1 while the point is still being iterated, 0 once it has bailed out.
int KeepGoing = 1;
// Iteration index at which the point bailed out; stays at Iterations if it never does.
int DoneIterations = Iterations;
// Latches to 1 the first time the bailout test succeeds.
int Bailout = 0;
for(i = 0; i < Iterations; i++)
{
// Advance z and its squared magnitude only while KeepGoing is 1; otherwise hold them.
z = lerp(z, ComPow(z, ZPower) + c, KeepGoing);
bailoutTest = lerp(bailoutTest, z.x * z.x + z.y * z.y, KeepGoing);
// The blend weight (-abs(Bailout) + 1) is 1 while Bailout is 0 and 0 once it is 1,
// so the flag cannot be cleared again after it has been set.
Bailout = lerp(Bailout, GreaterThan(bailoutTest, ZPower * ZPower), -abs(Bailout) + 1);
KeepGoing = lerp(KeepGoing, 0.0, Bailout);
// min() keeps the earliest iteration index once Bailout is set.
DoneIterations = lerp(DoneIterations, min(i, DoneIterations), Bailout);
// Updated only while KeepGoing is still 1.
oldBailoutTest = lerp(oldBailoutTest, bailoutTest, KeepGoing);
}
float normalisedIterations = DoneIterations / Iterations;
float factor = (bailoutTest - oldBailoutTest) / (ZPower * ZPower - oldBailoutTest);
float4 Result = normalisedIterations + (1 / factor / Iterations);
// Points that reached (or almost reached) the iteration limit are opaque black;
// the rest are tinted with Colour * ColourScale.
Result = (DoneIterations >= Iterations - 1) ? float4(0.0, 0.0, 0.0, 1.0) : float4(Result.x * Colour.r * ColourScale.x, Result.y * Colour.g * ColourScale.y, Result.z * Colour.b * ColourScale.z, 1);
return Result;
}
// Single pass: sprite vertex shader plus the fractal pixel shader compiled
// with a fixed budget of 128 iterations.
technique Technique1
{
pass
{
VertexShader = compile vs_3_0 SpriteVertexShader();
PixelShader = compile ps_3_0 FractalPixelShader(128);
}
}
The Xbox has a fairly large block size, so branching on the Xbox isn't always efficient. Also, the compiler isn't always effective at emitting the dynamic branches that your code appears to rely on.
Look into the branch attribute: http://msdn.microsoft.com/en-us/library/bb313972%28v=xnagamestudio.31%29.aspx
Also, if you remove the early bailout, does the PC performance become more similar to the Xbox?
Keep in mind that modern graphics cards are actually quite a bit faster than the Xenon unit by now.

Resources