I have sample metal code that I'm trying to convert to iOS. Is there an iOS compatible value that I can use for bt601?
#include <metal_stdlib>
#include "utilities.h" // error not found
using namespace metal;
/// Applies a 3x3 Laplacian filter to `inTexture` and writes the Rec. 601
/// weighted response as an opaque gray pixel to `outTexture`.
///
/// @param inTexture  Source image, read one texel at a time.
/// @param outTexture Destination image; receives (value, value, value, 1).
/// @param gid        Position of this thread in the dispatch grid; used as the
///                   pixel coordinate.
kernel void laplace(texture2d<half, access::read> inTexture [[ texture(0) ]],
                    texture2d<half, access::read_write> outTexture [[ texture(1) ]],
                    uint2 gid [[ thread_position_in_grid ]]) {
    // The grid may be rounded up past the texture size; such threads have
    // nothing to write.
    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) {
        return;
    }

    constexpr int kernel_size = 3;
    constexpr int radius = kernel_size / 2;

    // Rec. 601 luma weights for (R, G, B); they sum to 1.0. Declared as half3
    // so both operands of dot() below have the same vector type.
    const half3 bt601 = half3(0.299h, 0.587h, 0.114h);

    const half3x3 laplace_kernel = half3x3(0, 1, 0,
                                           1, -4, 1,
                                           0, 1, 0);

    // Largest valid texel coordinate of the source, used to clamp neighbours.
    const int2 max_coord = int2(int(inTexture.get_width()) - 1,
                                int(inTexture.get_height()) - 1);

    half4 acc_color(0, 0, 0, 0);
    for (int j = 0; j < kernel_size; j++) {
        for (int i = 0; i < kernel_size; i++) {
            // Work in signed coordinates: `gid.x + (i - radius)` in unsigned
            // arithmetic wraps to a huge value at the left/top edge. Clamping
            // replicates the border texel instead of reading out of bounds.
            const int2 coord = clamp(int2(gid) + int2(i - radius, j - radius),
                                     int2(0), max_coord);
            acc_color += laplace_kernel[i][j] * inTexture.read(uint2(coord));
        }
    }

    const half value = dot(acc_color.rgb, bt601);
    outTexture.write(half4(value, value, value, 1.0h), gid);
}
It seems that the intention here is simply to derive a single "luminance" value from the RGB output of the kernel. In that case, bt601 would be a three-element vector whose components are the desired weights of the respective channels, summing to 1.0.
Borrowing values from Rec. 601, we might define it like this:
// Rec. 601 luma weights for (R, G, B); the three components sum to 1.0.
// NOTE(review): the laplace kernel accumulates in half precision; a half3 may
// be required for dot() to accept both operands — confirm before pasting.
float3 bt601(0.299f, 0.587f, 0.114f);
This is certainly a common choice. Another popular choice uses coefficients found in the Rec. 709 standard. That would look like this:
// Rec. 709 luma weights for (R, G, B); the three components sum to 1.0.
float3 bt709(0.212671f, 0.715160f, 0.072169f);
Both of these vectors will give you a single gray value that approximates the brightness of a linear sRGB color. Whether either of them is "correct" depends on the provenance of your data and how you process it further down the pipeline.
For whatever it's worth, the MetalPerformanceShaders MPSImageThresholdBinary kernel seems to favor the BT.601 values.
I'd recommend taking a look at this answer for more detail on the issues, and conditions under which the use of these values is appropriate.
Related
I need to overlay the edges detected in live video preview with a color of my choice (as is done in Lightroom CC app when you adjust focus). What's the easiest way to draw those lines in real time using Metal or CoreImage? I can use Sobel edge detection to detect the edges using Metal Performance Shader but not sure how to overlay the edges with a color of my choice.
Here is an edge-detection shader for Metal:
/// Convolves the 3x3 neighbourhood of each pixel with two kernels, converts both
/// responses to gray with Rec. 601 weights, and writes the length of the
/// resulting 2-vector as an opaque gray pixel.
///
/// @param inTexture  Source image.
/// @param outTexture Destination image; receives (magnitude, magnitude, magnitude, 1).
/// @param gid        Position of this thread in the dispatch grid; used as the
///                   pixel coordinate.
kernel void edge_detect(texture2d<half, access::read> inTexture [[ texture(0) ]],
                        texture2d<half, access::write> outTexture [[ texture(1) ]],
                        uint2 gid [[ thread_position_in_grid ]]) {
    // The grid may be rounded up past the texture size; skip those threads.
    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) {
        return;
    }

    constexpr int kernel_size = 3;
    constexpr int radius = kernel_size / 2;

    // NOTE(review): both kernels have identical coefficients, so the two
    // responses are always equal. Kept as in the original; confirm whether
    // directional kernels were intended.
    const half3x3 horizontal_kernel = half3x3(-1./8., -1./8., -1./8.,
                                              -1./8.,  1.,    -1./8.,
                                              -1./8., -1./8., -1./8.);
    const half3x3 vertical_kernel = half3x3(-1./8., -1./8., -1./8.,
                                            -1./8.,  1.,    -1./8.,
                                            -1./8., -1./8., -1./8.);

    // Largest valid texel coordinate of the source, used to clamp neighbours.
    const int2 max_coord = int2(int(inTexture.get_width()) - 1,
                                int(inTexture.get_height()) - 1);

    half3 result_horizontal(0, 0, 0);
    half3 result_vertical(0, 0, 0);
    for (int j = 0; j < kernel_size; j++) {
        for (int i = 0; i < kernel_size; i++) {
            // Signed, clamped coordinates: unsigned `gid.x + (i - radius)`
            // wraps around at the left/top edge and leaves the texture at the
            // right/bottom edge.
            const int2 coord = clamp(int2(gid) + int2(i - radius, j - radius),
                                     int2(0), max_coord);
            // Fetch the texel once and feed it to both kernels.
            const half3 texel = inTexture.read(uint2(coord)).rgb;
            result_horizontal += horizontal_kernel[i][j] * texel;
            result_vertical += vertical_kernel[i][j] * texel;
        }
    }

    // Rec. 601 luma weights for (R, G, B).
    const half3 bt601 = half3(0.299, 0.587, 0.114);
    const half gray_horizontal = dot(result_horizontal, bt601);
    const half gray_vertical = dot(result_vertical, bt601);
    const half magnitude = length(half2(gray_horizontal, gray_vertical));
    outTexture.write(half4(half3(magnitude), 1), gid);
}
I know this is late, but if anyone still needs it I figured it out just now. It's very easy to find an edge detection shader, but not easy to figure out how to change the color of the detected edges, especially if you are new to this. Here is my kernel:
// Colour argument bound to the edgeEffect kernel at buffer(1).
// NOTE(review): simd_float3 suggests this struct is shared with host code via
// <simd/simd.h> — confirm the host-side layout matches.
typedef struct {
// Colour that is multiplied by the edge magnitude before being written out.
simd_float3 rgb;
} AppliedColor;
/// Sobel edge detector that tints the detected edges with a caller-supplied
/// colour.
///
/// @param inputTexture  Source image.
/// @param outputTexture Destination image; receives (newColor.rgb * magnitude, 1).
/// @param edgeStrength  Scale factor applied to the gradient magnitude.
/// @param newColor      Colour used to paint the edges.
/// @param gid           Position of this thread in the dispatch grid; used as
///                      the pixel coordinate.
kernel void edgeEffect(texture2d<half, access::read> inputTexture [[ texture(0) ]],
                       texture2d<half, access::read_write> outputTexture [[ texture(1) ]],
                       constant float &edgeStrength [[ buffer(0) ]],
                       constant AppliedColor &newColor [[ buffer(1) ]],
                       uint2 gid [[thread_position_in_grid]]) {
    // The grid may be rounded up past the texture size; skip those threads.
    if (gid.x >= outputTexture.get_width() || gid.y >= outputTexture.get_height()) {
        return;
    }

    constexpr int kernelSize = 3;
    constexpr int radius = kernelSize / 2;

    // Sobel derivative kernels.
    const half3x3 horizontalKernel = half3x3(-1, -2, -1,
                                              0,  0,  0,
                                              1,  2,  1);
    const half3x3 verticalKernel = half3x3(1, 0, -1,
                                           2, 0, -2,
                                           1, 0, -1);

    // Largest valid texel coordinate of the source, used to clamp neighbours.
    const int2 maxCoord = int2(int(inputTexture.get_width()) - 1,
                               int(inputTexture.get_height()) - 1);

    half3 horizontalResult(0, 0, 0);
    half3 verticalResult(0, 0, 0);
    for (int j = 0; j < kernelSize; j++) {
        for (int i = 0; i < kernelSize; i++) {
            // Signed, clamped coordinates: unsigned `gid.x + (i - radius)`
            // wraps around at the left/top edge and leaves the texture at the
            // right/bottom edge.
            const int2 coord = clamp(int2(gid) + int2(i - radius, j - radius),
                                     int2(0), maxCoord);
            // Fetch the texel once and feed it to both kernels.
            const half3 texel = inputTexture.read(uint2(coord)).rgb;
            horizontalResult += horizontalKernel[i][j] * texel;
            verticalResult += verticalKernel[i][j] * texel;
        }
    }

    // Collapse each RGB response to one number by summing the channels.
    const half horizontalWhite = dot(horizontalResult, half3(1.0));
    const half verticalWhite = dot(verticalResult, half3(1.0));
    const half magnitude = length(half2(horizontalWhite, verticalWhite)) * edgeStrength;
    outputTexture.write(half4(half3(newColor.rgb * magnitude), 1), gid);
}//edgeEffect
This is using Sobel kernels to calculate the derivatives.
Below is my kernel. It works wonderfully if both the input and output buffers contain RGBA-32 bit pixel data. I've made this kernel slightly inefficient to show Metal's seeming ineptitude in dealing with 24-bit data.
(I previously had this working with the input and output buffers being declared as containing uint32_t data)
/// Copies one pixel from a 4-bytes-per-pixel source image into a packed output
/// buffer that is `packWidth` pixels wide.
///
/// Output rows are grouped into bands of `imgHeight` rows. The band index times
/// `packWidth`, plus the output column, gives the source column (wrapped modulo
/// `imgWidth`); the row within the band gives the source row.
///
/// @param inBuffer  Source bytes, 4 per pixel; only the first three are copied.
/// @param outBuffer Destination pixels; the fourth component is forced to 255.
/// @param imgWidth  Pointer to the source width in pixels.
/// @param imgHeight Pointer to the source height in pixels.
/// @param packWidth Pointer to the output row length in pixels.
/// @param gid       Output pixel coordinate handled by this thread.
kernel void stripe_Kernel(device const uchar *inBuffer [[ buffer(0) ]],
                          device uchar4 *outBuffer [[ buffer(1) ]],
                          device const ushort *imgWidth [[ buffer(2) ]],
                          device const ushort *imgHeight [[ buffer(3) ]],
                          device const ushort *packWidth [[ buffer(4) ]],
                          uint2 gid [[ thread_position_in_grid ]])
{
    const ushort srcWidth = *imgWidth;
    const ushort srcHeight = *imgHeight;
    const ushort rowPitch = *packWidth; // eg. 2048

    const uint32_t dstX = gid.x; // eg. 0...2047
    const uint32_t dstY = gid.y; // eg. 0...895

    // Which band of srcHeight rows this output row belongs to.
    const int band = static_cast<int>(dstY / srcHeight);
    const uint32_t srcX = (band * rowPitch + dstX) % srcWidth;
    const uint32_t srcY = dstY % srcHeight;

    // Byte offset of the source pixel; the factor 4 is the 32-bit pixel stride.
    const uint32_t byteOffset = (srcY * srcWidth + srcX) * 4;

    outBuffer[dstY * rowPitch + dstX] = uchar4(inBuffer[byteOffset],
                                               inBuffer[byteOffset + 1],
                                               inBuffer[byteOffset + 2],
                                               255);
}
I should mention that the inBuffer has been allocated as follows:
// Page-aligned (0x4000-byte) staging buffer holding 4 bytes per pixel.
unsigned char *diskFrame = NULL;
// posix_memalign takes a void ** out-parameter and reports failure through its
// return value (not errno). The size is computed in size_t so the product is
// not evaluated in a narrower type.
if (posix_memalign((void **)&diskFrame, 0x4000, (size_t)imgHeight * (size_t)imgWidth * 4) != 0) {
    diskFrame = NULL; // allocation failed; callers must check before use
}
Now... if I actually have 24-bit data in there, and use multipliers of 3 (wherever I have 4), I get an entirely black image.
What's with that?
I am writing CNN code in Metal.
Metal provides MPSCNNLocalContrastNormalization,
but since the concept of Instance Normalization is slightly different, I intend to implement it as a kernel function.
However, the problem is that the mean and variance must be computed separately for each of the R, G, and B channels of the texture that the kernel function receives as input.
I would like some hints on how to implement this.
// Placeholder for the instance-normalization kernel: the signature binds a
// sampled source array texture at texture(0) and a writable destination array
// texture at texture(1), but the body is still empty.
kernel void instance_normalization_2darray(texture2d_array<float, access::sample> src [[ texture(0) ]],
texture2d_array<float, access::write> dst [[ texture(1) ]],
uint3 tid [[thread_position_in_grid]]) {
}
/// Computes the per-channel mean of every slice of `texture_in` and writes it
/// to this thread's (x, y) position in the matching slice of `texture_out`.
///
/// NOTE(review): every thread sums a whole slice by itself, which costs
/// O(width * height) per thread. This fixes correctness only; a parallel
/// reduction would be needed for speed.
///
/// @param texture_in  Source array texture.
/// @param texture_out Destination array texture; receives the slice mean.
/// @param tid         Position of this thread in the grid; only x and y select
///                    the output texel.
kernel void calculate_avgA(texture2d_array<float, access::read> texture_in [[texture(0)]],
                           texture2d_array<float, access::write> texture_out [[texture(1)]],
                           uint3 tid [[thread_position_in_grid]])
{
    // Threads outside the destination have nothing to write.
    if (tid.x >= texture_out.get_width() || tid.y >= texture_out.get_height()) {
        return;
    }

    const uint width = texture_in.get_width();
    const uint height = texture_in.get_height();
    // Never address a slice that one of the two textures does not have.
    const uint depth = min(texture_in.get_array_size(), texture_out.get_array_size());
    const float pixel_count = float(width) * float(height);

    for (uint k = 0; k < depth; k++) {
        // The original `(0.0, 0.0, 0.0, 0.0)` was a comma expression, not a
        // vector constructor.
        float4 sum = float4(0.0f);
        for (uint i = 0; i < width; i++) {
            for (uint j = 0; j < height; j++) {
                // Index the slice directly. Offsetting by `tid`, as before,
                // walks off the texture because i and j already span it.
                sum += texture_in.read(uint2(i, j), k);
            }
        }
        texture_out.write(sum / pixel_count, tid.xy, k);
    }
}
Mr.Bista
I had the same problem; Apple doesn't provide a fast built-in function for this.
For now I just use MPSCNNPoolingAverage to calculate the mean before running my kernels.
This may only be a temporary workaround.
In my tests, other approaches, such as a reduction-sum algorithm, were no better than this.
So I will keep looking for a better implementation.
As mentioned in Apple's document, texture2d of shading language could be of int type. I have tried to use texture2d of int type as parameter of shader language, but the write method of texture2d failed to work.
/// Writes the constant texel (2, 4, 6, 8) at this thread's grid position.
/// Because the texture is declared with element type `int`, the texture bound
/// at index 0 must use a signed-integer pixel format.
kernel void dummy(texture2d<int, access::write> outTexture [[ texture(0) ]],
                  uint2 gid [[ thread_position_in_grid ]])
{
    const int4 texel = int4(2, 4, 6, 8);
    outTexture.write(texel, gid);
}
However, if I replace the int with float, it worked.
/// Writes the constant texel (1, 0, 0, 1) at this thread's grid position.
kernel void dummy(texture2d<float, access::write> outTexture [[ texture(0) ]],
                  uint2 gid [[ thread_position_in_grid ]])
{
    const float4 texel = float4(1.0, 0, 0, 1.0);
    outTexture.write(texel, gid);
}
Could other texture2d element types, such as int or short, be used as shader function parameters, and if so, how? Thanks for reviewing my question.
The related host codes:
// Describe a w x h, non-mipmapped RGBA8Unorm texture. The descriptor factory
// needs width:, height: and mipmapped: in addition to the pixel format.
// NOTE(review): w and h are taken to be the texture dimensions used by the
// read-back code — confirm.
MTLTextureDescriptor *desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA8Unorm
                                                                                width:w
                                                                               height:h
                                                                            mipmapped:NO];
// The kernel only writes to this texture.
desc.usage = MTLTextureUsageShaderWrite;
id<MTLTexture> texture = [device newTextureWithDescriptor:desc];
// Bind at index 0 to match [[ texture(0) ]] in the kernel.
[commandEncoder setTexture:texture atIndex:0];
The code that prints the output computed by the GPU; w and h are the width and height of the texture, respectively.
// Zero-initialised staging buffer with 4 bytes per pixel; calloc replaces the
// malloc + memset pair.
uint8_t* imageBytes = calloc((size_t)w * h, 4);
if( imageBytes != NULL )
{
    // Copy the whole texture (mip level 0) back to CPU memory.
    MTLRegion region = MTLRegionMake2D(0, 0, [texture width], [texture height]);
    [texture getBytes:imageBytes bytesPerRow:[texture width]*4 fromRegion:region mipmapLevel:0];
    // Print one text line per image row, one number per byte.
    // NOTE(review): pixel_size is defined outside this snippet; presumably 4,
    // to match the 4 bytes per pixel used above — confirm.
    for( int j = 0; j < h; j++ )
    {
        printf("%3d: ", j);
        for( int i = 0; i < w*pixel_size; i++ )
        {
            printf(" %3d",imageBytes[j*w*pixel_size+i] );
        }
        printf("\n");
    }
}
The problem is that the pixel format you used to create this texture (MTLPixelFormatRGBA8Unorm) is normalized, meaning that the expected pixel value range is 0.0-1.0. For normalized pixel types, the required data type for reading or writing to this texture within a Metal kernel is float or half-float.
In order to write to a texture with integers, you must select an integer pixel format. Here are all of the available formats:
https://developer.apple.com/documentation/metal/mtlpixelformat
The Metal Shading Language Guide states that:
Note: If T is int or short, the data associated with the texture must use a signed integer format. If T is uint or ushort, the data associated with the texture must use an unsigned integer format.
All you have to do is make sure the texture you write to in the API (host code) matches what you have in the kernel function. Alternatively, you can also cast the int values into float before writing to the outTexture.
I have a MTLTexture containing 16bit unsigned integers (MTLPixelFormatR16Uint). The values range from about 7000 to 20000, with 0 being used as a 'nodata' value, which is why it is skipped in the code below. I'd like to find the minimum and maximum values so I can rescale these values between 0-255. Ultimately I'll be looking to base the minimum and maximum values on a histogram of the data (it has some outliers), but for now I'm stuck on simply extracting the min/max.
I can read the data from the GPU to CPU and pull the min/max values out but would prefer to perform this task on the GPU.
First attempt
The command encoder is dispatched with 16x16 threads per thread group, the number of thread groups is based on the texture size (eg; width = textureWidth / 16, height = textureHeight / 16).
// Pair of atomic counters that the minMax kernels update with the smallest and
// largest texel value they see.
typedef struct {
// Smallest value seen so far.
atomic_uint min;
// Largest value seen so far.
atomic_uint max;
} BandMinMax;
/// Folds the value of one texel into a global running minimum and maximum,
/// skipping the nodata value 0.
///
/// Assumes the host initialises `out.min` to a value at least as large as any
/// texel (for example UINT_MAX) and `out.max` to 0 before dispatch — the
/// original comparison-based code relied on the same starting values.
///
/// @param band1 Source texture of unsigned 16-bit samples.
/// @param out   Running minimum and maximum shared by every thread.
/// @param gid   Texel coordinate handled by this thread.
kernel void minMax(texture2d<ushort, access::read> band1 [[texture(0)]],
                   device BandMinMax &out [[buffer(0)]],
                   uint2 gid [[thread_position_in_grid]])
{
    // The grid may be rounded up past the texture size; skip those threads.
    if (gid.x >= band1.get_width() || gid.y >= band1.get_height()) {
        return;
    }

    const uint value = band1.read(gid).r;
    if (value == 0) {
        return; // nodata
    }

    // A separate load, compare and store lets another thread slip in between
    // and have its result overwritten. fetch_min / fetch_max do the compare
    // and update as one atomic step.
    atomic_fetch_max_explicit(&out.max, value, memory_order_relaxed);
    atomic_fetch_min_explicit(&out.min, value, memory_order_relaxed);
}
From this I get a minimum and maximum value, but for the same dataset the min and max will often return different values. Fairly certain this is the min and max from a single thread when there are multiple threads running.
Second attempt
Building on the previous attempt, this time I'm storing the individual min/max values from each thread, all 256 (16x16).
/// Folds the value of one texel into the min/max slot selected by the thread's
/// index inside its threadgroup, skipping the nodata value 0.
///
/// Assumes the host initialises every `out[i].min` to a value at least as large
/// as any texel (for example UINT_MAX) and every `out[i].max` to 0.
///
/// @param band1 Source texture of unsigned 16-bit samples.
/// @param out   One min/max slot per thread index in a threadgroup.
/// @param gid   Texel coordinate handled by this thread.
/// @param tid   Index of this thread within its threadgroup; selects the slot.
kernel void minMax(texture2d<ushort, access::read> band1 [[texture(0)]],
                   device BandMinMax *out [[buffer(0)]],
                   uint2 gid [[thread_position_in_grid]],
                   uint tid [[ thread_index_in_threadgroup ]])
{
    // The grid may be rounded up past the texture size; skip those threads.
    if (gid.x >= band1.get_width() || gid.y >= band1.get_height()) {
        return;
    }

    const uint value = band1.read(gid).r;
    if (value == 0) {
        return; // nodata
    }

    // Threads with the same `tid` in different threadgroups share a slot, so
    // the update must still be one atomic read-modify-write.
    atomic_fetch_max_explicit(&out[tid].max, value, memory_order_relaxed);
    atomic_fetch_min_explicit(&out[tid].min, value, memory_order_relaxed);
}
This returns an array containing 256 sets of min/max values. From these I guess I could find the lowest of the minimum values, but this seems like a poor approach. Would appreciate a pointer in the right direction, thanks!
The Metal Shading Language has atomic compare-and-swap functions you can use to compare the existing value at a memory location with a value, and replace the value at that location if they don't compare equal. With these, you can create a set of atomic compare-and-replace-if-[greater|less]-than operations:
/// Atomically replaces `*current` with `candidate` when `candidate` is smaller
/// than the stored value, or when the stored value is 0 (treated as "not yet
/// set").
///
/// @param current   Atomic slot to update.
/// @param candidate Value to store if it wins the comparison.
static void atomic_uint_exchange_if_less_than(volatile device atomic_uint *current, uint candidate)
{
    // Read through the atomic API instead of casting away volatile/atomic.
    uint val = atomic_load_explicit(current, memory_order_relaxed);
    // A failed compare-exchange stores the value it observed into `val`, so
    // the condition is re-tested against fresh data without another load.
    while ((candidate < val || val == 0) &&
           !atomic_compare_exchange_weak_explicit(current,
                                                  &val,
                                                  candidate,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
        // Retry: another thread changed the slot, or the weak CAS failed
        // spuriously.
    }
}
/// Atomically replaces `*current` with `candidate` when `candidate` is larger
/// than the stored value.
///
/// @param current   Atomic slot to update.
/// @param candidate Value to store if it wins the comparison.
static void atomic_uint_exchange_if_greater_than(volatile device atomic_uint *current, uint candidate)
{
    // Read through the atomic API instead of casting away volatile/atomic.
    uint val = atomic_load_explicit(current, memory_order_relaxed);
    // A failed compare-exchange stores the value it observed into `val`, so
    // the condition is re-tested against fresh data without another load.
    while (candidate > val &&
           !atomic_compare_exchange_weak_explicit(current,
                                                  &val,
                                                  candidate,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
        // Retry: another thread changed the slot, or the weak CAS failed
        // spuriously.
    }
}
To apply these, you might create a buffer that contains one interleaved min, max pair per threadgroup. Then, in the kernel function, read from the texture and conditionally write the min and max values:
/// Folds one texel into the (min, max) pair that belongs to this thread's
/// threadgroup. Pairs are interleaved in `mapBuffer`: slot 2*g holds the min
/// and slot 2*g + 1 the max of threadgroup g (row-major over the grid).
///
/// @param texture   Source texture of unsigned 16-bit samples.
/// @param mapBuffer One interleaved (min, max) pair per threadgroup.
/// @param tpig      Texel coordinate handled by this thread.
/// @param tgpig     Position of this thread's threadgroup in the grid.
/// @param tgpg      Number of threadgroups in the grid.
kernel void min_max_per_threadgroup(texture2d<ushort, access::read> texture [[texture(0)]],
                                    device uint *mapBuffer [[buffer(0)]],
                                    uint2 tpig [[thread_position_in_grid]],
                                    uint2 tgpig [[threadgroup_position_in_grid]],
                                    uint2 tgpg [[threadgroups_per_grid]])
{
    // The grid may be rounded up past the texture size; skip those threads.
    if (tpig.x >= texture.get_width() || tpig.y >= texture.get_height()) {
        return;
    }

    const uint val = texture.read(tpig).r;
    // 0 doubles as the "unset" sentinel in atomic_uint_exchange_if_less_than.
    // Storing a 0 texel would reset the running minimum and let any later
    // value overwrite it, so zero texels are skipped.
    if (val == 0) {
        return;
    }

    device atomic_uint *atomicBuffer = (device atomic_uint *)mapBuffer;
    const uint slot = (tgpig[1] * tgpg[0] + tgpig[0]) * 2;
    atomic_uint_exchange_if_less_than(atomicBuffer + slot, val);
    atomic_uint_exchange_if_greater_than(atomicBuffer + slot + 1, val);
}
Finally, run a separate kernel to reduce over this buffer and collect the final min, max values across the entire texture:
/// Folds one per-threadgroup (min, max) pair from `mapBuffer` into the global
/// pair stored at reduceBuffer[0] (min) and reduceBuffer[1] (max).
///
/// @param mapBuffer    Interleaved (min, max) pairs, one per threadgroup.
/// @param reduceBuffer Global result: [0] = min, [1] = max.
/// @param tpig         Only x is used; it selects the pair to fold in.
kernel void min_max_reduce(constant uint *mapBuffer [[buffer(0)]],
                           device uint *reduceBuffer [[buffer(1)]],
                           uint2 tpig [[thread_position_in_grid]])
{
    const uint minv = mapBuffer[tpig[0] * 2];
    const uint maxv = mapBuffer[tpig[0] * 2 + 1];
    device atomic_uint *atomicBuffer = (device atomic_uint *)reduceBuffer;

    // A min of 0 means "this threadgroup saw no data". Feeding it to the helper
    // would reset the global minimum to the unset sentinel, so skip it.
    if (minv != 0) {
        atomic_uint_exchange_if_less_than(atomicBuffer, minv);
    }
    // A max of 0 can never exceed the stored value, so no guard is needed.
    atomic_uint_exchange_if_greater_than(atomicBuffer + 1, maxv);
}
Of course, you can only reduce over the total allowed thread execution width of the device (~256), so you may need to do the reduction in multiple passes, with each one reducing the size of the data to be operated on by a factor of the maximum thread execution width.
Disclaimer: This may not be the best technique, but it does appear to be correct in my limited testing of an OS X implementation. It was marginally faster than a naive CPU implementation on a 256x256 texture on Intel Iris Pro, but substantially slower on an Nvidia GT 750M (because of dispatch overhead).
Very interesting discussion.
I am going to share my metal code that will help your understanding.
/// Converts the image to grayscale with Rec. 601 weights and tracks the minimum
/// and maximum 8-bit gray level over the whole image.
///
/// Each threadgroup first reduces into threadgroup memory, then one thread per
/// threadgroup merges that result into the device-wide pair, so contention on
/// `min_max` is one update per threadgroup rather than one per pixel.
///
/// Assumes the host initialises min_max[0] to 255 (or larger) and min_max[1]
/// to 0 before dispatch.
///
/// @param inTexture  Source colour image.
/// @param outTexture Destination gray image.
/// @param min_max    Device-wide result: [0] = min, [1] = max gray level.
/// @param gid        Pixel coordinate handled by this thread.
/// @param tid        Index of this thread within its threadgroup.
/// @param tsz        Threadgroup size; kept for interface compatibility, no
///                   longer needed now that barriers delimit the phases.
kernel void grayscale_minmax(texture2d<half, access::read> inTexture [[texture(0)]],
                             texture2d<half, access::write> outTexture [[texture(1)]],
                             device atomic_uint *min_max [[buffer(0)]],
                             uint2 gid [[thread_position_in_grid]],
                             uint tid [[thread_index_in_threadgroup]],
                             uint2 tsz [[threads_per_threadgroup]])
{
    (void)tsz;

    // local_atomic[0]: min value, local_atomic[1]: max value
    threadgroup atomic_uint local_atomic[2];
    if (tid == 0) { // initialize thread group local vars
        atomic_store_explicit(&local_atomic[0], 255, memory_order_relaxed);
        atomic_store_explicit(&local_atomic[1], 0, memory_order_relaxed);
    }
    // Every thread must see the initial values before touching them. No thread
    // may return before the barriers, since a barrier has to be reached by the
    // whole threadgroup.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    const bool in_bounds = (gid.x < outTexture.get_width()) && (gid.y < outTexture.get_height());
    if (in_bounds) {
        // true color to gray scaled
        const half4 inColor = inTexture.read(gid);
        const half outColor = dot(inColor.rgb, half3(0.299, 0.587, 0.114));
        const uint intColor = uint(clamp(outColor, 0.h, 1.h)*255.h);

        // update threadgroup-local min/max
        atomic_fetch_min_explicit(&local_atomic[0], intColor, memory_order_relaxed);
        atomic_fetch_max_explicit(&local_atomic[1], intColor, memory_order_relaxed);

        outTexture.write(half4(half3(outColor), 1.h), gid);
    }

    // Wait until every thread in the group has contributed.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // One thread per threadgroup publishes the local result to device memory.
    if (tid == 0) {
        const uint threadgroup_min_val = atomic_load_explicit(&local_atomic[0], memory_order_relaxed);
        const uint threadgroup_max_val = atomic_load_explicit(&local_atomic[1], memory_order_relaxed);
        atomic_fetch_min_explicit(&min_max[0], threadgroup_min_val, memory_order_relaxed);
        atomic_fetch_max_explicit(&min_max[1], threadgroup_max_val, memory_order_relaxed);
    }
}