This seems to be a silly question, but I can't find a good way to loop through an array inside a kernel function; currently, I have to pass an extra buffer that contains the element count to my kernel function.
/// Iterates over the elements of `array`.
///
/// A kernel only receives a pointer to the bound buffer, so `sizeof(array)` is the
/// size of that pointer and says nothing about how many elements the buffer holds.
/// The element count therefore has to come from the host, here via `arrayCount`.
///
/// @param array      Input elements (buffer 0).
/// @param arrayCount Number of elements in `array` (buffer 1).
/// @param result     Output buffer (buffer 2).
/// @param pos        This thread's position in the grid.
kernel void test_func(constant const int2* array [[ buffer(0) ]],
                      constant const int& arrayCount [[ buffer(1) ]],
                      device half4* result [[ buffer(2) ]],
                      uint2 pos [[thread_position_in_grid]]) {
    // some code to end early if pos is outside of my data

    // A non-positive count is treated as an empty array rather than being
    // reinterpreted as a huge unsigned value.
    const uint count = arrayCount > 0 ? uint(arrayCount) : 0u;
    for (uint i = 0; i < count; ++i) {
        // do something
    }
}
Calculating the count with sizeof always yields incorrect results, whereas using the count buffer returns correct results. It also seems that MSL doesn't support the range-based for loop of C++11.
There should be a better way to do this, right?
Below is my kernel. It works wonderfully if both the input and output buffers contain RGBA-32 bit pixel data. I've made this kernel slightly inefficient to show Metal's seeming ineptitude in dealing with 24-bit data.
(I previously had this working with the input and output buffers being declared as containing uint32_t data)
/// Copies one pixel per thread from a 4-byte-per-pixel source buffer into `outBuffer`.
///
/// The output is `packWidth` pixels wide. For an output position (x, y) the source
/// row is `y % imgHeight`, and the source column is
/// `((y / imgHeight) * packWidth + x) % imgWidth`.
/// The first three source bytes are copied and the fourth output channel is set to 255.
///
/// @param inBuffer  Source bytes, addressed with 4 bytes per pixel (buffer 0).
/// @param outBuffer Destination pixels (buffer 1).
/// @param imgWidth  Pointer to the source width (buffer 2).
/// @param imgHeight Pointer to the source height (buffer 3).
/// @param packWidth Pointer to the output row width (buffer 4).
/// @param gid       This thread's position in the grid, used as the output (x, y).
kernel void stripe_Kernel(device const uchar *inBuffer [[ buffer(0) ]],
                          device uchar4 *outBuffer [[ buffer(1) ]],
                          device const ushort *imgWidth [[ buffer(2) ]],
                          device const ushort *imgHeight [[ buffer(3) ]],
                          device const ushort *packWidth [[ buffer(4) ]],
                          uint2 gid [[ thread_position_in_grid ]])
{
    const ushort srcWidth = *imgWidth;
    const ushort srcHeight = *imgHeight;
    const ushort dstWidth = *packWidth; // eg. 2048

    const uint32_t column = gid.x; // eg. 0...2047
    const uint32_t row = gid.y;    // eg. 0...895

    // How many whole source heights lie above this output row.
    const int band = (int)(row / srcHeight);
    const uint32_t srcColumn = (band * dstWidth + column) % srcWidth;
    const uint32_t srcRow = (int)(row % srcHeight);

    // Byte offset of the source pixel; this is for 32-bit data.
    const uint32_t byteOffset = (srcRow * srcWidth + srcColumn) * 4;
    device const uchar *src = inBuffer + byteOffset;

    outBuffer[row * dstWidth + column] = uchar4(src[0], src[1], src[2], 255);
}
I should mention that the inBuffer has been allocated as follows:
unsigned char *diskFrame;
posix_memalign((void *)&diskFrame,0x4000,imgHeight*imgWidth*4);
Now... if I actually have 24-bit data in there, and use multipliers of 3 (wherever I have 4), I get an entirely black image.
What's with that?
Does anyone know a proper way to calculate the mean value of a buffer of random float numbers in a Metal kernel?
Dispatching work on the compute command encoder:
// Launch one single-thread threadgroup per texture slice. The grid is still
// 1 x 1 x arrayLength, but the threadgroup depth no longer grows with the
// slice count, which could otherwise exceed the device's per-threadgroup limit.
threadsPerGroup = MTLSizeMake(1, 1, 1);
numThreadGroups = MTLSizeMake(1, 1, inputTexture.arrayLength);
[commandEncoder dispatchThreadgroups:numThreadGroups
               threadsPerThreadgroup:threadsPerGroup];
Kernel code:
/// Computes the mean of the first channel of each slice of `inTex`.
///
/// Only the thread with `id.x == 0 && id.y == 0` in each z-layer of the grid does
/// any work; it sums every texel of slice `id.z` serially and stores the mean in
/// `means[id.z]`.
///
/// @param inTex Texture array to read (texture 0).
/// @param means Output buffer with one float per slice (buffer 1).
/// @param id    This thread's position in the grid; `id.z` selects the slice.
kernel void mean(texture2d_array<float, access::read> inTex [[ texture(0) ]],
                 device float *means [[ buffer(1) ]],
                 uint3 id [[ thread_position_in_grid ]]) {
    // One worker thread per slice; ignore any thread past the last slice.
    if (id.x != 0 || id.y != 0 || id.z >= inTex.get_array_size()) {
        return;
    }

    // Query the dimensions once instead of on every loop iteration.
    const uint width = inTex.get_width();
    const uint height = inTex.get_height();

    float total = 0.0;
    for (uint i = 0; i < width; ++i) {
        for (uint j = 0; j < height; ++j) {
            total += inTex.read(uint2(i, j), id.z)[0];
        }
    }

    const float textureArea = width * height;
    // The result goes to the `means` parameter (the original wrote to an
    // undeclared identifier `out`).
    means[id.z] = total / textureArea;
}
The buffer is represented in the texture of texture2d_array type with R32Float pixel format.
If you can use an array of uint (instead of float) as your data source, I would suggest using the "Atomic Fetch and Modify functions" (as described in the Metal Shading Language specification) to write atomically to a buffer.
Here's an example of a kernel function which takes an input buffer (data: an array of 32-bit unsigned integers) and writes the sum of the buffer into an atomic buffer (sum, a pointer to a uint):
/// Atomically adds this thread's element of `data` to the counter `sum` points to.
///
/// @param data Input values, one per thread (buffer 0).
/// @param sum  Accumulator updated with a relaxed atomic add (buffer 1).
/// @param gid  This thread's position in the grid, used as the index into `data`.
kernel void sum(device uint *data [[ buffer(0) ]],
                volatile device atomic_uint *sum [[ buffer(1) ]],
                uint gid [[ thread_position_in_grid ]])
{
    const uint contribution = data[gid];
    atomic_fetch_add_explicit(sum, contribution, memory_order_relaxed);
}
In your swift file, you would set the buffers:
...
// The kernel reads 32-bit `uint` elements, so the host data must be UInt32
// (Swift's UInt is 64 bits wide on 64-bit platforms).
let data: [UInt32] = [1, 2, 3, 4]
// An array converts to the raw pointer argument directly; `&` is not allowed on a `let`.
let dataBuffer = device.makeBuffer(bytes: data, length: (data.count * MemoryLayout<UInt32>.size), options: [])
commandEncoder.setBuffer(dataBuffer, offset: 0, at: 0)
// Accumulator the kernel adds into; it starts at zero.
var sum: UInt32 = 0
let sumBuffer = device!.makeBuffer(bytes: &sum, length: MemoryLayout<UInt32>.size, options: [])
commandEncoder.setBuffer(sumBuffer, offset: 0, at: 1)
commandEncoder.endEncoding()
Commit, wait and then fetch the data from the GPU:
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
// Copy the accumulated total out of the buffer into `sum`.
let nsData = NSData(bytesNoCopy: sumBuffer.contents(),
                    length: sumBuffer.length,
                    freeWhenDone: false)
nsData.getBytes(&sum, length: sumBuffer.length)
// Convert both operands before dividing: the total and the element count have
// different integer types, and integer division would truncate the mean.
let mean = Float(sum) / Float(data.count)
print(mean)
Alternatively, if your initial data source has to be an array of float, you could use the vDSP_meanv method of the Accelerate framework which is very fast for such computation.
I hope that helped, cheers!
As mentioned in Apple's documentation, a texture2d in the Metal Shading Language can be of int type. I have tried to use a texture2d of int type as a parameter of a kernel function, but the write method of texture2d failed to work.
/// Writes the constant integer texel (2, 4, 6, 8) at this thread's grid position.
///
/// @param outTexture Write-only integer texture (texture 0).
/// @param gid        This thread's position in the grid, used as the texel coordinate.
kernel void dummy(texture2d<int, access::write> outTexture [[ texture(0) ]],
                  uint2 gid [[ thread_position_in_grid ]])
{
    const int4 texel = int4( 2, 4, 6, 8 );
    outTexture.write( texel, gid );
}
However, if I replace the int with float, it worked.
/// Writes the constant float texel (1.0, 0, 0, 1.0) at this thread's grid position.
///
/// @param outTexture Write-only float texture (texture 0).
/// @param gid        This thread's position in the grid, used as the texel coordinate.
kernel void dummy(texture2d<float, access::write> outTexture [[ texture(0) ]],
                  uint2 gid [[ thread_position_in_grid ]])
{
    const float4 texel = float4( 1.0, 0, 0, 1.0 );
    outTexture.write( texel, gid );
}
Could other types of texture2d, such as texture2d of int, texture2d of short and so on, be used as shader function parameters, and how should they be used? Thanks for reviewing my question.
The related host code:
// Describe a non-mipmapped 2D texture that a kernel can write to.
// NOTE(review): w and h are assumed to be the texture width and height used by
// the read-back code — confirm against the surrounding file.
MTLTextureDescriptor *desc =
    [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA8Unorm
                                                       width:w
                                                      height:h
                                                   mipmapped:NO];
desc.usage = MTLTextureUsageShaderWrite;
id<MTLTexture> texture = [device newTextureWithDescriptor:desc];
[commandEncoder setTexture:texture atIndex:0];
The code to show the output computed by the GPU; w and h represent the width and height of the texture, respectively.
// Read the texture back into a zeroed buffer of w*h*4 bytes and print it row by row.
uint8_t* imageBytes = malloc(w*h*4);
memset( imageBytes, 0, w*h*4 );
MTLRegion region = MTLRegionMake2D(0, 0, [texture width], [texture height]);
[texture getBytes:imageBytes bytesPerRow:[texture width]*4 fromRegion:region mipmapLevel:0];
for( int j = 0; j < h; j++ )
{
    printf("%3d: ", j);
    // NOTE(review): pixel_size is not defined in this snippet; the buffer above is
    // sized for 4 bytes per pixel — confirm the two agree.
    for( int i = 0; i < w*pixel_size; i++ )
    {
        printf(" %3d",imageBytes[j*w*pixel_size+i] );
    }
    printf("\n");
}
The problem is that the pixel format you used to create this texture (MTLPixelFormatRGBA8Unorm) is normalized, meaning that the expected pixel value range is 0.0-1.0. For normalized pixel types, the required data type for reading or writing to this texture within a Metal kernel is float or half-float.
In order to write to a texture with integers, you must select an integer pixel format. Here are all of the available formats:
https://developer.apple.com/documentation/metal/mtlpixelformat
The Metal Shading Language Guide states that:
Note: If T is int or short, the data associated with the texture must use a signed integer format. If T is uint or ushort, the data associated with the texture must use an unsigned integer format.
All you have to do is make sure the texture you write to in the API (host code) matches what you have in the kernel function. Alternatively, you can also cast the int values into float before writing to the outTexture.
I have a MTLTexture containing 16bit unsigned integers (MTLPixelFormatR16Uint). The values range from about 7000 to 20000, with 0 being used as a 'nodata' value, which is why it is skipped in the code below. I'd like to find the minimum and maximum values so I can rescale these values between 0-255. Ultimately I'll be looking to base the minimum and maximum values on a histogram of the data (it has some outliers), but for now I'm stuck on simply extracting the min/max.
I can read the data from the GPU to CPU and pull the min/max values out but would prefer to perform this task on the GPU.
First attempt
The command encoder is dispatched with 16x16 threads per thread group, the number of thread groups is based on the texture size (eg; width = textureWidth / 16, height = textureHeight / 16).
/// Pair of atomic counters holding a minimum and a maximum value.
typedef struct BandMinMax {
    atomic_uint min; // smallest value recorded
    atomic_uint max; // largest value recorded
} BandMinMax;
/// Folds every non-zero texel of `band1` into a single min/max pair.
///
/// Each update is one atomic read-modify-write. Loading the current value and
/// then storing a new one as two separate steps lets another thread overwrite
/// the result in between, which makes the outcome vary from run to run.
///
/// NOTE(review): `out.min` must be seeded by the host with a value at least as
/// large as any sample (e.g. 0xFFFFFFFF) and `out.max` with 0 — confirm.
///
/// @param band1 Source texture; a value of 0 is skipped (texture 0).
/// @param out   Min/max pair shared by all threads (buffer 0).
/// @param gid   This thread's position in the grid, used as the texel coordinate.
kernel void minMax(texture2d<ushort, access::read> band1 [[texture(0)]],
                   device BandMinMax &out [[buffer(0)]],
                   uint2 gid [[thread_position_in_grid]])
{
    // Threads outside the texture have nothing to contribute.
    if (gid.x >= band1.get_width() || gid.y >= band1.get_height()) {
        return;
    }

    const ushort value = band1.read(gid).r;
    if (value == 0) {
        return;
    }

    atomic_fetch_max_explicit(&out.max, uint(value), memory_order_relaxed);
    atomic_fetch_min_explicit(&out.min, uint(value), memory_order_relaxed);
}
From this I get a minimum and maximum value, but for the same dataset the min and max will often return different values. Fairly certain this is the min and max from a single thread when there are multiple threads running.
Second attempt
Building on the previous attempt, this time I'm storing the individual min/max values from each thread, all 256 (16x16).
/// Folds every non-zero texel of `band1` into the min/max pair `out[tid]`, where
/// `tid` is the thread's index within its threadgroup.
///
/// Threads with the same `tid` in different threadgroups share a slot, so each
/// update is one atomic read-modify-write rather than a separate load and store.
///
/// NOTE(review): every `out[i].min` must be seeded by the host with a value at
/// least as large as any sample (e.g. 0xFFFFFFFF) and `out[i].max` with 0 — confirm.
///
/// @param band1 Source texture; a value of 0 is skipped (texture 0).
/// @param out   Array of min/max pairs indexed by `tid` (buffer 0).
/// @param gid   This thread's position in the grid, used as the texel coordinate.
/// @param tid   This thread's index within its threadgroup.
kernel void minMax(texture2d<ushort, access::read> band1 [[texture(0)]],
                   device BandMinMax *out [[buffer(0)]],
                   uint2 gid [[thread_position_in_grid]],
                   uint tid [[ thread_index_in_threadgroup ]])
{
    // Threads outside the texture have nothing to contribute.
    if (gid.x >= band1.get_width() || gid.y >= band1.get_height()) {
        return;
    }

    const ushort value = band1.read(gid).r;
    if (value == 0) {
        return;
    }

    atomic_fetch_max_explicit(&out[tid].max, uint(value), memory_order_relaxed);
    atomic_fetch_min_explicit(&out[tid].min, uint(value), memory_order_relaxed);
}
This returns an array containing 256 sets of min/max values. From these I guess I could find the lowest of the minimum values, but this seems like a poor approach. Would appreciate a pointer in the right direction, thanks!
The Metal Shading Language has atomic compare-and-swap functions you can use to compare the existing value at a memory location with an expected value, and replace the value at that location only if they compare equal (otherwise the expected value is updated with the value actually found). With these, you can create a set of atomic compare-and-replace-if-[greater|less]-than operations:
/// Atomically lowers `*current` to `candidate` when `candidate` is smaller.
///
/// A stored value of 0 means "no value recorded yet" and is always replaced.
/// For the same reason a `candidate` of 0 is ignored: storing it would put the
/// slot back into the unset state and discard the minimum found so far.
///
/// @param current   Slot holding the running minimum.
/// @param candidate Value to merge into the running minimum.
static void atomic_uint_exchange_if_less_than(volatile device atomic_uint *current, uint candidate)
{
    if (candidate == 0) {
        return;
    }

    // Read through the atomic API instead of casting the atomic away.
    uint observed = atomic_load_explicit(current, memory_order_relaxed);
    while ((observed == 0 || candidate < observed) &&
           !atomic_compare_exchange_weak_explicit(current,
                                                  &observed,
                                                  candidate,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
        // A failed exchange refreshes `observed` with the stored value, so the
        // loop condition re-tests against what is actually in memory.
    }
}
/// Atomically raises `*current` to `candidate` when `candidate` is larger.
///
/// @param current   Slot holding the running maximum.
/// @param candidate Value to merge into the running maximum.
static void atomic_uint_exchange_if_greater_than(volatile device atomic_uint *current, uint candidate)
{
    // Read through the atomic API instead of casting the atomic away.
    uint observed = atomic_load_explicit(current, memory_order_relaxed);
    while (candidate > observed &&
           !atomic_compare_exchange_weak_explicit(current,
                                                  &observed,
                                                  candidate,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
        // A failed exchange refreshes `observed` with the stored value, so the
        // loop condition re-tests against what is actually in memory.
    }
}
To apply these, you might create a buffer that contains one interleaved min, max pair per threadgroup. Then, in the kernel function, read from the texture and conditionally write the min and max values:
/// Records the minimum and maximum texel value seen by each threadgroup.
///
/// `mapBuffer` holds one interleaved (min, max) pair per threadgroup, indexed
/// by the threadgroup's row-major position in the grid. A stored minimum of 0
/// means "unset" to the min helper, so zero texels are skipped instead of
/// being merged.
///
/// @param texture   Source texture (texture 0).
/// @param mapBuffer Interleaved per-threadgroup (min, max) pairs (buffer 0).
/// @param tpig      This thread's position in the grid.
/// @param tgpig     This thread's threadgroup position in the grid.
/// @param tgpg      Number of threadgroups in the grid.
kernel void min_max_per_threadgroup(texture2d<ushort, access::read> texture [[texture(0)]],
                                    device uint *mapBuffer [[buffer(0)]],
                                    uint2 tpig [[thread_position_in_grid]],
                                    uint2 tgpig [[threadgroup_position_in_grid]],
                                    uint2 tgpg [[threadgroups_per_grid]])
{
    // Threads outside the texture have nothing to contribute.
    if (tpig.x >= texture.get_width() || tpig.y >= texture.get_height()) {
        return;
    }

    const ushort val = texture.read(tpig).r;
    if (val == 0) {
        return;
    }

    device atomic_uint *atomicBuffer = (device atomic_uint *)mapBuffer;
    // Index of this threadgroup's min slot; its max slot follows directly.
    const uint slot = (tgpig[1] * tgpg[0] + tgpig[0]) * 2;
    atomic_uint_exchange_if_less_than(atomicBuffer + slot, val);
    atomic_uint_exchange_if_greater_than(atomicBuffer + slot + 1, val);
}
Finally, run a separate kernel to reduce over this buffer and collect the final min, max values across the entire texture:
/// Merges per-threadgroup (min, max) pairs into one overall pair.
///
/// Thread `tpig[0]` reads pair number `tpig[0]` from `mapBuffer` and merges it
/// into `reduceBuffer[0]` (minimum) and `reduceBuffer[1]` (maximum).
///
/// @param mapBuffer    Interleaved (min, max) pairs to reduce (buffer 0).
/// @param reduceBuffer Two-element result: minimum, then maximum (buffer 1).
/// @param tpig         This thread's position in the grid; only `tpig[0]` is used.
kernel void min_max_reduce(constant uint *mapBuffer [[buffer(0)]],
                           device uint *reduceBuffer [[buffer(1)]],
                           uint2 tpig [[thread_position_in_grid]])
{
    const uint pairIndex = tpig[0] * 2;
    const uint minv = mapBuffer[pairIndex];
    const uint maxv = mapBuffer[pairIndex + 1];
    device atomic_uint *atomicBuffer = (device atomic_uint *)reduceBuffer;

    // A minimum of 0 marks a pair that never received a value; merging it
    // would reset the overall minimum to the "unset" state.
    if (minv != 0) {
        atomic_uint_exchange_if_less_than(atomicBuffer, minv);
    }
    atomic_uint_exchange_if_greater_than(atomicBuffer + 1, maxv);
}
Of course, you can only reduce over the total allowed thread execution width of the device (~256), so you may need to do the reduction in multiple passes, with each one reducing the size of the data to be operated on by a factor of the maximum thread execution width.
Disclaimer: This may not be the best technique, but it does appear to be correct in my limited testing of an OS X implementation. It was marginally faster than a naive CPU implementation on a 256x256 texture on Intel Iris Pro, but substantially slower on an Nvidia GT 750M (because of dispatch overhead).
Very interesting discussion.
I am going to share my metal code that will help your understanding.
/// Converts `inTexture` to grayscale and tracks the darkest and brightest gray
/// level (scaled to 0...255) across the whole image.
///
/// Each threadgroup first reduces into threadgroup-local atomics, and one
/// thread then merges that result into the device-wide pair `min_max`. Both
/// barriers are reached by every thread in the threadgroup: threads outside the
/// texture skip the pixel work but do not return early, because a barrier that
/// only part of a threadgroup reaches is not well-defined.
///
/// NOTE(review): `min_max[0]` is assumed to be seeded by the host with a value of
/// at least 255 and `min_max[1]` with 0 — confirm.
///
/// @param inTexture  Colour source (texture 0).
/// @param outTexture Grayscale destination; also defines the valid pixel bounds (texture 1).
/// @param min_max    Device-wide result: [0] minimum, [1] maximum (buffer 0).
/// @param gid        This thread's position in the grid, used as the pixel coordinate.
/// @param tid        This thread's index within its threadgroup.
/// @param tsz        Threads per threadgroup; kept for interface compatibility, unused.
kernel void grayscale_minmax(texture2d<half, access::read> inTexture [[texture(0)]],
                             texture2d<half, access::write> outTexture [[texture(1)]],
                             device atomic_uint *min_max [[buffer(0)]],
                             uint2 gid [[thread_position_in_grid]],
                             uint tid [[thread_index_in_threadgroup]],
                             uint2 tsz [[threads_per_threadgroup]])
{
    // local_atomic[0]: min value, local_atomic[1]: max value
    threadgroup atomic_uint local_atomic[2];
    if (tid == 0) { // initialize thread group local vars
        atomic_store_explicit(&local_atomic[0], 255, memory_order_relaxed);
        atomic_store_explicit(&local_atomic[1], 0, memory_order_relaxed);
    }
    // Make the initial values visible before any thread updates them.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    const bool insideTexture = (gid.x < outTexture.get_width()) && (gid.y < outTexture.get_height());
    if (insideTexture) {
        // true color to gray scaled
        const half4 inColor = inTexture.read(gid);
        const half outColor = dot(inColor.rgb, half3(0.299, 0.587, 0.114));
        const uint intColor = uint(clamp(outColor, 0.h, 1.h)*255.h);

        // update local variables
        atomic_fetch_min_explicit(&local_atomic[0], intColor, memory_order_relaxed);
        atomic_fetch_max_explicit(&local_atomic[1], intColor, memory_order_relaxed);

        outTexture.write(half4(half3(outColor), 1.h), gid);
    }

    // Wait until every thread in the threadgroup has contributed.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // One thread publishes the threadgroup result. A threadgroup with no pixel
    // inside the texture publishes the initial 255 / 0, which cannot change a
    // minimum or maximum of values in 0...255.
    if (tid == 0) {
        const uint threadgroup_min_val = atomic_load_explicit(&local_atomic[0], memory_order_relaxed);
        const uint threadgroup_max_val = atomic_load_explicit(&local_atomic[1], memory_order_relaxed);
        atomic_fetch_min_explicit(&min_max[0], threadgroup_min_val, memory_order_relaxed);
        atomic_fetch_max_explicit(&min_max[1], threadgroup_max_val, memory_order_relaxed);
    }
}