Simulating dynamic memory allocation in OpenCL

I ran into a problem that is driving me crazy.
I need to simulate dynamic memory allocation in an OpenCL kernel. To that end, I have the following malloc function defined in a *.cl file:
// OpenCL C has no built-in 'byte' type, so a typedef is assumed:
typedef uchar byte;

__global void* malloc(size_t size, __global byte *heap, __global uint *next)
{
    uint index = atomic_add(next, (uint)size); // atomic_add operates on uint
    return heap + index;
}
In the host program, I dynamically allocate a large array of type cl_uchar for this virtual heap, as follows:
int MAX_NUM_OF_HEADERS_PROCESSED_IN_PARALLEL = 1000;
int MAX_HEAP_SIZE = 1000000;
cl_uchar *heap = new cl_uchar[MAX_HEAP_SIZE]; // there is no cl_byte type; cl_uchar matches the kernel's heap
cl_uint *next = new cl_uint;
*next = 0;
cl_uint *test_result = new cl_uint[MAX_NUM_OF_HEADERS_PROCESSED_IN_PARALLEL];
cl_mem memory[3] = { 0, 0, 0 };
cl_int error;
memory[0] = clCreateBuffer(GPU_context, CL_MEM_READ_WRITE,
                           sizeof(cl_uchar) * MAX_HEAP_SIZE, NULL, &error);
memory[1] = clCreateBuffer(GPU_context, CL_MEM_READ_WRITE,
                           sizeof(cl_uint), NULL, &error);
memory[2] = clCreateBuffer(GPU_context, CL_MEM_READ_WRITE,
                           sizeof(cl_uint) * MAX_NUM_OF_HEADERS_PROCESSED_IN_PARALLEL,
                           NULL, &error);
clEnqueueWriteBuffer(command_queue, memory[0], CL_TRUE, 0,
                     sizeof(cl_uchar) * MAX_HEAP_SIZE, heap, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, memory[1], CL_TRUE, 0,
                     sizeof(cl_uint), next, 0, NULL, NULL);
error = 0;
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &memory[0]);
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memory[1]);
error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memory[2]);
size_t globalWorkSize[1] = { (size_t)MAX_NUM_OF_HEADERS_PROCESSED_IN_PARALLEL };
size_t localWorkSize[1] = { 1 };
error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                               globalWorkSize, localWorkSize, 0, NULL, NULL);
I also have the following kernel:
__kernel void packet_routing2(__global byte* heap_, __global uint* next, __global uint* test_result)
{
    int gid = get_global_id(0);
    __global uint *xx[100];
    for (int i = 0; i < 100; i++)
    {
        xx[i] = (__global uint*) malloc(sizeof(uint), heap_, next);
        *xx[i] = i * gid;
        test_result[gid] = *(xx[0]); // was 'result', which is undeclared
    }
}
I encountered the following error when I run the program:
" %27 = load i32 addrspace(1)* %26, align 4, !tbaa !17
Illegal pointer which is not from a valid memory space.
Aborting..."
Could you please help me fix this issue? I also found out that if xx has only 10 elements instead of 100, the code works fine!

Edit: The simplest solution is to pad 'size' up to an aligned value before the malloc, so that every struct type (smaller than the maximum padding) gets the alignment it needs.
0 = struct footprint in memory
* = heap
_ = padding

***000_____*****0000____****0_______****00000___*****0000000_*******00______***

The unused padded memory space can be saved per thread and reused later.
It is important that the starting address satisfies the maximum alignment requirement. If there is a 256-byte struct, it should start at a multiple of 256.
struct size   malloc size   minimum 'next' value (address, not offset)
1-4           4             multiple of 4
5-8           8             multiple of 8
9-16          16            multiple of 16
17-32         32            32*k
33-64         64            64*k
If there is a 64-byte struct in play, even an int now needs a 64-byte malloc size. You could track those leftover values per thread and reuse the unused areas later. This way the allocator doesn't give alignment errors, and probably runs faster for the accesses that would otherwise be misaligned.
Also note that float3 natively needs 16 bytes.
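Here is a minimal sketch of that padding idea, assuming the heap's base address is itself aligned to the largest alignment any allocated type needs (MAX_ALIGN and malloc_aligned are illustrative names, not from the original code):

// Round every request up to a multiple of MAX_ALIGN so each allocation
// starts on an aligned offset (assumes the heap base is MAX_ALIGN-aligned).
#define MAX_ALIGN 16

__global void* malloc_aligned(size_t size, __global uchar *heap, __global uint *next)
{
    uint padded = ((uint)size + (MAX_ALIGN - 1)) & ~(uint)(MAX_ALIGN - 1);
    uint index = atomic_add(next, padded);
    return heap + index;
}

With MAX_ALIGN set to 16, a 4-byte uint allocation consumes 16 bytes of heap, which wastes space but keeps every returned pointer legal for types up to 16 bytes.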

Related

Number of threads increases but has no effect on runtime

I have tried to implement an alpha image blending algorithm in CUDA C. There is no error in my code; it compiles fine. Per the threading logic, if I run the code with an increased number of threads, the runtime should decrease. Instead I got a weird runtime pattern: with 1 thread the runtime was 8.060539e-01 s, with 4 threads it was 7.579031e-01 s, with 8 threads it was 7.810102e-01 s, and with 256 threads it was 7.875319e-01 s.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
int col = threadIdx.x + blockIdx.x*blockDim.x;
int row = threadIdx.y + blockIdx.y*blockDim.y;
if(col<width && row<height){
size_t img_size = width * height * channels;
if (Pout != NULL)
{
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
}
}
}
int main(int argc, char* argv[]){
int thread_count;
double start, finish;
float alpha;
int width, height, channels;
unsigned char *new_img;
thread_count = strtol(argv[1], NULL, 10);
printf("Enter the value for alpha:");
scanf("%f", &alpha);
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
unsigned char *orange = stbi_load("orange.jpg", &width, &height, &channels, 0);
size_t img_size = width * height * channels;
//unsigned char *new_img = malloc(img_size);
cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
cudaMallocManaged(&orange, img_size*sizeof(unsigned char));
GET_TIME(start);
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
cudaDeviceSynchronize();
GET_TIME(finish);
stbi_write_jpg("new_image.jpg", width, height, channels, new_img, 100);
cudaFree(new_img);
cudaFree(apple);
cudaFree(orange);
printf("\n Elapsed time for cuda = %e seconds\n", finish-start);
}
After getting this weird runtime pattern, I am a bit skeptical about the implementation of the code. Can anyone let me know why I get those runtimes even though my code has no bug?
Let's start here:
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
It seems evident you don't understand the kernel launch syntax:
<<<1,16,thread_count>>>
The first number (1) is the number of blocks to launch.
The second number (16) is the number of threads per block.
The third number (thread_count) is the size of the dynamically allocated shared memory in bytes.
So our first observation will be that although you claimed to have changed the thread count, you didn't. You were changing the number of bytes of dynamically allocated shared memory. Since your kernel code doesn't use shared memory, this is a completely meaningless variable.
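For reference, a launch that actually varied the thread count would put the value into the first two launch parameters, along these lines (illustrative only; the complete example further down does exactly this):

int threads_per_block = 256;  // threads in each block
int blocks = (thread_count + threads_per_block - 1) / threads_per_block;  // round up
image_blend<<<blocks, threads_per_block>>>(new_img, apple, orange,
                                           width, height, channels, alpha);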
Let's also observe your kernel code:
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
For every thread that passes your if test, each one of those threads will execute the entire for-loop and will process the entire image. That is not the general idea with writing CUDA kernels. The general idea is to break up the work so that each thread does a portion of the work, not the whole activity.
These are very basic observations. If you take advantage of an orderly introduction to CUDA, such as here, you can get beyond some of these basic concepts.
We could also point out that your kernel nominally expects a 2D launch, and you are not providing one, and perhaps many other observations. Another important concept that you are missing is that you cannot do this:
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
...
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
and expect anything sensible to come from that. If you want to see how data is moved from a host allocation to the device, study nearly any CUDA sample code, such as vectorAdd. Using a managed allocation doesn't allow you to overwrite the pointer like you are doing and get anything useful from that.
I'll provide an example of how one might go about doing what I think you are suggesting, without providing a complete tutorial on CUDA. To provide an example, I'm going to skip the STB image loading routines. To understand the work you are trying to do here, the actual image content does not matter.
Here's an example of an image processing kernel (1D) that will:
Process the entire image, only once
Use less time, roughly speaking, as you increase the thread count.
You haven't provided your timer routine/code, so I'll provide my own:
$ cat t2130.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
unsigned char *i_load(int w, int h, int c, int init){
unsigned char *res = new unsigned char[w*h*c];
for (int i = 0; i < w*h*c; i++) res[i] = init;
return res;
}
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
if (Pout != NULL)
{
size_t img_size = width * height * channels;
for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < img_size; i+=gridDim.x*blockDim.x) // grid-stride loop
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
}
}
int main(int argc, char* argv[]){
int threads_per_block = 64;
unsigned long long dt;
float alpha;
int width = 1920;
int height = 1080;
int channels = 3;
size_t img_size = width * height * channels;
int thread_count = img_size;
if (argc > 1) thread_count = atoi(argv[1]);
unsigned char *new_img, *m_apple, *m_orange;
printf("Enter the value for alpha:");
scanf("%f", &alpha);
unsigned char *apple = i_load(width, height, channels, 10);
unsigned char *orange = i_load(width, height, channels, 70);
//unsigned char *new_img = malloc(img_size);
cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
cudaMallocManaged(&m_apple,img_size* sizeof(unsigned char));
cudaMallocManaged(&m_orange, img_size*sizeof(unsigned char));
memcpy(m_apple, apple, img_size);
memcpy(m_orange, orange, img_size);
int blocks;
if (thread_count < threads_per_block) {threads_per_block = thread_count; blocks = 1;}
else {blocks = thread_count/threads_per_block;}
printf("running with %d blocks of %d threads\n", blocks, threads_per_block);
dt = dtime_usec(0);
image_blend<<<blocks, threads_per_block>>>(new_img,m_apple, m_orange, width, height, channels,alpha);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("CUDA Error: %s\n", cudaGetErrorString(err));
else printf("\n Elapsed time for cuda = %e seconds\n", dt/(float)USECPSEC);
cudaFree(new_img);
cudaFree(m_apple);
cudaFree(m_orange);
}
$ nvcc -o t2130 t2130.cu
$ ./t2130 1
Enter the value for alpha:0.2
running with 1 blocks of 1 threads
Elapsed time for cuda = 5.737880e-01 seconds
$ ./t2130 2
Enter the value for alpha:0.2
running with 1 blocks of 2 threads
Elapsed time for cuda = 3.230150e-01 seconds
$ ./t2130 32
Enter the value for alpha:0.2
running with 1 blocks of 32 threads
Elapsed time for cuda = 4.865200e-02 seconds
$ ./t2130 64
Enter the value for alpha:0.2
running with 1 blocks of 64 threads
Elapsed time for cuda = 2.623300e-02 seconds
$ ./t2130 128
Enter the value for alpha:0.2
running with 2 blocks of 64 threads
Elapsed time for cuda = 1.546000e-02 seconds
$ ./t2130
Enter the value for alpha:0.2
running with 97200 blocks of 64 threads
Elapsed time for cuda = 5.809000e-03 seconds
$
(CentOS 7, CUDA 11.4, V100)
The key methodology that allows the kernel to do all the work (only once) while making use of an "arbitrary" number of threads efficiently is the grid-stride loop.

CMakeLists.txt file for OpenCL on OS X

I would like to make the compilation and linking of my OpenCL projects on OS X as automatic as possible. I know how to do it for plain C++, but I am running into problems with OpenCL. This is the code that I am using as an example:
main.cpp:
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__ //Mac OSX has a different name for the header file
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MEM_SIZE (128)//suppose we have a vector with 128 elements
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
//In general Intel CPU and NV/AMD's GPU are in different platforms
//But in Mac OSX, all the OpenCL devices are in the platform "Apple"
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL; //"stream" in CUDA
cl_mem memobj = NULL;//device memory
cl_program program = NULL; //cl_prgram is a program executable created from the source or binary
cl_kernel kernel = NULL; //kernel function
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret; //accepts return values for APIs
float mem[MEM_SIZE]; //alloc memory on host(CPU) ram
//OpenCL source can be placed in the source code as text strings or read from another file.
FILE *fp;
const char fileName[] = "./kernel.cl";
size_t source_size;
char *source_str;
cl_int i;
// read the kernel file into ram
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
fclose( fp );
//initialize the mem with 1,2,3...,n
for( i = 0; i < MEM_SIZE; i++ ) {
mem[i] = i;
}
//get the device info
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//create context on the specified device
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
//create the command_queue (stream)
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//alloc mem on the device with the read/write flag
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);
//copy the memory from host to device, CL_TRUE means blocking write/read
ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);
//create a program object for a context
//load the source code specified by the text strings into the program object
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
//build (compiles and links) a program executable from the program source or binary
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//create a kernel object with specified name
kernel = clCreateKernel(program, "vecAdd", &ret);
//set the argument value for a specific argument of a kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
//define the global size and local size (grid size and block size in CUDA)
size_t global_work_size[3] = {MEM_SIZE, 0, 0};
size_t local_work_size[3] = {MEM_SIZE, 0, 0};
//Enqueue a command to execute a kernel on a device ("1" indicates 1-dim work)
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
//copy memory from device to host
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);
//print out the result
for(i=0; i<MEM_SIZE; i++) {
printf("mem[%d] : %.2f\n", i, mem[i]);
}
//clFlush only guarantees that all queued commands to command_queue get issued to the appropriate device
//There is no guarantee that they will be complete after clFlush returns
ret = clFlush(command_queue);
//clFinish blocks until all previously queued OpenCL commands in command_queue are issued to the associated device and have completed.
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobj);//free memory on device
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str);//free memory on host
return 0;
}
kernel.cl:
__kernel void vecAdd(__global float* a)
{
int gid = get_global_id(0);// in CUDA = blockIdx.x * blockDim.x + threadIdx.x
a[gid] += a[gid];
}
and this is my CMakeLists.txt so far:
#Minimal OpenCL CMakeLists.txt by StreamHPC
cmake_minimum_required (VERSION 3.1)
project(GreatProject)
# Handle OpenCL
find_package(OpenCL REQUIRED)
include_directories(${OpenCL_INCLUDE_DIRS})
link_directories(${OpenCL_LIBRARY})
add_executable (main main.cpp)
target_include_directories (main PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries (main ${OpenCL_LIBRARY})
Apparently it compiles, but when I run the executable I get the error:
Failed to load kernel.
I successfully compiled the code by hand following this answer, but my project will have several kernels and several C++ files and headers, so I would like to use CMake to automate the build.
How should I modify my CMakeLists.txt script?
NOTE:
I guess that the file kernel.cl is not being compiled. I don't know a proper way to guarantee a CMakeLists.txt that always handles all the *.cl files in the project directory in addition to all the *.cpp files. It would be even better if it were also possible to link against MKL.
On Mac, OpenCL is provided as a framework; you need to do the following to link against it:
cmake_minimum_required (VERSION 2.6)
project (montecarlo_cl)
find_package(OpenCL REQUIRED)
include_directories(${OpenCL_INCLUDE_DIRS})
set (montecarlo_cl_src montecarlo_ocl.c)
add_executable (montecarlo_cl ${montecarlo_cl_src})
target_link_libraries(montecarlo_cl "-framework OpenCL" )
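Note that "Failed to load kernel." comes from the fopen of ./kernel.cl at runtime, not from compilation: the .cl source is compiled by the OpenCL driver when the program runs, so CMake only needs to make sure the file sits next to the binary (or in the working directory). One way to do that (a sketch, not part of the original answer) is to copy every *.cl file into the build directory at configure time:

# Copy all OpenCL kernel sources next to the built executable
file(GLOB CL_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/*.cl")
foreach(cl_kernel ${CL_KERNELS})
  configure_file(${cl_kernel} ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
endforeach()

Then run the executable from the build directory so the relative path ./kernel.cl resolves.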

OpenCL: Access proper index by using get_global_id()

Hi,
I am coding in OpenCL.
I am converting a C function that processes a 2D array with loops starting from i=1 and j=1. Please find the code below.
cv::Mat input; //Input :having some data in it ..
//Image input size is :input.rows=288 ,input.cols =640
cv::Mat output(input.rows-2,input.cols-2,CV_32F); //Output buffer
//Image output size is :output.rows=286 ,output.cols =638
This is the code which I want to convert to OpenCL:
for(int i=1;i<output.rows-1;i++)
{
for(int j=1;j<output.cols-1;j++)
{
float xVal = input.at<uchar>(i-1,j-1)-input.at<uchar>(i-1,j+1)+ 2*(input.at<uchar>(i,j-1)-input.at<uchar>(i,j+1))+input.at<uchar>(i+1,j-1) - input.at<uchar>(i+1,j+1);
float yVal = input.at<uchar>(i-1,j-1) - input.at<uchar>(i+1,j-1)+ 2*(input.at<uchar>(i-1,j) - input.at<uchar>(i+1,j))+input.at<uchar>(i-1,j+1)-input.at<uchar>(i+1,j+1);
output.at<float>(i-1,j-1) = xVal*xVal+yVal*yVal;
}
}
...
Host code :
//Input Image size is :input.rows=288 ,input.cols =640
//Output Image size is :output.rows=286 ,output.cols =638
OclStr->global_work_size[0] =(input.cols);
OclStr->global_work_size[1] =(input.rows);
size_t outBufSize = (output.rows) * (output.cols) * 4;//4 as I am copying all 4 uchar values into one float variable space
cl_mem cl_input_buffer = clCreateBuffer(
OclStr->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR ,
(input.rows) * (input.cols),
static_cast<void *>(input.data), &OclStr->returnstatus);
cl_mem cl_output_buffer = clCreateBuffer(
OclStr->context, CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR ,
(output.rows) * (output.cols) * sizeof(float),
static_cast<void *>(output.data), &OclStr->returnstatus);
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 0, sizeof(cl_mem), (void *)&cl_input_buffer);
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 1, sizeof(cl_mem), (void *)&cl_output_buffer);
OclStr->returnstatus = clEnqueueNDRangeKernel(
OclStr->command_queue,
OclStr->objkernel,
2,
NULL,
OclStr->global_work_size,
NULL,
0,
NULL,
NULL
);
clEnqueueMapBuffer(OclStr->command_queue, cl_output_buffer, true, CL_MAP_READ, 0, outBufSize, 0, NULL, NULL, &OclStr->returnstatus);
kernel Code :
__kernel void Sobel_uchar (__global uchar *pSrc, __global float *pDstImage)
{
const uint cols = get_global_id(0)+1;
const uint rows = get_global_id(1)+1;
const uint width= get_global_size(0);
uchar Opsoble[8];
Opsoble[0] = pSrc[(cols-1)+((rows-1)*width)];
Opsoble[1] = pSrc[(cols+1)+((rows-1)*width)];
Opsoble[2] = pSrc[(cols-1)+((rows+0)*width)];
Opsoble[3] = pSrc[(cols+1)+((rows+0)*width)];
Opsoble[4] = pSrc[(cols-1)+((rows+1)*width)];
Opsoble[5] = pSrc[(cols+1)+((rows+1)*width)];
Opsoble[6] = pSrc[(cols+0)+((rows-1)*width)];
Opsoble[7] = pSrc[(cols+0)+((rows+1)*width)];
float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5];
float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5];
pDstImage[(cols-1)+(rows-1)*width] = gx*gx + gy*gy;
}
Here I am not able to get the expected output.
I have some questions:
My for loop starts from i=1 instead of zero; how can I get the proper index by using get_global_id() in the x and y directions?
What is going wrong in my above kernel code :(
I suspect there is a problem in the buffer stride, but I am not able to break it down further, as I have already broken my head on it for a whole day :(
I have observed that with the logic below, the output skips one or two frames after a sequence of some 7/8 frames.
I have added a screenshot of my output, compared with the reference output.
My above logic is doing a partial Sobel on my input. I changed the width as:
const uint width = get_global_size(0)+1;
Your suggestions are most welcome!
It looks like you may be fetching values in (y,x) format in your OpenCL version. Also, you need to add 1 to the global id to replicate your for loops starting from 1 rather than 0.
I don't know why there is an unused iOffset variable. Maybe your bug is related to this? I removed it in my version.
Does this kernel work better for you?
__kernel void simple(__global uchar *pSrc, __global float *pDstImage)
{
const uint i = get_global_id(0) +1;
const uint j = get_global_id(1) +1;
const uint width = get_global_size(0) +2;
uchar Opsoble[8];
Opsoble[0] = pSrc[(i-1) + (j - 1)*width];
Opsoble[1] = pSrc[(i-1) + (j + 1)*width];
Opsoble[2] = pSrc[i + (j-1)*width];
Opsoble[3] = pSrc[i + (j+1)*width];
Opsoble[4] = pSrc[(i+1) + (j - 1)*width];
Opsoble[5] = pSrc[(i+1) + (j + 1)*width];
Opsoble[6] = pSrc[(i-1) + (j)*width];
Opsoble[7] = pSrc[(i+1) + (j)*width];
float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5];
float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5];
pDstImage[(i-1) + (j-1)*(width-2)] = gx*gx + gy*gy; // the output row stride is the output width (input width minus 2)
}
I am a bit apprehensive about posting an answer suggesting optimizations to your kernel, seeing as the original output has not been reproduced exactly as of yet. There is a major improvement available to be made for problems related to image processing/filtering.
Using local memory will help you out by reducing the number of global reads by a factor of eight, as well as grouping the global writes together for potential gains with the single write-per-pixel output.
The kernel below reads a block of up to 34x34 from pSrc, and outputs a 32x32(max) area of the pDstImage. I hope the comments in the code are enough to guide you in using the kernel. I have not been able to give this a complete test, so there could be changes required. Any comments are appreciated as well.
__kernel void sobel_uchar_wlocal (__global uchar *pSrc, __global float *pDstImage, uint2 dimDstImage)
{
//call this kernel with a 1-dimensional work group size: 32x1
//each work group calculates a 32x32 region of the output with 32 work items
const uint wid = get_local_id(0);
const uint wid_1 = wid+1; // corrected for the calculation step
const uint2 gid = (uint2)(get_group_id(0),get_group_id(1));
const uint localDim = get_local_size(0);
const uint2 globalTopLeft = (uint2)(localDim * gid.x, localDim * gid.y); //position in pDstImage this group covers
const uint srcWidth = dimDstImage.x + 2; //the input is two pixels wider than the output
//dimLocalBuff is clipped at the right and bottom edges of the image, where the work group may run over the border
uint2 dimLocalBuff = (uint2)(localDim,localDim);
if(dimDstImage.x - globalTopLeft.x < dimLocalBuff.x){
dimLocalBuff.x = dimDstImage.x - globalTopLeft.x;
}
if(dimDstImage.y - globalTopLeft.y < dimLocalBuff.y){
dimLocalBuff.y = dimDstImage.y - globalTopLeft.y;
}
int i,j;
//save the region of data (including a one-pixel halo) into local memory
__local uchar srcBuff[34][34]; //34^2 uchar = 1156 bytes
for(j=-1;j<(int)dimLocalBuff.y+1;j++){
for(i=(int)wid-1;i<(int)dimLocalBuff.x+1;i+=localDim){
srcBuff[i+1][j+1] = pSrc[(globalTopLeft.y+j+1)*srcWidth + (globalTopLeft.x+i+1)];
}
}
barrier(CLK_LOCAL_MEM_FENCE); //all work items must finish loading before any of them reads
//compute output and store locally
__local float dstBuff[32][32]; //32^2 float = 4096 bytes
if(wid < dimLocalBuff.x){
for(i=0;i<(int)dimLocalBuff.y;i++){
float gx = srcBuff[wid_1-1][i]-srcBuff[wid_1+1][i]+2*(srcBuff[wid_1-1][i+1]-srcBuff[wid_1+1][i+1])+srcBuff[wid_1-1][i+2]-srcBuff[wid_1+1][i+2];
float gy = srcBuff[wid_1-1][i]-srcBuff[wid_1-1][i+2]+2*(srcBuff[wid_1][i]-srcBuff[wid_1][i+2])+srcBuff[wid_1+1][i]-srcBuff[wid_1+1][i+2];
dstBuff[wid][i] = gx*gx + gy*gy;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//copy the results out to pDstImage
for(j=0;j<(int)dimLocalBuff.y;j++){
for(i=wid;i<(int)dimLocalBuff.x;i+=localDim){
pDstImage[(globalTopLeft.y+j)*dimDstImage.x + (globalTopLeft.x+i)] = dstBuff[i][j];
}
}
}
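For completeness, here is a hedged sketch of how one might launch this work-group kernel from the host (the 638x286 output dimensions come from the question; the variable names are illustrative):

// One 32x1 work group per 32x32 output tile
cl_uint2 dimDst = {{ 638, 286 }};
size_t local[2]  = { 32, 1 };
size_t global[2] = { ((638 + 31) / 32) * 32,  // groups across, 32 work items each
                     (286 + 31) / 32 };       // groups down, 1 work item tall
clSetKernelArg(kernel, 2, sizeof(cl_uint2), &dimDst);
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);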

OpenCL matrix multiplication

I'm a beginner in OpenCL, and I've been trying to write matrix multiplication code.
It runs fine, except that it gives garbage values as the output for the C array. I'm unable to fix the error.
Any help will be much appreciated.
Here is the host and kernel code.
#include <CL/cl.h>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
using namespace std;
#define SUCCESS 0
#define FAILURE 1
// Function to convert file name into a string
int convertToString(const char *filename, std::string &s)
{
size_t size;
char *str;
std::fstream f(filename, (std::fstream::in | std::fstream::binary));
if (f.is_open())
{
size_t fileSize;
f.seekg(0, std::fstream::end);
size = fileSize = (size_t)f.tellg();
f.seekg(0, std::fstream::beg);
str = new char[size + 1];
if (!str)
{
f.close();
return 0;
}
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete[] str;
return 0;
}
cout << "Error: failed to open file\n:" << filename << endl;
return FAILURE;
}
int main()
{
cl_uint status;
cl_int *error;
int A[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
int B[9] = {2, 2, 2, 2, 2, 2, 2, 2, 2};
int C[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
// Setting up platforms
cl_platform_id platform = NULL;
cl_uint numPlatforms = 0;
// Getting no of platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
cout << "\nUnable to query platforms";
return 0;
}
// Get the platform
if (numPlatforms > 0)
{
cl_platform_id *platforms =
(cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
}
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status =
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, devices, &numDevices);
if (numDevices == 0)
{
cout << "No GPU device available! Choosing CPU.\n";
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, devices,
&numDevices);
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices,
devices, NULL);
}
else
{
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
devices, NULL);
if (status == 0)
{
cout << "Device error!";
return 0;
}
}
// Creating contexts
cl_context context =
clCreateContext(NULL, 1, devices, NULL, NULL, (cl_int *)status);
if (status != CL_SUCCESS)
{
cout << status;
}
// Creating command queues
cl_command_queue command =
clCreateCommandQueue(context, devices[0], 0, NULL);
// if(error!=CL_SUCCESS)
//{
// cout<<error;
//}
// Creating buffers
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferA, CL_TRUE, 0, 9 * sizeof(int),
(void *)A, 0, NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferB, CL_TRUE, 0, 9 * sizeof(int),
(void *)B, 0, NULL, NULL);
// status=clEnqueueReadBuffer(command,bufferA,CL_TRUE,0,9*sizeof(int),(void*)C,0,NULL,NULL);
const char *filename = "kernel.cl";
string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = {strlen(source)};
cl_program program =
clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, numDevices, 0, NULL, NULL, NULL);
cl_kernel myKernel = clCreateKernel(program, "multiply", NULL);
// Setting kernel arguments
clSetKernelArg(myKernel, 0, sizeof(cl_mem), &bufferC);
clSetKernelArg(myKernel, 1, sizeof(cl_mem), &bufferA);
clSetKernelArg(myKernel, 2, sizeof(cl_mem), &bufferB);
size_t localws[2] = {9, 9};
size_t globalws[2] = {3, 3};
status = clEnqueueNDRangeKernel(command, myKernel, 2, NULL, globalws,
localws, 0, NULL, NULL);
status = clEnqueueReadBuffer(command, bufferC, CL_TRUE, 0, 9 * sizeof(int),
(void *)C, 0, NULL, NULL);
for (int i = 0; i < 9; i++) cout << C[i] << " ";
status = clReleaseKernel(myKernel); // Release kernel.
status = clReleaseProgram(program); // Release program object.
status = clReleaseMemObject(bufferA); // Release mem object.
status = clReleaseMemObject(bufferB);
status = clReleaseMemObject(bufferC);
status = clReleaseCommandQueue(command); // Release Command queue.
status = clReleaseContext(context); // Release context.
}
Kernel code:
__kernel void multiply(_global int outputC, _global int inputA,
_global int inputB)
{
int row = get_global_id(0);
int col = get_global_id(1);
int sum = 0;
for (int i = 0; i < 3; i++)
sum += inputA[row * 3 + 1] * inputB[i * 3 + col];
outputC[row + 3 + col] = sum;
}
As already pointed out by @Marco13, the kernel suffers from quite a few issues.
When running this kernel through a tool like clcc you can see that there are a number of compilation errors to begin with:
> clcc matmul.cl
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 2: error: identifier "_global" is undefined
_global int inputB)
^
"/tmp/OCLu7FyFF.cl", line 2: error: invalid combination of type specifiers
_global int inputB)
^
6 errors detected in the compilation of "/tmp/OCLu7FyFF.cl".
A tool like clcc is very useful for catching errors early on. Most vendors also have their own version of a standalone kernel compiler/checker: e.g. Intel has its Kernel Builder, AMD's CodeXL contains a static kernel analyzer. Another option is to retrieve kernel compilation errors right from your host code, by calling clGetProgramBuildInfo to retrieve the compiler output, after clBuildProgram returned CL_BUILD_PROGRAM_FAILURE.
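That last option might look like this (a minimal sketch; 'device' stands in for your cl_device_id):

// After clBuildProgram fails, fetch and print the compiler output
if (clBuildProgram(program, 1, &device, NULL, NULL, NULL) != CL_SUCCESS) {
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char *log = (char *)malloc(logSize);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    fprintf(stderr, "Build log:\n%s\n", log);
    free(log);
}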
Once these compilation errors are fixed, it looks like your kernel is still not doing what you expect: as noted, the inputs and outputs should be pointers, since you will be passing buffers to the kernel. Also, the indexing of your input and output arrays is incorrect: in the for-loop, inputA[row * 3 + 1] should be inputA[row * 3 + i] (i instead of 1), and when saving the result, outputC[row + 3 + col] should be outputC[row * 3 + col] (row * 3 instead of row + 3).
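Putting those fixes together, the kernel would look roughly like this (an untested sketch of the corrections described above):

__kernel void multiply(__global int *outputC, __global int *inputA,
                       __global int *inputB)
{
    int row = get_global_id(0);
    int col = get_global_id(1);
    int sum = 0;
    // Dot product of row 'row' of A with column 'col' of B (3x3 matrices)
    for (int i = 0; i < 3; i++)
        sum += inputA[row * 3 + i] * inputB[i * 3 + col];
    outputC[row * 3 + col] = sum;
}

The host code would also need a valid work-group configuration: the local size must divide the global size and cannot exceed it, so localws = {9, 9} with globalws = {3, 3} will fail; passing NULL for the local size is the simplest fix.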
I haven't looked in detail at the host code, but I would at least make sure, especially when just starting out with OpenCL, to always check every return code and error. This will save you a lot of time and frustration.
Finally, if you want a quick jump-start to learning OpenCL with a hands-on approach, I would strongly recommend going through the open source Hands-on OpenCL training by Simon McIntosh-Smith and Tom Deakin. It doesn't take very long, is quite pragmatic and provides lots of useful insights. Optimizing matrix multiplication is one of the use cases that is shown step-by-step.

How to compile vImage emboss effect sample code?

Here is the code found in the documentation:
int myEmboss(void *inData,
unsigned int inRowBytes,
void *outData,
unsigned int outRowBytes,
unsigned int height,
unsigned int width,
void *kernel,
unsigned int kernel_height,
unsigned int kernel_width,
int divisor ,
vImage_Flags flags ) {
uint_8 kernel = {-2, -2, 0, -2, 6, 0, 0, 0, 0}; // 1
vImage_Buffer src = { inData, height, width, inRowBytes }; // 2
vImage_Buffer dest = { outData, height, width, outRowBytes }; // 3
unsigned char bgColor[4] = { 0, 0, 0, 0 }; // 4
vImage_Error err; // 5
err = vImageConvolve_ARGB8888( &src, //const vImage_Buffer *src
&dest, //const vImage_Buffer *dest,
NULL,
0, //unsigned int srcOffsetToROI_X,
0, //unsigned int srcOffsetToROI_Y,
kernel, //const signed int *kernel,
kernel_height, //unsigned int
kernel_width, //unsigned int
divisor, //int
bgColor,
flags | kvImageBackgroundColorFill
//vImage_Flags flags
);
return err;
}
Here is the problem: the kernel variable seems to refer to three different types:
void * kernel in the formal parameter list
a new local variable of the undefined type uint_8, which would presumably shadow the formal parameter
a const signed int *kernel when calling vImageConvolve_ARGB8888.
Is this actual code ? How may I compile this function ?
You are correct that that function is pretty messed up. I recommend using the Provide Feedback widget to let Apple know.
I think you should remove the kernel, kernel_width, and kernel_height parameters from the function signature. Those seem to be holdovers from a function that applies a caller-supplied kernel, but this example is about applying an internally-defined kernel.
Fix the declaration of the kernel local variable to make it an array of int16_t, which is the element type vImageConvolve_ARGB8888 actually expects (uint8_t could not hold the negative entries), like so:
int16_t kernel[] = {-2, -2, 0, -2, 6, 0, 0, 0, 0}; // 1
Then, at the call to vImageConvolve_ARGB8888(), replace kernel_width and kernel_height by 3. Since the kernel is hard-coded, the dimensions can be as well.
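Putting those changes together, the function might look like this (a sketch of the suggested fixes, not Apple's corrected sample):

int myEmboss(void *inData,
             unsigned int inRowBytes,
             void *outData,
             unsigned int outRowBytes,
             unsigned int height,
             unsigned int width,
             int divisor,
             vImage_Flags flags) {
    // Hard-coded 3x3 emboss kernel; int16_t is what vImageConvolve_ARGB8888 expects
    int16_t kernel[9] = { -2, -2, 0, -2, 6, 0, 0, 0, 0 };
    vImage_Buffer src  = { inData,  height, width, inRowBytes };
    vImage_Buffer dest = { outData, height, width, outRowBytes };
    unsigned char bgColor[4] = { 0, 0, 0, 0 };
    return (int)vImageConvolve_ARGB8888(&src, &dest, NULL, 0, 0,
                                        kernel, 3, 3, // kernel_height, kernel_width
                                        divisor, bgColor,
                                        flags | kvImageBackgroundColorFill);
}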
The kernel is just the kernel used in the convolution. In mathematical terms, it is the matrix that is convolved with your image, to achieve blur/sharpen/emboss or other effects. This function you provided is just a thin wrapper around the vimage convolution function. To actually perform the convolution you can follow the code below. The code is all hand typed so not necessarily 100% correct but should point you in the right direction.
To use this function, you first need to have pixel access to your image. Assuming you have a UIImage, you do this:
//image is a UIImage
CGImageRef img = image.CGImage;
CGDataProviderRef dataProvider = CGImageGetDataProvider(img);
CFDataRef cfData = CGDataProviderCopyData(dataProvider);
void * dataPtr = (void*)CFDataGetBytePtr(cfData);
Next, you construct the vImage_Buffer that you will pass to the function
vImage_Buffer inBuffer, outBuffer;
inBuffer.data = dataPtr;
inBuffer.width = CGImageGetWidth(img);
inBuffer.height = CGImageGetHeight(img);
inBuffer.rowBytes = CGImageGetBytesPerRow(img);
Allocate the outBuffer as well
outBuffer.data = malloc(inBuffer.height * inBuffer.rowBytes);
// Setup width, height, rowbytes equal to inBuffer here
Now we create the Kernel, the same one in your example, which is a 3x3 matrix
Multiply the values by a divisor if they are float (they need to be int)
int divisor = 1000;
CGSize kernelSize = CGSizeMake(3, 3);
int16_t *kernel = (int16_t*)malloc(sizeof(int16_t) * 3 * 3);
// Assign the emboss kernel values, scaled up by the divisor:
// {-2, -2, 0, -2, 6, 0, 0, 0, 0} * 1000
int16_t values[9] = {-2, -2, 0, -2, 6, 0, 0, 0, 0};
for (int i = 0; i < 9; i++) kernel[i] = values[i] * divisor;
Now perform the convolution on the image!
//Use a background of transparent black as temp
Pixel_8888 temp = {0, 0, 0, 0};
vImageConvolve_ARGB8888(&inBuffer, &outBuffer, NULL, 0, 0, kernel, kernelSize.width, kernelSize.height, divisor, temp, kvImageBackgroundColorFill);
Now construct a new UIImage out of outBuffer and your done!
Remember to free the kernel and the outBuffer data.
This is the way I am using it to process frames read from a video with AVAssetReader. This is a blur, but you can change the kernel to suit your needs. 'imageData' can of course be obtained by other means, e.g. from an UIImage.
CMSampleBufferRef sampleBuffer = [asset_reader_output copyNextSampleBuffer];
CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
CVPixelBufferLockBaseAddress(imageBuffer,0);
void *imageData = CVPixelBufferGetBaseAddress(imageBuffer);
int16_t kernel[9];
for(int i = 0; i < 9; i++) {
kernel[i] = 1;
}
kernel[4] = 2;
unsigned char *newData= (unsigned char*)malloc(4*currSize);
vImage_Buffer inBuff = { imageData, height, width, 4*width };
vImage_Buffer outBuff = { newData, height, width, 4*width };
vImage_Error err=vImageConvolve_ARGB8888 (&inBuff,&outBuff,NULL, 0,0,kernel,3,3,10,nil,kvImageEdgeExtend);
if (err != kvImageNoError) NSLog(@"convolve error %ld", err);
CVPixelBufferUnlockBaseAddress(imageBuffer, 0);
//newData holds the processed image
