Copy_to_user problems - device-driver

I wrote module for a device, and I'm having problems in my read function:
ssize_t my_sys_read(struct file *f, char __user *buffer, size_t s, loff_t *off){
char * myBuffer = "ossec buffer";
size_t read_bytes;
if (s > ( sizeof(char) * 13 ) ) s = ( sizeof(char) * 13 );
if (!access_ok(%VERIFY_WRITE, (void *) buffer, s)) return -EFAULT;
read_bytes = copy_to_user((void *) buffer, (void *) myBuffer, s);
printk(KERN_INFO "myBuffer %s", myBuffer);
printk(KERN_INFO "buffer %s", buffer);
read_bytes = s - ( sizeof(char) * 13 );
return read_bytes;
}
I really don't know why but the copy it's not working, and the printed buffer has no sense for me.
[10038.885838] buffer \xffffff81\xffffffc3\xffffffcb\x1a
I guess the problem is in the copy because the program which use the device is simple.
int main(void)
{
int fd = open(device_name, O_RDONLY);
if(fd < 0)
{
printf("Error: Impossible to open device, action not permited.\n");
return 0;
}
char * buff;
int read_bytes;
read_bytes = read(fd, buff, (13 * sizeof(char) ) );
printf(" %s\n", buff);
}
Thanks!

char * buff;
int read_bytes;
read_bytes = read(fd, buff, (13 * sizeof(char) ) );
You are using the pointer buff without allocating memory for it, it is then a dangling pointer with a random value. You need to do
char *buff = new char(); or
char buff;
int read_bytes;
read_bytes = read(fd, &buff, (13 * sizeof(char)) );

Related

Is it possible to MMAP a PCI BAR memory?

I want to user access memory from a PCIe board which provides a 1GB memory with BAR0.
Currently I use only read and write functionality of my character device driver, which is VERY slow (1MB/s read and 16MB/s write) on a 8x PCIe Gen3.
static ssize_t
MPD_read(
struct file *filp,
char *buffer,
size_t bufferSize,
loff_t *offset )
{
unsigned long unusedBytes = copy_to_user(
( void * ) buffer,
MPD_AdapterBoard.bars[ 0 ].barHWAddress,
bufferSize );
return 0;
}
static ssize_t
MPD_write(
struct file *filp,
const char *buffer,
size_t bufferSize,
loff_t *offset )
{
unsigned long unusedBytes = copy_from_user(
MPD_AdapterBoard.bars[ 0 ].barHWAddress,
( void * ) buffer,
bufferSize );
return 0;
}
Is it possible to use the MMAP (with the .mmap file operation) to get more speed ?
Or is DMA the only option ?
Thanks in advance!
/Jesko
I found out how it's working:
static int
MPD_mmap(
struct file *filp,
struct vm_area_struct *vma )
{
unsigned long offset;
offset = vma->vm_pgoff << PAGE_SHIFT;
if (( offset + ( vma->vm_end - vma->vm_start )) > MPD_AdapterBoard.bars[ 0 ].barSizeInBytes )
{
return -EINVAL;
}
offset += ( unsigned long ) MPD_AdapterBoard.bars[ 0 ].mmioStart;
vma->vm_page_prot = pgprot_noncached( vma->vm_page_prot );
if ( io_remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot ))
{
return -EAGAIN;
}
return 0;
}
Attention: This is work in progress, so error checking is fairly limited.
In the hope to help someone here, the complete code can be downloaded including a test program from here: https://github.com/jesko42/minipci

BIGNUM strange behavior in a calculation loop

I'm trying to implement a basic routine to perform some calculation on BIGNUM(s) and I've found a strange behavior. The functions are as follows
unsigned char *char_array_as_hex(unsigned char *chr_a, int len)
{
unsigned char *chr_s = (unsigned char *)malloc(len * 2);
char buffer[5];
for (int i = 0; i < len; i++)
{
sprintf(buffer, "%02X", chr_a[i]);
chr_s[(2 * i) + 0] = buffer[0];
chr_s[(2 * i) + 1] = buffer[1];
}
return chr_s;
}
and
char *big_number_as_decimal_from_hex_array(unsigned char *chr_a, int len, BN_CTX *bn_ctx)
{
unsigned char *hex_s = char_array_as_hex(chr_a, len);
BIGNUM *big_number = BN_CTX_get(bn_ctx);
BN_hex2bn(&big_number, (char *)hex_s);
char *big_number_as_decimal = BN_bn2dec(big_number);
free(hex_s);
BN_free(big_number);
return big_number_as_decimal;
}
and
void test_compute_prime256v1()
{
BN_CTX *bn_ctx = BN_CTX_new();
BN_CTX_start(bn_ctx);
unsigned char seed_a[20] = {
0xC4,0x9D,0x36,0x08,0x86,0xE7,0x04,0x93,0x6A,0x66, /* seed */
0x78,0xE1,0x13,0x9D,0x26,0xB7,0x81,0x9F,0x7E,0x90
};
printf("s = %s\n", big_number_as_decimal_from_hex_array(seed_a, 20, bn_ctx));
unsigned char p_a[32] = {
0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x01,0x00,0x00, /* p */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF
};
printf("p = %s\n", big_number_as_decimal_from_hex_array(p_a, 32, bn_ctx));
BN_CTX_end(bn_ctx);
BN_CTX_free(bn_ctx);
}
then I call "test_compute_prime256v1" in an Objective-C method. If I call it once or multiple times with a reasonable delay between each call it produces correct result however, when I call that function in a loop it produces different incorrect values
- (IBAction)btnOK_Clicked:(id)sender
{
for (int i = 1; i < 10; i++)
{
printf("i = %d\n", i);
test_compute_prime256v1();
}
}
and a sample output was
i = 1
s = 1122468115042657169822351801880191947498376363664
p = 115792089210356248762697446949407573530086143415290314195533631308867097853951
i = 2
s = 1122468115042657169822351801880191947498376363664
p = 966134380529368896499052403318808180610643774633026536153469502543482958881555881553276...
i = 3
s = 1122468115042657169822351801880191947498376363664
p = 115792089210356248762697446949407573530086143415290314195533631308867097853951
Note: some numbers are trimmed to fit in. I have followed the suggestion in here.
Am I missing something? Is there any mistake somewhere?
Anyone can help?
Thanks
EDITED:
I made some modification to code but the issue still exists. I changed big_number_as_decimal_from_hex_array as follows
char *big_number_as_decimal_from_hex_array_ex(unsigned char *chr_a, int len)
{
BN_CTX *bn_ctx = BN_CTX_new();
BN_CTX_start(bn_ctx);
unsigned char *hex_s = char_array_as_hex(chr_a, len);
BIGNUM *big_number = BN_CTX_get(bn_ctx);
BN_hex2bn(&big_number, (char *)hex_s);
char *big_number_as_decimal = BN_bn2dec(big_number);
free(hex_s);
BN_free(big_number);
BN_CTX_end(bn_ctx);
BN_CTX_free(bn_ctx);
return big_number_as_decimal;
}
and also
char *big_number_as_decimal_from_hex_array_ex_2(unsigned char *chr_a, int len)
{
BN_CTX *bn_ctx = BN_CTX_new();
unsigned char *hex_s = char_array_as_hex(chr_a, len);
BIGNUM *big_number = BN_CTX_get(bn_ctx);
BN_hex2bn(&big_number, (char *)hex_s);
char *big_number_as_decimal = BN_bn2dec(big_number);
free(hex_s);
BN_free(big_number);
BN_CTX_free(bn_ctx);
return big_number_as_decimal;
}
I modified the test_compute_prime256v1 as
void test_compute_prime256v1_ex()
{
unsigned char seed_a[20] = {...};
printf("s = %s\n", big_number_as_decimal_from_hex_array_ex(seed_a, 20));
unsigned char p_a[32] = {...};
printf("p = %s\n", big_number_as_decimal_from_hex_array_ex(p_a, 32));
// or
unsigned char seed_a[20] = {...};
printf("s = %s\n", big_number_as_decimal_from_hex_array_ex_2(seed_a, 20));
unsigned char p_a[32] = {...};
printf("p = %s\n", big_number_as_decimal_from_hex_array_ex_2(p_a, 32));
}
but the code produces the same incorrect result in a looped calculation
BN_hex2bn(&big_number, (char *)hex_s); expects a C string as second argument, ie a '\0' terminated one since it has no other way to know the size of your string.

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and 2D grid to make the problem natural. Here's how I index: (height and width are image dimensions)
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want to return four boundaries (i'll cater them later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine on all standard images sizes. But for some random image sizes it shows diagonal line(s). For example in my case 500x333 image (even when no dimension is multiple of 16) is showing correct output whereas 450x365 is showing diagonal lines in the output. The problem remains even if I just return the extra threads of grid and nothing else like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same, some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES Thanks!
Update:
Kernel Code (Simplified to return input image, but gives the same problem)
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
/*Filter code removed, still gives the same problem*/
out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
/*Kernel Code*/
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say it failing (450 x 364 with filterSize=5):
#include <stdio.h>
#include <assert.h>
template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
out[tid] = in[tid];
}
int main(void)
{
const int width = 450, height = 365, filterSize=5;
const size_t isize = sizeof(unsigned char) * size_t(width * height);
unsigned char * _in, * _out, * out;
assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
assert( cudaMemset(_out, 'A', isize) == cudaSuccess );
const dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
assert( cudaPeekAtLastError() == cudaSuccess );
out = (unsigned char *)malloc(isize);
assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<width; i++) {
fprintf(stdout, "%d: ", i);
for(int j=0; j<height; j++) {
unsigned int idx = i + j*width;
fprintf(stdout, "%c", out[idx]);
}
fprintf(stdout, "\n");
}
return cudaThreadExit();
}
When run it does exactly what I would expect, overwriting the output memory with the input everywhere except for the first and last two lines and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute 1.2 GPU. So whatever is happening in you code, it isn't happening in my repro case, which either means I have misinterpreted what you have written, or there is something else you haven't described which is causing the problem.

Why do operations with an array corrupt the values?

I'm trying to implement the Particle Swarm Optimization on CUDA. I'm partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.
The problem is, when I'm trying to modify array element like so
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can't spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;
typedef struct {
unsigned k_max;
float c1;
float c2;
unsigned p;
float inertia_factor;
float Ef;
float X_low[f_n];
float X_high[f_n];
float X_min[n][f_n];
} params_t;
typedef void (*kernelWrapperType) (
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
);
typedef float (*twoArgsFuncType) (
float x1,
float x2
);
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
__device__ float kernelF1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
void f1(
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
) {
float *X_d = NULL;
float *Y_d = NULL;
unsigned length = n * p;
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
size_t pitch;
size_t dpitch;
cudaError_t err;
unsigned width = n;
unsigned height = p;
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
pitch = n * sizeof(float);
err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
err = cudaMalloc (&Y_d, sizeof(float) * p);
err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
err = cudaFree(X_d);
err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
err = cudaFree(Y_d);
}
float F1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
/*
* Generates random float in [0.0; 1.0]
*/
float frand(){
return (float)rand()/(float)RAND_MAX;
}
/*
* This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
* brings the result vector back to host and dumps it on the console.
*/
int main() {
const params_t params = {
100,
0.5,
0.5,
p,
0.98,
0.01,
{-5.12, -2.048, -5.12},
{5.12, 2.048, 5.12},
{{0, 1, 0}, {0, 1, 0}}
};
float X[p][n];
float X_highVec[n];
float V[p][n];
float X_best[p][n];
float Y[p] = {0};
float Y_best[p] = {0};
float X_swarmBest[n];
kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
for (unsigned f = 0; f < f_n; f++) {
printf("Optimizing function #%u\n", f);
srand ( time(NULL) );
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
X[i][j] = X_best[i][j] = frand();
for (int i = 0; i < n; i++)
X_highVec[i] = params.X_high[f];
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
V[i][j] = frand();
for (unsigned i = 0; i < p; i++)
Y_best[i] = F[f](X[i][0], X[i][1]);
for (unsigned i = 0; i < n; i++)
X_swarmBest[i] = params.X_high[f];
float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
bool termination = false;
float inertia = 1.;
for (unsigned k = 0; k < params.k_max; k++) {
F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
}
for (unsigned i = 0; i < p; i++)
{
for (unsigned j = 0; j < n; j++)
{
printf("%f\t", X[i][j]);
}
printf("F = %f\n", Y[i]);
}
getchar();
}
}
Update: I tried adding error handling like so
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
fprintf(stderr, cudaGetErrorString(err));
exit(1);
}
after each API call, but it gave me nothing and didn't return (I still get all the results and program works to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
As for infinite values, that is to be expected isn't it? Your code starts with values in (0,1), and then does
X[i] = X[i] * (5.12--5.12) - -5.12
100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by
X[i] = X[i] * (2.048--2.048) - -2.048
100 times, which is the rough equivalent of multiplying by 4^100, finally followed by
X[i] = X[i] * (5.12--5.12) - -5.12
again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.

pass structure to kernel local memory

I have problem with passing structure to kernel local memory. Here is the kernel kode:
typedef struct data {
unsigned long wId; // group_id
unsigned long iId[1]; // global_item_id
} DATA;
__kernel void tKernel(__global DATA *x, __local DATA tmp) {
int wd = get_work_dim();
// x dimension
int xGrId = get_group_id(0);
int xLId = get_local_id(0);
int xGlId = get_global_id(0);
x += xGrId;
x->wId = tmp.wId;
x->iId[xLId] = ++tmp.wId;
}
Here is the host code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define GLOBAL_ITEM_SIZE (1)
#define LOCAL_ITEM_SIZE (1)
#define MAX_SOURCE_SIZE (0x100000)
typedef struct data {
unsigned long wId;
unsigned long iId[LOCAL_ITEM_SIZE];
} DATA;
int main() {
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_mem cmPinnedBufOut = NULL;
DATA *cDataOut = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t group_size = GLOBAL_ITEM_SIZE / LOCAL_ITEM_SIZE;
FILE *fp;
const char fileName[] = "./kernel.cl";
size_t source_size;
char *source_str;
/* Load kernel source file */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(EXIT_FAILURE);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue with measurment of preformance */
commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create memory object */
cmPinnedBufOut = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, group_size * sizeof(DATA), NULL, &ret);
cDataOut = (DATA *)malloc(group_size * sizeof(DATA));
/* Create kernel program from source file */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
assert(ret == CL_SUCCESS);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("\nFail to build the program\n");
char buffer[10240];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
printf("%s\n", buffer);
exit(EXIT_FAILURE);
}
/* Create data parallel OpenCL kernel */
kernel = clCreateKernel(program, "tKernel", &ret);
assert(ret == CL_SUCCESS);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cmPinnedBufOut);
assert(ret == CL_SUCCESS);
DATA tmp;
tmp.wId = 66;
ret = clSetKernelArg(kernel, 1, sizeof(DATA), &tmp);
assert(ret == CL_SUCCESS);
size_t global_item_size = GLOBAL_ITEM_SIZE;
size_t local_item_size = LOCAL_ITEM_SIZE;
/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret == CL_INVALID_WORK_GROUP_SIZE) {
printf("Invalid work group size: error when compute group size: %lu/%lu", global_item_size, local_item_size);
exit(EXIT_FAILURE);
}
assert(ret == CL_SUCCESS);
/* Transfer result to host */
ret = clEnqueueReadBuffer(commandQueue, cmPinnedBufOut, CL_TRUE, 0, group_size * sizeof(DATA), cDataOut, 0, NULL, NULL);
assert(ret == CL_SUCCESS);
/* Display Results */
for (int i = 0; i < group_size; i++) {
printf("%d: -> group_id %lu ~> work_item_ids: ", i, cDataOut[i].wId);
for (int j = 0; j < LOCAL_ITEM_SIZE; j++)
printf("%2lu, ", cDataOut[i].iId[j]);
printf("\n");
}
printf("\n");
/* Finalization */
ret = clFlush(commandQueue);
ret = clFinish(commandQueue); // blockink function, wait until all queue cmd are finished
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
free(source_str);
return 0;
}
So I expected as result 0: -> group_id 66 ~> work_item_ids: 67,
But I get 0: -> group_id 0 ~> work_item_ids: 1,
From this I conclude that the hh structure with the number 66 was not reading correctly
by the kernel. I try to put this same way on integer number and this works perfectly.
So my question is, am I doing something wrong, or there isn't way to copy data structure from host to device local memory, or is there another way to doing this?
The clSetKernelArg for __local buffers only specifies the size, and the pointer must be 0. See OpenCL spec 5.7.2. There is no way you can initialize local memory from the host.

Resources