OpenCL :Access proper index by using globalid(.) - opencv

Hi,
I am coding in OpenCL.
I am converting a "C function" having 2D array starting from i=1 and j=1 .PFB .
cv::Mat input; //Input :having some data in it ..
//Image input size is :input.rows=288 ,input.cols =640
cv::Mat output(input.rows-2,input.cols-2,CV_32F); //Output buffer
//Image output size is :output.rows=286 ,output.cols =638
This is a code Which I want to modify in OpenCL:
for(int i=1;i<output.rows-1;i++)
{
for(int j=1;j<output.cols-1;j++)
{
float xVal = input.at<uchar>(i-1,j-1)-input.at<uchar>(i-1,j+1)+ 2*(input.at<uchar>(i,j-1)-input.at<uchar>(i,j+1))+input.at<uchar>(i+1,j-1) - input.at<uchar>(i+1,j+1);
float yVal = input.at<uchar>(i-1,j-1) - input.at<uchar>(i+1,j-1)+ 2*(input.at<uchar>(i-1,j) - input.at<uchar>(i+1,j))+input.at<uchar>(i-1,j+1)-input.at<uchar>(i+1,j+1);
output.at<float>(i-1,j-1) = xVal*xVal+yVal*yVal;
}
}
...
Host code :
//Input Image size is :input.rows=288 ,input.cols =640
//Output Image size is :output.rows=286 ,output.cols =638
OclStr->global_work_size[0] =(input.cols);
OclStr->global_work_size[1] =(input.rows);
size_t outBufSize = (output.rows) * (output.cols) * 4;//4 as I am copying all 4 uchar values into one float variable space
cl_mem cl_input_buffer = clCreateBuffer(
OclStr->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR ,
(input.rows) * (input.cols),
static_cast<void *>(input.data), &OclStr->returnstatus);
cl_mem cl_output_buffer = clCreateBuffer(
OclStr->context, CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR ,
(output.rows) * (output.cols) * sizeof(float),
static_cast<void *>(output.data), &OclStr->returnstatus);
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 0, sizeof(cl_mem), (void *)&cl_input_buffer);
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 1, sizeof(cl_mem), (void *)&cl_output_buffer);
OclStr->returnstatus = clEnqueueNDRangeKernel(
OclStr->command_queue,
OclStr->objkernel,
2,
NULL,
OclStr->global_work_size,
NULL,
0,
NULL,
NULL
);
clEnqueueMapBuffer(OclStr->command_queue, cl_output_buffer, true, CL_MAP_READ, 0, outBufSize, 0, NULL, NULL, &OclStr->returnstatus);
kernel Code :
__kernel void Sobel_uchar (__global uchar *pSrc, __global float *pDstImage)
{
const uint cols = get_global_id(0)+1;
const uint rows = get_global_id(1)+1;
const uint width= get_global_size(0);
uchar Opsoble[8];
Opsoble[0] = pSrc[(cols-1)+((rows-1)*width)];
Opsoble[1] = pSrc[(cols+1)+((rows-1)*width)];
Opsoble[2] = pSrc[(cols-1)+((rows+0)*width)];
Opsoble[3] = pSrc[(cols+1)+((rows+0)*width)];
Opsoble[4] = pSrc[(cols-1)+((rows+1)*width)];
Opsoble[5] = pSrc[(cols+1)+((rows+1)*width)];
Opsoble[6] = pSrc[(cols+0)+((rows-1)*width)];
Opsoble[7] = pSrc[(cols+0)+((rows+1)*width)];
float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5];
float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5];
pDstImage[(cols-1)+(rows-1)*width] = gx*gx + gy*gy;
}
Here I am not able to get the output as expected.
I am having some questions that
My for loop is starting from i=1 instead of zero, then How can I get proper index by using the global_id() in x and y direction
What is going wrong in my above kernel code :(
I am suspecting there is a problem in buffer stride but not able to further break my head as already broke it throughout a day :(
I have observed that with below logic output is skipping one or two frames after some 7/8 frames sequence.
I have added the screen shot of my output which is compared with the reference output.
My above logic is doing partial sobelling on my input .I changed the width as -
const uint width = get_global_size(0)+1;
PFB
Your suggestions are most welcome !!!

It looks like you may be fetching values in (y,x) format in your opencl version. Also, you need to add 1 to the global id to replicate your for loops starting from 1 rather than 0.
I don't know why there is an unused iOffset variable. Maybe your bug is related to this? I removed it in my version.
Does this kernel work better for you?
__kernel void simple(__global uchar *pSrc, __global float *pDstImage)
{
const uint i = get_global_id(0) +1;
const uint j = get_global_id(1) +1;
const uint width = get_global_size(0) +2;
uchar Opsoble[8];
Opsoble[0] = pSrc[(i-1) + (j - 1)*width];
Opsoble[1] = pSrc[(i-1) + (j + 1)*width];
Opsoble[2] = pSrc[i + (j-1)*width];
Opsoble[3] = pSrc[i + (j+1)*width];
Opsoble[4] = pSrc[(i+1) + (j - 1)*width];
Opsoble[5] = pSrc[(i+1) + (j + 1)*width];
Opsoble[6] = pSrc[(i-1) + (j)*width];
Opsoble[7] = pSrc[(i+1) + (j)*width];
float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5];
float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5];
pDstImage[(i-1) + (j-1)*width] = gx*gx + gy*gy ;
}

I am a bit apprehensive about posting an answer suggesting optimizations to your kernel, seeing as the original output has not been reproduced exactly as of yet. There is a major improvement available to be made for problems related to image processing/filtering.
Using local memory will help you out by reducing the number of global reads by a factor of eight, as well as grouping the global writes together for potential gains with the single write-per-pixel output.
The kernel below reads a block of up to 34x34 from pSrc, and outputs a 32x32(max) area of the pDstImage. I hope the comments in the code are enough to guide you in using the kernel. I have not been able to give this a complete test, so there could be changes required. Any comments are appreciated as well.
__kernel void sobel_uchar_wlocal (__global uchar *pSrc, __global float *pDstImage, __global uint2 dimDstImage)
{
//call this kernel 1-dimensional work group size: 32x1
//calculates 32x32 region of output with 32 work items
const uint wid = get_local_id(0);
const uint wid_1 = wid+1; // corrected for the calculation step
const uint2 gid = (uint2)(get_group_id(0),get_group_id(1));
const uint localDim = get_local_size(0);
const uint2 globalTopLeft = (uint2)(localDim * gid.x, localDim * gid.y); //position in pSrc to copy from/to
//dimLocalBuff is used for the right and bottom edges of the image, where the work group may run over the border
const uint2 dimLocalBuff = (uint2)(localDim,localDim);
if(dimDstImage.x - globalTopLeft.x < dimLocalBuff.x){
dimLocalBuff.x = dimDstImage.x - globalTopLeft.x;
}
if(dimDstImage.y - globalTopLeft.y < dimLocalBuff.y){
dimLocalBuff.y = dimDstImage.y - globalTopLeft.y;
}
int i,j;
//save region of data into local memory
__local uchar srcBuff[34][34]; //34^2 uchar = 1156 bytes
for(j=-1;j<dimLocalBuff.y+1;j++){
for(i=x-1;i<dimLocalBuff.x+1;i+=localDim){
srcBuff[i+1][j+1] = pSrc[globalTopLeft.x+i][globalTopLeft.y+j];
}
}
mem_fence(CLK_LOCAL_MEM_FENCE);
//compute output and store locally
__local float dstBuff[32][32]; //32^2 float = 4096 bytes
if(wid_1 < dimLocalBuff.x){
for(i=0;i<dimLocalBuff.y;i++){
float gx = srcBuff[(wid_1-1)+ (i - 1)]-srcBuff[(wid_1-1)+ (i + 1)]+2*(srcBuff[wid_1+ (i-1)]-srcBuff[wid_1+ (i+1)])+srcBuff[(wid_1+1)+ (i - 1)]-srcBuff[(wid_1+1)+ (i + 1)];
float gy = srcBuff[(wid_1-1)+ (i - 1)]-srcBuff[(wid_1+1)+ (i - 1)]+2*(srcBuff[(wid_1-1)+ (i)]-srcBuff[(wid_1+1)+ (i)])+srcBuff[(wid_1-1)+ (i + 1)]-srcBuff[(wid_1+1)+ (i + 1)];
dstBuff[wid][i] = gx*gx + gy*gy;
}
}
mem_fence(CLK_LOCAL_MEM_FENCE);
//copy results to output
for(j=0;j<dimLocalBuff.y;j++){
for(i=0;i<dimLocalBuff.x;i+=localDim){
srcBuff[i][j] = pSrc[globalTopLeft.x+i][globalTopLeft.y+j];
}
}
}

Related

opencv cv::mat not returning the same result

int sizeOfChannel = (_width / 2) * (_height / 2);
double* channel_gr = new double[sizeOfChannel];
// filling the data into channel_gr....
cv::Mat my( _width/2, _height/2, CV_32F,channel_gr);
cv::Mat src(_width/2, _height/2, CV_32F);
for (int i = 0; i < (_width/2) * (_height/2); ++i)
{
src.at<float>(i) = channel_gr[i];
}
cv::imshow("src",src);
cv::imshow("my",my);
cv::waitKey(0);
I'm wondering why i'm not getting the same image in my and src imshow
update:
I have changed my array into double* still same result;
I think it is something to do with steps?
my image output
src image output
this one works for me:
int halfWidth = _width/2;
int halfHeight = _height/2;
int sizeOfChannel = halfHeight*halfWidth;
// ******************************* //
// you use CV_321FC1 later so it is single precision float
float* channel_gr = new float[sizeOfChannel];
// filling the data into channel_gr....
for(int i=0; i<sizeOfChannel; ++i) channel_gr[i] = i/(float)sizeOfChannel;
// ******************************* //
// changed row/col ordering, but this shouldnt be important
cv::Mat my( halfHeight , halfWidth , CV_32FC1,channel_gr);
cv::Mat src(halfHeight , halfWidth, CV_32FC1);
// ******************************* //
// changed from 1D indexing to 2D indexing
for(int y=0; y<src.rows; ++y)
for(int x=0; x<src.cols; ++x)
{
int arrayPos = y*halfWidth + x;
// you have a 2D mat so access it in 2D
src.at<float>(y,x) = channel_gr[arrayPos ];
}
cv::imshow("src",src);
cv::imshow("my",my);
// check for differences
cv::imshow("diff1 > 0",src-my > 0);
cv::imshow("diff2 > 0",my-src > 0);
cv::waitKey(0);
'my' is array of floats but you give it pointer to arrays of double. There no way it can get data from this array properly.
It seems that the constructor version that you are using is
Mat::Mat(int rows, int cols, int type, const Scalar& s)
This is from OpenCV docs. Seems like you are using float for src and assigning from channel_gr (declared as double). Isn't that some form of precision loss?

JavaCV creating and drawing grayscale one channel histogram

i am new to this website, please let me know if i have made any mistake on my post.
I have some questions regarding calculating and drawing histogram in javacv. Below are the codes that i have written based on some information that i have searched:
There is this error that i get: OpenCV Error: One of arguments' values is out of range (index is out of range) in unknown function, file ......\src\opencv\modules\core\src\array.cpp, line 1691
private CvHistogram getHistogram(IplImage image) {//get histogram data, input has been converted to grayscale beforehand
IplImage[] hsvImage1 = {image};
//bins and value-range
int numberOfBins = 256;
float minRange = 0.0f;
float maxRange = 255.0f;
// Allocate histogram object
int dims = 1;
int[] sizes = new int[]{numberOfBins};
int histType = CV_HIST_ARRAY;
float[] minMax = new float[]{minRange, maxRange};
float[][] ranges = new float[][]{minMax};
CvHistogram hist = cvCreateHist(dims, sizes, histType, ranges, 1);
cvCalcHist(hsvImage1, hist, 0, null);
return hist;
}
private IplImage DrawHistogram(CvHistogram hist, IplImage image) {//draw histogram
int scaleX = 1;
int scaleY = 1;
int i;
float[] max_value = {0};
int[] int_value = {0};
cvGetMinMaxHistValue(hist, max_value, max_value, int_value, int_value);//get min and max value for histogram
IplImage imgHist = cvCreateImage(cvSize(256, image.height() ),IPL_DEPTH_8U,1);//create image to store histogram
cvZero(imgHist);
CvPoint pts = new CvPoint(5);
for (i = 0; i < 256; i++) {//draw the histogram
float value = opencv_legacy.cvQueryHistValue_1D(hist, i);
float nextValue = opencv_legacy.cvQueryHistValue_1D(hist, i + 1);
pts.position(0).x(i * scaleX).y(image.height() * scaleY);
pts.position(1).x(i * scaleX + scaleX).y(image.height() * scaleY);
pts.position(2).x(i * scaleX + scaleX).y((int)((image.height() - nextValue * image.height() /max_value[0]) * scaleY));
pts.position(3).x(i * scaleX).y((int)((image.height() - value * image.height() / max_value[0]) * scaleY));
pts.position(4).x(i * scaleX).y(image.height() * scaleY);
cvFillConvexPoly(imgHist, pts.position(0), 5, CvScalar.RED, CV_AA, 0);
}
return imgHist;
}
I have tried searching few links that i provided at the bottom, however, each of them are in different language, therefore i am not sure i have converted them to java correctly. To be honest there are few things i doubt, will be glad if any advice can be provided, such as:
float[] max_value = {0}; // i referred to the internet and it helps me to getby syntax error in cvGetMinMaxHistValue() , not sure if it will cause logic error
pts.position(3).x(i * scaleX).y((int)((image.height() - value * image.height() / max_value[0]) * scaleY)); // i put int to downcast it to the type the pts will recognise, and one more thing is max_value[0] is 0, wondering if it will cause logical error due to division
Links used:
//use this
public CvHistogram getHistogram(IplImage image) {//get histogram data, input has been converted to grayscale beforehand
IplImageArray hsvImage1 = splitChannels(image);
//bins and value-range
int numberOfBins = 256;
float minRange = 0.0f;
float maxRange = 255.0f;
// Allocate histogram object
int dims = 1;
int[] sizes = new int[]{numberOfBins};
int histType = CV_HIST_ARRAY;
float[] minMax = new float[]{minRange, maxRange};
float[][] ranges = new float[][]{minMax};
CvHistogram hist = cvCreateHist(dims, sizes, histType, ranges, 1);
cvCalcHist(hsvImage1, hist, 0, null);
return hist;
}
private IplImageArray splitChannels(IplImage hsvImage) {
CvSize size = hsvImage.cvSize();
int depth = hsvImage.depth();
IplImage channel0 = cvCreateImage(size, depth, 1);
IplImage channel1 = cvCreateImage(size, depth, 1);
IplImage channel2 = cvCreateImage(size, depth, 1);
cvSplit(hsvImage, channel0, channel1, channel2, null);
return new IplImageArray(channel0, channel1, channel2);
}
Your error is in this part:
for (i = 0; i < 256; i++) {//draw the histogram
float value = opencv_legacy.cvQueryHistValue_1D(hist, i);
float nextValue = opencv_legacy.cvQueryHistValue_1D(hist, i + 1);
You use i+1 and it causes the error out of range, you can use your for until 255 to correct it.
I hope I helped you. GL

colored image to greyscale image using CUDA parallel processing

I am trying to solve a problem in which i am supposed to change a colour image to a greyscale image. For this purpose i am using CUDA parallel approach. The kerne code i am invoking on the GPU is as follows.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
int absolute_image_position_x = blockIdx.x;
int absolute_image_position_y = blockIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage,
size_t numRows,
size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage,
d_greyImage,
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
i see a line of dots in the first pixel line.
error i am getting is
libdc1394 error: Failed to initialize libdc1394
Difference at pos 51 exceeds tolerance of 5
Reference: 255
GPU : 0
my input/output images
Can anyone help me with this??? thanks in advance.
I recently joined this course and tried your solution but it don't work so, i tried my own. You are almost correct. The correct solution is this:
__global__`
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{`
int pos_x = (blockIdx.x * blockDim.x) + threadIdx.x;
int pos_y = (blockIdx.y * blockDim.y) + threadIdx.y;
if(pos_x >= numCols || pos_y >= numRows)
return;
uchar4 rgba = rgbaImage[pos_x + pos_y * numCols];
greyImage[pos_x + pos_y * numCols] = (.299f * rgba.x + .587f * rgba.y + .114f * rgba.z);
}
The rest is same as your code.
Now, since I posted this question I have been continuously working on this problem there are a couple of improvements that should be done in order to get this problem correct now I realize my initial solution was wrong . Changes to be done:-
1. absolute_position_x =(blockIdx.x * blockDim.x) + threadIdx.x;
2. absolute_position_y = (blockIdx.y * blockDim.y) + threadIdx.y;
Secondly,
1. const dim3 blockSize(24, 24, 1);
2. const dim3 gridSize((numCols/16), (numRows/16) , 1);
In the solution we are using a grid of numCols/16 * numCols/16
and blocksize of 24 * 24
code executed in 0.040576 ms
#datenwolf : thanks for answering above!!!
Since you are not aware of the image size. It is best to choose any reasonable dimension of the two-dimensional block of threads and then check for two conditions. The first one is that the pos_x and pos_y indexes in the kernel do not exceed numRows and numCols. Secondly the grid size should be just above the total number of threads in all the blocks.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize((numCols%16) ? numCols/16+1 : numCols/16,
(numRows%16) ? numRows/16+1 : numRows/16, 1);
libdc1394 error: Failed to initialize libdc1394
I don't think that this is a CUDA problem. libdc1394 is a library used to access IEEE1394 aka FireWire aka iLink video devices (DV camcorders, Apple iSight camera). That library doesn'r properly initialize, hence you're not getting usefull results. Basically it's NINO: Nonsens In Nonsens Out.
the calculation of absolute x & y image positions is perfect.
but when u need to access that particular pixel in the coloured image , shouldn't you u use the following code??
uchar4 rgba = rgbaImage[absolute_image_position_x + (absolute_image_position_y * numCols)];
I thought so, when comparing it to a code you'd write to execute the same problem in serial code.
Please let me know :)
You still should have a problem with run time - the conversion will not give a proper result.
The lines:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
should be changed to:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y*numCols];
greyImage[absolute_image_position_x + absolute_image_position_y*numCols] = channelSum;
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
int rgba_x = blockIdx.x * blockDim.x + threadIdx.x;
int rgba_y = blockIdx.y * blockDim.y + threadIdx.y;
int pixel_pos = rgba_x+rgba_y*numCols;
uchar4 rgba = rgbaImage[pixel_pos];
unsigned char gray = (unsigned char)(0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z);
greyImage[pixel_pos] = gray;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(24, 24, 1); //TODO
const dim3 gridSize( numCols/24+1, numRows/24+1, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
The libdc1394 error is not related to firewire etc in this case - it is the library that udacity is using to compare the image your program creates to the reference image. And what is is saying is that the difference between your image and the reference image has been been exceeded by a specific threshold, for that position ie. pixel.
You are running following number of block and grids:
const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
yet you are not using any threads in your kernel code!
int absolute_image_position_x = blockIdx.x;
int absolute_image_position_y = blockIdx.y;
think this way, the width of an image can be divide into absolute_image_position_x parts of column and the height of an image can be divide into absolute_image_position_y parts of row. Now the box each of the cross section it creates you need to change/redraw all the pixels in terms of greyImage, parallely. Enough spoiler for an assignment :)
same code with with ability to handle non-standard input size images
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;
uchar4 rgbcell=rgbaImage[idx*numCols+idy];
greyImage[idx*numCols+idy]=0.299*rgbcell.x+0.587*rgbcell.y+0.114*rgbcell.z;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
int totalpixels=numRows*numCols;
int factors[]={2,4,8,16,24,32};
vector<int> numbers(factors,factors+sizeof(factors)/sizeof(int));
int factor=1;
while(!numbers.empty())
{
if(totalpixels%numbers.back()==0)
{
factor=numbers.back();
break;
}
else
{
numbers.pop_back();
}
}
const dim3 blockSize(factor, factor, 1); //TODO
const dim3 gridSize(numRows/factor+1, numCols/factor+1,1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
1- int x =(blockIdx.x * blockDim.x) + threadIdx.x;
2- int y = (blockIdx.y * blockDim.y) + threadIdx.y;
And in grid and block size
1- const dim3 blockSize(32, 32, 1);
2- const dim3 gridSize((numCols/32+1), (numRows/32+1) , 1);
Code executed in 0.036992 ms.
const dim3 blockSize(16, 16, 1); //TODO
const dim3 gridSize( (numRows+15)/16, (numCols+15)/16, 1); //TODO
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
uchar4 rgba = rgbaImage[y*numRows + x];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[y*numRows + x] = channelSum;

Why do operations with an array corrupt the values?

I'm trying to implement the Particle Swarm Optimization on CUDA. I'm partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.
The problem is, when I'm trying to modify array element like so
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can't spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;
typedef struct {
unsigned k_max;
float c1;
float c2;
unsigned p;
float inertia_factor;
float Ef;
float X_low[f_n];
float X_high[f_n];
float X_min[n][f_n];
} params_t;
typedef void (*kernelWrapperType) (
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
);
typedef float (*twoArgsFuncType) (
float x1,
float x2
);
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
__device__ float kernelF1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
void f1(
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
) {
float *X_d = NULL;
float *Y_d = NULL;
unsigned length = n * p;
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
size_t pitch;
size_t dpitch;
cudaError_t err;
unsigned width = n;
unsigned height = p;
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
pitch = n * sizeof(float);
err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
err = cudaMalloc (&Y_d, sizeof(float) * p);
err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
err = cudaFree(X_d);
err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
err = cudaFree(Y_d);
}
float F1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
/*
* Generates random float in [0.0; 1.0]
*/
float frand(){
return (float)rand()/(float)RAND_MAX;
}
/*
* This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
* brings the result vector back to host and dumps it on the console.
*/
int main() {
const params_t params = {
100,
0.5,
0.5,
p,
0.98,
0.01,
{-5.12, -2.048, -5.12},
{5.12, 2.048, 5.12},
{{0, 1, 0}, {0, 1, 0}}
};
float X[p][n];
float X_highVec[n];
float V[p][n];
float X_best[p][n];
float Y[p] = {0};
float Y_best[p] = {0};
float X_swarmBest[n];
kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
for (unsigned f = 0; f < f_n; f++) {
printf("Optimizing function #%u\n", f);
srand ( time(NULL) );
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
X[i][j] = X_best[i][j] = frand();
for (int i = 0; i < n; i++)
X_highVec[i] = params.X_high[f];
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
V[i][j] = frand();
for (unsigned i = 0; i < p; i++)
Y_best[i] = F[f](X[i][0], X[i][1]);
for (unsigned i = 0; i < n; i++)
X_swarmBest[i] = params.X_high[f];
float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
bool termination = false;
float inertia = 1.;
for (unsigned k = 0; k < params.k_max; k++) {
F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
}
for (unsigned i = 0; i < p; i++)
{
for (unsigned j = 0; j < n; j++)
{
printf("%f\t", X[i][j]);
}
printf("F = %f\n", Y[i]);
}
getchar();
}
}
Update: I tried adding error handling like so
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
fprintf(stderr, cudaGetErrorString(err));
exit(1);
}
after each API call, but it gave me nothing and didn't return (I still get all the results and program works to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
As for infinite values, that is to be expected isn't it? Your code starts with values in (0,1), and then does
X[i] = X[i] * (5.12--5.12) - -5.12
100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by
X[i] = X[i] * (2.048--2.048) - -2.048
100 times, which is the rough equivalent of multiplying by 4^100, finally followed by
X[i] = X[i] * (5.12--5.12) - -5.12
again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.

CUDA memory limitations

If I try to send to my CUDA device a struct wich is heavier than the size of memory available, will CUDA give me any kind of warning or error?
I'm asking that because my GPU has 1024 MBytes (1073414144 bytes) Total amount of global memory, but I don't know how I should handle and eventual problem.
That's my code:
#define VECSIZE 2250000
#define WIDTH 1500
#define HEIGHT 1500
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
int width;
int height;
int* elements;
};
int main()
{
Matrix M;
M.width = WIDTH;
M.height = HEIGHT;
M.elements = (int *) calloc(VECSIZE,sizeof(int));
int row, col;
// define Matrix M
// Matrix generator:
for (int i = 0; i < M.height; i++)
for(int j = 0; j < M.width; j++)
{
row = i;
col = j;
if (i == j)
M.elements[row * M.width + col] = INFINITY;
else
{
M.elements[row * M.width + col] = (rand() % 2); // because 'rand() % 1' just does not seems to work ta all.
if (M.elements[row * M.width + col] == 0) // can't have zero weight.
M.elements[row * M.width + col] = INFINITY;
else if (M.elements[row * M.width + col] == 2)
M.elements[row * M.width + col] = 1;
}
}
// Declare & send device Matrix to Device.
Matrix d_M;
d_M.width = M.width;
d_M.height = M.height;
size_t size = M.width * M.height * sizeof(int);
cudaMalloc(&d_M.elements, size);
cudaMemcpy(d_M.elements, M.elements, size, cudaMemcpyHostToDevice);
int *d_k= (int*) malloc(sizeof(int));
cudaMalloc((void**) &d_k, sizeof (int));
int *d_width=(int*)malloc(sizeof(int));
cudaMalloc((void**) &d_width, sizeof(int));
unsigned int *width=(unsigned int*)malloc(sizeof(unsigned int));
width[0] = M.width;
cudaMemcpy(d_width, width, sizeof(int), cudaMemcpyHostToDevice);
int *d_height=(int*)malloc(sizeof(int));
cudaMalloc((void**) &d_height, sizeof(int));
unsigned int *height=(unsigned int*)malloc(sizeof(unsigned int));
height[0] = M.height;
cudaMemcpy(d_height, height, sizeof(int), cudaMemcpyHostToDevice);
/*
et cetera .. */
While you may not currently be sending enough data to the GPU to max out it's memory, when you do, your cudaMalloc will return the error code cudaErrorMemoryAllocation which as per the cuda api docs, signals that the memory allocation failed. I note that in your example code you are not checking the return values of the cuda calls. These return codes need to be checked to make sure your program is running correctly. The cuda api does not throw exceptions: you must check the return codes. See this article for info on checking the errors and getting meaningful messages about the errors
If you are using cutil.h, then it provides two very useful macros:
CUDA_SAFE_CALL (used while issuing functions like cudaMalloc, cudaMemcpy etc.)
and
CUT_CHECK_ERROR (used after executing a kernel to check for errors in kernel execution).
They take care of the errors, if any, by using the error checking mechanism detailed in the article provided by flipchart.

Resources