Blurring creates stripes on the output image - opencv

I wrote this piece of code to make a median Blur in CUDA but I am running into an issue, where the channel of image is blurred but it creates stripes which look unusual for blurring.
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace std;
using namespace cv;
#define BLOCK_SIZE 16
#define TILE_SIZE 14
#define FILTER_WIDTH 3
#define FILTER_HEIGHT 3
__device__ void sort(unsigned char* filterVector)
{
for (int i = 0; i < FILTER_WIDTH*FILTER_HEIGHT; i++) {
for (int j = i + 1; j < FILTER_WIDTH*FILTER_HEIGHT; j++) {
if (filterVector[i] > filterVector[j]) {
unsigned char tmp = filterVector[i];
filterVector[i] = filterVector[j];
filterVector[j] = tmp;
}
}
}
}
__global__ void medianFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
int x_o = TILE_SIZE * blockIdx.x + threadIdx.x;
int y_o = TILE_SIZE * blockIdx.y + threadIdx.y;
int x_i = x_o - (FILTER_HEIGHT / 2);
int y_i = y_o - (FILTER_WIDTH / 2);
__shared__ unsigned char sBuffer[BLOCK_SIZE][BLOCK_SIZE];
if ((x_i >= 0) && (x_i < width) && (y_i >= 0) && (y_i < height)) {
sBuffer[threadIdx.y][threadIdx.x] = srcImage[y_i * width + x_i];
} else {
sBuffer[threadIdx.y][threadIdx.x] = 0;
}
__syncthreads();
unsigned char filterVector[FILTER_WIDTH*FILTER_HEIGHT];
// int size_vec = sizeof(filterVector) / sizeof(filterVector[0]);
// printf("%d \n", size_vec);
if (threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
for (int r = 0; r < FILTER_HEIGHT; r++) {
for (int c = 0; c < FILTER_HEIGHT; c++) {
filterVector[r*FILTER_HEIGHT+c] = sBuffer[threadIdx.y + r][threadIdx.x + c];
}
}
}
sort(filterVector);
if (x_o < width && y_o < height) {
dstImage[y_o * width + x_o] = filterVector[4]; // (FILTER_WIDTH*FILTER_HEIGHT)/2
}
}
int main(int argc, char **argv)
{
std::string image_path = "./test.jpg";
cv::Mat img = imread(image_path, IMREAD_COLOR);
std::string output_file = "test_gpu.jpg";
if(img.empty())
{
std::cout << "Couldn't read img:" << image_path << std::endl;
}
Mat bgr[3];
split(img, bgr);
cv::Mat dstImg (bgr[1].size(), bgr[1].type());
const int inputSize = img.cols * img.rows;
const int outputSize = dstImg.cols * dstImg.rows;
unsigned char *d_input, *d_output;
cudaMalloc<unsigned char>(&d_input, inputSize);
cudaMalloc<unsigned char>(&d_output, outputSize);
cudaMemcpy(d_input, bgr[1].ptr(), inputSize, cudaMemcpyHostToDevice);
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid((dstImg.cols + TILE_SIZE - 1)/TILE_SIZE, (dstImg.rows + TILE_SIZE - 1)/TILE_SIZE);
medianFilter<<<grid,block>>>(d_input, d_output, dstImg.cols, dstImg.rows);
cudaMemcpy(dstImg.ptr(), d_output, outputSize, cudaMemcpyDeviceToHost);
cudaFree(d_input);
cudaFree(d_output);
imwrite(output_file, dstImg);
}
This is my original image:
and here is one blurred channel:
For some reason I get those stripes on the output image, which is just one of the channels for now. Any idea why this is happening?

Your intention is that even though you are launching a block of dimension (BLOCK_SIZE, BLOCK_SIZE), you only intend (TILE_SIZE, TILE_SIZE) threads in that block to actually compute the values for output pixels.
However you are not properly accounting for that here:
if (x_o < width && y_o < height) {
that should be, instead:
if (x_o < width && y_o < height && threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
(In fact, everything after the __syncthreads() in your kernel can be conditioned to only execute if threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE if you wish.)

Related

How to implement nearest neighbours image resizing algorithm in CUDA?

My main purpose is to load frames from a video with OpenCV, then copy it Nvidia Gpu memory, resize it with a Cuda based nearest neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow()
Unfortunately, I always got segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions.
Below, you can find the main parts of the source code, but here is the repo for the full project:
https://github.com/foxakarmi/imageResize
Main function:
#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>
void *buffers[3];
int main() {
cv::VideoCapture capture;
cv::Mat frame;
capture.open("/p.mp4");
if (!capture.isOpened()) {
std::cout << "can not open" << std::endl;
return -1;
}
capture.read(frame);
CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
buffers[2] = malloc(3 * 640 * 640);
while (capture.read(frame)) {
CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))
cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);
CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))
cv::Mat foo;
foo.data = static_cast<uchar *>(buffers[2]);
cv::imshow("img", foo);
cv::waitKey(1);
}
capture.release();
return 0;
}
The .cu file containing the kernel and a wrapper function:
#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"
__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int channel = 3;
if (i < dst_h && j < dst_w) {
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_h;
dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
}
}
cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
if (!src_img || !dst_img)
return cudaErrorInvalidDevicePointer;
if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
return cudaErrorInvalidValue;
kernelNearestNeighbourResize <<< 3600, 256>>>(
src_img, dst_img, src_w,
src_h, dst_w, dst_h);
return cudaGetLastError();
}
Below you can see a complete working solution.
There are 3 main issues in your code:
The setup for the CUDA grid is incorrect. See an example how to set it in my code below (just an initial working version that you can further improve). See some general info here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize.
See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer for the output cv::Mat foo, without properly initializing it.
So the cv::Mat metadata (rows, cols etc.) were not set and cv::imshow could not show it properly.
In my code it is not required - see below.
Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking if dst_img was already initialized, and if not (since it's the first frame) - initialize it and the CUDA buffers.
Some more notes on the code below:
There's no need to allocate buffer[2] for the host output image.
Instead I initialized the cv::Mat with the proper size and use it's allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than making it assume it is 3.
I passed the step (AKA stride) of the images to the kernel. This will support the case where the images have padding (see about it here: stride and padding of an image).
Code for main:
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>
#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);
int main()
{
cv::VideoCapture capture;
cv::Mat frame;
capture.open("/p.mp4");
if (!capture.isOpened())
{
std::cout << "can not open" << std::endl;
return -1;
}
capture.read(frame);
int src_w = frame.cols;
int src_h = frame.rows;
int src_step = (int)frame.step[0];
int channels = frame.channels();
int data_type = frame.type();
assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U); // assert that it is a uchar image
// Parameters you can change:
int dst_w = 640;
int dst_h = 640;
cv::Mat dst_img(dst_h, dst_w, data_type);
int dst_step = (int)dst_img.step[0];
void * src_dev_buffer;
void * dst_dev_buffer;
CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));
while (capture.read(frame))
{
// assert that the current frame has the same type and dimensions as the first one (should be guaranteed by the video decoder):
assert(frame.cols == src_w);
assert(frame.rows == src_h);
assert((int)frame.step[0] == src_step);
assert(frame.type() == data_type);
CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));
cv::imshow("dst_img", dst_img);
cv::waitKey(1);
}
CUDA_CHECK(cudaFree(src_dev_buffer));
CUDA_CHECK(cudaFree(dst_dev_buffer));
capture.release();
return 0;
}
Code for the CUDA kernel and the wrapping function:
#include "cuda_runtime.h"
__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (i < dst_h && j < dst_w)
{
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_w;
int src_offset = i * dst_step + j * channels;
int dst_offset = iIn * src_step + jIn * channels;
for (int c = 0; c < channels; ++c)
{
dst_img[src_offset + c] = src_img[dst_offset + c];
}
}
}
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
if (!src_img || !dst_img)
return cudaErrorInvalidDevicePointer;
if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
return cudaErrorInvalidValue;
// The grid dimensions
dim3 dimBlock(32, 32);
dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);
kernelNearestNeighbourResize << < dimGrid, dimBlock >> >(
src_img, dst_img, channels,
src_w, src_h, src_step, dst_w, dst_h, dst_step);
return cudaGetLastError();
}

RGB2GRAY with CUDA and CImg library

I need to do a RGB2GRAY image processing algorithm. I just need some help in completing the global function or how I can access the * d_src pointer. This is my code, your help will be greatly appreciated.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CImg.h"
#include <iostream>
using namespace std;
using namespace cimg_library;
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
}
int main(){
//Load image
CImg<unsigned char> src("lena.jpg");
int width = src.width();
int height = src.height();
unsigned long sizee = src.size();
int sze = width * height;
cout << sze << endl;
//create pointer to image
unsigned char *h_src = src.data();
CImg<unsigned char> dst(width, height, 1, 1);
unsigned char *h_dst = dst.data();
unsigned char *d_src;
unsigned char *d_dst;
cout << sizee << endl;
cudaMalloc((void**)&d_src, sizee);
cudaMalloc((void**)&d_dst, width*height*sizeof(int));
cudaMemcpy(d_src, h_src, sizee, cudaMemcpyHostToDevice);
//launch the kernel
rgb2gray << <(width/16,height/16,1), (16, 16, 1) >> >(d_src, d_dst, width, height);
//force the printf()s to flush
cudaDeviceSynchronize();
// copy back the result array to the CPU
cudaMemcpy(h_dst, d_dst, width*height, cudaMemcpyDeviceToHost);
cudaFree(d_src);
cudaFree(d_dst);
CImgDisplay main_disp(dst, "After Processing");
while (!main_disp.is_closed())
main_disp.wait();
return 0;
}
Firstly, since your dst object consists of unsigned char, allocate d_dst as follows;
cudaMalloc((void**)&d_dst, width*height*sizeof(unsigned char));
Next, grid must cover every pixels, considering cases when width or height are not a multiple of 16. Launch kernel with following kernel configuration.
dim3 blkDim (16, 16, 1);
dim3 grdDim ((width + 15)/16, (height + 15)/16, 1);
rgb2gray<<<grdDim, blkDim>>>(d_src, d_dst, width, height);
Lastly, your kernel should look like this. Note that RGB channels are split in d_src.
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
unsigned char r = d_src[pos_y * width + pos_x];
unsigned char g = d_src[(height + pos_y) * width + pos_x];
unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];
unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5);
unsigned char gray = _gray > 255 ? 255 : _gray;
d_dst[pos_y * width + pos_x] = gray;
You can see the full code here.

Assertion failed in my OpenCV program

My program is the another edition of the OpenCV Harris and Shi-Tomasi corner detection.
I just added a little code into it to change the image corner detection into video corner detection. However, it came out one faulty which I can't deal with. (cv::Exception).
This is my program:
#include "stdafx.h"
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/imgproc/imgproc.hpp>
using namespace cv;
using namespace std;
double Harris_minVal;
double Harris_maxVal;
double ShiTomasi_minVal;
double ShiTomasi_maxVal;
int Harris_qualityLevel = 50;
int ShiTomasi_qualityLevel = 50;
int max_qualityLevel = 100;
RNG rng(12345);
const char* Harris_window = "Harris Corner Detection";
const char* ShiTomasi_window = "ShiTomasi Corner Detection";
Mat frame, frame_gray, Harris_dst, Harris_copy, ShiTomasi_dst, ShiTomasi_copy,Mc;
void HarrisFunction(int, void*);
void ShiTomasiFunction(int, void*);
int main()
{
VideoCapture capture(0);
while (1) {
capture >> frame;
cvtColor(frame, frame_gray, COLOR_BGR2GRAY);
int blockSize = 3;
int kSize = 3;
Harris_dst = Mat::zeros(frame_gray.size(), CV_32FC(6));
Mc = Mat::zeros(frame_gray.size(), CV_32FC1);
cornerEigenValsAndVecs(frame_gray, Harris_dst, blockSize, kSize, BORDER_DEFAULT);
for (int j = 0; j < frame_gray.rows; j++) {
for (int i = 0; i < frame_gray.cols; i++) {
float lambda_1 = Harris_dst.at<Vec6f>(j, i)[0];
float lambda_2 = Harris_dst.at<Vec6f>(j, i)[1];
Mc.at<float>(j, i) = lambda_1*lambda_2 - 0.04f*pow((lambda_1 + lambda_2), 2);
}
}
minMaxLoc(Mc, &Harris_minVal, &Harris_maxVal, 0, 0, Mat());
namedWindow(Harris_window, WINDOW_AUTOSIZE);
createTrackbar("质量:", Harris_window, &Harris_qualityLevel, max_qualityLevel, HarrisFunction);
HarrisFunction(0, 0);
ShiTomasi_dst = Mat::zeros(frame_gray.size(), CV_32FC1);
cornerEigenValsAndVecs(frame_gray, ShiTomasi_dst, blockSize, kSize, BORDER_DEFAULT);
minMaxLoc(ShiTomasi_dst, &ShiTomasi_minVal, &ShiTomasi_maxVal, 0, 0, Mat());
namedWindow(ShiTomasi_window, WINDOW_AUTOSIZE);
createTrackbar(" 质量:", ShiTomasi_window, &ShiTomasi_qualityLevel, max_qualityLevel, ShiTomasiFunction);
ShiTomasiFunction(0, 0);
}
return 0;
}
void HarrisFunction(int, void*) {
Harris_copy = frame.clone();
if (Harris_qualityLevel < 1) {
Harris_qualityLevel = 1;
}
for (int j = 0; j <= frame_gray.rows; j++) {
for (int i = 0; i <= frame_gray.cols; i++) {
if (Mc.at<float>(j, i) > Harris_minVal + (Harris_maxVal - Harris_minVal)*(Harris_qualityLevel / max_qualityLevel))
{
circle(Harris_copy, Point(i, j), 4, Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255)),1,8,0);
}
}
}
imshow(Harris_window, Harris_copy);
waitKey(30);
}
void ShiTomasiFunction(int, void*) {
ShiTomasi_copy = frame.clone();
if (ShiTomasi_qualityLevel < 1) {
ShiTomasi_qualityLevel = 1;
}
for (int j = 0; j <= frame_gray.rows; j++) {
for (int i = 0; i <= frame_gray.cols; i++) {
if (ShiTomasi_dst.at<float>(j, i) > ShiTomasi_minVal + (ShiTomasi_maxVal - ShiTomasi_minVal)*(ShiTomasi_qualityLevel / max_qualityLevel))
{
circle(ShiTomasi_copy, Point(i, j), 4, Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255)), 1, 8, 0);
}
}
}
imshow(ShiTomasi_window, ShiTomasi_copy);
waitKey(30);
}
After my debug, the Problem is before imshow(Harris_window,Harris_copy):
OpenCV Error: Assertion failed (dims <= 2 && data && (unsigned)i0<(unsigned)size.p[0] && (unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()) && ((((sizeof(size_t)<<28)|0x8442211) >> ((DataType<_Tp>::depth) & ((1<< 3) - 1))*4) & 15) == elemSize1()) in cv::Mat::at,filed:\opencv\build\include\opencv2\core\mat.inl.hpp, line 894
The upper part is what the panel window writes.

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and 2D grid to make the problem natural. Here's how I index: (height and width are image dimensions)
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want to return four boundaries (i'll cater them later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine on all standard images sizes. But for some random image sizes it shows diagonal line(s). For example in my case 500x333 image (even when no dimension is multiple of 16) is showing correct output whereas 450x365 is showing diagonal lines in the output. The problem remains even if I just return the extra threads of grid and nothing else like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same, some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES Thanks!
Update:
Kernel Code (Simplified to return input image, but gives the same problem)
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
/*Filter code removed, still gives the same problem*/
out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
/*Kernel Code*/
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say it failing (450 x 364 with filterSize=5):
#include <stdio.h>
#include <assert.h>
template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
out[tid] = in[tid];
}
int main(void)
{
const int width = 450, height = 365, filterSize=5;
const size_t isize = sizeof(unsigned char) * size_t(width * height);
unsigned char * _in, * _out, * out;
assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
assert( cudaMemset(_out, 'A', isize) == cudaSuccess );
const dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
assert( cudaPeekAtLastError() == cudaSuccess );
out = (unsigned char *)malloc(isize);
assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<width; i++) {
fprintf(stdout, "%d: ", i);
for(int j=0; j<height; j++) {
unsigned int idx = i + j*width;
fprintf(stdout, "%c", out[idx]);
}
fprintf(stdout, "\n");
}
return cudaThreadExit();
}
When run it does exactly what I would expect, overwriting the output memory with the input everywhere except for the first and last two lines and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute 1.2 GPU. So whatever is happening in you code, it isn't happening in my repro case, which either means I have misinterpreted what you have written, or there is something else you haven't described which is causing the problem.

Why do operations with an array corrupt the values?

I'm trying to implement the Particle Swarm Optimization on CUDA. I'm partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.
The problem is, when I'm trying to modify array element like so
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can't spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;
typedef struct {
unsigned k_max;
float c1;
float c2;
unsigned p;
float inertia_factor;
float Ef;
float X_low[f_n];
float X_high[f_n];
float X_min[n][f_n];
} params_t;
typedef void (*kernelWrapperType) (
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
);
typedef float (*twoArgsFuncType) (
float x1,
float x2
);
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
__device__ float kernelF1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
void f1(
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
) {
float *X_d = NULL;
float *Y_d = NULL;
unsigned length = n * p;
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
size_t pitch;
size_t dpitch;
cudaError_t err;
unsigned width = n;
unsigned height = p;
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
pitch = n * sizeof(float);
err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
err = cudaMalloc (&Y_d, sizeof(float) * p);
err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
err = cudaFree(X_d);
err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
err = cudaFree(Y_d);
}
float F1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
/*
* Generates random float in [0.0; 1.0]
*/
float frand(){
return (float)rand()/(float)RAND_MAX;
}
/*
* This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
* brings the result vector back to host and dumps it on the console.
*/
int main() {
const params_t params = {
100,
0.5,
0.5,
p,
0.98,
0.01,
{-5.12, -2.048, -5.12},
{5.12, 2.048, 5.12},
{{0, 1, 0}, {0, 1, 0}}
};
float X[p][n];
float X_highVec[n];
float V[p][n];
float X_best[p][n];
float Y[p] = {0};
float Y_best[p] = {0};
float X_swarmBest[n];
kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
for (unsigned f = 0; f < f_n; f++) {
printf("Optimizing function #%u\n", f);
srand ( time(NULL) );
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
X[i][j] = X_best[i][j] = frand();
for (int i = 0; i < n; i++)
X_highVec[i] = params.X_high[f];
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
V[i][j] = frand();
for (unsigned i = 0; i < p; i++)
Y_best[i] = F[f](X[i][0], X[i][1]);
for (unsigned i = 0; i < n; i++)
X_swarmBest[i] = params.X_high[f];
float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
bool termination = false;
float inertia = 1.;
for (unsigned k = 0; k < params.k_max; k++) {
F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
}
for (unsigned i = 0; i < p; i++)
{
for (unsigned j = 0; j < n; j++)
{
printf("%f\t", X[i][j]);
}
printf("F = %f\n", Y[i]);
}
getchar();
}
}
Update: I tried adding error handling like so
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
fprintf(stderr, cudaGetErrorString(err));
exit(1);
}
after each API call, but it gave me nothing and didn't return (I still get all the results and program works to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
As for infinite values, that is to be expected isn't it? Your code starts with values in (0,1), and then does
X[i] = X[i] * (5.12--5.12) - -5.12
100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by
X[i] = X[i] * (2.048--2.048) - -2.048
100 times, which is the rough equivalent of multiplying by 4^100, finally followed by
X[i] = X[i] * (5.12--5.12) - -5.12
again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.

Resources