RGB2GRAY with CUDA and CImg library - image-processing

I need to do a RGB2GRAY image processing algorithm. I just need some help in completing the global function or how I can access the * d_src pointer. This is my code, your help will be greatly appreciated.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CImg.h"
#include <iostream>
using namespace std;
using namespace cimg_library;
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
}
int main(){
//Load image
CImg<unsigned char> src("lena.jpg");
int width = src.width();
int height = src.height();
unsigned long sizee = src.size();
int sze = width * height;
cout << sze << endl;
//create pointer to image
unsigned char *h_src = src.data();
CImg<unsigned char> dst(width, height, 1, 1);
unsigned char *h_dst = dst.data();
unsigned char *d_src;
unsigned char *d_dst;
cout << sizee << endl;
cudaMalloc((void**)&d_src, sizee);
cudaMalloc((void**)&d_dst, width*height*sizeof(int));
cudaMemcpy(d_src, h_src, sizee, cudaMemcpyHostToDevice);
//launch the kernel
rgb2gray << <(width/16,height/16,1), (16, 16, 1) >> >(d_src, d_dst, width, height);
//force the printf()s to flush
cudaDeviceSynchronize();
// copy back the result array to the CPU
cudaMemcpy(h_dst, d_dst, width*height, cudaMemcpyDeviceToHost);
cudaFree(d_src);
cudaFree(d_dst);
CImgDisplay main_disp(dst, "After Processing");
while (!main_disp.is_closed())
main_disp.wait();
return 0;
}

Firstly, since your dst object consists of unsigned char, allocate d_dst as follows;
cudaMalloc((void**)&d_dst, width*height*sizeof(unsigned char));
Next, grid must cover every pixels, considering cases when width or height are not a multiple of 16. Launch kernel with following kernel configuration.
dim3 blkDim (16, 16, 1);
dim3 grdDim ((width + 15)/16, (height + 15)/16, 1);
rgb2gray<<<grdDim, blkDim>>>(d_src, d_dst, width, height);
Lastly, your kernel should look like this. Note that RGB channels are split in d_src.
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
unsigned char r = d_src[pos_y * width + pos_x];
unsigned char g = d_src[(height + pos_y) * width + pos_x];
unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];
unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5);
unsigned char gray = _gray > 255 ? 255 : _gray;
d_dst[pos_y * width + pos_x] = gray;
You can see the full code here.

Related

How to implement nearest neighbours image resizing algorithm in CUDA?

My main purpose is to load frames from a video with OpenCV, then copy it Nvidia Gpu memory, resize it with a Cuda based nearest neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow()
Unfortunately, I always got segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions.
Below, you can find the main parts of the source code, but here is the repo for the full project:
https://github.com/foxakarmi/imageResize
Main function:
#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>
void *buffers[3];
int main() {
cv::VideoCapture capture;
cv::Mat frame;
capture.open("/p.mp4");
if (!capture.isOpened()) {
std::cout << "can not open" << std::endl;
return -1;
}
capture.read(frame);
CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
buffers[2] = malloc(3 * 640 * 640);
while (capture.read(frame)) {
CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))
cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);
CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))
cv::Mat foo;
foo.data = static_cast<uchar *>(buffers[2]);
cv::imshow("img", foo);
cv::waitKey(1);
}
capture.release();
return 0;
}
The .cu file containing the kernel and a wrapper function:
#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"
__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int channel = 3;
if (i < dst_h && j < dst_w) {
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_h;
dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
}
}
cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
if (!src_img || !dst_img)
return cudaErrorInvalidDevicePointer;
if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
return cudaErrorInvalidValue;
kernelNearestNeighbourResize <<< 3600, 256>>>(
src_img, dst_img, src_w,
src_h, dst_w, dst_h);
return cudaGetLastError();
}
Below you can see a complete working solution.
There are 3 main issues in your code:
The setup for the CUDA grid is incorrect. See an example how to set it in my code below (just an initial working version that you can further improve). See some general info here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize.
See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer for the output cv::Mat foo, without properly initializing it.
So the cv::Mat metadata (rows, cols etc.) were not set and cv::imshow could not show it properly.
In my code it is not required - see below.
Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking if dst_img was already initialized, and if not (since it's the first frame) - initialize it and the CUDA buffers.
Some more notes on the code below:
There's no need to allocate buffer[2] for the host output image.
Instead I initialized the cv::Mat with the proper size and use it's allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than making it assume it is 3.
I passed the step (AKA stride) of the images to the kernel. This will support the case where the images have padding (see about it here: stride and padding of an image).
Code for main:
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>
#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);
int main()
{
cv::VideoCapture capture;
cv::Mat frame;
capture.open("/p.mp4");
if (!capture.isOpened())
{
std::cout << "can not open" << std::endl;
return -1;
}
capture.read(frame);
int src_w = frame.cols;
int src_h = frame.rows;
int src_step = (int)frame.step[0];
int channels = frame.channels();
int data_type = frame.type();
assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U); // assert that it is a uchar image
// Parameters you can change:
int dst_w = 640;
int dst_h = 640;
cv::Mat dst_img(dst_h, dst_w, data_type);
int dst_step = (int)dst_img.step[0];
void * src_dev_buffer;
void * dst_dev_buffer;
CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));
while (capture.read(frame))
{
// assert that the current frame has the same type and dimensions as the first one (should be guaranteed by the video decoder):
assert(frame.cols == src_w);
assert(frame.rows == src_h);
assert((int)frame.step[0] == src_step);
assert(frame.type() == data_type);
CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));
cv::imshow("dst_img", dst_img);
cv::waitKey(1);
}
CUDA_CHECK(cudaFree(src_dev_buffer));
CUDA_CHECK(cudaFree(dst_dev_buffer));
capture.release();
return 0;
}
Code for the CUDA kernel and the wrapping function:
#include "cuda_runtime.h"
__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (i < dst_h && j < dst_w)
{
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_w;
int src_offset = i * dst_step + j * channels;
int dst_offset = iIn * src_step + jIn * channels;
for (int c = 0; c < channels; ++c)
{
dst_img[src_offset + c] = src_img[dst_offset + c];
}
}
}
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
if (!src_img || !dst_img)
return cudaErrorInvalidDevicePointer;
if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
return cudaErrorInvalidValue;
// The grid dimensions
dim3 dimBlock(32, 32);
dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);
kernelNearestNeighbourResize << < dimGrid, dimBlock >> >(
src_img, dst_img, channels,
src_w, src_h, src_step, dst_w, dst_h, dst_step);
return cudaGetLastError();
}

Converting only 1/3 of RGB image to greyscale using CUDA

I am trying to convert RGB image to greyscale using CUDA. I want to read the image with stbi_load, pass it to convertToGreyscale() where I call the kernel and save the image in unsigned char* - from where I can use it and apply variable threshold, sobel, multiple threshold etc. The problem is that only 1/3 of the image is being processed and actually affected by the kernel (the greyscale kernel).
Here is my kernel:
__global__ void greyscale(unsigned char* originalImg, unsigned char* d_greyImg, int width, int height, int channels) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int id = x + y * width;
if (x < width && y < height) {
unsigned char r = originalImg[id];
unsigned char g = originalImg[id + 1];
unsigned char b = originalImg[id + 2];
int offset = (r+g+b) / channels;
for (int i = 0; i < channels; i++) {
d_greyImg[id + i] = offset;
}
}
}
And here is the other part of the code:
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define THREADS 8
void convertToGreyscale(unsigned char* originalImg, unsigned char* greyImg, int width, int height, int channels)
{
unsigned char* d_originalImg = NULL;
unsigned char* d_greyImg = NULL;
int size = width * height * channels * sizeof(unsigned char);
cudaMalloc(&d_originalImg, size);
cudaMalloc(&d_greyImg, size);
cudaMemcpy(d_originalImg, originalImg, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_greyImg, greyImg, size, cudaMemcpyHostToDevice);
dim3 dimBlock(THREADS, THREADS);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);
greyscale << <dimGrid, dimBlock >> > (d_originalImg, d_greyImg, width, height, channels);
cudaMemcpy(greyImg, d_greyImg, size, cudaMemcpyDeviceToHost);
cudaFree(d_originalImg);
}
void sobelFilter(unsigned char* originalImg, unsigned char* sobelImg, int width, int height, int channels)
{
dim3 dimBlock(THREADS, THREADS, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);
unsigned char* d_originalImg = NULL;
int size = width * height;
cudaMalloc(&d_originalImg, size * channels * sizeof(unsigned char));
cudaMemcpy(d_originalImg, originalImg, size * channels * sizeof(unsigned char), cudaMemcpyHostToDevice);
sobel << <dimGrid, dimBlock >> > (d_originalImg, width, height);
cudaMemcpy(sobelImg, d_originalImg, size * channels , cudaMemcpyDeviceToHost);
cudaFree(d_originalImg);
}
int main()
{
// read the image
int width, height, channels;
unsigned char* originalImg = stbi_load("lenna.png", &width, &height, &channels, 0);
size_t img_size = width * height * channels;
unsigned char* greyImg = (unsigned char*) malloc(img_size);
unsigned char* sobelImg = (unsigned char*) malloc(img_size);
convertToGreyscale(originalImg, greyImg, width, height, channels);
stbi_write_jpg("greyscale.png", width, height, channels, greyImg, 100);
/*sobelFilter(originalImg, sobelImg, width, height, channels);
stbi_write_jpg("sobel.png", width, height, channels, sobelImg, 100);*/
return 0;
}
you may want to do dimGrid((width+dimBlock.x-1) / dimBlock.x, (height+dimBlock.y-1) / dimBlock.y); to ensure you don't miss corners.
I think the main issue is that your input image is being read without considering channels.
Your thread will have value x, but also value x+1 (and the same y) at some point. But if you see how you define the id, you'll notice that instead of reading 3 by 3, you are reading 1 by one, because these two threads will have id and id+1 as value. You need to make sure the computation of id has channels into account, but notice that then it will not work for the grryscale, you'd need a different one there (or divide by 3).

Resize image using nearest neighborhood with cuda

I am implementing a nearest neighborhood kernel function to resize the input image. But the result is wrong and I have no idea.
Here is the input image
the result is wrong.
I use opencv to read the input image.
cv::Mat image = cv::imread("/home/tumh/test.jpg");
unsigned char* data = image.data;
int outH, outW;
float *out_data_host = test(data, image.rows, image.cols, outH, outW);
cv::Mat out_image(outH, outW, CV_32FC3);
memcpy(out_image.data, out_data_host, outH * outW * 3 * sizeof(float));
float* test(unsigned char* in_data_host, const int &inH, const int &inW, int &outH, int &outW) {
// get the output size
int im_size_min = std::min(inW, inH);
int im_size_max = std::max(inW, inH);
float scale_factor = static_cast<float>(640) / im_size_min;
float im_scale_x = std::floor(inW * scale_factor / 64) * 64 / inW;
float im_scale_y = std::floor(inH * scale_factor / 64) * 64 / inH;
outW = inW * im_scale_x;
outH = inH * im_scale_y;
int channel = 3;
unsigned char* in_data_dev;
CUDA_CHECK(cudaMalloc(&in_data_dev, sizeof(unsigned char) * channel * inH * inW));
CUDA_CHECK(cudaMemcpy(in_data_dev, in_data_host, 1 * sizeof(unsigned char) * channel * inH * inW, cudaMemcpyHostToDevice));
// image pre process
const float2 scale = make_float2( im_scale_x, im_scale_y);
float * out_buffer = NULL;
CUDA_CHECK(cudaMalloc(&out_buffer, sizeof(float) * channel * outH * outW));
float *out_data_host = new float[sizeof(float) * channel * outH * outW];
const dim3 threads(32, 32);
const dim3 block(iDivUp(outW, threads.x), iDivUp(outW, threads.y));
gpuPreImageNet<<<block, threads>>>(scale, in_data_dev, inW, out_buffer, outW, outH);
CUDA_CHECK(cudaFree(in_data_dev));
CUDA_CHECK(cudaMemcpy(out_data_host, out_buffer, sizeof(float) * channel * outH * outW, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(out_buffer));
return out_data_host;
}
Here is the resize kernel function
__global__ void gpuPreImageNet( float2 scale, unsigned char* input, int iWidth, float* output, int oWidth, int oHeight )
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int n = oWidth * oHeight;
int channel = 3;
if( x >= oWidth || y >= oHeight )
return;
const int dx = ((float)x * scale.x);
const int dy = ((float)y * scale.y);
const unsigned char* px = input + dy * iWidth * channel + dx * channel ;
const float3 bgr = make_float3(*(px + 0), *(px + 1), *(px + 2));
output[channel * y * oWidth + channel * x + 0] = bgr.x;
output[channel * y * oWidth + channel * x + 1] = bgr.y;
output[channel * y * oWidth + channel * x + 2] = bgr.z;
}
Most of the implementation is from https://github.com/soulsheng/ResizeNN/blob/master/resizeCUDA/resizeNN.cu
Any idea?
Maybe you are observing an uninitialized memory problem.
As i understand your code, out_data_host allocation is too big
new float[sizeof(float) * channel * outH * outW];
should be
new float[channel * outH * outW]
Then out_buffer is uninitialized, add a cudaMemset after the cudaMalloc line.
To clarify your code, since you already use OpenCV to load images, why don't you use opencv to resize your images ?
cv::resize // Host side method is probably better since you'll have less data copied through PCI-Express
// or
cv::cuda::resize
It took me around two days to figure out a solution for this problem. Basically, I was building a GPU based image preprocessing pipeline for my project. Here's the custom Cuda Kernel.
For Gray scale Image Resizing, change channel from 3 -> 1 and it should work.
__global__ void resize_kernel( real* pIn, real* pOut, int widthIn, int heightIn, int widthOut, int heightOut)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int channel = 3;
if( i < heightOut && j < widthOut )
{
int iIn = i * heightIn / heightOut;
int jIn = j * widthIn / widthOut;
for(int c = 0; c < channel; c++)
pOut[ (i*widthOut + j)*channel + c ] = pIn[ (iIn*widthIn + jIn)*channel + c ];
}
}

Cuda Memory access error : CudaIllegalAddress , Image Processing(Stereo vision)

I'm using cuda to deal with image proccessing. but my result is always get 'cudaErrorIllegalAddress : an illegal memory access was encountered'
What i did is below.
First, Load converted image(rgb to gray) to device, i use 'cudaMallocPitch' and 'cudaMemcpy2D'
unsigned char *dev_srcleft;
size_t dev_srcleftPitch
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS * sizeof(int), ROWS));
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
COLS, ROWS, cudaMemcpyHostToDevice);
And, Allocating 2D array for store result. the result value is describe as 27bit, so what i'm trying is using 'int' which is 4bytes=32bits, not only for ample size , atomic operation(atomicOr, atomicXor) is needed for performance.
and my device is not supports 64bit atomic operation.
int *dev_leftTrans;
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMemset2D(dev_leftTrans, dev_leftTransPitch, 0, COLS, ROWS);
Memory allocation and memcpy2D works great, and i check by
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D(temp_output.data, temp_output.step, dev_srcleft, dev_srcleftPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
Then, Do kernel code.
__global__ void TestKernel(unsigned char *src, size_t src_pitch,
int *dst, size_t dst_pitch,
unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
}
dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
Parameter COLS and ROWS is size of image.
I think the error occurs here : TestKerenl.
src_val, reading from global memory works good but when i'm trying to access dst, it blows up with cudaErrorIllegalAddress
I don't know what is wrong, and i sufferd for 4 days. please help me
below is my full code
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <cuda_device_runtime_api.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <iostream>
#include <opencv2\opencv.hpp>
#include<string>
#define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
static void HandleError(cudaError_t err, const char*file, int line)
{
if (err != cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
using namespace std;
using namespace cv;
string imagePath = "Ted";
string imagePathL = imagePath + "imL.png";
string imagePathR = imagePath + "imR.png";
__global__ void TestKernel(unsigned char*src, size_t src_pitch,
int *dst, size_t dst_pitch,
unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((COLS< x) && (ROWS < y)) return;
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
}
int main(void)
{
//Print_DeviceProperty();
//Left Image Load
Mat host_srcImgL = imread(imagePathL, CV_LOAD_IMAGE_UNCHANGED);
if (host_srcImgL.empty()){ cout << "Left Image Load Fail!" << endl; return; }
Mat host_srcConvertL;
cvtColor(host_srcImgL, host_srcConvertL, CV_BGR2GRAY);
//Right Image Load
Mat host_srcImgR = imread(imagePathR, CV_LOAD_IMAGE_UNCHANGED);
if (host_srcImgL.empty()){ cout << "Right Image Load Fail!" << endl; return; }
Mat host_srcConvertR;
cvtColor(host_srcImgR, host_srcConvertR, CV_BGR2GRAY);
//Create parameters
unsigned int COLS = host_srcConvertL.cols;
unsigned int ROWS = host_srcConvertR.rows;
unsigned int SIZE = COLS * ROWS;
imshow("Left source image", host_srcConvertL);
imshow("Right source image", host_srcConvertR);
unsigned char *dev_srcleft, *dev_srcright, *dev_disp;
int *dev_leftTrans, *dev_rightTrans;
size_t dev_srcleftPitch, dev_srcrightPitch, dev_dispPitch, dev_leftTransPitch, dev_rightTransPitch;
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_srcright, &dev_srcrightPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_disp, &dev_dispPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMallocPitch((void**)&dev_rightTrans, &dev_rightTransPitch, COLS * sizeof(int), ROWS);
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
COLS, ROWS, cudaMemcpyHostToDevice);
cudaMemcpy2D(dev_srcright, dev_srcrightPitch, host_srcConvertR.data, host_srcConvertR.step,
COLS, ROWS, cudaMemcpyHostToDevice);
cudaMemset(dev_disp, 255, dev_dispPitch * ROWS);
dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
cudaEvent_t start, stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
/*TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcright, dev_srcrightPitch, dev_rightTrans, dev_rightTransPitch, COLS, ROWS);*/
cudaThreadSynchronize();
cudaError_t res = cudaGetLastError();
if (res != cudaSuccess)
printf("%s : %s\n", cudaGetErrorName(res), cudaGetErrorString(res));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << elapsedtime << "msec" << endl;
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D((int*)temp_output.data, temp_output.step, dev_leftTrans, dev_leftTransPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
waitKey(0);
return 0;
}
And this is my environment vs2013, cuda v6.5
Device' property's below
Major revision number: 3
Minor revision number: 0
Name: GeForce GTX 760 (192-bit)
Total global memory: 1610612736
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 888500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 6
Kernel execution timeout: Yes
One problem is that your kernel doesn't do any thread-checking.
When you define a grid of blocks like this:
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
you will often be launching extra blocks. The reason is that if COLS or ROW is not evenly divisible by the block dimensions (3 in this case) then you will get extra blocks to cover the remainder in each case.
These extra blocks will have some threads that are doing useful work, and some that will access out-of-bounds. To protect against this, it's customary to put a thread-check in your kernel to prevent out-of-bounds accesses:
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < COLS) && (y < ROWS)) { // add this
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
} // add this
This means that only the threads that have a valid (in-bounds) x and y will actually do any accesses.
As an aside, (3,3) may not be a particularly good choice of block dimensions for performance reasons. It's usually a good idea to create block dimensions whose product is a multiple of 32, so (32,4) or (16,16) might be examples of better choices.
Another problem in your code is pitch usage for dst array.
Pitch is always in bytes, so first you need to cast dst pointer to char*, calculate row offset and then cast it back to int*:
int* dst_row = (int*)(((char*)dst) + y * dst_pitch);
dst_row[x] = src_val;

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and 2D grid to make the problem natural. Here's how I index: (height and width are image dimensions)
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want to return four boundaries (i'll cater them later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine on all standard images sizes. But for some random image sizes it shows diagonal line(s). For example in my case 500x333 image (even when no dimension is multiple of 16) is showing correct output whereas 450x365 is showing diagonal lines in the output. The problem remains even if I just return the extra threads of grid and nothing else like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same, some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES Thanks!
Update:
Kernel Code (Simplified to return input image, but gives the same problem)
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
/*Filter code removed, still gives the same problem*/
out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
/*Kernel Code*/
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say it failing (450 x 364 with filterSize=5):
#include <stdio.h>
#include <assert.h>
template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
unsigned int N = filterSize/2;
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
out[tid] = in[tid];
}
int main(void)
{
const int width = 450, height = 365, filterSize=5;
const size_t isize = sizeof(unsigned char) * size_t(width * height);
unsigned char * _in, * _out, * out;
assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
assert( cudaMemset(_out, 'A', isize) == cudaSuccess );
const dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
assert( cudaPeekAtLastError() == cudaSuccess );
out = (unsigned char *)malloc(isize);
assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<width; i++) {
fprintf(stdout, "%d: ", i);
for(int j=0; j<height; j++) {
unsigned int idx = i + j*width;
fprintf(stdout, "%c", out[idx]);
}
fprintf(stdout, "\n");
}
return cudaThreadExit();
}
When run it does exactly what I would expect, overwriting the output memory with the input everywhere except for the first and last two lines and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute 1.2 GPU. So whatever is happening in you code, it isn't happening in my repro case, which either means I have misinterpreted what you have written, or there is something else you haven't described which is causing the problem.

Resources