Converting only 1/3 of RGB image to greyscale using CUDA - image-processing

I am trying to convert an RGB image to greyscale using CUDA. I want to read the image with stbi_load, pass it to convertToGreyscale(), where I call the kernel, and save the result in an unsigned char* buffer, from which I can then apply a variable threshold, Sobel, multiple thresholds, etc. The problem is that only 1/3 of the image is being processed and actually affected by the kernel (the greyscale kernel).
Here is my kernel:
__global__ void greyscale(unsigned char* originalImg, unsigned char* d_greyImg, int width, int height, int channels) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int id = x + y * width;
    if (x < width && y < height) {
        unsigned char r = originalImg[id];
        unsigned char g = originalImg[id + 1];
        unsigned char b = originalImg[id + 2];
        int offset = (r + g + b) / channels;
        for (int i = 0; i < channels; i++) {
            d_greyImg[id + i] = offset;
        }
    }
}
And here is the other part of the code:
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define THREADS 8
void convertToGreyscale(unsigned char* originalImg, unsigned char* greyImg, int width, int height, int channels)
{
    unsigned char* d_originalImg = NULL;
    unsigned char* d_greyImg = NULL;
    int size = width * height * channels * sizeof(unsigned char);
    cudaMalloc(&d_originalImg, size);
    cudaMalloc(&d_greyImg, size);
    cudaMemcpy(d_originalImg, originalImg, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_greyImg, greyImg, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(THREADS, THREADS);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);
    greyscale<<<dimGrid, dimBlock>>>(d_originalImg, d_greyImg, width, height, channels);
    cudaMemcpy(greyImg, d_greyImg, size, cudaMemcpyDeviceToHost);
    cudaFree(d_originalImg);
}
void sobelFilter(unsigned char* originalImg, unsigned char* sobelImg, int width, int height, int channels)
{
    dim3 dimBlock(THREADS, THREADS, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);
    unsigned char* d_originalImg = NULL;
    int size = width * height;
    cudaMalloc(&d_originalImg, size * channels * sizeof(unsigned char));
    cudaMemcpy(d_originalImg, originalImg, size * channels * sizeof(unsigned char), cudaMemcpyHostToDevice);
    sobel<<<dimGrid, dimBlock>>>(d_originalImg, width, height);
    cudaMemcpy(sobelImg, d_originalImg, size * channels, cudaMemcpyDeviceToHost);
    cudaFree(d_originalImg);
}
int main()
{
    // read the image
    int width, height, channels;
    unsigned char* originalImg = stbi_load("lenna.png", &width, &height, &channels, 0);
    size_t img_size = width * height * channels;
    unsigned char* greyImg = (unsigned char*) malloc(img_size);
    unsigned char* sobelImg = (unsigned char*) malloc(img_size);
    convertToGreyscale(originalImg, greyImg, width, height, channels);
    stbi_write_jpg("greyscale.png", width, height, channels, greyImg, 100);
    /*sobelFilter(originalImg, sobelImg, width, height, channels);
    stbi_write_jpg("sobel.png", width, height, channels, sobelImg, 100);*/
    return 0;
}

You may want to use dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y); to make sure you don't miss pixels at the edges when width or height isn't a multiple of the block size.
I think the main issue is that your input image is being read without taking the channels into account.
One thread will have value x, and another will have value x + 1 (with the same y). But if you look at how you define id, you'll notice that instead of reading the image 3 bytes at a time (one pixel), you are reading it 1 byte at a time, because those two threads will end up with id and id + 1. You need to make sure the computation of id takes channels into account, but notice that then it will not work for the greyscale as-is; you'd need a different one there (or divide by 3).
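For illustration, here is a sketch of how the kernel could look with channel-aware indexing (my reading of the fix, assuming the interleaved layout returned by stbi_load; not tested against your exact setup):

__global__ void greyscale(unsigned char* originalImg, unsigned char* d_greyImg,
                          int width, int height, int channels) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < width && y < height) {
        // channels bytes per pixel, so scale the linear pixel index by channels
        unsigned int id = (x + y * width) * channels;
        unsigned char r = originalImg[id];
        unsigned char g = originalImg[id + 1];
        unsigned char b = originalImg[id + 2];
        unsigned char grey = (r + g + b) / 3;
        // keep the output interleaved, same layout as the input
        for (int i = 0; i < channels; i++) {
            d_greyImg[id + i] = grey;
        }
    }
}

Combined with the rounded-up dimGrid from the comment above, every pixel gets processed instead of only the first third of the buffer.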

Related

How to implement nearest neighbours image resizing algorithm in CUDA?

My main purpose is to load frames from a video with OpenCV, copy them to Nvidia GPU memory, resize them with a CUDA-based nearest neighbour algorithm, then copy the result back to the host side and visualise it with cv::imshow().
Unfortunately, I always get segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions.
Below, you can find the main parts of the source code, but here is the repo for the full project:
https://github.com/foxakarmi/imageResize
Main function:
#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>
void *buffers[3];
int main() {
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened()) {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);
    CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
    CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
    buffers[2] = malloc(3 * 640 * 640);
    while (capture.read(frame)) {
        CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))
        cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);
        CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))
        cv::Mat foo;
        foo.data = static_cast<uchar *>(buffers[2]);
        cv::imshow("img", foo);
        cv::waitKey(1);
    }
    capture.release();
    return 0;
}
The .cu file containing the kernel and a wrapper function:
#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"
__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int channel = 3;
    if (i < dst_h && j < dst_w) {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_h;
        dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
        dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
        dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
    }
}
cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;
    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;
    kernelNearestNeighbourResize<<<3600, 256>>>(
        src_img, dst_img, src_w,
        src_h, dst_w, dst_h);
    return cudaGetLastError();
}
Below you can see a complete working solution.
There are 3 main issues in your code:
The setup for the CUDA grid is incorrect. See an example of how to set it in my code below (just an initial working version that you can further improve). See some general info here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize.
See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer for the output cv::Mat foo, without properly initializing it.
So the cv::Mat metadata (rows, cols etc.) were not set and cv::imshow could not show it properly.
In my code it is not required - see below.
Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking whether dst_img has already been initialized, and if not (since it's the first frame), initializing it and the CUDA buffers at that point.
Some more notes on the code below:
There's no need to allocate buffers[2] for the host output image.
Instead I initialized the cv::Mat with the proper size and used its allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than making it assume it is 3.
I passed the step (AKA stride) of the images to the kernel. This will support the case where the images have padding (see about it here: stride and padding of an image).
Code for main:
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>
#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);
int main()
{
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened())
    {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);
    int src_w = frame.cols;
    int src_h = frame.rows;
    int src_step = (int)frame.step[0];
    int channels = frame.channels();
    int data_type = frame.type();
    assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U); // assert that it is a uchar image
    // Parameters you can change:
    int dst_w = 640;
    int dst_h = 640;
    cv::Mat dst_img(dst_h, dst_w, data_type);
    int dst_step = (int)dst_img.step[0];
    void * src_dev_buffer;
    void * dst_dev_buffer;
    CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
    CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));
    while (capture.read(frame))
    {
        // assert that the current frame has the same type and dimensions as the first one (should be guaranteed by the video decoder):
        assert(frame.cols == src_w);
        assert(frame.rows == src_h);
        assert((int)frame.step[0] == src_step);
        assert(frame.type() == data_type);
        CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
        CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));
        cv::imshow("dst_img", dst_img);
        cv::waitKey(1);
    }
    CUDA_CHECK(cudaFree(src_dev_buffer));
    CUDA_CHECK(cudaFree(dst_dev_buffer));
    capture.release();
    return 0;
}
Code for the CUDA kernel and the wrapping function:
#include "cuda_runtime.h"
__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < dst_h && j < dst_w)
    {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_w;
        // step (pitch) is in bytes per row; both images are unsigned char
        int dst_offset = i * dst_step + j * channels;
        int src_offset = iIn * src_step + jIn * channels;
        for (int c = 0; c < channels; ++c)
        {
            dst_img[dst_offset + c] = src_img[src_offset + c];
        }
    }
}
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;
    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;
    // The grid dimensions
    dim3 dimBlock(32, 32);
    dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);
    kernelNearestNeighbourResize<<<dimGrid, dimBlock>>>(
        src_img, dst_img, channels,
        src_w, src_h, src_step, dst_w, dst_h, dst_step);
    return cudaGetLastError();
}

RGB2GRAY with CUDA and CImg library

I need to implement an RGB2GRAY image processing algorithm. I just need some help completing the global function, or with how I can access the d_src pointer. This is my code; your help will be greatly appreciated.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CImg.h"
#include <iostream>
using namespace std;
using namespace cimg_library;
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
    int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (pos_x >= width || pos_y >= height)
        return;
}
int main(){
    //Load image
    CImg<unsigned char> src("lena.jpg");
    int width = src.width();
    int height = src.height();
    unsigned long sizee = src.size();
    int sze = width * height;
    cout << sze << endl;
    //create pointer to image
    unsigned char *h_src = src.data();
    CImg<unsigned char> dst(width, height, 1, 1);
    unsigned char *h_dst = dst.data();
    unsigned char *d_src;
    unsigned char *d_dst;
    cout << sizee << endl;
    cudaMalloc((void**)&d_src, sizee);
    cudaMalloc((void**)&d_dst, width*height*sizeof(int));
    cudaMemcpy(d_src, h_src, sizee, cudaMemcpyHostToDevice);
    //launch the kernel
    rgb2gray<<<(width/16,height/16,1), (16, 16, 1)>>>(d_src, d_dst, width, height);
    //force the printf()s to flush
    cudaDeviceSynchronize();
    // copy back the result array to the CPU
    cudaMemcpy(h_dst, d_dst, width*height, cudaMemcpyDeviceToHost);
    cudaFree(d_src);
    cudaFree(d_dst);
    CImgDisplay main_disp(dst, "After Processing");
    while (!main_disp.is_closed())
        main_disp.wait();
    return 0;
}
Firstly, since your dst object consists of unsigned char, allocate d_dst as follows:
cudaMalloc((void**)&d_dst, width*height*sizeof(unsigned char));
Next, the grid must cover every pixel, taking into account cases where width or height is not a multiple of 16. Launch the kernel with the following configuration:
dim3 blkDim (16, 16, 1);
dim3 grdDim ((width + 15)/16, (height + 15)/16, 1);
rgb2gray<<<grdDim, blkDim>>>(d_src, d_dst, width, height);
Lastly, your kernel should look like this. Note that the RGB channels are stored as separate planes in d_src.
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
unsigned char r = d_src[pos_y * width + pos_x];
unsigned char g = d_src[(height + pos_y) * width + pos_x];
unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];
unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5);
unsigned char gray = _gray > 255 ? 255 : _gray;
d_dst[pos_y * width + pos_x] = gray;
You can see the full code here.
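For reference, here is that kernel body assembled into the original function signature (just a sketch based on the snippet above; the full code behind the link may differ in details):

__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
    int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (pos_x >= width || pos_y >= height)
        return;
    // CImg stores the image as separate planes: all R values, then all G, then all B
    unsigned char r = d_src[pos_y * width + pos_x];
    unsigned char g = d_src[(height + pos_y) * width + pos_x];
    unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];
    unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5f);
    unsigned char gray = _gray > 255 ? 255 : _gray;
    d_dst[pos_y * width + pos_x] = gray;
}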

Cuda Memory access error : CudaIllegalAddress , Image Processing(Stereo vision)

I'm using CUDA for image processing, but my result always gives 'cudaErrorIllegalAddress: an illegal memory access was encountered'.
What I did is below.
First, I load the converted image (RGB to gray) to the device; I use 'cudaMallocPitch' and 'cudaMemcpy2D':
unsigned char *dev_srcleft;
size_t dev_srcleftPitch;
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS * sizeof(int), ROWS);
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
    COLS, ROWS, cudaMemcpyHostToDevice);
Then I allocate a 2D array to store the result. The result value is described with 27 bits, so I'm using 'int', which is 4 bytes = 32 bits: not only for ample size, but also because atomic operations (atomicOr, atomicXor) are needed for performance,
and my device does not support 64-bit atomic operations.
int *dev_leftTrans;
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMemset2D(dev_leftTrans, dev_leftTransPitch, 0, COLS, ROWS);
Memory allocation and memcpy2D work fine, and I check it by:
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D(temp_output.data, temp_output.step, dev_srcleft, dev_srcleftPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
Then I run the kernel code:
__global__ void TestKernel(unsigned char *src, size_t src_pitch,
                           int *dst, size_t dst_pitch,
                           unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned char src_val = src[x + y * src_pitch];
    dst[x + y * dst_pitch] = src_val;
}
dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
TestKernel<<<dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char)>>>
    (dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
The parameters COLS and ROWS are the size of the image.
I think the error occurs here: TestKernel.
src_val, read from global memory, works fine, but when I try to access dst, it blows up with cudaErrorIllegalAddress.
I don't know what is wrong, and I have suffered with this for 4 days. Please help me.
Below is my full code:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <cuda_device_runtime_api.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <iostream>
#include <opencv2\opencv.hpp>
#include<string>
#define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
static void HandleError(cudaError_t err, const char*file, int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
using namespace std;
using namespace cv;
string imagePath = "Ted";
string imagePathL = imagePath + "imL.png";
string imagePathR = imagePath + "imR.png";
__global__ void TestKernel(unsigned char*src, size_t src_pitch,
                           int *dst, size_t dst_pitch,
                           unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((COLS < x) && (ROWS < y)) return;
    unsigned char src_val = src[x + y * src_pitch];
    dst[x + y * dst_pitch] = src_val;
}
int main(void)
{
    //Print_DeviceProperty();
    //Left Image Load
    Mat host_srcImgL = imread(imagePathL, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgL.empty()){ cout << "Left Image Load Fail!" << endl; return; }
    Mat host_srcConvertL;
    cvtColor(host_srcImgL, host_srcConvertL, CV_BGR2GRAY);
    //Right Image Load
    Mat host_srcImgR = imread(imagePathR, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgL.empty()){ cout << "Right Image Load Fail!" << endl; return; }
    Mat host_srcConvertR;
    cvtColor(host_srcImgR, host_srcConvertR, CV_BGR2GRAY);
    //Create parameters
    unsigned int COLS = host_srcConvertL.cols;
    unsigned int ROWS = host_srcConvertR.rows;
    unsigned int SIZE = COLS * ROWS;
    imshow("Left source image", host_srcConvertL);
    imshow("Right source image", host_srcConvertR);
    unsigned char *dev_srcleft, *dev_srcright, *dev_disp;
    int *dev_leftTrans, *dev_rightTrans;
    size_t dev_srcleftPitch, dev_srcrightPitch, dev_dispPitch, dev_leftTransPitch, dev_rightTransPitch;
    cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_srcright, &dev_srcrightPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_disp, &dev_dispPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
    cudaMallocPitch((void**)&dev_rightTrans, &dev_rightTransPitch, COLS * sizeof(int), ROWS);
    cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
        COLS, ROWS, cudaMemcpyHostToDevice);
    cudaMemcpy2D(dev_srcright, dev_srcrightPitch, host_srcConvertR.data, host_srcConvertR.step,
        COLS, ROWS, cudaMemcpyHostToDevice);
    cudaMemset(dev_disp, 255, dev_dispPitch * ROWS);
    dim3 dimblock(3, 3);
    dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
    cudaEvent_t start, stop;
    float elapsedtime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    TestKernel<<<dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char)>>>
        (dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
    /*TestKernel<<<dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char)>>>
        (dev_srcright, dev_srcrightPitch, dev_rightTrans, dev_rightTransPitch, COLS, ROWS);*/
    cudaThreadSynchronize();
    cudaError_t res = cudaGetLastError();
    if (res != cudaSuccess)
        printf("%s : %s\n", cudaGetErrorName(res), cudaGetErrorString(res));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedtime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << elapsedtime << "msec" << endl;
    Mat temp_output(ROWS, COLS, 0);
    cudaMemcpy2D((int*)temp_output.data, temp_output.step, dev_leftTrans, dev_leftTransPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
    imshow("temp", temp_output);
    waitKey(0);
    return 0;
}
My environment is VS2013 and CUDA v6.5.
The device's properties are below:
Major revision number: 3
Minor revision number: 0
Name: GeForce GTX 760 (192-bit)
Total global memory: 1610612736
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 888500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 6
Kernel execution timeout: Yes
One problem is that your kernel doesn't do any thread-checking.
When you define a grid of blocks like this:
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
you will often be launching extra blocks. The reason is that if COLS or ROWS is not evenly divisible by the block dimensions (3 in this case), then you will get extra blocks to cover the remainder in each case.
These extra blocks will have some threads that are doing useful work, and some that will access out-of-bounds. To protect against this, it's customary to put a thread-check in your kernel to prevent out-of-bounds accesses:
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < COLS) && (y < ROWS)) { // add this
    unsigned char src_val = src[x + y * src_pitch];
    dst[x + y * dst_pitch] = src_val;
} // add this
This means that only the threads that have a valid (in-bounds) x and y will actually do any accesses.
As an aside, (3,3) may not be a particularly good choice of block dimensions for performance reasons. It's usually a good idea to create block dimensions whose product is a multiple of 32, so (32,4) or (16,16) might be examples of better choices.
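For example, a launch configuration along those lines (hypothetical values, using the usual round-up for the grid) might be:

dim3 dimblock(32, 4);
dim3 dimGrid((COLS + dimblock.x - 1) / dimblock.x,
             (ROWS + dimblock.y - 1) / dimblock.y);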
Another problem in your code is pitch usage for dst array.
Pitch is always in bytes, so first you need to cast dst pointer to char*, calculate row offset and then cast it back to int*:
int* dst_row = (int*)(((char*)dst) + y * dst_pitch);
dst_row[x] = src_val;
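Putting the bounds check and the byte-pitch arithmetic together, a corrected TestKernel could look roughly like this (a sketch, not tested against your full program):

__global__ void TestKernel(unsigned char *src, size_t src_pitch,
                           int *dst, size_t dst_pitch,
                           unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x < COLS) && (y < ROWS))
    {
        // src_pitch is in bytes, and src is unsigned char, so it can be used directly
        unsigned char src_val = src[x + y * src_pitch];
        // dst_pitch is also in bytes, so advance the row pointer on a char* first
        int *dst_row = (int *)(((char *)dst) + y * dst_pitch);
        dst_row[x] = src_val;
    }
}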

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and 2D grid to make the problem natural. Here's how I index: (height and width are image dimensions)
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want the kernel to return for the four boundaries (I'll deal with them later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine for all standard image sizes. But for some random image sizes it shows diagonal line(s). For example, in my case a 500x333 image (even though no dimension is a multiple of 16) shows correct output, whereas a 450x365 image shows diagonal lines in the output. The problem remains even if I just return for the extra threads of the grid and nothing else, like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same; some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES. Thanks!
Update:
Kernel Code (Simplified to return input image, but gives the same problem)
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
    unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
    unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
    unsigned int tid = yIndex * width + xIndex;
    unsigned int N = filterSize/2;
    if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
        return;
    /*Filter code removed, still gives the same problem*/
    out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
/*Kernel Code*/
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say is failing (450 x 365 with filterSize=5):
#include <stdio.h>
#include <assert.h>
template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
    unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int tid = yIndex * width + xIndex;
    unsigned int N = filterSize/2;
    if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
        return;
    out[tid] = in[tid];
}
int main(void)
{
    const int width = 450, height = 365, filterSize=5;
    const size_t isize = sizeof(unsigned char) * size_t(width * height);
    unsigned char * _in, * _out, * out;
    assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
    assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
    assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
    assert( cudaMemset(_out, 'A', isize) == cudaSuccess );
    const dim3 BlockDim(16,16);
    dim3 GridDim;
    GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
    GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
    filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
    assert( cudaPeekAtLastError() == cudaSuccess );
    out = (unsigned char *)malloc(isize);
    assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess);
    for(int i=0; i<width; i++) {
        fprintf(stdout, "%d: ", i);
        for(int j=0; j<height; j++) {
            unsigned int idx = i + j*width;
            fprintf(stdout, "%c", out[idx]);
        }
        fprintf(stdout, "\n");
    }
    return cudaThreadExit();
}
When run, it does exactly what I would expect: it overwrites the output memory with the input everywhere except for the first and last two lines, and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute 1.2 GPU. So whatever is happening in your code, it isn't happening in my repro case, which either means I have misinterpreted what you have written, or there is something else you haven't described that is causing the problem.

colored image to greyscale image using CUDA parallel processing

I am trying to solve a problem in which I am supposed to convert a colour image to a greyscale image. For this purpose I am using a CUDA parallel approach. The kernel code I am invoking on the GPU is as follows.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
    int absolute_image_position_x = blockIdx.x;
    int absolute_image_position_y = blockIdx.y;
    if ( absolute_image_position_x >= numCols ||
         absolute_image_position_y >= numRows )
    {
        return;
    }
    uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
    float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
    greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
                            uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage,
                            size_t numRows,
                            size_t numCols)
{
    //You must fill in the correct sizes for the blockSize and gridSize
    //currently only one block with one thread is being launched
    const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
    const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage,
                                               d_greyImage,
                                               numRows,
                                               numCols);
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
I see a line of dots in the first pixel line.
The error I am getting is:
libdc1394 error: Failed to initialize libdc1394
Difference at pos 51 exceeds tolerance of 5
Reference: 255
GPU : 0
my input/output images
Can anyone help me with this? Thanks in advance.
I recently joined this course and tried your solution, but it didn't work, so I tried my own. You are almost correct. The correct solution is this:
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
    int pos_x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int pos_y = (blockIdx.y * blockDim.y) + threadIdx.y;
    if(pos_x >= numCols || pos_y >= numRows)
        return;
    uchar4 rgba = rgbaImage[pos_x + pos_y * numCols];
    greyImage[pos_x + pos_y * numCols] = (.299f * rgba.x + .587f * rgba.y + .114f * rgba.z);
}
The rest is the same as your code.
Now, since I posted this question I have been continuously working on this problem, and there are a couple of improvements that should be made in order to get it right; I now realize my initial solution was wrong. The changes to be made are:
1. absolute_position_x =(blockIdx.x * blockDim.x) + threadIdx.x;
2. absolute_position_y = (blockIdx.y * blockDim.y) + threadIdx.y;
Secondly,
1. const dim3 blockSize(24, 24, 1);
2. const dim3 gridSize((numCols/16), (numRows/16) , 1);
In this solution we are using a grid of numCols/16 * numRows/16 blocks
and a block size of 24 * 24.
The code executed in 0.040576 ms.
@datenwolf: thanks for answering above!
Since you don't know the image size, it is best to choose any reasonable dimension for the two-dimensional block of threads and then check for two conditions. The first one is that the pos_x and pos_y indexes in the kernel do not exceed numRows and numCols. Secondly, the grid size should be such that the total number of threads in all the blocks is just above the total number of pixels.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize((numCols%16) ? numCols/16+1 : numCols/16,
(numRows%16) ? numRows/16+1 : numRows/16, 1);
libdc1394 error: Failed to initialize libdc1394
I don't think that this is a CUDA problem. libdc1394 is a library used to access IEEE 1394 (aka FireWire, aka i.Link) video devices (DV camcorders, the Apple iSight camera). That library doesn't initialize properly, hence you're not getting useful results. Basically it's NINO: Nonsense In, Nonsense Out.
The calculation of the absolute x & y image positions is perfect.
But when you need to access that particular pixel in the coloured image, shouldn't you use the following code?
uchar4 rgba = rgbaImage[absolute_image_position_x + (absolute_image_position_y * numCols)];
I thought so, when comparing it to the code you'd write to solve the same problem serially.
Please let me know :)
You will still have a problem at run time: the conversion will not give a proper result.
The lines:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
should be changed to:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y*numCols];
greyImage[absolute_image_position_x + absolute_image_position_y*numCols] = channelSum;
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
    int rgba_x = blockIdx.x * blockDim.x + threadIdx.x;
    int rgba_y = blockIdx.y * blockDim.y + threadIdx.y;
    int pixel_pos = rgba_x + rgba_y*numCols;
    uchar4 rgba = rgbaImage[pixel_pos];
    unsigned char gray = (unsigned char)(0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z);
    greyImage[pixel_pos] = gray;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
    //You must fill in the correct sizes for the blockSize and gridSize
    //currently only one block with one thread is being launched
    const dim3 blockSize(24, 24, 1); //TODO
    const dim3 gridSize( numCols/24+1, numRows/24+1, 1); //TODO
    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
The libdc1394 error is not related to FireWire etc. in this case; it is from the library that Udacity is using to compare the image your program creates to the reference image. What it is saying is that the difference between your image and the reference image has exceeded a specific threshold for that position, i.e. pixel.
You are running the following number of blocks and grids:
const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
yet you are not using any threads in your kernel code!
int absolute_image_position_x = blockIdx.x;
int absolute_image_position_y = blockIdx.y;
Think of it this way: the width of an image can be divided into absolute_image_position_x parts of columns, and the height of an image can be divided into absolute_image_position_y parts of rows. Now, for each of the boxes this cross-section creates, you need to change/redraw all the pixels in terms of greyImage, in parallel. Enough spoilers for an assignment :)
The same code, with the ability to handle non-standard input size images:
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
    int idx = blockDim.x*blockIdx.x + threadIdx.x;
    int idy = blockDim.y*blockIdx.y + threadIdx.y;
    uchar4 rgbcell = rgbaImage[idx*numCols + idy];
    greyImage[idx*numCols + idy] = 0.299*rgbcell.x + 0.587*rgbcell.y + 0.114*rgbcell.z;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
    //You must fill in the correct sizes for the blockSize and gridSize
    //currently only one block with one thread is being launched
    int totalpixels = numRows*numCols;
    int factors[] = {2,4,8,16,24,32};
    vector<int> numbers(factors, factors + sizeof(factors)/sizeof(int));
    int factor = 1;
    while(!numbers.empty())
    {
        if(totalpixels % numbers.back() == 0)
        {
            factor = numbers.back();
            break;
        }
        else
        {
            numbers.pop_back();
        }
    }
    const dim3 blockSize(factor, factor, 1); //TODO
    const dim3 gridSize(numRows/factor+1, numCols/factor+1, 1); //TODO
    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
}
1- int x =(blockIdx.x * blockDim.x) + threadIdx.x;
2- int y = (blockIdx.y * blockDim.y) + threadIdx.y;
And for the grid and block size:
1- const dim3 blockSize(32, 32, 1);
2- const dim3 gridSize((numCols/32+1), (numRows/32+1) , 1);
Code executed in 0.036992 ms.
const dim3 blockSize(16, 16, 1); //TODO
const dim3 gridSize( (numRows+15)/16, (numCols+15)/16, 1); //TODO
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
uchar4 rgba = rgbaImage[y*numRows + x];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[y*numRows + x] = channelSum;
