How to implement a nearest-neighbour image resizing algorithm in CUDA? - opencv

My main purpose is to load frames from a video with OpenCV, copy each frame to NVIDIA GPU memory, resize it with a CUDA-based nearest-neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow().
Unfortunately, I always get a segmentation fault. The problem could be with how I define the number of bytes to be copied, or with the data conversions.
Below, you can find the main parts of the source code, but here is the repo for the full project:
https://github.com/foxakarmi/imageResize
Main function:
#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>

void *buffers[3];

int main() {
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened()) {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
    CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
    buffers[2] = malloc(3 * 640 * 640);

    while (capture.read(frame)) {
        CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))
        cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);
        CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))

        cv::Mat foo;
        foo.data = static_cast<uchar *>(buffers[2]);
        cv::imshow("img", foo);
        cv::waitKey(1);
    }
    capture.release();
    return 0;
}
The .cu file containing the kernel and a wrapper function:
#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"

__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int channel = 3;

    if (i < dst_h && j < dst_w) {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_h;
        dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
        dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
        dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
    }
}

cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;
    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    kernelNearestNeighbourResize <<< 3600, 256 >>>(
            src_img, dst_img, src_w,
            src_h, dst_w, dst_h);

    return cudaGetLastError();
}

Below you can see a complete working solution.
There are 3 main issues in your code:
The setup for the CUDA grid is incorrect. See an example of how to set it in my code below (just an initial working version that you can further improve). Some general info is here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize.
See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer of the output cv::Mat foo, without properly initializing it.
So the cv::Mat metadata (rows, cols, type etc.) was never set, and cv::imshow could not display it properly.
In my code this is not required - see below (a minimal fix for your version is also sketched after this list).
Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking if dst_img was already initialized, and if not (since it's the first frame) - initialize it and the CUDA buffers.
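For reference, a minimal fix for your original code would be to construct the cv::Mat with its size and type while wrapping the existing host buffer; this cv::Mat constructor does not copy or take ownership of the data, so buffers[2] must outlive foo and still be freed by you:

    // Give the Mat its metadata (rows, cols, type) and point it at the
    // already-allocated 640x640 BGR host buffer, without copying it:
    cv::Mat foo(640, 640, CV_8UC3, buffers[2]);
    cv::imshow("img", foo);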
Some more notes on the code below:
There's no need to allocate buffers[2] for the host output image.
Instead, I initialized the cv::Mat with the proper size and used its allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than having it assume 3.
I passed the step (a.k.a. stride) of the images to the kernel. This supports the case where the images have padding (see: stride and padding of an image), as illustrated in the sketch below.
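For instance, with a padded layout the byte offset of channel c of pixel (row, col) must be computed from the step rather than from the width; a minimal illustration (row, col, step and channels named as in the kernel below):

    // step is the number of bytes per image row, and may exceed
    // width * channels when rows are padded for alignment:
    int offset = row * step + col * channels + c;    // correct with or without padding
    // (row * width + col) * channels + c            // only valid when step == width * channels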
Code for main:
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>

#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }

cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
                              int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);

int main()
{
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened())
    {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    int src_w = frame.cols;
    int src_h = frame.rows;
    int src_step = (int)frame.step[0];
    int channels = frame.channels();
    int data_type = frame.type();
    assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U);   // assert that it is a uchar image

    // Parameters you can change:
    int dst_w = 640;
    int dst_h = 640;

    cv::Mat dst_img(dst_h, dst_w, data_type);
    int dst_step = (int)dst_img.step[0];

    void * src_dev_buffer;
    void * dst_dev_buffer;
    CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
    CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));

    while (capture.read(frame))
    {
        // Assert that the current frame has the same type and dimensions as the first one
        // (should be guaranteed by the video decoder):
        assert(frame.cols == src_w);
        assert(frame.rows == src_h);
        assert((int)frame.step[0] == src_step);
        assert(frame.type() == data_type);

        CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
        CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));

        cv::imshow("dst_img", dst_img);
        cv::waitKey(1);
    }

    CUDA_CHECK(cudaFree(src_dev_buffer));
    CUDA_CHECK(cudaFree(dst_dev_buffer));
    capture.release();
    return 0;
}
Code for the CUDA kernel and the wrapping function:
#include "cuda_runtime.h"
__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (i < dst_h && j < dst_w)
{
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_w;
int src_offset = i * dst_step + j * channels;
int dst_offset = iIn * src_step + jIn * channels;
for (int c = 0; c < channels; ++c)
{
dst_img[src_offset + c] = src_img[dst_offset + c];
}
}
}
cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
                              int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;
    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    // The grid dimensions: enough 32x32 blocks to cover the destination image
    // (the kernel's bounds check discards the excess threads).
    dim3 dimBlock(32, 32);
    dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);
    kernelNearestNeighbourResize<<<dimGrid, dimBlock>>>(
        src_img, dst_img, channels,
        src_w, src_h, src_step, dst_w, dst_h, dst_step);
    return cudaGetLastError();
}
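As a quick sanity check (not part of the original answer), you could compare the kernel's output against OpenCV's CPU nearest-neighbour resize inside the loop. Note that cv::resize's rounding rule is not necessarily identical to the kernel's truncating i * src_h / dst_h mapping, so a small number of differing pixels would still be plausible:

    // Hypothetical check (requires #include <opencv2/imgproc/imgproc.hpp>):
    cv::Mat ref;
    cv::resize(frame, ref, cv::Size(dst_w, dst_h), 0, 0, cv::INTER_NEAREST);
    cv::Mat diff;
    cv::absdiff(ref, dst_img, diff);
    // Count differing bytes; reshape(1) flattens the channels so that
    // cv::countNonZero (which needs a single-channel Mat) can be used.
    std::cout << "differing bytes: " << cv::countNonZero(diff.reshape(1)) << std::endl;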

Related

Blurring creates stripes on the output image

I wrote this code to perform a median blur in CUDA, but I am running into an issue: the image channel is blurred, but the output shows stripes that look unusual for a blur.
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>

using namespace std;
using namespace cv;

#define BLOCK_SIZE 16
#define TILE_SIZE 14
#define FILTER_WIDTH 3
#define FILTER_HEIGHT 3

__device__ void sort(unsigned char* filterVector)
{
    for (int i = 0; i < FILTER_WIDTH*FILTER_HEIGHT; i++) {
        for (int j = i + 1; j < FILTER_WIDTH*FILTER_HEIGHT; j++) {
            if (filterVector[i] > filterVector[j]) {
                unsigned char tmp = filterVector[i];
                filterVector[i] = filterVector[j];
                filterVector[j] = tmp;
            }
        }
    }
}

__global__ void medianFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
    int x_o = TILE_SIZE * blockIdx.x + threadIdx.x;
    int y_o = TILE_SIZE * blockIdx.y + threadIdx.y;
    int x_i = x_o - (FILTER_HEIGHT / 2);
    int y_i = y_o - (FILTER_WIDTH / 2);

    __shared__ unsigned char sBuffer[BLOCK_SIZE][BLOCK_SIZE];
    if ((x_i >= 0) && (x_i < width) && (y_i >= 0) && (y_i < height)) {
        sBuffer[threadIdx.y][threadIdx.x] = srcImage[y_i * width + x_i];
    } else {
        sBuffer[threadIdx.y][threadIdx.x] = 0;
    }
    __syncthreads();

    unsigned char filterVector[FILTER_WIDTH*FILTER_HEIGHT];
    // int size_vec = sizeof(filterVector) / sizeof(filterVector[0]);
    // printf("%d \n", size_vec);
    if (threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
        for (int r = 0; r < FILTER_HEIGHT; r++) {
            for (int c = 0; c < FILTER_HEIGHT; c++) {
                filterVector[r*FILTER_HEIGHT+c] = sBuffer[threadIdx.y + r][threadIdx.x + c];
            }
        }
    }
    sort(filterVector);
    if (x_o < width && y_o < height) {
        dstImage[y_o * width + x_o] = filterVector[4]; // (FILTER_WIDTH*FILTER_HEIGHT)/2
    }
}

int main(int argc, char **argv)
{
    std::string image_path = "./test.jpg";
    cv::Mat img = imread(image_path, IMREAD_COLOR);
    std::string output_file = "test_gpu.jpg";
    if (img.empty())
    {
        std::cout << "Couldn't read img:" << image_path << std::endl;
    }

    Mat bgr[3];
    split(img, bgr);
    cv::Mat dstImg(bgr[1].size(), bgr[1].type());

    const int inputSize = img.cols * img.rows;
    const int outputSize = dstImg.cols * dstImg.rows;
    unsigned char *d_input, *d_output;
    cudaMalloc<unsigned char>(&d_input, inputSize);
    cudaMalloc<unsigned char>(&d_output, outputSize);
    cudaMemcpy(d_input, bgr[1].ptr(), inputSize, cudaMemcpyHostToDevice);

    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid((dstImg.cols + TILE_SIZE - 1)/TILE_SIZE, (dstImg.rows + TILE_SIZE - 1)/TILE_SIZE);
    medianFilter<<<grid,block>>>(d_input, d_output, dstImg.cols, dstImg.rows);
    cudaMemcpy(dstImg.ptr(), d_output, outputSize, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);
    imwrite(output_file, dstImg);
}
This is my original image, and here is one blurred channel (images attached).
For some reason I get those stripes on the output image, which is just one of the channels for now. Any idea why this is happening?
Your intention is that even though you are launching a block of dimension (BLOCK_SIZE, BLOCK_SIZE), you only intend (TILE_SIZE, TILE_SIZE) threads in that block to actually compute the values for output pixels.
However you are not properly accounting for that here:
if (x_o < width && y_o < height) {
that should be, instead:
if (x_o < width && y_o < height && threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
(In fact, everything after the __syncthreads() in your kernel can be conditioned to only execute if threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE if you wish.)
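A minimal sketch of that restructuring, using the names from the question (only the part of the kernel after __syncthreads() changes):

    __syncthreads();

    // Only the TILE_SIZE x TILE_SIZE "interior" threads of the block compute
    // an output pixel; the remaining threads only helped fill sBuffer.
    if (threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
        unsigned char filterVector[FILTER_WIDTH * FILTER_HEIGHT];
        for (int r = 0; r < FILTER_HEIGHT; r++) {
            for (int c = 0; c < FILTER_WIDTH; c++) {
                filterVector[r * FILTER_WIDTH + c] = sBuffer[threadIdx.y + r][threadIdx.x + c];
            }
        }
        sort(filterVector);
        if (x_o < width && y_o < height) {
            dstImage[y_o * width + x_o] = filterVector[(FILTER_WIDTH * FILTER_HEIGHT) / 2];
        }
    }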

WTV in OpenCV's OpenCL code for image resizing

What does WTV stand for in the following OpenCL code?
I can't find much info about it. The code is from OpenCV, for image processing on the GPU.
__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         float ifx, float ify, __global const int * ofs_tab,
                         __global const int * map_tab, __global const float * alpha_tab)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        int dst_index = mad24(dy, dst_step, dst_offset);

        __global const int * xmap_tab = map_tab;
        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
        __global const float * xalpha_tab = alpha_tab;
        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
        __global const int * xofs_tab = ofs_tab;
        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);

        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];

        WTV sum = (WTV)(0), buf;
        int src_index = mad24(sy0, src_step, src_offset);

        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
        {
            WTV beta = (WTV)(yalpha_tab[yk]);
            buf = (WTV)(0);
            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
            {
                WTV alpha = (WTV)(xalpha_tab[xk]);
                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
            }
            sum += buf * beta;
        }

        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
    }
}
It is not defined in the source you shared. It appears to be a type, such as a float vector (plausibly a "work type, vector"). Just guessing: it's defined using "-D WTV=something" while compiling the kernel, which is how OpenCV typically parameterizes its OpenCL kernels for different element types.
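For illustration, this is how a macro like WTV can be injected at kernel compile time with the plain OpenCL host API; the program, device and type choices here are made up, and OpenCV does the equivalent internally when it builds its resize kernels:

    // Hypothetical host-side build: WTV, convertToWTV, TSIZE etc. do not
    // appear in the kernel source itself; they arrive via -D build options.
    const char *options =
        "-D WTV=float4 "
        "-D convertToWTV=convert_float4 "
        "-D TSIZE=4";
    cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);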

RGB2GRAY with CUDA and CImg library

I need to implement an RGB2GRAY image processing algorithm. I just need some help completing the __global__ function, i.e. how to access the d_src pointer. This is my code; your help will be greatly appreciated.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CImg.h"
#include <iostream>
using namespace std;
using namespace cimg_library;
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
}
int main(){
//Load image
CImg<unsigned char> src("lena.jpg");
int width = src.width();
int height = src.height();
unsigned long sizee = src.size();
int sze = width * height;
cout << sze << endl;
//create pointer to image
unsigned char *h_src = src.data();
CImg<unsigned char> dst(width, height, 1, 1);
unsigned char *h_dst = dst.data();
unsigned char *d_src;
unsigned char *d_dst;
cout << sizee << endl;
cudaMalloc((void**)&d_src, sizee);
cudaMalloc((void**)&d_dst, width*height*sizeof(int));
cudaMemcpy(d_src, h_src, sizee, cudaMemcpyHostToDevice);
//launch the kernel
rgb2gray << <(width/16,height/16,1), (16, 16, 1) >> >(d_src, d_dst, width, height);
//force the printf()s to flush
cudaDeviceSynchronize();
// copy back the result array to the CPU
cudaMemcpy(h_dst, d_dst, width*height, cudaMemcpyDeviceToHost);
cudaFree(d_src);
cudaFree(d_dst);
CImgDisplay main_disp(dst, "After Processing");
while (!main_disp.is_closed())
main_disp.wait();
return 0;
}
Firstly, since your dst object consists of unsigned char, allocate d_dst as follows:
cudaMalloc((void**)&d_dst, width*height*sizeof(unsigned char));
Next, the grid must cover every pixel, including the cases where width or height is not a multiple of 16. Launch the kernel with the following configuration:
dim3 blkDim (16, 16, 1);
dim3 grdDim ((width + 15)/16, (height + 15)/16, 1);
rgb2gray<<<grdDim, blkDim>>>(d_src, d_dst, width, height);
Lastly, your kernel should look like this. Note that CImg stores the image in planar order (all R values, then all G, then all B), so the channels are split in d_src.
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
    int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (pos_x >= width || pos_y >= height)
        return;

    // CImg planar layout: R plane, then G plane, then B plane.
    unsigned char r = d_src[pos_y * width + pos_x];
    unsigned char g = d_src[(height + pos_y) * width + pos_x];
    unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];

    unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5f);
    unsigned char gray = _gray > 255 ? 255 : _gray;
    d_dst[pos_y * width + pos_x] = gray;
}
You can see the full code here.
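As a side note (not part of the original answer): the plain average weights the channels equally. If you want output closer to standard grayscale conversions such as cv::cvtColor, a sketch using the common Rec. 601 luma weights would be:

    // Hypothetical variant: perceptual luma instead of a plain average.
    unsigned int _gray = (unsigned int)(0.299f * r + 0.587f * g + 0.114f * b + 0.5f);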

Access pixels in gpu::mat

I'd like to know how to access pixel information when using the OpenCV GPU module. I'm currently downloading the gpu::Mat data to a cv::Mat variable, but it is too slow.
Does anyone know how to do it?
You could access the data inside a kernel.
For (row, col) the row and column number, the value of a pixel with channel number ch < 3 will be:
uint8_t val = gpumat.data[ (row*gpumat.step) + col*gpumat.channels() + ch];
So let's say you have a BGR input image stored in a GpuMat called src, and you want to
assign each pixel value to a destination image called dst (also a GpuMat).
You could call the kernel
kernel_assign_pixel<<<gridDim, blockDim>>>
(src.data, dst.data, src.rows, src.cols, src.step, dst.step, src.channels());
Defined in the following way
__global__ void kernel_assign_pixel
    (uint8_t* src, uint8_t* dst, int MaxRows, int MaxCols, int iStep, int oStep, int MaxC)
{
    unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; // Row number
    unsigned int col = blockIdx.y * blockDim.y + threadIdx.y; // Column number
    unsigned int ch  = blockIdx.z * blockDim.z + threadIdx.z; // Channel number

    if (row < MaxRows && col < MaxCols && ch < MaxC)
    {
        int tidIn  = row * iStep + col * MaxC + ch;
        int tidOut = row * oStep + col * MaxC + ch;
        dst[tidOut] = src[tidIn];
    }
}
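The answer leaves gridDim and blockDim undefined; a minimal sketch of a launch configuration matching this kernel's mapping (x = row, y = column, z = channel), assuming a 3-channel 8-bit image, could be:

    // Hypothetical launch setup for kernel_assign_pixel: round the grid up so
    // every (row, col, channel) triple is covered; the kernel's bounds check
    // discards the excess threads.
    dim3 blockDim(16, 16, 3);
    dim3 gridDim((src.rows + blockDim.x - 1) / blockDim.x,
                 (src.cols + blockDim.y - 1) / blockDim.y,
                 1);
    kernel_assign_pixel<<<gridDim, blockDim>>>
        (src.data, dst.data, src.rows, src.cols, src.step, dst.step, src.channels());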

opencv 3 channel image data offset for cuda kernel [duplicate]

I'm doing linear filtering on images using CUDA. I use 2D thread blocks and a 2D grid to make the problem natural. Here's how I index (height and width are the image dimensions):
dim3 BlockDim(16,16);
dim3 GridDim;
GridDim.x = (width + 15) / 16;
GridDim.y = (height + 15) / 16;
In the kernel I access the locations as follows:
unsigned int xIndex = blockIdx.x*16+ threadIdx.x;
unsigned int yIndex = blockIdx.y*16+ threadIdx.y;
unsigned int tid = yIndex * width + xIndex;
And I want the threads on the four boundaries to return early (I'll handle the borders later on). I do this as:
if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
return;
Where N is the number of pixels at each boundary I don't want to calculate.
Problem:
The code runs fine on all standard image sizes. But for some random image sizes it shows diagonal line(s). For example, in my case a 500x333 image (even though no dimension is a multiple of 16) shows correct output, whereas 450x365 shows diagonal lines in the output. The problem remains even if I just return the extra threads of the grid and do nothing else, like this:
if(yIndex>=height || xIndex>=width)
return;
The code remains the same; some inputs run fine while others don't. Can anybody spot the bug? I have attached the input and output samples here: IMAGES. Thanks!
Update:
Kernel code (simplified to just copy the input image, but it gives the same problem):
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
    unsigned int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
    unsigned int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
    unsigned int tid = yIndex * width + xIndex;
    unsigned int N = filterSize/2;

    if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
        return;

    /* Filter code removed, still gives the same problem */
    out[tid] = in[tid];
}
Update 2:
I have also removed the return statement by reversing the if condition. But the problem persists.
if(yIndex<=height-N && xIndex<=width-N && yIndex>N && xIndex>N){
    /* Kernel code */
}
There are quite a few things you still haven't described very well, but based on the information you have posted, I built what I am guessing is a reasonable repro case with parameters which match a case you say is failing (450 x 365 with filterSize=5):
#include <stdio.h>
#include <assert.h>

template<int filterSize>
__global__ void filter_8u_c1_kernel(unsigned char* in, unsigned char* out, int width, int height, float* filter, int fSize)
{
    unsigned int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int tid = yIndex * width + xIndex;
    unsigned int N = filterSize/2;

    if(yIndex>=height-N || xIndex>=width-N || yIndex<N || xIndex<N)
        return;

    out[tid] = in[tid];
}

int main(void)
{
    const int width = 450, height = 365, filterSize = 5;
    const size_t isize = sizeof(unsigned char) * size_t(width * height);

    unsigned char * _in, * _out, * out;
    assert( cudaMalloc((void **)&_in, isize) == cudaSuccess );
    assert( cudaMalloc((void **)&_out, isize) == cudaSuccess );
    assert( cudaMemset(_in, 'Z', isize) == cudaSuccess );
    assert( cudaMemset(_out, 'A', isize) == cudaSuccess );

    const dim3 BlockDim(16,16);
    dim3 GridDim;
    GridDim.x = (width + BlockDim.x - 1) / BlockDim.x;
    GridDim.y = (height + BlockDim.y - 1) / BlockDim.y;
    filter_8u_c1_kernel<filterSize><<<GridDim,BlockDim>>>(_in,_out,width,height,0,0);
    assert( cudaPeekAtLastError() == cudaSuccess );

    out = (unsigned char *)malloc(isize);
    assert( cudaMemcpy(out, _out, isize, cudaMemcpyDeviceToHost) == cudaSuccess );

    for(int i=0; i<width; i++) {
        fprintf(stdout, "%d: ", i);
        for(int j=0; j<height; j++) {
            unsigned int idx = i + j*width;
            fprintf(stdout, "%c", out[idx]);
        }
        fprintf(stdout, "\n");
    }
    return cudaThreadExit();
}
When run, it does exactly what I would expect: it overwrites the output memory with the input everywhere except the first and last two lines, and the first and last two entries in all the lines in between. This is running with CUDA 3.2 on OS X 10.6.5 with a compute capability 1.2 GPU. So whatever is happening in your code, it isn't happening in my repro case, which means either I have misinterpreted what you have written, or there is something else you haven't described that is causing the problem.
