Related
I have tried to implement alpha image blending algorithm in CUDA C. There is no error in my code. It compiled fine. As per the thread logic, If I run the code with the increased number of threads the runtime should be decreased. In my code, I got a weird pattern of run time. When I run the code with 1 thread the runtime was 8.060539 e-01 sec, when I run the code with 4 thread I got the runtime 7.579031 e-01 sec, When It ran for 8 threads the runtime was 7.810102e-01, and for 256 thread the runtime is 7.875319e-01.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
int col = threadIdx.x + blockIdx.x*blockDim.x;
int row = threadIdx.y + blockIdx.y*blockDim.y;
if(col<width && row<height){
size_t img_size = width * height * channels;
if (Pout != NULL)
{
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
}
}
}
int main(int argc, char* argv[]){
int thread_count;
double start, finish;
float alpha;
int width, height, channels;
unsigned char *new_img;
thread_count = strtol(argv[1], NULL, 10);
printf("Enter the value for alpha:");
scanf("%f", &alpha);
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
unsigned char *orange = stbi_load("orange.jpg", &width, &height, &channels, 0);
size_t img_size = width * height * channels;
//unsigned char *new_img = malloc(img_size);
cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
cudaMallocManaged(&orange, img_size*sizeof(unsigned char));
GET_TIME(start);
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
cudaDeviceSynchronize();
GET_TIME(finish);
stbi_write_jpg("new_image.jpg", width, height, channels, new_img, 100);
cudaFree(new_img);
cudaFree(apple);
cudaFree(orange);
printf("\n Elapsed time for cuda = %e seconds\n", finish-start);
}
After getting a weird pattern in the runtime I am bit skeptical about the implementation of the code. Can anyone let me know why I get those runtime even if my code has no bug.
Let's start here:
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
It seems evident you don't understand the kernel launch syntax:
<<<1,16,thread_count>>>
The first number (1) is the number of blocks to launch.
The second number (16) is the number of threads per block.
The third number (thread_count) is the size of the dynamically allocated shared memory in bytes.
So our first observation will be that although you claimed to have changed the thread count, you didn't. You were changing the number of bytes of dynamically allocated shared memory. Since your kernel code doesn't use shared memory, this is a completely meaningless variable.
Let's also observe your kernel code:
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
For every thread that passes your if test, each one of those threads will execute the entire for-loop and will process the entire image. That is not the general idea with writing CUDA kernels. The general idea is to break up the work so that each thread does a portion of the work, not the whole activity.
These are very basic observations. If you take advantage of an orderly introduction to CUDA, such as here, you can get beyond some of these basic concepts.
We could also point out that your kernel nominally expects a 2D launch, and you are not providing one, and perhaps many other observations. Another important concept that you are missing is that you cannot do this:
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
...
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
and expect anything sensible to come from that. If you want to see how data is moved from a host allocation to the device, study nearly any CUDA sample code, such as vectorAdd. Using a managed allocation doesn't allow you to overwrite the pointer like you are doing and get anything useful from that.
I'll provide an example of how one might go about doing what I think you are suggesting, without providing a complete tutorial on CUDA. To provide an example, I'm going to skip the STB image loading routines. To understand the work you are trying to do here, the actual image content does not matter.
Here's an example of an image processing kernel (1D) that will:
Process the entire image, only once
Use less time, roughly speaking, as you increase the thread count.
You haven't provided your timer routine/code, so I'll provide my own:
$ cat t2130.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
unsigned char *i_load(int w, int h, int c, int init){
unsigned char *res = new unsigned char[w*h*c];
for (int i = 0; i < w*h*c; i++) res[i] = init;
return res;
}
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
if (Pout != NULL)
{
size_t img_size = width * height * channels;
for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < img_size; i+=gridDim.x*blockDim.x) // grid-stride loop
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
}
}
int main(int argc, char* argv[]){
int threads_per_block = 64;
unsigned long long dt;
float alpha;
int width = 1920;
int height = 1080;
int channels = 3;
size_t img_size = width * height * channels;
int thread_count = img_size;
if (argc > 1) thread_count = atoi(argv[1]);
unsigned char *new_img, *m_apple, *m_orange;
printf("Enter the value for alpha:");
scanf("%f", &alpha);
unsigned char *apple = i_load(width, height, channels, 10);
unsigned char *orange = i_load(width, height, channels, 70);
//unsigned char *new_img = malloc(img_size);
cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
cudaMallocManaged(&m_apple,img_size* sizeof(unsigned char));
cudaMallocManaged(&m_orange, img_size*sizeof(unsigned char));
memcpy(m_apple, apple, img_size);
memcpy(m_orange, orange, img_size);
int blocks;
if (thread_count < threads_per_block) {threads_per_block = thread_count; blocks = 1;}
else {blocks = thread_count/threads_per_block;}
printf("running with %d blocks of %d threads\n", blocks, threads_per_block);
dt = dtime_usec(0);
image_blend<<<blocks, threads_per_block>>>(new_img,m_apple, m_orange, width, height, channels,alpha);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("CUDA Error: %s\n", cudaGetErrorString(err));
else printf("\n Elapsed time for cuda = %e seconds\n", dt/(float)USECPSEC);
cudaFree(new_img);
cudaFree(m_apple);
cudaFree(m_orange);
}
$ nvcc -o t2130 t2130.cu
$ ./t2130 1
Enter the value for alpha:0.2
running with 1 blocks of 1 threads
Elapsed time for cuda = 5.737880e-01 seconds
$ ./t2130 2
Enter the value for alpha:0.2
running with 1 blocks of 2 threads
Elapsed time for cuda = 3.230150e-01 seconds
$ ./t2130 32
Enter the value for alpha:0.2
running with 1 blocks of 32 threads
Elapsed time for cuda = 4.865200e-02 seconds
$ ./t2130 64
Enter the value for alpha:0.2
running with 1 blocks of 64 threads
Elapsed time for cuda = 2.623300e-02 seconds
$ ./t2130 128
Enter the value for alpha:0.2
running with 2 blocks of 64 threads
Elapsed time for cuda = 1.546000e-02 seconds
$ ./t2130
Enter the value for alpha:0.2
running with 97200 blocks of 64 threads
Elapsed time for cuda = 5.809000e-03 seconds
$
(CentOS 7, CUDA 11.4, V100)
The key methodology that allows the kernel to do all the work (only once) while making use of an "arbitrary" number of threads efficiently is the grid-stride loop.
I am writing a CUDA Program while working with OpenCV. I have an empty Mat of a given size (e.g. 1000x800) which I explicitly converted to GPUMat with dataytpe CV_16SC3. It is desired to manipulate the Image in this format in the CUDA Kernel. However trying to manipulate the Mat does not seem to work correctly.
I am calling my CUDA kernel as follows:
my_kernel <<< gridDim, blockDim >>>( (unsigned short*)img.data, img.cols, img.rows, img.step);
and my sample kernel looks like this
__global__ void my_kernel( unsigned short* img, int width, int height, int img_step)
{
int x, y, pixel;
y = blockIdx.y * blockDim.y + threadIdx.y;
x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= height)
return;
if (x >= width)
return;
pixel = (y * (img_step)) + (3 * x);
img[pixel] = 255; //I know 255 is basically an uchar, this is just part of my test
img[pixel+1] = 255
img[pixel+2] = 255;
}
I am expecting this small kernel sample to write al pixels to white. However, after downloading the Mat again from the GPU and visualizing it with imshow, not all the pixels are white and some weird black lines are present, which makes me believe that somehow I am writing to invalid memory addresses.
My guess is the following. The OpenCV documentation states that cv::mat::data returns an uchar pointer. However, my Mat has a data type "16U" (short unsigned to my knowledge). That is why in the kernel launch I am casting the pointer to (unsigned short*). But apparently that is incorrect.
How should I correctly proceed to be able to read and write the Mat data as short in my kernel?
First of all, the input image type should be short instead of unsigned short because the type of Mat is 16SC3 ( rather than 16UC3 ).
Now, since the image step is in bytes and the data type is short, the pixel index ( or address ) should be calculated taken into account the difference in byte width of those. There are 2 ways to fix this issue.
Method 1:
__global__ void my_kernel( short* img, int width, int height, int img_step)
{
int x, y, pixel;
y = blockIdx.y * blockDim.y + threadIdx.y;
x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= height)
return;
if (x >= width)
return;
//Reinterpret the input pointer as char* to allow jump in bytes instead of short
char* imgBytes = reinterpret_cast<char*>(img);
//Calculate row start address using the newly created pointer
char* rowStartBytes = imgBytes + (y * img_step); // Jump in byte
//Reinterpret the row start address back to required data type.
short* rowStartShort = reinterpret_cast<short*>(rowStartBytes);
short* pixelAddress = rowStartShort + ( 3 * x ); // Jump in short
//Modify the image values
pixelAddress[0] = 255;
pixelAddress[1] = 255;
pixelAddress[2] = 255;
}
Method 2:
Divide the input image step by the size of required data type (short). It may be done when passing the step as a kernel argument.
my_kernel<<<grid,block>>>( img, width, height, img_step/sizeof(short));
I have used method 2 for quite a long time. It is a shortcut method, but later on when I got to look at the source code of certain image processing libraries, I realized that actually Method 1 is more portable, since the size of type can vary across different platforms.
I'm using cuda to deal with image proccessing. but my result is always get 'cudaErrorIllegalAddress : an illegal memory access was encountered'
What i did is below.
First, Load converted image(rgb to gray) to device, i use 'cudaMallocPitch' and 'cudaMemcpy2D'
unsigned char *dev_srcleft;
size_t dev_srcleftPitch
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS * sizeof(int), ROWS));
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
COLS, ROWS, cudaMemcpyHostToDevice);
And, Allocating 2D array for store result. the result value is describe as 27bit, so what i'm trying is using 'int' which is 4bytes=32bits, not only for ample size , atomic operation(atomicOr, atomicXor) is needed for performance.
and my device is not supports 64bit atomic operation.
int *dev_leftTrans;
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMemset2D(dev_leftTrans, dev_leftTransPitch, 0, COLS, ROWS);
Memory allocation and memcpy2D works great, and i check by
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D(temp_output.data, temp_output.step, dev_srcleft, dev_srcleftPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
Then, Do kernel code.
__global__ void TestKernel(unsigned char *src, size_t src_pitch,
int *dst, size_t dst_pitch,
unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
}
dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
Parameter COLS and ROWS is size of image.
I think the error occurs here : TestKerenl.
src_val, reading from global memory works good but when i'm trying to access dst, it blows up with cudaErrorIllegalAddress
I don't know what is wrong, and i sufferd for 4 days. please help me
below is my full code
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <cuda_device_runtime_api.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <iostream>
#include <opencv2\opencv.hpp>
#include<string>
#define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
static void HandleError(cudaError_t err, const char*file, int line)
{
if (err != cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
using namespace std;
using namespace cv;
string imagePath = "Ted";
string imagePathL = imagePath + "imL.png";
string imagePathR = imagePath + "imR.png";
__global__ void TestKernel(unsigned char*src, size_t src_pitch,
int *dst, size_t dst_pitch,
unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((COLS< x) && (ROWS < y)) return;
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
}
int main(void)
{
//Print_DeviceProperty();
//Left Image Load
Mat host_srcImgL = imread(imagePathL, CV_LOAD_IMAGE_UNCHANGED);
if (host_srcImgL.empty()){ cout << "Left Image Load Fail!" << endl; return; }
Mat host_srcConvertL;
cvtColor(host_srcImgL, host_srcConvertL, CV_BGR2GRAY);
//Right Image Load
Mat host_srcImgR = imread(imagePathR, CV_LOAD_IMAGE_UNCHANGED);
if (host_srcImgL.empty()){ cout << "Right Image Load Fail!" << endl; return; }
Mat host_srcConvertR;
cvtColor(host_srcImgR, host_srcConvertR, CV_BGR2GRAY);
//Create parameters
unsigned int COLS = host_srcConvertL.cols;
unsigned int ROWS = host_srcConvertR.rows;
unsigned int SIZE = COLS * ROWS;
imshow("Left source image", host_srcConvertL);
imshow("Right source image", host_srcConvertR);
unsigned char *dev_srcleft, *dev_srcright, *dev_disp;
int *dev_leftTrans, *dev_rightTrans;
size_t dev_srcleftPitch, dev_srcrightPitch, dev_dispPitch, dev_leftTransPitch, dev_rightTransPitch;
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_srcright, &dev_srcrightPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_disp, &dev_dispPitch, COLS, ROWS);
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMallocPitch((void**)&dev_rightTrans, &dev_rightTransPitch, COLS * sizeof(int), ROWS);
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
COLS, ROWS, cudaMemcpyHostToDevice);
cudaMemcpy2D(dev_srcright, dev_srcrightPitch, host_srcConvertR.data, host_srcConvertR.step,
COLS, ROWS, cudaMemcpyHostToDevice);
cudaMemset(dev_disp, 255, dev_dispPitch * ROWS);
dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
cudaEvent_t start, stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
/*TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
(dev_srcright, dev_srcrightPitch, dev_rightTrans, dev_rightTransPitch, COLS, ROWS);*/
cudaThreadSynchronize();
cudaError_t res = cudaGetLastError();
if (res != cudaSuccess)
printf("%s : %s\n", cudaGetErrorName(res), cudaGetErrorString(res));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << elapsedtime << "msec" << endl;
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D((int*)temp_output.data, temp_output.step, dev_leftTrans, dev_leftTransPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
waitKey(0);
return 0;
}
And this is my environment vs2013, cuda v6.5
Device' property's below
Major revision number: 3
Minor revision number: 0
Name: GeForce GTX 760 (192-bit)
Total global memory: 1610612736
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 888500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 6
Kernel execution timeout: Yes
One problem is that your kernel doesn't do any thread-checking.
When you define a grid of blocks like this:
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
you will often be launching extra blocks. The reason is that if COLS or ROW is not evenly divisible by the block dimensions (3 in this case) then you will get extra blocks to cover the remainder in each case.
These extra blocks will have some threads that are doing useful work, and some that will access out-of-bounds. To protect against this, it's customary to put a thread-check in your kernel to prevent out-of-bounds accesses:
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < COLS) && (y < ROWS)) { // add this
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
} // add this
This means that only the threads that have a valid (in-bounds) x and y will actually do any accesses.
As an aside, (3,3) may not be a particularly good choice of block dimensions for performance reasons. It's usually a good idea to create block dimensions whose product is a multiple of 32, so (32,4) or (16,16) might be examples of better choices.
Another problem in your code is pitch usage for dst array.
Pitch is always in bytes, so first you need to cast dst pointer to char*, calculate row offset and then cast it back to int*:
int* dst_row = (int*)(((char*)dst) + y * dst_pitch);
dst_row[x] = src_val;
I'm very new to working with image processing at a low level and have just had a go at implementing a gaussian kernel with both GPU and CPU - however both yield the same output, an image which is severely skewed by a grid:
I'm aware I could use OpenCV's pre-built functions to handle the filters, but I wanted to learn the methodology behind it, so I built my own.
Convolution kernel:
// Convolution kernel - this manipulates the given channel and writes out a new blurred channel.
void convoluteChannel_cpu(
const unsigned char* const channel, // Input channel
unsigned char* const channelBlurred, // Output channel
const size_t numRows, const size_t numCols, // Channel width/height (rows, cols)
const float *filter, // The weight of sigma, to convulge
const int filterWidth // This is normally a sample of 9
)
{
// Loop through the images given R, G or B channel
for(int rows = 0; rows < (int)numRows; rows++)
{
for(int cols = 0; cols < (int)numCols; cols++)
{
// Declare new pixel colour value
float newColor = 0.f;
// Loop for every row along the stencil size (3x3 matrix)
for(int filter_x = -filterWidth/2; filter_x <= filterWidth/2; filter_x++)
{
// Loop for every col along the stencil size (3x3 matrix)
for(int filter_y = -filterWidth/2; filter_y <= filterWidth/2; filter_y++)
{
// Clamp to the boundary of the image to ensure we don't access a null index.
int image_x = __min(__max(rows + filter_x, 0), static_cast<int>(numRows -1));
int image_y = __min(__max(cols + filter_y, 0), static_cast<int>(numCols -1));
// Assign the new pixel value to the current pixel, numCols and numRows are both 3, so we only
// need to use one to find the current pixel index (similar to how we find the thread in a block)
float pixel = static_cast<float>(channel[image_x * numCols + image_y]);
// Sigma is the new weight to apply to the image, we perform the equation to get a radnom weighting,
// if we don't do this the image will become choppy.
float sigma = filter[(filter_x + filterWidth / 2) * filterWidth + filter_y + filterWidth/2];
//float sigma = 1 / 81.f;
// Set the new pixel value
newColor += pixel * sigma;
}
}
// Set the value of the next pixel at the current image index with the newly declared color
channelBlurred[rows * numCols + cols] = newColor;
}
}
}
I call this 3 times from another method which splits the image into respective R, G, B channels, but I don't believe this would cause the image to be so severely mutated.
Has anybody encountered a problem similar to this before, and if so how did you solve it?
EDIT Channel Splitting Func:
void gaussian_cpu(
const uchar4* const rgbaImage, // Our input image from the camera
uchar4* const outputImage, // The image we are writing back for display
size_t numRows, size_t numCols, // Width and Height of the input image (rows/cols)
const float* const filter, // The value of sigma
const int filterWidth // The size of the stencil (3x3) 9
)
{
// Build an array to hold each channel for the given image
unsigned char *r_c = new unsigned char[numRows * numCols];
unsigned char *g_c = new unsigned char[numRows * numCols];
unsigned char *b_c = new unsigned char[numRows * numCols];
// Build arrays for each of the output (blurred) channels
unsigned char *r_bc = new unsigned char[numRows * numCols];
unsigned char *g_bc = new unsigned char[numRows * numCols];
unsigned char *b_bc = new unsigned char[numRows * numCols];
// Separate the image into R,G,B channels
for(size_t i = 0; i < numRows * numCols; i++)
{
uchar4 rgba = rgbaImage[i];
r_c[i] = rgba.x;
g_c[i] = rgba.y;
b_c[i] = rgba.z;
}
// Convolute each of the channels using our array
convoluteChannel_cpu(r_c, r_bc, numRows, numCols, filter, filterWidth);
convoluteChannel_cpu(g_c, g_bc, numRows, numCols, filter, filterWidth);
convoluteChannel_cpu(b_c, b_bc, numRows, numCols, filter, filterWidth);
// Recombine the channels to build the output image - 255 for alpha as we want 0 transparency
for(size_t i = 0; i < numRows * numCols; i++)
{
uchar4 rgba = make_uchar4(r_bc[i], g_bc[i], b_bc[i], 255);
outputImage[i] = rgba;
}
}
EDIT Calling the kernel
while(gpu_frames > 0)
{
//cout << gpu_frames << "\n";
camera >> frameIn;
// Allocate I/O Pointers
beginStream(&h_inputFrame, &h_outputFrame, &d_inputFrame, &d_outputFrame, &d_redBlurred, &d_greenBlurred, &d_blueBlurred, &_h_filter, &filterWidth, frameIn);
// Show the source image
imshow("Source", frameIn);
g_timer.Start();
// Allocate mem to GPU
allocateMemoryAndCopyToGPU(numRows(), numCols(), _h_filter, filterWidth);
// Apply the gaussian kernel filter and then free any memory ready for the next iteration
gaussian_gpu(h_inputFrame, d_inputFrame, d_outputFrame, numRows(), numCols(), d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
// Output the blurred image
cudaMemcpy(h_outputFrame, d_frameOut, sizeof(uchar4) * numPixels(), cudaMemcpyDeviceToHost);
g_timer.Stop();
cudaDeviceSynchronize();
gpuTime += g_timer.Elapsed();
cout << "Time for this kernel " << g_timer.Elapsed() << "\n";
Mat outputFrame(Size(numCols(), numRows()), CV_8UC1, h_outputFrame, Mat::AUTO_STEP);
clean_mem();
imshow("Dest", outputFrame);
// 1ms delay to prevent system from being interrupted whilst drawing the new frame
waitKey(1);
gpu_frames--;
}
And then within the beginStream() method, images are converted to uchar4:
// Allocate host variables, casting the frameIn and frameOut vars to uchar4 elements, these will
// later be processed by the kernel
*h_inputFrame = (uchar4 *)frameIn.ptr<unsigned char>(0);
*h_outputFrame = (uchar4 *)frameOut.ptr<unsigned char>(0);
There are many doubts in the problem.
At the start of the code, its mentioned that the filter width is 9, thus making it a 9x9 kernel. But in some other comments its said to be 3. So I am guessing that you are actually using a 9x9 kernel and the filter do have the 81 weights in them.
But the above output can never be due to the above mentioned confusion.
uchar4 is of 4-byte size. Thus in gaussian_cpu while splitting the data by running the loop over rgbaImage[i] on an image that doesnot contain alpha value (it could be inferred from the above mentioned loop that alpha is not present) what actually gets done is that your are copying R1,G2,B3,R5,G6,B7 and so on to the red-channel. Better you initially try the code on a grayscale image and make sure you are using uchar instead of uchar4.
The output image seems exactly 1/3rd the width of the original image, which makes the above assumption to be true.
EDIT 1:
Is the input rgbaImage to guassian_cpu function RGBA or RGB? videoCapture must be giving a 3 channel output. The initialization of *h_inputFrame (to uchar4) itself is wrong as its pointing to 3 channel data.
Similarly the output data is four channel data, but Mat outputFrame is declared as a single channel which points to this four channel data. Try Mat outputFrame as 8UC3 type and see the result.
Also, how is the code working, the guassian_cpu() function has 7 input parameters in the definition, but when you call the function 8 parameters are used. Hope this is just a typo.
I am trying to solve a problem in which i am supposed to change a colour image to a greyscale image. For this purpose i am using CUDA parallel approach. The kerne code i am invoking on the GPU is as follows.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
int absolute_image_position_x = blockIdx.x;
int absolute_image_position_y = blockIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage,
size_t numRows,
size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage,
d_greyImage,
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
i see a line of dots in the first pixel line.
error i am getting is
libdc1394 error: Failed to initialize libdc1394
Difference at pos 51 exceeds tolerance of 5
Reference: 255
GPU : 0
my input/output images
Can anyone help me with this??? thanks in advance.
I recently joined this course and tried your solution but it don't work so, i tried my own. You are almost correct. The correct solution is this:
__global__`
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{`
int pos_x = (blockIdx.x * blockDim.x) + threadIdx.x;
int pos_y = (blockIdx.y * blockDim.y) + threadIdx.y;
if(pos_x >= numCols || pos_y >= numRows)
return;
uchar4 rgba = rgbaImage[pos_x + pos_y * numCols];
greyImage[pos_x + pos_y * numCols] = (.299f * rgba.x + .587f * rgba.y + .114f * rgba.z);
}
The rest is same as your code.
Now, since I posted this question I have been continuously working on this problem there are a couple of improvements that should be done in order to get this problem correct now I realize my initial solution was wrong . Changes to be done:-
1. absolute_position_x =(blockIdx.x * blockDim.x) + threadIdx.x;
2. absolute_position_y = (blockIdx.y * blockDim.y) + threadIdx.y;
Secondly,
1. const dim3 blockSize(24, 24, 1);
2. const dim3 gridSize((numCols/16), (numRows/16) , 1);
In the solution we are using a grid of numCols/16 * numCols/16
and blocksize of 24 * 24
code executed in 0.040576 ms
#datenwolf : thanks for answering above!!!
Since you are not aware of the image size. It is best to choose any reasonable dimension of the two-dimensional block of threads and then check for two conditions. The first one is that the pos_x and pos_y indexes in the kernel do not exceed numRows and numCols. Secondly the grid size should be just above the total number of threads in all the blocks.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize((numCols%16) ? numCols/16+1 : numCols/16,
(numRows%16) ? numRows/16+1 : numRows/16, 1);
libdc1394 error: Failed to initialize libdc1394
I don't think that this is a CUDA problem. libdc1394 is a library used to access IEEE1394 aka FireWire aka iLink video devices (DV camcorders, Apple iSight camera). That library doesn'r properly initialize, hence you're not getting usefull results. Basically it's NINO: Nonsens In Nonsens Out.
the calculation of absolute x & y image positions is perfect.
but when u need to access that particular pixel in the coloured image , shouldn't you u use the following code??
uchar4 rgba = rgbaImage[absolute_image_position_x + (absolute_image_position_y * numCols)];
I thought so, when comparing it to a code you'd write to execute the same problem in serial code.
Please let me know :)
You still should have a problem with run time - the conversion will not give a proper result.
The lines:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;
should be changed to:
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y*numCols];
greyImage[absolute_image_position_x + absolute_image_position_y*numCols] = channelSum;
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
int rgba_x = blockIdx.x * blockDim.x + threadIdx.x;
int rgba_y = blockIdx.y * blockDim.y + threadIdx.y;
int pixel_pos = rgba_x+rgba_y*numCols;
uchar4 rgba = rgbaImage[pixel_pos];
unsigned char gray = (unsigned char)(0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z);
greyImage[pixel_pos] = gray;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(24, 24, 1); //TODO
const dim3 gridSize( numCols/24+1, numRows/24+1, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
The libdc1394 error is not related to firewire etc in this case - it is the library that udacity is using to compare the image your program creates to the reference image. And what is is saying is that the difference between your image and the reference image has been been exceeded by a specific threshold, for that position ie. pixel.
You are running following number of block and grids:
const dim3 blockSize(numCols/32, numCols/32 , 1); //TODO
const dim3 gridSize(numRows/12, numRows/12 , 1); //TODO
yet you are not using any threads in your kernel code!
int absolute_image_position_x = blockIdx.x;
int absolute_image_position_y = blockIdx.y;
think this way, the width of an image can be divide into absolute_image_position_x parts of column and the height of an image can be divide into absolute_image_position_y parts of row. Now the box each of the cross section it creates you need to change/redraw all the pixels in terms of greyImage, parallely. Enough spoiler for an assignment :)
same code with with ability to handle non-standard input size images
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;
uchar4 rgbcell=rgbaImage[idx*numCols+idy];
greyImage[idx*numCols+idy]=0.299*rgbcell.x+0.587*rgbcell.y+0.114*rgbcell.z;
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
int totalpixels=numRows*numCols;
int factors[]={2,4,8,16,24,32};
vector<int> numbers(factors,factors+sizeof(factors)/sizeof(int));
int factor=1;
while(!numbers.empty())
{
if(totalpixels%numbers.back()==0)
{
factor=numbers.back();
break;
}
else
{
numbers.pop_back();
}
}
const dim3 blockSize(factor, factor, 1); //TODO
const dim3 gridSize(numRows/factor+1, numCols/factor+1,1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
1- int x =(blockIdx.x * blockDim.x) + threadIdx.x;
2- int y = (blockIdx.y * blockDim.y) + threadIdx.y;
And in grid and block size
1- const dim3 blockSize(32, 32, 1);
2- const dim3 gridSize((numCols/32+1), (numRows/32+1) , 1);
Code executed in 0.036992 ms.
const dim3 blockSize(16, 16, 1); //TODO
const dim3 gridSize( (numRows+15)/16, (numCols+15)/16, 1); //TODO
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
uchar4 rgba = rgbaImage[y*numRows + x];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[y*numRows + x] = channelSum;