I'm making an app to edit an image's HSL color space via OpenCV (opencv2) and some conversion code from the Internet.
I assume the original image's color space is RGB, so here is my plan:
1. Convert the UIImage to a cv::Mat.
2. Convert the color space from BGR to HLS.
3. Loop through all the pixels to get the corresponding HLS values.
4. Apply custom adjustment algorithms.
5. Write the changed HLS values back to the cv::Mat.
6. Convert the cv::Mat back to a UIImage.
Here is my code:
Conversion between UIImage and cv::Mat
Reference: https://stackoverflow.com/a/10254561/1677041
#import <UIKit/UIKit.h>
#import <opencv2/core/core.hpp>
UIImage *UIImageFromCVMat(cv::Mat cvMat)
{
NSData *data = [NSData dataWithBytes:cvMat.data length:cvMat.elemSize() * cvMat.total()];
CGColorSpaceRef colorSpace;
CGBitmapInfo bitmapInfo;
if (cvMat.elemSize() == 1) {
colorSpace = CGColorSpaceCreateDeviceGray();
bitmapInfo = kCGImageAlphaNone | kCGBitmapByteOrderDefault;
} else {
colorSpace = CGColorSpaceCreateDeviceRGB();
#if 0
// OpenCV defaults to either BGR or ABGR. In CoreGraphics land,
// this means using the "32Little" byte order, and potentially
// skipping the first pixel. These may need to be adjusted if the
// input matrix uses a different pixel format.
bitmapInfo = kCGBitmapByteOrder32Little | (
cvMat.elemSize() == 3? kCGImageAlphaNone : kCGImageAlphaNoneSkipFirst
);
#else
bitmapInfo = kCGImageAlphaNone | kCGBitmapByteOrderDefault;
#endif
}
CGDataProviderRef provider = CGDataProviderCreateWithCFData((__bridge CFDataRef)data);
// Creating CGImage from cv::Mat
CGImageRef imageRef = CGImageCreate(
cvMat.cols, // width
cvMat.rows, // height
8, // bits per component
8 * cvMat.elemSize(), // bits per pixel
cvMat.step[0], // bytesPerRow
colorSpace, // colorspace
bitmapInfo, // bitmap info
provider, // CGDataProviderRef
NULL, // decode
false, // should interpolate
kCGRenderingIntentDefault // intent
);
// Getting UIImage from CGImage
UIImage *finalImage = [UIImage imageWithCGImage:imageRef];
CGImageRelease(imageRef);
CGDataProviderRelease(provider);
CGColorSpaceRelease(colorSpace);
return finalImage;
}
cv::Mat cvMatWithImage(UIImage *image)
{
CGColorSpaceRef colorSpace = CGImageGetColorSpace(image.CGImage);
size_t numberOfComponents = CGColorSpaceGetNumberOfComponents(colorSpace);
CGFloat cols = image.size.width;
CGFloat rows = image.size.height;
cv::Mat cvMat(rows, cols, CV_8UC4); // 8 bits per component, 4 channels
CGBitmapInfo bitmapInfo = kCGImageAlphaNoneSkipLast | kCGBitmapByteOrderDefault;
// check whether the UIImage is greyscale already
if (numberOfComponents == 1) {
cvMat = cv::Mat(rows, cols, CV_8UC1); // 8 bits per component, 1 channels
bitmapInfo = kCGImageAlphaNone | kCGBitmapByteOrderDefault;
}
CGContextRef contextRef = CGBitmapContextCreate(
cvMat.data, // Pointer to backing data
cols, // Width of bitmap
rows, // Height of bitmap
8, // Bits per component
cvMat.step[0], // Bytes per row
colorSpace, // Colorspace
bitmapInfo // Bitmap info flags
);
CGContextDrawImage(contextRef, CGRectMake(0, 0, cols, rows), image.CGImage);
CGContextRelease(contextRef);
return cvMat;
}
I tested these two functions on their own and confirmed that they work.
Core operations for the conversion:
/// Generate a new image based on specified HSL value changes.
/// @param h_delta h value in [-360, 360]
/// @param s_delta s value in [-100, 100]
/// @param l_delta l value in [-100, 100]
- (void)adjustImageWithH:(CGFloat)h_delta S:(CGFloat)s_delta L:(CGFloat)l_delta completion:(void (^)(UIImage *resultImage))completion
{
dispatch_async(dispatch_get_global_queue(0, 0), ^{
Mat original = cvMatWithImage(self.originalImage);
Mat image;
cvtColor(original, image, COLOR_BGR2HLS);
// https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#the-efficient-way
// accept only char type matrices
CV_Assert(image.depth() == CV_8U);
int channels = image.channels();
int nRows = image.rows;
int nCols = image.cols * channels;
int y, x;
for (y = 0; y < nRows; ++y) {
for (x = 0; x < nCols; ++x) {
// https://answers.opencv.org/question/30547/need-to-know-the-hsv-value/
// https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html?#cvtcolor
Vec3b hls = original.at<Vec3b>(y, x);
uchar h = hls.val[0], l = hls.val[1], s = hls.val[2];
// h = MAX(0, MIN(360, h + h_delta));
// s = MAX(0, MIN(100, s + s_delta));
// l = MAX(0, MIN(100, l + l_delta));
printf("(%02d, %02d):\tHSL(%d, %d, %d)\n", x, y, h, s, l); // <= Label 1
original.at<Vec3b>(y, x)[0] = h;
original.at<Vec3b>(y, x)[1] = l;
original.at<Vec3b>(y, x)[2] = s;
}
}
cvtColor(image, image, COLOR_HLS2BGR);
UIImage *resultImage = UIImageFromCVMat(image);
dispatch_async(dispatch_get_main_queue(), ^ {
if (completion) {
completion(resultImage);
}
});
});
}
My questions are:
Why are the HLS values outside my expected ranges? They show up in [0, 255] like the RGB range; am I using cvtColor incorrectly?
Should I use Vec3b within the two for loops, or Vec3i instead?
Is there something wrong with my plan above?
Update:
Vec3b hls = original.at<Vec3b>(y, x);
uchar h = hls.val[0], l = hls.val[1], s = hls.val[2];
// Remap the hls value range to human-readable range (0~360, 0~1.0, 0~1.0).
// https://docs.opencv.org/master/de/d25/imgproc_color_conversions.html
float fh, fl, fs;
fh = h * 2.0;
fl = l / 255.0;
fs = s / 255.0;
fh = MAX(0, MIN(360, fh + h_delta));
fl = MAX(0, MIN(1, fl + l_delta / 100));
fs = MAX(0, MIN(1, fs + s_delta / 100));
// Convert them back
fh /= 2.0;
fl *= 255.0;
fs *= 255.0;
printf("(%02d, %02d):\tHSL(%d, %d, %d)\tHSL2(%.4f, %.4f, %.4f)\n", x, y, h, s, l, fh, fs, fl);
original.at<Vec3b>(y, x)[0] = short(fh);
original.at<Vec3b>(y, x)[1] = short(fl);
original.at<Vec3b>(y, x)[2] = short(fs);
1) Take a look at this, specifically the part about RGB -> HLS. When the source image is 8-bit the values will go from 0 to 255, but if you use a float image they have different ranges:
8-bit images: V ← 255·V, S ← 255·S, H ← H/2 (to fit into 0 to 255)
(V should be L; there is a typo in the documentation.)
You can convert the RGB/BGR image to a floating-point image and then you will have the full ranges, i.e. S and L from 0 to 1 and H from 0 to 360.
But you have to be careful converting it back.
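For illustration, here is a minimal sketch of that floating-point round trip (not from the original post; it assumes a CV_8UC3 BGR matrix named bgr, which is not the variable name used above):
// Scale to [0, 1] floats first; cvtColor then yields H in [0, 360] and S/L in [0, 1].
cv::Mat bgr32f, hls;
bgr.convertTo(bgr32f, CV_32FC3, 1.0 / 255.0);
cv::cvtColor(bgr32f, hls, cv::COLOR_BGR2HLS);
// ... adjust cv::Vec3f pixels here (channel order is H, L, S) ...
cv::cvtColor(hls, bgr32f, cv::COLOR_HLS2BGR);
bgr32f.convertTo(bgr, CV_8UC3, 255.0);   // the careful step: scale back before treating it as 8-bit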
2) Vec3b is for unsigned 8-bit images (CV_8U) and Vec3i is for integer images (CV_32S). Which one to use depends on the type of your image. Since, as you said, your values go from 0 to 255, it should be an unsigned 8-bit image, for which you should use Vec3b. If you use the other one, it will read 32 bits per pixel and use that size to calculate the position in the array of pixels, so you may get out-of-bounds accesses, segmentation faults, or random problems.
If you have any questions, feel free to comment.
I saw an opportunity to improve my app's performance by using a Metal compute pipeline. However, my initial testing revealed that the compute pipeline was absurdly slow (at least on older devices).
So I made a sample project to compare the performance of the compute and render pipelines. The program takes a 2048 x 2048 source texture and converts it to grayscale in a destination texture.
On an iPhone 5S, it took 3 ms for the fragment shader to do the conversion. However, it took 177 ms for the compute kernel to do the same thing. That is 59 times longer!
What is your experience with the compute pipeline on older devices? Isn't it absurdly slow?
Here are my fragment and compute functions:
// Grayscale Fragment Function
fragment half4 grayscaleFragment(RasterizerData in [[stage_in]],
texture2d<half> inTexture [[texture(0)]])
{
constexpr sampler textureSampler;
half4 inColor = inTexture.sample(textureSampler, in.textureCoordinate);
half gray = dot(inColor.rgb, kRec709Luma);
return half4(gray, gray, gray, 1.0);
}
// Grayscale Kernel Function
kernel void grayscaleKernel(uint2 gid [[thread_position_in_grid]],
texture2d<half, access::read> inTexture [[texture(0)]],
texture2d<half, access::write> outTexture [[texture(1)]])
{
half4 inColor = inTexture.read(gid);
half gray = dot(inColor.rgb, kRec709Luma);
outTexture.write(half4(gray, gray, gray, 1.0), gid);
}
Compute and render methods
- (void)compute {
id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];
// Compute encoder
id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
[computeEncoder setComputePipelineState:_computePipelineState];
[computeEncoder setTexture:_srcTexture atIndex:0];
[computeEncoder setTexture:_dstTexture atIndex:1];
[computeEncoder dispatchThreadgroups:_threadgroupCount threadsPerThreadgroup:_threadgroupSize];
[computeEncoder endEncoding];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
}
- (void)render {
id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];
// Render pass descriptor
MTLRenderPassDescriptor *renderPassDescriptor = [MTLRenderPassDescriptor renderPassDescriptor];
renderPassDescriptor.colorAttachments[0].loadAction = MTLLoadActionDontCare;
renderPassDescriptor.colorAttachments[0].texture = _dstTexture;
renderPassDescriptor.colorAttachments[0].storeAction = MTLStoreActionStore;
// Render encoder
id<MTLRenderCommandEncoder> renderEncoder = [commandBuffer renderCommandEncoderWithDescriptor:renderPassDescriptor];
[renderEncoder setRenderPipelineState:_renderPipelineState];
[renderEncoder setFragmentTexture:_srcTexture atIndex:0];
[renderEncoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:0 vertexCount:4];
[renderEncoder endEncoding];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
}
And Metal setup:
- (void)setupMetal
{
// Get metal device
_device = MTLCreateSystemDefaultDevice();
// Create the command queue
_commandQueue = [_device newCommandQueue];
id<MTLLibrary> defaultLibrary = [_device newDefaultLibrary];
// Create compute pipeline state
_computePipelineState = [_device newComputePipelineStateWithFunction:[defaultLibrary newFunctionWithName:@"grayscaleKernel"] error:nil];
// Create render pipeline state
MTLRenderPipelineDescriptor *pipelineStateDescriptor = [[MTLRenderPipelineDescriptor alloc] init];
pipelineStateDescriptor.vertexFunction = [defaultLibrary newFunctionWithName:@"vertexShader"];
pipelineStateDescriptor.fragmentFunction = [defaultLibrary newFunctionWithName:@"grayscaleFragment"];
pipelineStateDescriptor.colorAttachments[0].pixelFormat = MTLPixelFormatBGRA8Unorm;
_renderPipelineState = [_device newRenderPipelineStateWithDescriptor:pipelineStateDescriptor error:nil];
// Create source and destination texture descriptor
// Since the compute kernel function doesn't check if pixels are within the bounds of the destination texture, make sure texture width
// and height are multiples of the pipeline's threadExecutionWidth and (maxTotalThreadsPerThreadgroup / threadExecutionWidth) respectively.
MTLTextureDescriptor *textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatBGRA8Unorm
width:2048
height:2048
mipmapped:NO];
// Create source texture
textureDescriptor.usage = MTLTextureUsageShaderRead;
_srcTexture = [_device newTextureWithDescriptor:textureDescriptor];
// Create destination texture
textureDescriptor.usage = MTLTextureUsageShaderWrite | MTLTextureUsageRenderTarget;
_dstTexture = [_device newTextureWithDescriptor:textureDescriptor];
// Set the compute kernel's threadgroup size
NSUInteger threadWidth = _computePipelineState.threadExecutionWidth;
NSUInteger threadMax = _computePipelineState.maxTotalThreadsPerThreadgroup;
_threadgroupSize = MTLSizeMake(threadWidth, threadMax / threadWidth, 1);
// Set the compute kernel's threadgroup count
_threadgroupCount.width = (_srcTexture.width + _threadgroupSize.width - 1) / _threadgroupSize.width;
_threadgroupCount.height = (_srcTexture.height + _threadgroupSize.height - 1) / _threadgroupSize.height;
_threadgroupCount.depth = 1;
}
The Metal compute pipeline is effectively unusable on A7-class CPU/GPU devices. The same compute pipeline performs great on A8 and newer devices. Your options for dealing with this are to create fragment shader implementations for A7 devices and use compute logic for all newer devices, or to offload the computation to the CPUs on A7 (there are at least two CPU cores on that device class). You could also just use fragment shaders for all devices, but much better performance on complex code is possible with compute kernels, so it is something to think about.
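For illustration, a minimal sketch (not from the original answer) of that per-device branching, assuming the compute and render methods shown above; an A7 GPU reports only the MTLFeatureSet_iOS_GPUFamily1 feature sets, while A8 and newer also report GPUFamily2 (on newer SDKs, supportsFamily: is the preferred check):
// Hypothetical device check to pick a pipeline per GPU generation.
BOOL computeIsFast = [_device supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily2_v1];
if (computeIsFast) {
    [self compute];   // A8 or newer: the compute kernel performs well
} else {
    [self render];    // A7: fall back to the fragment-shader pass
}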
I'm trying to blit a buffer to a PVRTC texture. The reason I'm doing this is that I want to keep the texture in private storage.
Here is a quote from the documentation:
If the texture's pixel format is a compressed format, then sourceSize
must be a multiple of the pixel format's block size or be clamped to
the edge of the texture if the block extends outside the bounds of a
texture. For a compressed format, sourceBytesPerRow is the number of
bytes from the start of one row of blocks to the start of the next row
of blocks.
Something is wrong in my code, because the texture looks broken afterwards.
MTLBlitOption options = MTLBlitOptionNone;
if (_pixelFormat == MTLPixelFormatPVRTC_RGB_4BPP || _pixelFormat == MTLPixelFormatPVRTC_RGBA_4BPP) {
uint32_t blockWidth = 4;
uint32_t blockHeight = 4;
uint32_t bitsPerPixel = 4;
uint32_t blockSize = blockWidth * blockHeight;
uint32_t widthInBlocks = width / blockWidth;
uint32_t heightInBlocks = height / blockHeight;
options = MTLBlitOptionRowLinearPVRTC;
levelBytesPerRow = widthInBlocks * ((blockSize * bitsPerPixel) / 8);
}
id <MTLBuffer> buffer = [device newBufferWithBytes:[data bytes] length:[data length] options:0];
[blitEncoder copyFromBuffer:buffer
sourceOffset:0
sourceBytesPerRow:levelBytesPerRow
sourceBytesPerImage:[buffer length]
sourceSize:MTLSizeMake(width, height, 1)
toTexture:self.textureMetal
destinationSlice:0
destinationLevel:i
destinationOrigin:MTLOriginMake(0, 0, 0)
options:options];
I'm trying to get a CVPixelBuffer in RGB color space from Apple's ARKit. In the func session(_ session: ARSession, didUpdate frame: ARFrame) method of ARSessionDelegate I get an instance of ARFrame. On the page Displaying an AR Experience with Metal I found that this pixel buffer is in the YCbCr (YUV) color space.
I need to convert this to the RGB color space (I actually need a CVPixelBuffer and not a UIImage). I've found something about color conversion on iOS, but I was not able to get it working in Swift 3.
There are several ways to do this, depending on what you're after. The best way to do this in realtime (say, to render the buffer to a view) is to use a custom shader to convert the YCbCr CVPixelBuffer to RGB.
Using Metal:
If you make a new project, select "Augmented Reality App," and select "Metal" for the content technology, the project generated will contain the code and shaders necessary to make this conversion.
Using OpenGL:
The GLCameraRipple example from Apple uses an AVCaptureSession to capture the camera, and shows how to map the resulting CVPixelBuffer to GL textures, which are then converted to RGB in shaders (again, provided in the example).
Non-realtime:
The answer to this stackoverflow question addresses converting the buffer to a UIImage, and offers a pretty simple way to do it.
I was also stuck on this question for several days. All of the code snippets I could find on the Internet for converting a CVPixelBuffer to a UIImage were written in Objective-C rather than Swift.
Finally, the following code snippet works perfectly for me to convert a YUV image to either JPG or PNG file format, after which you can write it to a local file in your application.
func pixelBufferToUIImage(pixelBuffer: CVPixelBuffer) -> UIImage {
let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
let context = CIContext(options: nil)
let cgImage = context.createCGImage(ciImage, from: ciImage.extent)
let uiImage = UIImage(cgImage: cgImage!)
return uiImage
}
The docs explicitly say that you need to access the luma and chroma planes:
ARKit captures pixel buffers in a planar YCbCr format (also known as YUV) format. To render these images on a device display, you'll need to access the luma and chroma planes of the pixel buffer and convert pixel values to an RGB format.
So there's no way to directly get the RGB planes; you'll have to handle this in your shaders, either in Metal or OpenGL, as described by @joshue.
You may want the Accelerate framework's image conversion functions. Perhaps a combination of vImageConvert_420Yp8_Cb8_Cr8ToARGB8888 and vImageConvert_ARGB8888toRGB888 (if you don't want the alpha channel). In my experience these work in real time.
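For illustration, here is a rough sketch of that approach (not from the original answer), using the bi-planar variant vImageConvert_420Yp8_CbCr8ToARGB8888, which matches ARKit's two-plane (Y plus interleaved CbCr) buffers. The conversion matrix (601 vs. 709) and full-range assumption should be matched to your buffer's attachments, and the caller is assumed to supply a dest buffer of width * height * 4 bytes:
#import <Accelerate/Accelerate.h>
#import <CoreVideo/CoreVideo.h>

static vImage_Error ARGBFromYpCbCrBuffer(CVPixelBufferRef pixelBuffer, vImage_Buffer *dest)
{
    CVPixelBufferLockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);

    // Wrap the two planes of the pixel buffer as vImage buffers.
    vImage_Buffer yPlane = {
        .data = CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0),
        .height = CVPixelBufferGetHeightOfPlane(pixelBuffer, 0),
        .width = CVPixelBufferGetWidthOfPlane(pixelBuffer, 0),
        .rowBytes = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0)
    };
    vImage_Buffer cbCrPlane = {
        .data = CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1),
        .height = CVPixelBufferGetHeightOfPlane(pixelBuffer, 1),
        .width = CVPixelBufferGetWidthOfPlane(pixelBuffer, 1),
        .rowBytes = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1)
    };

    // Assumed full-range 8-bit YCbCr (kCVPixelFormatType_420YpCbCr8BiPlanarFullRange).
    vImage_YpCbCrPixelRange pixelRange = { 0, 128, 255, 255, 255, 1, 255, 0 };
    vImage_YpCbCrToARGB info;
    vImageConvert_YpCbCrToARGB_GenerateConversion(kvImage_YpCbCrToARGBMatrix_ITU_R_601_4,
                                                  &pixelRange, &info,
                                                  kvImage420Yp8_CbCr8, kvImageARGB8888,
                                                  kvImageNoFlags);

    const uint8_t permuteMap[4] = { 0, 1, 2, 3 };   // keep ARGB channel order
    vImage_Error err = vImageConvert_420Yp8_CbCr8ToARGB8888(&yPlane, &cbCrPlane, dest,
                                                            &info, permuteMap,
                                                            255 /* alpha */, kvImageNoFlags);

    CVPixelBufferUnlockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);
    return err;   // strip alpha afterwards with vImageConvert_ARGB8888toRGB888 if needed
}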
I struggled with this for a long while as well, and ended up writing the following code, which works for me:
// Helper macro to ensure pixel values are bounded between 0 and 255
#define clamp(a) (a > 255 ? 255 : (a < 0 ? 0 : a))
- (void)processImageBuffer:(CVImageBufferRef)imageBuffer
{
OSType type = CVPixelBufferGetPixelFormatType(imageBuffer);
if (type == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange)
{
CVPixelBufferLockBaseAddress(imageBuffer, 0);
// We know the return format of the base address based on the YpCbCr8BiPlanarFullRange format (as per doc)
StandardBuffer baseAddress = (StandardBuffer)CVPixelBufferGetBaseAddress(imageBuffer);
// Get the number of bytes per row for the pixel buffer, width and height
size_t bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer);
size_t width = CVPixelBufferGetWidth(imageBuffer);
size_t height = CVPixelBufferGetHeight(imageBuffer);
// Get buffer info and planar pixel data
CVPlanarPixelBufferInfo_YCbCrBiPlanar *bufferInfo = (CVPlanarPixelBufferInfo_YCbCrBiPlanar *)baseAddress;
uint8_t* cbrBuff = (uint8_t *)CVPixelBufferGetBaseAddressOfPlane(imageBuffer, 1);
// This just moved the pointer past the offset
baseAddress = (uint8_t *)CVPixelBufferGetBaseAddressOfPlane(imageBuffer, 0);
int bytesPerPixel = 4;
uint8_t *rgbData = rgbFromYCrCbBiPlanarFullRangeBuffer(baseAddress,
cbrBuff,
bufferInfo,
width,
height,
bytesPerRow);
[self doStuffOnRGBBuffer:rgbData width:width height:height bitsPerComponent:8 bytesPerPixel:bytesPerPixel bytesPerRow:bytesPerRow];
free(rgbData);
CVPixelBufferUnlockBaseAddress(imageBuffer, 0);
}
else
{
NSLog(#"Unsupported image buffer type");
}
}
uint8_t * rgbFromYCrCbBiPlanarFullRangeBuffer(uint8_t *inBaseAddress,
uint8_t *cbCrBuffer,
CVPlanarPixelBufferInfo_YCbCrBiPlanar * inBufferInfo,
size_t inputBufferWidth,
size_t inputBufferHeight,
size_t inputBufferBytesPerRow)
{
int bytesPerPixel = 4;
NSUInteger yPitch = EndianU32_BtoN(inBufferInfo->componentInfoY.rowBytes);
uint8_t *rgbBuffer = (uint8_t *)malloc(inputBufferWidth * inputBufferHeight * bytesPerPixel);
NSUInteger cbCrPitch = EndianU32_BtoN(inBufferInfo->componentInfoCbCr.rowBytes);
uint8_t *yBuffer = (uint8_t *)inBaseAddress;
for(int y = 0; y < inputBufferHeight; y++)
{
uint8_t *rgbBufferLine = &rgbBuffer[y * inputBufferWidth * bytesPerPixel];
uint8_t *yBufferLine = &yBuffer[y * yPitch];
uint8_t *cbCrBufferLine = &cbCrBuffer[(y >> 1) * cbCrPitch];
for(int x = 0; x < inputBufferWidth; x++)
{
int16_t yVal = yBufferLine[x];   // renamed to avoid shadowing the row index y
int16_t cb = cbCrBufferLine[x & ~1] - 128;
int16_t cr = cbCrBufferLine[x | 1] - 128;
uint8_t *rgbOutput = &rgbBufferLine[x*bytesPerPixel];
int16_t r = (int16_t)roundf( yVal + cr * 1.4 );
int16_t g = (int16_t)roundf( yVal + cb * -0.343 + cr * -0.711 );
int16_t b = (int16_t)roundf( yVal + cb * 1.765 );
// ABGR image representation
rgbOutput[0] = 0Xff;
rgbOutput[1] = clamp(b);
rgbOutput[2] = clamp(g);
rgbOutput[3] = clamp(r);
}
}
return rgbBuffer;
}
I want to pass an OpenCV Mat to a self-written OpenCL kernel for an FPGA (which doesn't support OpenCV's OpenCL module).
Host code:
Mat img = imread( "template.jpg", IMREAD_GRAYSCALE );
Mat output(img.rows, img.cols, CV_8UC1);
// Program, Context already declared
// Create Kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "copy", &status);
// Create Command Queue and associate it with the device you want to execute on
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0], 0, &status);
// Buffers, probably I'm doing something wrong here
cl_mem buffer_img = clCreateBuffer(context,CL_MEM_READ_ONLY, sizeof(uint) * img.cols * img.rows, NULL,&status);
cl_mem buffer_outputimg = clCreateBuffer(context,CL_MEM_WRITE_ONLY, sizeof(uint) * img.cols * img.rows,NULL,&status);
status = clEnqueueWriteBuffer(cmdQueue, buffer_img,CL_FALSE,0,sizeof(uint) * img.cols * img.rows,&img,0,NULL,NULL);
// set kernel arguments
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&buffer_img);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&buffer_outputimg);
size_t globalWorkSize[2];
globalWorkSize[0] = img.cols;
globalWorkSize[1] = img.rows;
status = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL, globalWorkSize, NULL,0, NULL,NULL);
clEnqueueReadBuffer(cmdQueue,buffer_outputimg,CL_TRUE,0,sizeof(uint) * img.cols * img.rows, &output, 0, NULL, NULL);
// stop the CPU until the queue is finished
clFinish(cmdQueue);
Kernel code:
__kernel void copy(__global uchar * input, __global uchar * output)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
//copy
output[y * get_global_size(0) + x] = input[y * get_global_size(0) + x] ;
}
When executing it on the FPGA I get a segmentation fault, which is probably due to wrong handling of the OpenCV Mat.
EDIT:
Editing the host code as suggested by api55 solved the problem:
Mat img = imread( "scene.jpg", IMREAD_GRAYSCALE );
Mat output(img.rows, img.cols, CV_8UC1);
// Program, Context already declared
// Create Kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "copy", &status);
// Create Command Queue and associate it with the device you want to execute on
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0], 0, &status);
checkError(status, "Failed to create commadnqueue");
// Buffer
cl_mem buffer_img = clCreateBuffer(context,CL_MEM_READ_ONLY, sizeof(uchar) * img.cols * img.rows, NULL,&status);
cl_mem buffer_outputimg = clCreateBuffer(context,CL_MEM_WRITE_ONLY, sizeof(uchar) * img.cols * img.rows,NULL,&status);
checkError(status, "Failed to create buffer_mask");
status = clEnqueueWriteBuffer(cmdQueue, buffer_img,CL_FALSE,0,sizeof(uchar) * img.cols * img.rows,img.data,0,NULL,NULL);
checkError(status, "Failed to enqueue buffer_img");
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&buffer_img);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&buffer_outputimg);
size_t globalWorkSize[2];
globalWorkSize[0] = img.cols;
globalWorkSize[1] = img.rows;
status = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL, globalWorkSize, NULL,0, NULL,NULL);
clEnqueueReadBuffer(cmdQueue,buffer_outputimg,CL_TRUE,0,sizeof(uchar) * img.cols * img.rows, output.data,0,NULL,NULL);
imwrite("output.jpg", output);
I do not have much experience with OpenCL, but I think it is an OpenCV/C++ problem.
The OpenCV Mat data lies in img.data, which is a uchar* of size sizeof(T) * channels * rows * cols.
Usually T is uchar when loading images, and channels is 3 (unless it is a greyscale image). A 3-channel uchar image is 24 bits per pixel, and greyscale (as you are loading) is 8 bits per pixel, but you are using uint, which is 32 bits. At some point the copy will go outside the allocated memory and cause the segmentation error. Also, if you pass the Mat structure instead of its data pointer, you copy the header information and just the pointer to the data, not the data itself.
I suggest you change &img in:
status = clEnqueueWriteBuffer(cmdQueue, buffer_img,CL_FALSE,0,sizeof(uint) * img.cols * img.rows,&img,0,NULL,NULL);
to img.data
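For illustration, the corrected write could look like this (a sketch, not the asker's final code, that sizes the buffer from the Mat itself rather than a hard-coded sizeof(uint)):
size_t imgBytes = img.total() * img.elemSize();   // rows * cols * 1 byte for CV_8UC1
cl_mem buffer_img = clCreateBuffer(context, CL_MEM_READ_ONLY, imgBytes, NULL, &status);
status = clEnqueueWriteBuffer(cmdQueue, buffer_img, CL_FALSE, 0, imgBytes,
                              img.data, 0, NULL, NULL);   // pass the pixel data, not &img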
Finally, you need to use the correct data type. I am not sure whether OpenCL can use uchar, but if it can't, change the cv::Mat to another type like this:
img.convertTo(img, CV_32S);
after loading the image. This will change it to int (OpenCV does not support matrices of unsigned int). Just make sure to change the sizes accordingly in the other places (i.e. sizeof(uint)), and if you convert the input, remember to create the output with the same type.
If you prefer float, use CV_32F, and for double, CV_64F.