Using 32 bit ARGB camera buffer to initialize OpenCV Mat - opencv

I'm trying to initialize a Mat using a camera buffer that is holding a 32 bit ARGB frame. These are the steps I have taken till now:
cv::Mat src = cv::Mat(cv::Size(img_height, img_width),CV_8UC4);
memcpy(src.ptr(), (void*) img_buffer,img_height * img_width * 4);
cv::Mat dest= src.clone();
cv::cvtColor(src,dest,COLOR_BGRA2BGR);
This leads to a segfault. Still occurs even if dest is initialized as
cv::Mat dest=cv::Mat(src.size(),src.type());
Would appreciate any help on this.
UPDATE
So I'm trying to untangle the order manually, like this:
int rgb_temp[4];
for(int y=0; y < (int)img_height; y++) {
for(int x=0; x < (int)img_width; x++) {
rgb_temp[0] = (unsigned int)img_buffer[(int)img_stride * y + x + 0]; // A
rgb_temp[1] = (unsigned int)img_buffer[(int)img_stride * y + x + 1]; // R
rgb_temp[2] = (unsigned int)img_buffer[(int)img_stride * y + x + 2]; // G
rgb_temp[3] = (unsigned int)img_buffer[(int)img_stride * y + x + 3]; // B
src.data[ (y + x) + 0] = rgb_temp[3]; // B
src.data[ (y + x) + 1] = rgb_temp[2]; // G
src.data[ (y + x) + 2] = rgb_temp[1]; // R
src.data[ (y + x) + 3] = rgb_temp[0]; // A
}
}
But to no avail. I am able to read the ARGB values from the img_buffer but am unable to write to the src.data. Is this a right way to take?

You could use the following construction:
Mat::Mat(int rows, int cols, int type, void* data, size_t step=AUTO_STEP)
which maps your data into the OpenCV format in your case this is:
cv::Mat src(img_height, img_width ,CV_8UC4, img_buffer)
cv::Mat dst;
src.copyTo(dst);
but be carefull the first line is not copying the data.

Related

Separable gaussian blur - optimize vertical pass

I have implemented separable Gaussian blur. Horizontal pass was relatively easy to optimize with SIMD processing. However, I am not sure how to optimize vertical pass.
Accessing elements is not very cache friendly and filling SIMD lane would mean reading many different pixels. I was thinking about transpose the image and run horizontal pass and then transpose image back, however, I am not sure if it will gain any improvement because of two tranpose operations.
I have quite large images 16k resolution and kernel size is 19, so vectorization of vertical pass gain was about 15%.
My Vertical pass is as follows (it is sinde generic class typed to T which can be uint8_t or float):
int yStart = kernelHalfSize;
int xStart = kernelHalfSize;
int yEnd = input.GetWidth() - kernelHalfSize;
int xEnd = input.GetHeigh() - kernelHalfSize;
const T * inData = input.GetData().data();
V * outData = output.GetData().data();
int kn = kernelHalfSize * 2 + 1;
int kn4 = kn - kn % 4;
for (int y = yStart; y < yEnd; y++)
{
size_t yW = size_t(y) * output.GetWidth();
size_t outX = size_t(xStart) + yW;
size_t xEndSimd = xStart;
int len = xEnd - xStart;
len = len - len % 4;
xEndSimd = xStart + len;
for (int x = xStart; x < xEndSimd; x += 4)
{
size_t inYW = size_t(y) * input.GetWidth();
size_t x0 = ((x + 0) - kernelHalfSize) + inYW;
size_t x1 = x0 + 1;
size_t x2 = x0 + 2;
size_t x3 = x0 + 3;
__m128 sumDot = _mm_setzero_ps();
int i = 0;
for (; i < kn4; i += 4)
{
__m128 kx = _mm_set_ps1(kernelDataX[i + 0]);
__m128 ky = _mm_set_ps1(kernelDataX[i + 1]);
__m128 kz = _mm_set_ps1(kernelDataX[i + 2]);
__m128 kw = _mm_set_ps1(kernelDataX[i + 3]);
__m128 dx, dy, dz, dw;
if constexpr (std::is_same<T, uint8_t>::value)
{
//we need co convert uint8_t inputs to float
__m128i u8_0 = _mm_loadu_si128((const __m128i*)(inData + x0));
__m128i u8_1 = _mm_loadu_si128((const __m128i*)(inData + x1));
__m128i u8_2 = _mm_loadu_si128((const __m128i*)(inData + x2));
__m128i u8_3 = _mm_loadu_si128((const __m128i*)(inData + x3));
__m128i u32_0 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_0, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_1 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_1, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_2 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_2, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_3 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_3, _mm_setzero_si128()),
_mm_setzero_si128());
dx = _mm_cvtepi32_ps(u32_0);
dy = _mm_cvtepi32_ps(u32_1);
dz = _mm_cvtepi32_ps(u32_2);
dw = _mm_cvtepi32_ps(u32_3);
}
else
{
/*
//load 8 consecutive values
auto dd = _mm256_loadu_ps(inData + x0);
//extract parts by shifting and casting to 4 values float
dx = _mm256_castps256_ps128(dd);
dy = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 4, 3, 2, 1)));
dz = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 3, 2)));
dw = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 6, 5, 4, 3)));
*/
dx = _mm_loadu_ps(inData + x0);
dy = _mm_loadu_ps(inData + x1);
dz = _mm_loadu_ps(inData + x2);
dw = _mm_loadu_ps(inData + x3);
}
//calculate 4 dots at once
//[dx, dy, dz, dw] <dot> [kx, ky, kz, kw]
auto mx = _mm_mul_ps(dx, kx); //dx * kx
auto my = _mm_fmadd_ps(dy, ky, mx); //mx + dy * ky
auto mz = _mm_fmadd_ps(dz, kz, my); //my + dz * kz
auto res = _mm_fmadd_ps(dw, kw, mz); //mz + dw * kw
sumDot = _mm_add_ps(sumDot, res);
x0 += 4;
x1 += 4;
x2 += 4;
x3 += 4;
}
for (; i < kn; i++)
{
auto v = _mm_set_ps1(kernelDataX[i]);
auto v2 = _mm_set_ps(
*(inData + x3), *(inData + x2),
*(inData + x1), *(inData + x0)
);
sumDot = _mm_add_ps(sumDot, _mm_mul_ps(v, v2));
x0++;
x1++;
x2++;
x3++;
}
sumDot = _mm_mul_ps(sumDot, _mm_set_ps1(weightX));
if constexpr (std::is_same<V, uint8_t>::value)
{
__m128i asInt = _mm_cvtps_epi32(sumDot);
asInt = _mm_packus_epi32(asInt, asInt);
asInt = _mm_packus_epi16(asInt, asInt);
uint32_t res = _mm_cvtsi128_si32(asInt);
((uint32_t *)(outData + outX))[0] = res;
outX += 4;
}
else
{
float tmpRes[4];
_mm_store_ps(tmpRes, sumDot);
outData[outX + 0] = tmpRes[0];
outData[outX + 1] = tmpRes[1];
outData[outX + 2] = tmpRes[2];
outData[outX + 3] = tmpRes[3];
outX += 4;
}
}
for (int x = xEndSimd; x < xEnd; x++)
{
int kn = kernelHalfSize * 2 + 1;
const T * v = input.GetPixelStart(x - kernelHalfSize, y);
float tmp = 0;
for (int i = 0; i < kn; i++)
{
tmp += kernelDataX[i] * v[i];
}
tmp *= weightX;
outData[outX] = ImageUtils::clamp_cast<V>(tmp);
outX++;
}
}
There’s a well-known trick for that.
While you compute both passes, read them sequentially, use SIMD to compute, but write out the result into another buffer, transposed, using scalar stores. Protip: SSE 4.1 has _mm_extract_ps just don’t forget to cast your destination image pointer from float* into int*. Another thing about these stores, I would recommend using _mm_stream_si32 for that as you want maximum cache space used by your input data. When you’ll be computing the second pass, you’ll be reading sequential memory addresses again, the prefetcher hardware will deal with the latency.
This way both passes will be identical, I usually call same function twice, with different buffers.
Two transposes caused by your 2 passes cancel each other. Here’s an HLSL version, BTW.
There’s more. If your kernel size is only 19, that fits in 3 AVX registers. I think shuffle/permute/blend instructions are still faster than even L1 cache loads, i.e. it might be better to load the kernel outside the loop.

Copy cv::Mat into CMSampleBufferRef

How can I copy cv::Mat data back into the sampleBuffer?
My scenario as follow :
I create a cv::Mat from pixelBuffer for landmark detection and add the landmarks to cv::Mat image data. I'd like to copy this cv::Mat into the sample buffer to be shown with the landmark.
Is this possible ?
I achieved this with dlib but need to know how to do it with cv::mat:
char *baseBuffer = (char *)CVPixelBufferGetBaseAddress(imageBuffer);
img.reset();
long position = 0;
while (img.move_next()) {
dlib::bgr_pixel& pixel = img.element();
long bufferLocation = position * 4; //(row * width + column) * 4;
char b = baseBuffer[bufferLocation];
char g = baseBuffer[bufferLocation + 1];
char r = baseBuffer[bufferLocation + 2];
dlib::bgr_pixel newpixel(b, g, r);
pixel = newpixel;
position++;
}
I am answering my own question.
First thing, you need to access the pixel data of cv::mat Image, I followed this great solution
Then you need to copy pixel into the buffer starting from the basebuffer. Following code should help you achieve this :
CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
char *baseBuffer = (char *)CVPixelBufferGetBaseAddress(imageBuffer);
long position = 0;
uint8_t* pixelPtr = (uint8_t*)targetImage.data;
int cn = targetImage.channels();
cv::Scalar_<uint8_t> rgbPixel;
for(int i = 0; i < targetImage.rows; i++)
{
for(int j = 0; j < targetImage.cols; j++)
{
long bufferLocation = position * 4;
rgbPixel.val[0] = pixelPtr[i*targetImage.cols*cn + j*cn + 0]; // B
rgbPixel.val[1] = pixelPtr[i*targetImage.cols*cn + j*cn + 1]; // G
rgbPixel.val[2] = pixelPtr[i*targetImage.cols*cn + j*cn + 2]; // R
baseBuffer[bufferLocation] = rgbPixel.val[2];
baseBuffer[bufferLocation + 1] = rgbPixel.val[1];
baseBuffer[bufferLocation + 2] = rgbPixel.val[0];
position++;
}
}
Some things to take note of
make sure you CVPixelBufferLockBaseAddress and
CVPixelBufferUnlockBaseAddress before and after the operation. I
am doing this on CV_8UC3, you might want to check your cv::mat
type.
I haven't done the performance analysis but I am getting smooth output with this.

Converting cv::Mat to MTLTexture

An intermediate step of my current project requires conversion of opencv's cv::Mat to MTLTexture, the texture container of Metal. I need to store the Floats in the Mat as Floats in the texture; my project cannot quite afford the loss of precision.
This is my attempt at such a conversion.
- (id<MTLTexture>)texForMat:(cv::Mat)image context:(MBEContext *)context
{
id<MTLTexture> texture;
int width = image.cols;
int height = image.rows;
Float32 *rawData = (Float32 *)calloc(height * width * 4,sizeof(float));
int bytesPerPixel = 4;
int bytesPerRow = bytesPerPixel * width;
float r, g, b,a;
for(int i = 0; i < height; i++)
{
Float32* imageData = (Float32*)(image.data + image.step * i);
for(int j = 0; j < width; j++)
{
r = (Float32)(imageData[4 * j]);
g = (Float32)(imageData[4 * j + 1]);
b = (Float32)(imageData[4 * j + 2]);
a = (Float32)(imageData[4 * j + 3]);
rawData[image.step * (i) + (4 * j)] = r;
rawData[image.step * (i) + (4 * j + 1)] = g;
rawData[image.step * (i) + (4 * j + 2)] = b;
rawData[image.step * (i) + (4 * j + 3)] = a;
}
}
MTLTextureDescriptor *textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
width:width
height:height
mipmapped:NO];
texture = [context.device newTextureWithDescriptor:textureDescriptor];
MTLRegion region = MTLRegionMake2D(0, 0, width, height);
[texture replaceRegion:region mipmapLevel:0 withBytes:rawData bytesPerRow:bytesPerRow];
free(rawData);
return texture;
}
But it doesn't seem to be working. It reads zeroes every time from the Mat, and throws up EXC_BAD_ACCESS. I need the MTLTexture in MTLPixelFormatRGBA16Float to keep the precision.
Thanks for considering this issue.
One problem here is you’re loading up rawData with Float32s but your texture is RGBA16Float, so the data will be corrupted (16Float is half the size of Float32). This shouldn’t cause your crash, but it’s an issue you’ll have to deal with.
Also as “chappjc” noted you’re using ‘image.step’ when writing your data out, but that buffer should be contiguous and not ever have a step that’s not just (width * bytesPerPixel).

Reading pixel bytes from CFData: Arithmetic on a pointer to incomplete type

This code should get me each pixel's values starting from a CGImageRef:
UIImage* image = [UIImage imageNamed:#"mask.bmp"];
CGImageRef aCGImageRef = image.CGImage;
CFDataRef rawData = CGDataProviderCopyData(CGImageGetDataProvider(aCGImageRef));
UInt8 * buf = (UInt8 *) CFDataGetBytePtr(rawData);
int length = CFDataGetLength(rawData);
CFRelease(rawData);
int no_of_channels = 3;
int image_width = SCREEN_WIDTH();
unsigned long row_stride = image_width * no_of_channels; // 960 bytes in this case
unsigned long x_offset = x * no_of_channels;
/* assuming RGB byte order (as opposed to BGR) */
UInt8 r = *(rawData + row_stride * y + x_offset );
UInt8 g = *(rawData + row_stride * y + x_offset + 1);
UInt8 b = *(rawData + row_stride * y + x_offset + 2);
These last three lines would do the trick, but the compiler says it won't do it with x and y as floats. So I casted them to int, but now it says
Arithmetic on a pointer to an incomplete type const struct __CFData
How do I fix that?
You want to do your arithmetic on the byte pointer itself, not to the CFData struct (which has the bytes as a member). That means using the buf variable from above:
UInt8 r = *(buf + row_stride * y + x_offset );
UInt8 g = *(buf + row_stride * y + x_offset + 1);
UInt8 b = *(buf + row_stride * y + x_offset + 2);

Converting a 24-bit PNG image to an array of GLubytes

I'd like to do the following:
Read RGB color values from a 24 bit PNG image
Average the RGB values and store them into an array of Glubytes.
I have provided my function that I was hoping would perform these 2 steps.
My function returns an array of Glubytes, however all elements have a value of 0.
So im guessing im reading the image data incorrectly.
What am i going wrong in reading the image? (perhaps my format is incorrect).
Here is my function:
+ (GLubyte *) LoadPhotoAveragedIndexPNG:(UIImage *)image numPixelComponents: (int)numComponents
{
// Load an image and return byte array.
CGImageRef textureImage = image.CGImage;
if (textureImage == nil)
{
NSLog(#"LoadPhotoIndexPNG: Failed to load texture image");
return nil;
}
NSInteger texWidth = CGImageGetWidth(textureImage);
NSInteger texHeight = CGImageGetHeight(textureImage);
CGColorSpaceRef colorSpace = CGColorSpaceCreateDeviceRGB();
GLubyte *indexedData = (GLubyte *)malloc(texWidth * texHeight);
GLubyte *rawData = (GLubyte *)malloc(texWidth * texHeight * numComponents);
CGContextRef textureContext = CGBitmapContextCreate(
rawData,
texWidth,
texHeight,
8,
texWidth * numComponents,
colorSpace,
kCGImageAlphaPremultipliedLast);
CGColorSpaceRelease(colorSpace);
CGContextDrawImage(textureContext,
CGRectMake(0.0, 0.0, (float)texWidth, (float)texHeight),
textureImage);
CGContextRelease(textureContext);
int rawDataLength = texWidth * texHeight * numComponents;
for (int i = 0, j = 0; i < rawDataLength; i += numComponents)
{
GLubyte b = rawData[i];
GLubyte g = rawData[i + 1];
GLubyte r = rawData[i + 2];
indexedData[j++] = (r + g + b) / 3;
}
return indexedData;
}
Here is the test image im loading (RGB colorspace in PNG format):
Do check with some logging if the parameters b,g and r are producing normal values in the last for loop. Where you made a mistake is indexedData[j++] = (r + g + b) / 3; those 3 parameters are sizeof 1 byte and you can not sum them up like that. Use a larger integer, typecast them and typecast the result back to array. (You are most likely getting overflow)
Apart from your original problem there's a major problem here (maybe even related)
for (int i = 0, j = 0; i < rawDataLength; i += numComponents)
{
GLubyte b = rawData[i];
GLubyte g = rawData[i + 1];
GLubyte r = rawData[i + 2];
indexedData[j++] = (r + g + b) / 3;
}
Namely the expression
(r + g + b)
This expression will be performed on GLubyte sized integer operations. If the sum of r+g+b is larger than the type GLubyte can hold it will overflow. Whenever you're processing data through intermediary variables (good style!) choose the variable types large enough to hold the largest value you can encounter. Another method was casting the expression like
indexedData[j++] = ((uint16_t)r + (uint16_t)g + (uint16_t)b) / 3;
But that's cumbersome to read. Also if you're processing integers of a known size, use the types found in stdint.h. You know, that you're expecting 8 bits per channel. Also you can use the comma operator in the for increment clause
uint8_t *indexedData = (GLubyte *)malloc(texWidth * texHeight);
/* ... */
for (int i = 0, j = 0; i < rawDataLength; i += numComponents, j++)
{
uint16_t b = rawData[i];
uint16_t g = rawData[i + 1];
uint16_t r = rawData[i + 2];
indexedData[j] = (r + g + b) / 3;
}

Resources