I have several thousand TGA files (without palette) which contain RGBA4444 data (I know TGA files don't usually contain RGBA4444 data). I would like to convert them into RGBA8888 data. I use the following command line:
convert -depth 4 woody4.tga -depth 8 woody8.tga
In this case, woody4.tga is the original RGBA4444 file, and woody8.tga the target RGBA8888 file but it doesn't change the colors of my pictures, what am I missing?
Thanks,
Pierre
Edit:
Thanks very much Mark, I have successfully converted more than 10,000 TGA files with your program; the result is very good and faithful to the original TGA! This would have been impossible without the parallel command! Just one last point: I have around 50 larger TGA files (the backgrounds of the game) which are coded as RGBA5650 and not RGBA4444. How can I modify your program to handle RGBA5650? Thanks very much!
Oh, I see Eric beat me to it:-)
Hey ho! I did it a different way anyway and got a different answer so you can see which one you like best. I also wrote some C but I didn't rely on any libraries, I just read the TGA and converted it to a PAM format and let ImageMagick make that into PNG afterwards at command-line.
I chose PAM because it is the simplest file to write which supports transparency - see Wikipedia on PAM format.
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
/*
 * targa: convert an uncompressed 16-bit RGBA4444 TGA file to PAM on stdout.
 *
 * argv[1] names the input file. Width and height come from the 18-byte TGA
 * header; each following 2-byte pixel is expanded to 4 bytes (R, G, B, A)
 * with every 4-bit channel placed in the high nibble of its output byte.
 *
 * Returns 0 on success. Exits with status 1 when no input file is given,
 * the file cannot be opened, the header is truncated, or stdout fails.
 */
int main(int argc,char* argv[]){
    unsigned char buf[64];

    if(argc<2){
        fprintf(stderr,"Usage: %s <input.tga>\n",argc>0?argv[0]:"targa");
        exit(1);
    }
    FILE* fp=fopen(argv[1],"rb");
    if(fp==NULL){
        fprintf(stderr,"ERROR: Unable to open %s\n",argv[1]);
        exit(1);
    }
    // Read TGA header of 18 bytes, extract width and height
    // (12 bytes junk, 2 bytes width, 2 bytes height, 2 bytes junk)
    if(fread(buf,1,18,fp)!=18){
        fprintf(stderr,"ERROR: %s is too short to hold a TGA header\n",argv[1]);
        fclose(fp);
        exit(1);
    }
    unsigned short w=buf[12]|(buf[13]<<8);
    unsigned short h=buf[14]|(buf[15]<<8);
    // Header byte 0 is the length of the optional image ID field that sits
    // between the header and the pixel data; skip it so pixels stay aligned.
    if(buf[0]!=0 && fseek(fp,(long)buf[0],SEEK_CUR)!=0){
        fprintf(stderr,"ERROR: Unable to skip image ID in %s\n",argv[1]);
        fclose(fp);
        exit(1);
    }
    // Write PAM header
    fprintf(stdout,"P7\n");
    fprintf(stdout,"WIDTH %d\n",w);
    fprintf(stdout,"HEIGHT %d\n",h);
    fprintf(stdout,"DEPTH 4\n");
    fprintf(stdout,"MAXVAL 255\n");
    fprintf(stdout,"TUPLTYPE RGB_ALPHA\n");
    fprintf(stdout,"ENDHDR\n");
    // Read 2 bytes at a time RGBA4444. Stop after w*h pixels so that a
    // trailing TGA footer is never emitted as pixel data.
    size_t npixels=(size_t)w*h;
    size_t done;
    for(done=0;done<npixels;done++){
        unsigned char out[4];
        if(fread(buf,2,1,fp)!=1) break;
        out[0]=(buf[1]&0x0f)<<4;
        out[1]=buf[0]&0xf0;
        out[2]=(buf[0]&0x0f)<<4;
        out[3]=buf[1]&0xf0;
        // Write the 4 modified bytes out RGBA8888
        if(fwrite(out,4,1,stdout)!=1){
            fprintf(stderr,"ERROR: Unable to write to stdout\n");
            fclose(fp);
            exit(1);
        }
    }
    if(done<npixels){
        fprintf(stderr,"WARNING: %s holds fewer pixels than its header declares\n",argv[1]);
    }
    fclose(fp);
    return 0;
}
I then compile that with gcc:
gcc targa.c -o targa
Or you could use clang:
clang targa.c -o targa
and run it with
./targa someImage.tga > someImage.pam
and convert the PAM to PNG with ImageMagick at the command-line:
convert someImage.pam someImage.png
If you want to avoid writing the intermediate PAM file to disk, you can pipe it straight into convert like this:
./targa illu_evolution_01.tga | convert - result.png
You can, equally, make a BMP output file if you wish:
./targa illu_evolution_01.tga | convert - result.bmp
If you have thousands of files to do, and you are on a Mac or Linux, you can use GNU Parallel and get them all done in parallel much faster like this:
parallel --eta './targa {} | convert - {.}.png' ::: *.tga
If you have more than a couple of thousand files, you may get "Argument list too long" errors, in which case, use the slightly harder syntax:
find . -name \*tga -print0 | parallel -0 --eta './targa {} | convert - {.}.png'
On a Mac, you would install GNU Parallel with homebrew using:
brew install parallel
For your RGBA5650 images, I will fall back to PPM as my intermediate format because the alpha channel of PAM is no longer needed. The code will now look like this:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
/*
 * targa (RGB565 variant): convert an uncompressed 16-bit RGBA5650 TGA file
 * to a binary PPM (P6) stream on stdout.
 *
 * argv[1] names the input file. Width and height come from the 18-byte TGA
 * header; each following 2-byte pixel is expanded to 3 bytes (R, G, B).
 *
 * Returns 0 on success. Exits with status 1 when no input file is given,
 * the file cannot be opened, the header is truncated, or stdout fails.
 */
int main(int argc,char* argv[]){
    unsigned char buf[64];

    if(argc<2){
        fprintf(stderr,"Usage: %s <input.tga>\n",argc>0?argv[0]:"targa");
        exit(1);
    }
    FILE* fp=fopen(argv[1],"rb");
    if(fp==NULL){
        fprintf(stderr,"ERROR: Unable to open %s\n",argv[1]);
        exit(1);
    }
    // Read TGA header of 18 bytes, extract width and height
    // (12 bytes junk, 2 bytes width, 2 bytes height, 2 bytes junk)
    if(fread(buf,1,18,fp)!=18){
        fprintf(stderr,"ERROR: %s is too short to hold a TGA header\n",argv[1]);
        fclose(fp);
        exit(1);
    }
    unsigned short w=buf[12]|(buf[13]<<8);
    unsigned short h=buf[14]|(buf[15]<<8);
    // Header byte 0 is the length of the optional image ID field that sits
    // between the header and the pixel data; skip it so pixels stay aligned.
    if(buf[0]!=0 && fseek(fp,(long)buf[0],SEEK_CUR)!=0){
        fprintf(stderr,"ERROR: Unable to skip image ID in %s\n",argv[1]);
        fclose(fp);
        exit(1);
    }
    // Write PPM header
    fprintf(stdout,"P6\n");
    fprintf(stdout,"%d %d\n",w,h);
    fprintf(stdout,"255\n");
    // Read 2 bytes at a time RGBA5650. Stop after w*h pixels so that a
    // trailing TGA footer is never emitted as pixel data.
    size_t npixels=(size_t)w*h;
    size_t done;
    for(done=0;done<npixels;done++){
        unsigned char out[3];
        if(fread(buf,2,1,fp)!=1) break;
        // Little-endian 16-bit word: RRRRRGGG GGGBBBBB (buf[1] buf[0])
        out[0]=buf[1]&0xf8;
        out[1]=((buf[1]&7)<<5) | ((buf[0]>>3)&0x1c);
        out[2]=(buf[0]&0x1f)<<3;
        // Write the 3 modified bytes out RGB888
        if(fwrite(out,3,1,stdout)!=1){
            fprintf(stderr,"ERROR: Unable to write to stdout\n");
            fclose(fp);
            exit(1);
        }
    }
    if(done<npixels){
        fprintf(stderr,"WARNING: %s holds fewer pixels than its header declares\n",argv[1]);
    }
    fclose(fp);
    return 0;
}
And will compile and run exactly the same way.
Updated answer.
After reading a few documents about TARGA format. I've revised + simplified a C program to convert.
// tga2img.c
#include <stdio.h>
#include <stdlib.h>
#include <wand/MagickWand.h>
/* Fields of the 18-byte TGA file header, in file order. */
typedef struct {
    /* Identification and type */
    unsigned char idlength;
    unsigned char colourmaptype;
    unsigned char datatypecode;
    /* Colour map description */
    short colourmaporigin;
    short colourmaplength;
    unsigned char colourmapdepth;
    /* Image placement and dimensions */
    short x_origin;
    short y_origin;
    short width;
    short height;
    /* Pixel layout */
    unsigned char bitsperpixel;
    unsigned char imagedescriptor;
} HEADER;
/* Fields read from the last 26 bytes of a TGA file, in file order. */
typedef struct {
    int extensionoffset;      /* read as a 4-byte value */
    int developeroffset;      /* read as a 4-byte value */
    char signature[16];       /* 16 signature bytes */
    unsigned char p;          /* byte following the signature */
    unsigned char n;          /* final byte of the file */
} FOOTER;
int main(int argc, const char * argv[]) {
HEADER tga_header;
FOOTER tga_footer;
FILE
* fd;
size_t
tga_data_size,
tga_pixel_size,
i,
j;
unsigned char
* tga_data,
* buffer;
const char
* input,
* output;
if (argc != 3) {
printf("Usage:\n\t %s <input> <output>\n", argv[0]);
return 1;
}
input = argv[1];
output = argv[2];
fd = fopen(input, "rb");
if (fd == NULL) {
fprintf(stderr, "Unable to read TGA input\n");
return 1;
}
/********\
* TARGA *
\*********/
#pragma mark TARGA
// Read TGA header
fread(&tga_header.idlength, sizeof(unsigned char), 1, fd);
fread(&tga_header.colourmaptype, sizeof(unsigned char), 1, fd);
fread(&tga_header.datatypecode, sizeof(unsigned char), 1, fd);
fread(&tga_header.colourmaporigin, sizeof( short int), 1, fd);
fread(&tga_header.colourmaplength, sizeof( short int), 1, fd);
fread(&tga_header.colourmapdepth, sizeof(unsigned char), 1, fd);
fread(&tga_header.x_origin, sizeof( short int), 1, fd);
fread(&tga_header.y_origin, sizeof( short int), 1, fd);
fread(&tga_header.width, sizeof( short int), 1, fd);
fread(&tga_header.height, sizeof( short int), 1, fd);
fread(&tga_header.bitsperpixel, sizeof(unsigned char), 1, fd);
fread(&tga_header.imagedescriptor, sizeof(unsigned char), 1, fd);
// Calculate sizes
tga_pixel_size = tga_header.bitsperpixel / 8;
tga_data_size = tga_header.width * tga_header.height * tga_pixel_size;
// Read image data
tga_data = malloc(tga_data_size);
fread(tga_data, 1, tga_data_size, fd);
// Read TGA footer.
fseek(fd, -26, SEEK_END);
fread(&tga_footer.extensionoffset, sizeof( int), 1, fd);
fread(&tga_footer.developeroffset, sizeof( int), 1, fd);
fread(&tga_footer.signature, sizeof( char), 16, fd);
fread(&tga_footer.p, sizeof(unsigned char), 1, fd);
fread(&tga_footer.n, sizeof(unsigned char), 1, fd);
fclose(fd);
buffer = malloc(tga_header.width * tga_header.height * 4);
#pragma mark RGBA4444 to RGBA8888
for (i = 0, j=0; i < tga_data_size; i+= tga_pixel_size) {
buffer[j++] = (tga_data[i+1] & 0x0f) << 4; // Red
buffer[j++] = tga_data[i ] & 0xf0; // Green
buffer[j++] = (tga_data[i ] & 0x0f) << 4; // Blue
buffer[j++] = tga_data[i+1] & 0xf0; // Alpha
}
free(tga_data);
/***************\
* IMAGEMAGICK *
\***************/
#pragma mark IMAGEMAGICK
MagickWandGenesis();
PixelWand * background;
background = NewPixelWand();
PixelSetColor(background, "none");
MagickWand * wand;
wand = NewMagickWand();
MagickNewImage(wand,
tga_header.width,
tga_header.height,
background);
background = DestroyPixelWand(background);
MagickImportImagePixels(wand,
0,
0,
tga_header.width,
tga_header.height,
"RGBA",
CharPixel,
buffer);
free(buffer);
MagickWriteImage(wand, argv[2]);
wand = DestroyMagickWand(wand);
return 0;
}
Which can be compiled with clang $(MagickWand-config --cflags --libs) -o tga2img tga2img.c, and can be executed simply by ./tga2img N_birthday_0000.tga N_birthday_0000.tga.png.
Original answer.
The only way I can think of converting the images is to author a quick program/script to do the bitwise color-pixel logic.
This answer offers a quick way to read the image data; so combining with MagickWand, can be converted easily. (Although I know there'll be better solutions found on old game-dev forums...)
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <wand/MagickWand.h>
/* The subset of a TGA file that LoadTGAFile fills in. */
typedef struct
{
    unsigned char imageTypeCode;   /* image type byte from the header */
    short imageWidth;              /* width field from the header */
    short imageHeight;             /* height field from the header */
    unsigned char bitCount;        /* bits per pixel from the header */
    unsigned char *imageData;      /* heap buffer holding the pixel bytes */
} TGAFILE;
bool LoadTGAFile(const char *filename, TGAFILE *tgaFile);
int main(int argc, const char * argv[]) {
const char
* input,
* output;
if (argc != 3) {
printf("Usage:\n\t%s <input> <output>\n", argv[0]);
}
input = argv[1];
output = argv[2];
MagickWandGenesis();
TGAFILE header;
if (LoadTGAFile(input, &header) == true) {
// Build a blank canvas image matching TGA file.
MagickWand * wand;
wand = NewMagickWand();
PixelWand * background;
background = NewPixelWand();
PixelSetColor(background, "NONE");
MagickNewImage(wand, header.imageWidth, header.imageHeight, background);
background = DestroyPixelWand(background);
// Allocate RGBA8888 buffer
unsigned char * buffer = malloc(header.imageWidth * header.imageHeight * 4);
// Iterate over TGA image data, and convert RGBA4444 to RGBA8888;
size_t pixel_size = header.bitCount / 8;
size_t total_bytes = header.imageWidth * header.imageHeight * pixel_size;
for (int i = 0, j = 0; i < total_bytes; i+=pixel_size) {
// Red
buffer[j++] = (header.imageData[i ] & 0x0f) << 4;
// Green
buffer[j++] = (header.imageData[i ] & 0xf0);
// Blue
buffer[j++] = (header.imageData[i+1] & 0xf0) << 4;
// Alpha
buffer[j++] = (header.imageData[i+1] & 0xf0);
}
// Import image data over blank canvas
MagickImportImagePixels(wand, 0, 0, header.imageWidth, header.imageHeight, "RGBA", CharPixel, buffer);
// Write image
MagickWriteImage(wand, output);
wand = DestroyMagickWand(wand);
} else {
fprintf(stderr, "Could not read TGA file %s\n", input);
}
MagickWandTerminus();
return 0;
}
/*
 * Adapted from https://stackoverflow.com/a/7050007/438117
 * Show your love by +1 to Wroclai answer.
 *
 * Loads an uncompressed TGA file (image type 2 or 3) into *tgaFile.
 *
 * filename  path of the file to read.
 * tgaFile   receives the type code, width, height, bit count and a
 *           malloc'ed pixel buffer in imageData; the caller frees it.
 *
 * Returns true on success. Returns false, with imageData set to NULL, when
 * the file cannot be opened, is truncated, has an unsupported type or a
 * non-positive size, has fewer than 8 bits per pixel, or memory runs out.
 *
 * For 3- and 4-byte pixels the first and third byte of every pixel are
 * swapped (BGR to RGB). Narrower pixels are returned exactly as stored: the
 * swap would reach into the neighbouring pixel and past the buffer end.
 */
bool LoadTGAFile(const char *filename, TGAFILE *tgaFile)
{
    unsigned char header[18];
    FILE *filePtr;
    size_t colorMode;
    size_t imageSize;
    size_t imageIdx;
    unsigned char colorSwap;

    tgaFile->imageData = NULL;

    // Open the TGA file.
    filePtr = fopen(filename, "rb");
    if (filePtr == NULL)
    {
        return false;
    }
    // Read the whole 18-byte header; fields are decoded as little-endian.
    if (fread(header, 1, sizeof header, filePtr) != sizeof header)
    {
        fclose(filePtr);
        return false;
    }
    // Which type of image gets stored in imageTypeCode.
    tgaFile->imageTypeCode = header[2];
    // For our purposes, the type code should be 2 (uncompressed RGB image)
    // or 3 (uncompressed black-and-white images).
    if (tgaFile->imageTypeCode != 2 && tgaFile->imageTypeCode != 3)
    {
        fclose(filePtr);
        return false;
    }
    // Width, height and bit depth.
    tgaFile->imageWidth = (short int)(header[12] | (header[13] << 8));
    tgaFile->imageHeight = (short int)(header[14] | (header[15] << 8));
    tgaFile->bitCount = header[16];
    // Bytes per pixel -> 3 = BGR, 4 = BGRA.
    colorMode = tgaFile->bitCount / 8;
    if (tgaFile->imageWidth <= 0 || tgaFile->imageHeight <= 0 || colorMode == 0)
    {
        fclose(filePtr);
        return false;
    }
    // Header byte 0 is the length of the image ID field before the pixels.
    if (header[0] != 0 && fseek(filePtr, (long)header[0], SEEK_CUR) != 0)
    {
        fclose(filePtr);
        return false;
    }
    imageSize = (size_t)tgaFile->imageWidth * (size_t)tgaFile->imageHeight * colorMode;
    // Allocate memory for the image data.
    tgaFile->imageData = malloc(imageSize);
    if (tgaFile->imageData == NULL)
    {
        fclose(filePtr);
        return false;
    }
    // Read the image data.
    if (fread(tgaFile->imageData, 1, imageSize, filePtr) != imageSize)
    {
        free(tgaFile->imageData);
        tgaFile->imageData = NULL;
        fclose(filePtr);
        return false;
    }
    fclose(filePtr);
    // Change from BGR to RGB; only meaningful when a pixel has a third byte.
    if (colorMode >= 3)
    {
        for (imageIdx = 0; imageIdx < imageSize; imageIdx += colorMode)
        {
            colorSwap = tgaFile->imageData[imageIdx];
            tgaFile->imageData[imageIdx] = tgaFile->imageData[imageIdx + 2];
            tgaFile->imageData[imageIdx + 2] = colorSwap;
        }
    }
    return true;
}
The order of the color channels may need to be switched around.
I have been thinking about this some more and it ought to be possible to reconstruct the image without any special software. I can't quite see my mistake for the moment, but maybe @emcconville can cast an expert eye over it and point it out! Pretty please?
So, my concept is that ImageMagick has read in the image size and pixel data correctly but has just allocated the bits according to the standard RGB5551 interpretation of a TARGA file rather than RGBA4444. So, we rebuild the 16-bits of data it read and split them differently.
The first line below does the rebuild into the original 16-bit data, then each subsequent line splits out one of the RGBA channels and then we recombine them:
convert illu_evolution_01.tga -depth 16 -channel R -fx "(((r*255)<<10) | ((g*255)<<5) | (b*255) | ((a*255)<<15))/255" \
\( -clone 0 -channel R -fx "((((r*255)>>12)&15)<<4)/255" \) \
\( -clone 0 -channel R -fx "((((r*255)>>8 )&15)<<4)/255" \) \
\( -clone 0 -channel R -fx "((((r*255) )&15)<<4)/255" \) \
-delete 0 -set colorspace RGB -combine -colorspace sRGB result.png
# The rest is just debug so you can see the reconstructed channels in [rgba].png
convert result.png -channel R -separate r.png
convert result.png -channel G -separate g.png
convert result.png -channel B -separate b.png
convert result.png -channel A -separate a.png
So, the following diagram represents the 16-bits of 1 pixel:
A R R R R R G G G G G B B B B B <--- what IM saw
R R R R G G G G B B B B A A A A <--- what it really meant
Yes, I have disregarded the alpha channel for the moment.
Related
I have tried to implement alpha image blending algorithm in CUDA C. There is no error in my code. It compiled fine. As per the thread logic, If I run the code with the increased number of threads the runtime should be decreased. In my code, I got a weird pattern of run time. When I run the code with 1 thread the runtime was 8.060539 e-01 sec, when I run the code with 4 thread I got the runtime 7.579031 e-01 sec, When It ran for 8 threads the runtime was 7.810102e-01, and for 256 thread the runtime is 7.875319e-01.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
// Blend kernel: Pout[i] = (1 - alpha) * pin1[i] + alpha * pin2[i] over
// width * height * channels bytes.
// col and row are computed from the thread/block indices but are used only
// for the bounds test below; they never select which bytes are written.
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
int col = threadIdx.x + blockIdx.x*blockDim.x;
int row = threadIdx.y + blockIdx.y*blockDim.y;
if(col<width && row<height){
size_t img_size = width * height * channels;
if (Pout != NULL)
{
// The loop starts at 0 and covers all img_size bytes, so every thread that
// passes the bounds test processes the whole image by itself.
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
}
}
}
// Host driver: reads a count from argv[1] and alpha from stdin, loads two
// JPEGs, launches image_blend, times the launch and writes new_image.jpg.
int main(int argc, char* argv[]){
int thread_count;
double start, finish;
float alpha;
int width, height, channels;
unsigned char *new_img;
// argv[1] is parsed without checking argc.
thread_count = strtol(argv[1], NULL, 10);
printf("Enter the value for alpha:");
scanf("%f", &alpha);
// Both loads store into the same width/height/channels variables, so the
// values from orange.jpg are the ones used afterwards.
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
unsigned char *orange = stbi_load("orange.jpg", &width, &height, &channels, 0);
size_t img_size = width * height * channels;
//unsigned char *new_img = malloc(img_size);
cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
// These two calls store fresh managed pointers into apple and orange,
// replacing the pointers returned by stbi_load; nothing in this function
// copies the loaded pixels into the managed buffers.
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
cudaMallocManaged(&orange, img_size*sizeof(unsigned char));
GET_TIME(start);
// Launch configuration is <<<blocks, threads per block, dynamic shared
// memory bytes>>>: 1 block of 16 threads, with thread_count passed as the
// shared-memory size rather than as a thread count.
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
cudaDeviceSynchronize();
GET_TIME(finish);
stbi_write_jpg("new_image.jpg", width, height, channels, new_img, 100);
cudaFree(new_img);
cudaFree(apple);
cudaFree(orange);
printf("\n Elapsed time for cuda = %e seconds\n", finish-start);
}
After getting a weird pattern in the runtime I am bit skeptical about the implementation of the code. Can anyone let me know why I get those runtime even if my code has no bug.
Let's start here:
image_blend<<<1,16,thread_count>>>(new_img,apple, orange, width, height, channels,alpha);
It seems evident you don't understand the kernel launch syntax:
<<<1,16,thread_count>>>
The first number (1) is the number of blocks to launch.
The second number (16) is the number of threads per block.
The third number (thread_count) is the size of the dynamically allocated shared memory in bytes.
So our first observation will be that although you claimed to have changed the thread count, you didn't. You were changing the number of bytes of dynamically allocated shared memory. Since your kernel code doesn't use shared memory, this is a completely meaningless variable.
Let's also observe your kernel code:
for (size_t i = 0; i < img_size; i++)
{
Pout[i] = ((1.0 - alpha) * pin1[i] + alpha * pin2[i]);
}
For every thread that passes your if test, each one of those threads will execute the entire for-loop and will process the entire image. That is not the general idea with writing CUDA kernels. The general idea is to break up the work so that each thread does a portion of the work, not the whole activity.
These are very basic observations. If you take advantage of an orderly introduction to CUDA, such as here, you can get beyond some of these basic concepts.
We could also point out that your kernel nominally expects a 2D launch, and you are not providing one, and perhaps many other observations. Another important concept that you are missing is that you cannot do this:
unsigned char *apple = stbi_load("apple.jpg", &width, &height, &channels, 0);
...
cudaMallocManaged(&apple,img_size* sizeof(unsigned char));
and expect anything sensible to come from that. If you want to see how data is moved from a host allocation to the device, study nearly any CUDA sample code, such as vectorAdd. Using a managed allocation doesn't allow you to overwrite the pointer like you are doing and get anything useful from that.
I'll provide an example of how one might go about doing what I think you are suggesting, without providing a complete tutorial on CUDA. To provide an example, I'm going to skip the STB image loading routines. To understand the work you are trying to do here, the actual image content does not matter.
Here's an example of an image processing kernel (1D) that will:
Process the entire image, only once
Use less time, roughly speaking, as you increase the thread count.
You haven't provided your timer routine/code, so I'll provide my own:
$ cat t2130.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Microsecond wall-clock timer built on gettimeofday().
// Returns the current time in microseconds minus `start`; call it with 0
// (the default) to take a timestamp and with that timestamp to get the
// elapsed microseconds.
unsigned long long dtime_usec(unsigned long long start=0){
  timeval now;
  gettimeofday(&now, 0);
  unsigned long long now_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
  return now_usec-start;
}
// Stand-in for an image loader: allocates w*h*c bytes with new[] and sets
// every byte to `init`. The caller owns the returned buffer.
unsigned char *i_load(int w, int h, int c, int init){
  unsigned char *pixels = new unsigned char[w*h*c];
  int idx = 0;
  while (idx < w*h*c) {
    pixels[idx] = init;
    idx++;
  }
  return pixels;
}
// Blend kernel: Pout[i] = (1 - alpha) * pin1[i] + alpha * pin2[i] for every
// byte of a width x height x channels image.
// Each thread starts at its global 1D index and advances by the total number
// of launched threads (grid-stride loop), so the image is covered exactly
// once for any 1D launch configuration. Does nothing when Pout is NULL.
__global__ void image_blend(unsigned char *Pout, unsigned char *pin1, unsigned char *pin2, int width, int height, int channels, float alpha){
  if (Pout == NULL) return;

  const size_t total = width * height * channels;
  const size_t first = blockIdx.x*blockDim.x+threadIdx.x;
  const size_t stride = gridDim.x*blockDim.x;
  for (size_t idx = first; idx < total; idx += stride)
    Pout[idx] = ((1.0 - alpha) * pin1[idx] + alpha * pin2[idx]);
}
// Benchmark driver for image_blend.
// Usage: t2130 [thread_count]   (default: one thread per image byte)
// Reads alpha from stdin, fills two synthetic 1920x1080x3 images, copies
// them into managed memory, launches the kernel with roughly thread_count
// threads and prints the elapsed time.
// Returns 0 on success, 1 on bad input or a CUDA allocation failure.
int main(int argc, char* argv[]){
  int threads_per_block = 64;
  unsigned long long dt;
  float alpha;
  int width = 1920;
  int height = 1080;
  int channels = 3;
  size_t img_size = width * height * channels;
  int thread_count = img_size;
  if (argc > 1) thread_count = atoi(argv[1]);
  // A launch needs at least one thread; atoi also yields 0 for non-numeric text.
  if (thread_count < 1) {
    fprintf(stderr, "thread_count must be a positive integer\n");
    return 1;
  }
  unsigned char *new_img = NULL, *m_apple = NULL, *m_orange = NULL;
  printf("Enter the value for alpha:");
  // Without this check a failed read leaves alpha uninitialized.
  if (scanf("%f", &alpha) != 1) {
    fprintf(stderr, "\nCould not read a value for alpha\n");
    return 1;
  }
  unsigned char *apple = i_load(width, height, channels, 10);
  unsigned char *orange = i_load(width, height, channels, 70);
  //unsigned char *new_img = malloc(img_size);
  cudaError_t a0 = cudaMallocManaged(&new_img,img_size*sizeof(unsigned char));
  cudaError_t a1 = cudaMallocManaged(&m_apple,img_size* sizeof(unsigned char));
  cudaError_t a2 = cudaMallocManaged(&m_orange, img_size*sizeof(unsigned char));
  if (a0 != cudaSuccess || a1 != cudaSuccess || a2 != cudaSuccess) {
    printf("CUDA Error: managed allocation failed\n");
    cudaFree(new_img);
    cudaFree(m_apple);
    cudaFree(m_orange);
    delete[] apple;
    delete[] orange;
    return 1;
  }
  // Managed memory is host-accessible, so a plain memcpy moves the pixels in.
  memcpy(m_apple, apple, img_size);
  memcpy(m_orange, orange, img_size);
  // The host copies are no longer needed once the managed buffers are filled.
  delete[] apple;
  delete[] orange;
  int blocks;
  if (thread_count < threads_per_block) {threads_per_block = thread_count; blocks = 1;}
  else {blocks = thread_count/threads_per_block;}
  printf("running with %d blocks of %d threads\n", blocks, threads_per_block);
  dt = dtime_usec(0);
  image_blend<<<blocks, threads_per_block>>>(new_img,m_apple, m_orange, width, height, channels,alpha);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("CUDA Error: %s\n", cudaGetErrorString(err));
  else printf("\n Elapsed time for cuda = %e seconds\n", dt/(float)USECPSEC);
  cudaFree(new_img);
  cudaFree(m_apple);
  cudaFree(m_orange);
  return 0;
}
$ nvcc -o t2130 t2130.cu
$ ./t2130 1
Enter the value for alpha:0.2
running with 1 blocks of 1 threads
Elapsed time for cuda = 5.737880e-01 seconds
$ ./t2130 2
Enter the value for alpha:0.2
running with 1 blocks of 2 threads
Elapsed time for cuda = 3.230150e-01 seconds
$ ./t2130 32
Enter the value for alpha:0.2
running with 1 blocks of 32 threads
Elapsed time for cuda = 4.865200e-02 seconds
$ ./t2130 64
Enter the value for alpha:0.2
running with 1 blocks of 64 threads
Elapsed time for cuda = 2.623300e-02 seconds
$ ./t2130 128
Enter the value for alpha:0.2
running with 2 blocks of 64 threads
Elapsed time for cuda = 1.546000e-02 seconds
$ ./t2130
Enter the value for alpha:0.2
running with 97200 blocks of 64 threads
Elapsed time for cuda = 5.809000e-03 seconds
$
(CentOS 7, CUDA 11.4, V100)
The key methodology that allows the kernel to do all the work (only once) while making use of an "arbitrary" number of threads efficiently is the grid-stride loop.
I am new to CUDA and its image and signal processing library, NPP. I am trying to convert YUV420 to BGR using this function:
NppStatus nppiYUV420ToBGR_8u_P3C3R(const Npp8u * const pSrc[3], int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
but I can't decide the rSrcStep , I know it's the row size of each component ,Y U V, but not sure I really understand it , the original image size is 1920x1080 (wxh) ,I use a opencv Mat to contain the YUV image
cv::Mat(cv::Size(1920,1080*3/2), CV_8UC1, (void*)data)
,then for the parameter rSrcStep of the 1st function ,I try rSrcStep={1920,1920/2,1920/2} ,but it returns NPP_STEP_ERROR
ps: for the nDestStep , I use the below function to allocate dest buff ,and get the step at same time
Npp8u *
nppiMalloc_8u_C3(int nWidthPixels, int nHeightPixels, int * pStepBytes);
The Mat height is 1080*3/2 because a YUV420 image occupies w*h*3/2 bytes when the original RGB image is w*h pixels.
Set rSrcStep and nDstStep to the following values:
int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
int nDstStep = COLS * 3;
Where COLS = 1920.
In YUV420 planar format (I420), the resolution of Y channel is full resolution, and the resolution of U and V channels is half resolution in each axis.
Example:
Y:
U:
V:
Assuming the data is continuous in memory, the step (row stride in bytes) of Y equals to image width, and the step of U and V equals width/2.
Testing:
The testing code uses FFmpeg for building the input image in raw I420 format, and uses FFmpeg for converting the raw BGR output to PNG image.
#include <stdint.h>
#include <stdio.h>
#include "nppi.h"
#define COLS 192
#define ROWS 108
uint8_t Y[COLS * ROWS]; //Y color channel in host memory
uint8_t U[COLS * ROWS / 4]; //U color channel in host memory
uint8_t V[COLS * ROWS / 4]; //V color channel in host memory
uint8_t BGR[COLS * ROWS * 3]; //BGR output image in host memory
// Converts one I420 (planar YUV420) frame of COLS x ROWS pixels to packed
// BGR with nppiYUV420ToBGR_8u_P3C3R.
// Reads in.yuv420p into the Y, U, V host arrays, copies the planes to the
// device, runs the conversion and writes the BGR result to out.bgr.
// Returns 0 on success, 1 when a file cannot be read/written or NPP fails.
int main()
{
    int status = 0;

    //Read Y, U, V planes to host memory buffers.
    //Build input sample using FFmpeg first:
    //ffmpeg -y -f lavfi -i testsrc=size=192x108:rate=1:duration=1 -pix_fmt yuvj420p -f rawvideo in.yuv420p
    ////////////////////////////////////////////////////////////////////////////
    FILE* f = fopen("in.yuv420p", "rb");
    if (f == NULL)
    {
        printf("Error: unable to open in.yuv420p\n");
        return 1;
    }
    if (fread(Y, 1, COLS * ROWS, f) != COLS * ROWS ||
        fread(U, 1, COLS * ROWS / 4, f) != COLS * ROWS / 4 ||
        fread(V, 1, COLS * ROWS / 4, f) != COLS * ROWS / 4)
    {
        printf("Error: in.yuv420p is shorter than one %dx%d I420 frame\n", COLS, ROWS);
        fclose(f);
        return 1;
    }
    fclose(f);
    ////////////////////////////////////////////////////////////////////////////

    //Allocate device memory, and copy Y,U,V from host to device.
    ////////////////////////////////////////////////////////////////////////////
    Npp8u* gpuY, * gpuU, * gpuV, * gpuBGR;
    cudaMalloc(&gpuY, COLS * ROWS);
    cudaMalloc(&gpuU, COLS * ROWS / 4);
    cudaMalloc(&gpuV, COLS * ROWS / 4);
    cudaMalloc(&gpuBGR, COLS * ROWS * 3);
    cudaMemcpy(gpuY, Y, COLS * ROWS, cudaMemcpyHostToDevice);
    cudaMemcpy(gpuU, U, COLS * ROWS / 4, cudaMemcpyHostToDevice);
    cudaMemcpy(gpuV, V, COLS * ROWS / 4, cudaMemcpyHostToDevice);
    ////////////////////////////////////////////////////////////////////////////

    //Execute nppiYUV420ToBGR_8u_P3C3R
    ////////////////////////////////////////////////////////////////////////////
    const Npp8u* const pSrc[3] = { gpuY, gpuU, gpuV };
    //Row strides in bytes: full width for Y, half width for U and V.
    int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
    //Packed BGR output: 3 bytes per pixel, rows stored back to back.
    int nDstStep = COLS * 3;
    NppiSize oSizeROI = { COLS, ROWS };
    NppStatus sts = nppiYUV420ToBGR_8u_P3C3R(pSrc,      //const Npp8u* const pSrc[3],
                                             rSrcStep,  //int rSrcStep[3],
                                             gpuBGR,    //Npp8u *pDst,
                                             nDstStep,  //int nDstStep,
                                             oSizeROI); //NppiSize oSizeROI);
    if (sts != NPP_SUCCESS)
    {
        printf("Error: nppiYUV420ToBGR_8u_P3C3R status = %d\n", (int)sts);
        status = 1;
    }
    ////////////////////////////////////////////////////////////////////////////

    // Copy BGR output from device to host, and save BGR output to binary file
    // After saving, use FFmpeg to convert the output image from binary to PNG:
    // ffmpeg -y -f rawvideo -video_size 192x108 -pixel_format bgr24 -i out.bgr out.png
    ////////////////////////////////////////////////////////////////////////////
    if (status == 0)
    {
        cudaMemcpy(BGR, gpuBGR, COLS * ROWS * 3, cudaMemcpyDeviceToHost);
        f = fopen("out.bgr", "wb");
        if (f == NULL)
        {
            printf("Error: unable to create out.bgr\n");
            status = 1;
        }
        else
        {
            if (fwrite(BGR, 1, COLS * ROWS * 3, f) != COLS * ROWS * 3)
            {
                printf("Error: short write to out.bgr\n");
                status = 1;
            }
            fclose(f);
        }
    }
    ////////////////////////////////////////////////////////////////////////////

    // cudaFree takes the device pointer itself, not the address of the host
    // variable that holds it.
    cudaFree(gpuY);
    cudaFree(gpuU);
    cudaFree(gpuV);
    cudaFree(gpuBGR);
    return status;
}
Output (out.png):
I have a raw binary image file where every pixel consists of 12 bit data (gray-scale). For example, the first four pixels in the raw file:
0x0 0xC0
0x1 0x05
0x2 0x5C
0x3 0xC0
0x4 0x05
0x5 0x5C
This corresponds to 4 pixel values with the value 0x5C0 (little endian).
Unfortunately, using the following command:
convert -size 384x184 -depth 12 gray:frame_0.raw out.tiff
interprets the pixel values incorrectly (big endian), resulting in the pixel values 0xC00 0x55C 0xC00 0x55C.
I tried the options -endian LSB and -endian MSB, but unfortunately they only change the output byte order, not the input byte order.
How do I get convert to open the raw image as 12-bit little endian data?
I had a quick try at this, but I have no test data but it should be fairly close and easy to detect errors with your images:
// pad12to16.c
// Mark Setchell
// Pad 12-bit data to 16-bit
//
// Compile with:
// gcc pad12to16.c -o pad12to16
//
// Run with:
// ./pad12to16 < 12-bit.dat > 16-bit.dat
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>
#include <sys/types.h>
#define BYTESPERREAD 6
#define PIXPERWRITE 4
// Filter: reads packed little-endian 12-bit pixels from stdin and writes
// each one as a 16-bit value (host byte order) to stdout.
// Three input bytes b0 b1 b2 hold two pixels:
//   first  = b0 | (low nibble of b1) << 8
//   second = (high nibble of b1) | b2 << 4
// Input is consumed 6 bytes (4 pixels) at a time; a trailing partial group
// is dropped. Returns 0 on success, 1 if stdout cannot be written.
int main(){
    unsigned char buf[BYTESPERREAD];
    unsigned short pixel[PIXPERWRITE];
    // Buffered stdio keeps collecting until a full 6-byte group is available,
    // whereas a raw read() on a pipe may return fewer bytes mid-stream.
    while(fread(buf,1,BYTESPERREAD,stdin)==BYTESPERREAD){
        pixel[0] = buf[0] | ((buf[1] & 0xf) << 8);
        pixel[1] = (buf[2] << 4) | ((buf[1] & 0xf0) >> 4);
        // The second pair lives in buf[3..5]; its shared byte is buf[4].
        pixel[2] = buf[3] | ((buf[4] & 0xf) << 8);
        pixel[3] = (buf[5] << 4) | ((buf[4] & 0xf0) >> 4);
        if(fwrite(pixel,sizeof pixel[0],PIXPERWRITE,stdout)!=PIXPERWRITE){
            perror("pad12to16: write");
            return 1;
        }
    }
    return 0;
}
So you would run this (I think):
./pad12to16 < 12-bit.dat | convert -size 384x184 -depth 16 gray:- result.tif
Mark's answer is correct, as you'll need to involve some external tool to sort out the data stream. Usually there's some sort of padding when working with 12-bit depth. In the example blob provided, we see that each pair of pixels shares a common byte. The task of splitting the shared byte, and shifting what-to-where, is fairly easy. This answer complements Mark's answer, and argues that ImageMagick's C-API might as well be used.
// my12bit_convert.c
#include <stdio.h>
#include <stdlib.h>
#include <magick/MagickCore.h>
#include <wand/MagickWand.h>
static ExceptionType serverty;
/* High nibble (bits 7..4) of x, as a value 0..15. */
#define LEADING_HALF(x) (((x) >> 4) & 0xF)
/* Low nibble (bits 3..0) of x, as a value 0..15. */
#define FOLLOWING_HALF(x) ((x) & 0xF)
/* Scale a 12-bit value to a double: 0 maps to 0.0 and 0xFFF to 1.0.
 * Expands to a plain expression; the caller supplies the semicolon. */
#define TO_DOUBLE(x) ((double)(x) / (double)0xFFF)
/* Print the wand's exception text to stderr when status x is MagickFalse.
 * Wrapped in do/while(0) so it behaves as one statement after if/else. */
#define IS_OK(x,y) do { if ((x) == MagickFalse) { fprintf(stderr, "%s\n", MagickGetException((y), &serverty)); } } while (0)
int main(int argc, const char * argv[]) {
// Prototype vars
int
i,
tmp_pixels[2];
double * pixel_buffer;
size_t
w = 0,
h =0,
total = 0,
iterator = 0;
ssize_t
x = 0,
y = 0;
const char
* path = NULL,
* output = NULL;
unsigned char read_pixel_chunk[3];
FILE * fh;
MagickWand * wand;
PixelWand * pwand;
MagickBooleanType ok;
// Iterate over arguments and collect size, input, & output.
for ( i = 1; i < argc; i++ ) {
if (argv[i][0] == '-') {
if (LocaleCompare("size", &argv[i][1]) == 0) {
i++;
if (i == argc) {
fprintf(stderr, "Missing `WxH' argument for `-size'.");
return EXIT_FAILURE;
}
GetGeometry(argv[i], &x, &y, &w, &h);
}
} else if (path == NULL){
path = argv[i];
} else {
output = argv[i];
}
}
// Validate to some degree
if ( path == NULL ) {
fprintf(stderr, "Missing input path\n");
return EXIT_FAILURE;
}
if ( output == NULL ) {
fprintf(stderr, "Missing output path\n");
return EXIT_FAILURE;
}
total = w * h;
if (total == 0) {
fprintf(stderr, "Unable to determine size of %s. (use `-size WxH')\n", path);
return EXIT_FAILURE;
}
// Allocated memory and start the party!
pixel_buffer = malloc(sizeof(double) * total);
MagickWandGenesis();
// Read input file, and sort 12-bit pixels.
fh = fopen(path, "rb");
if (fh == NULL) {
fprintf(stderr, "Unable to read `%s'\n", path);
return 1;
}
while(!feof(fh)) {
total = fread(read_pixel_chunk, 3, 1, fh);
if (total) {
// 0xC0 0x05
// ^------' ==> 0x05C0
tmp_pixels[0] = FOLLOWING_HALF(read_pixel_chunk[1]) << 8 | read_pixel_chunk[0];
// 0x05 0x5C
// '------^ ==> 0x05C0
tmp_pixels[1] = read_pixel_chunk[2] << 4 | LEADING_HALF(read_pixel_chunk[1]);
// 0x5C0 / 0xFFF ==> 0.359463
pixel_buffer[iterator++] = TO_DOUBLE(tmp_pixels[0]);
pixel_buffer[iterator++] = TO_DOUBLE(tmp_pixels[1]);
}
}
fclose(fh);
// Create image
wand = NewMagickWand();
pwand = NewPixelWand();
ok = PixelSetColor(pwand, "white");
IS_OK(ok, wand);
// Create new Image
ok = MagickNewImage(wand, w, h, pwand);
IS_OK(ok, wand);
// Import pixels as gray, or intensity, values.
ok = MagickImportImagePixels(wand, x, y, w, h, "I", DoublePixel, pixel_buffer);
IS_OK(ok, wand);
// Save ouput
ok = MagickWriteImage(wand, output);
IS_OK(ok, wand);
// Clean house
DestroyPixelWand(pwand);
DestroyMagickWand(wand);
MagickWandTerminus();
if (pixel_buffer) {
free(pixel_buffer);
}
return 0;
}
Which can be compiled with
LLVM_CFLAGS=`MagickWand-config --cflags`
LLVM_LDFLAGS=`MagickWand-config --ldflags`
clang $LLVM_CFLAGS $LLVM_LDFLAGS -o my12bit_convert my12bit_convert.c
And usage
./my12bit_convert -size 384x184 frame_0.raw out.tiff
I'm using CUDA for image processing, but my result is always 'cudaErrorIllegalAddress : an illegal memory access was encountered'.
What i did is below.
First, Load converted image(rgb to gray) to device, i use 'cudaMallocPitch' and 'cudaMemcpy2D'
unsigned char *dev_srcleft;
size_t dev_srcleftPitch
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS * sizeof(int), ROWS));
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
COLS, ROWS, cudaMemcpyHostToDevice);
And, Allocating 2D array for store result. the result value is describe as 27bit, so what i'm trying is using 'int' which is 4bytes=32bits, not only for ample size , atomic operation(atomicOr, atomicXor) is needed for performance.
and my device is not supports 64bit atomic operation.
int *dev_leftTrans;
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMemset2D(dev_leftTrans, dev_leftTransPitch, 0, COLS, ROWS);
Memory allocation and memcpy2D works great, and i check by
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D(temp_output.data, temp_output.step, dev_srcleft, dev_srcleftPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);
Then, Do kernel code.
// Copies an 8-bit pitched image into a pitched int image, one thread per
// pixel: dst(x, y) = src(x, y).
//   src, src_pitch  source base pointer and row pitch in bytes
//   dst, dst_pitch  destination base pointer and row pitch in bytes
//   COLS, ROWS      image width and height in pixels
__global__ void TestKernel(unsigned char *src, size_t src_pitch,
                           int *dst, size_t dst_pitch,
                           unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // The grid is rounded up to whole blocks, so threads in the last
    // column/row of blocks can fall outside the image.
    if (x >= COLS || y >= ROWS) return;
    // Pitches from cudaMallocPitch are byte counts. Advance to row y through
    // a byte pointer, then index the column in units of the element type;
    // using the byte pitch as an int stride runs past the allocation.
    const unsigned char *src_row = src + y * src_pitch;
    int *dst_row = (int *)((char *)dst + y * dst_pitch);
    dst_row[x] = src_row[x];
}
// One thread per pixel; round the grid up so partial blocks cover the right/bottom edges.
dim3 dimblock(3, 3);
dim3 dimGrid((COLS + dimblock.x - 1) / dimblock.x, (ROWS + dimblock.y - 1) / dimblock.y);
// TestKernel declares no extern __shared__ storage, so no dynamic shared memory is requested.
TestKernel<<<dimGrid, dimblock>>>
(dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
Parameter COLS and ROWS is size of image.
I think the error occurs here : TestKerenl.
src_val, reading from global memory works good but when i'm trying to access dst, it blows up with cudaErrorIllegalAddress
I don't know what is wrong, and i sufferd for 4 days. please help me
below is my full code
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <cuda_device_runtime_api.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <iostream>
#include <opencv2\opencv.hpp>
#include<string>
#define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
// Reports a failed CUDA runtime call and terminates the program.
//   err  : status returned by the CUDA call
//   file : source file of the call site (supplied by HANDLE_ERROR via __FILE__)
//   line : source line of the call site (supplied by HANDLE_ERROR via __LINE__)
// Returns normally only when err is cudaSuccess.
static void HandleError(cudaError_t err, const char*file, int line)
{
    if (err == cudaSuccess)
        return;

    const char *message = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", message, file, line);
    exit(EXIT_FAILURE);
}
using namespace std;
using namespace cv;
string imagePath = "Ted";
string imagePathL = imagePath + "imL.png";
string imagePathR = imagePath + "imR.png";
// Copies an 8-bit pitched image into a 32-bit pitched image, one thread per pixel.
//   src, src_pitch : source image and its row pitch in BYTES (from cudaMallocPitch)
//   dst, dst_pitch : destination image and its row pitch in BYTES (from cudaMallocPitch)
//   COLS, ROWS     : image width and height in pixels
__global__ void TestKernel(unsigned char*src, size_t src_pitch,
    int *dst, size_t dst_pitch,
    unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // A thread is out of range if EITHER coordinate is past the edge; valid
    // indices are 0..COLS-1 and 0..ROWS-1, hence >= and ||.
    if (x >= COLS || y >= ROWS) return;
    // Pitches are byte counts: step to the row through a char pointer before
    // indexing with the element type. Indexing an int* with a byte pitch
    // walks four times too far and runs off the allocation.
    const unsigned char *src_row = src + y * src_pitch;
    int *dst_row = (int *)((char *)dst + y * dst_pitch);
    dst_row[x] = src_row[x];
}
// Loads the left/right images, converts them to grayscale, uploads them to
// pitched device memory, runs TestKernel on the left image, times it, and
// displays the result. Returns 0 on success and EXIT_FAILURE if an image
// cannot be loaded or the two images differ in size; any failing CUDA call
// terminates the program through HANDLE_ERROR.
int main(void)
{
    //Print_DeviceProperty();
    //Left Image Load
    Mat host_srcImgL = imread(imagePathL, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgL.empty()) { cout << "Left Image Load Fail!" << endl; return EXIT_FAILURE; }
    Mat host_srcConvertL;
    cvtColor(host_srcImgL, host_srcConvertL, CV_BGR2GRAY);

    //Right Image Load
    Mat host_srcImgR = imread(imagePathR, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgR.empty()) { cout << "Right Image Load Fail!" << endl; return EXIT_FAILURE; }
    Mat host_srcConvertR;
    cvtColor(host_srcImgR, host_srcConvertR, CV_BGR2GRAY);

    // Both images are copied with the same COLS x ROWS extent, so they must match.
    if (host_srcConvertL.size() != host_srcConvertR.size()) {
        cout << "Left/Right image size mismatch!" << endl;
        return EXIT_FAILURE;
    }

    //Create parameters
    const unsigned int COLS = host_srcConvertL.cols;
    const unsigned int ROWS = host_srcConvertL.rows;
    imshow("Left source image", host_srcConvertL);
    imshow("Right source image", host_srcConvertR);

    unsigned char *dev_srcleft = NULL, *dev_srcright = NULL, *dev_disp = NULL;
    int *dev_leftTrans = NULL, *dev_rightTrans = NULL;
    size_t dev_srcleftPitch, dev_srcrightPitch, dev_dispPitch, dev_leftTransPitch, dev_rightTransPitch;

    // Row widths are given in bytes; the returned pitches are in bytes too.
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS, ROWS));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_srcright, &dev_srcrightPitch, COLS, ROWS));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_disp, &dev_dispPitch, COLS, ROWS));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_rightTrans, &dev_rightTransPitch, COLS * sizeof(int), ROWS));

    HANDLE_ERROR(cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
        COLS, ROWS, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy2D(dev_srcright, dev_srcrightPitch, host_srcConvertR.data, host_srcConvertR.step,
        COLS, ROWS, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemset(dev_disp, 255, dev_dispPitch * ROWS));

    dim3 dimblock(3, 3);
    dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));

    cudaEvent_t start, stop;
    float elapsedtime;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // TestKernel uses no dynamic shared memory, so none is requested.
    TestKernel<<<dimGrid, dimblock>>>
        (dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
    /*TestKernel<<<dimGrid, dimblock>>>
        (dev_srcright, dev_srcrightPitch, dev_rightTrans, dev_rightTransPitch, COLS, ROWS);*/

    // cudaGetLastError reports launch-configuration errors; the synchronize
    // call reports errors raised while the kernel was executing.
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedtime, start, stop));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    cout << elapsedtime << "msec" << endl;

    // The result holds one int per pixel: copy it into a 32-bit Mat using a
    // byte width of COLS * sizeof(int), then narrow to 8 bits for display.
    Mat host_leftTrans(ROWS, COLS, CV_32SC1);
    HANDLE_ERROR(cudaMemcpy2D(host_leftTrans.data, host_leftTrans.step, dev_leftTrans, dev_leftTransPitch,
        COLS * sizeof(int), ROWS, cudaMemcpyDeviceToHost));
    Mat temp_output;
    host_leftTrans.convertTo(temp_output, CV_8U);
    imshow("temp", temp_output);
    waitKey(0);

    // Release device memory.
    cudaFree(dev_srcleft);
    cudaFree(dev_srcright);
    cudaFree(dev_disp);
    cudaFree(dev_leftTrans);
    cudaFree(dev_rightTrans);
    return 0;
}
And this is my environment vs2013, cuda v6.5
Device' property's below
Major revision number: 3
Minor revision number: 0
Name: GeForce GTX 760 (192-bit)
Total global memory: 1610612736
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 888500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 6
Kernel execution timeout: Yes
One problem is that your kernel doesn't do any thread-checking.
When you define a grid of blocks like this:
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));
you will often be launching extra blocks. The reason is that if COLS or ROW is not evenly divisible by the block dimensions (3 in this case) then you will get extra blocks to cover the remainder in each case.
These extra blocks will have some threads that are doing useful work, and some that will access out-of-bounds. To protect against this, it's customary to put a thread-check in your kernel to prevent out-of-bounds accesses:
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < COLS) && (y < ROWS)) { // add this
unsigned char src_val = src[x + y * src_pitch];
dst[x + y * dst_pitch] = src_val;
} // add this
This means that only the threads that have a valid (in-bounds) x and y will actually do any accesses.
As an aside, (3,3) may not be a particularly good choice of block dimensions for performance reasons. It's usually a good idea to create block dimensions whose product is a multiple of 32, so (32,4) or (16,16) might be examples of better choices.
Another problem in your code is pitch usage for dst array.
Pitch is always in bytes, so first you need to cast dst pointer to char*, calculate row offset and then cast it back to int*:
int* dst_row = (int*)(((char*)dst) + y * dst_pitch);
dst_row[x] = src_val;
i began to implement some simple image processing using cuda but i have an error in my code
the error happens when i copy pixels from device to host
this is my try
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2\core\core.hpp>
#include <opencv2\highgui\highgui.hpp>
#include <stdio.h>
using namespace cv;
unsigned char *h_pixels;
unsigned char *d_pixels;
int bufferSize;
int width,height;
const int BLOCK_SIZE = 32;
Mat image;
// Loads fileName into the global `image` and copies its pixels into a freshly
// allocated global host buffer `h_pixels` of `bufferSize` bytes, setting
// `width` and `height` to the image dimensions in pixels.
// The buffer is sized for 3 bytes per pixel, packed with no row padding.
// On failure (file missing or undecodable) a message is printed to stderr and
// the globals are left as h_pixels == NULL, bufferSize == width == height == 0.
void get_pixels(const char* fileName)
{
    // Drop any buffer left over from a previous call so repeated calls do not leak.
    delete[] h_pixels;
    h_pixels = NULL;
    bufferSize = 0;
    width = 0;
    height = 0;

    image = imread(fileName);
    if (image.empty()) {
        fprintf(stderr, "get_pixels: unable to read image '%s'\n", fileName);
        return;
    }

    // The single memcpy below is only valid when the rows are stored back to
    // back; clone() produces a contiguous copy if they are not.
    if (!image.isContinuous())
        image = image.clone();

    width = image.size().width;
    height = image.size().height;
    bufferSize = width * height * 3 * sizeof(unsigned char);
    h_pixels = new unsigned char[bufferSize];
    memcpy(h_pixels,image.data,bufferSize);
}
// Inverts every channel of a packed 3-bytes-per-pixel image in place.
//   pixels : device buffer of width * height * 3 bytes, rows stored back to back
//   width  : image width in pixels
//   height : image height in pixels
// One thread handles one pixel; threads outside the image do nothing.
__global__ void invert_image(unsigned char* pixels,int width,int height)
{
    // Use the launch's actual block dimensions rather than a compile-time constant.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    // The grid is rounded up to whole blocks, so edge blocks contain threads past the image.
    if (row >= height || col >= width)
        return;
    const int cidx = (row * width + col) * 3;
    pixels[cidx] = 255 - pixels[cidx];
    pixels[cidx + 1] = 255 - pixels[cidx + 1];
    pixels[cidx + 2] = 255 - pixels[cidx + 2];
}

// Prints a diagnostic to stderr when a CUDA call failed; returns true on success.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Loads an image, inverts it on the GPU and displays the result.
// Returns 0 on success, 1 if the image could not be loaded or a CUDA call failed.
int main()
{
    get_pixels("D:\\photos\\z.jpg");
    if (image.empty() || bufferSize <= 0) {
        fprintf(stderr, "failed to load image\n");
        delete[] h_pixels;
        h_pixels = NULL;
        return 1;
    }

    int status = 1;
    if (cuda_ok(cudaMalloc((void**)&d_pixels,bufferSize), "cudaMalloc") &&
        cuda_ok(cudaMemcpy(d_pixels,h_pixels,bufferSize,cudaMemcpyHostToDevice), "cudaMemcpy (host to device)"))
    {
        dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
        // Round up so that image sizes that are not multiples of BLOCK_SIZE are fully covered.
        dim3 dimGrid((width + dimBlock.x - 1)/dimBlock.x,(height + dimBlock.y - 1)/dimBlock.y);
        // Execution configuration order is <<<grid, block>>>.
        invert_image<<<dimGrid,dimBlock>>>(d_pixels,width,height);

        // Receive the result in a Mat that owns its storage instead of
        // overwriting image.data with a buffer the Mat does not manage.
        Mat result(height, width, CV_8UC3);
        if (cuda_ok(cudaGetLastError(), "kernel launch") &&
            cuda_ok(cudaDeviceSynchronize(), "kernel execution") &&
            cuda_ok(cudaMemcpy(result.data,d_pixels,bufferSize,cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)"))
        {
            namedWindow("display image");
            imshow("display image",result);
            waitKey();
            status = 0;
        }
    }

    // cudaFree(NULL) and delete[] NULL are no-ops, so cleanup is safe on every path.
    cudaFree(d_pixels);
    d_pixels = NULL;
    delete[] h_pixels;
    h_pixels = NULL;
    return status;
}
also how can i find out error that occurs in cuda device
thanks for your help
OpenCV images are not continuous. Each row is 4 byte or 8 byte aligned. You should also pass the step field of the Mat to the CUDA kernel, so that you can calculate the cidx correctly. The generic formula to calculate the output index is:
cidx = row * (step/elementSize) + (NumberOfChannels * col);
in your case, it will be:
cidx = row * step + (3 * col);
Referring to the alignment of images, you buffer size is equal to image.step * image.size().height.
Next thing is the one pointed out by #phoad in the third point. You should create enough number of thread blocks to cover the whole image.
Here is a generic formula for Grid which will create enough number of blocks for any image size.
dim3 block(BLOCK_SIZE,BLOCK_SIZE);
dim3 grid((width + block.x - 1)/block.x,(height + block.y - 1)/block.y);
First of all be sure that the image file is read correctly.
Check if the device memory is allocated with CUDA_SAFE_CALL(cudaMalloc(..))
Check the dimensions of the image. If the dimension of the image is not multiples of BLOCKSIZE than you might be missing some indices and the image is not fully inverted.
Call cudaDeviceSynchronize after the kernel call and check its return value.
Do you get any error when you run the code without calling the kernel anyway?
You are not freeing the h_pixels and might have a memory leak.
Instead of using BLOCKSIZE in the kernel you might use "blockDim.x". So calculating indices like "blockIdx.x * blockDim.x + threadIdx.x"
Try to do not touch the memory area in the kernel code, namely comment out the memory updates at the kernel (the lines where you access the pixels array) and check if the program continues to fail. If it does not continue to fail you might be accessing out of the bounds.
Use this command immediately after the kernel invocation to print the kernel errors:
// cudaGetLastError() returns (and clears) the most recent runtime error, including launch failures.
printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));