HDFql Writing Very Slow - hdf5

I have some code, which iteratively receives data which it dumps to a HDF5 file. Here is a toy example of what I am trying to achieve:
#include <HDFql.hpp>
void createHDF(const std::string & filepath)
{
char script_[1024];
sprintf(script_, "CREATE TRUNCATE FILE %s", filepath.c_str());
HDFql::execute(script_);
sprintf(script_, "USE FILE %s", filepath.c_str());
HDFql::execute(script_);
sprintf(script_, "CREATE GROUP events");
HDFql::execute(script_);
HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");
sprintf(script_, "CREATE GROUP frames");
HDFql::execute(script_);
sprintf(script_, "CREATE GROUP optic_flow");
HDFql::execute(script_);
}
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v,
std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
//Input arrays are all the same size
const int data_size = ts_v.size();
//Open file
sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
HDFql::execute(script_);
//Add events
sprintf(script_, "ALTER DIMENSION events/xs TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ys TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ts TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ps TO +%d", data_size);
HDFql::execute(script_);
HDFql::variableRegister(&xs_v[0]);
sprintf(script_, "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&xs_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&xs_v[0]);
HDFql::variableRegister(&ys_v[0]);
sprintf(script_, "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ys_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ys_v[0]);
HDFql::variableRegister(&ts_v[0]);
sprintf(script_, "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ts_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ts_v[0]);
HDFql::variableRegister(&ps_v[0]);
sprintf(script_, "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ps_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ps_v[0]);
total_events_added_ += data_size;
events_idx_++;
}
int main (int argc, const char * argv[]) {
std::string path = "/tmp/test.h5";
createHDF(path);
const int data_size = 1000;
const int iterations = 10000;
std::vector<double> ts(data_size);
std::vector<int16_t> xs(data_size);
std::vector<int16_t> ys(data_size);
std::vector<int8_t> ps(data_size);
for(int i=0; i<data_size; i++)
{
ts_v.push_back(i);
xs_v.push_back(i);
ys_v.push_back(i);
ps_v.push_back(1);
}
for(int i=0; i<iterations; i++)
{
writeData(ts, xs, ys, ps);
}
}
This code runs extremely slowly. Using other binary libraries such as cnpy, this executes in the blink of an eye, so it is not the amount of data being written that is the issue. I was wondering if that is just how things are in HDFql, or whether there is some blunder in the code somewhere.
Many thanks!

Are you with cnpy executing the same operations that you are doing in HDFql (e.g. extending the dimensions of datasets events/xs, events/ys, events/ts and events/ps, using a chunk size equal to 1)?
Looking at your code, you may want to explicitly specify the chunk size of the datasets equal to ts_v.size() as this will most probably increase performance greatly. The way you have it now makes HDFql automatically calculate a chunk size for your convenience (using a best guess approach), which may not lead to an optimal performance. You need to explicitly specify the chunk size like, e.g., CREATE CHUNKED(10) DATASET events/xs AS SMALLINT(UNLIMITED).

Your code more optimized:
#include <HDFql.hpp>
void createHDF(const std::string & filepath)
{
char script_[1024];
sprintf(script_, "CREATE TRUNCATE AND USE FILE %s", filepath.c_str());
HDFql::execute(script_);
HDFql::execute("CREATE GROUP events, frames, optic_flow");
HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");
}
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
//Input arrays are all the same size
const int data_size = ts_v.size();
//Open file
sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
HDFql::execute(script_);
//Add events
sprintf(script_, "ALTER DIMENSION events/xs, events/ys, events/ts, events/ps TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY 0", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY 1", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY 2", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY 3", data_size, data_size);
HDFql::execute(script_);
total_events_added_ += data_size;
events_idx_++;
}
int main (int argc, const char * argv[]) {
std::string path = "/tmp/test.h5";
createHDF(path);
const int data_size = 1000;
const int iterations = 10000;
std::vector<double> ts(data_size);
std::vector<int16_t> xs(data_size);
std::vector<int16_t> ys(data_size);
std::vector<int8_t> ps(data_size);
for(int i=0; i<data_size; i++)
{
ts_v.push_back(i);
xs_v.push_back(i);
ys_v.push_back(i);
ps_v.push_back(1);
}
HDFql::variableRegister(&xs_v);
HDFql::variableRegister(&ys_v);
HDFql::variableRegister(&ts_v);
HDFql::variableRegister(&ps_v);
for(int i=0; i<iterations; i++)
{
writeData(ts, xs, ys, ps);
}
}
In addition, is it possible to move these two consecutive lines of code sprintf(script_, "USE FILE %s", HDF5_path_.c_str()); HDFql::execute(script_); outside the writeData function and just open the file once? Doing so will for sure make things faster.

Related

Read OpenCV Mat from raw file [duplicate]

Is there a more efficient way to load a large Mat object into memory than the FileStorage method in OpenCV?
I have a large Mat with 192 columns and 1 million rows I want to store locally in a file and load into memory then my application starts. There is no problem using the FileStorage, but I was wondering if there exists a more efficient method to do this. At the moment it takes about 5 minutes to load the Mat into memory using the Debug mode in Visual Studio and around 3 minutes in the Release mode and the size of the data file is around 1.2GB.
Is the FileStorage method the only method available to do this task?
Are you ok with a 100x speedup?
You should save and load your images in binary format. You can do that with the matwrite and matread function in the code below.
I tested both loading from a FileStorage and the binary file, and for a smaller image with 250K rows, 192 columns, type CV_8UC1 I got these results (time in ms):
// Mat: 250K rows, 192 cols, type CV_8UC1
Using FileStorage: 5523.45
Using Raw: 50.0879
On a image with 1M rows and 192 cols using the binary mode I got (time in ms):
// Mat: 1M rows, 192 cols, type CV_8UC1
Using FileStorage: (can't load, out of memory)
Using Raw: 197.381
NOTE
Never measure performance in debug.
3 minutes to load a matrix seems way too much, even for FileStorages. However, you'll gain a lot switching to binary mode.
Here the code with the functions matwrite and matread, and the test:
#include <opencv2\opencv.hpp>
#include <iostream>
#include <fstream>
using namespace std;
using namespace cv;
void matwrite(const string& filename, const Mat& mat)
{
ofstream fs(filename, fstream::binary);
// Header
int type = mat.type();
int channels = mat.channels();
fs.write((char*)&mat.rows, sizeof(int)); // rows
fs.write((char*)&mat.cols, sizeof(int)); // cols
fs.write((char*)&type, sizeof(int)); // type
fs.write((char*)&channels, sizeof(int)); // channels
// Data
if (mat.isContinuous())
{
fs.write(mat.ptr<char>(0), (mat.dataend - mat.datastart));
}
else
{
int rowsz = CV_ELEM_SIZE(type) * mat.cols;
for (int r = 0; r < mat.rows; ++r)
{
fs.write(mat.ptr<char>(r), rowsz);
}
}
}
Mat matread(const string& filename)
{
ifstream fs(filename, fstream::binary);
// Header
int rows, cols, type, channels;
fs.read((char*)&rows, sizeof(int)); // rows
fs.read((char*)&cols, sizeof(int)); // cols
fs.read((char*)&type, sizeof(int)); // type
fs.read((char*)&channels, sizeof(int)); // channels
// Data
Mat mat(rows, cols, type);
fs.read((char*)mat.data, CV_ELEM_SIZE(type) * rows * cols);
return mat;
}
int main()
{
// Save the random generated data
{
Mat m(1024*256, 192, CV_8UC1);
randu(m, 0, 1000);
FileStorage fs("fs.yml", FileStorage::WRITE);
fs << "m" << m;
matwrite("raw.bin", m);
}
// Load the saved matrix
{
// Method 1: using FileStorage
double tic = double(getTickCount());
FileStorage fs("fs.yml", FileStorage::READ);
Mat m1;
fs["m"] >> m1;
double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
cout << "Using FileStorage: " << toc << endl;
}
{
// Method 2: usign raw binary data
double tic = double(getTickCount());
Mat m2 = matread("raw.bin");
double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
cout << "Using Raw: " << toc << endl;
}
int dummy;
cin >> dummy;
return 0;
}

OpenCV create Mat of float from binary

I have a binary Mat (1x256) (CV_8UC1) with 256 bytes. I need to create another Mat (CV_32F) of floats with 1x64 dimension out of it. Meaning that each 4 bytes construct a float value in result matrix.
Is there any way to do that in OpenCV? Or any other C++ way?
#include "opencv2/opencv.hpp"
using namespace cv;
#include <assert.h>
int main()
{
Mat floatOrig = Mat::zeros(1,64,CV_32FC1);
Mat ucharConverted = Mat::zeros(1,256,CV_8UC1);
Mat floatConverted = Mat::zeros(1,64,CV_32FC1);
//construct some data
RNG rng = theRNG();
for(int i=0;i<floatOrig.cols;++i)
{
floatOrig.at<float>(0,i)=rng.gaussian(1.);
}
//save them into uchar first
for(int i=0;i<ucharConverted.cols;++i)
{
ucharConverted.at<uchar>(0,i)= floatOrig.at<uchar>(0,i);
}
//now convert them back into float
//uchar b[4] = {0}; uncomment for big endian data
for(int i=0;i<floatConverted.cols;++i)
{
/* uncomment for big endian ordering
b[0]=ucharConverted.at<uchar>(0,i*4+3);
b[1]=ucharConverted.at<uchar>(0,i*4+2);
b[2]=ucharConverted.at<uchar>(0,i*4+1);
b[3]=ucharConverted.at<uchar>(0,i*4+0);
memcpy(&floatConverted.at<float>(0,i),&b, sizeof(float));
*/
memcpy(&floatConverted.at<float>(0,i),&ucharConverted.at<uchar>(0,i*4), sizeof(float));
}
//verify
for(int i=0;i<floatConverted.cols;++i)
{
assert(floatConverted.at<float>(0,i)-floatOrig.at<float>(0,i)==0.);
}
// now lets try saving that to file
FILE* fp = fopen("c:/data/float64.bin","wb");
for(size_t i=0;i<floatConverted.cols;++i)
{
fwrite( &floatConverted.at<float>(0,i),sizeof(float),1,fp);
}
fclose(fp);
floatConverted=0;//we gonna try to load it back
fp = fopen("c:/data/float64.bin","rb");
for(size_t i=0;i<floatConverted.cols;++i)
{
fread( &floatConverted.at<float>(0,i),sizeof(float),1,fp);
}
fclose(fp);
//verify data read from file
for(int i=0;i<floatConverted.cols;++i)
{
assert(floatConverted.at<float>(0,i)-floatOrig.at<float>(0,i)==0.);
}
getchar();
}

Converting a 2D Canny Edge image to 1D edge pixel array in CUDA - Strange behaviour

I have a CUDA kernel which takes an edge image and processes it to create a smaller, 1D array of the edge pixels. Now here is the strange behaviour. Every time I run the kernel and calculate the number of edge pixels in "d_nlist" (see the code near the printf), I get a greater pixel count each time, even when I use the same image and stop the program completely and re-run. Therefore, each time I run it, it takes longer to run, until eventually, it throws an un-caught exception.
My question is, how can I stop this from happening so that I can get consistent results each time I run the kernel?
My device is a Geforce 620.
Constants:
THREADS_X = 32
THREADS_Y = 4
PIXELS_PER_THREAD = 4
MAX_QUEUE_LENGTH = THREADS_X * THREADS_Y * PIXELS_PER_THREAD
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZE = IMG_WIDTH * IMG_HEIGHT
BLOCKS_X = IMG_WIDTH / (THREADS_X * PIXELS_PER_THREAD)
BLOCKS_Y = IMG_HEIGHT / THREADS_Y
The kernel is as follows:
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}
The following is the method which calls the kernel:
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
// Start timer w/ io
cudaEventRecord(startio,0);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
Thank you very much in advance for your help everyone.
As a preamble, let me suggest some troubleshooting steps that are useful:
instrument your code with proper cuda error checking
run your code with cuda-memcheck e.g. cuda-memcheck ./myapp
If you do the above steps, you'll find that your kernel is failing, and the failures have to do with global writes of size 4. So that will focus your attention on the last segment of your kernel, beginning with the comment // Copy local queues to global queue
Regarding your code, then, you have at least 2 problems:
The addressing/indexing in your final segment of your kernel, where you are writing the individual queues out to global memory, is messed up. I'm not going to try and debug this for you.
You are not initializing your d_nlist variable to zero. Therefore when you do an atomic add to it, you are adding your values to a junk value, which will tend to increase as you repeat the process.
Here's some code which has the problems removed, (I did not try to sort out your queue copy code) and error checking added. It produces repeatable results for me:
$ cat t216.cu
#include <stdio.h>
#include <stdlib.h>
#define THREADS_X 32
#define THREADS_Y 4
#define PIXELS_PER_THREAD 4
#define MAX_QUEUE_LENGTH (THREADS_X*THREADS_Y*PIXELS_PER_THREAD)
#define IMG_WIDTH 256
#define IMG_HEIGHT 256
#define IMG_SIZE (IMG_WIDTH*IMG_HEIGHT)
#define BLOCKS_X (IMG_WIDTH/(THREADS_X*PIXELS_PER_THREAD))
#define BLOCKS_Y (IMG_HEIGHT/THREADS_Y)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
unsigned int* const list, int* const glob_index ) {
unsigned int const x = blockIdx.x * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y * THREADS_Y + threadIdx.y;
volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;
// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
int const xx = i*THREADS_X + x;
// Read one image pixel from global memory
unsigned char const pixel = image[y*IMG_WIDTH + xx];
unsigned int const queue_val = (y << 16) + xx;
if(pixel)
{
do {
qindex++;
sh_qindex[threadIdx.y] = qindex;
sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
} while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
}
// Reload index from smem (last thread to write to smem will have updated it)
qindex = sh_qindex[threadIdx.y];
}
// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
// Find how many items are stored in each list
int total_index = 0;
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] = total_index;
total_index += (sh_qindex[i] + 1u);
}
// Calculate the offset in the global list
unsigned int global_offset = atomicAdd(glob_index, total_index);
#pragma unroll
for(int i=0; i<THREADS_Y; i++)
{
sh_qstart[i] += global_offset;
}
}
__syncthreads();
// Copy local queues to global queue
/*
for(int i=0; i<=qindex; i+=THREADS_X)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
*/
}
void call2DTo1DKernel(unsigned char const * const h_image)
{
// Device side allocation
unsigned char *d_image = NULL;
unsigned int *d_list = NULL;
int h_nlist=0, *d_nlist = NULL;
cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
cudaMalloc((void**)&d_nlist, sizeof(int));
cudaCheckErrors("cudamalloc fail");
// Time measurement initialization
cudaEvent_t start, stop, startio, stopio;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&startio);
cudaEventCreate(&stopio);
float et, etio;
// Start timer w/ io
cudaEventRecord(startio,0);
cudaMemcpy(d_nlist, &h_nlist, sizeof(int), cudaMemcpyHostToDevice);
// Copy image data to device
cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE, cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy 1");
// Start timer
cudaEventRecord(start,0);
// Kernel call
// Phase 1 : Convert 2D binary image to 1D pixel array
dim3 dimBlock1(THREADS_X, THREADS_Y);
dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
// Stop timer
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
// Stop timer w/ io
cudaEventRecord(stopio,0);
cudaEventSynchronize(stopio);
// Time measurement
cudaEventElapsedTime(&et,start,stop);
cudaEventElapsedTime(&etio,startio,stopio);
// Time measurement deinitialization
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventDestroy(startio);
cudaEventDestroy(stopio);
// Get list size
cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2");
// Report on console
printf("%d pixels processed...\n", h_nlist);
// Device side dealloc
cudaFree(d_image);
// cudaFree(d_space);
cudaFree(d_list);
cudaFree(d_nlist);
}
int main(){
unsigned char *image;
image = (unsigned char *)malloc(IMG_SIZE * sizeof(unsigned char));
if (image == 0) {printf("malloc fail\n"); return 0;}
for (int i =0 ; i<IMG_SIZE; i++)
image[i] = rand()%2;
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
call2DTo1DKernel(image);
cudaCheckErrors("some error");
return 0;
}
$ nvcc -arch=sm_20 -O3 -o t216 t216.cu
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$ ./t216
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
32617 pixels processed...
$

cudaFree is not freeing memory

The code below calculates the dot product of two vectors a and b. The correct result is 8192. When I run it for the first time the result is correct. Then when I run it for the second time the result is the previous result + 8192 and so on:
1st iteration: result = 8192
2nd iteration: result = 8192 + 8192
3rd iteration: result = 8192 + 8192
and so on.
I checked by printing it on screen and the device variable dev_c is not freed. What's more writing to it causes something like a sum, the result beeing the previous value plus the new one being written to it. I guess that could be something with the atomicAdd() operation, but nonetheless cudaFree(dev_c) should erase it after all.
#define N 8192
#define THREADS_PER_BLOCK 512
#define NUMBER_OF_BLOCKS (N/THREADS_PER_BLOCK)
#include <stdio.h>
__global__ void dot( int *a, int *b, int *c ) {
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();
if( 0 == threadIdx.x ) {
int sum = 0;
for( int i= 0; i< THREADS_PER_BLOCK; i++ ){
sum += temp[i];
}
atomicAdd(c,sum);
}
}
int main( void ) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
int size = N * sizeof( int);
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, sizeof(int));
a = (int*)malloc(size);
b = (int*)malloc(size);
c = (int*)malloc(sizeof(int));
for(int i = 0 ; i < N ; i++){
a[i] = 1;
b[i] = 1;
}
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
dot<<< N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c);
cudaMemcpy( c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
printf("Dot product = %d\n", *c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
free(a);
free(b);
free(c);
return 0;
}
cudaFree doesn't erase anything, it simply returns memory to a pool to be re-allocated. cudaMalloc doesn't guarantee the value of memory that has been allocated. You need to initialize memory (both global and shared) that your program uses, in order to have consistent results. The same is true for malloc and free, by the way.
From the documentation of cudaMalloc();
The memory is not cleared.
That means that dev_c is not initialized, and your atomicAdd(c,sum); will add to any random value that happens to be stored in memory at the returned position.

Extracting DCT coefficients from encoded images and video

Is there a way to easily extract the DCT coefficients (and quantization parameters) from encoded images and video? Any decoder software must be using them to decode block-DCT encoded images and video. So I'm pretty sure the decoder knows what they are. Is there a way to expose them to whomever is using the decoder?
I'm implementing some video quality assessment algorithms that work directly in the DCT domain. Currently, the majority of my code uses OpenCV, so it would be great if anyone knows of a solution using that framework. I don't mind using other libraries (perhaps libjpeg, but that seems to be for still images only), but my primary concern is to do as little format-specific work as possible (I don't want to reinvent the wheel and write my own decoders). I want to be able to open any video/image (H.264, MPEG, JPEG, etc) that OpenCV can open, and if it's block DCT-encoded, to get the DCT coefficients.
In the worst case, I know that I can write up my own block DCT code, run the decompressed frames/images through it and then I'd be back in the DCT domain. That's hardly an elegant solution, and I hope I can do better.
Presently, I use the fairly common OpenCV boilerplate to open images:
IplImage *image = cvLoadImage(filename);
// Run quality assessment metric
The code I'm using for video is equally trivial:
CvCapture *capture = cvCaptureFromAVI(filename);
while (cvGrabFrame(capture))
{
IplImage *frame = cvRetrieveFrame(capture);
// Run quality assessment metric on frame
}
cvReleaseCapture(&capture);
In both cases, I get a 3-channel IplImage in BGR format. Is there any way I can get the DCT coefficients as well?
Well, I did a bit of reading and my original question seems to be an instance of wishful thinking.
Basically, it's not possible to get the DCT coefficients from H.264 video frames for the simple reason that H.264 doesn't use DCT. It uses a different transform (integer transform). Next, the coefficients for that transform don't necessarily change on a frame-by-frame basis -- H.264 is smarter cause it splits up frames into slices. It should be possible to get those coefficients through a special decoder, but I doubt OpenCV exposes it for the user.
For JPEG, things are a bit more positive. As I suspected, libjpeg exposes the DCT coefficients for you. I wrote a small app to show that it works (source at the end). It makes a new image using the DC term from each block. Because the DC term is equal to the block average (after proper scaling), the DC images are downsampled versions of the input JPEG image.
EDIT: fixed scaling in source
Original image (512 x 512):
DC images (64x64): luma Cr Cb RGB
Source (C++):
#include <stdio.h>
#include <assert.h>
#include <cv.h>
#include <highgui.h>
extern "C"
{
#include "jpeglib.h"
#include <setjmp.h>
}
#define DEBUG 0
#define OUTPUT_IMAGES 1
/*
* Extract the DC terms from the specified component.
*/
IplImage *
extract_dc(j_decompress_ptr cinfo, jvirt_barray_ptr *coeffs, int ci)
{
jpeg_component_info *ci_ptr = &cinfo->comp_info[ci];
CvSize size = cvSize(ci_ptr->width_in_blocks, ci_ptr->height_in_blocks);
IplImage *dc = cvCreateImage(size, IPL_DEPTH_8U, 1);
assert(dc != NULL);
JQUANT_TBL *tbl = ci_ptr->quant_table;
UINT16 dc_quant = tbl->quantval[0];
#if DEBUG
printf("DCT method: %x\n", cinfo->dct_method);
printf
(
"component: %d (%d x %d blocks) sampling: (%d x %d)\n",
ci,
ci_ptr->width_in_blocks,
ci_ptr->height_in_blocks,
ci_ptr->h_samp_factor,
ci_ptr->v_samp_factor
);
printf("quantization table: %d\n", ci);
for (int i = 0; i < DCTSIZE2; ++i)
{
printf("% 4d ", (int)(tbl->quantval[i]));
if ((i + 1) % 8 == 0)
printf("\n");
}
printf("raw DC coefficients:\n");
#endif
JBLOCKARRAY buf =
(cinfo->mem->access_virt_barray)
(
(j_common_ptr)cinfo,
coeffs[ci],
0,
ci_ptr->v_samp_factor,
FALSE
);
for (int sf = 0; (JDIMENSION)sf < ci_ptr->height_in_blocks; ++sf)
{
for (JDIMENSION b = 0; b < ci_ptr->width_in_blocks; ++b)
{
int intensity = 0;
intensity = buf[sf][b][0]*dc_quant/DCTSIZE + 128;
intensity = MAX(0, intensity);
intensity = MIN(255, intensity);
cvSet2D(dc, sf, (int)b, cvScalar(intensity));
#if DEBUG
printf("% 2d ", buf[sf][b][0]);
#endif
}
#if DEBUG
printf("\n");
#endif
}
return dc;
}
IplImage *upscale_chroma(IplImage *quarter, CvSize full_size)
{
IplImage *full = cvCreateImage(full_size, IPL_DEPTH_8U, 1);
cvResize(quarter, full, CV_INTER_NN);
return full;
}
GLOBAL(int)
read_JPEG_file (char * filename, IplImage **dc)
{
/* This struct contains the JPEG decompression parameters and pointers to
* working space (which is allocated as needed by the JPEG library).
*/
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
/* More stuff */
FILE * infile; /* source file */
/* In this example we want to open the input file before doing anything else,
* so that the setjmp() error recovery below can assume the file is open.
* VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
* requires it in order to read binary files.
*/
if ((infile = fopen(filename, "rb")) == NULL) {
fprintf(stderr, "can't open %s\n", filename);
return 0;
}
/* Step 1: allocate and initialize JPEG decompression object */
cinfo.err = jpeg_std_error(&jerr);
/* Now we can initialize the JPEG decompression object. */
jpeg_create_decompress(&cinfo);
/* Step 2: specify data source (eg, a file) */
jpeg_stdio_src(&cinfo, infile);
/* Step 3: read file parameters with jpeg_read_header() */
(void) jpeg_read_header(&cinfo, TRUE);
/* We can ignore the return value from jpeg_read_header since
* (a) suspension is not possible with the stdio data source, and
* (b) we passed TRUE to reject a tables-only JPEG file as an error.
* See libjpeg.txt for more info.
*/
/* Step 4: set parameters for decompression */
/* In this example, we don't need to change any of the defaults set by
* jpeg_read_header(), so we do nothing here.
*/
jvirt_barray_ptr *coeffs = jpeg_read_coefficients(&cinfo);
IplImage *y = extract_dc(&cinfo, coeffs, 0);
IplImage *cb_q = extract_dc(&cinfo, coeffs, 1);
IplImage *cr_q = extract_dc(&cinfo, coeffs, 2);
IplImage *cb = upscale_chroma(cb_q, cvGetSize(y));
IplImage *cr = upscale_chroma(cr_q, cvGetSize(y));
cvReleaseImage(&cb_q);
cvReleaseImage(&cr_q);
#if OUTPUT_IMAGES
cvSaveImage("y.png", y);
cvSaveImage("cb.png", cb);
cvSaveImage("cr.png", cr);
#endif
*dc = cvCreateImage(cvGetSize(y), IPL_DEPTH_8U, 3);
assert(dc != NULL);
cvMerge(y, cr, cb, NULL, *dc);
cvReleaseImage(&y);
cvReleaseImage(&cb);
cvReleaseImage(&cr);
/* Step 7: Finish decompression */
(void) jpeg_finish_decompress(&cinfo);
/* We can ignore the return value since suspension is not possible
* with the stdio data source.
*/
/* Step 8: Release JPEG decompression object */
/* This is an important step since it will release a good deal of memory. */
jpeg_destroy_decompress(&cinfo);
fclose(infile);
return 1;
}
int
main(int argc, char **argv)
{
int ret = 0;
if (argc != 2)
{
fprintf(stderr, "usage: %s filename.jpg\n", argv[0]);
return 1;
}
IplImage *dc = NULL;
ret = read_JPEG_file(argv[1], &dc);
assert(dc != NULL);
IplImage *rgb = cvCreateImage(cvGetSize(dc), IPL_DEPTH_8U, 3);
cvCvtColor(dc, rgb, CV_YCrCb2RGB);
#if OUTPUT_IMAGES
cvSaveImage("rgb.png", rgb);
#else
cvNamedWindow("DC", CV_WINDOW_AUTOSIZE);
cvShowImage("DC", rgb);
cvWaitKey(0);
#endif
cvReleaseImage(&dc);
cvReleaseImage(&rgb);
return 0;
}
You can use, libjpeg to extract dct data of your jpeg file, but for h.264 video file, I can't find any open source code that give you dct data (actully Integer dct data). But you can use h.264 open source software like JM, JSVM or x264. In these two source file, you have to find their specific function that make use of dct function, and change it to your desire form, to get your output dct data.
For Image:
use the following code, and after read_jpeg_file( infilename, v, quant_tbl ), v and quant_tbl will have dct data and quantization table of your jpeg image respectively.
I used Qvector to store my output data, change it to your preferred c++ array list.
#include <iostream>
#include <stdio.h>
#include <jpeglib.h>
#include <stdlib.h>
#include <setjmp.h>
#include <fstream>
#include <QVector>
int read_jpeg_file( char *filename, QVector<QVector<int> > &dct_coeff, QVector<unsigned short> &quant_tbl)
{
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
FILE * infile;
if ((infile = fopen(filename, "rb")) == NULL) {
fprintf(stderr, "can't open %s\n", filename);
return 0;
}
cinfo.err = jpeg_std_error(&jerr);
jpeg_create_decompress(&cinfo);
jpeg_stdio_src(&cinfo, infile);
(void) jpeg_read_header(&cinfo, TRUE);
jvirt_barray_ptr *coeffs_array = jpeg_read_coefficients(&cinfo);
for (int ci = 0; ci < 1; ci++)
{
JBLOCKARRAY buffer_one;
JCOEFPTR blockptr_one;
jpeg_component_info* compptr_one;
compptr_one = cinfo.comp_info + ci;
for (int by = 0; by < compptr_one->height_in_blocks; by++)
{
buffer_one = (cinfo.mem->access_virt_barray)((j_common_ptr)&cinfo, coeffs_array[ci], by, (JDIMENSION)1, FALSE);
for (int bx = 0; bx < compptr_one->width_in_blocks; bx++)
{
blockptr_one = buffer_one[0][bx];
QVector<int> tmp;
for (int bi = 0; bi < 64; bi++)
{
tmp.append(blockptr_one[bi]);
}
dct_coeff.push_back(tmp);
}
}
}
// coantization table
j_decompress_ptr dec_cinfo = (j_decompress_ptr) &cinfo;
jpeg_component_info *ci_ptr = &dec_cinfo->comp_info[0];
JQUANT_TBL *tbl = ci_ptr->quant_table;
for(int ci =0 ; ci < 64; ci++){
quant_tbl.append(tbl->quantval[ci]);
}
return 1;
}
int main()
{
QVector<QVector<int> > v;
QVector<unsigned short> quant_tbl;
char *infilename = "your_image.jpg";
std::ofstream out;
out.open("out_dct.txt");
if( read_jpeg_file( infilename, v, quant_tbl ) > 0 ){
for(int j = 0; j < v.size(); j++ ){
for (int i = 0; i < v[0].size(); ++i){
out << v[j][i] << "\t";
}
out << "---------------" << std::endl;
}
out << "\n\n\n" << std::string(10,'-') << std::endl;
out << "\nQauntization Table:" << std::endl;
for(int i = 0; i < quant_tbl.size(); i++ ){
out << quant_tbl[i] << "\t";
}
}
else{
std::cout << "Can not read, Returned With Error";
return -1;
}
out.close();
return 0;
}

Resources