OpenCL read-back bus speed drops sharply for larger buffers

I have a simple OpenCL host program that writes some amount of data into a buffer on an OpenCL-enabled device and then reads it back into host memory. It repeats this experiment for a range of buffer sizes. Here is the code I use:
#include <iostream>
#include "support.h"
#include "Event.h"
#include "ResultDatabase.h"
#include "OptionParser.h"
using namespace std;
void addBenchmarkSpecOptions(OptionParser &op)
{
op.addOption("nopinned", OPT_BOOL, "",
"disable usage of pinned (pagelocked) memory");
}
// Modifications:
// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010
// Added calculation of latency estimate.
void RunBenchmark(cl_device_id id,
cl_context ctx,
cl_command_queue queue,
ResultDatabase &resultDB,
OptionParser &op)
{
bool verbose = op.getOptionBool("verbose");
bool pinned = !op.getOptionBool("nopinned");
int npasses = op.getOptionInt("passes");
const bool waitForEvents = true;
// Sizes are in kb
int nSizes = 20;
int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
32768,65536,131072,262144,524288};
// Make sure we don't surpass the OpenCL limit.
cl_long maxAllocSizeBytes = 0;
clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes)
{
--nSizes;
if (verbose) cout << " - dropping allocation size to keep under reported limit.\n";
if (nSizes < 1)
{
cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b";
return;
}
}
// Create some host memory pattern
if (verbose) cout << ">> creating host mem pattern\n";
int err;
float *hostMem1;
float *hostMem2;
cl_mem hostMemObj1;
cl_mem hostMemObj2;
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
if (pinned)
{
int err1, err2;
hostMemObj1 = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err1);
if (err1 == CL_SUCCESS)
{
hostMem1 = (float*)clEnqueueMapBuffer(queue, hostMemObj1, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err1);
}
hostMemObj2 = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err2);
if (err2 == CL_SUCCESS)
{
hostMem2 = (float*)clEnqueueMapBuffer(queue, hostMemObj2, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err2);
}
while (err1 != CL_SUCCESS || err2 != CL_SUCCESS)
{
// free the first buffer if only the second failed
if (err1 == CL_SUCCESS)
clReleaseMemObject(hostMemObj1);
// drop the size and try again
if (verbose) cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
cerr << "Error: Couldn't allocated any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hostMemObj1 = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err1);
if (err1 == CL_SUCCESS)
{
hostMem1 = (float*)clEnqueueMapBuffer(queue, hostMemObj1, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err1);
}
hostMemObj2 = clCreateBuffer(ctx,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof(float)*numMaxFloats, NULL, &err2);
if (err2 == CL_SUCCESS)
{
hostMem2 = (float*)clEnqueueMapBuffer(queue, hostMemObj2, true,
CL_MAP_READ|CL_MAP_WRITE,
0,sizeof(float)*numMaxFloats,0,
NULL,NULL,&err2);
}
}
}
else
{
hostMem1 = new float[numMaxFloats];
hostMem2 = new float[numMaxFloats];
}
for (int i=0; i<numMaxFloats; i++) {
hostMem1[i] = i % 77;
hostMem2[i] = -1;
}
// Allocate some device memory
if (verbose) cout << ">> allocating device mem\n";
cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
sizeof(float)*numMaxFloats, NULL, &err);
while (err != CL_SUCCESS)
{
// drop the size and try again
if (verbose) cout << " - dropping size allocating device mem\n";
--nSizes;
if (nSizes < 1)
{
cerr << "Error: Couldn't allocated any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
sizeof(float)*numMaxFloats, NULL, &err);
}
if (verbose) cout << ">> filling device mem to force allocation\n";
Event evDownloadPrime("DownloadPrime");
err = clEnqueueWriteBuffer(queue, mem1, false, 0,
numMaxFloats*sizeof(float), hostMem1,
0, NULL, &evDownloadPrime.CLEvent());
CL_CHECK_ERROR(err);
if (verbose) cout << ">> waiting for download to finish\n";
err = clWaitForEvents(1, &evDownloadPrime.CLEvent());
CL_CHECK_ERROR(err);
// Three passes, forward and backward both
for (int pass = 0; pass < npasses; pass++)
{
// store the times temporarily to estimate latency
//float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++)
{
int sizeIndex;
if ((pass%2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes-1) - i;
// Read memory back from the device
if (verbose) cout << ">> reading from device "<<sizes[sizeIndex]<<"kB\n";
Event evReadback("Readback");
err = clEnqueueReadBuffer(queue, mem1, false, 0,
sizes[sizeIndex]*1024, hostMem2,
0, NULL, &evReadback.CLEvent());
CL_CHECK_ERROR(err);
// Wait for event to finish
if (verbose) cout << ">> waiting for readback to finish\n";
err = clWaitForEvents(1, &evReadback.CLEvent());
CL_CHECK_ERROR(err);
if (verbose) cout << ">> finish!";
if (verbose) cout << endl;
// Get timings
err = clFlush(queue);
CL_CHECK_ERROR(err);
evReadback.FillTimingInfo();
if (verbose) evReadback.Print(cerr);
double t = evReadback.SubmitEndRuntime() / 1.e6; // in ms
//times[sizeIndex] = t;
// Add timings to database
double speed = (double(sizes[sizeIndex] * 1024.) / (1000.*1000.)) / t;
char sizeStr[256];
sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed);
// Add timings to database
double delay = evReadback.SubmitStartDelay() / 1.e6;
resultDB.AddResult("ReadbackDelay", sizeStr, "ms", delay);
resultDB.AddResult("ReadbackTime", sizeStr, "ms", t);
}
//resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
//resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
//resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
}
// Cleanup
err = clReleaseMemObject(mem1);
CL_CHECK_ERROR(err);
if (pinned)
{
err = clReleaseMemObject(hostMemObj1);
CL_CHECK_ERROR(err);
err = clReleaseMemObject(hostMemObj2);
CL_CHECK_ERROR(err);
}
else
{
delete[] hostMem1;
delete[] hostMem2;
}
}
When I run the code and plot the read-back speed against the buffer size, I see the following (plot not included here):
As you can see, the performance drops for data sizes over 1024 KB and then levels off at some point.
My device is a Tesla K40 and I am using NVIDIA's OpenCL driver, but I still do not understand the main reason behind such weird behaviour.

Related

iOS FFmpeg high-level API

I have a video file with subtitles and I'd like to extract all the subtitles from it. From the terminal it's quite easy to do:
ffmpeg -i video.mkv -map 0:s:0 subs.srt
How can I execute this command on iOS?
Edit
Or maybe you know an easy way to get subtitles from a video file? My attempt below fails: av_guess_format returns NULL.
+ (void)readSubtitles:(NSString *)videoPath saveFolder:(NSString *)saveFolder {
AVFormatContext *pFormatCtx;
av_register_all();
avcodec_register_all();
avformat_network_init();
pFormatCtx = avformat_alloc_context();
if (avformat_open_input(&pFormatCtx, [videoPath UTF8String], NULL, NULL) != 0) {
return;
}
if (avformat_find_stream_info(pFormatCtx, NULL) < 0) {
return;
}
for (int i = 0; i < pFormatCtx->nb_streams; i++) {
if (pFormatCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
NSString *subPath = [saveFolder stringByAppendingPathComponent:[NSString stringWithFormat:@"sub_%d.srt", i]];
[self parseSubtitles:pFormatCtx streamIdx:i savePath:subPath];
}
}
}
+ (void)parseSubtitles:(AVFormatContext *)context streamIdx:(int)idx savePath:(NSString *)savePath {
const char *filename = [savePath UTF8String];
AVStream *avstream = context->streams[idx];
AVCodec *codec = avcodec_find_decoder( avstream->codec->codec_id );
int result = avcodec_open2( avstream->codec, codec, NULL );
AVOutputFormat *outFormat = av_guess_format( NULL, "sub.mp4", NULL );
NSAssert(outFormat != NULL, @"Error finding format"); // !!! fails !!!
NSLog(@"Found output format: %@ (%@)", [NSString stringWithUTF8String:outFormat->name], [NSString stringWithUTF8String:outFormat->long_name]);
AVFormatContext *outFormatContext;
avformat_alloc_output_context2( &outFormatContext, NULL, NULL, filename );
AVCodec *encoder = avcodec_find_encoder( outFormat->subtitle_codec );
// checkResult( encoder != NULL, "Error finding encoder" );
NSLog(#"Found encoder: %#", [NSString stringWithUTF8String:encoder->name]);
AVStream *outStream = avformat_new_stream( outFormatContext, encoder );
AVCodecContext *c = outStream->codec;
result = avcodec_get_context_defaults3( c, encoder );
// outStream->codecpar
NSLog(#"outstream codec: %#", [NSString stringWithUTF8String:outStream->codec]);
NSLog(#"Opened stream %d, codec=%d", outStream->id, outStream->codec->codec_id);
result = avio_open( &outFormatContext->pb, filename, AVIO_FLAG_WRITE );
// checkResult( result == 0, "Error opening out file" );
// cerr << "out file opened correctly" << endl;
result = avformat_write_header( outFormatContext, NULL );
// checkResult(result==0, "Error writing header");
// cerr << "header wrote correctly" << endl;
result = 0;
AVPacket pkt;
av_init_packet( &pkt );
pkt.data = NULL;
pkt.size = 0;
// cerr << "srt codec id: " << AV_CODEC_ID_SUBRIP << endl;
while( av_read_frame( context, &pkt ) >= 0 )
{
if(pkt.stream_index != idx)
continue;
int gotSubtitle = 0;
AVSubtitle subtitle;
result = avcodec_decode_subtitle2( avstream->codec, &subtitle, &gotSubtitle, &pkt );
uint64_t bufferSize = 1024 * 1024 ;
uint8_t *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
memset(buffer, 0, bufferSize);
if( result >= 0 )
{
result = avcodec_encode_subtitle( outStream->codec, buffer, bufferSize, &subtitle );
// cerr << "Encode subtitle result: " << result << endl;
}
// cerr << "Encoded subtitle: " << buffer << endl;
free(buffer);
}
}

Why do my in-kernel dynamic memory allocations fail for larger grid sizes?

I need to dynamically allocate some arrays inside the kernel function. Here is the code:
__global__
void kernel3(int N)
{
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index < N)
{
float * cost = new float[100];
for (int i = 0; i < 100; i++)
cost[i] = 1;
}
}
and
int main()
{
cudaDeviceSynchronize();
cudaThreadSynchronize();
size_t mem_tot_01 = 0;
size_t mem_free_01 = 0;
cudaMemGetInfo(&mem_free_01, &mem_tot_01);
cout << "Free memory " << mem_free_01 << endl;
cout << "Total memory " << mem_tot_01 << endl;
system("pause");
int blocksize = 256;
int aaa = 16000;
int numBlocks = (aaa + blocksize - 1) / blocksize;
kernel3 << <numBlocks, blocksize >> >(aaa);
cudaDeviceSynchronize();
cudaError_t err1 = cudaGetLastError();
if (err1 != cudaSuccess)
{
printf("Error: %s\n", cudaGetErrorString(err1));
system("pause");
}
cudaMemGetInfo(&mem_free_01, &mem_tot_01);
cout << "Free memory " << mem_free_01 << endl;
cout << "Total memory " << mem_tot_01 << endl;
system("pause");
}
In the first round of cudaMemGetInfo:
Free memory:3600826368
Total memory:4294967297
and I got this error:
Error:unspecified launch failure
I tried changing "int aaa" to a smaller value; then there is no error, but the reported memory usage does not match what I allocated.
What's wrong with it? The memory should be enough: 16000 × 100 × 4 bytes ≈ 6.4 MB, which is far less than the 3600826368 bytes reported free.
The device memory allocated by the new operator comes from a runtime heap of fixed size.
If your code requires a large amount of runtime heap memory, you will probably need to increase the size of the heap before running any kernels. NVIDIA provides the cudaDeviceSetLimit API for this purpose; calling it with the cudaLimitMallocHeapSize flag sets the size of the heap.
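For example, a minimal sketch of such a call, reusing the names from the main() in the question (the 128 MB figure is just an illustrative choice, not a recommendation):
// Enlarge the heap used by in-kernel new/malloc; must be done before the first kernel launch.
size_t heapBytes = 128 * 1024 * 1024; // e.g. 128 MB, chosen arbitrarily here
cudaError_t limErr = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);
if (limErr != cudaSuccess)
    printf("cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(limErr));
kernel3<<<numBlocks, blocksize>>>(aaa); // per-thread new float[100] allocations now come from the enlarged heap
Note also that kernel3 as written never frees the arrays it allocates with new, so each launch permanently consumes heap space until the application exits; adding delete[] cost at the end of the if block would avoid that and also explains why the reported free memory does not match expectations.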

Car detection using HOG features and CvSVM

I am doing a project for which I need to detect the rear of a car using HOG features. Once I had calculated the HOG features I trained a CvSVM using positive and negative samples, and the CvSVM correctly classifies new data. Here is the code I used to train it.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/ml/ml.hpp>
#include "opencv2/opencv.hpp"
#include "LinearSVM.h"
using namespace cv;
using namespace std;
int main(void)
{
LinearSVM *s = new LinearSVM;
vector<float> values, values1, values2, values3, values4;
FileStorage fs2("/home/ubuntu/Desktop/opencv-svm/vecSupport.yml", FileStorage::READ);
FileStorage fs3("/home/ubuntu/Desktop/opencv-svm/vecSupport1.yml", FileStorage::READ);
FileStorage fs4("/home/ubuntu/Desktop/opencv-svm/vecSupport2.yml", FileStorage::READ);
FileStorage fs5("/home/ubuntu/Desktop/opencv-svm/vecSupport3.yml", FileStorage::READ);
FileStorage fs6("/home/ubuntu/Desktop/opencv-svm/vecSupport4.yml", FileStorage::READ);
fs2["vector"]>>values;
fs3["vector"]>>values1;
fs4["vector"]>>values2;
fs5["vector"]>>values3;
fs6["vector"]>>values4;
//fill with data
values.insert(values.end(), values1.begin(), values1.end());
values.insert(values.end(), values2.begin(), values2.end());
fs2.release();
fs3.release();
fs4.release();
float arr[188496];
float car[2772];
float noncar[2772];
// move positive and negative to arr
std::copy(values.begin(), values.end(), arr);
std::copy(values3.begin(), values3.end(), car);
std::copy(values4.begin(), values4.end(), noncar);
float labels[68];
for (unsigned int s = 0; s < 68; s++)
{
if (s<34)
labels[s] = +1;
else
labels[s] = -1;
}
Mat labelsMat(68, 1, CV_32FC1, labels);
Mat trainingDataMat(68,2772, CV_32FC1, arr);
// Set up SVM's parameters
CvSVMParams params;
params.svm_type = CvSVM::C_SVC;
params.kernel_type = CvSVM::LINEAR;
params.term_crit = cvTermCriteria(CV_TERMCRIT_ITER, 100, 1e-6);
// Train the SVM
LinearSVM SVM;
SVM.train(trainingDataMat, labelsMat, Mat(), Mat(), params);
Mat matinput(1,2772,CV_32FC1,noncar);
//cout<<matinput;
float response = SVM.predict(matinput);
cout<<"Response : "<<response<<endl;
SVM.save("Classifier.xml");
vector<float>primal;
// LinearSVM s;
//s.getSupportVector(primal);
SVM.getSupportVector(primal);
FileStorage fs("/home/ubuntu/Desktop/opencv-svm/test.yml", FileStorage::WRITE);
fs << "dector" << primal;
fs.release();
}
// LinearSVM cpp file
#include "LinearSVM.h"
void LinearSVM::getSupportVector(std::vector<float>& support_vector) const {
int sv_count = get_support_vector_count();
const CvSVMDecisionFunc* df = decision_func;
const double* alphas = df[0].alpha;
double rho = df[0].rho;
int var_count = get_var_count();
support_vector.resize(var_count, 0);
for (unsigned int r = 0; r < (unsigned)sv_count; r++) {
float myalpha = alphas[r];
const float* v = get_support_vector(r);
for (int j = 0; j < var_count; j++,v++) {
support_vector[j] += (-myalpha) * (*v);
}
}
support_vector.push_back(rho);
}
// LinearSVM head file
#ifndef LINEAR_SVM_H_
#define LINEAR_SVM_H_
#include <opencv2/core/core.hpp>
#include <opencv2/ml/ml.hpp>
class LinearSVM: public CvSVM {
public:
void getSupportVector(std::vector<float>& support_vector) const;
};
#endif /* LINEAR_SVM_H_ */
After this step I got the vector file that I can feed into the setSVMDetector method. Here is my code. I have used a window size of 96 x 64 and a scale of 1.11.
#include <iostream>
#include <fstream>
#include <string>
#include <time.h>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <stdexcept>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace std;
using namespace cv;
bool help_showed = false;
class Args
{
public:
Args();
static Args read(int argc, char** argv);
string src;
bool src_is_video;
bool src_is_camera;
int camera_id;
bool write_video;
string dst_video;
double dst_video_fps;
bool make_gray;
bool resize_src;
int width, height;
double scale;
int nlevels;
int gr_threshold;
double hit_threshold;
bool hit_threshold_auto;
int win_width;
int win_stride_width, win_stride_height;
bool gamma_corr;
};
class App
{
public:
App(const Args& s);
void run();
void handleKey(char key);
void hogWorkBegin();
void hogWorkEnd();
string hogWorkFps() const;
void workBegin();
void workEnd();
string workFps() const;
string message() const;
private:
App operator=(App&);
Args args;
bool running;
bool use_gpu;
bool make_gray;
double scale;
int gr_threshold;
int nlevels;
double hit_threshold;
bool gamma_corr;
int64 hog_work_begin;
double hog_work_fps;
int64 work_begin;
double work_fps;
};
static void printHelp()
{
cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
<< "\nUsage: hog_gpu\n"
<< " (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
<< " [--make_gray <true/false>] # convert image to gray one or not\n"
<< " [--resize_src <true/false>] # do resize of the source image or not\n"
<< " [--width <int>] # resized image width\n"
<< " [--height <int>] # resized image height\n"
<< " [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
<< " [--scale <double>] # HOG window scale factor\n"
<< " [--nlevels <int>] # max number of HOG window scales\n"
<< " [--win_width <int>] # width of the window (48 or 64)\n"
<< " [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
<< " [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
<< " [--gr_threshold <int>] # merging similar rects constant\n"
<< " [--gamma_correct <int>] # do gamma correction or not\n"
<< " [--write_video <bool>] # write video or not\n"
<< " [--dst_video <path>] # output video path\n"
<< " [--dst_video_fps <double>] # output video fps\n";
help_showed = true;
}
int main(int argc, char** argv)
{
try
{
if (argc < 2)
printHelp();
Args args = Args::read(argc, argv);
if (help_showed)
return -1;
App app(args);
app.run();
}
catch (const Exception& e) { return cout << "error: " << e.what() << endl, 1; }
catch (const exception& e) { return cout << "error: " << e.what() << endl, 1; }
catch(...) { return cout << "unknown exception" << endl, 1; }
return 0;
}
Args::Args()
{
src_is_video = false;
src_is_camera = false;
camera_id = 0;
write_video = false;
dst_video_fps = 24.;
make_gray = false;
resize_src = false;
width = 640;
height = 480;
scale = 1.11;
nlevels = 13;
gr_threshold = 1;
hit_threshold = 1.4;
hit_threshold_auto = true;
win_width = 64;
win_stride_width = 8;
win_stride_height = 8;
gamma_corr = true;
}
Args Args::read(int argc, char** argv)
{
Args args;
for (int i = 1; i < argc; i++)
{
if (string(argv[i]) == "--make_gray") args.make_gray = (string(argv[++i]) == "true");
else if (string(argv[i]) == "--resize_src") args.resize_src = (string(argv[++i]) == "true");
else if (string(argv[i]) == "--width") args.width = atoi(argv[++i]);
else if (string(argv[i]) == "--height") args.height = atoi(argv[++i]);
else if (string(argv[i]) == "--hit_threshold")
{
args.hit_threshold = atof(argv[++i]);
args.hit_threshold_auto = false;
}
else if (string(argv[i]) == "--scale") args.scale = atof(argv[++i]);
else if (string(argv[i]) == "--nlevels") args.nlevels = atoi(argv[++i]);
else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
else if (string(argv[i]) == "--dst_video") args.dst_video = argv[++i];
else if (string(argv[i]) == "--dst_video_fps") args.dst_video_fps = atof(argv[++i]);
else if (string(argv[i]) == "--help") printHelp();
else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
else if (args.src.empty()) args.src = argv[i];
else throw runtime_error((string("unknown key: ") + argv[i]));
}
return args;
}
App::App(const Args& s)
{
cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
args = s;
cout << "\nControls:\n"
<< "\tESC - exit\n"
<< "\tm - change mode GPU <-> CPU\n"
<< "\tg - convert image to gray or not\n"
<< "\t1/q - increase/decrease HOG scale\n"
<< "\t2/w - increase/decrease levels count\n"
<< "\t3/e - increase/decrease HOG group threshold\n"
<< "\t4/r - increase/decrease hit threshold\n"
<< endl;
use_gpu = true;
make_gray = args.make_gray;
scale = args.scale;
gr_threshold = args.gr_threshold;
nlevels = args.nlevels;
if (args.hit_threshold_auto)
args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
hit_threshold = args.hit_threshold;
gamma_corr = args.gamma_corr;
/*
if (args.win_width != 64 && args.win_width != 48)
args.win_width = 64;*/
cout << "Scale: " << scale << endl;
if (args.resize_src)
cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
cout << "Group threshold: " << gr_threshold << endl;
cout << "Levels number: " << nlevels << endl;
cout << "Win width: " << args.win_width << endl;
cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
cout << "Hit threshold: " << hit_threshold << endl;
cout << "Gamma correction: " << gamma_corr << endl;
cout << endl;
}
void App::run()
{
FileStorage fs("/home/ubuntu/Desktop/implemenatation/vecSupport.yml", FileStorage::READ);
vector<float> detector;
int frameCount;
fs["vector"] >> detector;
for (unsigned int i=0; i<detector.size(); i++)
{
std::cout << std::fixed << std::setprecision(10) << detector[i] << std::endl;
}
fs.release();
running = true;
cv::VideoWriter video_writer;
Size win_size(96,64); //(64, 128) or (48, 96)
Size win_stride(args.win_stride_width, args.win_stride_height);
// Create HOG descriptors and detectors here
/*
vector<float> detector;
if (win_size == Size(64, 128))
detector = cv::gpu::HOGDescriptor::getPeopleDetector64x128();
else
detector = cv::gpu::HOGDescriptor::getPeopleDetector48x96();*/
cv::gpu::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
cv::gpu::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
cv::gpu::HOGDescriptor::DEFAULT_NLEVELS);
cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
gpu_hog.setSVMDetector(detector);
cpu_hog.setSVMDetector(detector);
while (running)
{
VideoCapture vc;
Mat frame;
if (args.src_is_video)
{
vc.open(args.src.c_str());
if (!vc.isOpened())
throw runtime_error(string("can't open video file: " + args.src));
vc >> frame;
}
else if (args.src_is_camera)
{
vc.open(args.camera_id);
if (!vc.isOpened())
{
stringstream msg;
msg << "can't open camera: " << args.camera_id;
throw runtime_error(msg.str());
}
vc >> frame;
}
else
{
frame = imread(args.src);
if (frame.empty())
throw runtime_error(string("can't open image file: " + args.src));
}
Mat img_aux, img, img_to_show;
gpu::GpuMat gpu_img;
// Iterate over all frames
while (running && !frame.empty())
{
workBegin();
// Change format of the image
if (make_gray) cvtColor(frame, img_aux, CV_BGR2GRAY);
else if (use_gpu) cvtColor(frame, img_aux, CV_BGR2BGRA);
else frame.copyTo(img_aux);
// Resize image
if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
else img = img_aux;
img_to_show = img;
gpu_hog.nlevels = nlevels;
cpu_hog.nlevels = nlevels;
vector<Rect> found;
// Perform HOG classification
hogWorkBegin();
if (use_gpu)
{
gpu_img.upload(img);
gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
Size(0, 0), scale, gr_threshold);
}
else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
Size(0, 0), scale, gr_threshold);
hogWorkEnd();
// Draw positive classified windows
for (size_t i = 0; i < found.size(); i++)
{
Rect r = found[i];
rectangle(img_to_show, r.tl(), r.br(), CV_RGB(0, 255, 0), 3);
}
if (use_gpu)
putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
else
putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
imshow("opencv_gpu_hog", img_to_show);
if (args.src_is_video || args.src_is_camera) vc >> frame;
workEnd();
if (args.write_video)
{
if (!video_writer.isOpened())
{
video_writer.open(args.dst_video, CV_FOURCC('x','v','i','d'), args.dst_video_fps,
img_to_show.size(), true);
if (!video_writer.isOpened())
throw std::runtime_error("can't create video writer");
}
if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
else cvtColor(img_to_show, img, CV_BGRA2BGR);
video_writer << img;
}
handleKey((char)waitKey(3));
}
}
}
void App::handleKey(char key)
{
switch (key)
{
case 27:
running = false;
break;
case 'm':
case 'M':
use_gpu = !use_gpu;
cout << "Switched to " << (use_gpu ? "CUDA" : "CPU") << " mode\n";
break;
case 'g':
case 'G':
make_gray = !make_gray;
cout << "Convert image to gray: " << (make_gray ? "YES" : "NO") << endl;
break;
case '1':
scale *= 1.11;
cout << "Scale: " << scale << endl;
break;
case 'q':
case 'Q':
scale /= 1.11;
cout << "Scale: " << scale << endl;
break;
case '2':
nlevels++;
cout << "Levels number: " << nlevels << endl;
break;
case 'w':
case 'W':
nlevels = max(nlevels - 1, 1);
cout << "Levels number: " << nlevels << endl;
break;
case '3':
gr_threshold++;
cout << "Group threshold: " << gr_threshold << endl;
break;
case 'e':
case 'E':
gr_threshold = max(0, gr_threshold - 1);
cout << "Group threshold: " << gr_threshold << endl;
break;
case '4':
hit_threshold+=0.25;
cout << "Hit threshold: " << hit_threshold << endl;
break;
case 'r':
case 'R':
hit_threshold = max(0.0, hit_threshold - 0.25);
cout << "Hit threshold: " << hit_threshold << endl;
break;
case 'c':
case 'C':
gamma_corr = !gamma_corr;
cout << "Gamma correction: " << gamma_corr << endl;
break;
}
}
inline void App::hogWorkBegin() { hog_work_begin = getTickCount(); }
inline void App::hogWorkEnd()
{
int64 delta = getTickCount() - hog_work_begin;
double freq = getTickFrequency();
hog_work_fps = freq / delta;
}
inline string App::hogWorkFps() const
{
stringstream ss;
ss << hog_work_fps;
return ss.str();
}
inline void App::workBegin() { work_begin = getTickCount(); }
inline void App::workEnd()
{
int64 delta = getTickCount() - work_begin;
double freq = getTickFrequency();
work_fps = freq / delta;
}
inline string App::workFps() const
{
stringstream ss;
ss << work_fps;
return ss.str();
}
Problem:
I am not able to detect anything. Can someone look at my work and let me know what I am doing wrong? Any suggestions would be valuable. Thank you. For the last four weeks I have been going over these steps again and again.
P.S: You can find yaml files here and test images along with the annotations here
First of all, partition your data for cross-validation as already suggested. Second, it is a good idea to use an RBF kernel rather than a linear kernel; I highly doubt that a linear kernel can learn complex objects. A brief explanation is given here. Finally, experiment with the parameters. To do that you need to check the limits of the parameter space; it has been a while since I last used SVMs, so I cannot provide details, but a grid search with 20% cross-validation is a good start.
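As an illustration only (the kernel choice and fold count are this answer's suggestions, not tuned values), training with an RBF kernel and letting OpenCV's built-in grid search pick C and gamma could look roughly like this with the old CvSVM API used in the question:
// Sketch: RBF kernel plus automatic grid search over C/gamma with 5-fold cross-validation.
CvSVMParams params;
params.svm_type    = CvSVM::C_SVC;
params.kernel_type = CvSVM::RBF;                                   // RBF instead of LINEAR
params.term_crit   = cvTermCriteria(CV_TERMCRIT_ITER, 1000, 1e-6);

CvSVM svm;
// train_auto varies C and gamma over built-in grids and keeps the best-scoring combination.
svm.train_auto(trainingDataMat, labelsMat, Mat(), Mat(), params, 5);
svm.save("Classifier_rbf.xml");
Keep in mind that an RBF model cannot be collapsed into the single primal vector that HOGDescriptor::setSVMDetector expects, so it would have to be applied with CvSVM::predict on each candidate window instead.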

Kinect 2 failing to 'AcquireLatestFrame'

I am using the following code from a sample I found online. It seems to pick up one frame, but subsequently AcquireLatestFrame never succeeds again. I have exactly the same problem with the body reader. Can anyone see an issue that might be causing this?
IKinectSensor* pSensor;
HRESULT hResult = S_OK;
hResult = GetDefaultKinectSensor(&pSensor);
if (FAILED(hResult)) {
std::cerr << "Error : GetDefaultKinectSensor" << std::endl;
return -1;
}
hResult = pSensor->Open();
if (FAILED(hResult)) {
std::cerr << "Error : IKinectSensor::Open()" << std::endl;
return -1;
}
// Source
IColorFrameSource* pColorSource;
hResult = pSensor->get_ColorFrameSource(&pColorSource);
if (FAILED(hResult)) {
std::cerr << "Error : IKinectSensor::get_ColorFrameSource()" << std::endl;
return -1;
}
// Reader
IColorFrameReader* pColorReader;
hResult = pColorSource->OpenReader(&pColorReader);
if (FAILED(hResult)) {
std::cerr << "Error : IColorFrameSource::OpenReader()" << std::endl;
return -1;
}
// Description
IFrameDescription* pDescription;
hResult = pColorSource->get_FrameDescription(&pDescription);
if (FAILED(hResult)) {
std::cerr << "Error : IColorFrameSource::get_FrameDescription()" << std::endl;
return -1;
}
int width = 0;
int height = 0;
pDescription->get_Width(&width); // 1920
pDescription->get_Height(&height); // 1080
unsigned int bufferSize = width * height * 4 * sizeof(unsigned char);
cv::Mat bufferMat(height, width, CV_8UC4);
cv::Mat colorMat(height / 2, width / 2, CV_8UC4);
cv::namedWindow("Color");
while (1) {
// Frame
IColorFrame* pColorFrame = nullptr;
hResult = pColorReader->AcquireLatestFrame(&pColorFrame);
if (SUCCEEDED(hResult)) {
hResult = pColorFrame->CopyConvertedFrameDataToArray(bufferSize, reinterpret_cast<BYTE*>(bufferMat.data), ColorImageFormat::ColorImageFormat_Bgra);
if (SUCCEEDED(hResult)) {
cv::resize(bufferMat, colorMat, cv::Size(), 0.5, 0.5);
}
}
else
{
cout << "Can't aquire latest frame.\n";
}
cv::imshow("Color", colorMat);
if (cv::waitKey(30) == VK_ESCAPE) {
break;
}
}
if (pSensor) {
pSensor->Close();
}
cv::destroyAllWindows();
I wasn't releasing the pColorFrame. Doing so solved the issue.
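For reference, a minimal sketch of that fix inside the while loop (the SafeRelease helper is my own addition here, not part of the original sample):
// Small helper: release a COM interface pointer if it is non-null.
template<class T> inline void SafeRelease(T*& p) { if (p) { p->Release(); p = nullptr; } }

// ... at the end of each iteration of the while (1) loop, after the frame data has been copied:
SafeRelease(pColorFrame); // without this, AcquireLatestFrame keeps failing after the first frame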

OpenCL: Strange buffer or image behaviour with NVIDIA but not AMD

I have a big problem (on Linux):
I create a buffer with known data, and an OpenCL kernel then takes this data and writes it into an image2d_t. On an AMD C-50 (Fusion CPU/GPU) the program works as desired, but on my GeForce 9500 GT the kernel only rarely computes the correct result: sometimes the result is correct, but very often it is wrong. Sometimes it depends on very strange changes, such as removing unused variable declarations or adding a newline. I noticed that disabling optimization increases the probability of failure. I have the most recent display driver on both systems.
Here is my reduced code:
#include <CL/cl.h>
#include <string>
#include <iostream>
#include <sstream>
#include <cmath>
void checkOpenCLErr(cl_int err, std::string name){
const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
if (err != CL_SUCCESS) {
std::stringstream str;
str << errorString[-err] << " (" << err << ")";
throw std::string(name)+(str.str());
}
}
int main(){
try{
cl_context m_context;
cl_platform_id* m_platforms;
unsigned int m_numPlatforms;
cl_command_queue m_queue;
cl_device_id m_device;
cl_int error = 0; // Used to handle error codes
clGetPlatformIDs(0,NULL,&m_numPlatforms);
m_platforms = new cl_platform_id[m_numPlatforms];
error = clGetPlatformIDs(m_numPlatforms,m_platforms,&m_numPlatforms);
checkOpenCLErr(error, "getPlatformIDs");
// Device
error = clGetDeviceIDs(m_platforms[0], CL_DEVICE_TYPE_GPU, 1, &m_device, NULL);
checkOpenCLErr(error, "getDeviceIDs");
// Context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(m_platforms[0]), 0};
m_context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
// m_private->m_context = clCreateContext(properties, 1, &m_private->m_device, NULL, NULL, &error);
checkOpenCLErr(error, "Create context");
// Command-queue
m_queue = clCreateCommandQueue(m_context, m_device, 0, &error);
checkOpenCLErr(error, "Create command queue");
//Build program and kernel
const char* source = "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n"
"\n"
"__kernel void bufToImage(__global unsigned char* in, __write_only image2d_t out, const unsigned int offset_x, const unsigned int image_width , const unsigned int maxval ){\n"
"\tint i = get_global_id(0);\n"
"\tint j = get_global_id(1);\n"
"\tint width = get_global_size(0);\n"
"\tint height = get_global_size(1);\n"
"\n"
"\tint pos = j*image_width*3+(offset_x+i)*3;\n"
"\tif( maxval < 256 ){\n"
"\t\tfloat4 c = (float4)(in[pos],in[pos+1],in[pos+2],1.0f);\n"
"\t\tc.x /= maxval;\n"
"\t\tc.y /= maxval;\n"
"\t\tc.z /= maxval;\n"
"\t\twrite_imagef(out, (int2)(i,j), c);\n"
"\t}else{\n"
"\t\tfloat4 c = (float4)(255.0f*in[2*pos]+in[2*pos+1],255.0f*in[2*pos+2]+in[2*pos+3],255.0f*in[2*pos+4]+in[2*pos+5],1.0f);\n"
"\t\tc.x /= maxval;\n"
"\t\tc.y /= maxval;\n"
"\t\tc.z /= maxval;\n"
"\t\twrite_imagef(out, (int2)(i,j), c);\n"
"\t}\n"
"}\n"
"\n"
"__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
"\n"
"__kernel void imageToBuf(__read_only image2d_t in, __global unsigned char* out, const unsigned int offset_x, const unsigned int image_width ){\n"
"\tint i = get_global_id(0);\n"
"\tint j = get_global_id(1);\n"
"\tint pos = j*image_width*3+(offset_x+i)*3;\n"
"\tfloat4 c = read_imagef(in, imageSampler, (int2)(i,j));\n"
"\tif( c.x <= 1.0f && c.y <= 1.0f && c.z <= 1.0f ){\n"
"\t\tout[pos] = c.x*255.0f;\n"
"\t\tout[pos+1] = c.y*255.0f;\n"
"\t\tout[pos+2] = c.z*255.0f;\n"
"\t}else{\n"
"\t\tout[pos] = 200.0f;\n"
"\t\tout[pos+1] = 0.0f;\n"
"\t\tout[pos+2] = 255.0f;\n"
"\t}\n"
"}\n";
cl_int err;
cl_program prog = clCreateProgramWithSource(m_context,1,&source,NULL,&err);
if( -err != CL_SUCCESS ) throw std::string("clCreateProgramWithSources");
err = clBuildProgram(prog,0,NULL,"-cl-opt-disable",NULL,NULL);
if( -err != CL_SUCCESS ) throw std::string("clBuildProgram(fromSources)");
cl_kernel kernel = clCreateKernel(prog,"bufToImage",&err);
checkOpenCLErr(err,"CreateKernel");
cl_uint imageWidth = 80;
cl_uint imageHeight = 90;
//Initialize datas
cl_uint maxVal = 255;
cl_uint offsetX = 0;
int size = imageWidth*imageHeight*3;
int resSize = imageWidth*imageHeight*4;
cl_uchar* data = new cl_uchar[size];
cl_float* expectedData = new cl_float[resSize];
for( int i = 0,j=0; i < size; i++,j++ ){
data[i] = (cl_uchar)i;
expectedData[j] = (cl_float)((unsigned char)i)/255.0f;
if ( i%3 == 2 ){
j++;
expectedData[j] = 1.0f;
}
}
cl_mem inBuffer = clCreateBuffer(m_context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,size*sizeof(cl_uchar),data,&err);
checkOpenCLErr(err, "clCreateBuffer()");
clFinish(m_queue);
cl_image_format imgFormat;
imgFormat.image_channel_order = CL_RGBA;
imgFormat.image_channel_data_type = CL_FLOAT;
cl_mem outImg = clCreateImage2D( m_context, CL_MEM_READ_WRITE, &imgFormat, imageWidth, imageHeight, 0, NULL, &err );
checkOpenCLErr(err,"get2DImage()");
clFinish(m_queue);
size_t kernelRegion[]={imageWidth,imageHeight};
size_t kernelWorkgroup[]={1,1};
//Fill kernel with data
clSetKernelArg(kernel,0,sizeof(cl_mem),&inBuffer);
clSetKernelArg(kernel,1,sizeof(cl_mem),&outImg);
clSetKernelArg(kernel,2,sizeof(cl_uint),&offsetX);
clSetKernelArg(kernel,3,sizeof(cl_uint),&imageWidth);
clSetKernelArg(kernel,4,sizeof(cl_uint),&maxVal);
//Run kernel
err = clEnqueueNDRangeKernel(m_queue,kernel,2,NULL,kernelRegion,kernelWorkgroup,0,NULL,NULL);
checkOpenCLErr(err,"RunKernel");
clFinish(m_queue);
//Check resulting data for validty
cl_float* computedData = new cl_float[resSize];;
size_t region[]={imageWidth,imageHeight,1};
const size_t offset[] = {0,0,0};
err = clEnqueueReadImage(m_queue,outImg,CL_TRUE,offset,region,0,0,computedData,0,NULL,NULL);
checkOpenCLErr(err, "readDataFromImage()");
clFinish(m_queue);
for( int i = 0; i < resSize; i++ ){
if( fabs(expectedData[i]-computedData[i])>0.1 ){
std::cout << "Expected: \n";
for( int j = 0; j < resSize; j++ ){
std::cout << expectedData[j] << " ";
}
std::cout << "\nComputed: \n";
std::cout << "\n";
for( int j = 0; j < resSize; j++ ){
std::cout << computedData[j] << " ";
}
std::cout << "\n";
throw std::string("Error, computed and expected data are not the same!\n");
}
}
}catch(std::string& e){
std::cout << "\nCaught an exception: " << e << "\n";
return 1;
}
std::cout << "Works fine\n";
return 0;
}
I have also uploaded the source code to make it easier to test:
http://www.file-upload.net/download-3524302/strangeOpenCLError.cpp.html
Can you please tell me if I have done anything wrong?
Is there a mistake in the code, or is this a bug in my driver?
Best regards,
Alex
Edit: I changed the program (both here and in the linked file) a little to make it more likely to produce a mismatch.
I found the bug, and it is an annoying one:
When working under Linux and simply linking the OpenCL program against the current OpenCV library (yes, the computer vision library), the binary parts of the kernels, which get compiled and cached in ~/.nv, are damaged.
Can you please install the current OpenCV library and execute the following commands:
Generating the bad kernel, which sometimes leads to bad behaviour:
rm -R ~/.nv && g++ strangeOpenCLError.cpp -lOpenCL -lopencv_gpu -o strangeOpenCLError && ./strangeOpenCLError && ls -la ~/.nv/ComputeCache/*/*
Generating the good kernel, which performs as desired:
rm -R ~/.nv && g++ strangeOpenCLError.cpp -lOpenCL -o strangeOpenCLError && ./strangeOpenCLError && ls -la ~/.nv/ComputeCache/*/*
On my system, when linking with -lopencv_gpu or -lopencv_core, I get a kernel object in ~/.nv with a slightly different size, due to slightly different binary parts. These smaller kernels produced bad results on my systems.
The problem is that the bug does not always appear, sometimes only when working on buffers that are big enough, so the more reliable indicator is the difference in kernel-cache size. I have edited the program in my question; it is now more likely to produce the bad result.
Best regards,
Alex
PS: I have also filed a bug report with NVIDIA and it is in progress. They were able to reproduce the bug on their system.
To turn off the NVIDIA compiler cache, set the environment variable CUDA_CACHE_DISABLE=1. That may help avoid the problem in the future.
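If you prefer to do this from inside the program rather than from the shell, a minimal sketch (assuming a POSIX system, and that it runs before any OpenCL/CUDA initialization) would be:
#include <cstdlib>
// Disable the NVIDIA kernel cache for this process; call this at the very start of main(),
// before clCreateContextFromType() or any other OpenCL/CUDA call.
setenv("CUDA_CACHE_DISABLE", "1", 1);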
In the line
m_context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
you should pass &error as the last parameter to get a meaningful error code. Without it I got some silly error messages. (I also needed to change the platform index to get my GPU board.)
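That is, something like this (reusing the variables from the code in the question):
m_context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
                                    NULL, NULL, &error); // the last parameter receives the error code
checkOpenCLErr(error, "Create context");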
I cannot reproduce the error with my NVIDIA GeForce 8600 GTS; I get 'Works fine'. I tried it more than 20 times without any issue.
I also cannot see any error, apart from the fact that your code is a little confusing. You should remove all the commented-out code and add some blank lines to group the code a little.
Do you have the latest drivers? The behaviour you describe sounds very much like an uninitialized buffer or variable, but I do not see anything like that.
