I am running out of memory on my CUDA device. I have tracked down the cause: in some of my member functions I return a GpuMat constructed over data I allocated myself. The question is: how do I free this memory?
class Test {
public:
    GpuMat test() {
        float* dev_ptr = nullptr;
        cv::Size size(8192, 8192);
        cudaMalloc((void**)&dev_ptr, size.width * size.height * sizeof(float));
        return GpuMat(size, CV_32FC1, dev_ptr);
    }
};
//main
Test t;
while(true) {
size_t free_mem, total_mem;
cudaMemGetInfo(&free_mem, &total_mem);
std::cout << free_mem << std::endl;
cv::namedWindow("test", CV_WINDOW_OPENGL | CV_WINDOW_NORMAL);
cv::imshow("test", t.test());
cv::waitKey(1);
}
You will see free_mem drop by the same amount on every iteration.
What I tried was to create a custom allocator and pass it to setDefaultAllocator, but it didn't work.
class CustomAllocator : public cv::cuda::GpuMat::Allocator
{
float* m_ptr;
public:
CustomAllocator(float* p) : cv::cuda::GpuMat::Allocator(), m_ptr(p) {}
virtual bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemsize) override
{
return true;
}
virtual void free(cv::cuda::GpuMat* mat) override
{
cudaFree(m_ptr);
}
};
// more or less looked like this...
GpuMat test() {
float* dev_ptr = nullptr;
cv::Size size(8192,8192);
cudaMalloc((void**)&dev_ptr, size.width*size.height*sizeof(float));
GpuMat retval(size, CV_32FC1, dev_ptr);
retval.setDefaultAllocator(new CustomAllocator(dev_ptr));
return retval;
}
Neither Allocator nor GpuMat is particularly well documented, so any help would be much appreciated.
WHAT SEEMS TO BE WORKING
GpuMat test() {
float* dev_ptr = nullptr;
cv::Size size(8192,8192);
cudaMalloc((void**)&dev_ptr, size.width*size.height*sizeof(float));
// do processing...
// clone data so the GpuMat manages it by itself
auto retval = GpuMat(size, CV_32FC1, dev_ptr).clone();
// free manually allocated memory
cudaFree(dev_ptr);
return retval;
}
As I was writing this down, it occurred to me that the optimal solution would probably be to allocate the GpuMat first and do the processing directly on its data.
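A minimal sketch of that idea (untested; same includes and context as above). GpuMat allocates and reference-counts its own buffer, so nothing has to be freed by hand:
GpuMat test() {
    cv::Size size(8192, 8192);
    GpuMat retval(size, CV_32FC1);          // GpuMat allocates the device memory itself
    float* dev_ptr = retval.ptr<float>(0);  // pointer for the processing kernels
    // do processing on dev_ptr...
    // note: rows may be padded, so kernels should use retval.step as the row pitch
    return retval;                          // the buffer is freed when the last GpuMat referencing it is released
}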
You're using the particular variant of GpuMat that involves a user-provided allocation. Furthermore, t never goes out of scope in your while loop, so it's not clear to me that any implicit method would work.
Since you are providing that memory allocation with cudaMalloc, my suggestion is that you free it yourself with cudaFree. So, simplistically, make dev_ptr a class data member rather than a local/temporary variable, and provide a Test::finish() member function that tests the pointer for non-NULL and, if so, runs cudaFree() on it. Of course, if you wanted to handle this via a constructor/destructor you might do it differently, and there are probably dozens of possible variations.
Here is one possible approach, coded in browser, not tested:
class Test {
float* dev_ptr;
public:
Test() {dev_ptr = nullptr;}
void finish() { if (dev_ptr != nullptr) { cudaFree(dev_ptr); dev_ptr = nullptr; } } // reset so a second call is harmless
GpuMat test() {
cv::Size size(8192,8192);
cudaMalloc((void**)&dev_ptr, size.width*size.height*sizeof(float));
return GpuMat(size, CV_32FC1, dev_ptr);
}
};
//main
Test t;
while(true) {
size_t free_mem, total_mem;
cudaMemGetInfo(&free_mem, &total_mem);
std::cout << free_mem << std::endl;
cv::namedWindow("test", CV_WINDOW_OPENGL | CV_WINDOW_NORMAL);
cv::imshow("test", t.test());
cv::waitKey(1);
t.finish();
}
Alternatively, if you wanted to avoid the explicit call to finish(), and simply wanted to re-allocate, you could do:
class Test {
float* dev_ptr;
public:
Test() {dev_ptr = nullptr;}
GpuMat test() {
cv::Size size(8192,8192);
if (dev_ptr != nullptr) cudaFree(dev_ptr);
cudaMalloc((void**)&dev_ptr, size.width*size.height*sizeof(float));
return GpuMat(size, CV_32FC1, dev_ptr);
}
};
//main
Test t;
while(true) {
size_t free_mem, total_mem;
cudaMemGetInfo(&free_mem, &total_mem);
std::cout << free_mem << std::endl;
cv::namedWindow("test", CV_WINDOW_OPENGL | CV_WINDOW_NORMAL);
cv::imshow("test", t.test());
cv::waitKey(1);
}
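If you would rather handle it via a constructor/destructor as mentioned above, a minimal RAII sketch (again coded in browser, not tested) might look like this:
class Test {
    float* dev_ptr;
public:
    Test() : dev_ptr(nullptr) {}
    ~Test() { if (dev_ptr != nullptr) cudaFree(dev_ptr); }   // freed automatically when t goes out of scope
    Test(const Test&) = delete;                              // non-copyable, so the pointer is never freed twice
    Test& operator=(const Test&) = delete;
    GpuMat test() {
        cv::Size size(8192,8192);
        if (dev_ptr != nullptr) cudaFree(dev_ptr);           // release the previous iteration's buffer
        cudaMalloc((void**)&dev_ptr, size.width*size.height*sizeof(float));
        return GpuMat(size, CV_32FC1, dev_ptr);
    }
};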
Related
I am new to CUDA and I'm now trying to understand how cudaGetSymbolAddress works. I get an unexpected segmentation fault in a really simple piece of code. What I do is the following:
I declare a global device variable (device_int).
In main() I ensure that the definition was correct by setting its value in a kernel.
I create a pointer (host_pointer_to_device_int) in host memory and make it point to device_int via cudaGetSymbolAddress.
I create one more pointer (host_pointer_to_host_int) and try to cudaMemcpy the value from host_pointer_to_device_int to host_pointer_to_host_int.
All these operations finish with no errors, but I get a segmentation fault when trying to print the value behind host_pointer_to_host_int. Here is the code:
#include <iostream>
#include <cassert>
using namespace std;
__device__ int device_int;
__global__ void kernel()
{
device_int = 1000;
}
int main()
{
kernel<<<1, 1>>>();
assert(cudaGetLastError() == cudaSuccess); // The above operation executed successfully
int *host_pointer_to_device_int;
cudaGetSymbolAddress((void **)&host_pointer_to_device_int, device_int);
assert(cudaGetLastError() == cudaSuccess); // The above operation executed successfully
int *host_pointer_to_host_int;
// Copy the device_int's value
cudaMemcpy((void **)&host_pointer_to_host_int, host_pointer_to_device_int,
sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaGetLastError() == cudaSuccess); // The above operation executed successfully
cout << *host_pointer_to_host_int << endl; // Segmentation fault
}
My mistake was not in misunderstanding how cudaGetSymbolAddress works, but in calling cudaMemcpy with the wrong parameter types: I expected cudaMemcpy to allocate memory for me, so I cast my variables to the wrong types.
The corrected code is:
#include <iostream>
#include <cassert>
using namespace std;
__device__ int device_int;
__global__ void kernel()
{
device_int = 1000;
}
int main()
{
kernel<<<1, 1>>>();
assert(cudaGetLastError() == cudaSuccess);
int *host_pointer_to_device_int;
/* Get a pointer to device_int. After this, I won't be able to access it,
* but I'm going to copy its value with cudaMemcpy */
cudaGetSymbolAddress((void **)&host_pointer_to_device_int, device_int);
assert(cudaGetLastError() == cudaSuccess); // The above operation executed successfully
int host_int;
// Copy the device_int's value
cudaMemcpy(&host_int, host_pointer_to_device_int,
sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaGetLastError() == cudaSuccess); // The above operation executed successfully
cout << host_int << endl; // Everything's fine!
}
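As a side note, for a single symbol like this, cudaMemcpyFromSymbol can copy the value directly, without fetching the address first (a minimal sketch of that alternative):
int host_int;
// copies sizeof(int) bytes from the device symbol device_int straight into host_int
cudaMemcpyFromSymbol(&host_int, device_int, sizeof(int));
assert(cudaGetLastError() == cudaSuccess);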
Thanks to @talonmies for helping me figure it out.
I have recently started using OpenCV and this one has baffled me.
void saveImageSnippet(char *imageName, int height, int width, void* data, int nChannels) //data is char[height * width]
{
char fName[200]="c:\\testimg\\";
FILE *fptr;
IplImage *img;
sprintf(fName,"%s%s.bmp",fName,imageName);
img = cvCreateImageHeader(cvSize(width, height),8/*depth*/,nChannels);
img->imageData=(unsigned char*)data;
cvSaveImage(fName, img); //Unhandled exception
cvReleaseImage(&img);
}
At cvSaveImage : Unhandled exception at 0x6e8e871d in vc2008_1x.exe: 0xC0000005: Access violation reading location 0x745c3a63.
Is there anything I am not doing right?
Now for the interesting part:
if I add a couple of unused variables, cvSaveImage works just fine:
void saveImageSnippet(char *imageName, int height, int width, void* data, int nChannels)
{
int var1, var2; //unused variables
char fName[200]="c:\\testimg\\";
FILE *fptr;
IplImage *img;
sprintf(fName,"%s%s.bmp",fName,imageName);
img = cvCreateImageHeader(cvSize(width, height),8/*depth*/,nChannels);
img->imageData=(unsigned char*)data;
cvSaveImage(fName, img); //now it works fine
cvReleaseImage(&img);
}
Please use OpenCV's C++ API and avoid anything that involves IplImage (especially here on SO!):
#include "opencv2/highgui.hpp" // no, *not* highgui.h !
using namespace cv;
int main()
{
string fName = format("c:\\testimg\\%s.bmp",imageName);
Mat img = imread( fName );
if ( img.empty() )
return -1;
// your image was not loaded
imsave(fName);
return 0;
}
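If the goal is still to dump a raw 8-bit buffer like in saveImageSnippet, a hedged sketch of the C++-API equivalent (my own wrapper, assuming the buffer layout really is height x width x nChannels bytes) is to wrap the data in a cv::Mat header and call imwrite:
#include "opencv2/highgui.hpp"   // as above: imwrite() and cv::format() come in through here
using namespace cv;

// roughly the C++-API equivalent of the original saveImageSnippet()
void saveImageSnippet(const char* imageName, int height, int width, void* data, int nChannels)
{
    string fName = format("c:\\testimg\\%s.bmp", imageName);
    // Mat header over the caller's buffer: no copy, data must stay valid until imwrite() returns
    Mat img(height, width, CV_8UC(nChannels), data);
    imwrite(fName, img);
}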
I want to pass a vector to a method so the method can fill it with a varying number of Mat objects, something like this:
int main()
{
vector<cv::Mat> areas;
doSomething(areas);
return 0;
}
void doSomething(vector<cv::Mat> &areas)
{
cv::Mat first = zeros(...
cv::Mat second = ....
cv::Mat third = ...
areas.push_back(first);
areas.push_back(second);
areas.push_back(third);
}
Of course that doesn't work, because there is not enough memory allocated at the beginning and the memory lives only on the stack!
A second idea I had was to use pointers:
int main()
{
vector<cv::Mat> *areas = new vector<cv::Mat>();
doSomething(areas);
return 0;
}
void doSomething(vector<cv::Mat> *areas)
{
cv::Mat first = zeros(...
cv::Mat second = ....
cv::Mat third = ...
areas->push_back(first);
areas->push_back(second);
areas->push_back(third);
}
But again the problem is that I have no idea at the beginning how much space to allocate.
The third idea was to return the vector in the "normal" way:
int main()
{
vector<cv::Mat> areas;
areas = doSomething();
return 0;
}
vector<cv::Mat> doSomething()
{
vector<cv::Mat> areas;
cv::Mat first = zeros(...
cv::Mat second = ....
cv::Mat third = ...
areas.push_back(first);
areas.push_back(second);
areas.push_back(third);
return areas;
}
In this case, of course, only the header of the vector is copied, not the Mat objects inside it.
Do you have any idea how I can solve this problem?
Thanks in advance!
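For what it's worth, a minimal self-contained sketch of the first variant (with placeholder sizes and types of my own) compiles and keeps the images alive: std::vector grows on demand, and each cv::Mat reference-counts its own pixel buffer, so nothing has to be pre-allocated:
#include <vector>
#include <opencv2/opencv.hpp>
using namespace std;

void doSomething(vector<cv::Mat> &areas)
{
    cv::Mat first  = cv::Mat::zeros(100, 100, CV_8UC1);   // placeholder sizes/types
    cv::Mat second = cv::Mat::ones(50, 50, CV_32FC1);
    cv::Mat third  = cv::Mat::eye(10, 10, CV_64FC1);
    areas.push_back(first);    // copies only the Mat header; the pixel data is shared and ref-counted
    areas.push_back(second);
    areas.push_back(third);
}

int main()
{
    vector<cv::Mat> areas;
    doSomething(areas);
    // areas now holds three valid Mats; their data stays alive as long as the vector does
    return 0;
}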
I am trying to translate the OpenCV CascadeClassifier tutorial from C++ to Java. It works fine in C++, and this Java tutorial also works fine.
But my translation simply does not detect the face. I don't get any explicit errors. I can see the processing of the video input from the webcam (grey/histogram...) and the video display. Loading the cascade doesn't give an error, but the CascadeClassifier call just doesn't return any faces... So you can probably skip all the code and just go to my CascadeClassifier call, down in public Mat detect(Mat inputframe). As I am new to Java and OpenCV, I paste the rest (with anything I felt was not significant removed), just in case, but I don't mean for you to debug all of it...
I have also tried this call (and other portions) in many different ways and nothing... I'm running out of ideas...
Thank you!!
import java.awt.*;
import java.awt.image.BufferedImage;
import javax.swing.*;
import org.opencv.core.Mat;
import org.opencv.core.MatOfRect;
import org.opencv.highgui.VideoCapture;
import org.opencv.imgproc.Imgproc;
import org.opencv.objdetect.CascadeClassifier;
class My_Panel extends JPanel{
private static final long serialVersionUID = 1L;
private BufferedImage image;
private CascadeClassifier face_cascade;
// Create a constructor method
public My_Panel(){
super();
String face_cascade_name = "/haarcascade_frontalface_alt.xml";
//String face_cascade_name = "/lbpcascade_frontalface.xml";
//-- 1. Load the cascades
String str;
str = getClass().getResource(face_cascade_name).getPath();
str = str.replace("/C:","C:");
face_cascade_name=str;
face_cascade=new CascadeClassifier(face_cascade_name);
if( !face_cascade.empty())
{
System.out.println("--(!)Error loading A\n");
return;
}
else
{
System.out.println("Face classifier loooaaaaaded up");
}
}
private BufferedImage getimage(){
return image;
}
public void setimage(BufferedImage newimage){
image=newimage;
return;
}
/**
* Converts/writes a Mat into a BufferedImage.
*
* @param matrix Mat of type CV_8UC3 or CV_8UC1
* @return BufferedImage of type TYPE_3BYTE_BGR or TYPE_BYTE_GRAY
*/
public BufferedImage matToBufferedImage(Mat matrix) {
int cols = matrix.cols();
int rows = matrix.rows();
int elemSize = (int)matrix.elemSize();
byte[] data = new byte[cols * rows * elemSize];
int type;
matrix.get(0, 0, data);
switch (matrix.channels()) {
case 1:
type = BufferedImage.TYPE_BYTE_GRAY;
break;
case 3:
type = BufferedImage.TYPE_3BYTE_BGR;
// bgr to rgb
byte b;
for(int i=0; i<data.length; i=i+3) {
b = data[i];
data[i] = data[i+2];
data[i+2] = b;
}
break;
default:
return null;
}
BufferedImage image2 = new BufferedImage(cols, rows, type);
image2.getRaster().setDataElements(0, 0, cols, rows, data);
return image2;
}
public void paintComponent(Graphics g){
BufferedImage temp=getimage();
g.drawImage(temp,10,10,temp.getWidth(),temp.getHeight(), this);
}
public Mat detect(Mat inputframe){
Mat mRgba=new Mat();
Mat mGrey=new Mat();
MatOfRect faces = new MatOfRect();
//MatOfRect eyes = new MatOfRect();
inputframe.copyTo(mRgba);
inputframe.copyTo(mGrey);
Imgproc.cvtColor( mRgba, mGrey, Imgproc.COLOR_BGR2GRAY);
Imgproc.equalizeHist( mGrey, mGrey );
face_cascade.detectMultiScale(mGrey, faces);
//face_cascade.detectMultiScale(mGrey, faces, 1.1, 2, 0|Objdetect.CASCADE_SCALE_IMAGE, new Size(30, 30), new Size(200,200) );
//face_cascade.detectMultiScale(mGrey, faces, 1.1, 2, 2//CV_HAAR_SCALE_IMAGE,
// ,new Size(30, 30), new Size(200,200) );
System.out.println(String.format("Detected %s faces", faces.toArray().length));
return mGrey;
}
}
public class window {
public static void main(String arg[]){
// Load the native library.
System.loadLibrary("opencv_java245");
String window_name = "Capture - Face detection";
JFrame frame = new JFrame(window_name);
frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
frame.setSize(400,400);
My_Panel my_panel = new My_Panel();
frame.setContentPane(my_panel);
frame.setVisible(true);
//-- 2. Read the video stream
BufferedImage temp;
Mat webcam_image=new Mat();
VideoCapture capture =new VideoCapture(0);
if( capture.isOpened())
{
while( true )
{
capture.read(webcam_image);
if( !webcam_image.empty() )
{
frame.setSize(webcam_image.width()+40,webcam_image.height()+60);
//-- 3. Apply the classifier to the captured image
// At this point I was wondering where this should be done.
// I put it within the panel class, but maybe one could actually
// create a processor object...
webcam_image=my_panel.detect(webcam_image);
//-- 4. Display the image
temp=my_panel.matToBufferedImage(webcam_image);
my_panel.setimage(temp);
my_panel.repaint();
}
else
{
System.out.println(" --(!) No captured frame -- Break!");
break;
}
}
}
return;
}
}
PS.: Other info, just in case:
mGrey is: Mat [ 480*640*CV_8UC1, isCont=true, isSubmat=false, nativeObj=0x19d9af48, dataAddr=0x19dc3430 ]
face is: Mat [ 0*0*CV_8UC1, isCont=false, isSubmat=false, nativeObj=0x194bb048, dataAddr=0x0 ]
I have tried your code and it works fine! Your only issue is the location of the haarcascade_frontalface_alt.xml file. Try using a full path to the file:
face_cascade= new CascadeClassifier("D:/HelloCV/src/haarcascade_frontalface_alt.xml");
I'm trying out Lua and want to understand how lua_State works.
Code and result:
state.c
#include <stdio.h>
#include "lua/src/lua.h"
#include "lua/src/lauxlib.h"
static void stackDump(lua_State *L){
int i;
int top = lua_gettop(L);
for(i = 1; i<= top; i++) {
int t = lua_type(L, i);
switch(t){
case LUA_TSTRING:
printf("'%s'", lua_tostring(L, i));
break;
case LUA_TBOOLEAN:
printf(lua_toboolean(L, i) ?"true":"false");
break;
case LUA_TNUMBER:
printf("%g", lua_tonumber(L, i));
break;
default:
printf("%s", lua_typename(L, t));
break;
}
printf(" ");
}
printf("\n");
}
static int divide(struct lua_State *L){
double a = lua_tonumber(L, 1);
double b = lua_tonumber(L, 2);
printf("%p\n", L);
stackDump(L);
int quot = (int)a / (int)b;
int rem = (int)a % (int)b;
lua_pushnumber(L, quot);
lua_pushnumber(L, rem);
stackDump(L);
printf("---end div---\n");
return 2;
}
int main(void){
struct lua_State *L = lua_open();
lua_pushboolean(L, 1);
lua_pushnumber(L, 10);
lua_pushnil(L);
lua_pushstring(L, "hello");
printf("%p\n", L);
stackDump(L);
lua_register(L, "div", divide);
luaL_dofile(L, "div.lua");
stackDump(L);
lua_close(L);
return 0;
}
div.lua
local c = div(20, 10)
0x100c009e0
true 10 nil 'hello'
---start div---
0x100c009e0
20 10
20 10 2 0
---end div---
true 10 nil 'hello'
I see that the lua_State in divide is the same as the one in main, but they show different data on the stack. How is this done?
I know the best way to understand this is to read the Lua source code; maybe you can tell me where the right place to look is.
Think of lua_State as containing the Lua stack, as well as indices delimiting the current visible part of the stack. When you invoke a Lua function, it may look like you have a new stack, but really only the indices have changed. That's the simplified version.
lua_State is defined in lstate.h. I've pulled out the relevant parts for you. stack is the beginning of the big Lua stack containing everything. base is the beginning of the stack for the current function. This is what your function sees as "the stack" when it is executing.
struct lua_State {
/* ... */
StkId top; /* first free slot in the stack */
StkId base; /* base of current function */
/* ... */
StkId stack_last; /* last free slot in the stack */
StkId stack; /* stack base */
/* ... */
};
Programming in Lua, 2nd Edition discusses Lua states in chapter 30: Threads and States. You'll find some good information there. For example, lua_State not only represents a Lua state, but also a thread within that state. Furthermore, all threads have their own stack.
It gets different data the same way anything gets different data: code changes the data inside of the object.
struct Object
{
int val;
};
void more_stuff(Object *the_data)
{
//the_data->val has 5 in it now.
}
void do_stuff(Object *the_data)
{
int old_val = the_data->val;
the_data->val = 5;
more_stuff(the_data);
the_data->val = old_val;
}
int main()
{
Object my_data;
my_data.val = 1;
//my_data.val has 1.
do_stuff(&my_data);
//my_data.val still has 1.
}
When Lua calls a registered C function, it gives it a new stack frame.