cudaMemcpy invalid argument: in simple vector example - memory

The following example:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <math.h>
#define N 100
#define t_num 256
// Minimal reproduction: requests a device allocation with cudaMalloc, fills a
// host array with values in [0, N), copies it host-to-device with cudaMemcpy,
// and prints the status string returned by each CUDA call.
int main(){
// Both names are declared as host-side int arrays of t_num elements;
// vector_one_g is an array object here, not an int* variable.
int vector_one_h[t_num], vector_one_g[t_num];
// cudaMalloc receives the address of the vector_one_g array, cast to void**.
cudaError_t err = cudaMalloc((void**)&vector_one_g, t_num * sizeof(int));
printf("Cuda malloc vector swap one: %s \n", cudaGetErrorString(err));
// Printing an array with %p shows the address of the array itself.
printf("Device Vector: %p \n:" , vector_one_g);
// Fill the host array; rand() is not seeded in this program.
for(int m = 0; m < t_num; m++){
vector_one_h[m] = rand() % N;
}
// The destination argument is the vector_one_g array, which converts to the
// address of its first element.
// NOTE(review): that is the array's own address, not the pointer value
// cudaMalloc stored through &vector_one_g above.
err = cudaMemcpy(vector_one_g, vector_one_h, t_num * sizeof(int), cudaMemcpyHostToDevice);
printf("Cuda mem copy vector swap one: %s \n", cudaGetErrorString(err));
}
Will return:
Cuda malloc vector swap one: no error
Device Vector: 0x7ffcf028eea0
:Cuda mem copy vector swap one: invalid argument
So why is cudaMemcpy receiving an invalid argument?
From the documentation for cudaMemcpy() here I thought the problem may be that I need to give the second argument as the address, &vector_one_h, but placing that in the code returns the exact same error.
And also, while there are many posts about cudaMemcpy invalid arguments, I believe this is not a duplicate as most of the other questions have very complicated examples while this is a very simple and minimal example.

Try changing the first line to:
int vector_one_h[t_num], *vector_one_g;
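BTW, prefixing the array name with & makes no difference here: &vector_one_h and vector_one_h evaluate to the same address (only the pointer type differs, which does not matter for a void* parameter). In most expressions an array name is implicitly converted to a pointer to its first element.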

Related

How to get list element from rosparam parameter from the cmd?

I loaded this yaml file:
num_boxes: 1
boxes: [[x: 0.349, y: 0.213, z: 0.117]]
with rosparam load my_config.yaml then I can do rosparam get boxes and get:
- - {x: 0.349}
- {y: 0.213}
- {z: 0.117}
But how can I access only the first list or elements in the second list? I tried boxes[0], boxes(0) and boxes{0} but nothing worked.
This is an old question, but I'm answering it because it's still useful to put an answer here.
Using the example given, boxes is a list of lists (YAML syntax). From the rosparam command line tool, we can't process rosparam get /boxes any further (excepting grep and regex). In common use, we can access the Parameter Server in C++ / Python.
# Python
>>> import rospy
>>> boxes = rospy.get_param("/boxes"); boxes
[[{'x': 0.349}, {'y': 0.213}, {'z': 0.117}]]
# Boxes is a python list/array of list of dicts
>>> boxes[0]
[{'x': 0.349}, {'y': 0.213}, {'z': 0.117}]
>>> boxes[0][0]
{'x': 0.349}
// C++
#include <vector>
#include <ros/ros.h>
// yaml_list: [1, 2]
ros::NodeHandle *nh;
std::vector<int> yaml_list;
double x = 0.0;
int main(int argc, char** argv){
ros::init(argc, argv, "get_params_node");
nh = new ros::NodeHandle("");
nh->getParam("/yaml_list", yaml_list);
x = yaml_list[0];
// ...
return 0;
}
To have deeper data structures, such as vectors of vectors of maps, you have to use the xmlrpcpp (.h/.cpp) interface.
// C++
#include <vector>
#include <map>
#include <string>
#include <ros/ros.h>
#include <xmlrpcpp/XmlRpcValue.h> // catkin component
ros::NodeHandle *nh;
XmlRpc::XmlRpcValue boxes; // std::vector<std::vector<std::map<std::string,double>>>
double x = 0.0;
int i = 0;
double point[3] = {0};
int main(int argc, char** argv){
ros::init(argc, argv, "get_params_node");
nh = new ros::NodeHandle("");
nh->getParam("/boxes", boxes);
if(boxes.getType() == XmlRpc::XmlRpcValue::Type::TypeArray && boxes.size() > 0){
// boxes[0] is a 'TypeArray' aka vector
if(boxes[0].getType() == XmlRpc::XmlRpcValue::Type::TypeArray && boxes[0].size() > 0){
// boxes[0][0] is a 'TypeStruct' aka map
if(boxes[0][0].getType() == XmlRpc::XmlRpcValue::Type::TypeStruct && boxes[0][0].hasMember("x")){
x = double(boxes[0][0]["x"]);
for(XmlRpc::XmlRpcValue::iterator it = boxes[0][0].begin(); it != boxes[0][0].end(); ++it){
point[i++] = double(*it);
}
}
}
}
// ...
return 0;
}
In standard use, the <rosparam> XML tag keeps the same YAML or rosparam command line syntax.

Nested table built with Lua C API crashes

I'm trying to make a deeply nested table in Lua. When I nest past 16 levels my program crashes.
In the example program below, when I change DEPTH to 16 instead of 17 the program does not crash. I can't find any resources that say there is a maximum table depth, and one so low seems odd. The crash is within the call to lua_close().
Am I misunderstanding how to build a table in Lua using the C API, or is there in fact a maximum depth?
#include <assert.h>
#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"
#define DEPTH 17
// Creates a Lua state, builds a chain of DEPTH tables nested under the key
// "subtable" below a root table, then closes the state.
// argc and argv are not used.
int main(int argc, char* argv[])
{
lua_State *L = NULL;
size_t i = 0;
L = luaL_newstate();
assert(NULL!=L);
luaL_openlibs(L);
// create the root table
lua_newtable(L);
// push DEPTH levels deep onto the table
// Each iteration pushes one key string and one new table, so after this loop
// the stack holds the root table plus 2*DEPTH further values.
// NOTE(review): no lua_checkstack/luaL_checkstack call precedes these pushes.
for (i=0; i<DEPTH; i++)
{
lua_pushstring(L, "subtable");
lua_newtable(L);
}
// nest the DEPTH levels
// lua_settable(L, -3) stores the top value under the key just below it into
// the table at index -3 and pops both, so tables are linked from the
// innermost outwards until only the root table remains on the stack.
for (i=0; i<DEPTH; i++)
{
lua_settable(L, -3);
}
lua_close(L);
return 0;
}
You need to increase the stack with lua_checkstack or luaL_checkstack to allow 2*DEPTH slots.

Global device memory size limit when using statically allocated memory in CUDA

I thought the maximum size of global memory should be limited only by the GPU device, regardless of whether it is allocated statically using __device__ __managed__ or dynamically using cudaMalloc.
But I found that with the __device__ __managed__ approach, the maximum array size I can declare is much smaller than the GPU device limit.
The minimal working example is as follows:
#include <stdio.h>
#include <cuda_runtime.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA call.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (supplied by gpuErrchk via __FILE__)
//   line  - line number of the call site (supplied by gpuErrchk via __LINE__)
//   abort - when true (the default), terminate the process with `code` as the
//           exit status after printing the message
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Successful calls need no reporting.
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (!abort)
        return;
    exit(code);
}
#define MX 64
#define MY 64
#define MZ 64
#define NX 64
#define NY 64
#define M (MX * MY * MZ)
__device__ __managed__ float A[NY][NX][M];
__device__ __managed__ float B[NY][NX][M];
// Copies B into A element by element (despite the name, nothing is written
// back to B). Each thread owns one index along the innermost, M-sized
// dimension and walks all NY x NX outer positions for that index.
// Launch layout: 1-D grid of 1-D blocks whose total thread count covers M.
__global__ void swapAB()
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: a launch that over-covers M must not index past
    // the innermost dimension of A and B.
    if (tid >= M)
        return;

    for (int j = 0; j < NY; j++)
        for (int i = 0; i < NX; i++)
            A[j][i][tid] = B[j][i][tid];
}
// Launches swapAB with one thread per innermost index, then checks for launch
// errors and for errors raised while the kernel was running.
int main()
{
    const int threadsPerBlock = 256;
    const int numBlocks = M / threadsPerBlock;

    swapAB<<<numBlocks, threadsPerBlock>>>();

    // Launch-time errors are reported first, execution errors on synchronize.
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    return 0;
}
It uses 64^5 * 2 * 4 / 2^30 GB = 8 GB of global memory, and I compile and run it on an NVIDIA Tesla K40c GPU, which has 12 GB of global memory.
Compiler cmd:
nvcc test.cu -gencode arch=compute_30,code=sm_30
Output warning:
warning: overflow in implicit constant conversion.
When I ran the generated executable, an error says:
GPUassert: an illegal memory access was encountered test.cu
Surprisingly, if I use the dynamically allocated global memory of the same size (8GB) via the cudaMalloc API instead, there is no compiling warning and runtime error.
I'm wondering if there are any special limitation about the allocatable size of static global device memory in CUDA.
Thanks!
PS: OS and CUDA: CentOS 6.5 x64, CUDA-7.5.
This would appear to be a limitation of the CUDA runtime API. The root cause is this function (in CUDA 7.5):
__cudaRegisterVar(
void **fatCubinHandle,
char *hostVar,
char *deviceAddress,
const char *deviceName,
int ext,
int size,
int constant,
int global
);
which only accepts a signed int for the size of any statically declared device variable. This would limit the maximum size to 2^31 - 1 (2147483647) bytes. The warning you see is because the CUDA front end is emitting boilerplate code containing calls to the __cudaRegisterVar family of functions like this:
__cudaRegisterManagedVariable(__T26, __shadow_var(A,::A), 0, 4294967296, 0, 0);
__cudaRegisterManagedVariable(__T26, __shadow_var(B,::B), 0, 4294967296, 0, 0);
It is the 4294967296 which is the source of the problem. The size will overflow the signed integer and cause the API call to blow up. So it seems you are limited to 2 GB per static variable for the moment. I would recommend raising this as a bug with NVIDIA if it is a serious problem for your application.

OpenCV: fast matrix computation

I have an nxd matrix V=[v_1; v_2;...; v_n] (; means new row) where v_i are 1xd vectors.
I want to compute the following sum: v_1^T*v_1 + v_2^T*v_2 + ... + v_n^T*v_n, which is a dxd matrix (v_i^T is the transpose of v_i).
For the moment I use a for loop, as in the code below, which is not efficient when n is very large (I think so).
#include <iostream>
#include <opencv2/core.hpp>
using namespace cv;
using namespace std;
// Demo: fills an n x d float matrix with uniform random values, prints it,
// then accumulates the sum of the outer products of its rows into a d x d
// matrix and prints that as well. argc and argv are not used.
int main (int argc, char * argv[])
{
    const int numRows = 5;
    const int dim = 3;

    // Random input matrix, one sample vector per row.
    Mat V(numRows, dim, CV_32F);
    randu(V, Scalar::all(0), Scalar::all(10));
    cout << V << endl << endl;

    // Row-by-row accumulation of v_i^T * v_i.
    Mat M = Mat::zeros(dim, dim, CV_32F);
    int i = 0;
    while (i < numRows)
    {
        M = M + V.row(i).t()*V.row(i);
        ++i;
    }

    cout << M << endl << endl;
    return 0;
}
Hope that somebody can suggest a faster way. Thanks in advance.
You can just take V.t()*V
(It took me a minute to realize it too, but if you go through the matrix multiplication you'll see it's the same)

Finding camera and distortion matrix using cvCalibrateCamera2( )

I was trying to find out the camera matrix and distortion coefficients using cvCalibrateCamera2. There were no compilation errors, but when I am trying to execute the program it gives:
OpenCV error: Sizes of input arguments do not match( both matrices must have the same number of points) in cvConvertPointsHomogenous, file /build/buildd/opencv-2.3.1/modules/calib3d/src/fundam.cpp
The size of the matrix storing the object points is 4 x 3 and that of the matrix storing the image points is 4 x 2. What could be wrong?
Now I made certain changes to my code.
This is the code that I am using:
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc_c.h"
#include "opencv2/calib3d/calib3d.hpp"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//function to call cvCalibrateCamera2()
// Replicates the object points once per image, builds the per-image point
// count vector, and forwards everything to cvCalibrateCamera2().
//   object_points - rows are read as three float coordinates (columns 0..2)
//   image_points  - passed to cvCalibrateCamera2() unmodified
//   intrinsic     - passed as the camera-matrix argument
//   distortion    - passed as the distortion-coefficients argument
void calibrate(CvMat* object_points, CvMat* image_points, CvMat* intrinsic,CvMat* distortion)
{
const int point_count= object_points->rows;
// NOTE(review): dividend and divisor are both object_points->rows, so this is
// 1 whenever point_count is non-zero; image_points->rows is never consulted.
const int image_count=object_points->rows/point_count;
CvMat* const full_object_points = cvCreateMat(image_count*point_count,3,CV_32FC1);
// point_counts is created with element type CV_32SC1 (32-bit signed integer).
CvMat* const point_counts= cvCreateMat(image_count,1,CV_32SC1);
for(int i =0; i<image_count;i++)
{
// NOTE(review): the element is written through a float accessor although the
// matrix was created as CV_32SC1 above.
CV_MAT_ELEM(*point_counts,float , i,0)= point_count;
// Copy every object point into the slot belonging to image i.
for(int j=0;j<point_count;j++)
{
for(int k=0; k<3;k++){
CV_MAT_ELEM(*full_object_points,float,i*point_count+j,k)=CV_MAT_ELEM(*object_points,float,j,k);
}
}
}
// The image size is passed as 1x1; the two NULL arguments and the final 0 are
// the remaining optional parameters of cvCalibrateCamera2().
// NOTE(review): full_object_points and point_counts are not released in this function.
cvCalibrateCamera2(full_object_points,image_points,point_counts,cvSize(1,1),intrinsic, distortion,NULL,NULL,0);
}
// Driver: builds the object-point matrix from a fixed table of 2-D points
// (with z set to 0), creates the image-point, intrinsic and distortion
// matrices, and calls calibrate().
int main()
{
const float points[][2]={{1,2},{0,0},{3,5},{5,2}};
const int image_count=5;
// Number of rows in points[] (4 for the table above).
const int point_count=sizeof(points)/sizeof(points[1]);
CvMat* const object_points=cvCreateMat(point_count,3,CV_32FC1);
// Columns 0 and 1 come from the table; column 2 (z) is fixed at 0.
for(int i=0; i<point_count;i++)
{
CV_MAT_ELEM(*object_points, float, i,0)=points[i][0];
CV_MAT_ELEM(*object_points, float, i,1)=points[i][1];
CV_MAT_ELEM(*object_points, float, i,2)=0;
}
// image_points has image_count*point_count rows of two floats each.
// NOTE(review): nothing in this function writes to image_points before it is
// passed to calibrate().
CvMat* const image_points=cvCreateMat(image_count*point_count,2,CV_32FC1);
// 3x3 and 5x1 single-channel float matrices handed to calibrate() as its
// intrinsic and distortion arguments.
CvMat* const intrinsic=cvCreateMat(3,3,CV_32FC1);
CvMat* const distortion=cvCreateMat(5,1,CV_32FC1);
calibrate(object_points,image_points,intrinsic,distortion);
// NOTE(review): none of the matrices created here are released, and main has
// no explicit return statement.
}
On execution I am getting the following error:
OpenCV Error: Bad argument (The total number of matrix elements is not divisible by the new number of rows) in cvReshape, file /build/buildd/opencv-2.3.1/modules/core/src/array.cpp, line 2755
terminate called after throwing an instance of 'cv::Exception'
what(): /build/buildd/opencv-2.3.1/modules/core/src/array.cpp:2755: error: (-5) The total number of matrix elements is not divisible by the new number of rows in function cvReshape
Aborted (core dumped)
Exactly what the error message means. It needs an equal amount of points for the object as for the image points. If you would try to calculate with say 10 sets of image points, but with 9 sets of object points, this error is returned.
I'd recommend using the c++ openCV, as this accepts vectors with points and matrices as input. Checking the length of such vectors is far easier than for matrices.

Resources