CUDA: __syncthreads() before shared memory operation? - memory

I'm in the rather poor situation of not being able to use the CUDA debugger. I'm getting some strange results from usage of __syncthreads in an application with a single shared array (deltas). The following piece of code is performed in a loop:
__syncthreads(); //if I comment this out, things get funny
deltas[lex_index_block] = intensity - mean;
__syncthreads(); //this line doesnt seem to matter regardless if the first sync is commented out or not
//after sync: do something with the values of delta written in this threads and other threads of this block
Basically, I have code with overlapping blocks (required due to the nature of the algorithm). The program does compile and run but somehow I get systematically wrong values in the areas of vertical overlap. This is very confusing to me as I thought that the correct way to sync is to sync after the threads have performed my write to the shared memory.
This is the whole function:
//XC without repetitions
template <int blocksize, int order>
__global__ void __xc(unsigned short* raw_input_data, int num_frames, int width, int height,
float * raw_sofi_data, int block_size, int order_deprecated){
//we make a distinction between real pixels and virtual pixels
//real pixels are pixels that exist in the original data
//overlap correction: every new block has a margin of 3 threads doing less work (only computing deltas)
int x_corrected = global_x() - blockIdx.x * 3;
int y_corrected = global_y() - blockIdx.y * 3;
//if the thread is responsible for any real pixel
if (x_corrected < width && y_corrected < height){
// __shared__ float deltas[blocksize];
__shared__ float deltas[blocksize];
//the outer pixels of a block do not update SOFI values as they do not have sufficient information available
//they are used only to compute mean and delta
//also, pixels at the global edge have to be thrown away (as there is not sufficient data to interpolate)
bool within_inner_block =
threadIdx.x > 0
&& threadIdx.y > 0
&& threadIdx.x < blockDim.x - 2
&& threadIdx.y < blockDim.y - 2
//global edge
&& x_corrected > 0
&& y_corrected > 0
&& x_corrected < width - 1
&& y_corrected < height - 1
;
//init virtual pixels
float virtual_pixels[order * order];
if (within_inner_block){
for (int i = 0; i < order * order; ++i) {
virtual_pixels[i] = 0;
}
}
float mean = 0;
float intensity;
int lex_index_block = threadIdx.x + threadIdx.y * blockDim.x;
//main loop
for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
//shared memory read and computation of mean/delta
intensity = raw_input_data[lex_index_3D(x_corrected,y_corrected, frame_idx, width, height)];
__syncthreads(); //if I comment this out, things break
deltas[lex_index_block] = intensity - mean;
__syncthreads(); //this doesnt seem to matter
mean = deltas[lex_index_block]/(float)(frame_idx+1);
//if the thread is responsible for correlated pixels, i.e. not at the border of the original frame
if (within_inner_block){
//WORKING WITH DELTA STARTS HERE
virtual_pixels[0] += deltas[lex_index_2D(
threadIdx.x,
threadIdx.y + 1,
blockDim.x)]
*
deltas[lex_index_2D(
threadIdx.x,
threadIdx.y - 1,
blockDim.x)];
virtual_pixels[1] += deltas[lex_index_2D(
threadIdx.x,
threadIdx.y,
blockDim.x)]
*
deltas[lex_index_2D(
threadIdx.x + 1,
threadIdx.y,
blockDim.x)];
virtual_pixels[2] += deltas[lex_index_2D(
threadIdx.x,
threadIdx.y,
blockDim.x)]
*
deltas[lex_index_2D(
threadIdx.x,
threadIdx.y + 1,
blockDim.x)];
virtual_pixels[3] += deltas[lex_index_2D(
threadIdx.x,
threadIdx.y,
blockDim.x)]
*
deltas[lex_index_2D(
threadIdx.x+1,
threadIdx.y+1,
blockDim.x)];
// xc_update<order>(virtual_pixels, delta2, mean);
}
}
if (within_inner_block){
for (int virtual_idx = 0; virtual_idx < order*order; ++virtual_idx) {
raw_sofi_data[lex_index_2D(x_corrected*order + virtual_idx % order,
y_corrected*order + (int)floorf(virtual_idx / order),
width*order)]=virtual_pixels[virtual_idx];
}
}
}
}

From what I can see, there could be a hazard in your application between loop iterations. The write to deltas[lex_index_block] for loop iteration frame_idx+1 could be mapped to the same location as the read of deltas[lex_index_2D(threadIdx.x, threadIdx.y -1, blockDim.x)] in a different thread at iteration frame_idx. The two accesses are unordered and the result is nondeterministic. Try running the app with cuda-memcheck --tool racecheck.

Related

cudaFree is not freeing memory

The code below calculates the dot product of two vectors a and b. The correct result is 8192. When I run it for the first time the result is correct. Then when I run it for the second time the result is the previous result + 8192 and so on:
1st iteration: result = 8192
2nd iteration: result = 8192 + 8192
3rd iteration: result = 8192 + 8192
and so on.
I checked by printing it on screen and the device variable dev_c is not freed. What's more writing to it causes something like a sum, the result beeing the previous value plus the new one being written to it. I guess that could be something with the atomicAdd() operation, but nonetheless cudaFree(dev_c) should erase it after all.
#define N 8192
#define THREADS_PER_BLOCK 512
#define NUMBER_OF_BLOCKS (N/THREADS_PER_BLOCK)
#include <stdio.h>
__global__ void dot( int *a, int *b, int *c ) {
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();
if( 0 == threadIdx.x ) {
int sum = 0;
for( int i= 0; i< THREADS_PER_BLOCK; i++ ){
sum += temp[i];
}
atomicAdd(c,sum);
}
}
int main( void ) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
int size = N * sizeof( int);
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, sizeof(int));
a = (int*)malloc(size);
b = (int*)malloc(size);
c = (int*)malloc(sizeof(int));
for(int i = 0 ; i < N ; i++){
a[i] = 1;
b[i] = 1;
}
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
dot<<< N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c);
cudaMemcpy( c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
printf("Dot product = %d\n", *c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
free(a);
free(b);
free(c);
return 0;
}
cudaFree doesn't erase anything, it simply returns memory to a pool to be re-allocated. cudaMalloc doesn't guarantee the value of memory that has been allocated. You need to initialize memory (both global and shared) that your program uses, in order to have consistent results. The same is true for malloc and free, by the way.
From the documentation of cudaMalloc();
The memory is not cleared.
That means that dev_c is not initialized, and your atomicAdd(c,sum); will add to any random value that happens to be stored in memory at the returned position.

Fast Pixel Count on Binary Image- ARM neon intrinsics - iOS Dev

Can someone tell me a fast function to count the number of white pixels in a binary image. I need it for iOS app dev. I am working directly on the memory of the image defined as
bool *imageData = (bool *) malloc(noOfPixels * sizeof(bool));
I am implementing the function
int whiteCount = 0;
for (int q=i; q<i+windowHeight; q++)
{
for (int w=j; w<j+windowWidth; w++)
{
if (imageData[q*W + w] == 1)
whiteCount++;
}
}
This is obviously the slowest function possible. I heard that ARM Neon intrinsics on the iOS
can be used to make several operations in 1 cycle. Maybe thats the way to go ??
The problem is that I am not very familiar and don't have enough time to learn assembly language at the moment. So it would be great if anyone can post a Neon intrinsics code for the problem mentioned above or any other fast implementation in C/C++.
The only code in neon intrinsics that I am able to find online is the code for rgb to gray
http://computer-vision-talks.com/2011/02/a-very-fast-bgra-to-grayscale-conversion-on-iphone/
Firstly you can speed up the original code a little by factoring out the multiply and getting rid of the branch:
int whiteCount = 0;
for (int q = i; q < i + windowHeight; q++)
{
const bool * const row = &imageData[q * W];
for (int w = j; w < j + windowWidth; w++)
{
whiteCount += row[w];
}
}
(This assumes that imageData[] is truly binary, i.e. each element can only ever be 0 or 1.)
Here is a simple NEON implementation:
#include <arm_neon.h>
// ...
int i, w;
int whiteCount = 0;
uint32x4_t v_count = { 0 };
for (q = i; q < i + windowHeight; q++)
{
const bool * const row = &imageData[q * W];
uint16x8_t vrow_count = { 0 };
for (w = j; w <= j + windowWidth - 16; w += 16) // SIMD loop
{
uint8x16_t v = vld1q_u8(&row[j]); // load 16 x 8 bit pixels
vrow_count = vpadalq_u8(vrow_count, v); // accumulate 16 bit row counts
}
for ( ; w < j + windowWidth; ++w) // scalar clean up loop
{
whiteCount += row[j];
}
v_count = vpadalq_u16(v_count, vrow_count); // update 32 bit image counts
} // from 16 bit row counts
// add 4 x 32 bit partial counts from SIMD loop to scalar total
whiteCount += vgetq_lane_s32(v_count, 0);
whiteCount += vgetq_lane_s32(v_count, 1);
whiteCount += vgetq_lane_s32(v_count, 2);
whiteCount += vgetq_lane_s32(v_count, 3);
// total is now in whiteCount
(This assumes that imageData[] is truly binary, imageWidth <= 2^19, and sizeof(bool) == 1.)
Updated version for unsigned char and values of 255 for white, 0 for black:
#include <arm_neon.h>
// ...
int i, w;
int whiteCount = 0;
const uint8x16_t v_mask = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
uint32x4_t v_count = { 0 };
for (q = i; q < i + windowHeight; q++)
{
const uint8_t * const row = &imageData[q * W];
uint16x8_t vrow_count = { 0 };
for (w = j; w <= j + windowWidth - 16; w += 16) // SIMD loop
{
uint8x16_t v = vld1q_u8(&row[j]); // load 16 x 8 bit pixels
v = vandq_u8(v, v_mask); // mask out all but LS bit
vrow_count = vpadalq_u8(vrow_count, v); // accumulate 16 bit row counts
}
for ( ; w < j + windowWidth; ++w) // scalar clean up loop
{
whiteCount += (row[j] == 255);
}
v_count = vpadalq_u16(v_count, vrow_count); // update 32 bit image counts
} // from 16 bit row counts
// add 4 x 32 bit partial counts from SIMD loop to scalar total
whiteCount += vgetq_lane_s32(v_count, 0);
whiteCount += vgetq_lane_s32(v_count, 1);
whiteCount += vgetq_lane_s32(v_count, 2);
whiteCount += vgetq_lane_s32(v_count, 3);
// total is now in whiteCount
(This assumes that imageData[] is has values of 255 for white and 0 for black, and imageWidth <= 2^19.)
Note that all the above code is untested and may need some further work.
http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
Section 6.55.3.6
The vectorized algorithm will do the comparisons and put them in a structure for you, but you'd still need to go through each element of the structure and determine if it's a zero or not.
How fast does that loop currently run and how fast do you need it to run? Also remember that NEON will work in the same registers as the floating point unit, so using NEON here may force an FPU context switch.

Cuda-memcheck not reporting out of bounds shared memory access

I am runnig the follwoing code using shared memory:
__global__ void computeAddShared(int *in , int *out, int sizeInput){
//not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
extern __shared__ float temp[];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int ltid = threadIdx.x;
temp[ltid] = 0;
while(tid < sizeInput){
temp[ltid] += in[tid];
tid+=gridDim.x * blockDim.x; // to handle array of any size
}
__syncthreads();
int offset = 1;
while(offset < blockDim.x){
if(ltid % (offset * 2) == 0){
temp[ltid] = temp[ltid] + temp[ltid + offset];
}
__syncthreads();
offset*=2;
}
if(ltid == 0){
out[blockIdx.x] = temp[0];
}
}
int main(){
int size = 16; // size of present input array. Changes after every loop iteration
int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
/*FILE *f;
f = fopen("invertedList.txt" , "w");
a[0] = 1 + (rand() % 8);
fprintf(f, "%d,",a[0]);
for( int i = 1 ; i< N; i++){
a[i] = a[i-1] + (rand() % 8) + 1;
fprintf(f, "%d,",a[i]);
}
fclose(f);*/
int* gidata;
int* godata;
cudaMalloc((void**)&gidata, size* sizeof(int));
cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
int TPB = 4;
int blocks = 10; //to get things kicked off
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
while(blocks != 1 ){
if(size < TPB){
TPB = size; // size is 2^sth
}
blocks = (size+ TPB -1 ) / TPB;
cudaMalloc((void**)&godata, blocks * sizeof(int));
computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size);
cudaFree(gidata);
gidata = godata;
size = blocks;
}
//printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("time is %f ms", elapsedTime);
int *output = (int*)malloc(sizeof(int));
cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
//Cant free either earlier as both point to same location
cudaError_t chk = cudaFree(godata);
if(chk!=0){
printf("First chk also printed error. Maybe error in my logic\n");
}
printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
printf("The sum of the array is %d\n", output[0]);
getchar();
return 0;
}
Clearly, the first while loop in computeAddShared is causing out of bounds error because I am allocating 4 bytes to shared memory. Why does cudamemcheck not catch this. Below is the output of cuda-memcheck
========= CUDA-MEMCHECK
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13
6
========= ERROR SUMMARY: 0 errors
Shared memory allocation granularity. The Hardware undoubtedly has a page size for allocations (probably the same as the L1 cache line side). With only 4 threads per block, there will "accidentally" be enough shared memory in a single page to let you code work. If you used a sensible number of threads block (ie. a round multiple of the warp size) the error would be detected because there would not be enough allocated memory.

CUDA memory limitations

If I try to send to my CUDA device a struct wich is heavier than the size of memory available, will CUDA give me any kind of warning or error?
I'm asking that because my GPU has 1024 MBytes (1073414144 bytes) Total amount of global memory, but I don't know how I should handle and eventual problem.
That's my code:
#define VECSIZE 2250000
#define WIDTH 1500
#define HEIGHT 1500
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
int width;
int height;
int* elements;
};
int main()
{
Matrix M;
M.width = WIDTH;
M.height = HEIGHT;
M.elements = (int *) calloc(VECSIZE,sizeof(int));
int row, col;
// define Matrix M
// Matrix generator:
for (int i = 0; i < M.height; i++)
for(int j = 0; j < M.width; j++)
{
row = i;
col = j;
if (i == j)
M.elements[row * M.width + col] = INFINITY;
else
{
M.elements[row * M.width + col] = (rand() % 2); // because 'rand() % 1' just does not seems to work ta all.
if (M.elements[row * M.width + col] == 0) // can't have zero weight.
M.elements[row * M.width + col] = INFINITY;
else if (M.elements[row * M.width + col] == 2)
M.elements[row * M.width + col] = 1;
}
}
// Declare & send device Matrix to Device.
Matrix d_M;
d_M.width = M.width;
d_M.height = M.height;
size_t size = M.width * M.height * sizeof(int);
cudaMalloc(&d_M.elements, size);
cudaMemcpy(d_M.elements, M.elements, size, cudaMemcpyHostToDevice);
int *d_k= (int*) malloc(sizeof(int));
cudaMalloc((void**) &d_k, sizeof (int));
int *d_width=(int*)malloc(sizeof(int));
cudaMalloc((void**) &d_width, sizeof(int));
unsigned int *width=(unsigned int*)malloc(sizeof(unsigned int));
width[0] = M.width;
cudaMemcpy(d_width, width, sizeof(int), cudaMemcpyHostToDevice);
int *d_height=(int*)malloc(sizeof(int));
cudaMalloc((void**) &d_height, sizeof(int));
unsigned int *height=(unsigned int*)malloc(sizeof(unsigned int));
height[0] = M.height;
cudaMemcpy(d_height, height, sizeof(int), cudaMemcpyHostToDevice);
/*
et cetera .. */
While you may not currently be sending enough data to the GPU to max out it's memory, when you do, your cudaMalloc will return the error code cudaErrorMemoryAllocation which as per the cuda api docs, signals that the memory allocation failed. I note that in your example code you are not checking the return values of the cuda calls. These return codes need to be checked to make sure your program is running correctly. The cuda api does not throw exceptions: you must check the return codes. See this article for info on checking the errors and getting meaningful messages about the errors
If you are using cutil.h, then it provides two very useful macros:
CUDA_SAFE_CALL (used while issuing functions like cudaMalloc, cudaMemcpy etc.)
and
CUT_CHECK_ERROR (used after executing a kernel to check for errors in kernel execution).
They take care of the errors, if any, by using the error checking mechanism detailed in the article provided by flipchart.

Search for lines with a small range of angles in OpenCV

I'm using the Hough transform in OpenCV to detect lines. However, I know in advance that I only need lines within a very limited range of angles (about 10 degrees or so). I'm doing this in a very performance sensitive setting, so I'd like to avoid the extra work spent detecting lines at other angles, lines I know in advance I don't care about.
I could extract the Hough source from OpenCV and just hack it to take min_rho and max_rho parameters, but I'd like a less fragile approach (have to manually update my code w/ each OpenCV update, etc.).
What's the best approach here?
Well, i've modified the icvHoughlines function to go for a certain range of angles. I'm sure there's cleaner ways that plays with memory allocation as well, but I got a speed gain going from 100ms to 33ms for a range of angle going from 180deg to 60deg, so i'm happy with that.
Note that this code also outputs the accumulator value. Also, I only output 1 line because that fit my purposes but there was no gain really there.
static void
icvHoughLinesStandard2( const CvMat* img, float rho, float theta,
int threshold, CvSeq *lines, int linesMax )
{
cv::AutoBuffer<int> _accum, _sort_buf;
cv::AutoBuffer<float> _tabSin, _tabCos;
const uchar* image;
int step, width, height;
int numangle, numrho;
int total = 0;
float ang;
int r, n;
int i, j;
float irho = 1 / rho;
double scale;
CV_Assert( CV_IS_MAT(img) && CV_MAT_TYPE(img->type) == CV_8UC1 );
image = img->data.ptr;
step = img->step;
width = img->cols;
height = img->rows;
numangle = cvRound(CV_PI / theta);
numrho = cvRound(((width + height) * 2 + 1) / rho);
_accum.allocate((numangle+2) * (numrho+2));
_sort_buf.allocate(numangle * numrho);
_tabSin.allocate(numangle);
_tabCos.allocate(numangle);
int *accum = _accum, *sort_buf = _sort_buf;
float *tabSin = _tabSin, *tabCos = _tabCos;
memset( accum, 0, sizeof(accum[0]) * (numangle+2) * (numrho+2) );
// find n and ang limits (in our case we want 60 to 120
float limit_min = 60.0/180.0*PI;
float limit_max = 120.0/180.0*PI;
//num_steps = (limit_max - limit_min)/theta;
int start_n = floor(limit_min/theta);
int stop_n = floor(limit_max/theta);
for( ang = limit_min, n = start_n; n < stop_n; ang += theta, n++ )
{
tabSin[n] = (float)(sin(ang) * irho);
tabCos[n] = (float)(cos(ang) * irho);
}
// stage 1. fill accumulator
for( i = 0; i < height; i++ )
for( j = 0; j < width; j++ )
{
if( image[i * step + j] != 0 )
//
for( n = start_n; n < stop_n; n++ )
{
r = cvRound( j * tabCos[n] + i * tabSin[n] );
r += (numrho - 1) / 2;
accum[(n+1) * (numrho+2) + r+1]++;
}
}
int max_accum = 0;
int max_ind = 0;
for( r = 0; r < numrho; r++ )
{
for( n = start_n; n < stop_n; n++ )
{
int base = (n+1) * (numrho+2) + r+1;
if (accum[base] > max_accum)
{
max_accum = accum[base];
max_ind = base;
}
}
}
CvLinePolar2 line;
scale = 1./(numrho+2);
int idx = max_ind;
n = cvFloor(idx*scale) - 1;
r = idx - (n+1)*(numrho+2) - 1;
line.rho = (r - (numrho - 1)*0.5f) * rho;
line.angle = n * theta;
line.votes = accum[idx];
cvSeqPush( lines, &line );
}
If you use the Probabilistic Hough transform then the output is in the form of a cvPoint each for lines[0] and lines[1] parameters. We can get x and y co-ordinated for each of the two points by pt1.x, pt1.y and pt2.x and pt2.y.
Then use the simple formula for finding slope of a line - (y2-y1)/(x2-x1). Taking arctan (tan inverse) of that will yield that angle in radians. Then simply filter out desired angles from the values for each hough line obtained.
I think it's more natural to use standart HoughLines(...) function, which gives collection of lines directly in rho and theta terms and select nessessary angle range from it, rather than recalculate angle from segment end points.

Resources