Resize image using nearest neighbor with CUDA - OpenCV

I am implementing a nearest-neighbor kernel to resize an input image, but the result is wrong and I have no idea why.
[input image and incorrect result omitted]
I use OpenCV to read the input image:
cv::Mat image = cv::imread("/home/tumh/test.jpg");
unsigned char* data = image.data;
int outH, outW;
float *out_data_host = test(data, image.rows, image.cols, outH, outW);
cv::Mat out_image(outH, outW, CV_32FC3);
memcpy(out_image.data, out_data_host, outH * outW * 3 * sizeof(float));

float* test(unsigned char* in_data_host, const int &inH, const int &inW, int &outH, int &outW) {
    // get the output size
    int im_size_min = std::min(inW, inH);
    int im_size_max = std::max(inW, inH);
    float scale_factor = static_cast<float>(640) / im_size_min;
    float im_scale_x = std::floor(inW * scale_factor / 64) * 64 / inW;
    float im_scale_y = std::floor(inH * scale_factor / 64) * 64 / inH;
    outW = inW * im_scale_x;
    outH = inH * im_scale_y;
    int channel = 3;

    unsigned char* in_data_dev;
    CUDA_CHECK(cudaMalloc(&in_data_dev, sizeof(unsigned char) * channel * inH * inW));
    CUDA_CHECK(cudaMemcpy(in_data_dev, in_data_host, 1 * sizeof(unsigned char) * channel * inH * inW, cudaMemcpyHostToDevice));

    // image pre process
    const float2 scale = make_float2(im_scale_x, im_scale_y);
    float* out_buffer = NULL;
    CUDA_CHECK(cudaMalloc(&out_buffer, sizeof(float) * channel * outH * outW));
    float* out_data_host = new float[sizeof(float) * channel * outH * outW];

    const dim3 threads(32, 32);
    const dim3 block(iDivUp(outW, threads.x), iDivUp(outW, threads.y));

    gpuPreImageNet<<<block, threads>>>(scale, in_data_dev, inW, out_buffer, outW, outH);

    CUDA_CHECK(cudaFree(in_data_dev));
    CUDA_CHECK(cudaMemcpy(out_data_host, out_buffer, sizeof(float) * channel * outH * outW, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(out_buffer));
    return out_data_host;
}
Here is the resize kernel function:
__global__ void gpuPreImageNet( float2 scale, unsigned char* input, int iWidth, float* output, int oWidth, int oHeight )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = oWidth * oHeight;
    int channel = 3;

    if( x >= oWidth || y >= oHeight )
        return;

    const int dx = ((float)x * scale.x);
    const int dy = ((float)y * scale.y);

    const unsigned char* px = input + dy * iWidth * channel + dx * channel;
    const float3 bgr = make_float3(*(px + 0), *(px + 1), *(px + 2));

    output[channel * y * oWidth + channel * x + 0] = bgr.x;
    output[channel * y * oWidth + channel * x + 1] = bgr.y;
    output[channel * y * oWidth + channel * x + 2] = bgr.z;
}
Most of the implementation is from https://github.com/soulsheng/ResizeNN/blob/master/resizeCUDA/resizeNN.cu
Any idea?

Maybe you are observing an uninitialized-memory problem.
As I understand your code, the out_data_host allocation is too big:
new float[sizeof(float) * channel * outH * outW];
should be
new float[channel * outH * outW];
Then out_buffer is uninitialized; add a cudaMemset after the cudaMalloc line.
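For example, a minimal sketch of that fix (CUDA_CHECK being the same error-checking macro the question already uses):
CUDA_CHECK(cudaMalloc(&out_buffer, sizeof(float) * channel * outH * outW));
// Zero the buffer so any pixels the kernel never writes read back as 0 instead of garbage.
CUDA_CHECK(cudaMemset(out_buffer, 0, sizeof(float) * channel * outH * outW));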
To simplify your code: since you already use OpenCV to load images, why don't you use OpenCV to resize them too?
cv::resize        // host-side method; probably better, since less data gets copied across PCI-Express
// or
cv::cuda::resize
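For reference, a host-side sketch (assuming the same image, outW and outH as in the question, and converting to float to match the CUDA path's output):
cv::Mat resized;
cv::resize(image, resized, cv::Size(outW, outH), 0, 0, cv::INTER_NEAREST);  // nearest-neighbor
cv::Mat out_image;
resized.convertTo(out_image, CV_32FC3);  // CV_8UC3 -> CV_32FC3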

It took me around two days to figure out a solution to this problem. I was building a GPU-based image preprocessing pipeline for my project; here's the custom CUDA kernel.
For grayscale image resizing, change channel from 3 to 1 and it should work:
__global__ void resize_kernel( real* pIn, real* pOut, int widthIn, int heightIn, int widthOut, int heightOut)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int channel = 3;

    if( i < heightOut && j < widthOut )
    {
        int iIn = i * heightIn / heightOut;
        int jIn = j * widthIn / widthOut;
        for(int c = 0; c < channel; c++)
            pOut[ (i*widthOut + j)*channel + c ] = pIn[ (iIn*widthIn + jIn)*channel + c ];
    }
}
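A matching launch might look like this (a sketch, assuming real is a typedef for float and that d_in/d_out are hypothetical device buffers you have already allocated and filled):
typedef float real;

const dim3 threads(16, 16);
// Round up so the grid covers every output pixel even when the size is not a multiple of 16.
const dim3 blocks((widthOut  + threads.x - 1) / threads.x,
                  (heightOut + threads.y - 1) / threads.y);
resize_kernel<<<blocks, threads>>>(d_in, d_out, widthIn, heightIn, widthOut, heightOut);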

Related

WTV in OpenCV's OpenCL code for image resizing

What does WTV stand for in the following OpenCL code?
I can't find much info on it. The code is from OpenCV, for image processing on the GPU.
__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         float ifx, float ify, __global const int * ofs_tab,
                         __global const int * map_tab, __global const float * alpha_tab)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        int dst_index = mad24(dy, dst_step, dst_offset);

        __global const int * xmap_tab = map_tab;
        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
        __global const float * xalpha_tab = alpha_tab;
        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
        __global const int * xofs_tab = ofs_tab;
        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);

        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];

        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];

        WTV sum = (WTV)(0), buf;
        int src_index = mad24(sy0, src_step, src_offset);

        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
        {
            WTV beta = (WTV)(yalpha_tab[yk]);
            buf = (WTV)(0);
            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
            {
                WTV alpha = (WTV)(xalpha_tab[xk]);
                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
            }
            sum += buf * beta;
        }

        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
    }
}
It is not defined in the source you shared. It appears to be a type, like float. Just guessing: it's defined with "-D WTV=something" when the kernel is compiled.
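As an illustration only (a hypothetical build line, not the one OpenCV actually generates; the macro names are taken from the kernel above, and program/device are assumed to be an existing cl_program and cl_device_id):
// OpenCL kernels are frequently specialized at compile time with -D defines,
// so identifiers like WTV, convertToWTV, convertToT and TSIZE only exist
// once the host passes them as build options:
const char *buildOpts =
    "-D T=uchar4 -D WTV=float4 -D convertToWTV=convert_float4 "
    "-D convertToT=convert_uchar4_sat -D TSIZE=4";
clBuildProgram(program, 1, &device, buildOpts, NULL, NULL);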

Example usage of a libyuv API MJPGToI420()

I'm trying to use a libyuv API, more specifically MJPGToI420().
I want to first take a jpeg image as input to MJPGToI420(), the signature of which is below:
int MJPGToI420(const uint8_t* sample,
               size_t sample_size,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int src_width,
               int src_height,
               int dst_width,
               int dst_height);
Then, I want to allocate space for the dst_y, dst_u, and dst_v pointers. However, I don't know how much space to allocate for them. I'm also confused about what the strides should be, i.e., what the parameters dst_stride_y, dst_stride_u and dst_stride_v should be.
Would really appreciate any pointers in the right direction.
EDIT: Here's a snippet of code from the libyuv source unit tests that uses this function. Note that the test expects a return value of 1, i.e. failure, as the intended behavior, and it just uses zeroes for the data instead of an actual MJPG file.
TEST_F(LibYUVConvertTest, MJPGToI420) {
    const int kOff = 10;
    const int kMinJpeg = 64;
    const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
                               ? benchmark_width_ * benchmark_height_
                               : kMinJpeg;
    const int kSize = kImageSize + kOff;
    align_buffer_page_end(orig_pixels, kSize);
    align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
    align_buffer_page_end(dst_u_opt,
                          SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
    align_buffer_page_end(dst_v_opt,
                          SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));

    // EOI, SOI to make MJPG appear valid.
    memset(orig_pixels, 0, kSize);
    orig_pixels[0] = 0xff;
    orig_pixels[1] = 0xd8;  // SOI.
    orig_pixels[kSize - kOff + 0] = 0xff;
    orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.

    for (int times = 0; times < benchmark_iterations_; ++times) {
        int ret =
            MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
                       SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
                       SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
                       benchmark_height_, benchmark_width_, benchmark_height_);
        // Expect failure because image is not really valid.
        EXPECT_EQ(1, ret);
    }

    free_aligned_buffer_page_end(dst_y_opt);
    free_aligned_buffer_page_end(dst_u_opt);
    free_aligned_buffer_page_end(dst_v_opt);
    free_aligned_buffer_page_end(orig_pixels);
}
EDIT 2: Furthermore, this is what I've tried. However, the resulting YUV files (written from the buffers dst_u_opt and dst_y_opt) are not even viewable in a YUV viewer, which makes me believe I'm messing something up with the function:
int convertMJPGToI420() {
    auto fileSize = filesize(IMG_NAME);

    // load image into memory
    uint8_t* my_img = (uint8_t*) calloc(fileSize, 1);
    std::ifstream fin(IMG_NAME, ios::in | ios::binary);
    fin.read(reinterpret_cast<char*>(my_img), fileSize);

    // exif data offset
    // This is the size of the exif data
    const int kOff = 4096;

    // 4k image is being sent in
    int benchmark_width_ = 3840;
    int benchmark_height_ = 2160;
    const int kSize = fileSize;

    // align_buffer_page_end is a macro (look at link posted for unit tests above)
    // I'm not sure if the size allocation for these is correct
    // I have tried to model it based off the example
    align_buffer_page_end(orig_pixels, kSize);
    align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
    align_buffer_page_end(dst_u_opt,
                          SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
    align_buffer_page_end(dst_v_opt,
                          SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));

    // EOI, SOI to make MJPG appear valid
    memset(orig_pixels, 0, kSize);
    orig_pixels[0] = 0xff;
    orig_pixels[1] = 0xd8; // SOI
    memcpy(orig_pixels + 2, my_img, kSize - kOff - 3);
    orig_pixels[kSize - kOff + 0] = 0xff;
    orig_pixels[kSize - kOff + 1] = 0xd9; // EOI

    // using async as this function might be asynchronous
    std::future<int> ret = std::async(libyuv::MJPGToI420, orig_pixels, kSize, dst_y_opt, benchmark_width_,
                                      dst_u_opt, SUBSAMPLE(benchmark_width_, 2),
                                      dst_v_opt, SUBSAMPLE(benchmark_width_, 2),
                                      benchmark_width_, benchmark_height_,
                                      benchmark_width_, benchmark_height_);
    ret.wait();

    // ret is always one, which means there was a failure
    if(ret.get() == 0) {
        cout << "return value was zero" << endl;
    } else {
        cout << "return value was one" << endl;
    }

    FILE* file = fopen("/data/dst_u_opt", "wb");
    fwrite(dst_y_opt, 1, SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2), file);
    fclose(file);

    file = fopen("/data/dst_v_opt", "wb");
    fwrite(dst_y_opt, 1, SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2), file);
    fclose(file);

    free_aligned_buffer_page_end(dst_y_opt);
    free_aligned_buffer_page_end(dst_u_opt);
    free_aligned_buffer_page_end(dst_v_opt);
    free_aligned_buffer_page_end(orig_pixels);
    return 0;
}
You'll need to know the width and height of the JPEG.
I420 is 4:2:0 sub-sampled YUV.
The Y plane is width * height in bytes, and dst_stride_y is width, e.g.
char* dst_y = malloc(width * height);
The U and V planes are half the width and half the height. To handle odd sizes you should round up:
dst_stride_u = (width + 1) / 2;
dst_stride_v = (width + 1) / 2;
The U and V planes are each ((width + 1) / 2) * ((height + 1) / 2) bytes:
char* dst_u = malloc(((width + 1) / 2) * ((height + 1) / 2));
char* dst_v = malloc(((width + 1) / 2) * ((height + 1) / 2));
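Putting that together, a minimal sketch (jpeg_data, jpeg_size, width and height are assumed to be known already, e.g. from parsing the JPEG header; libyuv returns 0 on success):
int y_stride = width;
int uv_stride = (width + 1) / 2;
uint8_t* dst_y = (uint8_t*)malloc(width * height);                  // full-resolution luma
uint8_t* dst_u = (uint8_t*)malloc(uv_stride * ((height + 1) / 2));  // quarter-size chroma
uint8_t* dst_v = (uint8_t*)malloc(uv_stride * ((height + 1) / 2));
int ret = MJPGToI420(jpeg_data, jpeg_size,
                     dst_y, y_stride,
                     dst_u, uv_stride,
                     dst_v, uv_stride,
                     width, height,    // source size
                     width, height);   // destination size
// ret == 0 indicates success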
If you'd like to file an issue, including better documentation, post it here:
https://bugs.chromium.org/p/libyuv/issues/list

RGB2GRAY with CUDA and CImg library

I need to implement an RGB2GRAY image processing algorithm. I just need some help completing the __global__ function, i.e. how to access the d_src pointer. This is my code; your help will be greatly appreciated.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CImg.h"
#include <iostream>
using namespace std;
using namespace cimg_library;
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
if (pos_x >= width || pos_y >= height)
return;
}
int main(){
//Load image
CImg<unsigned char> src("lena.jpg");
int width = src.width();
int height = src.height();
unsigned long sizee = src.size();
int sze = width * height;
cout << sze << endl;
//create pointer to image
unsigned char *h_src = src.data();
CImg<unsigned char> dst(width, height, 1, 1);
unsigned char *h_dst = dst.data();
unsigned char *d_src;
unsigned char *d_dst;
cout << sizee << endl;
cudaMalloc((void**)&d_src, sizee);
cudaMalloc((void**)&d_dst, width*height*sizeof(int));
cudaMemcpy(d_src, h_src, sizee, cudaMemcpyHostToDevice);
//launch the kernel
rgb2gray << <(width/16,height/16,1), (16, 16, 1) >> >(d_src, d_dst, width, height);
//force the printf()s to flush
cudaDeviceSynchronize();
// copy back the result array to the CPU
cudaMemcpy(h_dst, d_dst, width*height, cudaMemcpyDeviceToHost);
cudaFree(d_src);
cudaFree(d_dst);
CImgDisplay main_disp(dst, "After Processing");
while (!main_disp.is_closed())
main_disp.wait();
return 0;
}
Firstly, since your dst object consists of unsigned char, allocate d_dst as follows:
cudaMalloc((void**)&d_dst, width*height*sizeof(unsigned char));
Next, the grid must cover every pixel, including cases where width or height is not a multiple of 16. Launch the kernel with the following configuration:
dim3 blkDim (16, 16, 1);
dim3 grdDim ((width + 15)/16, (height + 15)/16, 1);
rgb2gray<<<grdDim, blkDim>>>(d_src, d_dst, width, height);
Lastly, your kernel should look like this. Note that the RGB channels are split into separate planes in d_src, because CImg stores images in planar order:
__global__ void rgb2gray(unsigned char * d_src, unsigned char * d_dst, int width, int height){
    int pos_x = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (pos_x >= width || pos_y >= height)
        return;

    unsigned char r = d_src[pos_y * width + pos_x];
    unsigned char g = d_src[(height + pos_y) * width + pos_x];
    unsigned char b = d_src[(height * 2 + pos_y) * width + pos_x];

    unsigned int _gray = (unsigned int)((float)(r + g + b) / 3.0f + 0.5);
    unsigned char gray = _gray > 255 ? 255 : _gray;

    d_dst[pos_y * width + pos_x] = gray;
}

Converting cv::Mat to MTLTexture

An intermediate step of my current project requires conversion of OpenCV's cv::Mat to MTLTexture, the texture container of Metal. I need to store the Floats in the Mat as Floats in the texture; my project cannot quite afford the loss of precision.
This is my attempt at such a conversion.
- (id<MTLTexture>)texForMat:(cv::Mat)image context:(MBEContext *)context
{
    id<MTLTexture> texture;
    int width = image.cols;
    int height = image.rows;

    Float32 *rawData = (Float32 *)calloc(height * width * 4, sizeof(float));
    int bytesPerPixel = 4;
    int bytesPerRow = bytesPerPixel * width;
    float r, g, b, a;

    for (int i = 0; i < height; i++)
    {
        Float32 *imageData = (Float32 *)(image.data + image.step * i);
        for (int j = 0; j < width; j++)
        {
            r = (Float32)(imageData[4 * j]);
            g = (Float32)(imageData[4 * j + 1]);
            b = (Float32)(imageData[4 * j + 2]);
            a = (Float32)(imageData[4 * j + 3]);
            rawData[image.step * (i) + (4 * j)] = r;
            rawData[image.step * (i) + (4 * j + 1)] = g;
            rawData[image.step * (i) + (4 * j + 2)] = b;
            rawData[image.step * (i) + (4 * j + 3)] = a;
        }
    }

    MTLTextureDescriptor *textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
                                                                                                 width:width
                                                                                                height:height
                                                                                             mipmapped:NO];
    texture = [context.device newTextureWithDescriptor:textureDescriptor];

    MTLRegion region = MTLRegionMake2D(0, 0, width, height);
    [texture replaceRegion:region mipmapLevel:0 withBytes:rawData bytesPerRow:bytesPerRow];

    free(rawData);
    return texture;
}
But it doesn't seem to be working. It reads zeroes every time from the Mat, and throws EXC_BAD_ACCESS. I need the MTLTexture in MTLPixelFormatRGBA16Float to keep the precision.
Thanks for considering this issue.
One problem here is you're loading up rawData with Float32s but your texture is RGBA16Float, so the data will be corrupted (a 16-bit half float is half the size of a Float32). This shouldn't cause your crash, but it's an issue you'll have to deal with.
Also, as chappjc noted, you're using image.step when writing your data out, but rawData is a contiguous buffer whose row stride is just width * bytesPerPixel. Moreover, image.step is measured in bytes while rawData is indexed in floats, so using it as an index runs well past the end of the buffer.
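For the indexing part, a sketch of the fix (keeping the copy loop but indexing rawData in floats; the pixel-format question is separate, e.g. MTLPixelFormatRGBA32Float would match Float32 data directly):
// rawData is a contiguous Float32 buffer: 4 floats per pixel, width * 4 floats per row.
// image.step is in bytes and must not be used as an index into rawData.
rawData[(i * width + j) * 4 + 0] = r;
rawData[(i * width + j) * 4 + 1] = g;
rawData[(i * width + j) * 4 + 2] = b;
rawData[(i * width + j) * 4 + 3] = a;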

Why do operations with an array corrupt the values?

I'm trying to implement Particle Swarm Optimization on CUDA. I partially initialize data arrays on the host, then allocate memory on the device and copy the data there, and then try to proceed with the initialization.
The problem is, when I try to modify an array element like so:
__global__ void kernelInit(
    float* X,
    size_t pitch,
    int width,
    float X_high,
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}
it corrupts the values and gives me 1.#INF00 as the array element. When I uncomment the last line, *pElement = (X_high - X_low) - X_low;, and comment out the previous one, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with addressing the specific array element. I read the CUDA manual on both topics, but I can't spot the error: I still get a corrupt array if I do anything with an element of the array. For example, *pElement = *pElement * 2 gives unreasonably big results like 779616...00000000.00000, when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of the arrays begins in main (at the bottom of the source); the f1 function then does the CUDA work and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;

typedef struct {
    unsigned k_max;
    float c1;
    float c2;
    unsigned p;
    float inertia_factor;
    float Ef;
    float X_low[f_n];
    float X_high[f_n];
    float X_min[n][f_n];
} params_t;

typedef void (*kernelWrapperType) (
    float *X,
    float *X_highVec,
    float *V,
    float *X_best,
    float *Y,
    float *Y_best,
    float *X_swarmBest,
    bool &termination,
    const float &inertia,
    const params_t *params,
    const unsigned &f
);

typedef float (*twoArgsFuncType) (
    float x1,
    float x2
);

__global__ void kernelInit(
    float* X,
    size_t pitch,
    int width,
    float X_high,
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}

__device__ float kernelF1(
    float x1,
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

void f1(
    float *X,
    float *X_highVec,
    float *V,
    float *X_best,
    float *Y,
    float *Y_best,
    float *X_swarmBest,
    bool &termination,
    const float &inertia,
    const params_t *params,
    const unsigned &f
) {
    float *X_d = NULL;
    float *Y_d = NULL;
    unsigned length = n * p;
    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
    size_t pitch;
    size_t dpitch;
    cudaError_t err;
    unsigned width = n;
    unsigned height = p;

    err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
    pitch = n * sizeof(float);
    err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);

    err = cudaMalloc (&Y_d, sizeof(float) * p);
    err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);

    dim3 threads; threads.x = 32;
    dim3 blocks; blocks.x = (length/threads.x) + 1;

    kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);

    err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
    err = cudaFree(X_d);
    err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
    err = cudaFree(Y_d);
}

float F1(
    float x1,
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

/*
 * Generates random float in [0.0; 1.0]
 */
float frand(){
    return (float)rand()/(float)RAND_MAX;
}

/*
 * This is the main routine which declares and initializes the integer vector, moves it to the device, launches the kernel,
 * brings the result vector back to host and dumps it on the console.
 */
int main() {
    const params_t params = {
        100,
        0.5,
        0.5,
        p,
        0.98,
        0.01,
        {-5.12, -2.048, -5.12},
        {5.12, 2.048, 5.12},
        {{0, 1, 0}, {0, 1, 0}}
    };

    float X[p][n];
    float X_highVec[n];
    float V[p][n];
    float X_best[p][n];
    float Y[p] = {0};
    float Y_best[p] = {0};
    float X_swarmBest[n];

    kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
    twoArgsFuncType F[f_n] = {&F1, &F1, &F1};

    for (unsigned f = 0; f < f_n; f++) {
        printf("Optimizing function #%u\n", f);
        srand ( time(NULL) );

        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                X[i][j] = X_best[i][j] = frand();
        for (int i = 0; i < n; i++)
            X_highVec[i] = params.X_high[f];
        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                V[i][j] = frand();
        for (unsigned i = 0; i < p; i++)
            Y_best[i] = F[f](X[i][0], X[i][1]);
        for (unsigned i = 0; i < n; i++)
            X_swarmBest[i] = params.X_high[f];
        float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);

        bool termination = false;
        float inertia = 1.;

        for (unsigned k = 0; k < params.k_max; k++) {
            F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
        }

        for (unsigned i = 0; i < p; i++)
        {
            for (unsigned j = 0; j < n; j++)
            {
                printf("%f\t", X[i][j]);
            }
            printf("F = %f\n", Y[i]);
        }
        getchar();
    }
}
Update: I tried adding error handling like so
err = cudaMallocPitch(&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
    fprintf(stderr, cudaGetErrorString(err));
    exit(1);
}
after each API call, but it reported nothing and never exited (I still get all the results, and the program runs to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p;
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks-per-grid and threads-per-block arguments in the kernel launch. That may well lead to out-of-bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind-style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
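A corrected configuration might look like this (a sketch; it rounds the block count up and passes the grid dimension first):
unsigned length = n * p;                                       // 2 * 64 = 128 elements
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length + threads.x - 1) / threads.x;  // = 4, rounds up without adding a whole extra block
// <<<blocks per grid, threads per block>>>
kernelInit<<<blocks, threads>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
Note that if length were not a multiple of the block size, the kernel would also need a tid < length guard, which it currently lacks.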
As for the infinite values, that is to be expected, isn't it? For each function your code starts with values in (0,1) and then applies
X[i] = X[i] * (5.12 - (-5.12)) - (-5.12)
i.e. X[i] = 10.24 * X[i] + 5.12,
k_max = 100 times (the 2.048/-2.048 bounds of the second function behave the same way, just with a smaller factor). That is the rough equivalent of multiplying by 10.24^100, on the order of 10^100, which is vastly larger than 3.4E38, the rough upper limit of representable numbers in IEEE 754 single precision. Hence the infinities.
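A tiny host-only illustration of that growth (a sketch, with 0.5 as a representative starting value):
#include <cstdio>

int main() {
    float x = 0.5f;                  // initial value in (0, 1)
    for (int k = 0; k < 100; ++k)
        x = x * 10.24f + 5.12f;      // the same update the kernel applies
    printf("%g\n", x);               // prints inf: 10.24^100 is ~1e101, far above FLT_MAX (~3.4e38)
    return 0;
}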
