Related
What does WTV stands for in the following Opencl code?
I can't find much info for that. The code is from Opencv for processing on gpu.
__
kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
float ifx, float ify, __global const int * ofs_tab,
__global const int * map_tab, __global const float * alpha_tab)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if (dx < dst_cols && dy < dst_rows)
{
int dst_index = mad24(dy, dst_step, dst_offset);
__global const int * xmap_tab = map_tab;
__global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
__global const float * xalpha_tab = alpha_tab;
__global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
__global const int * xofs_tab = ofs_tab;
__global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);
int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];
WTV sum = (WTV)(0), buf;
int src_index = mad24(sy0, src_step, src_offset);
for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
{
WTV beta = (WTV)(yalpha_tab[yk]);
buf = (WTV)(0);
for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
{
WTV alpha = (WTV)(xalpha_tab[xk]);
buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
}
sum += buf * beta;
}
storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
}
}
It is not defined in the source you shared. It appears to be a type, like float. Just guessing: it's defined using "-D WTV=something" while compiling the kernel.
I want to identify the Expression like int a = function(b,c), so I wrote the code as followers:
void foo(int* a, int *b) {
int x;
int m;
int z;
int *p;
if (a[0] > 1) {
b[0] = 2;
z=10;
x = function( sizeof(char));
}
m = function( sizeof(char));
bar(x,m);
}
void bar(float x, float y);
int function(int size){
return size;
}
And than I used clang -Xclang -ast-dump -fsyntax-only cfunc_with_if.c to get the AST of the code:
From the result I found the AST Node type of int a = function(b,c) is BinaryOperator. In order to verify this, I use VisitStmt(Stmt *s) to print out all stmts' type.
bool VisitStmt(Stmt *s) {
if(isa<Stmt>(s)) {
Stmt *Statement = dyn_cast<Stmt>(s);
//Statement->dump();
std::string st(Statement->getStmtClassName());
st = st + "\n";
TheRewriter.InsertText(Statement->getLocStart(), st, true, true);
}
return true;
}
But the result is so weird. There is nothing printed out about the type of int a = function(b,c). and I'm so confused about the result. Is there some error in my code or something else?
There's no output at bar(x,m); either. Are there any errors when the tool compiles the code being analyzed? As written above, the code would fail to compile at x = function( sizeof(char)); since function has not been declared. Even when compilation has failed due to errors, the libtool tools can still run at least partially, with strange results.
Edit to add: what happens if you run the tool on this code?
void bar(float x, float y);
int function(int size);
void foo(int* a, int *b) {
int x;
int m;
int z;
int *p;
if (a[0] > 1) {
b[0] = 2;
z=10;
x = function( sizeof(char));
}
m = function( sizeof(char));
bar(x,m);
}
void bar(float x, float y);
int function(int size){
return size;
}
I am trying to implement an algorithm to process images with more than 256 bins.
The main issue to process histogram in such case comes from the impossibility to allocate more than 32 Kb as local tab in the GPU.
All the algorithms I found for 8 bits per pixel images use a fixed size tab locally.
The histogram is the first process in that tab then a barrier is up and at last an addition is made with the output vector.
I am working with IR image which has more than 32K bins of dynamic.
So I cannot allocate a fixed size tab inside the GPU.
My algorithm use an atomic_add in order to create directly the output histogram.
I am interfacing with OpenCV so, in order to manage the possible case of saturation my bins use floating points. Depending on the ability of the GPU to manage single or double precision.
OpenCV doesn't manage unsigned int, long, and unsigned long data type as matrix type.
I have an error... I do think this error is a kind of segmentation fault.
After several days I still have no idea what can be wrong.
Here is my code :
histogram.cl :
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
static void Atomic_Add_f64(__global double *val, double delta)
{
union {
double f;
ulong i;
} old;
union {
double f;
ulong i;
} new;
do {
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
static void Atomic_Add_f32(__global float *val, double delta)
{
union
{
float f;
uint i;
} old;
union
{
float f;
uint i;
} new;
do
{
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
__kernel void khist(
__global const uchar* _src,
const int src_steps,
const int src_offset,
const int rows,
const int cols,
__global uchar* _dst,
const int dst_steps,
const int dst_offset)
{
const int gid = get_global_id(0);
// printf("This message has been printed from the OpenCL kernel %d \n",gid);
if(gid < rows)
{
__global const _Sty* src = (__global const _Sty*)_src;
__global _Dty* dst = (__global _Dty*) _dst;
const int src_step1 = src_steps/sizeof(_Sty);
const int dst_step1 = dst_steps/sizeof(_Dty);
src += mad24(gid,src_step1,src_offset);
dst += mad24(gid,dst_step1,dst_offset);
_Dty one = (_Dty)1;
for(int c=0;c<cols;c++)
{
const _Rty idx = (_Rty)(*(src+c+src_offset));
ATOMIC_FUN(dst+idx+dst_offset,one);
}
}
}
The function Atomic_Add_f64 directly come from here and there
main.cpp
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <fstream>
#include <sstream>
#include <chrono>
int main()
{
cv::Mat_<unsigned short> a(480,640);
cv::RNG rng(std::time(nullptr));
std::for_each(a.begin(),a.end(),[&](unsigned short& v){ v = rng.uniform(0,100);});
bool ret = false;
cv::String file_content;
{
std::ifstream file_stream("../test/histogram.cl");
std::ostringstream file_buf;
file_buf<<file_stream.rdbuf();
file_content = file_buf.str();
}
int output_flag = cv::ocl::Device::getDefault().doubleFPConfig() == 0 ? CV_32F : CV_64F;
cv::String atomic_fun = output_flag == CV_32F ? "Atomic_Add_f32" : "Atomic_Add_f64";
cv::ocl::ProgramSource source(file_content);
// std::cout<<source.source()<<std::endl;
cv::ocl::Kernel k;
cv::UMat src;
cv::UMat dst = cv::UMat::zeros(1,65536,output_flag);
a.copyTo(src);
atomic_fun = cv::format("-D _Sty=%s -D _Rty=%s -D _Dty=%s -D ATOMIC_FUN=%s",
cv::ocl::typeToStr(src.depth()),
cv::ocl::typeToStr(src.depth()), // this to manage case like a matrix of usigned short stored as a matrix of float.
cv::ocl::typeToStr(output_flag),
atomic_fun.c_str());
ret = k.create("khist",source,atomic_fun);
std::cout<<"check create : "<<ret<<std::endl;
k.args(cv::ocl::KernelArg::ReadOnly(src),cv::ocl::KernelArg::WriteOnlyNoSize(dst));
std::size_t sz = a.rows;
ret = k.run(1,&sz,nullptr,false);
std::cout<<"check "<<ret<<std::endl;
cv::Mat b;
dst.copyTo(b);
std::copy_n(b.ptr<double>(0),101,std::ostream_iterator<double>(std::cout," "));
std::cout<<std::endl;
return EXIT_SUCCESS;
}
Hello I arrived to fix it.
I don't really know where the issue come from.
But if I suppose the output as a pointer rather than a matrix it work.
The changes I made are these :
histogram.cl :
__kernel void khist(
__global const uchar* _src,
const int src_steps,
const int src_offset,
const int rows,
const int cols,
__global _Dty* _dst)
{
const int gid = get_global_id(0);
if(gid < rows)
{
__global const _Sty* src = (__global const _Sty*)_src;
__global _Dty* dst = _dst;
const int src_step1 = src_steps/sizeof(_Sty);
src += mad24(gid,src_step1,src_offset);
ulong one = 1;
for(int c=0;c<cols;c++)
{
const _Rty idx = (_Rty)(*(src+c+src_offset));
ATOMIC_FUN(dst+idx,one);
}
}
}
main.cpp
k.args(cv::ocl::KernelArg::ReadOnly(src),cv::ocl::KernelArg::PtrWriteOnly(dst));
The rest of the code is the same in the two files.
For me it work fine.
If someone know why it work when the ouput is declared as a pointer rather than a vector (matrix of one row) I am interested.
Nevertheless my issue is fix :).
Consider the code below:
#include <stdio.h>
#include <stdlib.h>
#define FORCE_CAST(var, type) *(type*)&var
struct processor_status_register
{
unsigned int cwp:5;
unsigned int et:1;
unsigned int ps:1;
unsigned int s:1;
unsigned int pil:4;
unsigned int ef:1;
unsigned int ec:1;
unsigned int reserved:6;
unsigned int c:1;
unsigned int v:1;
unsigned int z:1;
unsigned int n:1;
unsigned int ver:4;
unsigned int impl:4;
}__attribute__ ((__packed__));
struct registers
{
unsigned long* registerSet;
unsigned long* globalRegisters;
unsigned long* cwptr;
unsigned long wim, tbr, y, pc, npc;
unsigned short registerWindows;
/* Though Intel x86 architecture allows un-aligned memory access, SPARC mandates memory accesses to be 8 byte aligned. Without __attribute__ ((aligned (8))) or a preceding dummy byte e.g. unsigned short dummyByte, the code below crashes with a dreaded Bus error and Core dump. For more details, follow the links below:
http://blog.jgc.org/2007/04/debugging-solaris-bus-error-caused-by.html
https://groups.google.com/forum/?fromgroups=#!topic/comp.unix.solaris/8SgFiMudGL4
*/
struct processor_status_register __attribute__ ((aligned (8))) psr;
}__attribute__ ((__packed__));
int getBit(unsigned long bitStream, int position)
{
int bit;
bit = (bitStream & (1 << position)) >> position;
return bit;
}
char* showBits(unsigned long bitStream, int startPosition, int endPosition)
{
// Allocate one extra byte for NULL character
char* bits = (char*)malloc(endPosition - startPosition + 2);
int bitIndex;
for(bitIndex = 0; bitIndex <= endPosition; bitIndex++)
bits[bitIndex] = (getBit(bitStream, endPosition - bitIndex)) ? '1' : '0';
bits[bitIndex] = '\0';
return bits;
}
int main()
{
struct registers sparcRegisters; short isLittleEndian;
// Check for Endianness
unsigned long checkEndian = 0x00000001;
if(*((char*)(&checkEndian)))
{printf("Little Endian\n"); isLittleEndian = 1;} // Little
Endian architecture detected
else
{printf("Big Endian\n"); isLittleEndian = 0;} // Big
Endian architecture detected
unsigned long registerValue = 0xF30010A7;
unsigned long swappedRegisterValue = isLittleEndian ? registerValue :
__builtin_bswap32(registerValue);
sparcRegisters.psr = FORCE_CAST(swappedRegisterValue, struct
processor_status_register);
registerValue = isLittleEndian ? FORCE_CAST (sparcRegisters.psr,
unsigned long) : __builtin_bswap32(FORCE_CAST (sparcRegisters.psr,
unsigned long));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
sparcRegisters.psr.impl, sparcRegisters.psr.ver,
sparcRegisters.psr.cwp);
printf("PSR=%s\n",showBits(registerValue, 0, 31));
sparcRegisters.psr.cwp = 7;
sparcRegisters.psr.et = 1;
sparcRegisters.psr.ps = 0;
sparcRegisters.psr.s = 1;
sparcRegisters.psr.pil = 0;
sparcRegisters.psr.ef = 0;
sparcRegisters.psr.ec = 0;
sparcRegisters.psr.reserved = 0;
sparcRegisters.psr.c = 0;
sparcRegisters.psr.v = 0;
sparcRegisters.psr.z = 0;
sparcRegisters.psr.n = 0;
sparcRegisters.psr.ver = 3;
sparcRegisters.psr.impl = 0xF;
registerValue = isLittleEndian ? FORCE_CAST (sparcRegisters.psr,
unsigned long) : __builtin_bswap32(FORCE_CAST (sparcRegisters.psr,
unsigned long));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
sparcRegisters.psr.impl, sparcRegisters.psr.ver,
sparcRegisters.psr.cwp);
printf("PSR=%s\n\n",showBits(registerValue, 0, 31));
return 0;
}
I have used gcc-4.7.2 on Solaris 10 on SPARC to compile the following
code to produce the Big-Endian output:
Big Endian
PSR=0xF30010A7, IMPL=3, VER=15, CWP=20
PSR=11110011000000000001000010100111
PSR=0x3F00003D, IMPL=15, VER=3, CWP=7
PSR=00111111000000000000000000111101
I have used gcc-4.4 on Ubuntu-10.04 on Intel-x86 to compile the
following code to produce the Little-Endian output:
Little Endian
PSR=0xF30010A7, IMPL=15, VER=3, CWP=7
PSR=11110011000000000001000010100111
PSR=0xF30000A7, IMPL=15, VER=3, CWP=7
PSR=11110011000000000000000010100111
While the later one is as expected, can anyone please explain the
Big-Endian counterpart? Considering the showBits() method to be
correct, how can PSR=0x3F00003D give rise to IMPL=15, VER=3, CWP=7
values? How is the bit-field is being arranged and interpreted in
memory on a Big-Endian system?
... PSR=0x3F00003D give rise to IMPL=15, VER=3, CWP=7 values?
It cant. I don't know why you're calling __builtin_bswap32 but 0x3F00003D does not represent the memory of the sparcRegisters struct as you initialized it.
Lets check this code:
sparcRegisters.psr.cwp = 7;
sparcRegisters.psr.et = 1;
sparcRegisters.psr.ps = 0;
sparcRegisters.psr.s = 1;
sparcRegisters.psr.pil = 0;
sparcRegisters.psr.ef = 0;
sparcRegisters.psr.ec = 0;
sparcRegisters.psr.reserved = 0;
sparcRegisters.psr.c = 0;
sparcRegisters.psr.v = 0;
sparcRegisters.psr.z = 0;
sparcRegisters.psr.n = 0;
sparcRegisters.psr.ver = 3;
sparcRegisters.psr.impl = 0xF;
The individual translations are as follows:
7 => 00111
1 => 1
0 => 0
1 => 1
0 => 0000
0 => 0
0 => 0
0 => 000000
0 => 0
0 => 0
0 => 0
0 => 0
3 => 0011
F => 1111
The structure therefore in memory becomes 00111101000000000000000000111111 which is 0x3D00003F in big-endian.
You can confirm with this code (tested using CC in solaris):
#include <stdio.h>
#include <string.h>
struct processor_status_register
{
unsigned int cwp:5;
unsigned int et:1;
unsigned int ps:1;
unsigned int s:1;
unsigned int pil:4;
unsigned int ef:1;
unsigned int ec:1;
unsigned int reserved:6;
unsigned int c:1;
unsigned int v:1;
unsigned int z:1;
unsigned int n:1;
unsigned int ver:4;
unsigned int impl:4;
}__attribute__ ((__packed__));
int getBit(unsigned long bitStream, int position)
{
int bit;
bit = (bitStream & (1 << position)) >> position;
return bit;
}
char* showBits(unsigned long bitStream, int startPosition, int endPosition)
{
// Allocate one extra byte for NULL character
static char bits[33];
memset(bits, 0, 33);
int bitIndex;
for(bitIndex = 0; bitIndex <= endPosition; bitIndex++)
{
bits[bitIndex] = (getBit(bitStream, endPosition - bitIndex)) ? '1' : '0';
}
return bits;
}
int main()
{
processor_status_register psr;
psr.cwp = 7;
psr.et = 1;
psr.ps = 0;
psr.s = 1;
psr.pil = 0;
psr.ef = 0;
psr.ec = 0;
psr.reserved = 0;
psr.c = 0;
psr.v = 0;
psr.z = 0;
psr.n = 0;
psr.ver = 3;
psr.impl = 0xF;
unsigned long registerValue = 0;
memcpy(®isterValue, &psr, sizeof(registerValue));
printf("\nPSR=0x%0X, IMPL=%u, VER=%u, CWP=%u\n", registerValue,
psr.impl, psr.ver,
psr.cwp);
printf("PSR=%s\n\n",showBits(registerValue, 0, 31));
return 0;
}
The output of this is:
PSR=0x3D00003F, IMPL=15, VER=3, CWP=7
PSR=00111101000000000000000000111111
I'm trying to implement the Particle Swarm Optimization on CUDA. I'm partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.
The problem is, when I'm trying to modify array element like so
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can't spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;
typedef struct {
unsigned k_max;
float c1;
float c2;
unsigned p;
float inertia_factor;
float Ef;
float X_low[f_n];
float X_high[f_n];
float X_min[n][f_n];
} params_t;
typedef void (*kernelWrapperType) (
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
);
typedef float (*twoArgsFuncType) (
float x1,
float x2
);
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
__device__ float kernelF1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
void f1(
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
) {
float *X_d = NULL;
float *Y_d = NULL;
unsigned length = n * p;
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
size_t pitch;
size_t dpitch;
cudaError_t err;
unsigned width = n;
unsigned height = p;
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
pitch = n * sizeof(float);
err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
err = cudaMalloc (&Y_d, sizeof(float) * p);
err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
err = cudaFree(X_d);
err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
err = cudaFree(Y_d);
}
float F1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
/*
* Generates random float in [0.0; 1.0]
*/
float frand(){
return (float)rand()/(float)RAND_MAX;
}
/*
* This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
* brings the result vector back to host and dumps it on the console.
*/
int main() {
const params_t params = {
100,
0.5,
0.5,
p,
0.98,
0.01,
{-5.12, -2.048, -5.12},
{5.12, 2.048, 5.12},
{{0, 1, 0}, {0, 1, 0}}
};
float X[p][n];
float X_highVec[n];
float V[p][n];
float X_best[p][n];
float Y[p] = {0};
float Y_best[p] = {0};
float X_swarmBest[n];
kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
for (unsigned f = 0; f < f_n; f++) {
printf("Optimizing function #%u\n", f);
srand ( time(NULL) );
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
X[i][j] = X_best[i][j] = frand();
for (int i = 0; i < n; i++)
X_highVec[i] = params.X_high[f];
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
V[i][j] = frand();
for (unsigned i = 0; i < p; i++)
Y_best[i] = F[f](X[i][0], X[i][1]);
for (unsigned i = 0; i < n; i++)
X_swarmBest[i] = params.X_high[f];
float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
bool termination = false;
float inertia = 1.;
for (unsigned k = 0; k < params.k_max; k++) {
F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, ¶ms, f);
}
for (unsigned i = 0; i < p; i++)
{
for (unsigned j = 0; j < n; j++)
{
printf("%f\t", X[i][j]);
}
printf("F = %f\n", Y[i]);
}
getchar();
}
}
Update: I tried adding error handling like so
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
fprintf(stderr, cudaGetErrorString(err));
exit(1);
}
after each API call, but it gave me nothing and didn't return (I still get all the results and program works to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
As for infinite values, that is to be expected isn't it? Your code starts with values in (0,1), and then does
X[i] = X[i] * (5.12--5.12) - -5.12
100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by
X[i] = X[i] * (2.048--2.048) - -2.048
100 times, which is the rough equivalent of multiplying by 4^100, finally followed by
X[i] = X[i] * (5.12--5.12) - -5.12
again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.