Objective-C division on 32/64-bit device produces different results - iOS

As described in the title, when I try to do the following division I get two different results depending on the architecture of the device:
unsigned int a = 42033;
unsigned int b = 360;
unsigned int c = 466;
double result = a / (double)(b * c);
// on arm64 -> result = 0.25055436337625181
// on armv7 -> result = 0.24986030696800732
Why don't the results match?
According to Apple's 64-Bit Transition Guide for Cocoa Touch, these data types have the same size in the 32-bit and 64-bit runtimes.
EDIT
The complete code:
#import "UIImage+MyCategory.h"
#define CLIP_THRESHOLD 0.74 // if this much of the image is the clip color, leave it alone
typedef struct {
    unsigned int leftNonColorIndex;
    unsigned int rightNonColorIndex;
    unsigned int nonColorCount;
} scanLineResult;

static inline scanLineResult scanOneLine(unsigned int *scanline, unsigned int count, unsigned int color, unsigned int mask) {
    scanLineResult result = {UINT32_MAX, 0, 0};
    for (int i = 0; i < count; i++) {
        if ((*scanline++ & mask) != color) {
            result.nonColorCount++;
            result.leftNonColorIndex = MIN(result.leftNonColorIndex, i);
            result.rightNonColorIndex = MAX(result.rightNonColorIndex, i);
        }
    }
    return result;
}
typedef struct {
    unsigned int leftNonColorIndex;
    unsigned int topNonColorIndex;
    unsigned int rightNonColorIndex;
    unsigned int bottomNonColorIndex;
    unsigned int nonColorCount;
    double colorRatio;
} colorBoundaries;

static colorBoundaries findTrimColorBoundaries(unsigned int *buffer,
                                               unsigned int width,
                                               unsigned int height,
                                               unsigned int bytesPerRow,
                                               unsigned int color,
                                               unsigned int mask)
{
    colorBoundaries result = {UINT32_MAX, UINT32_MAX, 0, 0, 0, 0.0};
    unsigned int *currentLine = buffer;
    for (int i = 0; i < height; i++) {
        scanLineResult lineResult = scanOneLine(currentLine, width, color, mask);
        if (lineResult.nonColorCount) {
            result.nonColorCount += lineResult.nonColorCount;
            result.topNonColorIndex = MIN(result.topNonColorIndex, i);
            result.bottomNonColorIndex = MAX(result.bottomNonColorIndex, i);
            result.leftNonColorIndex = MIN(result.leftNonColorIndex, lineResult.leftNonColorIndex);
            result.rightNonColorIndex = MAX(result.rightNonColorIndex, lineResult.rightNonColorIndex);
        }
        currentLine = (unsigned int *)((char *)currentLine + bytesPerRow);
    }
    double delta = result.nonColorCount / (double)(width * height);
    result.colorRatio = 1.0 - delta;
    return result;
}
@implementation UIImage (MyCategory)

- (UIImage *)crop:(CGRect)rect {
    rect = CGRectMake(rect.origin.x * self.scale,
                      rect.origin.y * self.scale,
                      rect.size.width * self.scale,
                      rect.size.height * self.scale);
    CGImageRef imageRef = CGImageCreateWithImageInRect([self CGImage], rect);
    UIImage *result = [UIImage imageWithCGImage:imageRef
                                          scale:self.scale
                                    orientation:self.imageOrientation];
    CGImageRelease(imageRef);
    return result;
}
- (UIImage *)trimWhiteBorders {
#ifdef __BIG_ENDIAN__
    // undefined
#else
    const unsigned int whiteXRGB = 0x00ffffff;
    // Which bits to actually check
    const unsigned int maskXRGB = 0x00ffffff;
#endif
    CGImageRef image = [self CGImage];
    CGBitmapInfo bitmapInfo = CGImageGetBitmapInfo(image);
    // Only support default image formats
    if (bitmapInfo != (kCGImageAlphaNoneSkipFirst | kCGBitmapByteOrder32Host))
        return nil;
    CGDataProviderRef dataProvider = CGImageGetDataProvider(image);
    CFDataRef imageData = CGDataProviderCopyData(dataProvider);
    colorBoundaries result = findTrimColorBoundaries((unsigned int *)CFDataGetBytePtr(imageData),
                                                     (unsigned int)CGImageGetWidth(image),
                                                     (unsigned int)CGImageGetHeight(image),
                                                     (unsigned int)CGImageGetBytesPerRow(image),
                                                     whiteXRGB,
                                                     maskXRGB);
    CFRelease(imageData);
    if (result.nonColorCount == 0 || result.colorRatio > CLIP_THRESHOLD)
        return self;
    CGRect trimRect = CGRectMake(result.leftNonColorIndex,
                                 result.topNonColorIndex,
                                 result.rightNonColorIndex - result.leftNonColorIndex + 1,
                                 result.bottomNonColorIndex - result.topNonColorIndex + 1);
    return [self crop:trimRect];
}

@end

Tested the code below in Xcode 6.3.1 using iOS 8.3 and LLVM 6.1 on an iPad 3 (armv7) and an iPhone 6 (arm64), and they produced the same value to at least 15 digits of precision.
unsigned int a = 42033;
unsigned int b = 360;
unsigned int c = 466;
double result = a / (double)(b * c);
// on arm64 -> result = 0.25055436337625181
// on armv7 -> result = 0.24986030696800732
NSString *msg = [NSString stringWithFormat:@"result: %.15f", result];
[[[UIAlertView alloc] initWithTitle:@"" message:msg delegate:nil cancelButtonTitle:@"#jolo" otherButtonTitles:nil] show];
That being said, Xcode 6.3 includes LLVM 6.1, and that includes changes for arm64 and floating-point math. See the Apple LLVM Compiler Version 6.1 section in the release notes:
https://developer.apple.com/library/ios/releasenotes/DeveloperTools/RN-Xcode/Chapters/xc6_release_notes.html
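
For reference, a minimal C check of the same arithmetic (assuming IEEE 754 doubles, which both armv7 and arm64 use): the quotient is fully determined by the standard, so a conforming toolchain must print the same digits on either architecture.

#include <stdio.h>

int main(void) {
    unsigned int a = 42033;
    unsigned int b = 360;
    unsigned int c = 466;
    double result = a / (double)(b * c);   /* 42033 / 167760.0 */
    printf("%.17f\n", result);             /* expected: 0.25055436337625181 */
    return 0;
}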

Java code:
public class Division {
    public static void main(String[] args) {
        int a = 42033;
        int b = 360;
        int c = 466;
        double result = a / (double)(b * c);
        System.out.println("Result = " + result);
        double result2 = (a - 1) / (double) (b * c);
        double result3 = (a) / (double) ((b + 1) * c);
        double result4 = (a) / (double) (b * (c + 1));
        System.out.println("Result2 = " + result2);
        System.out.println("Result3 = " + result3);
        System.out.println("Result4 = " + result4);
    }
}
Results:
C:\JavaTools>java Division
Result = 0.2505543633762518
Result2 = 0.250548402479733
Result3 = 0.24986030696800732
Result4 = 0.25001784439685937
As can be seen, the "wrong" result is explained by b having a value other than the one the OP stated: Result3, computed with b + 1 = 361, matches the armv7 output exactly. It has nothing to do with the precision of the arithmetic.

Related

WTV in OpenCV's OpenCL code for image resizing

What does WTV stand for in the following OpenCL code?
I can't find much information about it. The code is from OpenCV, for processing on the GPU.
__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         float ifx, float ify, __global const int * ofs_tab,
                         __global const int * map_tab, __global const float * alpha_tab)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if (dx < dst_cols && dy < dst_rows)
    {
        int dst_index = mad24(dy, dst_step, dst_offset);
        __global const int * xmap_tab = map_tab;
        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
        __global const float * xalpha_tab = alpha_tab;
        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
        __global const int * xofs_tab = ofs_tab;
        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);
        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];
        WTV sum = (WTV)(0), buf;
        int src_index = mad24(sy0, src_step, src_offset);
        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
        {
            WTV beta = (WTV)(yalpha_tab[yk]);
            buf = (WTV)(0);
            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
            {
                WTV alpha = (WTV)(xalpha_tab[xk]);
                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
            }
            sum += buf * beta;
        }
        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
    }
}
It is not defined in the source you shared. It appears to be a type, like float. Just guessing: it is defined with "-D WTV=something" when the kernel is compiled.
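To make that guess concrete, here is a minimal host-side sketch in C of how such a macro is typically injected at build time. The option values here are hypothetical (OpenCV assembles a similar string from the image depth and channel count, and loadpix/storepix/convertToT would be defined the same way):

#include <CL/cl.h>

/* Hypothetical: for a 4-channel uchar image, the accumulator "work type"
   WTV could be float4, and TSIZE the pixel size in bytes. */
cl_int buildResizeProgram(cl_program program, cl_device_id device)
{
    const char *options =
        "-D WTV=float4 -D TSIZE=4 "
        "-D convertToWTV=convert_float4 -D convertToT=convert_uchar4_sat_rte";
    return clBuildProgram(program, 1, &device, options, NULL, NULL);
}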

How to do real-time pitch shifting from mic with Superpowered? [duplicate]

This question already has answers here:
Superpowered: real time pitch shift with timestretcher not working
(2 answers)
Closed 5 years ago.
I'm trying to do real-time pitch shifting from the microphone using Superpowered. I looked at the example, but it works on a file, so I tried to adapt it. I managed to change the sound, but it came out very distorted, with interference. What am I doing wrong? Where can I find more information on Superpowered and time stretching?
static bool audioProcessing(void *clientdata,
                            float **buffers,
                            unsigned int inputChannels,
                            unsigned int outputChannels,
                            unsigned int numberOfSamples,
                            unsigned int samplerate,
                            uint64_t hostTime) {
    __unsafe_unretained Superpowered *self = (__bridge Superpowered *)clientdata;
    float tempBuffer[numberOfSamples * 2 + 16];
    SuperpoweredInterleave(buffers[0], buffers[1], tempBuffer, numberOfSamples);
    float *outputBuffer = tempBuffer;
    SuperpoweredAudiobufferlistElement inputBuffer;
    inputBuffer.samplePosition = 0;
    inputBuffer.startSample = 0;
    inputBuffer.samplesUsed = 0;
    inputBuffer.endSample = self->timeStretcher->numberOfInputSamplesNeeded;
    inputBuffer.buffers[0] = SuperpoweredAudiobufferPool::getBuffer(self->timeStretcher->numberOfInputSamplesNeeded * 8 + 64);
    inputBuffer.buffers[1] = inputBuffer.buffers[2] = inputBuffer.buffers[3] = NULL;
    memcpy((float *)inputBuffer.buffers[0], outputBuffer, numberOfSamples * 2 + 16);
    self->timeStretcher->process(&inputBuffer, self->outputBuffers);
    // Do we have some output?
    if (self->outputBuffers->makeSlice(0, self->outputBuffers->sampleLength)) {
        while (true) { // Iterate on every output slice.
            // Get pointer to the output samples.
            int sampleCount = 0;
            float *timeStretchedAudio = (float *)self->outputBuffers->nextSliceItem(&sampleCount);
            if (!timeStretchedAudio) break;
            SuperpoweredDeInterleave(timeStretchedAudio, buffers[0], buffers[1], numberOfSamples);
        }
        // Clear the output buffer list.
        self->outputBuffers->clear();
    }
    return true;
}
I did the following:
static bool audioProcessing(void *clientdata,
                            float **buffers,
                            unsigned int inputChannels,
                            unsigned int outputChannels,
                            unsigned int numberOfSamples,
                            unsigned int samplerate,
                            uint64_t hostTime) {
    __unsafe_unretained Superpowered *self = (__bridge Superpowered *)clientdata;
    SuperpoweredAudiobufferlistElement inputBuffer;
    inputBuffer.startSample = 0;
    inputBuffer.samplesUsed = 0;
    inputBuffer.endSample = numberOfSamples;
    inputBuffer.buffers[0] = SuperpoweredAudiobufferPool::getBuffer((unsigned int) (numberOfSamples * 8 + 64));
    inputBuffer.buffers[1] = inputBuffer.buffers[2] = inputBuffer.buffers[3] = NULL;
    SuperpoweredInterleave(buffers[0], buffers[1], (float *)inputBuffer.buffers[0], numberOfSamples);
    self->timeStretcher->process(&inputBuffer, self->outputBuffers);
    // Do we have some output?
    if (self->outputBuffers->makeSlice(0, self->outputBuffers->sampleLength)) {
        while (true) { // Iterate on every output slice.
            // Get pointer to the output samples.
            int numSamples = 0;
            float *timeStretchedAudio = (float *)self->outputBuffers->nextSliceItem(&numSamples);
            if (!timeStretchedAudio || *timeStretchedAudio == 0) {
                break;
            }
            SuperpoweredDeInterleave(timeStretchedAudio, buffers[0], buffers[1], numSamples);
        }
        // Clear the output buffer list.
        self->outputBuffers->clear();
    }
    return true;
}
This might not work correctly when the speed is changed as well, but I wanted live pitch shifting only; people should be able to speak slower or faster themselves.

ios: EXC_ARM_DA_ALIGN error in release build

I have a function in my application that stores data from a buffer. It works fine in debug mode, both on a device and in the simulator, but when I create an .ipa and run it on a device, I get an EXC_ARM_DA_ALIGN error in libstdc++.6.dylib std::string::_M_replace_safe(unsigned long, unsigned long, char const, unsigned long):
struct stMemoryBlock
{
    stMemoryBlock(void* InData, int InSize)
    {
        data = InData;
        size = InSize;
        offset = 0;
    };
    void* data;
    unsigned int size;
    unsigned int offset;
};
//-----------------------------------------------
char* cDataCollector::TestMemoryThink(char* Buffer, int BufferSize, int TestOffset, int TestSize)
{
    char* result = NULL;
    if (TestOffset + TestSize <= BufferSize)
    {
        result = &Buffer[TestOffset];
    }
    return result;
}
//-----------------------------------------------------
bool cDataCollector::StoreBinaryData(void* DataBuffer, int DataSize)
{
    bool result = false;
    char* InBuffer = (char *)DataBuffer;
    if (!mPreparedData && !mPreparedDataSize && !mMemoryMap.size())
    {
        unsigned int CountElements = 0;
        int offset = sizeof(unsigned int);
        if (DataSize >= sizeof(unsigned int))
        {
            // CountElements = *(unsigned int*)(&InBuffer[0]);
            memcpy(&CountElements, InBuffer, sizeof(CountElements));
        }
        result = true;
        for (unsigned int i = 0; (i < CountElements) && result; ++i)
        {
            std::string ThinkName;
            stMemoryBlock * MemoryBlock = NULL;
            result = result && TestMemoryThink(InBuffer, DataSize, offset, 0) != NULL;
            if (result)
            {
                size_t name_think_size = strlen(&InBuffer[offset]);
                char* think_name = TestMemoryThink(InBuffer, DataSize, offset, 0);
                result = result && (think_name != NULL);
                if (result)
                {
                    ThinkName = think_name;
                    offset += (name_think_size + 1);
                }
            }
This line causes the error:
ThinkName = think_name;
Maybe I need another way to read a string from a memory location that isn't word (32-bit) aligned? Please help!
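For what it's worth, the portable way to read a scalar from a possibly unaligned offset is the same memcpy pattern the code above already uses for CountElements. A minimal sketch (the helper name is illustrative, and this is not a verified fix for this particular crash):

#include <string.h>

/* Read a 32-bit value from an arbitrary buffer offset without
   dereferencing a potentially unaligned pointer; the compiler
   lowers the memcpy to alignment-safe loads on ARM. */
static unsigned int readUInt32Unaligned(const char *buffer, int offset)
{
    unsigned int value;
    memcpy(&value, buffer + offset, sizeof(value));
    return value;
}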

Mean image with two functions difference

I want to process an image so that each pixel value is the mean of its own value and its 4 neighbours.
I created two different functions:
Mat meanImage(cv::Mat& inputImage)
{
    Mat output;
    Mat kernel(3,3,CV_32F,0.0);
    kernel.at<float>(0,1) = 0.2;
    kernel.at<float>(1,0) = 0.2;
    kernel.at<float>(1,1) = 0.2;
    kernel.at<float>(1,2) = 0.2;
    kernel.at<float>(2,1) = 0.2;
    filter2D(inputImage,output,-1,kernel);
    return output;
}
and:
Mat meanImage2(Mat& inputImage)
{
    Mat temp;
    Mat output(inputImage.rows,inputImage.cols,inputImage.type());
    copyMakeBorder(inputImage,temp,1,1,1,1,BORDER_REPLICATE);
    CV_Assert(output.isContinuous());
    CV_Assert(temp.isContinuous());
    const int len = output.rows * output.cols * output.channels();
    const int rowLenTemp = temp.cols * temp.channels();
    const int twoRowLenTemp = 2 * rowLenTemp;
    const int rowLen = output.cols * output.channels();
    uchar* outPtr = output.ptr<uchar>(0);
    uchar* tempPtr = temp.ptr<uchar>(0);
    for(int i = 0; i < len; ++i)
    {
        const int a = 6 * (i / rowLen) + 3;
        outPtr[i] = (tempPtr[i+rowLenTemp+a] + tempPtr[i+a] +
                     tempPtr[i+rowLenTemp+a+3] + tempPtr[i+rowLenTemp+a-3] +
                     tempPtr[i+twoRowLenTemp+a]) / 5;
    }
    return output;
}
I assumed the results would be the same, so I compared the images:
Mat diff;
compare(meanImg1,meanImg2,diff,CMP_NE);
printf("Difference: %d\n",countNonZero(diff));
imshow("diff",diff);
And I get a lot of differences. What is the difference between these functions?
Edit:
Difference for the Lena test image (image omitted).
Beware that when you compute the sum of the pixels, you are adding unsigned chars, and you may overflow.
Test your code by casting these pixel values to int:
outPtr[i] = ((int)tempPtr[i+rowLenTemp+a] + (int)tempPtr[i+a] +
             (int)tempPtr[i+rowLenTemp+a+3] + (int)tempPtr[i+rowLenTemp+a-3] +
             (int)tempPtr[i+twoRowLenTemp+a]) / 5;
Edit: I'd rather code this as follows (assuming the image type is uchar and it has 3 channels):
for (int r = 0; r < output.rows; r++)
{
    uchar* previousRow = temp.ptr<uchar>(r) + 3;
    uchar* currentRow = temp.ptr<uchar>(r+1) + 3;
    uchar* nextRow = temp.ptr<uchar>(r+2) + 3;
    uchar* outRow = output.ptr<uchar>(r);
    for (int c = 0; c < 3*output.cols; c++)
    {
        int value = (int)previousRow[c] +
                    (int)currentRow[c-3] + (int)currentRow[c] + (int)currentRow[c+3] +
                    (int)nextRow[c];
        outRow[c] = value / 5;
    }
}

Why do operations with an array corrupt the values?

I'm trying to implement Particle Swarm Optimization on CUDA. I partially initialize data arrays on the host, then allocate memory on the device and copy the data there, and then try to proceed with the initialization.
The problem is, when I try to modify an array element like so
__global__ void kernelInit(
    float* X,
    size_t pitch,
    int width,
    float X_high,
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as the array elements. When I uncomment the last line, *pElement = (X_high - X_low) - X_low;, and comment out the previous one, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with addressing the specific array elements. I have read the CUDA manual on both of these topics, but I can't spot the error: I still get a corrupt array if I do anything with an element of the array. For example, *pElement = *pElement * 2 gives unreasonably big results like 779616...00000000.00000, when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of the arrays begins in main (at the bottom of the source); then the f1 function does the CUDA work and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;

typedef struct {
    unsigned k_max;
    float c1;
    float c2;
    unsigned p;
    float inertia_factor;
    float Ef;
    float X_low[f_n];
    float X_high[f_n];
    float X_min[n][f_n];
} params_t;

typedef void (*kernelWrapperType) (
    float *X,
    float *X_highVec,
    float *V,
    float *X_best,
    float *Y,
    float *Y_best,
    float *X_swarmBest,
    bool &termination,
    const float &inertia,
    const params_t *params,
    const unsigned &f
);

typedef float (*twoArgsFuncType) (
    float x1,
    float x2
);

__global__ void kernelInit(
    float* X,
    size_t pitch,
    int width,
    float X_high,
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}

__device__ float kernelF1(
    float x1,
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

void f1(
    float *X,
    float *X_highVec,
    float *V,
    float *X_best,
    float *Y,
    float *Y_best,
    float *X_swarmBest,
    bool &termination,
    const float &inertia,
    const params_t *params,
    const unsigned &f
) {
    float *X_d = NULL;
    float *Y_d = NULL;
    unsigned length = n * p;
    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
    size_t pitch;
    size_t dpitch;
    cudaError_t err;
    unsigned width = n;
    unsigned height = p;
    err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
    pitch = n * sizeof(float);
    err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
    err = cudaMalloc (&Y_d, sizeof(float) * p);
    err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
    dim3 threads; threads.x = 32;
    dim3 blocks; blocks.x = (length/threads.x) + 1;
    kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
    err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
    err = cudaFree(X_d);
    err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
    err = cudaFree(Y_d);
}

float F1(
    float x1,
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

/*
 * Generates random float in [0.0; 1.0]
 */
float frand(){
    return (float)rand()/(float)RAND_MAX;
}

/*
 * This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
 * brings the result vector back to host and dumps it on the console.
 */
int main() {
    const params_t params = {
        100,
        0.5,
        0.5,
        p,
        0.98,
        0.01,
        {-5.12, -2.048, -5.12},
        {5.12, 2.048, 5.12},
        {{0, 1, 0}, {0, 1, 0}}
    };
    float X[p][n];
    float X_highVec[n];
    float V[p][n];
    float X_best[p][n];
    float Y[p] = {0};
    float Y_best[p] = {0};
    float X_swarmBest[n];
    kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
    twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
    for (unsigned f = 0; f < f_n; f++) {
        printf("Optimizing function #%u\n", f);
        srand ( time(NULL) );
        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                X[i][j] = X_best[i][j] = frand();
        for (int i = 0; i < n; i++)
            X_highVec[i] = params.X_high[f];
        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                V[i][j] = frand();
        for (unsigned i = 0; i < p; i++)
            Y_best[i] = F[f](X[i][0], X[i][1]);
        for (unsigned i = 0; i < n; i++)
            X_swarmBest[i] = params.X_high[f];
        float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
        bool termination = false;
        float inertia = 1.;
        for (unsigned k = 0; k < params.k_max; k++) {
            F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
        }
        for (unsigned i = 0; i < p; i++)
        {
            for (unsigned j = 0; j < n; j++)
            {
                printf("%f\t", X[i][j]);
            }
            printf("F = %f\n", Y[i]);
        }
        getchar();
    }
}
Update: I tried adding error handling like so:
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
    fprintf(stderr, cudaGetErrorString(err));
    exit(1);
}
after each API call, but it printed nothing and never exited (I still get all the results, and the program runs to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p;
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks-per-grid and threads-per-block arguments in the kernel launch. That may well lead to out-of-bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck, which has shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind-style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
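As a sketch of what that error checking could look like (a common pattern, not the only one), with the kernel launch arguments in the corrected order:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Wrap every runtime API call. Kernel launches return no status, so check
   cudaGetLastError() and cudaDeviceSynchronize() after them instead. */
#define CUDA_CHECK(call)                                           \
    do {                                                           \
        cudaError_t e = (call);                                    \
        if (e != cudaSuccess) {                                    \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(e));                        \
            exit(1);                                               \
        }                                                          \
    } while (0)

/* Usage, blocks per grid first, threads per block second:
     unsigned threads = 32;
     unsigned blocks = (length + threads - 1) / threads;
     kernelInit<<<blocks, threads>>>(X_d, dpitch, width, X_high, X_low);
     CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
     CUDA_CHECK(cudaDeviceSynchronize());   // faults during kernel execution
*/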
As for the infinite values, that is to be expected, isn't it? Your code starts with values in (0,1) and then does
X[i] = X[i] * (5.12 - (-5.12)) - (-5.12)
100 times, which is roughly equivalent to multiplying by 10^100. That is followed by
X[i] = X[i] * (2.048 - (-2.048)) - (-2.048)
100 times, which is roughly equivalent to multiplying by 4^100, and finally followed by
X[i] = X[i] * (5.12 - (-5.12)) - (-5.12)
again. So your results should be of the order of 1E250, which is much larger than 3.4E38, the rough upper limit of representable numbers in IEEE 754 single precision.
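A quick C check of that estimate, running just the first function's 100 iterations on an arbitrary starting value in (0,1):

#include <stdio.h>

int main(void) {
    float x = 0.5f;                              /* arbitrary start in (0,1) */
    for (int k = 0; k < 100; k++)
        x = x * (5.12f - (-5.12f)) - (-5.12f);   /* grows roughly 10x per pass */
    printf("%f\n", x);                           /* overflows float: prints inf,
                                                    shown as 1.#INF00 by MSVC */
    return 0;
}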
