Does the speed of ldp instruction depend on caching?

Does the speed of ldp instruction depend on caching? - arm64

I created an annotated trace with perf. For some reason, the instructions with the highest percentages are of type ldp:
5.56 │ ldp s9, s8, [x28,#80]
37.65 │ ldp s2, s3, [x19,#24]
37.65 │ ldp s2, s3, [x19,#24]
Is this a mistake? Or does it hint at poor L1/L2 cache behavior, because the speed of ldp depends on the cache?
Edit:
Here is an excerpt from an annotated output of perf. The excerpt is from a glm matrix multiplication my function uses:
│ _ZN3glmmlIdLNS_9precisionE0EEENS_7tmat4x4IT_XT0_EE8row_typeERKNS4_8col_typeERKS4_(): ▒
│ ) ▒
│ { ▒ ▒
│ ▒
│ return typename tmat4x4<T, P>::row_type( ◆
│ m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], ▒
28.86 │ 40: ldp d3, d0, [x0] ▒
│ add x4, x4, #0x20 ▒
│ ldp d18, d10, [x1] ▒
│ m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3], ▒
0.34 │ ldp d21, d24, [x1,#32] ▒
│ m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3], ▒
0.34 │ ldp d20, d23, [x1,#64] ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
0.34 │ ldp d19, d22, [x1,#96] ▒
│ m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3], ▒
│ fmul d24, d0, d24 ▒
│ m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], ▒
10.07 │ ldp d2, d1, [x0,#16] ▒
│ m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3], ▒
│ fmul d23, d0, d23 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
0.34 │ ldp d8, d6, [x1,#16] ▒
│ fmul d22, d0, d22 ▒
0.34 │ ldp d17, d7, [x1,#48] ▒
│ m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], ▒
│ fmul d0, d0, d10 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
│ ldp d16, d5, [x1,#80] ▒
│ m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3], ▒
│ fmadd d21, d3, d21, d24 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
0.34 │ ldp d9, d4, [x1,#112] ▒
│ m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3], ▒
│ fmadd d20, d3, d20, d23 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
│ fmadd d19, d3, d19, d22 ▒
│ m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], ▒
│ fmadd d0, d18, d3, d0 ▒
│ m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3], ▒
0.67 │ fmadd d17, d2, d17, d21 ▒
│ m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3], ▒
│ fmadd d16, d2, d16, d20 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
│ fmadd d7, d1, d7, d17 ▒
│ fmadd d5, d1, d5, d16 ▒
1.01 │ fmadd d9, d2, d9, d19 ▒
│ m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], ▒
│ fmadd d2, d8, d2, d0 ▒
│ m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); ▒
│ fmadd d4, d1, d4, d9 ▒
│ fmadd d1, d6, d1, d2 ▒
0.67 │ add x0, x0, #0x20
Regards
Edit: I added the source code of the current version of the method, which produces ~2500 cache misses on 10000 input elements. Maybe someone still has ideas:
// Combines `matrix` with every element of `input` using NEON intrinsics and
// stores each 4-float result at the same position in `output`.
// The loop is bounded by `input` only and never compares against
// output.end(), so `output` must already hold at least input.size() elements.
inline void
transformVector ( glm::mat4 const & matrix,
std::vector < glm::vec4 > const & input,
std::vector < glm::vec4 > & output )
{
// Reinterpret the glm matrix storage as four 4-float NEON vectors (no conversion).
float32x4x4_t iMatrix = *(float32x4x4_t *)&matrix;
float32x4_t rslt;
std::vector < glm::vec4 >::const_iterator inVertexStart = input.begin();
std::vector < glm::vec4 >::const_iterator inVertexEnd = input.end();
std::vector < glm::vec4 >::iterator outVertexStart = output.begin();
// Input and output iterators advance in lockstep, one vec4 per iteration.
for ( ; inVertexStart != inVertexEnd; inVertexStart++, outVertexStart++ )
{
// Reinterpret the current glm::vec4 as one 4-float NEON vector.
const float32x4_t input_local = *( float32x4_t const * )&(*inVertexStart);
// NOTE(review): each iMatrix.val[i] is multiplied lane-by-lane with the whole
// input vector and the four products are accumulated; no single input
// component is broadcast. Confirm this is the intended product, since a
// matrix * vector transform would normally scale column i by component i.
rslt = vmulq_f32( iMatrix.val[0], input_local);
rslt = vmlaq_f32(rslt, iMatrix.val[1], input_local);
rslt = vmlaq_f32(rslt, iMatrix.val[2], input_local);
rslt = vmlaq_f32(rslt, iMatrix.val[3], input_local);
// Write the four accumulated floats into the current output element.
vst1q_f32( (float32_t*)&( *outVertexStart ), rslt);
}
}
ASM:
0x0000000000400fb0 <+0>: ldr x3, [x1,#8]
0x0000000000400fb4 <+4>: ldr x1, [x1]
0x0000000000400fb8 <+8>: ldr q5, [x0]
0x0000000000400fbc <+12>: cmp x1, x3
0x0000000000400fc0 <+16>: ldr q4, [x0,#16]
0x0000000000400fc4 <+20>: ldr q3, [x0,#32]
0x0000000000400fc8 <+24>: ldr q2, [x0,#48]
0x0000000000400fcc <+28>: ldr x2, [x2]
0x0000000000400fd0 <+32>: b.eq 0x400ff4 <transformVector(glm::tmat4x4<float, (glm::precision)0> const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > > const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > >&)+68>
0x0000000000400fd4 <+36>: ldr q1, [x1],#16
0x0000000000400fd8 <+40>: fmul v0.4s, v1.4s, v4.4s
0x0000000000400fdc <+44>: cmp x3, x1
0x0000000000400fe0 <+48>: fmla v0.4s, v1.4s, v5.4s
0x0000000000400fe4 <+52>: fmla v0.4s, v3.4s, v1.4s
0x0000000000400fe8 <+56>: fmla v0.4s, v1.4s, v2.4s
0x0000000000400fec <+60>: str q0, [x2],#16
0x0000000000400ff0 <+64>: b.ne 0x400fd4 <transformVector(glm::tmat4x4<float, (glm::precision)0> const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > > const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > >&)+36>
0x0000000000400ff4 <+68>: ret

Related

Why is there a performance penalty for nested subroutines in Delphi?

A static analyzer we use has a report that says:
Subprograms with local subprograms (OPTI7)
This section lists subprograms that themselves have local subprograms.
Especially when these subprograms share local variables, it can have a
negative effect on performance.
This guide says:
Do not use nested routines Nested routines (routines within other
routines; also known as "local procedures") require some special stack
manipulation so that the variables of the outer routine can be seen by
the inner routine. This results in a good bit of overhead. Instead of
nesting, move the procedure to the unit scoping level and pass the
necessary variables - if necessary by reference (use the var keyword)
- or make the variable global at the unit scope.
We were interested in knowing if we should take this report into consideration when validating our code. The answers to this question suggest that one should profile one's application to see if there is any performance difference, but not much is said about the difference between nested routines and normal subroutines.
What is the actual difference between nested routines and normal routines and how may it cause a performance penalty?

tl;dr
There are extra push/pops for nested subroutines
Turning on optimizations may strip those away, such that the generated code is the same for both nested subroutines and normal subroutines
Inlining results in the same code being generated for both nested and normal subroutines
For simple routines with few parameters and local variables we perceived no performance difference even with optimizations turned off
I wrote a little test to determine this, where GetRTClock is measuring the current time with a precision of 1ns:
// Benchmark variant that calls a NESTED helper (subprogram_aux).
// Returns the difference between two GetRTClock readings taken around the
// computation of n and the call to the helper.
function subprogram_main(z : Integer) : Int64;
var
n : Integer;
s : Int64;
// Nested helper: walks i over 0..n-1, adding i to z when i > z and
// subtracting it otherwise, then returns the final z. It receives n and z as
// its own parameters, so it does not touch the enclosing routine's locals.
function subprogram_aux(n, z : Integer) : Integer;
var
i : Integer;
begin
// Do some useless work on the aux program
for i := 0 to n - 1 do begin
if (i > z) then
z := z + i
else
z := z - i;
end;
Result := z;
end;
begin
// Start of the timed section
s := GetRTClock;
// Do some minor work on the main program
// (z truncated to a multiple of 100, plus 100, becomes the iteration count)
n := z div 100 * 100 + 100;
// Call the aux program
z := subprogram_aux(n, z);
// Elapsed time since s, in GetRTClock units
Result := GetRTClock - s;
end;
// Non-nested helper with the same body as the nested subprogram_aux:
// walks i over 0..n-1, adding i to z when i > z and subtracting it otherwise,
// then returns the final z.
function normal_aux(n, z : Integer) : Integer;
var
i : Integer;
begin
// Do some useless work on the aux program
for i := 0 to n - 1 do begin
if (i > z) then
z := z + i
else
z := z - i;
end;
Result := z;
end;
// Benchmark variant that calls the NON-nested helper (normal_aux).
// Returns the difference between two GetRTClock readings taken around the
// computation of n and the call to the helper.
function normal_main(z : Integer) : Int64;
var
n : Integer;
s : Int64;
begin
// Start of the timed section
s := GetRTClock;
// Do some minor work on the main program
// (z truncated to a multiple of 100, plus 100, becomes the iteration count)
n := z div 100 * 100 + 100;
// Call the aux program
z := normal_aux(n, z);
// Elapsed time since s, in GetRTClock units
Result := GetRTClock - s;
end;
This compiles to:
subprogram_main
MyFormU.pas.41: begin
005CE7D0 55 push ebp
005CE7D1 8BEC mov ebp,esp
005CE7D3 83C4E0 add esp,-$20
005CE7D6 8945FC mov [ebp-$04],eax
MyFormU.pas.42: s := GetRTClock;
...
MyFormU.pas.45: n := z div 100 * 100 + 100;
...
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7F8 55 push ebp
005CE7F9 8B55FC mov edx,[ebp-$04]
005CE7FC 8B45EC mov eax,[ebp-$14]
005CE7FF E880FFFFFF call subprogram_aux
005CE804 59 pop ecx
005CE805 8945FC mov [ebp-$04],eax
MyFormU.pas.49: Result := GetRTClock - s;
...
normal_main
MyFormU.pas.70: begin
005CE870 55 push ebp
005CE871 8BEC mov ebp,esp
005CE873 83C4E0 add esp,-$20
005CE876 8945FC mov [ebp-$04],eax
MyFormU.pas.71: s := GetRTClock;
...
MyFormU.pas.74: n := z div 100 * 100 + 100;
...
MyFormU.pas.76: z := normal_aux(n, z);
005CE898 8B55FC mov edx,[ebp-$04]
005CE89B 8B45EC mov eax,[ebp-$14]
005CE89E E881FFFFFF call normal_aux
005CE8A3 8945FC mov [ebp-$04],eax
MyFormU.pas.78: Result := GetRTClock - s;
...
subprogram_aux:
MyFormU.pas.31: begin
005CE784 55 push ebp
005CE785 8BEC mov ebp,esp
005CE787 83C4EC add esp,-$14
005CE78A 8955F8 mov [ebp-$08],edx
005CE78D 8945FC mov [ebp-$04],eax
MyFormU.pas.33: for i := 0 to n - 1 do begin
005CE790 8B45FC mov eax,[ebp-$04]
005CE793 48 dec eax
005CE794 85C0 test eax,eax
005CE796 7C29 jl $005ce7c1
005CE798 40 inc eax
005CE799 8945EC mov [ebp-$14],eax
005CE79C C745F000000000 mov [ebp-$10],$00000000
MyFormU.pas.34: if (i > z) then
005CE7A3 8B45F0 mov eax,[ebp-$10]
005CE7A6 3B45F8 cmp eax,[ebp-$08]
005CE7A9 7E08 jle $005ce7b3
MyFormU.pas.35: z := z + i
005CE7AB 8B45F0 mov eax,[ebp-$10]
005CE7AE 0145F8 add [ebp-$08],eax
005CE7B1 EB06 jmp $005ce7b9
MyFormU.pas.37: z := z - i;
005CE7B3 8B45F0 mov eax,[ebp-$10]
005CE7B6 2945F8 sub [ebp-$08],eax
normal_aux:
MyFormU.pas.55: begin
005CE824 55 push ebp
005CE825 8BEC mov ebp,esp
005CE827 83C4EC add esp,-$14
005CE82A 8955F8 mov [ebp-$08],edx
005CE82D 8945FC mov [ebp-$04],eax
MyFormU.pas.57: for i := 0 to n - 1 do begin
005CE830 8B45FC mov eax,[ebp-$04]
005CE833 48 dec eax
005CE834 85C0 test eax,eax
005CE836 7C29 jl $005ce861
005CE838 40 inc eax
005CE839 8945EC mov [ebp-$14],eax
005CE83C C745F000000000 mov [ebp-$10],$00000000
MyFormU.pas.58: if (i > z) then
005CE843 8B45F0 mov eax,[ebp-$10]
005CE846 3B45F8 cmp eax,[ebp-$08]
005CE849 7E08 jle $005ce853
MyFormU.pas.59: z := z + i
005CE84B 8B45F0 mov eax,[ebp-$10]
005CE84E 0145F8 add [ebp-$08],eax
005CE851 EB06 jmp $005ce859
MyFormU.pas.61: z := z - i;
005CE853 8B45F0 mov eax,[ebp-$10]
005CE856 2945F8 sub [ebp-$08],eax
The only difference is one push and one pop. What happens if we turn on optimizations?
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7C5 8BD3 mov edx,ebx
005CE7C7 8BC6 mov eax,esi
005CE7C9 E8B6FFFFFF call subprogram_aux
MyFormU.pas.76: z := normal_aux(n, z);
005CE82D 8BD3 mov edx,ebx
005CE82F 8BC6 mov eax,esi
005CE831 E8B6FFFFFF call normal_aux
Both compile exactly to the same thing.
What happens when inlining?
MyFormU.pas.76: z := normal_aux(n, z);
005CE804 8BD3 mov edx,ebx
005CE806 8BC8 mov ecx,eax
005CE808 49 dec ecx
005CE809 85C9 test ecx,ecx
005CE80B 7C11 jl $005ce81e
005CE80D 41 inc ecx
005CE80E 33C0 xor eax,eax
005CE810 3BD0 cmp edx,eax
005CE812 7D04 jnl $005ce818
005CE814 03D0 add edx,eax
005CE816 EB02 jmp $005ce81a
005CE818 2BD0 sub edx,eax
005CE81A 40 inc eax
005CE81B 49 dec ecx
005CE81C 75F2 jnz $005ce810
subprogram_main:
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7A8 8BD3 mov edx,ebx
005CE7AA 8BC8 mov ecx,eax
005CE7AC 49 dec ecx
005CE7AD 85C9 test ecx,ecx
005CE7AF 7C11 jl $005ce7c2
005CE7B1 41 inc ecx
005CE7B2 33C0 xor eax,eax
005CE7B4 3BD0 cmp edx,eax
005CE7B6 7D04 jnl $005ce7bc
005CE7B8 03D0 add edx,eax
005CE7BA EB02 jmp $005ce7be
005CE7BC 2BD0 sub edx,eax
005CE7BE 40 inc eax
005CE7BF 49 dec ecx
005CE7C0 75F2 jnz $005ce7b4
Again, no difference.
I also profiled this little example, taking an average of 30 executions for each (normal and subprogram), called in random order:
// Runs the two benchmark variants in a fixed, pre-shuffled order and writes
// the per-variant averages to the debugger output.
constructor TForm1.Create(AOwner: TComponent);
const
  c_nSamples = 60;
  // 1 selects subprogram_main, 0 selects normal_main.
  rnd_sample : array[0..c_nSamples - 1] of byte = (1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0);
var
  nestedTotalNs : Int64;
  plainTotalNs : Int64;
  benchInput : Integer;
  idx : Integer;
begin
  inherited Create(AOwner);
  plainTotalNs := 0;
  nestedTotalNs := 0;
  // The same random input is used for every run of both variants.
  benchInput := Random(1000);
  idx := 0;
  while idx < c_nSamples do
  begin
    if rnd_sample[idx] <> 1 then
      plainTotalNs := plainTotalNs + normal_main(benchInput)
    else
      nestedTotalNs := nestedTotalNs + subprogram_main(benchInput);
    Inc(idx);
  end;
  // Each total is divided by 30 to report an average per variant.
  OutputDebugString(PChar(' Normal ' + FloatToStr(plainTotalNs / 30) + ' Subprogram ' + FloatToStr(nestedTotalNs / 30)));
end;
There is no significant difference even with optimizations turned off:
Debug Output: Normal 1166,66666666667 Subprogram 1203,33333333333 Process MyProject.exe (1824)
Finally, both texts that warn about performance mention something about shared local variables.
If we do not pass z to subprogram_aux, instead access it directly, we get:
MyFormU.pas.47: z := subprogram_aux(n);
005CE7D2 55 push ebp
005CE7D3 8BC3 mov eax,ebx
005CE7D5 E8AAFFFFFF call subprogram_aux
005CE7DA 59 pop ecx
005CE7DB 8945FC mov [ebp-$04],eax
The extra push/pop of ebp around the call remains even with optimizations turned on.

Optimizing SIMD histogram calculation

I worked on code that implements a histogram calculation, given an OpenCV IplImage * struct and an unsigned int * buffer for the histogram. I'm still new to SIMD, so I might not be taking advantage of the full potential the instruction set provides.
; Builds a histogram of the image's byte values: for every byte of image data,
; increments the dword counter at Hist + 4 * value.
; The image is walked from its end toward its start, 16 bytes per packet.
; NOTE(review): the outer loop only stops when rdx reaches exactly 0, so this
; appears to require a non-zero image size that is a multiple of 16 - confirm.
; NOTE(review): rbx is overwritten and not restored in the code shown; it is a
; callee-saved register in the common x86-64 calling conventions - confirm
; against the caller.
histogramASM:
xor rdx, rdx
xor rax, rax
mov eax, dword [imgPtr + imgWidthOffset]
mov edx, dword [imgPtr + imgHeightOffset]
; rax = width * height (mul also overwrites rdx, which is reloaded just below)
mul rdx
mov rdx, rax ; rdx = Image Size
mov r10, qword [imgPtr + imgDataOffset] ; r10 = ImgData
NextPacket:
mov rax, rdx
; Unaligned load of the 16 bytes that end at offset rdx
movdqu xmm0, [r10 + rax - 16]
mov rcx,16 ; 16 pixels per packet
PacketLoop:
pextrb rbx, xmm0, 0 ; saving the pixel value on rbx
; Scale the value by 4 to index the dword counters
shl rbx,2
inc dword [rbx + Hist]
; Shift the packet right by one byte so the next pixel lands in byte 0
psrldq xmm0,1
; Decrement rcx and repeat until all 16 bytes are consumed
loop PacketLoop
; Step back to the previous 16-byte packet
sub rdx,16
cmp rdx,0
jnz NextPacket
ret
In C, I'd run this piece of code to obtain the same result.
imgSize = (img->width)*(img->height);
pixelData = (unsigned char *) img->imageData;
for(i = 0; i < imgSize; i++)
{
pixel = *pixelData;
hist[pixel]++;
pixelData++;
}
But when I time both on my computer with rdtsc(), the SIMD assembly version is only 1.5 times faster. Is there a way to optimize the code above and quickly fill the histogram vector with SIMD?
Thanks in advance

Like Jester I'm surprised that your SIMD code had any significant improvement. Did you compile the C code with optimization turned on?
The one additional suggestion I can make is to unroll your PacketLoop loop. This is a fairly simple optimization and reduces the number of instructions per "iteration" to just two:
; Unrolled replacement for the PacketLoop body: each pixel needs only an
; extract and an increment. The byte index is an immediate operand and the
; *4 scaling is folded into the addressing mode.
; NOTE(review): this listing addresses through ebx, while the %rep variant
; uses rbx - confirm which register width is intended for the target.
pextrb ebx, xmm0, 0
inc dword [ebx * 4 + Hist]
pextrb ebx, xmm0, 1
inc dword [ebx * 4 + Hist]
pextrb ebx, xmm0, 2
inc dword [ebx * 4 + Hist]
; (the pairs for byte indices 3..14 are elided here)
...
pextrb ebx, xmm0, 15
inc dword [ebx * 4 + Hist]
If you're using NASM you can use the %rep directive to save some typing:
; Emit the extract/increment pair 16 times at assembly time; the preprocessor
; variable `pixel` supplies the byte index and steps from 0 to 15.
%assign pixel 0
%rep 16
pextrb rbx, xmm0, pixel
inc dword [rbx * 4 + Hist]
%assign pixel pixel + 1
%endrep

iOS cordova build throwing bad access exception immediately after launch, majority of the time

Seen similar issues to this on the net, but no solutions. It seems to have cropped up with iOS 8. With NSZombie turned on, I see the error:
[UIViewAnimationState release]: message sent to deallocated instance 0x1701c9d80
Then the stack of memory addresses in the webthread:
CoreFoundation`___forwarding___:
0x18708fdb4 <+0>: stp x26, x25, [sp, #-80]!
0x18708fdb8 <+4>: stp x24, x23, [sp, #16]
0x18708fdbc <+8>: stp x22, x21, [sp, #32]
0x18708fdc0 <+12>: stp x20, x19, [sp, #48]
0x18708fdc4 <+16>: stp x29, x30, [sp, #64]
0x18708fdc8 <+20>: add x29, sp, #64
0x18708fdcc <+24>: sub sp, sp, #32
0x18708fdd0 <+28>: mov x22, x1
0x18708fdd4 <+32>: mov x20, x0
0x18708fdd8 <+36>: ldr x19, [x20]
0x18708fddc <+40>: and x8, x19, #0x7000000000000000
0x18708fde0 <+44>: cmp x19, #0
0x18708fde4 <+48>: ccmp x8, #0, #0, lt
0x18708fde8 <+52>: b.eq 0x18708ffe8 ; <+564>
0x18708fdec <+56>: ldr x21, [x20, #8]
0x18708fdf0 <+60>: mov x0, x19
0x18708fdf4 <+64>: bl 0x1870f2f58 ; symbol stub for: +[NSOrderedSet orderedSetWithOrderedSet:]
0x18708fdf8 <+68>: mov x23, x0
0x18708fdfc <+72>: bl 0x1870f2db4 ; symbol stub for: __destroy_helper_block_167
0x18708fe00 <+76>: mov x24, x0
0x18708fe04 <+80>: adrp x8, 74051
0x18708fe08 <+84>: nop
0x18708fe0c <+88>: ldr x25, [x8, #384]
0x18708fe10 <+92>: mov x0, x23
0x18708fe14 <+96>: mov x1, x25
0x18708fe18 <+100>: bl 0x1870f2dd8 ; symbol stub for: __destroy_helper_block_171
0x18708fe1c <+104>: cbz w0, 0x18708fe58 ; <+164>
0x18708fe20 <+108>: mov x0, x19
0x18708fe24 <+112>: mov x1, x25
0x18708fe28 <+116>: mov x2, x21
0x18708fe2c <+120>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x18708fe30 <+124>: cmp x0, #0
0x18708fe34 <+128>: ccmp x0, x19, #4, ne
0x18708fe38 <+132>: b.eq 0x18708fe58 ; <+164>
0x18708fe3c <+136>: and x8, x0, #0x7000000000000000
0x18708fe40 <+140>: cmp x0, #0
0x18708fe44 <+144>: ccmp x8, #0, #0, lt
0x18708fe48 <+148>: b.eq 0x18708ffe4 ; <+560>
0x18708fe4c <+152>: str x0, [x20]
0x18708fe50 <+156>: movz x20, #0
0x18708fe54 <+160>: b 0x187090024 ; <+624>
0x18708fe58 <+164>: adrp x1, 514
0x18708fe5c <+168>: add x1, x1, #2751
0x18708fe60 <+172>: movz x2, #0xa
0x18708fe64 <+176>: mov x0, x24
0x18708fe68 <+180>: bl 0x1870f3dec ; symbol stub for: __destroy_helper_block_467
0x18708fe6c <+184>: cbz w0, 0x187090044 ; <+656>
0x18708fe70 <+188>: adrp x8, 74051
0x18708fe74 <+192>: nop
0x18708fe78 <+196>: ldr x24, [x8, #328]
0x18708fe7c <+200>: mov x0, x23
0x18708fe80 <+204>: mov x1, x24
0x18708fe84 <+208>: bl 0x1870f2dd8 ; symbol stub for: __destroy_helper_block_171
0x18708fe88 <+212>: tbz w0, #0, 0x187090094 ; <+736>
0x18708fe8c <+216>: mov x0, x19
0x18708fe90 <+220>: mov x1, x24
0x18708fe94 <+224>: mov x2, x21
0x18708fe98 <+228>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x18708fe9c <+232>: mov x24, x0
0x18708fea0 <+236>: cbz x24, 0x1870900e8 ; <+820>
0x18708fea4 <+240>: adrp x8, 74051
0x18708fea8 <+244>: nop
0x18708feac <+248>: ldr x1, [x8, #280]
0x18708feb0 <+252>: mov x0, x24
0x18708feb4 <+256>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x18708feb8 <+260>: mov x23, x0
0x18708febc <+264>: ldr x8, [x23]
0x18708fec0 <+268>: ldrh w8, [x8, #34]
0x18708fec4 <+272>: ubfx x8, x8, #6, #1
0x18708fec8 <+276>: cmp x8, x22
0x18708fecc <+280>: b.eq 0x18708ff18 ; <+356>
0x18708fed0 <+284>: mov x0, x21
0x18708fed4 <+288>: bl 0x1870f2fa0 ; symbol stub for: -[NSMutableOrderedSet setObject:atIndex:]
0x18708fed8 <+292>: ldr x8, [x23]
0x18708fedc <+296>: ldrh w8, [x8, #34]
0x18708fee0 <+300>: adrp x9, 535
0x18708fee4 <+304>: add x9, x9, #4044
0x18708fee8 <+308>: tst w8, #0x40
0x18708feec <+312>: adrp x8, 506
0x18708fef0 <+316>: add x8, x8, #1640
0x18708fef4 <+320>: csel x10, x8, x9, ne
0x18708fef8 <+324>: cmp x22, #0
0x18708fefc <+328>: csel x8, x8, x9, ne
0x18708ff00 <+332>: str x8, [sp, #16]
0x18708ff04 <+336>: stp x0, x10, [sp]
0x18708ff08 <+340>: adrp x1, 74021
0x18708ff0c <+344>: add x1, x1, #3760
0x18708ff10 <+348>: orr w0, wzr, #0x4
0x18708ff14 <+352>: bl 0x187071f88 ; CFLog
0x18708ff18 <+356>: adrp x8, 74051
0x18708ff1c <+360>: ldr x0, [x8, #2360]
0x18708ff20 <+364>: nop
0x18708ff24 <+368>: nop
0x18708ff28 <+372>: ldr x1, [x8, #344]
0x18708ff2c <+376>: mov x2, x24
0x18708ff30 <+380>: mov x3, x20
0x18708ff34 <+384>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x18708ff38 <+388>: mov x21, x0
0x18708ff3c <+392>: mov x0, x19
0x18708ff40 <+396>: bl 0x1870f2f58 ; symbol stub for: +[NSOrderedSet orderedSetWithOrderedSet:]
0x18708ff44 <+400>: adrp x8, 74051
0x18708ff48 <+404>: nop
0x18708ff4c <+408>: ldr x22, [x8, #392]
0x18708ff50 <+412>: mov x1, x22
0x18708ff54 <+416>: bl 0x1870f2dd8 ; symbol stub for: __destroy_helper_block_171
0x18708ff58 <+420>: cbz w0, 0x18708ff70 ; <+444>
0x18708ff5c <+424>: mov x0, x19
0x18708ff60 <+428>: mov x1, x22
0x18708ff64 <+432>: mov x2, x21
0x18708ff68 <+436>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x18708ff6c <+440>: b 0x18708ff8c ; <+472>
0x18708ff70 <+444>: mov x0, x19
0x18708ff74 <+448>: bl 0x1870f2f64 ; symbol stub for: +[NSOrderedSet orderedSetWithSet:copyItems:]
0x18708ff78 <+452>: stp x19, x0, [sp]
0x18708ff7c <+456>: adrp x1, 74021
0x18708ff80 <+460>: add x1, x1, #3792
0x18708ff84 <+464>: orr w0, wzr, #0x4
0x18708ff88 <+468>: bl 0x187071f88 ; CFLog
0x18708ff8c <+472>: adrp x8, 74051
0x18708ff90 <+476>: ldrsw x8, [x8, #3972]
0x18708ff94 <+480>: ldrb w8, [x21, x8]
0x18708ff98 <+484>: cbz w8, 0x18708ffd4 ; <+544>
0x18708ff9c <+488>: ldr x8, [x23]
0x18708ffa0 <+492>: ldrb w9, [x8, #34]
0x18708ffa4 <+496>: tbz w9, #7, 0x18708ffd4 ; <+544>
0x18708ffa8 <+500>: adrp x9, 74051
0x18708ffac <+504>: ldrsw x9, [x9, #3964]
0x18708ffb0 <+508>: ldr x9, [x21, x9]
0x18708ffb4 <+512>: ldr w10, [x8, #28]
0x18708ffb8 <+516>: ldrb w11, [x8, #32]
0x18708ffbc <+520>: add x10, x11, x10
0x18708ffc0 <+524>: ldr x0, [x20, x10]
0x18708ffc4 <+528>: ldr x1, [x9, x10]
0x18708ffc8 <+532>: ldr x8, [x8]
0x18708ffcc <+536>: ldr w2, [x8, #16]
0x18708ffd0 <+540>: bl 0x1870f39fc ; symbol stub for: __NSOrderedSetEnumerate
0x18708ffd4 <+544>: adrp x8, 74051
0x18708ffd8 <+548>: ldrsw x8, [x8, #3960]
0x18708ffdc <+552>: ldr x20, [x21, x8]
0x18708ffe0 <+556>: b 0x187090024 ; <+624>
0x18708ffe4 <+560>: mov x19, x0
0x18708ffe8 <+564>: mov x0, x19
0x18708ffec <+568>: bl 0x18709399c ; getAtomTarget
0x18708fff0 <+572>: mov x21, x0
0x18708fff4 <+576>: str x21, [x20]
0x18708fff8 <+580>: adrp x0, 73983
0x18708fffc <+584>: ldr x0, [x0, #384]
0x187090000 <+588>: orr x3, xzr, #0x400
0x187090004 <+592>: mov x1, x20
0x187090008 <+596>: mov x2, x20
0x18709000c <+600>: movz w4, #0
0x187090010 <+604>: bl 0x187092070 ; __invoking___
0x187090014 <+608>: ldr x8, [x20]
0x187090018 <+612>: cmp x8, x21
0x18709001c <+616>: b.ne 0x187090024 ; <+624>
0x187090020 <+620>: str x19, [x20]
0x187090024 <+624>: mov x0, x20
0x187090028 <+628>: sub sp, x29, #64
0x18709002c <+632>: ldp x29, x30, [sp, #64]
0x187090030 <+636>: ldp x20, x19, [sp, #48]
0x187090034 <+640>: ldp x22, x21, [sp, #32]
0x187090038 <+644>: ldp x24, x23, [sp, #16]
0x18709003c <+648>: ldp x26, x25, [sp], #80
0x187090040 <+652>: ret
0x187090044 <+656>: adrp x8, 74066
0x187090048 <+660>: add x8, x8, #288
0x18709004c <+664>: ldrb w8, [x8]
0x187090050 <+668>: cbz w8, 0x18709006c ; <+696>
0x187090054 <+672>: movz x2, #0
0x187090058 <+676>: movz w0, #0x15
0x18709005c <+680>: mov x1, x19
0x187090060 <+684>: movz x3, #0
0x187090064 <+688>: movz x4, #0
0x187090068 <+692>: bl 0x187045204 ; __CFRecordAllocationEvent
0x18709006c <+696>: add x20, x24, #10
0x187090070 <+700>: mov x0, x21
0x187090074 <+704>: bl 0x1870f2fa0 ; symbol stub for: -[NSMutableOrderedSet setObject:atIndex:]
0x187090078 <+708>: mov x8, x0
0x18709007c <+712>: stp x8, x19, [sp, #8]
0x187090080 <+716>: str x20, [sp]
0x187090084 <+720>: adrp x8, 74020
0x187090088 <+724>: add x8, x8, #3600
0x18709008c <+728>: orr w0, wzr, #0x3
0x187090090 <+732>: b 0x187090174 ; <+960>
0x187090094 <+736>: mov x0, x23
0x187090098 <+740>: bl 0x1870f2dc0 ; symbol stub for: ___CFPreferencesRemoveSuitePreferencesFromAppWithContainer_block_invoke169
0x18709009c <+744>: mov x22, x0
0x1870900a0 <+748>: mov x0, x19
0x1870900a4 <+752>: bl 0x1870f2f64 ; symbol stub for: +[NSOrderedSet orderedSetWithSet:copyItems:]
0x1870900a8 <+756>: mov x20, x0
0x1870900ac <+760>: cbnz x22, 0x1870900d0 ; <+796>
0x1870900b0 <+764>: mov x0, x19
0x1870900b4 <+768>: bl 0x1870f2f64 ; symbol stub for: +[NSOrderedSet orderedSetWithSet:copyItems:]
0x1870900b8 <+772>: mov x8, x0
0x1870900bc <+776>: stp x20, x8, [sp, #8]
0x1870900c0 <+780>: str x19, [sp]
0x1870900c4 <+784>: adrp x8, 74020
0x1870900c8 <+788>: add x8, x8, #3632
0x1870900cc <+792>: b 0x1870900dc ; <+808>
0x1870900d0 <+796>: stp x19, x20, [sp]
0x1870900d4 <+800>: adrp x8, 74020
0x1870900d8 <+804>: add x8, x8, #3664
0x1870900dc <+808>: orr w0, wzr, #0x4
0x1870900e0 <+812>: mov x1, x8
0x1870900e4 <+816>: bl 0x187071f88 ; CFLog
0x1870900e8 <+820>: mov x0, x21
0x1870900ec <+824>: bl 0x1870f2fa0 ; symbol stub for: -[NSMutableOrderedSet setObject:atIndex:]
0x1870900f0 <+828>: mov x20, x0
0x1870900f4 <+832>: bl 0x1870f2fac ; symbol stub for: -[NSMutableOrderedSet setObject:atIndexedSubscript:]
0x1870900f8 <+836>: mov x8, x0
0x1870900fc <+840>: cmp x8, x21
0x187090100 <+844>: b.eq 0x187090120 ; <+876>
0x187090104 <+848>: stp x20, x8, [sp, #8]
0x187090108 <+852>: str x21, [sp]
0x18709010c <+856>: adrp x8, 74020
0x187090110 <+860>: add x8, x8, #3696
0x187090114 <+864>: orr w0, wzr, #0x4
0x187090118 <+868>: mov x1, x8
0x18709011c <+872>: bl 0x187071f88 ; CFLog
0x187090120 <+876>: mov x0, x19
0x187090124 <+880>: bl 0x1870f2f58 ; symbol stub for: +[NSOrderedSet orderedSetWithOrderedSet:]
0x187090128 <+884>: adrp x8, 74049
0x18709012c <+888>: nop
0x187090130 <+892>: ldr x20, [x8, #3456]
0x187090134 <+896>: mov x1, x20
0x187090138 <+900>: bl 0x1870f2dd8 ; symbol stub for: __destroy_helper_block_171
0x18709013c <+904>: mov x8, x0
0x187090140 <+908>: cbz w8, 0x187090158 ; <+932>
0x187090144 <+912>: mov x0, x19
0x187090148 <+916>: mov x1, x20
0x18709014c <+920>: mov x2, x21
0x187090150 <+924>: bl 0x1870f2ec8 ; symbol stub for: -[NSOrderedSet initWithObject:]
0x187090154 <+928>: brk #0x1
0x187090158 <+932>: mov x0, x19
0x18709015c <+936>: bl 0x1870f2f64 ; symbol stub for: +[NSOrderedSet orderedSetWithSet:copyItems:]
0x187090160 <+940>: mov x8, x0
0x187090164 <+944>: stp x19, x8, [sp]
0x187090168 <+948>: adrp x8, 74020
0x18709016c <+952>: add x8, x8, #3728
0x187090170 <+956>: orr w0, wzr, #0x4
0x187090174 <+960>: mov x1, x8
0x187090178 <+964>: bl 0x187071f88 ; CFLog
-> 0x18709017c <+968>: brk #0x1
Sometimes the app builds fine and I can run it without issue, but other times it just crashes after I press Run in Xcode. I'm using the latest Xcode on OS X Yosemite, building for an iPad mini Retina running iOS 8.3.
Edit
Stack trace from lldb:
* thread #9: tid = 0x14e8ec, 0x000000018709017c CoreFoundation`___forwarding___ + 968, name = 'WebThread', stop reason = EXC_BREAKPOINT (code=1, subcode=0x18709017c)
* frame #0: 0x000000018709017c CoreFoundation`___forwarding___ + 968
frame #1: 0x0000000186f92ccc CoreFoundation`_CF_forwarding_prep_0 + 92
frame #2: 0x0000000186f69458 CoreFoundation`CFRelease + 524
frame #3: 0x000000018b410b3c QuartzCore`CA::release_objects(X::List<void const*>*) + 32
frame #4: 0x000000018b416b50 QuartzCore`-[CAAnimation dealloc] + 84
frame #5: 0x0000000198551724 libobjc.A.dylib`(anonymous namespace)::AutoreleasePoolPage::pop(void*) + 564
frame #6: 0x0000000186f6d074 CoreFoundation`_CFAutoreleasePoolPop + 28
frame #7: 0x0000000187eacffc Foundation`-[NSAutoreleasePool drain] + 152
frame #8: 0x0000000186acdd80 CFNetwork`AutoAutoreleasePool::~AutoAutoreleasePool() + 36
frame #9: 0x0000000186aa9588 CFNetwork`___ZN27URLConnectionClient_Classic18_withDelegateAsyncEPKcU13block_pointerFvP16_CFURLConnectionPK33CFURLConnectionClientCurrent_VMaxE_block_invoke_2 + 228
frame #10: 0x00000001869cbac8 CFNetwork`RunloopBlockContext::_invoke_block(void const*, void*) + 76
frame #11: 0x0000000186f6ccdc CoreFoundation`CFArrayApplyFunction + 68
frame #12: 0x00000001869cb974 CFNetwork`RunloopBlockContext::perform() + 136
frame #13: 0x00000001869cb828 CFNetwork`MultiplexerSource::perform() + 312
frame #14: 0x00000001869cb654 CFNetwork`MultiplexerSource::_perform(void*) + 68
frame #15: 0x0000000187044240 CoreFoundation`__CFRUNLOOP_IS_CALLING_OUT_TO_A_SOURCE0_PERFORM_FUNCTION__ + 24
frame #16: 0x00000001870434e4 CoreFoundation`__CFRunLoopDoSources0 + 264
frame #17: 0x0000000187041594 CoreFoundation`__CFRunLoopRun + 712
frame #18: 0x0000000186f6d2d4 CoreFoundation`CFRunLoopRunSpecific + 396
frame #19: 0x0000000195618894 WebCore`RunWebThread(void*) + 468
frame #20: 0x0000000198d6bdc8 libsystem_pthread.dylib`_pthread_body + 164
frame #21: 0x0000000198d6bd24 libsystem_pthread.dylib`_pthread_start + 160

Cant get CoreMotion data in UIKit UIGravityDirection vector

I'm trying to make an app for a school project in Swift (I'm completely new to this language). The app should let you tilt your device and, by doing so, make a square drop to the bottom of the screen. This is what I'm using right now:
// Copies the x/y components of the current device-motion gravity reading into
// the gravity behavior's direction; does nothing when no reading is available.
func gravityUpdated(){
    if let motion = motionManager.deviceMotion {
        let dx = CGFloat(motion.gravity.x)
        let dy = CGFloat(motion.gravity.y)
        Zwaartekracht.gravityDirection = CGVectorMake(dx, dy)
    }
}
gravityUpdated()
When I run it on my iPhone 6, the build is successful, but then I see this (sorry for the long post):
libswiftCore.dylib`function signature specialization <Arg[0] = Exploded, Arg[1] = Exploded, Arg[2] = Dead, Arg[3] = Dead> of Swift._fatalErrorMessage (Swift.StaticString, Swift.StaticString, Swift.StaticString, Swift.UInt) -> ():
0x100171448 <+0>: stp x29, x30, [sp, #-16]!
0x10017144c <+4>: mov x29, sp
0x100171450 <+8>: sub sp, sp, #16
0x100171454 <+12>: and w8, w2, #0x1
0x100171458 <+16>: tbnz w8, #0, 0x100171478 ; <+48>
0x10017145c <+20>: tbnz x1, #63, 0x100171568 ; <+288>
0x100171460 <+24>: add x1, x0, x1
0x100171464 <+28>: mov x2, x3
0x100171468 <+32>: mov x3, x4
0x10017146c <+36>: mov x4, x5
0x100171470 <+40>: bl 0x1001a1334 ; function signature specialization <Arg[0] = Exploded, Arg[1] = Exploded> of Swift.(_fatalErrorMessage (Swift.StaticString, Swift.StaticString, Swift.StaticString, Swift.UInt) -> ()).(closure #2)
-> 0x100171474 <+44>: brk #0x1
0x100171478 <+48>: str xzr, [sp, #8]
0x10017147c <+52>: cmp x0, w0, uxtw
0x100171480 <+56>: b.ne 0x100171620 ; <+472>
0x100171484 <+60>: lsr w8, w0, #11
0x100171488 <+64>: cmp w8, #27
0x10017148c <+68>: b.ne 0x1001714b4 ; <+108>
0x100171490 <+72>: adr x0, #584602 ; "fatal error"
0x100171494 <+76>: nop
0x100171498 <+80>: adr x3, #586600 ; "high- and low-surrogate code points are not valid Unicode scalar values"
0x10017149c <+84>: nop
0x1001714a0 <+88>: movz w1, #0xb
0x1001714a4 <+92>: orr w2, wzr, #0x2
0x1001714a8 <+96>: movz w4, #0x47
0x1001714ac <+100>: orr w5, wzr, #0x2
0x1001714b0 <+104>: bl 0x100171448 ; <+0>
0x1001714b4 <+108>: lsr w10, w0, #16
0x1001714b8 <+112>: cmp w10, #16
0x1001714bc <+116>: b.hi 0x1001714ec ; <+164>
0x1001714c0 <+120>: cmp w0, #128
0x1001714c4 <+124>: b.lo 0x100171510 ; <+200>
0x1001714c8 <+128>: orr w8, wzr, #0x80
0x1001714cc <+132>: bfxil x8, x0, #0, #6
0x1001714d0 <+136>: cmp w0, #2047
0x1001714d4 <+140>: b.hi 0x10017158c ; <+324>
0x1001714d8 <+144>: movz x11, #0
0x1001714dc <+148>: movz x10, #0
0x1001714e0 <+152>: lsr w9, w0, #6
0x1001714e4 <+156>: orr w9, w9, #0xffffffc0
0x1001714e8 <+160>: b 0x1001715e8 ; <+416>
0x1001714ec <+164>: adr x0, #584510 ; "fatal error"
0x1001714f0 <+168>: nop
0x1001714f4 <+172>: adr x3, #586588 ; "value is outside of Unicode codespace"
0x1001714f8 <+176>: nop
0x1001714fc <+180>: movz w1, #0xb
0x100171500 <+184>: orr w2, wzr, #0x2
0x100171504 <+188>: movz w4, #0x25
0x100171508 <+192>: orr w5, wzr, #0x2
0x10017150c <+196>: bl 0x100171448 ; <+0>
0x100171510 <+200>: movz x10, #0
0x100171514 <+204>: movz x9, #0
0x100171518 <+208>: mov x8, x0
0x10017151c <+212>: orr w11, wzr, #0x8
0x100171520 <+216>: umulh x11, x9, x11
0x100171524 <+220>: cmp xzr, x11
0x100171528 <+224>: cset w11, ne
0x10017152c <+228>: tbnz w11, #0, 0x100171620 ; <+472>
0x100171530 <+232>: lsl x11, x9, #3
0x100171534 <+236>: cmp x11, #63
0x100171538 <+240>: b.hi 0x100171624 ; <+476>
0x10017153c <+244>: and x8, x8, #0xff
0x100171540 <+248>: lsl x8, x8, x11
0x100171544 <+252>: orr x8, x8, x10
0x100171548 <+256>: str x8, [sp, #8]
0x10017154c <+260>: cmn x9, #2
0x100171550 <+264>: b.le 0x100171568 ; <+288>
0x100171554 <+268>: add x8, sp, #8
0x100171558 <+272>: add x8, x9, x8
0x10017155c <+276>: add x1, x8, #1
0x100171560 <+280>: add x0, sp, #8
0x100171564 <+284>: b 0x100171464 ; <+28>
0x100171568 <+288>: adr x0, #584386 ; "fatal error"
0x10017156c <+292>: nop
0x100171570 <+296>: adr x3, #584656 ; "UnsafeBufferPointer with negative count"
0x100171574 <+300>: nop
0x100171578 <+304>: movz w1, #0xb
0x10017157c <+308>: orr w2, wzr, #0x2
0x100171580 <+312>: movz w4, #0x27
0x100171584 <+316>: orr w5, wzr, #0x2
0x100171588 <+320>: bl 0x100171448 ; <+0>
0x10017158c <+324>: orr w9, wzr, #0xffffff80
0x100171590 <+328>: bfxil w9, w0, #6, #6
0x100171594 <+332>: cbnz w10, 0x1001715ac ; <+356>
0x100171598 <+336>: movz x11, #0
0x10017159c <+340>: movz x12, #0
0x1001715a0 <+344>: lsr w10, w0, #12
0x1001715a4 <+348>: orr w10, w10, #0xffffffe0
0x1001715a8 <+352>: b 0x1001715d0 ; <+392>
0x1001715ac <+356>: lsr w10, w0, #18
0x1001715b0 <+360>: orr w11, w10, #0xf0
0x1001715b4 <+364>: cmp w11, w11, uxtb
0x1001715b8 <+368>: b.ne 0x100171620 ; <+472>
0x1001715bc <+372>: orr w10, wzr, #0xffffff80
0x1001715c0 <+376>: bfxil w10, w0, #12, #6
0x1001715c4 <+380>: and x12, x11, #0xff
0x1001715c8 <+384>: str x12, [sp, #8]
0x1001715cc <+388>: orr w11, wzr, #0x1
0x1001715d0 <+392>: and x10, x10, #0xff
0x1001715d4 <+396>: lsl x13, x11, #3
0x1001715d8 <+400>: lsl x10, x10, x13
0x1001715dc <+404>: orr x10, x10, x12
0x1001715e0 <+408>: str x10, [sp, #8]
0x1001715e4 <+412>: add x11, x11, #1
0x1001715e8 <+416>: orr w12, wzr, #0x8
0x1001715ec <+420>: umulh x12, x11, x12
0x1001715f0 <+424>: cmp xzr, x12
0x1001715f4 <+428>: cset w12, ne
0x1001715f8 <+432>: tbnz w12, #0, 0x100171620 ; <+472>
0x1001715fc <+436>: lsl x12, x11, #3
0x100171600 <+440>: cmp x12, #64
0x100171604 <+444>: b.hs 0x100171624 ; <+476>
0x100171608 <+448>: and x9, x9, #0xff
0x10017160c <+452>: lsl x9, x9, x12
0x100171610 <+456>: orr x10, x9, x10
0x100171614 <+460>: str x10, [sp, #8]
0x100171618 <+464>: add x9, x11, #1
0x10017161c <+468>: tbz x9, #63, 0x10017151c ; <+212>
0x100171620 <+472>: brk #0x1
0x100171624 <+476>: adr x0, #584198 ; "fatal error"
0x100171628 <+480>: nop
0x10017162c <+484>: adr x3, #584228 ; "shift amount is larger than type size in bits"
0x100171630 <+488>: nop
0x100171634 <+492>: movz w1, #0xb
0x100171638 <+496>: orr w2, wzr, #0x2
0x10017163c <+500>: movz w4, #0x2d
0x100171640 <+504>: orr w5, wzr, #0x2
0x100171644 <+508>: bl 0x100171448 ; <+0>
on the bottom it also states: fatal error: unexpectedly found nil while unwrapping an Optional value
(lldb)
I don't have a single clue how this can happen.
Can anyone help? Thanks!

Got it working by using this code:
// Core Motion callback: maps the device gravity vector into the current
// interface orientation and feeds it to the UIKit gravity behavior.
func gravityUpdated(motion: CMDeviceMotion!, error: NSError!) {
    let deviceGravity: CMAcceleration = motion.gravity
    var point = CGPointMake(CGFloat(deviceGravity.x), CGFloat(deviceGravity.y))

    // Rotate the vector so that it matches the orientation the UI is shown in.
    switch UIApplication.sharedApplication().statusBarOrientation {
    case UIInterfaceOrientation.LandscapeLeft:
        point = CGPointMake(0 - point.y, point.x)
    case UIInterfaceOrientation.LandscapeRight:
        point = CGPointMake(point.y, 0 - point.x)
    case UIInterfaceOrientation.PortraitUpsideDown:
        point = CGPointMake(point.x * -1, point.y * -1)
    default:
        break
    }

    // The y component is negated before it is handed to the gravity behavior.
    gravity.gravityDirection = CGVectorMake(point.x, 0 - point.y)
    animator.addBehavior(gravity)
}

faster alternative to InttoStr/StrToInt?

I wonder if there is a faster alternative to System.IntToStr / System.StrToInt. There is a fast version, Int32ToUTF8 from SynCommons.pas, but it only produces UTF-8, so the string conversions needed afterwards are bound to make it slow. The purepascal RTL versions are really slow on 64-bit.

This routine is approximately 40% faster than the routine in the RTL. It could be much faster if you worked with WideChar[] buffers because the string allocation is taking up 75% of the time used by the conversion routine:
IntS32ToWide: 5,50 ns/item (PWideChar)
IntToStr: 34,51 ns/item (RTL)
IntS32ToStr: 24,77 ns/item (RTL replacement)
Please note that the routine below uses SSE2 and only x86 and x64 versions are fully implemented and tested.
In the interface section:
// Writes the decimal text of the unsigned value X starting at P and returns a
// pointer; IntS32ToStr subtracts P from it to obtain the number of characters.
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
// Signed variant: emits a leading '-' for negative X, then continues in IntU32ToWide.
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar; register;
// String-returning wrapper around IntS32ToWide.
function IntS32ToStr ( X: Longword ): UnicodeString; register; inline;
In the implementation:
{$CODEALIGN 16}
{$ALIGN 16}
const
// Lookup table for the values 0..99. Each entry packs up to two UTF-16 code
// units into one LongWord: the low word is the first character and the high
// word is the second character. For 0..9 the high word is zero, so only a
// single digit is present; callers detect a two-digit entry by testing
// whether the entry is >= $10000.
DigitsClippedW: array [ 0..99 ] of LongWord = (
$000030, $000031, $000032, $000033, $000034, $000035, $000036, $000037, $000038, $000039,
$300031, $310031, $320031, $330031, $340031, $350031, $360031, $370031, $380031, $390031,
$300032, $310032, $320032, $330032, $340032, $350032, $360032, $370032, $380032, $390032,
$300033, $310033, $320033, $330033, $340033, $350033, $360033, $370033, $380033, $390033,
$300034, $310034, $320034, $330034, $340034, $350034, $360034, $370034, $380034, $390034,
$300035, $310035, $320035, $330035, $340035, $350035, $360035, $370035, $380035, $390035,
$300036, $310036, $320036, $330036, $340036, $350036, $360036, $370036, $380036, $390036,
$300037, $310037, $320037, $330037, $340037, $350037, $360037, $370037, $380037, $390037,
$300038, $310038, $320038, $330038, $340038, $350038, $360038, $370038, $380038, $390038,
$300039, $310039, $320039, $330039, $340039, $350039, $360039, $370039, $380039, $390039 );
// Delphi XE3 has no working alignment for 16 bytes for data but it has alignment for 16 bytes for code!
// So we encode our constants as a procedure and use constant offsets to the data.
// Each name below is the byte offset of one 16-byte row emitted by
// IntUToStrConsts; the rows must stay in this exact order.
const
Div10000_Shl45d = $00;
Shl16_minus_10000d = $10;
Div_1000_100_10_1w = $20;
Shl_1000_100_10_1w = $30;
Mul_10w = $40;
To_Asciiw = $50;
Mul_10000d = $60;
Div100_Shl19w = $70;
Mul100w = $80;
Div10_shl16w = $90;
To_Asciib = $A0;
// Not executable code: this "procedure" is only a container for SIMD constants.
// Every dd/dw line emits exactly 16 bytes, and the rows are addressed as
// IntUToStrConsts + <offset constant>, so neither the order nor the size of a
// row may change. It must never be called.
procedure IntUToStrConsts();
asm
{$if defined( CPUX64 )}.NOFRAME{$ifend}
dd $d1b71759, $d1b71759, $d1b71759, $d1b71759; // RoundUp( 2^45 / 10000 )
dd $10000 - 10000, $10000 - 10000, $10000 - 10000, $10000 - 10000; // 1 shl 16 - 1e4
dw 8389, 5243, 13108, $8000, 8389, 5243, 13108, $8000; // 1000 100 10 1 div
dw 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15, 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15; // 1000 100 10 1 shr
dw 10, 10, 10, 10, 10, 10, 10, 10; // 10
dw $30, $30, $30, $30, $30, $30, $30, $30; // To Unicode / ASCII
dd 10000, 10000, 10000, 10000; // 10000
dw $147b, $147b, $147b, $147b, $147b, $147b, $147b, $147b // RoundUp( 2^19 / 100 )
dw 100, 100, 100, 100, 100, 100, 100, 100 // 100
dw $199a, $199a, $199a, $199a, $199a, $199a, $199a, $199a // RoundUp( 2^16 / 10 )
dd $30303030, $30303030, $30303030, $30303030 // To bytewise / ASCII
end;
// Converts X (interpreted as a signed 32-bit value) to its decimal string.
//
// A buffer for the longest possible result (sign plus 10 digits = 11
// characters) is allocated once; the digits are written directly into it and
// the string's length field is then patched in place, so no second allocation
// or copy takes place.
function IntS32ToStr( X: Longword ): UnicodeString; register;
var
P, Q: PWideChar;
begin
SetLength( Result, 11 );
P := PWideChar( Pointer( Result ) );
// The explicit cast reinterprets the bits as signed, which is what
// IntS32ToWide expects, instead of relying on an implicit conversion.
Q := IntS32ToWide( Integer( X ), P );
// Q is at most P + 11, i.e. still inside the allocation made by SetLength
// (11 characters plus the terminator). Terminate the shortened string
// explicitly so that PWideChar( Result ) stays a valid zero-terminated string.
Q^ := #0;
// Full string buffer and set the length of the string with no resizing!
PLongword( NativeInt( P ) - sizeof( Longword ) )^ := Q - P;
end;
// Writes the decimal text of the signed value X at P. For negative X a '-' is
// stored first and the magnitude is written after it. All three variants hand
// the actual digit generation to IntU32ToWide; the asm variants do so with a
// jump rather than a call, so IntU32ToWide returns directly to the caller.
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar;
{$if defined( CPUX86 )}
asm // eax = X, edx = P
cmp eax, 0
jge IntU32ToWide // non-negative: continue directly in the unsigned routine
mov word ptr [ edx ], Word( '-' ) // store the sign character
neg eax // eax = magnitude of X
lea edx, [ edx + 2 ] // advance P by one WideChar
jmp IntU32ToWide
end;
{$else if defined( CPUX64 )}
asm // ecx = X, rdx = P
.NOFRAME
cmp ecx, 0
jge IntU32ToWide // non-negative: continue directly in the unsigned routine
mov word ptr [ rdx ], Word( '-' ) // store the sign character
neg ecx // ecx = magnitude of X
lea rdx, [ rdx + 2 ] // advance P by one WideChar
jmp IntU32ToWide
end;
{$else}
begin
if X >= 0 then begin
Result := IntU32ToWide( Longword( X ), P );
end else begin
P^ := '-';
Result := IntU32ToWide( Longword( -X ), P + 1 );
end;
end;
{$ifend}
// Writes the decimal text of the unsigned value X as UTF-16 characters at P.
//
// Three paths are used, selected by magnitude:
//   Large  (X >= 100000000): the leading one or two digits come from the
//          DigitsClippedW table, the remaining eight digits are produced
//          with SSE2 and stored with a single 16-byte write.
//   Medium (100 <= X < 100000000): eight digits are produced with SSE2,
//          leading '0' characters are shifted out, and 16 bytes are stored.
//   Small  (X < 100): a single DigitsClippedW entry (4 bytes) is stored.
// The asm variants return a pointer in eax/rax computed from the number of
// characters emitted; IntS32ToStr uses (result - P) as the string length.
// The Medium path always stores 16 bytes at P and the Small path 4 bytes, so
// the destination must have room for that even if fewer characters are
// significant.
// Only the x86 and x64 variants are implemented; other targets hit the Assert.
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
{$if defined( CPUX86 )}
asm
cmp eax, 100000000
jb @Medium
@Large:
push edx
xor edx, edx
mov ecx, 100000000
div ecx
pop ecx
// eax = high one or two digit value, edx = 8 digit value, ecx = pointer
// Emit the first 2 digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ ecx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ eax * 2 + ecx + 18 ]
// edx = 8 digit value, ecx = pointer
// Emit 8 follow digits
movd xmm1, edx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ eax - 16 ], xmm0 // And save 8 digits at once
ret
@Medium:
cmp eax, 100
jb @Small
// eax 3..8 digits, edx = pointer
// Emit 3..8 digits
movd xmm1, eax // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg eax
movdqu dqword ptr [ edx ], xmm0
lea eax, [ edx + 16 + eax ]
ret
@Small:
// eax 1..2 digits, edx = pointer
// Emit one or two digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ edx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ edx + eax * 2 + 2 ]
end;
{$else if defined( CPUX64 )}
asm
cmp ecx, 100000000
jb @Medium
@Large:
mov r8, rdx // r8 = pointer
// Split up low 8 digits from high 1 or 2 digits..
mov eax, ecx
mov r9, 12379400392853802749 // RoundUp( 2^64+26 / 1e8 )
mul rax, r9
shr rdx, 26
mov r10, rdx // r10 = eax div 1e8
mov rax, rdx
mov r9, 100000000
mul rax, r9
sub ecx, eax // ecx = eax mod 1e8
// Emit the first 2 digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + r10 * 4 ]
mov dword ptr [ r8 ], eax
// advance pointer ( also for the next 8 bytes)
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rax * 2 + r8 + 2 + 16 ]
// ecx = 8 digit value, r8 = pointer + 8
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ rax - 16 ], xmm0 // And save 8 digits at once
ret
@Medium:
cmp ecx, 100
jb @Small
// ecx 3..8 digits, rdx = pointer
// Emit 3..8 digits
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg rax
movdqu dqword ptr [ rdx ], xmm0
lea rax, [ rdx + 16 + rax ]
ret
@Small:
// ecx 1..2 digits, rdx = pointer
// Emit one or two digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + rcx * 4 ]
mov [ rdx ], eax
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rdx + rax * 2 + 2 ]
end;
{$else}
begin
Assert( False, 'Not implemented.' );
end;
{$ifend}

In SynCommons.pas, you have also the following function:
// Converts Value to a string by formatting it into a small local buffer and
// then converting that buffer to the result string.
function IntToString(Value: integer): string;
var tmp: array[0..15] of AnsiChar;
P: PAnsiChar;
begin
// StrInt32 is given the address of the last buffer element; presumably it
// writes the digits backwards from there and returns the start of the text
// (the length passed below is @tmp[15]-P) -- confirm against SynCommons.pas.
P := StrInt32(@tmp[15],Value);
Ansi7ToString(PWinAnsiChar(P),@tmp[15]-P,result);
end;
I suspect it will also be fast, even on the Win64 platform. It is slower than asm, but fast enough for small numbers (which tend to be most of the integers in the wild).
There will be only one memory allocation in this function, which is pretty fast even on Win64, thanks to the updated version of FastMM4, which has its own optimized x64 asm.

In my opinion, the key way to improve performance is to avoid heap allocations. The time spent by IntToStr doing the allocations is greater than the time spent doing the decimal conversion. And if you are wanting to use multiple threads then this is even more important because the default Delphi memory manager does not scale well under thread contention.
It's true that the decimal conversion can also be optimised, but I always try to optimise by picking off the low-hanging fruit first.
So, for the sake of completeness, in case these functions prove useful to others, here are my routines for heap allocation free integer to string conversion:
// Unsigned 32-bit division that yields quotient and remainder from a single
// DIV instruction. Implemented in assembly for x86 and x64 only; any other
// target stops compilation with an error message.
procedure DivMod(Dividend, Divisor: Cardinal; out Quotient, Remainder: Cardinal);
{$IFDEF CPUX86}
asm
PUSH EBX // EBX is used as scratch and restored before returning
MOV EBX,EDX // EBX = Divisor (EDX is needed as the high half of the dividend)
XOR EDX,EDX // EDX:EAX = zero-extended Dividend
DIV EBX // EAX = quotient, EDX = remainder
MOV [ECX],EAX // store the quotient through the pointer held in ECX
MOV EBX,Remainder // load the address of the Remainder out-parameter
MOV [EBX],EDX
POP EBX
end;
{$ELSE IF Defined(CPUX64)}
asm
.NOFRAME
MOV EAX,ECX // EAX = Dividend
MOV ECX,EDX // ECX = Divisor
XOR EDX,EDX // EDX:EAX = zero-extended Dividend
DIV ECX // EAX = quotient, EDX = remainder
MOV [R8],EAX // store the quotient through R8
MOV [R9],EDX // store the remainder through R9
end;
{$ELSE}
{$Message Error 'Unrecognised platform.'}
{$ENDIF}
// Remember whether range checking was enabled, then switch it off.
{$IFOPT R+}
{$DEFINE RANGECHECKSON}
{$R-}
{$ENDIF}
// Remember whether overflow checking was enabled, then switch it off.
{$IFOPT Q+}
{$DEFINE OVERFLOWCHECKSON}
{$Q-}
{$ENDIF}
// Range checks and overflow checks are disabled so that abs() still works when Value = low(Value).
// Writes the decimal text of Value into Buffer, most significant character
// first, and returns the number of characters written (including a leading
// '-' for negative values). Only a fixed-size local scratch array is used.
// Buffer must be large enough; this is verified with Assert only.
function CopyIntegerToAnsiBuffer(const Value: Integer; var Buffer: array of AnsiChar): Integer;
var
  src, dst: Integer;
  magnitude, digit: Cardinal;
  scratch: array [0..15] of AnsiChar;
begin
  magnitude := abs(Value);
  Result := 0;
  // Generate the digits least significant first into the scratch array.
  repeat
    DivMod(magnitude, 10, magnitude, digit);
    scratch[Result] := AnsiChar(digit + ord('0'));
    inc(Result);
  until magnitude=0;
  // The sign goes last here because the text is reversed on copy-out.
  if Value<0 then begin
    scratch[Result] := '-';
    inc(Result);
  end;
  Assert(Result<=Length(Buffer));
  // Copy the scratch contents to Buffer in reverse order.
  dst := 0;
  for src := Result-1 downto 0 do begin
    Buffer[dst] := scratch[src];
    inc(dst);
  end;
end;
// 64-bit counterpart of CopyIntegerToAnsiBuffer: writes the decimal text of
// Value into Buffer, most significant character first, and returns the number
// of characters written (including a leading '-' for negative values).
// Buffer must be large enough; this is verified with Assert only.
// NOTE(review): DivMod is called here with UInt64 variables -- presumably a
// 64-bit overload exists elsewhere; confirm.
function CopyInt64ToAnsiBuffer(const Value: Int64; var Buffer: array of AnsiChar): Integer;
var
  src, dst: Integer;
  magnitude, digit: UInt64;
  scratch: array [0..23] of AnsiChar;
begin
  magnitude := abs(Value);
  Result := 0;
  // Generate the digits least significant first into the scratch array.
  repeat
    DivMod(magnitude, 10, magnitude, digit);
    scratch[Result] := AnsiChar(digit + ord('0'));
    inc(Result);
  until magnitude=0;
  // The sign goes last here because the text is reversed on copy-out.
  if Value<0 then begin
    scratch[Result] := '-';
    inc(Result);
  end;
  Assert(Result<=Length(Buffer));
  // Copy the scratch contents to Buffer in reverse order.
  dst := 0;
  for src := Result-1 downto 0 do begin
    Buffer[dst] := scratch[src];
    inc(dst);
  end;
end;
// Re-enable range checking only if the RANGECHECKSON marker is defined.
{$IFDEF RANGECHECKSON}
{$R+}
{$UNDEF RANGECHECKSON}
{$ENDIF}
// Re-enable overflow checking only if the OVERFLOWCHECKSON marker is defined.
{$IFDEF OVERFLOWCHECKSON}
{$Q+}
{$UNDEF OVERFLOWCHECKSON}
{$ENDIF}
My use case requires an array of AnsiChar, but it is of course simple to amend these functions to populate WideChar arrays.

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

Does the speed of ldp instruction depend on caching? - arm64

Related

Why is there a performance penalty for nested subroutines in Delphi?

Optimizing SIMD histogram calculation

iOS cordova build throwing bad access exception immediately after launch, majority of the time

Cant get CoreMotion data in UIKit UIGravityDirection vector

faster alternative to InttoStr/StrToInt?

Categories

Resources