faster alternative to InttoStr/StrToInt? - delphi

I wonder if there are faster alternative than System.IntToStr / System.StrToInt. There is a fast version but only UTF8. Which is Int32ToUTF8 from SynCommons.pas and due to slow string conversions it is bound to be slow. The purepascal RTL versions are really slow for 64 bit.

This routine is approximately 40% faster than the routine in the RTL. It could be much faster if you worked with WideChar[] buffers because the string allocation is taking up 75% of the time used by the conversion routine:
IntS32ToWide: 5,50 ns/item (PWideChar)
IntToStr: 34,51 ns/item (RTL)
IntS32ToStr: 24,77 ns/item (RTL replacement)
Please note that the routine below uses SSE2 and only x86 and x64 versions are fully implemented and tested.
In the initialization:
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar; register;
function IntS32ToStr ( X: Longword ): UnicodeString; register; inline;
In the implementation:
{$CODEALIGN 16}
{$ALIGN 16}
const
DigitsClippedW: array [ 0..99 ] of LongWord = (
$000030, $000031, $000032, $000033, $000034, $000035, $000036, $000037, $000038, $000039,
$300031, $310031, $320031, $330031, $340031, $350031, $360031, $370031, $380031, $390031,
$300032, $310032, $320032, $330032, $340032, $350032, $360032, $370032, $380032, $390032,
$300033, $310033, $320033, $330033, $340033, $350033, $360033, $370033, $380033, $390033,
$300034, $310034, $320034, $330034, $340034, $350034, $360034, $370034, $380034, $390034,
$300035, $310035, $320035, $330035, $340035, $350035, $360035, $370035, $380035, $390035,
$300036, $310036, $320036, $330036, $340036, $350036, $360036, $370036, $380036, $390036,
$300037, $310037, $320037, $330037, $340037, $350037, $360037, $370037, $380037, $390037,
$300038, $310038, $320038, $330038, $340038, $350038, $360038, $370038, $380038, $390038,
$300039, $310039, $320039, $330039, $340039, $350039, $360039, $370039, $380039, $390039 );
// Delphi XE3 has no working alignment for 16 bytes for data but it has alignment for 16 bytes for code!
// So we encode our constants as a procedure and use constant offsets to the data.
const
Div10000_Shl45d = $00;
Shl16_minus_10000d = $10;
Div_1000_100_10_1w = $20;
Shl_1000_100_10_1w = $30;
Mul_10w = $40;
To_Asciiw = $50;
Mul_10000d = $60;
Div100_Shl19w = $70;
Mul100w = $80;
Div10_shl16w = $90;
To_Asciib = $A0;
procedure IntUToStrConsts();
asm
{$if defined( CPUX64 )}.NOFRAME{$ifend}
dd $d1b71759, $d1b71759, $d1b71759, $d1b71759; // RoundUp( 2^45 / 10000 )
dd $10000 - 10000, $10000 - 10000, $10000 - 10000, $10000 - 10000; // 1 shl 16 - 1e4
dw 8389, 5243, 13108, $8000, 8389, 5243, 13108, $8000; // 1000 100 10 1 div
dw 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15, 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15; // 1000 100 10 1 shr
dw 10, 10, 10, 10, 10, 10, 10, 10; // 10
dw $30, $30, $30, $30, $30, $30, $30, $30; // To Unicode / ASCII
dd 10000, 10000, 10000, 10000; // 10000
dw $147b, $147b, $147b, $147b, $147b, $147b, $147b, $147b // RoundUp( 2^19 / 100 )
dw 100, 100, 100, 100, 100, 100, 100, 100 // 100
dw $199a, $199a, $199a, $199a, $199a, $199a, $199a, $199a // RoundUp( 2^16 / 10 )
dd $30303030, $30303030, $30303030, $30303030 // To bytewise / ASCII
end;
function IntS32ToStr( X: Longword ): UnicodeString; register;
var
P, Q: PWideChar;
begin
SetLength( Result, 11 );
P := PWideChar( Pointer( Result ) );
// Full string buffer and set the length of the string with no resizing!
PLongword( ( NativeInt( Result ) - sizeof( Longword ) ) )^ := IntS32ToWide( X, P ) - P;
end;
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar;
{$if defined( CPUX86 )}
asm // eax = X, edx = P
cmp eax, 0
jge IntU32ToWide
mov word ptr [ edx ], Word( '-' )
neg eax
lea edx, [ edx + 2 ]
jmp IntU32ToWide
end;
{$else if defined( CPUX64 )}
asm // ecx = X, rdx = P
.NOFRAME
cmp ecx, 0
jge IntU32ToWide
mov word ptr [ rdx ], Word( '-' )
neg ecx
lea rdx, [ rdx + 2 ]
jmp IntU32ToWide
end;
{$else}
begin
if X >= 0 then begin
Result := IntU32ToWide( Longword( X ), P );
end else begin
P^ := '-';
Result := IntU32ToWide( Longword( -X ), P + 1 );
end;
end;
{$ifend}
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
{$if defined( CPUX86 )}
asm
cmp eax, 100000000
jb #Medium
#Large:
push edx
xor edx, edx
mov ecx, 100000000
div ecx
pop ecx
// eax = high one or two digit value, edx = 8 digit value, ecx = pointer
// Emit the first 2 digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ ecx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ eax * 2 + ecx + 18 ]
// edx = 8 digit value, ecx = pointer
// Emit 8 follow digits
movd xmm1, edx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ eax - 16 ], xmm0 // And save 8 digits at once
ret
#Medium:
cmp eax, 100
jb #Small
// eax 2..8 digits, edx = pointer
// Emit 2..8 digits
movd xmm1, eax // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg eax
movdqu dqword ptr [ edx ], xmm0
lea eax, [ edx + 16 + eax ]
ret
#Small:
// eax 1..2 digits, edx = pointer
// Emit one or two digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ edx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ edx + eax * 2 + 2 ]
end;
{$else if defined( CPUX64 )}
asm
cmp ecx, 100000000
jb #Medium
#Large:
mov r8, rdx // r8 = pointer
// Split up low 8 digits from high 1 or 2 digits..
mov eax, ecx
mov r9, 12379400392853802749 // RoundUp( 2^64+26 / 1e8 )
mul rax, r9
shr rdx, 26
mov r10, rdx // r10 = eax div 1e8
mov rax, rdx
mov r9, 100000000
mul rax, r9
sub ecx, eax // ecx = eax mod 1e8
// Emit the first 2 digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + r10 * 4 ]
mov dword ptr [ r8 ], eax
// advance pointer ( also for the next 8 bytes)
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rax * 2 + r8 + 2 + 16 ]
// ecx = 8 digit value, r8 = pointer + 8
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ rax - 16 ], xmm0 // And save 8 digits at once
ret
#Medium:
cmp ecx, 100
jb #Small
// eax 2..8 digits, rdx = pointer
// Emit 2..8 digits
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg rax
movdqu dqword ptr [ rdx ], xmm0
lea rax, [ rdx + 16 + rax ]
ret
#Small:
// ecx 1..2 digits, rdx = pointer
// Emit one or two digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + rcx * 4 ]
mov [ rdx ], eax
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rdx + rax * 2 + 2 ]
end;
{$else}
begin
Assert( False, 'Not implemented.' );
end;
{$ifend}

In SynCommons.pas, you have also the following function:
function IntToString(Value: integer): string;
var tmp: array[0..15] of AnsiChar;
P: PAnsiChar;
begin
P := StrInt32(#tmp[15],Value);
Ansi7ToString(PWinAnsiChar(P),#tmp[15]-P,result);
end;
I suspect it will be also fast, even on Win64 platform. Slower than asm, but fast enough for small numbers (which tends to be most of the integer in the wild).
There will be only one memory allocation in this function, which is pretty fast even on Win64, thanks to the updated version of FastMM4, which has its own optimized x64 asm.

In my opinion, the key way to improve performance is to avoid heap allocations. The time spent by IntToStr doing the allocations is greater than the time spent doing the decimal conversion. And if you are wanting to use multiple threads then this is even more important because the default Delphi memory manager does not scale well under thread contention.
It's true that the decimal conversion can also be optimised, but I always try to optimise by picking off the low-hanging fruit first.
So, for the sake of completeness, in case these functions prove useful to others, here are my routines for heap allocation free integer to string conversion:
procedure DivMod(Dividend, Divisor: Cardinal; out Quotient, Remainder: Cardinal);
{$IFDEF CPUX86}
asm
PUSH EBX
MOV EBX,EDX
XOR EDX,EDX
DIV EBX
MOV [ECX],EAX
MOV EBX,Remainder
MOV [EBX],EDX
POP EBX
end;
{$ELSE IF Defined(CPUX64)}
asm
.NOFRAME
MOV EAX,ECX
MOV ECX,EDX
XOR EDX,EDX
DIV ECX
MOV [R8],EAX
MOV [R9],EDX
end;
{$ELSE}
{$Message Error 'Unrecognised platform.'}
{$ENDIF}
{$IFOPT R+}
{$DEFINE RANGECHECKSON}
{$R-}
{$ENDIF}
{$IFOPT Q+}
{$DEFINE OVERFLOWCHECKSON}
{$Q-}
{$ENDIF}
// disable range checks and overflow checks so that abs() functions in case Value = low(Value)
function CopyIntegerToAnsiBuffer(const Value: Integer; var Buffer: array of AnsiChar): Integer;
var
i, j: Integer;
val, remainder: Cardinal;
negative: Boolean;
tmp: array [0..15] of AnsiChar;
begin
negative := Value<0;
val := abs(Value);
Result := 0;
repeat
DivMod(val, 10, val, remainder);
tmp[Result] := AnsiChar(remainder + ord('0'));
inc(Result);
until val=0;
if negative then begin
tmp[Result] := '-';
inc(Result);
end;
Assert(Result<=Length(Buffer));
i := 0;
j := Result-1;
while i<Result do begin
Buffer[i] := tmp[j];
inc(i);
dec(j);
end;
end;
function CopyInt64ToAnsiBuffer(const Value: Int64; var Buffer: array of AnsiChar): Integer;
var
i, j: Integer;
val, remainder: UInt64;
negative: Boolean;
tmp: array [0..23] of AnsiChar;
begin
negative := Value<0;
val := abs(Value);
Result := 0;
repeat
DivMod(val, 10, val, remainder);
tmp[Result] := AnsiChar(remainder + ord('0'));
inc(Result);
until val=0;
if negative then begin
tmp[Result] := '-';
inc(Result);
end;
Assert(Result<=Length(Buffer));
i := 0;
j := Result-1;
while i<Result do begin
Buffer[i] := tmp[j];
inc(i);
dec(j);
end;
end;
{$IFDEF RANGECHECKSON}
{$R+}
{$UNDEF RANGECHECKSON}
{$ENDIF}
{$IFDEF OVERFLOWCHECKSON}
{$Q+}
{$UNDEF OVERFLOWCHECKSON}
{$ENDIF}
My use case requires an array of AnsiChar, but it is of course simple to amend these functions to populate WideChar arrays.

Related

Why is there a performance penalty for nested subroutines in Delphi?

A static analyzer we use has a report that says:
Subprograms with local subprograms (OPTI7)
This section lists subprograms that themselves have local subprograms.
Especially when these subprograms share local variables, it can have a
negative effect on performance.
This guide says:
Do not use nested routines Nested routines (routines within other
routines; also known as "local procedures") require some special stack
manipulation so that the variables of the outer routine can be seen by
the inner routine. This results in a good bit of overhead. Instead of
nesting, move the procedure to the unit scoping level and pass the
necessary variables - if necessary by reference (use the var keyword)
- or make the variable global at the unit scope.
We were interested in knowing if we should take this report into consideration when validating our code. The answers to this question suggest that one should profile one's application to see if there is any performance difference, but not much is said about the difference between nested routines and normal subroutines.
What is the actual difference between nested routines and normal routines and how may it cause a performance penalty?
tl;dr
There are extra push/pops for nested subroutines
Turning on optimizations may strip those away, such that the generated code is the same for both nested subroutines and normal subroutines
Inlining results in the same code being generated for both nested and normal subroutines
For simple routines with few parameters and local variables we perceived no performance difference even with optimizations turned off
I wrote a little test to determine this, where GetRTClock is measuring the current time with a precision of 1ns:
function subprogram_main(z : Integer) : Int64;
var
n : Integer;
s : Int64;
function subprogram_aux(n, z : Integer) : Integer;
var
i : Integer;
begin
// Do some useless work on the aux program
for i := 0 to n - 1 do begin
if (i > z) then
z := z + i
else
z := z - i;
end;
Result := z;
end;
begin
s := GetRTClock;
// Do some minor work on the main program
n := z div 100 * 100 + 100;
// Call the aux program
z := subprogram_aux(n, z);
Result := GetRTClock - s;
end;
function normal_aux(n, z : Integer) : Integer;
var
i : Integer;
begin
// Do some useless work on the aux program
for i := 0 to n - 1 do begin
if (i > z) then
z := z + i
else
z := z - i;
end;
Result := z;
end;
function normal_main(z : Integer) : Int64;
var
n : Integer;
s : Int64;
begin
s := GetRTClock;
// Do some minor work on the main program
n := z div 100 * 100 + 100;
// Call the aux program
z := normal_aux(n, z);
Result := GetRTClock - s;
end;
This compiles to:
subprogram_main
MyFormU.pas.41: begin
005CE7D0 55 push ebp
005CE7D1 8BEC mov ebp,esp
005CE7D3 83C4E0 add esp,-$20
005CE7D6 8945FC mov [ebp-$04],eax
MyFormU.pas.42: s := GetRTClock;
...
MyFormU.pas.45: n := z div 100 * 100 + 100;
...
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7F8 55 push ebp
005CE7F9 8B55FC mov edx,[ebp-$04]
005CE7FC 8B45EC mov eax,[ebp-$14]
005CE7FF E880FFFFFF call subprogram_aux
005CE804 59 pop ecx
005CE805 8945FC mov [ebp-$04],eax
MyFormU.pas.49: Result := GetRTClock - s;
...
normal_main
MyFormU.pas.70: begin
005CE870 55 push ebp
005CE871 8BEC mov ebp,esp
005CE873 83C4E0 add esp,-$20
005CE876 8945FC mov [ebp-$04],eax
MyFormU.pas.71: s := GetRTClock;
...
MyFormU.pas.74: n := z div 100 * 100 + 100;
...
MyFormU.pas.76: z := normal_aux(n, z);
005CE898 8B55FC mov edx,[ebp-$04]
005CE89B 8B45EC mov eax,[ebp-$14]
005CE89E E881FFFFFF call normal_aux
005CE8A3 8945FC mov [ebp-$04],eax
MyFormU.pas.78: Result := GetRTClock - s;
...
subprogram_aux:
MyFormU.pas.31: begin
005CE784 55 push ebp
005CE785 8BEC mov ebp,esp
005CE787 83C4EC add esp,-$14
005CE78A 8955F8 mov [ebp-$08],edx
005CE78D 8945FC mov [ebp-$04],eax
MyFormU.pas.33: for i := 0 to n - 1 do begin
005CE790 8B45FC mov eax,[ebp-$04]
005CE793 48 dec eax
005CE794 85C0 test eax,eax
005CE796 7C29 jl $005ce7c1
005CE798 40 inc eax
005CE799 8945EC mov [ebp-$14],eax
005CE79C C745F000000000 mov [ebp-$10],$00000000
MyFormU.pas.34: if (i > z) then
005CE7A3 8B45F0 mov eax,[ebp-$10]
005CE7A6 3B45F8 cmp eax,[ebp-$08]
005CE7A9 7E08 jle $005ce7b3
MyFormU.pas.35: z := z + i
005CE7AB 8B45F0 mov eax,[ebp-$10]
005CE7AE 0145F8 add [ebp-$08],eax
005CE7B1 EB06 jmp $005ce7b9
MyFormU.pas.37: z := z - i;
005CE7B3 8B45F0 mov eax,[ebp-$10]
005CE7B6 2945F8 sub [ebp-$08],eax
normal_aux:
MyFormU.pas.55: begin
005CE824 55 push ebp
005CE825 8BEC mov ebp,esp
005CE827 83C4EC add esp,-$14
005CE82A 8955F8 mov [ebp-$08],edx
005CE82D 8945FC mov [ebp-$04],eax
MyFormU.pas.57: for i := 0 to n - 1 do begin
005CE830 8B45FC mov eax,[ebp-$04]
005CE833 48 dec eax
005CE834 85C0 test eax,eax
005CE836 7C29 jl $005ce861
005CE838 40 inc eax
005CE839 8945EC mov [ebp-$14],eax
005CE83C C745F000000000 mov [ebp-$10],$00000000
MyFormU.pas.58: if (i > z) then
005CE843 8B45F0 mov eax,[ebp-$10]
005CE846 3B45F8 cmp eax,[ebp-$08]
005CE849 7E08 jle $005ce853
MyFormU.pas.59: z := z + i
005CE84B 8B45F0 mov eax,[ebp-$10]
005CE84E 0145F8 add [ebp-$08],eax
005CE851 EB06 jmp $005ce859
MyFormU.pas.61: z := z - i;
005CE853 8B45F0 mov eax,[ebp-$10]
005CE856 2945F8 sub [ebp-$08],eax
The only difference is one push and one pop. What happens if we turn on optimizations?
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7C5 8BD3 mov edx,ebx
005CE7C7 8BC6 mov eax,esi
005CE7C9 E8B6FFFFFF call subprogram_aux
MyFormU.pas.76: z := normal_aux(n, z);
005CE82D 8BD3 mov edx,ebx
005CE82F 8BC6 mov eax,esi
005CE831 E8B6FFFFFF call normal_aux
Both compile exactly to the same thing.
What happens when inlining?
MyFormU.pas.76: z := normal_aux(n, z);
005CE804 8BD3 mov edx,ebx
005CE806 8BC8 mov ecx,eax
005CE808 49 dec ecx
005CE809 85C9 test ecx,ecx
005CE80B 7C11 jl $005ce81e
005CE80D 41 inc ecx
005CE80E 33C0 xor eax,eax
005CE810 3BD0 cmp edx,eax
005CE812 7D04 jnl $005ce818
005CE814 03D0 add edx,eax
005CE816 EB02 jmp $005ce81a
005CE818 2BD0 sub edx,eax
005CE81A 40 inc eax
005CE81B 49 dec ecx
005CE81C 75F2 jnz $005ce810
subprogram_main:
MyFormU.pas.47: z := subprogram_aux(n, z);
005CE7A8 8BD3 mov edx,ebx
005CE7AA 8BC8 mov ecx,eax
005CE7AC 49 dec ecx
005CE7AD 85C9 test ecx,ecx
005CE7AF 7C11 jl $005ce7c2
005CE7B1 41 inc ecx
005CE7B2 33C0 xor eax,eax
005CE7B4 3BD0 cmp edx,eax
005CE7B6 7D04 jnl $005ce7bc
005CE7B8 03D0 add edx,eax
005CE7BA EB02 jmp $005ce7be
005CE7BC 2BD0 sub edx,eax
005CE7BE 40 inc eax
005CE7BF 49 dec ecx
005CE7C0 75F2 jnz $005ce7b4
Again, no difference.
I also profiled this little example, taking an average of 30 executions for each (normal and subprogram), called in random order:
constructor TForm1.Create(AOwner: TComponent);
const
c_nSamples = 60;
rnd_sample : array[0..c_nSamples - 1] of byte = (1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0);
var
subprogram_gt_ns : Int64;
normal_gt_ns : Int64;
rnd_input : Integer;
i : Integer;
begin
inherited Create(AOwner);
normal_gt_ns := 0;
subprogram_gt_ns := 0;
rnd_input := Random(1000);
for i := 0 to c_nSamples - 1 do
if (rnd_sample[i] = 1) then
Inc(subprogram_gt_ns, subprogram_main(rnd_input))
else
Inc(normal_gt_ns, normal_main(rnd_input));
OutputDebugString(PChar(' Normal ' + FloatToStr(normal_gt_ns / 30) + ' Subprogram ' + FloatToStr(subprogram_gt_ns / 30)));
end;
There is no significant difference even with optimizations turned off:
Debug Output: Normal 1166,66666666667 Subprogram 1203,33333333333 Process MyProject.exe (1824)
Finally, both texts that warn about performance mention something about shared local variables.
If we do not pass z to subprogram_aux, instead access it directly, we get:
MyFormU.pas.47: z := subprogram_aux(n);
005CE7D2 55 push ebp
005CE7D3 8BC3 mov eax,ebx
005CE7D5 E8AAFFFFFF call subprogram_aux
005CE7DA 59 pop ecx
005CE7DB 8945FC mov [ebp-$04],eax
Even with optimizations turned on.

Using FPC .o files in Delphi 2010

I've written quite a big library for matrix operations for Delphi and FPC.
There exists now an extension for this library for the Intel AVX extension but
I could only manage to get that compiled in FPC. My idea was to create
.o files in FPC which contains the AVX assembler codes and include these
files in Delphi. I tried to follow this question here:
Linking FPC .o files into Delphi
but without success. I was able to dump the function names and tried to import
these in the Delphi unit. The problem is that I always get an error saying that
the .o files is in the wrong format.
I use CodeTyphoon for compilation which internally uses FPC 3.1.1 and
Delphi2010 as a first try.
The code is once compiled in FPC and one time in Delphi using the approriate
ifdefs.
My base code looks like this (just an excerpt):
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2011, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit AVXMatrixMultOperations;
interface
{$IFDEF CPUX64}
{$DEFINE x64}
{$ENDIF}
{$IFDEF cpux86_64}
{$DEFINE x64}
{$ENDIF}
{$IFNDEF x64}
uses MatrixConst;
{$IFNDEF FPC}
// this fails -> wrong object format
{$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'}
{$ENDIF}
// full matrix operations
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
{$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF}
{$ENDIF}
implementation
{$IFDEF FPC} {$ASMMODE intel} {$ENDIF}
{$IFNDEF x64}
{$IFDEF FPC}
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
var bytesWidth2, destOffset : TASMNativeInt;
iter : TASMNativeInt;
{$IFDEF FPC}
begin
{$ENDIF}
asm
// prolog - simulate stack
push ebx;
push edi;
push esi;
mov ecx, dest;
mov edi, width1;
imul edi, -8;
mov iter, edi;
sub mt1, edi;
//destOffset := destLineWidth - Width2*sizeof(double);
mov ebx, Width2;
shl ebx, 3;
mov eax, destLineWidth;
sub eax, ebx;
mov destOffset, eax;
//bytesWidth2 := width2*sizeof(double);
mov bytesWidth2, ebx;
// for y := 0 to height1 - 1 do
##foryloop:
// r12 -> counter to width2
mov esi, width2;
sub esi, 2;
jl #LastXColumn;
##forxloop:
// for x := 0 to width2 div 2 - 1
// esi: mt1 - width1*sizeof(double)
// mt2: mt2
mov edx, mt1;
mov ebx, mt2;
mov eax, iter;
mov edi, LineWidth2;
vxorpd ymm0, ymm0, ymm0;
vxorpd ymm1, ymm1, ymm1;
cmp eax, -32;
jg ##Innerloop2Begin;
// for z := 0 to width1 - 1do
// AVX part:
##InnerLoop1:
// 4x4 block
vmovapd xmm2, [ebx];
add ebx, edi;
vmovapd xmm4, xmm2;
vmovapd xmm3, [ebx];
add ebx, edi;
// shuffle so we can multiply
// swap such that we can immediately multiply
vmovlhps xmm2, xmm2, xmm3;
vmovhlps xmm3, xmm3, xmm4;
// next 4 elements
vmovapd xmm4, [ebx];
add ebx, edi;
vmovapd xmm6, xmm4;
vmovapd xmm5, [ebx];
add ebx, edi;
vmovapd ymm7, [edx + eax]
vmovlhps xmm4, xmm4, xmm5;
vmovhlps xmm5, xmm5, xmm6;
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
// now multiply and add
vmulpd ymm2, ymm2, ymm7;
vmulpd ymm3, ymm3, ymm7;
vaddpd ymm0, ymm0, ymm2;
vaddpd ymm1, ymm1, ymm3;
add eax, 32;
jl ##InnerLoop1;
vextractf128 xmm2, ymm0, 1;
vextractf128 xmm3, ymm1, 1;
vhaddpd xmm0, xmm0, xmm2;
vhaddpd xmm1, xmm1, xmm3;
test eax, eax;
jz ##InnerLoopEnd2;
##Innerloop2Begin:
// rest in single elements
##InnerLoop2:
vmovapd xmm2, [ebx];
add ebx, edi;
vmovddup xmm3, [edx + eax];
vmulpd xmm2, xmm2, xmm3;
vmovhlps xmm4, xmm4, xmm2;
vaddsd xmm0, xmm0, xmm2;
vaddsd xmm1, xmm1, xmm4;
add eax, 8;
jnz ##InnerLoop2;
##InnerLoopEnd2:
// finall horizontal addition
vhaddpd xmm0, xmm0, xmm1;
vmovapd [ecx], xmm0;
// increment the pointers
// inc(mt2), inc(dest);
//add dword ptr [mt2], 8;
add mt2, 16;
add ecx, 16;
// end for x := 0 to width2 div 2 - 1
sub esi, 2;
jge ##forxloop;
#LastXColumn:
cmp esi, -1;
jne #NextLine;
// last column of mt2
mov eax, iter;
mov ebx, mt2;
vxorpd xmm0, xmm0, xmm0;
#InnerLoop2:
vmovsd xmm1, [edx + eax];
vmovsd xmm2, [ebx];
vmulsd xmm1, xmm1, xmm2;
vaddsd xmm0, xmm0, xmm1;
add ebx, edi;
add eax, 8;
jnz #InnerLoop2;
vmovsd [ecx], xmm0;
add ecx, 8;
add mt2, 8;
#NextLine:
// dec(mt2, Width2);
// inc(PByte(mt1), LineWidth1);
// inc(PByte(dest), destOffset);
//mov ebx, bytesWidth2;
//sub dword ptr [mt2], ebx;
mov eax, bytesWidth2;
sub mt2, eax;
mov eax, LineWidth1;
add mt1, eax;
add ecx, destOffset;
// end for y := 0 to height1 - 1
//dec eax;
dec height1;
jnz ##foryloop;
// epilog
vzeroupper;
pop esi;
pop edi;
pop ebx;
end;
{$IFDEF FPC}
end;
{$ENDIF}
{$ENDIF}
{$ENDIF}
end.
Since there is a single function involved here, the easiest is IMHO to convert the FPC AVXMatrixMultOperations.o file directly.
Use the great Object file converter tool.
You may try to convert from one binary format to another, accepted by Delphi.
But I guess that the cleanest way is to convert it to asm:
objconv -fasm AVXMatrixMultOperations.o
It will create a AVXMatrixMultOperations.asm file, which could be used to replace the unknown AVX instructions by simple db ..,..,..,.. bytes. Typically, the generated .asm file has the assembler on the left side, and the raw hexadecimal bytes on the right side.
This is how I dealt with old Delphi compilers in my libraries, for instance:
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
asm // eax=crc, edx=buf, ecx=len
not eax
test ecx, ecx
jz #0
test edx, edx
jz #0
#3: test edx, 3
jz #8 // align to 4 bytes boundary
{$ifdef ISDELPHI2010}
crc32 eax, byte ptr[edx]
{$else}
db $F2, $0F, $38, $F0, $02
{$endif}
inc edx
....
So in your case, something like
{$ifdef FPC}
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
{$else}
db $xx,$yy,$zz
db $xx,$yy,$zz
{$endif}

Combining ASM with non-asm code (or SwapInt64 ASM function needed)

I need to process a file that is coming from the old Mac era (old Motorola CPU). The bytes are big endian so I have a function that swaps and Int64 to Intel little endian. The function is ASM and works on 32 bit CPU but not on 64. For 64 bit I have a different function that is not ASM. I want to combine the functions using IFDEF. Can I do this? Will it be a problem?
interface
function SwapInt64(Value: Int64): Int64; assembler;
implementation
{$IFDEF CPUx86}
function SwapInt64(Value: Int64): Int64; assembler; { Does not work on 64 bit } {
asm
MOV EDX,[DWORD PTR EBP + 12]
MOV EAX,[DWORD PTR EBP + 8]
BSWAP EAX
XCHG EAX,EDX
BSWAP EAX
end;
{$else}
function SwapInt64 (Value: Int64): Int64;
var P: PInteger;
begin
Result: = (Value shl 32) or (Value shr 32);
P: = #Result;
P ^: = (Swap (P ^) shl 16) or (Swap (P ^ shr 16));
Inc (P);
P ^: = (Swap (P ^) shl 16) or (Swap (P ^ shr 16));
end;
{$ENDIF}
I think the compiler will correctly compile/call the appropriate function no matter if one is ASM and the other is Pascal.
What you are proposing is perfectly fine. It is a quite reasonable approach.
If you want a 64 bit swap in asm, for x64, that's quite simple:
function SwapInt64(Value: Int64): Int64;
asm
MOV RAX,RCX
BSWAP RAX
end;
Combine this with the 32 bit version using conditional, as you have done in the question.
function SwapInt64(Value: Int64): Int64;
{$IF Defined(CPUX86)}
asm
MOV EDX,[DWORD PTR EBP + 12]
MOV EAX,[DWORD PTR EBP + 8]
BSWAP EAX
XCHG EAX,EDX
BSWAP EAX
end;
{$ELSEIF Defined(CPUX64)}
asm
MOV RAX,RCX
BSWAP RAX
end;
{$ELSE}
{$Message Fatal 'Unsupported architecture'}
{$ENDIF}
Or include a Pascal implementation in the {$ELSE} block.
The approach of swapping the bytes in a separate routine that cannot be inlined is a bit silly if performance is what you're after.
A better way to a assume you've got a block of data and all dword/qwords in it need to have their endianness changed.
This would look something like this.
For dwords
function SwapDWords(var Data; size: cardinal): boolean;
{ifdef CPUX64}
asm
//Data in RCX, Size in EDX
xor EAX,EAX //failure
test EDX,3
jz #MultipleOf4
#error:
ret
#MultipleOf4
neg EDX //Count up instead of down
jz #done
ADD RCX,RDX
#loop
mov R8d, [RCX+RDX]
bswap R8d
mov [RCX+RDX],R8d
add RDX,4 //add is faster than inc on modern processors
jnz #loop
#done:
inc EAX //success
ret
end;
For qwords
function SwapQWords(var Data; size: cardinal): boolean;
{ifdef CPUX64}
asm
//Data in RCX, Size in EDX
xor EAX,EAX //failure
test EDX,7
jz #MultipleOf8
#error:
ret
#MultipleOf8
neg EDX //Count up instead of down
jz #done
ADD RCX,RDX
#loop
mov R8, [RCX+RDX]
bswap R8
mov [RCX+RDX],R8
add RDX,8 //add is faster than inc on modern processors
jnz #loop
#done:
inc EAX //success
ret
end;
If you're already on 64 bit, then you have SSE2, and can use the 128-bit SSE registers.
Now you can process 4 dwords at a time, effectively unrolling the loop 4 times.
See: http://www.asmcommunity.net/forums/topic/?id=29743
movntpd xmm5,[RCX+RDX] //non-temporal move to avoid polluting the cache
movdqu xmm0, xmm5
movdqu xmm1, xmm5
pxor xmm5, xmm5
punpckhbw xmm0, xmm5 ; interleave '0' with bytes of original
punpcklbw xmm1, xmm5 ; so they become words
pshuflw xmm0, xmm0, 27 ; swap the words by shuffling
pshufhw xmm0, xmm0, 27 ;//27 = B00_01_10_11
pshuflw xmm1, xmm1, 27
pshufhw xmm1, xmm1, 27
packuswb xmm1, xmm0 ; make the words back into bytes.
movntpd [RCX+RDX], xmm1 //non-temporal move to keep the cache clean.
Simply use either LEToN() or BEtoN()
Use the LE variant if the data is little endian (e.g. 32 or 64-bits x86 mac, modern arm), use the BE if the source data (e.g. file from disk) is in big endian format.
Depending on the used architecture a swap or "nothing" will be inlined, usually a fairly optimal one for single conversions. For block oriented solutions see the posted SSE code (or Agner Fog's)

Optimizing SIMD histogram calculation

I worked on a code that implements an histogram calculation given an opencv struct IplImage * and a buffer unsigned int * to the histogram. I'm still new to SIMD so I might not be taking advantage of the full potential the instruction set provides.
histogramASM:
xor rdx, rdx
xor rax, rax
mov eax, dword [imgPtr + imgWidthOffset]
mov edx, dword [imgPtr + imgHeightOffset]
mul rdx
mov rdx, rax ; rdx = Image Size
mov r10, qword [imgPtr + imgDataOffset] ; r10 = ImgData
NextPacket:
mov rax, rdx
movdqu xmm0, [r10 + rax - 16]
mov rcx,16 ; 16 pixels/paq
PacketLoop:
pextrb rbx, xmm0, 0 ; saving the pixel value on rbx
shl rbx,2
inc dword [rbx + Hist]
psrldq xmm0,1
loop PacketLoop
sub rdx,16
cmp rdx,0
jnz NextPacket
ret
On C, I'd be running these piece of code to obtain the same result.
imgSize = (img->width)*(img->height);
pixelData = (unsigned char *) img->imageData;
for(i = 0; i < imgSize; i++)
{
pixel = *pixelData;
hist[pixel]++;
pixelData++;
}
But the time it takes for both, measured in my computer with rdtsc(), is only 1.5 times better SIMD's assembler. Is there a way to optimize the code above and quickly fill the histogram vector with SIMD?
Thanks in advance
Like Jester I'm surprised that your SIMD code had any significant improvement. Did you compile the C code with optimization turned on?
The one additional suggestion I can make is to unroll your Packetloop loop. This is a fairly simple optimization and reduces the number of instructions per "iteration" to just two:
pextrb ebx, xmm0, 0
inc dword [ebx * 4 + Hist]
pextrb ebx, xmm0, 1
inc dword [ebx * 4 + Hist]
pextrb ebx, xmm0, 2
inc dword [ebx * 4 + Hist]
...
pextrb ebx, xmm0, 15
inc dword [ebx * 4 + Hist]
If you're using NASM you can use the %rep directive to save some typing:
%assign pixel 0
%rep 16
pextrb rbx, xmm0, pixel
inc dword [rbx * 4 + Hist]
%assign pixel pixel + 1
%endrep

Delphi - Detect Int64 Overflow Error

In Delphi how can I detect overflow errors for Int64?
For Integers we could do:
type
MyInt = Integer; //Int64
function TryMaxTimes10(out Res: MyInt): boolean;
var
a, b: MyInt;
begin
{$Q+}
try
a := High(MyInt);
b := 10;
Res := a * b; //REF1
Result := True;
except
Result := False;
end;
{$Q-}
end;
For MyInt = Integer, line REF1 gives an exception and so TryMaxTimes10 returns false.
But if we change MyInt to MyInt = Int64, then REF1 does not give an exception and TryMaxTimes10 returns true!
I understand that the help for {$Q+} does not specifically mention Int64: ... {$Q+} state, certain integer arithmetic operations ... are checked for overflow.
QUESTION: So my question is, how can we detect overflow errors for Int64?
(I'm using Delphi 7. Does the same thing happen in newer versions of Delphi?)
This is a known issue. See http://qc.embarcadero.com/wc/qcmain.aspx?d=10185, and the comments Andy wrote at the bottom.
My suggestion would be to create a function (I did not compile nor test this - just an example):
function Foo(A, B : Int64) : Int64;
var bNeg : boolean;
begin
// Do we expect a negative result?
bNeg := ((a < 0) xor (b < 0));
// Get the real result
Result := a * b;
// If the result is wrong, raise an error
if ((Result < 0) xor bNeg) then begin
// Raise EOverFlow
end;
end;
This bug has been fixed in RAD Studio 10.2 Tokyo.
The issue can be found here (but one have to log in with embarcadero account to see it).
Here is correct version of __llmulo by John O'Harrow (licensed under MPL 1.1) shipped with Delphi versions 10.2 and above:
// Param 1(edx:eax), Param 2([esp+8]:[esp+4])
// Result is stored in edx:eax
// O-flag set on exit => result is invalid
// O-flag clear on exit => result is valid
procedure __llmulo();
asm
test edx, edx {Param1-Hi = 0?}
jne ##Large {No, More than one multiply may be needed}
cmp edx, [esp+8] {Param2-Hi = 0?}
jne ##Large {No, More than one multiply may be needed}
mul dword ptr [esp+4] {Only one multiply needed, Set Result}
and eax, eax {Clear Overflow Flag}
ret 8
##Large:
sub esp, 28 {allocate local storage}
mov [esp], ebx {save used registers}
mov [esp+4], esi
mov [esp+8], edi
mov [esp+12], ebp
mov ebx, [esp+32] {Param2-Lo}
mov ecx, [esp+36] {Param2-Hi}
mov esi, edx
mov edi, ecx
sar esi, 31
sar edi, 31
xor eax, esi
xor edx, esi
sub eax, esi
sbb edx, esi {edx:eax (a1:a0) = abs(Param1)}
xor ebx, edi
xor ecx, edi
sub ebx, edi
sbb ecx, edi {ecx:ebx (b1:b0) = abs(Param2)}
xor esi, edi {Sign Flag, 0 if Params have same sign else -1}
mov [esp+16], eax {a0}
mov [esp+20], edx {a1}
mov [esp+24], ecx {b1}
mul ebx {edx:eax (c1:c0) = a0*b0}
xchg ebx, edx {ebx = c1, edx = b0}
mov edi, eax {abs(Result-Lo) = c0}
xor ecx, ecx {Upper 32 bits of 128 bit result}
xor ebp, ebp {Second 32 bits of 128 bit result}
mov eax, [esp+20] {a1}
mul edx {edx:eax (d1:d0) = a1*b0}
add ebx, eax {c1 + d0}
adc ebp, edx {d1 + carry}
adc ecx, 0 {Possible carry into Upper 32 bits}
mov eax, [esp+16] {a0}
mov edx, [esp+24] {b1}
mul edx {edx:eax (e1:e0) = a0*b1}
add ebx, eax {abs(Result-Hi) = c1 + d0 + e0}
adc ebp, edx {d1 + e1 + carry}
adc ecx, 0 {Possible carry into Upper 32 bits}
mov eax, [esp+20] {a1}
mov edx, [esp+24] {b1}
mul edx {edx:eax (f1:f0) = a1*b1}
add ebp, eax {d1 + e1 + f0 + carry}
adc ecx, edx {f1 + carry}
or ecx, ebp {Overflow if ecx <> 0 or ebp <> 0}
jnz ##Overflow
mov edx, ebx {Set abs(Result-Hi)}
mov eax, edi {Set abs(Result-Lo)}
cmp edx, $80000000
jae ##CheckRange {Possible Overflow if edx>=$80000000}
##SetSign:
xor eax, esi {Correct Sign of Result}
xor edx, esi
sub eax, esi
sbb edx, esi
mov ebx, [esp] {restore used registers}
mov esi, [esp+4]
mov edi, [esp+8]
mov ebp, [esp+12]
add esp, 28 {Clears Overflow flag}
ret 8
##CheckRange:
jne ##Overflow {Overflow if edx>$80000000}
test esi, esi {edx=$80000000, Is Sign Flag=0?}
jnz ##SetSign {No, Result is Ok (-MaxInt64)}
##Overflow:
mov ebx, [esp] {restore used registers}
mov esi, [esp+4]
mov edi, [esp+8]
mov ebp, [esp+12]
add esp, 28
mov ecx, $80000000
dec ecx {Set Overflow Flag}
ret 8
end;

Resources