faster alternative to InttoStr/StrToInt? - delphi
I wonder if there are faster alternative than System.IntToStr / System.StrToInt. There is a fast version but only UTF8. Which is Int32ToUTF8 from SynCommons.pas and due to slow string conversions it is bound to be slow. The purepascal RTL versions are really slow for 64 bit.
This routine is approximately 40% faster than the routine in the RTL. It could be much faster if you worked with WideChar[] buffers because the string allocation is taking up 75% of the time used by the conversion routine:
IntS32ToWide: 5,50 ns/item (PWideChar)
IntToStr: 34,51 ns/item (RTL)
IntS32ToStr: 24,77 ns/item (RTL replacement)
Please note that the routine below uses SSE2 and only x86 and x64 versions are fully implemented and tested.
In the initialization:
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar; register;
function IntS32ToStr ( X: Longword ): UnicodeString; register; inline;
In the implementation:
{$CODEALIGN 16}
{$ALIGN 16}
const
DigitsClippedW: array [ 0..99 ] of LongWord = (
$000030, $000031, $000032, $000033, $000034, $000035, $000036, $000037, $000038, $000039,
$300031, $310031, $320031, $330031, $340031, $350031, $360031, $370031, $380031, $390031,
$300032, $310032, $320032, $330032, $340032, $350032, $360032, $370032, $380032, $390032,
$300033, $310033, $320033, $330033, $340033, $350033, $360033, $370033, $380033, $390033,
$300034, $310034, $320034, $330034, $340034, $350034, $360034, $370034, $380034, $390034,
$300035, $310035, $320035, $330035, $340035, $350035, $360035, $370035, $380035, $390035,
$300036, $310036, $320036, $330036, $340036, $350036, $360036, $370036, $380036, $390036,
$300037, $310037, $320037, $330037, $340037, $350037, $360037, $370037, $380037, $390037,
$300038, $310038, $320038, $330038, $340038, $350038, $360038, $370038, $380038, $390038,
$300039, $310039, $320039, $330039, $340039, $350039, $360039, $370039, $380039, $390039 );
// Delphi XE3 has no working alignment for 16 bytes for data but it has alignment for 16 bytes for code!
// So we encode our constants as a procedure and use constant offsets to the data.
const
Div10000_Shl45d = $00;
Shl16_minus_10000d = $10;
Div_1000_100_10_1w = $20;
Shl_1000_100_10_1w = $30;
Mul_10w = $40;
To_Asciiw = $50;
Mul_10000d = $60;
Div100_Shl19w = $70;
Mul100w = $80;
Div10_shl16w = $90;
To_Asciib = $A0;
procedure IntUToStrConsts();
asm
{$if defined( CPUX64 )}.NOFRAME{$ifend}
dd $d1b71759, $d1b71759, $d1b71759, $d1b71759; // RoundUp( 2^45 / 10000 )
dd $10000 - 10000, $10000 - 10000, $10000 - 10000, $10000 - 10000; // 1 shl 16 - 1e4
dw 8389, 5243, 13108, $8000, 8389, 5243, 13108, $8000; // 1000 100 10 1 div
dw 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15, 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15; // 1000 100 10 1 shr
dw 10, 10, 10, 10, 10, 10, 10, 10; // 10
dw $30, $30, $30, $30, $30, $30, $30, $30; // To Unicode / ASCII
dd 10000, 10000, 10000, 10000; // 10000
dw $147b, $147b, $147b, $147b, $147b, $147b, $147b, $147b // RoundUp( 2^19 / 100 )
dw 100, 100, 100, 100, 100, 100, 100, 100 // 100
dw $199a, $199a, $199a, $199a, $199a, $199a, $199a, $199a // RoundUp( 2^16 / 10 )
dd $30303030, $30303030, $30303030, $30303030 // To bytewise / ASCII
end;
function IntS32ToStr( X: Longword ): UnicodeString; register;
var
P, Q: PWideChar;
begin
SetLength( Result, 11 );
P := PWideChar( Pointer( Result ) );
// Full string buffer and set the length of the string with no resizing!
PLongword( ( NativeInt( Result ) - sizeof( Longword ) ) )^ := IntS32ToWide( X, P ) - P;
end;
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar;
{$if defined( CPUX86 )}
asm // eax = X, edx = P
cmp eax, 0
jge IntU32ToWide
mov word ptr [ edx ], Word( '-' )
neg eax
lea edx, [ edx + 2 ]
jmp IntU32ToWide
end;
{$else if defined( CPUX64 )}
asm // ecx = X, rdx = P
.NOFRAME
cmp ecx, 0
jge IntU32ToWide
mov word ptr [ rdx ], Word( '-' )
neg ecx
lea rdx, [ rdx + 2 ]
jmp IntU32ToWide
end;
{$else}
begin
if X >= 0 then begin
Result := IntU32ToWide( Longword( X ), P );
end else begin
P^ := '-';
Result := IntU32ToWide( Longword( -X ), P + 1 );
end;
end;
{$ifend}
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
{$if defined( CPUX86 )}
asm
cmp eax, 100000000
jb #Medium
#Large:
push edx
xor edx, edx
mov ecx, 100000000
div ecx
pop ecx
// eax = high one or two digit value, edx = 8 digit value, ecx = pointer
// Emit the first 2 digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ ecx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ eax * 2 + ecx + 18 ]
// edx = 8 digit value, ecx = pointer
// Emit 8 follow digits
movd xmm1, edx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ eax - 16 ], xmm0 // And save 8 digits at once
ret
#Medium:
cmp eax, 100
jb #Small
// eax 2..8 digits, edx = pointer
// Emit 2..8 digits
movd xmm1, eax // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg eax
movdqu dqword ptr [ edx ], xmm0
lea eax, [ edx + 16 + eax ]
ret
#Small:
// eax 1..2 digits, edx = pointer
// Emit one or two digits
mov eax, dword ptr [ DigitsClippedW + eax * 4 ]
mov [ edx ], eax
cmp eax, $10000
setae al
movzx eax, al
lea eax, [ edx + eax * 2 + 2 ]
end;
{$else if defined( CPUX64 )}
asm
cmp ecx, 100000000
jb #Medium
#Large:
mov r8, rdx // r8 = pointer
// Split up low 8 digits from high 1 or 2 digits..
mov eax, ecx
mov r9, 12379400392853802749 // RoundUp( 2^64+26 / 1e8 )
mul rax, r9
shr rdx, 26
mov r10, rdx // r10 = eax div 1e8
mov rax, rdx
mov r9, 100000000
mul rax, r9
sub ecx, eax // ecx = eax mod 1e8
// Emit the first 2 digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + r10 * 4 ]
mov dword ptr [ r8 ], eax
// advance pointer ( also for the next 8 bytes)
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rax * 2 + r8 + 2 + 16 ]
// ecx = 8 digit value, r8 = pointer + 8
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
por xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
shufps xmm0, xmm0, $4E
movdqu [ rax - 16 ], xmm0 // And save 8 digits at once
ret
#Medium:
cmp ecx, 100
jb #Small
// eax 2..8 digits, rdx = pointer
// Emit 2..8 digits
movd xmm1, ecx // xmm1 = Value
movdqa xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
pmuludq xmm0, xmm1
psrlq xmm0, 45 // xmm0 = xmm1 div 10000
pmuludq xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
paddd xmm0, xmm1 // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
psllq xmm0, 2
punpcklwd xmm0, xmm0
punpckldq xmm0, xmm0 // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
pmulhuw xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
movdqa xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ] // xmm2 := xmm0 * 10; shift to left one word.
pmullw xmm2, xmm0
psllq xmm2, 16
psubw xmm0, xmm2 // Extract digits
movdqa xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ] // Digits to ASCII
por xmm0, xmm1
shufps xmm0, xmm0, $4E
// Now we have 8 Unicode characters in the xmm0 register in the correct order.
pcmpeqw xmm1, xmm0 // scan for zeroes.
pmovmskb eax, xmm1
packuswb xmm0, xmm0 // convert to bytes
xor eax, $FFFF // change polarity
bsf eax, eax // amount to shift in bytes.
lea ecx, [ eax * 4 ]
movd xmm1, ecx
psrlq xmm0, xmm1 // bytes shifted.
pxor xmm2, xmm2
punpcklbw xmm0, xmm2
neg rax
movdqu dqword ptr [ rdx ], xmm0
lea rax, [ rdx + 16 + rax ]
ret
#Small:
// ecx 1..2 digits, rdx = pointer
// Emit one or two digits
lea r9, [ DigitsClippedW ]
mov eax, dword ptr [ r9 + rcx * 4 ]
mov [ rdx ], eax
cmp eax, $10000
setae al
movzx rax, al
lea rax, [ rdx + rax * 2 + 2 ]
end;
{$else}
begin
Assert( False, 'Not implemented.' );
end;
{$ifend}
In SynCommons.pas, you have also the following function:
function IntToString(Value: integer): string;
var tmp: array[0..15] of AnsiChar;
P: PAnsiChar;
begin
P := StrInt32(#tmp[15],Value);
Ansi7ToString(PWinAnsiChar(P),#tmp[15]-P,result);
end;
I suspect it will be also fast, even on Win64 platform. Slower than asm, but fast enough for small numbers (which tends to be most of the integer in the wild).
There will be only one memory allocation in this function, which is pretty fast even on Win64, thanks to the updated version of FastMM4, which has its own optimized x64 asm.
In my opinion, the key way to improve performance is to avoid heap allocations. The time spent by IntToStr doing the allocations is greater than the time spent doing the decimal conversion. And if you are wanting to use multiple threads then this is even more important because the default Delphi memory manager does not scale well under thread contention.
It's true that the decimal conversion can also be optimised, but I always try to optimise by picking off the low-hanging fruit first.
So, for the sake of completeness, in case these functions prove useful to others, here are my routines for heap allocation free integer to string conversion:
procedure DivMod(Dividend, Divisor: Cardinal; out Quotient, Remainder: Cardinal);
{$IFDEF CPUX86}
asm
PUSH EBX
MOV EBX,EDX
XOR EDX,EDX
DIV EBX
MOV [ECX],EAX
MOV EBX,Remainder
MOV [EBX],EDX
POP EBX
end;
{$ELSE IF Defined(CPUX64)}
asm
.NOFRAME
MOV EAX,ECX
MOV ECX,EDX
XOR EDX,EDX
DIV ECX
MOV [R8],EAX
MOV [R9],EDX
end;
{$ELSE}
{$Message Error 'Unrecognised platform.'}
{$ENDIF}
{$IFOPT R+}
{$DEFINE RANGECHECKSON}
{$R-}
{$ENDIF}
{$IFOPT Q+}
{$DEFINE OVERFLOWCHECKSON}
{$Q-}
{$ENDIF}
// disable range checks and overflow checks so that abs() functions in case Value = low(Value)
function CopyIntegerToAnsiBuffer(const Value: Integer; var Buffer: array of AnsiChar): Integer;
var
i, j: Integer;
val, remainder: Cardinal;
negative: Boolean;
tmp: array [0..15] of AnsiChar;
begin
negative := Value<0;
val := abs(Value);
Result := 0;
repeat
DivMod(val, 10, val, remainder);
tmp[Result] := AnsiChar(remainder + ord('0'));
inc(Result);
until val=0;
if negative then begin
tmp[Result] := '-';
inc(Result);
end;
Assert(Result<=Length(Buffer));
i := 0;
j := Result-1;
while i<Result do begin
Buffer[i] := tmp[j];
inc(i);
dec(j);
end;
end;
function CopyInt64ToAnsiBuffer(const Value: Int64; var Buffer: array of AnsiChar): Integer;
var
i, j: Integer;
val, remainder: UInt64;
negative: Boolean;
tmp: array [0..23] of AnsiChar;
begin
negative := Value<0;
val := abs(Value);
Result := 0;
repeat
DivMod(val, 10, val, remainder);
tmp[Result] := AnsiChar(remainder + ord('0'));
inc(Result);
until val=0;
if negative then begin
tmp[Result] := '-';
inc(Result);
end;
Assert(Result<=Length(Buffer));
i := 0;
j := Result-1;
while i<Result do begin
Buffer[i] := tmp[j];
inc(i);
dec(j);
end;
end;
{$IFDEF RANGECHECKSON}
{$R+}
{$UNDEF RANGECHECKSON}
{$ENDIF}
{$IFDEF OVERFLOWCHECKSON}
{$Q+}
{$UNDEF OVERFLOWCHECKSON}
{$ENDIF}
My use case requires an array of AnsiChar, but it is of course simple to amend these functions to populate WideChar arrays.
Related
Why is there a performance penalty for nested subroutines in Delphi?
A static analyzer we use has a report that says: Subprograms with local subprograms (OPTI7) This section lists subprograms that themselves have local subprograms. Especially when these subprograms share local variables, it can have a negative effect on performance. This guide says: Do not use nested routines Nested routines (routines within other routines; also known as "local procedures") require some special stack manipulation so that the variables of the outer routine can be seen by the inner routine. This results in a good bit of overhead. Instead of nesting, move the procedure to the unit scoping level and pass the necessary variables - if necessary by reference (use the var keyword) - or make the variable global at the unit scope. We were interested in knowing if we should take this report into consideration when validating our code. The answers to this question suggest that one should profile one's application to see if there is any performance difference, but not much is said about the difference between nested routines and normal subroutines. What is the actual difference between nested routines and normal routines and how may it cause a performance penalty?
tl;dr There are extra push/pops for nested subroutines Turning on optimizations may strip those away, such that the generated code is the same for both nested subroutines and normal subroutines Inlining results in the same code being generated for both nested and normal subroutines For simple routines with few parameters and local variables we perceived no performance difference even with optimizations turned off I wrote a little test to determine this, where GetRTClock is measuring the current time with a precision of 1ns: function subprogram_main(z : Integer) : Int64; var n : Integer; s : Int64; function subprogram_aux(n, z : Integer) : Integer; var i : Integer; begin // Do some useless work on the aux program for i := 0 to n - 1 do begin if (i > z) then z := z + i else z := z - i; end; Result := z; end; begin s := GetRTClock; // Do some minor work on the main program n := z div 100 * 100 + 100; // Call the aux program z := subprogram_aux(n, z); Result := GetRTClock - s; end; function normal_aux(n, z : Integer) : Integer; var i : Integer; begin // Do some useless work on the aux program for i := 0 to n - 1 do begin if (i > z) then z := z + i else z := z - i; end; Result := z; end; function normal_main(z : Integer) : Int64; var n : Integer; s : Int64; begin s := GetRTClock; // Do some minor work on the main program n := z div 100 * 100 + 100; // Call the aux program z := normal_aux(n, z); Result := GetRTClock - s; end; This compiles to: subprogram_main MyFormU.pas.41: begin 005CE7D0 55 push ebp 005CE7D1 8BEC mov ebp,esp 005CE7D3 83C4E0 add esp,-$20 005CE7D6 8945FC mov [ebp-$04],eax MyFormU.pas.42: s := GetRTClock; ... MyFormU.pas.45: n := z div 100 * 100 + 100; ... MyFormU.pas.47: z := subprogram_aux(n, z); 005CE7F8 55 push ebp 005CE7F9 8B55FC mov edx,[ebp-$04] 005CE7FC 8B45EC mov eax,[ebp-$14] 005CE7FF E880FFFFFF call subprogram_aux 005CE804 59 pop ecx 005CE805 8945FC mov [ebp-$04],eax MyFormU.pas.49: Result := GetRTClock - s; ... normal_main MyFormU.pas.70: begin 005CE870 55 push ebp 005CE871 8BEC mov ebp,esp 005CE873 83C4E0 add esp,-$20 005CE876 8945FC mov [ebp-$04],eax MyFormU.pas.71: s := GetRTClock; ... MyFormU.pas.74: n := z div 100 * 100 + 100; ... MyFormU.pas.76: z := normal_aux(n, z); 005CE898 8B55FC mov edx,[ebp-$04] 005CE89B 8B45EC mov eax,[ebp-$14] 005CE89E E881FFFFFF call normal_aux 005CE8A3 8945FC mov [ebp-$04],eax MyFormU.pas.78: Result := GetRTClock - s; ... subprogram_aux: MyFormU.pas.31: begin 005CE784 55 push ebp 005CE785 8BEC mov ebp,esp 005CE787 83C4EC add esp,-$14 005CE78A 8955F8 mov [ebp-$08],edx 005CE78D 8945FC mov [ebp-$04],eax MyFormU.pas.33: for i := 0 to n - 1 do begin 005CE790 8B45FC mov eax,[ebp-$04] 005CE793 48 dec eax 005CE794 85C0 test eax,eax 005CE796 7C29 jl $005ce7c1 005CE798 40 inc eax 005CE799 8945EC mov [ebp-$14],eax 005CE79C C745F000000000 mov [ebp-$10],$00000000 MyFormU.pas.34: if (i > z) then 005CE7A3 8B45F0 mov eax,[ebp-$10] 005CE7A6 3B45F8 cmp eax,[ebp-$08] 005CE7A9 7E08 jle $005ce7b3 MyFormU.pas.35: z := z + i 005CE7AB 8B45F0 mov eax,[ebp-$10] 005CE7AE 0145F8 add [ebp-$08],eax 005CE7B1 EB06 jmp $005ce7b9 MyFormU.pas.37: z := z - i; 005CE7B3 8B45F0 mov eax,[ebp-$10] 005CE7B6 2945F8 sub [ebp-$08],eax normal_aux: MyFormU.pas.55: begin 005CE824 55 push ebp 005CE825 8BEC mov ebp,esp 005CE827 83C4EC add esp,-$14 005CE82A 8955F8 mov [ebp-$08],edx 005CE82D 8945FC mov [ebp-$04],eax MyFormU.pas.57: for i := 0 to n - 1 do begin 005CE830 8B45FC mov eax,[ebp-$04] 005CE833 48 dec eax 005CE834 85C0 test eax,eax 005CE836 7C29 jl $005ce861 005CE838 40 inc eax 005CE839 8945EC mov [ebp-$14],eax 005CE83C C745F000000000 mov [ebp-$10],$00000000 MyFormU.pas.58: if (i > z) then 005CE843 8B45F0 mov eax,[ebp-$10] 005CE846 3B45F8 cmp eax,[ebp-$08] 005CE849 7E08 jle $005ce853 MyFormU.pas.59: z := z + i 005CE84B 8B45F0 mov eax,[ebp-$10] 005CE84E 0145F8 add [ebp-$08],eax 005CE851 EB06 jmp $005ce859 MyFormU.pas.61: z := z - i; 005CE853 8B45F0 mov eax,[ebp-$10] 005CE856 2945F8 sub [ebp-$08],eax The only difference is one push and one pop. What happens if we turn on optimizations? MyFormU.pas.47: z := subprogram_aux(n, z); 005CE7C5 8BD3 mov edx,ebx 005CE7C7 8BC6 mov eax,esi 005CE7C9 E8B6FFFFFF call subprogram_aux MyFormU.pas.76: z := normal_aux(n, z); 005CE82D 8BD3 mov edx,ebx 005CE82F 8BC6 mov eax,esi 005CE831 E8B6FFFFFF call normal_aux Both compile exactly to the same thing. What happens when inlining? MyFormU.pas.76: z := normal_aux(n, z); 005CE804 8BD3 mov edx,ebx 005CE806 8BC8 mov ecx,eax 005CE808 49 dec ecx 005CE809 85C9 test ecx,ecx 005CE80B 7C11 jl $005ce81e 005CE80D 41 inc ecx 005CE80E 33C0 xor eax,eax 005CE810 3BD0 cmp edx,eax 005CE812 7D04 jnl $005ce818 005CE814 03D0 add edx,eax 005CE816 EB02 jmp $005ce81a 005CE818 2BD0 sub edx,eax 005CE81A 40 inc eax 005CE81B 49 dec ecx 005CE81C 75F2 jnz $005ce810 subprogram_main: MyFormU.pas.47: z := subprogram_aux(n, z); 005CE7A8 8BD3 mov edx,ebx 005CE7AA 8BC8 mov ecx,eax 005CE7AC 49 dec ecx 005CE7AD 85C9 test ecx,ecx 005CE7AF 7C11 jl $005ce7c2 005CE7B1 41 inc ecx 005CE7B2 33C0 xor eax,eax 005CE7B4 3BD0 cmp edx,eax 005CE7B6 7D04 jnl $005ce7bc 005CE7B8 03D0 add edx,eax 005CE7BA EB02 jmp $005ce7be 005CE7BC 2BD0 sub edx,eax 005CE7BE 40 inc eax 005CE7BF 49 dec ecx 005CE7C0 75F2 jnz $005ce7b4 Again, no difference. I also profiled this little example, taking an average of 30 executions for each (normal and subprogram), called in random order: constructor TForm1.Create(AOwner: TComponent); const c_nSamples = 60; rnd_sample : array[0..c_nSamples - 1] of byte = (1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0); var subprogram_gt_ns : Int64; normal_gt_ns : Int64; rnd_input : Integer; i : Integer; begin inherited Create(AOwner); normal_gt_ns := 0; subprogram_gt_ns := 0; rnd_input := Random(1000); for i := 0 to c_nSamples - 1 do if (rnd_sample[i] = 1) then Inc(subprogram_gt_ns, subprogram_main(rnd_input)) else Inc(normal_gt_ns, normal_main(rnd_input)); OutputDebugString(PChar(' Normal ' + FloatToStr(normal_gt_ns / 30) + ' Subprogram ' + FloatToStr(subprogram_gt_ns / 30))); end; There is no significant difference even with optimizations turned off: Debug Output: Normal 1166,66666666667 Subprogram 1203,33333333333 Process MyProject.exe (1824) Finally, both texts that warn about performance mention something about shared local variables. If we do not pass z to subprogram_aux, instead access it directly, we get: MyFormU.pas.47: z := subprogram_aux(n); 005CE7D2 55 push ebp 005CE7D3 8BC3 mov eax,ebx 005CE7D5 E8AAFFFFFF call subprogram_aux 005CE7DA 59 pop ecx 005CE7DB 8945FC mov [ebp-$04],eax Even with optimizations turned on.
Using FPC .o files in Delphi 2010
I've written quite a big library for matrix operations for Delphi and FPC. There exists now an extension for this library for the Intel AVX extension but I could only manage to get that compiled in FPC. My idea was to create .o files in FPC which contains the AVX assembler codes and include these files in Delphi. I tried to follow this question here: Linking FPC .o files into Delphi but without success. I was able to dump the function names and tried to import these in the Delphi unit. The problem is that I always get an error saying that the .o files is in the wrong format. I use CodeTyphoon for compilation which internally uses FPC 3.1.1 and Delphi2010 as a first try. The code is once compiled in FPC and one time in Delphi using the approriate ifdefs. My base code looks like this (just an excerpt): // ################################################################### // #### This file is part of the mathematics library project, and is // #### offered under the licence agreement described on // #### http://www.mrsoft.org/ // #### // #### Copyright:(c) 2011, Michael R. . All rights reserved. // #### // #### Unless required by applicable law or agreed to in writing, software // #### distributed under the License is distributed on an "AS IS" BASIS, // #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // #### See the License for the specific language governing permissions and // #### limitations under the License. // ################################################################### unit AVXMatrixMultOperations; interface {$IFDEF CPUX64} {$DEFINE x64} {$ENDIF} {$IFDEF cpux86_64} {$DEFINE x64} {$ENDIF} {$IFNDEF x64} uses MatrixConst; {$IFNDEF FPC} // this fails -> wrong object format {$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'} {$ENDIF} // full matrix operations procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt); {$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF} {$ENDIF} implementation {$IFDEF FPC} {$ASMMODE intel} {$ENDIF} {$IFNDEF x64} {$IFDEF FPC} procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt); var bytesWidth2, destOffset : TASMNativeInt; iter : TASMNativeInt; {$IFDEF FPC} begin {$ENDIF} asm // prolog - simulate stack push ebx; push edi; push esi; mov ecx, dest; mov edi, width1; imul edi, -8; mov iter, edi; sub mt1, edi; //destOffset := destLineWidth - Width2*sizeof(double); mov ebx, Width2; shl ebx, 3; mov eax, destLineWidth; sub eax, ebx; mov destOffset, eax; //bytesWidth2 := width2*sizeof(double); mov bytesWidth2, ebx; // for y := 0 to height1 - 1 do ##foryloop: // r12 -> counter to width2 mov esi, width2; sub esi, 2; jl #LastXColumn; ##forxloop: // for x := 0 to width2 div 2 - 1 // esi: mt1 - width1*sizeof(double) // mt2: mt2 mov edx, mt1; mov ebx, mt2; mov eax, iter; mov edi, LineWidth2; vxorpd ymm0, ymm0, ymm0; vxorpd ymm1, ymm1, ymm1; cmp eax, -32; jg ##Innerloop2Begin; // for z := 0 to width1 - 1do // AVX part: ##InnerLoop1: // 4x4 block vmovapd xmm2, [ebx]; add ebx, edi; vmovapd xmm4, xmm2; vmovapd xmm3, [ebx]; add ebx, edi; // shuffle so we can multiply // swap such that we can immediately multiply vmovlhps xmm2, xmm2, xmm3; vmovhlps xmm3, xmm3, xmm4; // next 4 elements vmovapd xmm4, [ebx]; add ebx, edi; vmovapd xmm6, xmm4; vmovapd xmm5, [ebx]; add ebx, edi; vmovapd ymm7, [edx + eax] vmovlhps xmm4, xmm4, xmm5; vmovhlps xmm5, xmm5, xmm6; vinsertf128 ymm2, ymm2, xmm4, 1; vinsertf128 ymm3, ymm3, xmm5, 1; // now multiply and add vmulpd ymm2, ymm2, ymm7; vmulpd ymm3, ymm3, ymm7; vaddpd ymm0, ymm0, ymm2; vaddpd ymm1, ymm1, ymm3; add eax, 32; jl ##InnerLoop1; vextractf128 xmm2, ymm0, 1; vextractf128 xmm3, ymm1, 1; vhaddpd xmm0, xmm0, xmm2; vhaddpd xmm1, xmm1, xmm3; test eax, eax; jz ##InnerLoopEnd2; ##Innerloop2Begin: // rest in single elements ##InnerLoop2: vmovapd xmm2, [ebx]; add ebx, edi; vmovddup xmm3, [edx + eax]; vmulpd xmm2, xmm2, xmm3; vmovhlps xmm4, xmm4, xmm2; vaddsd xmm0, xmm0, xmm2; vaddsd xmm1, xmm1, xmm4; add eax, 8; jnz ##InnerLoop2; ##InnerLoopEnd2: // finall horizontal addition vhaddpd xmm0, xmm0, xmm1; vmovapd [ecx], xmm0; // increment the pointers // inc(mt2), inc(dest); //add dword ptr [mt2], 8; add mt2, 16; add ecx, 16; // end for x := 0 to width2 div 2 - 1 sub esi, 2; jge ##forxloop; #LastXColumn: cmp esi, -1; jne #NextLine; // last column of mt2 mov eax, iter; mov ebx, mt2; vxorpd xmm0, xmm0, xmm0; #InnerLoop2: vmovsd xmm1, [edx + eax]; vmovsd xmm2, [ebx]; vmulsd xmm1, xmm1, xmm2; vaddsd xmm0, xmm0, xmm1; add ebx, edi; add eax, 8; jnz #InnerLoop2; vmovsd [ecx], xmm0; add ecx, 8; add mt2, 8; #NextLine: // dec(mt2, Width2); // inc(PByte(mt1), LineWidth1); // inc(PByte(dest), destOffset); //mov ebx, bytesWidth2; //sub dword ptr [mt2], ebx; mov eax, bytesWidth2; sub mt2, eax; mov eax, LineWidth1; add mt1, eax; add ecx, destOffset; // end for y := 0 to height1 - 1 //dec eax; dec height1; jnz ##foryloop; // epilog vzeroupper; pop esi; pop edi; pop ebx; end; {$IFDEF FPC} end; {$ENDIF} {$ENDIF} {$ENDIF} end.
Since there is a single function involved here, the easiest is IMHO to convert the FPC AVXMatrixMultOperations.o file directly. Use the great Object file converter tool. You may try to convert from one binary format to another, accepted by Delphi. But I guess that the cleanest way is to convert it to asm: objconv -fasm AVXMatrixMultOperations.o It will create a AVXMatrixMultOperations.asm file, which could be used to replace the unknown AVX instructions by simple db ..,..,..,.. bytes. Typically, the generated .asm file has the assembler on the left side, and the raw hexadecimal bytes on the right side. This is how I dealt with old Delphi compilers in my libraries, for instance: function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal; asm // eax=crc, edx=buf, ecx=len not eax test ecx, ecx jz #0 test edx, edx jz #0 #3: test edx, 3 jz #8 // align to 4 bytes boundary {$ifdef ISDELPHI2010} crc32 eax, byte ptr[edx] {$else} db $F2, $0F, $38, $F0, $02 {$endif} inc edx .... So in your case, something like {$ifdef FPC} vinsertf128 ymm2, ymm2, xmm4, 1; vinsertf128 ymm3, ymm3, xmm5, 1; {$else} db $xx,$yy,$zz db $xx,$yy,$zz {$endif}
Combining ASM with non-asm code (or SwapInt64 ASM function needed)
I need to process a file that is coming from the old Mac era (old Motorola CPU). The bytes are big endian so I have a function that swaps and Int64 to Intel little endian. The function is ASM and works on 32 bit CPU but not on 64. For 64 bit I have a different function that is not ASM. I want to combine the functions using IFDEF. Can I do this? Will it be a problem? interface function SwapInt64(Value: Int64): Int64; assembler; implementation {$IFDEF CPUx86} function SwapInt64(Value: Int64): Int64; assembler; { Does not work on 64 bit } { asm MOV EDX,[DWORD PTR EBP + 12] MOV EAX,[DWORD PTR EBP + 8] BSWAP EAX XCHG EAX,EDX BSWAP EAX end; {$else} function SwapInt64 (Value: Int64): Int64; var P: PInteger; begin Result: = (Value shl 32) or (Value shr 32); P: = #Result; P ^: = (Swap (P ^) shl 16) or (Swap (P ^ shr 16)); Inc (P); P ^: = (Swap (P ^) shl 16) or (Swap (P ^ shr 16)); end; {$ENDIF} I think the compiler will correctly compile/call the appropriate function no matter if one is ASM and the other is Pascal.
What you are proposing is perfectly fine. It is a quite reasonable approach. If you want a 64 bit swap in asm, for x64, that's quite simple: function SwapInt64(Value: Int64): Int64; asm MOV RAX,RCX BSWAP RAX end; Combine this with the 32 bit version using conditional, as you have done in the question. function SwapInt64(Value: Int64): Int64; {$IF Defined(CPUX86)} asm MOV EDX,[DWORD PTR EBP + 12] MOV EAX,[DWORD PTR EBP + 8] BSWAP EAX XCHG EAX,EDX BSWAP EAX end; {$ELSEIF Defined(CPUX64)} asm MOV RAX,RCX BSWAP RAX end; {$ELSE} {$Message Fatal 'Unsupported architecture'} {$ENDIF} Or include a Pascal implementation in the {$ELSE} block.
The approach of swapping the bytes in a separate routine that cannot be inlined is a bit silly if performance is what you're after. A better way to a assume you've got a block of data and all dword/qwords in it need to have their endianness changed. This would look something like this. For dwords function SwapDWords(var Data; size: cardinal): boolean; {ifdef CPUX64} asm //Data in RCX, Size in EDX xor EAX,EAX //failure test EDX,3 jz #MultipleOf4 #error: ret #MultipleOf4 neg EDX //Count up instead of down jz #done ADD RCX,RDX #loop mov R8d, [RCX+RDX] bswap R8d mov [RCX+RDX],R8d add RDX,4 //add is faster than inc on modern processors jnz #loop #done: inc EAX //success ret end; For qwords function SwapQWords(var Data; size: cardinal): boolean; {ifdef CPUX64} asm //Data in RCX, Size in EDX xor EAX,EAX //failure test EDX,7 jz #MultipleOf8 #error: ret #MultipleOf8 neg EDX //Count up instead of down jz #done ADD RCX,RDX #loop mov R8, [RCX+RDX] bswap R8 mov [RCX+RDX],R8 add RDX,8 //add is faster than inc on modern processors jnz #loop #done: inc EAX //success ret end; If you're already on 64 bit, then you have SSE2, and can use the 128-bit SSE registers. Now you can process 4 dwords at a time, effectively unrolling the loop 4 times. See: http://www.asmcommunity.net/forums/topic/?id=29743 movntpd xmm5,[RCX+RDX] //non-temporal move to avoid polluting the cache movdqu xmm0, xmm5 movdqu xmm1, xmm5 pxor xmm5, xmm5 punpckhbw xmm0, xmm5 ; interleave '0' with bytes of original punpcklbw xmm1, xmm5 ; so they become words pshuflw xmm0, xmm0, 27 ; swap the words by shuffling pshufhw xmm0, xmm0, 27 ;//27 = B00_01_10_11 pshuflw xmm1, xmm1, 27 pshufhw xmm1, xmm1, 27 packuswb xmm1, xmm0 ; make the words back into bytes. movntpd [RCX+RDX], xmm1 //non-temporal move to keep the cache clean.
Simply use either LEToN() or BEtoN() Use the LE variant if the data is little endian (e.g. 32 or 64-bits x86 mac, modern arm), use the BE if the source data (e.g. file from disk) is in big endian format. Depending on the used architecture a swap or "nothing" will be inlined, usually a fairly optimal one for single conversions. For block oriented solutions see the posted SSE code (or Agner Fog's)
Optimizing SIMD histogram calculation
I worked on a code that implements an histogram calculation given an opencv struct IplImage * and a buffer unsigned int * to the histogram. I'm still new to SIMD so I might not be taking advantage of the full potential the instruction set provides. histogramASM: xor rdx, rdx xor rax, rax mov eax, dword [imgPtr + imgWidthOffset] mov edx, dword [imgPtr + imgHeightOffset] mul rdx mov rdx, rax ; rdx = Image Size mov r10, qword [imgPtr + imgDataOffset] ; r10 = ImgData NextPacket: mov rax, rdx movdqu xmm0, [r10 + rax - 16] mov rcx,16 ; 16 pixels/paq PacketLoop: pextrb rbx, xmm0, 0 ; saving the pixel value on rbx shl rbx,2 inc dword [rbx + Hist] psrldq xmm0,1 loop PacketLoop sub rdx,16 cmp rdx,0 jnz NextPacket ret On C, I'd be running these piece of code to obtain the same result. imgSize = (img->width)*(img->height); pixelData = (unsigned char *) img->imageData; for(i = 0; i < imgSize; i++) { pixel = *pixelData; hist[pixel]++; pixelData++; } But the time it takes for both, measured in my computer with rdtsc(), is only 1.5 times better SIMD's assembler. Is there a way to optimize the code above and quickly fill the histogram vector with SIMD? Thanks in advance
Like Jester I'm surprised that your SIMD code had any significant improvement. Did you compile the C code with optimization turned on? The one additional suggestion I can make is to unroll your Packetloop loop. This is a fairly simple optimization and reduces the number of instructions per "iteration" to just two: pextrb ebx, xmm0, 0 inc dword [ebx * 4 + Hist] pextrb ebx, xmm0, 1 inc dword [ebx * 4 + Hist] pextrb ebx, xmm0, 2 inc dword [ebx * 4 + Hist] ... pextrb ebx, xmm0, 15 inc dword [ebx * 4 + Hist] If you're using NASM you can use the %rep directive to save some typing: %assign pixel 0 %rep 16 pextrb rbx, xmm0, pixel inc dword [rbx * 4 + Hist] %assign pixel pixel + 1 %endrep
Delphi - Detect Int64 Overflow Error
In Delphi how can I detect overflow errors for Int64? For Integers we could do: type MyInt = Integer; //Int64 function TryMaxTimes10(out Res: MyInt): boolean; var a, b: MyInt; begin {$Q+} try a := High(MyInt); b := 10; Res := a * b; //REF1 Result := True; except Result := False; end; {$Q-} end; For MyInt = Integer, line REF1 gives an exception and so TryMaxTimes10 returns false. But if we change MyInt to MyInt = Int64, then REF1 does not give an exception and TryMaxTimes10 returns true! I understand that the help for {$Q+} does not specifically mention Int64: ... {$Q+} state, certain integer arithmetic operations ... are checked for overflow. QUESTION: So my question is, how can we detect overflow errors for Int64? (I'm using Delphi 7. Does the same thing happen in newer versions of Delphi?)
This is a known issue. See http://qc.embarcadero.com/wc/qcmain.aspx?d=10185, and the comments Andy wrote at the bottom. My suggestion would be to create a function (I did not compile nor test this - just an example): function Foo(A, B : Int64) : Int64; var bNeg : boolean; begin // Do we expect a negative result? bNeg := ((a < 0) xor (b < 0)); // Get the real result Result := a * b; // If the result is wrong, raise an error if ((Result < 0) xor bNeg) then begin // Raise EOverFlow end; end;
This bug has been fixed in RAD Studio 10.2 Tokyo. The issue can be found here (but one have to log in with embarcadero account to see it). Here is correct version of __llmulo by John O'Harrow (licensed under MPL 1.1) shipped with Delphi versions 10.2 and above: // Param 1(edx:eax), Param 2([esp+8]:[esp+4]) // Result is stored in edx:eax // O-flag set on exit => result is invalid // O-flag clear on exit => result is valid procedure __llmulo(); asm test edx, edx {Param1-Hi = 0?} jne ##Large {No, More than one multiply may be needed} cmp edx, [esp+8] {Param2-Hi = 0?} jne ##Large {No, More than one multiply may be needed} mul dword ptr [esp+4] {Only one multiply needed, Set Result} and eax, eax {Clear Overflow Flag} ret 8 ##Large: sub esp, 28 {allocate local storage} mov [esp], ebx {save used registers} mov [esp+4], esi mov [esp+8], edi mov [esp+12], ebp mov ebx, [esp+32] {Param2-Lo} mov ecx, [esp+36] {Param2-Hi} mov esi, edx mov edi, ecx sar esi, 31 sar edi, 31 xor eax, esi xor edx, esi sub eax, esi sbb edx, esi {edx:eax (a1:a0) = abs(Param1)} xor ebx, edi xor ecx, edi sub ebx, edi sbb ecx, edi {ecx:ebx (b1:b0) = abs(Param2)} xor esi, edi {Sign Flag, 0 if Params have same sign else -1} mov [esp+16], eax {a0} mov [esp+20], edx {a1} mov [esp+24], ecx {b1} mul ebx {edx:eax (c1:c0) = a0*b0} xchg ebx, edx {ebx = c1, edx = b0} mov edi, eax {abs(Result-Lo) = c0} xor ecx, ecx {Upper 32 bits of 128 bit result} xor ebp, ebp {Second 32 bits of 128 bit result} mov eax, [esp+20] {a1} mul edx {edx:eax (d1:d0) = a1*b0} add ebx, eax {c1 + d0} adc ebp, edx {d1 + carry} adc ecx, 0 {Possible carry into Upper 32 bits} mov eax, [esp+16] {a0} mov edx, [esp+24] {b1} mul edx {edx:eax (e1:e0) = a0*b1} add ebx, eax {abs(Result-Hi) = c1 + d0 + e0} adc ebp, edx {d1 + e1 + carry} adc ecx, 0 {Possible carry into Upper 32 bits} mov eax, [esp+20] {a1} mov edx, [esp+24] {b1} mul edx {edx:eax (f1:f0) = a1*b1} add ebp, eax {d1 + e1 + f0 + carry} adc ecx, edx {f1 + carry} or ecx, ebp {Overflow if ecx <> 0 or ebp <> 0} jnz ##Overflow mov edx, ebx {Set abs(Result-Hi)} mov eax, edi {Set abs(Result-Lo)} cmp edx, $80000000 jae ##CheckRange {Possible Overflow if edx>=$80000000} ##SetSign: xor eax, esi {Correct Sign of Result} xor edx, esi sub eax, esi sbb edx, esi mov ebx, [esp] {restore used registers} mov esi, [esp+4] mov edi, [esp+8] mov ebp, [esp+12] add esp, 28 {Clears Overflow flag} ret 8 ##CheckRange: jne ##Overflow {Overflow if edx>$80000000} test esi, esi {edx=$80000000, Is Sign Flag=0?} jnz ##SetSign {No, Result is Ok (-MaxInt64)} ##Overflow: mov ebx, [esp] {restore used registers} mov esi, [esp+4] mov edi, [esp+8] mov ebp, [esp+12] add esp, 28 mov ecx, $80000000 dec ecx {Set Overflow Flag} ret 8 end;