InttoStr/StrToInt 的更快替代品

我想知道是否有比System.IntToStr / System.StrToInt更快的替代方案。有一个快速版本，但只有 UTF8。这是从SynCommons.pas Int32ToUTF8的，并且由于字符串转换缓慢，它必然会很慢。purepascal RTL 版本对于 64 位来说真的很慢。

此例程比 RTL 中的例程快约 40%。如果使用 WideChar[] 缓冲区，速度可能会快得多，因为字符串分配占用了转换例程所用时间的 75%：

IntS32至宽： 5,50 纳秒/项（PWideChar）
IntToStr： 34,51 纳秒/项（RTL）
IntS32ToStr： 24,77 ns/项目（RTL 替代）

请注意，下面的例程使用 SSE2，并且仅完全实现和测试了 x86 和 x64 版本。

在初始化中：

function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
function IntS32ToWide( X: Integer;  P: PWideChar ): PWideChar; register;
function IntS32ToStr ( X: Longword ): UnicodeString; register; inline;

在实现中：

{$CODEALIGN 16}
{$ALIGN 16}
const
  DigitsClippedW: array [ 0..99 ] of LongWord = (
    $000030, $000031, $000032, $000033, $000034, $000035, $000036, $000037, $000038, $000039,
    $300031, $310031, $320031, $330031, $340031, $350031, $360031, $370031, $380031, $390031,
    $300032, $310032, $320032, $330032, $340032, $350032, $360032, $370032, $380032, $390032,
    $300033, $310033, $320033, $330033, $340033, $350033, $360033, $370033, $380033, $390033,
    $300034, $310034, $320034, $330034, $340034, $350034, $360034, $370034, $380034, $390034,
    $300035, $310035, $320035, $330035, $340035, $350035, $360035, $370035, $380035, $390035,
    $300036, $310036, $320036, $330036, $340036, $350036, $360036, $370036, $380036, $390036,
    $300037, $310037, $320037, $330037, $340037, $350037, $360037, $370037, $380037, $390037,
    $300038, $310038, $320038, $330038, $340038, $350038, $360038, $370038, $380038, $390038,
    $300039, $310039, $320039, $330039, $340039, $350039, $360039, $370039, $380039, $390039 );
// Delphi XE3 has no working alignment for 16 bytes for data but it has alignment for 16 bytes for code!
// So we encode our constants as a procedure and use constant offsets to the data.
const
  Div10000_Shl45d        = $00;
  Shl16_minus_10000d     = $10;
  Div_1000_100_10_1w     = $20;
  Shl_1000_100_10_1w     = $30;
  Mul_10w                = $40;
  To_Asciiw              = $50;
  Mul_10000d             = $60;
  Div100_Shl19w          = $70;
  Mul100w                = $80;
  Div10_shl16w           = $90;
  To_Asciib              = $A0;
procedure IntUToStrConsts();
asm
  {$if defined( CPUX64 )}.NOFRAME{$ifend}
  dd $d1b71759,         $d1b71759,          $d1b71759,         $d1b71759;          // RoundUp( 2^45 / 10000 )
  dd $10000 - 10000,    $10000 - 10000,     $10000 - 10000,    $10000 - 10000;     // 1 shl 16 - 1e4
  dw 8389,    5243,     13108,    $8000,    8389,    5243,     13108,    $8000;    // 1000 100 10 1 div
  dw 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15, 1 shl 7, 1 shl 11, 1 shl 13, 1 shl 15; // 1000 100 10 1 shr
  dw 10,      10,       10,       10,       10,      10,       10,       10;       // 10
  dw $30,     $30,      $30,      $30,      $30,     $30,      $30,      $30;      // To Unicode / ASCII
  dd 10000,             10000,              10000,             10000;              // 10000
  dw $147b,   $147b,    $147b,    $147b,    $147b,   $147b,    $147b,    $147b     // RoundUp( 2^19 / 100 )
  dw 100,     100,      100,      100,      100,     100,      100,      100       // 100
  dw $199a,   $199a,    $199a,    $199a,    $199a,   $199a,    $199a,    $199a     // RoundUp( 2^16 / 10 )
  dd $30303030,         $30303030,          $30303030,         $30303030           // To bytewise / ASCII
end;

function IntS32ToStr( X: Longword ): UnicodeString; register;
var
  P, Q: PWideChar;
begin
  SetLength( Result, 11 );
  P := PWideChar( Pointer( Result ) );
  // Full string buffer and set the length of the string with no resizing!
  PLongword( ( NativeInt( Result ) - sizeof( Longword ) ) )^ := IntS32ToWide( X, P ) - P;
end;
function IntS32ToWide( X: Integer; P: PWideChar ): PWideChar;
{$if defined( CPUX86 )}
asm // eax = X, edx = P
  cmp  eax, 0
  jge  IntU32ToWide
  mov  word ptr [ edx ], Word( '-' )
  neg  eax
  lea  edx, [ edx + 2 ]
  jmp  IntU32ToWide
end;
{$else if defined( CPUX64 )}
asm // ecx = X, rdx = P
  .NOFRAME
  cmp  ecx, 0
  jge  IntU32ToWide
  mov  word ptr [ rdx ], Word( '-' )
  neg  ecx
  lea  rdx, [ rdx + 2 ]
  jmp  IntU32ToWide
end;
{$else}
begin
  if X >= 0 then begin
    Result := IntU32ToWide( Longword( X ), P );
  end else begin
    P^ := '-';
    Result := IntU32ToWide( Longword( -X ), P + 1 );
  end;
end;
{$ifend}
function IntU32ToWide( X: Longword; P: PWideChar ): PWideChar; register;
{$if defined( CPUX86 )}
asm
  cmp       eax, 100000000
  jb        @Medium
@Large:
  push      edx
  xor       edx, edx
  mov       ecx, 100000000
  div       ecx
  pop       ecx
  //        eax = high one or two digit value, edx = 8 digit value, ecx = pointer
  // Emit the first 2 digits
  mov       eax, dword ptr [ DigitsClippedW + eax * 4 ]
  mov       [ ecx ], eax
  cmp       eax, $10000
  setae     al
  movzx     eax, al
  lea       eax, [ eax * 2 + ecx + 18 ]
  //        edx = 8 digit value, ecx = pointer
  // Emit 8 follow digits
  movd      xmm1, edx          // xmm1 = Value
  movdqa    xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
  pmuludq   xmm0, xmm1
  psrlq     xmm0, 45           // xmm0 = xmm1 div 10000
  pmuludq   xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
  paddd     xmm0, xmm1         // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
  psllq     xmm0, 2
  punpcklwd xmm0, xmm0
  punpckldq xmm0, xmm0         // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
  movdqa    xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ]            // xmm2 := xmm0 * 10; shift to left one word.
  pmullw    xmm2, xmm0
  psllq     xmm2, 16
  psubw     xmm0, xmm2         // Extract digits
  por       xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ]          // Digits to ASCII
  shufps    xmm0, xmm0, $4E
  movdqu    [ eax - 16 ], xmm0    // And save 8 digits at once
  ret
@Medium:
  cmp       eax, 100
  jb        @Small
  //        eax 2..8 digits, edx = pointer
  // Emit 2..8 digits
  movd      xmm1, eax          // xmm1 = Value
  movdqa    xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
  pmuludq   xmm0, xmm1
  psrlq     xmm0, 45           // xmm0 = xmm1 div 10000
  pmuludq   xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
  paddd     xmm0, xmm1         // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
  psllq     xmm0, 2
  punpcklwd xmm0, xmm0
  punpckldq xmm0, xmm0         // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
  movdqa    xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ]            // xmm2 := xmm0 * 10; shift to left one word.
  pmullw    xmm2, xmm0
  psllq     xmm2, 16
  psubw     xmm0, xmm2         // Extract digits
  movdqa    xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ]          // Digits to ASCII
  por       xmm0, xmm1
  shufps    xmm0, xmm0, $4E
  // Now we have 8 Unicode characters in the xmm0 register in the correct order.
  pcmpeqw   xmm1, xmm0         // scan for zeroes.
  pmovmskb  eax, xmm1
  packuswb  xmm0, xmm0         // convert to bytes
  xor       eax, $FFFF         // change polarity
  bsf       eax, eax           // amount to shift in bytes.
  lea       ecx, [ eax * 4 ]
  movd      xmm1, ecx
  psrlq     xmm0, xmm1         // bytes shifted.
  pxor      xmm2, xmm2
  punpcklbw xmm0, xmm2
  neg       eax
  movdqu    dqword ptr [ edx ], xmm0
  lea       eax, [ edx + 16 + eax ]
  ret
@Small:
  //        eax 1..2 digits, edx = pointer
  // Emit one or two digits
  mov       eax, dword ptr [ DigitsClippedW + eax * 4 ]
  mov       [ edx ], eax
  cmp       eax, $10000
  setae     al
  movzx     eax, al
  lea       eax, [ edx + eax * 2 + 2 ]
end;
{$else if defined( CPUX64 )}
asm
  cmp       ecx, 100000000
  jb        @Medium
@Large:
  mov       r8,  rdx                  // r8 = pointer
  // Split up low 8 digits from high 1 or 2 digits..
  mov       eax, ecx
  mov       r9,  12379400392853802749 // RoundUp( 2^64+26 / 1e8 )
  mul       rax, r9
  shr       rdx, 26
  mov       r10, rdx                  // r10 = eax div 1e8
  mov       rax, rdx
  mov       r9,  100000000
  mul       rax, r9
  sub       ecx, eax                  // ecx = eax mod 1e8
  // Emit the first 2 digits
  lea       r9, [ DigitsClippedW ]
  mov       eax, dword ptr [ r9 + r10 * 4 ]
  mov       dword ptr [ r8 ], eax
  // advance pointer ( also for the next 8 bytes)
  cmp       eax, $10000
  setae     al
  movzx     rax, al
  lea       rax, [ rax * 2 + r8 + 2 + 16 ]
  // ecx = 8 digit value, r8 = pointer + 8
  movd      xmm1, ecx          // xmm1 = Value
  movdqa    xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
  pmuludq   xmm0, xmm1
  psrlq     xmm0, 45           // xmm0 = xmm1 div 10000
  pmuludq   xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
  paddd     xmm0, xmm1         // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
  psllq     xmm0, 2
  punpcklwd xmm0, xmm0
  punpckldq xmm0, xmm0         // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
  movdqa    xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ]            // xmm2 := xmm0 * 10; shift to left one word.
  pmullw    xmm2, xmm0
  psllq     xmm2, 16
  psubw     xmm0, xmm2         // Extract digits
  por       xmm0, dqword ptr [ IntUToStrConsts + To_ASCIIw ]          // Digits to ASCII
  shufps    xmm0, xmm0, $4E
  movdqu    [ rax - 16 ], xmm0    // And save 8 digits at once
  ret
@Medium:
  cmp       ecx, 100
  jb        @Small
  //        eax 2..8 digits, rdx = pointer
  //        Emit 2..8 digits
  movd      xmm1, ecx          // xmm1 = Value
  movdqa    xmm0, dqword ptr [ IntUToStrConsts + Div10000_Shl45d ]
  pmuludq   xmm0, xmm1
  psrlq     xmm0, 45           // xmm0 = xmm1 div 10000
  pmuludq   xmm0, dqword ptr [ IntUToStrConsts + Shl16_minus_10000d ]
  paddd     xmm0, xmm1         // xmm0 = word( lo digits ), word( hi digit ), 0 (6x)
  psllq     xmm0, 2
  punpcklwd xmm0, xmm0
  punpckldq xmm0, xmm0         // xmm0 *= 4 (lo, lo, lo, lo, hi, hi, hi, hi)W (LSW, MSW)
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Div_1000_100_10_1w ]
  pmulhuw   xmm0, dqword ptr [ IntUToStrConsts + Shl_1000_100_10_1w ] // xmm0 = ( lo, lo div 10, lo div 100, lo div 100, (same with hi) )W
  movdqa    xmm2, dqword ptr [ IntUToStrConsts + Mul_10w ]            // xmm2 := xmm0 * 10; shift to left one word.
  pmullw    xmm2, xmm0
  psllq     xmm2, 16
  psubw     xmm0, xmm2         // Extract digits
  movdqa    xmm1, dqword ptr [ IntUToStrConsts + To_ASCIIw ]          // Digits to ASCII
  por       xmm0, xmm1
  shufps    xmm0, xmm0, $4E
  // Now we have 8 Unicode characters in the xmm0 register in the correct order.
  pcmpeqw   xmm1, xmm0         // scan for zeroes.
  pmovmskb  eax, xmm1
  packuswb  xmm0, xmm0         // convert to bytes
  xor       eax, $FFFF         // change polarity
  bsf       eax, eax           // amount to shift in bytes.
  lea       ecx, [ eax * 4 ]
  movd      xmm1, ecx
  psrlq     xmm0, xmm1         // bytes shifted.
  pxor      xmm2, xmm2
  punpcklbw xmm0, xmm2
  neg       rax
  movdqu    dqword ptr [ rdx ], xmm0
  lea       rax, [ rdx + 16 + rax ]
  ret
@Small:
  //        ecx 1..2 digits, rdx = pointer
  // Emit one or two digits
  lea       r9, [ DigitsClippedW ]
  mov       eax, dword ptr [ r9 + rcx * 4 ]
  mov       [ rdx ], eax
  cmp       eax, $10000
  setae     al
  movzx     rax, al
  lea       rax, [ rdx + rax * 2 + 2 ]
end;
{$else}
begin
  Assert( False, 'Not implemented.' );
end;
{$ifend}

在SynCommons.pas中，您还有以下功能：

function IntToString(Value: integer): string;
var tmp: array[0..15] of AnsiChar;
    P: PAnsiChar;
begin
  P := StrInt32(@tmp[15],Value);
  Ansi7ToString(PWinAnsiChar(P),@tmp[15]-P,result);
end;

我怀疑它也会很快，即使在 Win64 平台上也是如此。比 asm 慢，但对于小数字（通常是野外大部分整数）来说足够快。

此函数中只有一个内存分配，即使在 Win64 上也非常快，这要归功于 FastMM4 的更新版本，它具有自己优化的 x64 asm。

在我看来，提高性能的关键方法是避免堆分配。IntToStr执行分配所花费的时间大于执行十进制转换所花费的时间。如果您想使用多个线程，那么这一点更为重要，因为默认的 Delphi 内存管理器在线程争用下无法很好地扩展。

的确，十进制转换也可以优化，但我总是试图通过先摘掉唾手可得的果实来优化。

因此，为了完整起见，如果这些函数证明对其他人有用，以下是我用于堆分配自由整数到字符串的例程：

procedure DivMod(Dividend, Divisor: Cardinal; out Quotient, Remainder: Cardinal);
{$IFDEF CPUX86}
asm
        PUSH  EBX
        MOV   EBX,EDX
        XOR   EDX,EDX
        DIV   EBX
        MOV   [ECX],EAX
        MOV   EBX,Remainder
        MOV   [EBX],EDX
        POP   EBX
end;
{$ELSE IF Defined(CPUX64)}
asm
        .NOFRAME
        MOV   EAX,ECX
        MOV   ECX,EDX
        XOR   EDX,EDX
        DIV   ECX
        MOV   [R8],EAX
        MOV   [R9],EDX
end;
{$ELSE}
  {$Message Error 'Unrecognised platform.'}
{$ENDIF}
{$IFOPT R+}
  {$DEFINE RANGECHECKSON}
  {$R-}
{$ENDIF}
{$IFOPT Q+}
  {$DEFINE OVERFLOWCHECKSON}
  {$Q-}
{$ENDIF}
// disable range checks and overflow checks so that abs() functions in case Value = low(Value)
function CopyIntegerToAnsiBuffer(const Value: Integer; var Buffer: array of AnsiChar): Integer;
var
  i, j: Integer;
  val, remainder: Cardinal;
  negative: Boolean;
  tmp: array [0..15] of AnsiChar;
begin
  negative := Value<0;
  val := abs(Value);
  Result := 0;
  repeat
    DivMod(val, 10, val, remainder);
    tmp[Result] := AnsiChar(remainder + ord('0'));
    inc(Result);
  until val=0;
  if negative then begin
    tmp[Result] := '-';
    inc(Result);
  end;
  Assert(Result<=Length(Buffer));
  i := 0;
  j := Result-1;
  while i<Result do begin
    Buffer[i] := tmp[j];
    inc(i);
    dec(j);
  end;
end;
function CopyInt64ToAnsiBuffer(const Value: Int64; var Buffer: array of AnsiChar): Integer;
var
  i, j: Integer;
  val, remainder: UInt64;
  negative: Boolean;
  tmp: array [0..23] of AnsiChar;
begin
  negative := Value<0;
  val := abs(Value);
  Result := 0;
  repeat
    DivMod(val, 10, val, remainder);
    tmp[Result] := AnsiChar(remainder + ord('0'));
    inc(Result);
  until val=0;
  if negative then begin
    tmp[Result] := '-';
    inc(Result);
  end;
  Assert(Result<=Length(Buffer));
  i := 0;
  j := Result-1;
  while i<Result do begin
    Buffer[i] := tmp[j];
    inc(i);
    dec(j);
  end;
end;
{$IFDEF RANGECHECKSON}
  {$R+}
  {$UNDEF RANGECHECKSON}
{$ENDIF}
{$IFDEF OVERFLOWCHECKSON}
  {$Q+}
  {$UNDEF OVERFLOWCHECKSON}
{$ENDIF}

我的用例需要一个AnsiChar数组，但修改这些函数以填充WideChar数组当然很简单。

相关内容

最新更新

热门标签：