C - Unsigned 64x64->128-bit integer multiplication on 32-bit platforms



In the context of an exploratory activity, I have started to look at integer and fixed-point arithmetic building blocks for 32-bit platforms. My primary target is ARM32 (specifically armv7), with a side look at RISC-V32, which I expect to grow in importance in the embedded space. The first example building block I chose to examine is unsigned 64x64->128-bit integer multiplication. Other questions on this site about this building block do not provide detailed coverage of 32-bit platforms.

Over the past thirty years, I have implemented this and other arithmetic building blocks many times for various architectures, but always in assembly language. At this point, however, my hope and desire is that these building blocks can be programmed directly in ISO-C, without the use of intrinsics. Ideally, a single version of the C code would generate good machine code across architectures. I know that manipulating HLL code to steer machine code generation is generally brittle, but I am hoping that processor architectures and toolchains have matured enough to make this feasible.

Some of the approaches used in assembly-language implementations do not port well to C. In the example code below, I selected six variants that seemed suitable for an HLL implementation. Beyond the generation of partial products, which is common to all variants, there are two basic approaches: (1) sum the partial products using 64-bit arithmetic, letting the compiler take care of carry propagation between the 32-bit halves. In this case there are multiple choices of the order in which the partial products are summed. (2) Perform the summation in 32-bit arithmetic, simulating the carry flag directly. In this case we have the choice of generating the carry after the addition (a = a + b; carry = a < b;) or before the addition (carry = ~a < b; a = a + b;). Variants 1 through 4 below fall into the former category, variants 5 and 6 into the latter.
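
To make the arithmetic shared by all variants explicit, here is a minimal sketch (the helper names addc_after and addc_before are mine, purely for illustration) of the common partial-product decomposition and of the two carry idioms mentioned above:

#include <stdint.h>

/* With a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo, all variants use
   a * b = p3 * 2^64 + (p1 + p2) * 2^32 + p0, where
   p0 = a_lo * b_lo, p1 = a_lo * b_hi, p2 = a_hi * b_lo, p3 = a_hi * b_hi.
*/

/* idiom of variant 5: derive the simulated carry after the addition */
static uint32_t addc_after (uint32_t a, uint32_t b, uint32_t *carry)
{
    a = a + b;
    *carry = a < b;   /* the sum wrapped around iff it is smaller than an addend */
    return a;
}

/* idiom of variant 6: derive the simulated carry before the addition */
static uint32_t addc_before (uint32_t a, uint32_t b, uint32_t *carry)
{
    *carry = ~a < b;  /* a + b >= 2^32  <=>  b > (2^32 - 1) - a = ~a */
    return a + b;
}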

On Compiler Explorer, I focused on the toolchains gcc 12.2 and clang 15.0 for the platforms of interest. I compiled with -O3. The general finding is that, on average, clang generates more efficient code than gcc, and that the differences between the variants (in number of instructions and registers used) are more pronounced with clang. While this may be understandable for RISC-V as the newer architecture, it surprised me in the case of armv7, which has been around for well over a dozen years.

Three cases struck me as particularly noteworthy. While I have worked with compiler engineers before and have a reasonable understanding of basic code transformations, phase-ordering issues, and the like, the only technique I am aware of that applies to this code is idiom recognition, and I do not see how that by itself could explain the observations. The first case is variant 3, for which clang 15.0 produces exceedingly tight code of just 10 instructions that I do not think can be improved upon:

umul64wide:
push    {r4, lr}
umull   r12, r4, r2, r0
umull   lr, r0, r3, r0
umaal   lr, r4, r2, r1
umaal   r0, r4, r3, r1
ldr     r1, [sp, #8]
strd    r0, r4, [r1]
mov     r0, r12
mov     r1, lr
pop     {r4, pc}
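
For reference, umaal (Unsigned Multiply Accumulate Accumulate Long) computes RdHi:RdLo = Rn * Rm + RdHi + RdLo; the 64-bit result cannot overflow, since (2^32-1)^2 + 2*(2^32-1) = 2^64-1. A C model of its semantics (the function name is mine) would be:

#include <stdint.h>

/* behavioral model of ARM UMAAL: returns rn * rm + rdhi + rdlo,
   which always fits in 64 bits */
static uint64_t umaal_model (uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm)
{
    return (uint64_t)rn * rm + rdhi + rdlo;
}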

By contrast, gcc generates twice as many instructions and requires twice as many registers. I presume it does not know how to make use of the multiply-accumulate instruction umaal here, but is that the whole story? The reverse case, albeit not as pronounced, occurs with variant 6, where gcc 12.2 produces this 18-instruction sequence with low register usage:

umul64wide:
mov     ip, r0
push    {r4, r5, lr}
mov     lr, r1
umull   r0, r1, r0, r2
ldr     r4, [sp, #12]
umull   r5, ip, r3, ip
adds    r1, r1, r5
umull   r2, r5, lr, r2
adc     ip, ip, #0
umull   lr, r3, lr, r3
adds    r1, r1, r2
adc     r2, ip, #0
adds    r2, r2, r5
adc     r3, r3, #0
adds    r2, r2, lr
adc     r3, r3, #0
strd    r2, r3, [r4]
pop     {r4, r5, pc}

The generated code turns the simulated carry propagation into real carry propagation quite nicely. clang 15.0 uses nine more instructions and five more registers, and I cannot really make out what it is trying to do without spending more time on analysis. The third observation concerns the differences in the machine code produced for variant 5 versus variant 6, in particular with clang. They use the same basic algorithm, with one variant computing the simulated carry before the additions and the other after. I did eventually find one variant, namely variant 4, that seems to be efficient across both toolchains and both architectures. However, before I proceed to other building blocks and face similar struggles, I would like to ask:

(1) Are there coding idioms or algorithms I have not considered in the code below that might lead to superior results? (2) Are there specific optimization switches not included in -O3, e.g. a hypothetical -ffrobnicate (see here), that would help the compiler generate efficient code more consistently for these kinds of bit-manipulation scenarios? Explanations of which compiler mechanisms are likely responsible for the significant differences observed in code generation, and how one might influence or work around them, would also be helpful.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define VARIANT         (3)
#define USE_X64_ASM_REF (0)
/* Multiply two unsigned 64-bit operands a and b. Returns least significant 64 
bits of product as return value, most significant 64 bits of product via h.
*/
uint64_t umul64wide (uint64_t a, uint64_t b, uint64_t *h)
{
uint32_t a_lo = (uint32_t)a;
uint32_t a_hi = a >> 32;
uint32_t b_lo = (uint32_t)b;
uint32_t b_hi = b >> 32;
uint64_t p0 = (uint64_t)a_lo * b_lo;
uint64_t p1 = (uint64_t)a_lo * b_hi;
uint64_t p2 = (uint64_t)a_hi * b_lo;
uint64_t p3 = (uint64_t)a_hi * b_hi;
#if VARIANT == 1    /* 64-bit sums; carry out of the middle column recomputed separately */
uint32_t c = (uint32_t)(((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32);
*h = p3 + (p1 >> 32) + (p2 >> 32) + c;
return p0 + ((p1 + p2) << 32);
#elif VARIANT == 2  /* 64-bit sums; p1, then p2, folded into the middle column */
uint64_t s = (p0 >> 32) + p1;
uint64_t t = (uint32_t)s + p2;
*h = (s >> 32) + (t >> 32) + p3;
return (uint32_t)p0 + (t << 32);
#elif VARIANT == 3  /* 64-bit sums; high half computed in a single expression */
*h = (p1 >> 32) + (((p0 >> 32) + (uint32_t)p1 + p2) >> 32) + p3;
return p0 + ((p1 + p2) << 32);
#elif VARIANT == 4  /* 64-bit sums; p1 and the low half of p2 accumulated into one 64-bit term */
uint64_t t = (p0 >> 32) + p1 + (uint32_t)p2;
*h = (p2 >> 32) + (t >> 32) + p3;
return (uint32_t)p0 + (t << 32);
#elif VARIANT == 5  /* 32-bit sums; simulated carry derived after each addition */
uint32_t r0, r1, r2, r3, r4, r5, r6;
r0 = (uint32_t)p0;
r1 = p0 >> 32;    
r5 = (uint32_t)p1;
r2 = p1 >> 32;
r1 = r1 + r5;
r6 = r1 < r5;
r2 = r2 + r6;
r6 = (uint32_t)p2;
r5 = p2 >> 32;
r1 = r1 + r6;
r6 = r1 < r6;
r2 = r2 + r6;
r4 = (uint32_t)p3;
r3 = p3 >> 32;
r2 = r2 + r5;
r6 = r2 < r5;
r3 = r3 + r6;
r2 = r2 + r4;
r6 = r2 < r4;
r3 = r3 + r6;
*h =   ((uint64_t)r3 << 32) | r2;
return ((uint64_t)r1 << 32) | r0;
#elif VARIANT == 6  /* 32-bit sums; simulated carry derived before each addition */
uint32_t r0, r1, r2, r3, r4, r5, r6;
r0 = (uint32_t)p0;
r1 = p0 >> 32;    
r5 = (uint32_t)p1;
r2 = p1 >> 32;
r4 = ~r1;
r4 = r4 < r5;
r1 = r1 + r5;
r2 = r2 + r4;
r6 = (uint32_t)p2;
r5 = p2 >> 32;
r4 = ~r1;
r4 = r4 < r6;
r1 = r1 + r6;
r2 = r2 + r4;
r4 = (uint32_t)p3;
r3 = p3 >> 32;
r6 = ~r2;
r6 = r6 < r5;
r2 = r2 + r5;
r3 = r3 + r6;
r6 = ~r2;
r6 = r6 < r4;
r2 = r2 + r4;
r3 = r3 + r6;
*h =   ((uint64_t)r3 << 32) | r2;
return ((uint64_t)r1 << 32) | r0;
#else
#error unsupported VARIANT
#endif
}
#if defined(__SIZEOF_INT128__)
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
unsigned __int128 prod = ((unsigned __int128)a) * b;
*h = (uint64_t)(prod >> 64);
return (uint64_t)prod;
}
#elif defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
*h = __umulh (a, b);
return a * b;
}
#elif USE_X64_ASM_REF
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
uint64_t res_l, res_h;
__asm__ (
"movq  %2, %%rax;nt"          // rax = a
"mulq  %3;nt"                 // rdx:rax = a * b
"movq  %%rdx, %0;nt"          // res_h = rdx
"movq  %%rax, %1;nt"          // res_l = rax
: "=rm" (res_h), "=rm"(res_l)
: "rm"(a), "rm"(b)
: "%rax", "%rdx");
*h = res_h;
return res_l;
}
#else // generic (and slow) reference implementation
#define ADDCcc(a,b,cy,t0,t1) \
    (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
    (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
    (t0=(b)+cy, t1=(a), t0+t1)
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
uint32_t cy, t0, t1;
uint32_t a_lo = (uint32_t)a;
uint32_t a_hi = a >> 32;
uint32_t b_lo = (uint32_t)b;
uint32_t b_hi = b >> 32;
uint64_t p0 = (uint64_t)a_lo * b_lo;
uint64_t p1 = (uint64_t)a_lo * b_hi;
uint64_t p2 = (uint64_t)a_hi * b_lo;
uint64_t p3 = (uint64_t)a_hi * b_hi;
uint32_t p0_lo = (uint32_t)p0;
uint32_t p0_hi = p0 >> 32;
uint32_t p1_lo = (uint32_t)p1;
uint32_t p1_hi = p1 >> 32;
uint32_t p2_lo = (uint32_t)p2;
uint32_t p2_hi = p2 >> 32;
uint32_t p3_lo = (uint32_t)p3;
uint32_t p3_hi = p3 >> 32;
uint32_t r0 = p0_lo;
uint32_t r1 = ADDcc  (p0_hi, p1_lo, cy, t0, t1);
uint32_t r2 = ADDCcc (p1_hi, p2_hi, cy, t0, t1);
uint32_t r3 = ADDC   (p3_hi,     0, cy, t0, t1);
r1 = ADDcc  (r1, p2_lo, cy, t0, t1);
r2 = ADDCcc (r2, p3_lo, cy, t0, t1);
r3 = ADDC   (r3,     0, cy, t0, t1);
*h =   ((uint64_t)r3 << 32) + r2;
return ((uint64_t)r1 << 32) + r0;
}
#endif 
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone.    The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
{
uint64_t a, b, res_hi, res_lo, ref_hi, ref_lo, count = 0;
printf ("Smoke test of umul64wide variant %dn", VARIANT);
do {
a = KISS64;
b = KISS64;
ref_lo = umul64wide_ref (a, b, &ref_hi);
res_lo = umul64wide (a, b, &res_hi);
if ((res_lo ^ ref_lo) | (res_hi ^ ref_hi)) {
printf ("!!!! error: a=%016llx b=%016llx res=%016llx_%016llx ref=%016llx_%016llxn",
a, b, res_hi, res_lo, ref_hi, ref_lo);
return EXIT_FAILURE;
}
if (!(count & 0xfffffff)) printf ("\r%llu", count);
count++;
} while (count);
return EXIT_SUCCESS;
}

I avoid the ((x += y) < y) overflow test, since not every ISA handles condition flags efficiently, and using the result of a flag register may inhibit re-ordering; x86[-64] is the obvious example, though the later BMI(2) instructions may help mitigate this. I have also added a 32 x 32 -> 64 bit C implementation for comparison - but I would expect any modern ISA to provide at least a "high word" multiply like ARM's umulh.

/******************************************************************************/
/* stackoverflow.com/questions/74713642 */
#include <inttypes.h>
#include <stdio.h>
/* umul_32_32 : 32 x 32 => 64 */
/* force inline (non-portable), or implement it as macro, e.g.,
* #define umul_32_32(rh, rl, x, y) do { ... } while (0) */
#if (1)
static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
return (((uint64_t) x) * y);
}
#else
/* if no widening multiply is available, the compiler probably
* generates something at least as efficient as the following -
* or (worst case) it calls a builtin function. */
static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
uint32_t m0, m1, m2, m3; /* (partial products) */
uint32_t x0, x1, y0, y1;
x0 = x & UINT16_MAX, x1 = x >> (16);
y0 = y & UINT16_MAX, y1 = y >> (16);
m0 = x0 * y0, m1 = x1 * y0;
m2 = x0 * y1, m3 = x1 * y1;
m1 += m0 >> (16);
m3 += m2 >> (16);
m1 += m2 & UINT16_MAX;
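/* m1 starts as a 16x16 product <= (2^16-1)^2; adding two values of at most
 * 2^16-1 gives at most 2^32-1, so the 32-bit sums above cannot overflow. */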
uint32_t rh = m3 + (m1 >> (16));
uint32_t rl = m1 << (16) | (m0 & UINT16_MAX);
return (((uint64_t) rh) << 32 | rl);
/* 32 x 32 => 64 : no branching or carry overflow tests. */
}
#endif
/* ensure the function is called to inspect code gen / assembly,
* otherwise gcc and clang evaluate this at compile time. */
__attribute__((noinline)) void umul_64_64 (
uint64_t *rh, uint64_t *rl, uint64_t x, uint64_t y)
{
uint64_t m0, m1, m2, m3; /* (partial products) */
uint32_t x0, x1, y0, y1;
x0 = (uint32_t) (x), x1 = (uint32_t) (x >> (32));
y0 = (uint32_t) (y), y1 = (uint32_t) (y >> (32));
m0 = umul_32_32(x0, y0), m1 = umul_32_32(x1, y0);
m2 = umul_32_32(x0, y1), m3 = umul_32_32(x1, y1);
m1 += m0 >> (32);
m3 += m2 >> (32);
m1 += m2 & UINT32_MAX;
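/* m1 starts as a 32x32 product <= (2^32-1)^2; adding two values of at most
 * 2^32-1 gives at most 2^64-1, so the 64-bit sums above cannot overflow. */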
*rh = m3 + (m1 >> (32));
*rl = m1 << (32) | (m0 & UINT32_MAX);
/* 64 x 64 => 128 : no branching or carry overflow tests. */
}
#if (0)
int main (void)
{
uint64_t x = UINT64_MAX, y = UINT64_MAX, rh, rl;
umul_64_64(& rh, & rl, x, y);
fprintf(stdout, "0x%016" PRIX64 ":0x%016" PRIX64 "\n", rh, rl);
return (0);
}
#endif
/******************************************************************************/

For ARM-7, I get pretty much the same results as your "variant 3" code, which is no surprise, since it is the same essential idea. I tried various flags on gcc-12 and gcc-trunk, but could not improve on it.

I would hazard a guess that with Apple's investment in AArch64 silicon, clang has benefited from more aggressive optimization effort and funding, and that 32-bit ARM-7 benefits from this as well. But that is pure speculation. It is a rather glaring gap for such a major platform, though.

Update: