用C语言实现SHLD/SHRD指令

我正在尝试在不使用内联汇编的情况下高效地实现x86的SHLD和SHRD指令。

/*
 * Naive SHLD emulation: shifts `a` left by `c` bits and fills the vacated
 * low bits with the top `c` bits of `b`.
 *
 * Only valid for 1 <= c <= 31. With c == 0 the right-shift count becomes 32,
 * which is undefined behavior for a 32-bit operand.
 */
uint32_t shld_UB_on_0(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t shifted_dest = a << c;
    uint32_t carried_in = b >> (32 - c);
    return shifted_dest | carried_in;
}

这段代码看似可行，但在c == 0时会触发未定义行为，因为第二个移位的移位量变为32。而实际的SHLD指令在第三个操作数为0时有明确定义：不执行任何操作。(https://www.felixcloutier.com/x86/shld)

/*
 * SHLD emulation that avoids an out-of-range right shift by reducing the
 * right-shift count modulo 32.
 *
 * For 1 <= c <= 31 the count (-c & 31) equals 32 - c, so the result is the
 * expected double shift. For c == 0 the count is 0 as well, so the function
 * returns a | b instead of a.
 */
uint32_t shld_broken_on_0(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t right_count = (0u - c) & 31u;
    return (a << c) | (b >> right_count);
}

这个版本不会触发未定义行为，但当c == 0时，结果是a | b而不是a。

/*
 * SHLD emulation with an explicit guard for a zero shift count.
 *
 * Returns `a` unchanged when c == 0; otherwise shifts `a` left by `c` and
 * fills the low bits with the top `c` bits of `b`. The caller must keep
 * c <= 31, since larger counts still shift by an out-of-range amount.
 */
uint32_t shld_safe(uint32_t a, uint32_t b, uint32_t c) {
    return (c == 0) ? a : ((a << c) | (b >> (32 - c)));
}

行为符合预期，但gcc现在会生成一条je跳转指令。而clang足够聪明，可以将其转换为单条shld指令。

有没有任何方法可以在没有内联汇编的情况下正确有效地实现它？

为什么gcc如此努力地避免生成shld？gcc 11.2 -O3将shld_safe的这一尝试翻译为（Godbolt）：

# gcc output for shld_safe: a branch on a zero count followed by two separate
# shifts and an OR.
# NOTE(review): presumably edi/esi/edx hold a/b/c and eax is the return value
# (System V x86-64 calling convention) - confirm against the build target.
shld_safe:
mov     eax, edi        # eax = edi; result starts as the first argument
test    edx, edx        # set ZF when edx is zero
je      .L1             # zero count: skip the shifts and return eax as is
mov     ecx, 32
sub     ecx, edx        # ecx = 32 - edx
shr     esi, cl         # esi >>= (32 - edx)
mov     ecx, edx        # reload the count into cl for the left shift
sal     eax, cl         # eax <<= edx
or      eax, esi        # merge the two shifted halves
.L1:
ret

而clang是这样做的，

# clang output for shld_safe: no branch, a single shld instruction.
# NOTE(review): presumably edi/esi/edx hold a/b/c and eax is the return value
# (System V x86-64 calling convention) - confirm against the build target.
shld_safe:
mov     ecx, edx        # shift count must be in cl
mov     eax, edi        # eax = edi; destination operand of the double shift
shld    eax, esi, cl    # shift eax left by cl, filling low bits from the top of esi
ret

就我使用gcc 9.3（x86-64）进行的测试而言，它会将以下代码转换为shldq和shrdq。

/*
 * 64-bit double-precision left shift.
 *
 * Treats high:low as one 128-bit value, shifts it left by (count mod 64) and
 * returns the upper 64 bits, i.e. `high` shifted left with the vacated low
 * bits filled from the top of `low`. A count of 0 (mod 64) returns `high`.
 * Relies on the compiler-provided unsigned __int128 type.
 */
uint64_t shldq_x64(uint64_t low, uint64_t high, uint64_t count) {
    unsigned __int128 wide = (unsigned __int128)high << 64;
    wide |= (unsigned __int128)low;
    wide <<= (count & 63);
    return (uint64_t)(wide >> 64);
}
/*
 * 64-bit double-precision right shift.
 *
 * Treats high:low as one 128-bit value, shifts it right by (count mod 64) and
 * returns the lower 64 bits, i.e. `low` shifted right with the vacated high
 * bits filled from the bottom of `high`. A count of 0 (mod 64) returns `low`.
 * Relies on the compiler-provided unsigned __int128 type.
 */
uint64_t shrdq_x64(uint64_t low, uint64_t high, uint64_t count) {
    unsigned __int128 wide = (unsigned __int128)high << 64;
    wide |= (unsigned __int128)low;
    wide >>= (count & 63);
    return (uint64_t)wide;
}

此外，gcc -m32 -O3会将以下代码翻译为shld和shrd。（不过，我还没有用gcc（i386）进行测试。）

/*
 * 32-bit double-precision left shift.
 *
 * Treats high:low as one 64-bit value, shifts it left by (count mod 32) and
 * returns the upper 32 bits, i.e. `high` shifted left with the vacated low
 * bits filled from the top of `low`. A count of 0 (mod 32) returns `high`.
 */
uint32_t shld_x86(uint32_t low, uint32_t high, uint32_t count) {
    uint64_t wide = (uint64_t)high << 32;
    wide |= (uint64_t)low;
    wide <<= (count & 31);
    return (uint32_t)(wide >> 32);
}
/*
 * 32-bit double-precision right shift.
 *
 * Treats high:low as one 64-bit value, shifts it right by (count mod 32) and
 * returns the lower 32 bits, i.e. `low` shifted right with the vacated high
 * bits filled from the bottom of `high`. A count of 0 (mod 32) returns `low`.
 */
uint32_t shrd_x86(uint32_t low, uint32_t high, uint32_t count) {
    uint64_t wide = (uint64_t)high << 32;
    wide |= (uint64_t)low;
    wide >>= (count & 31);
    return (uint32_t)wide;
}

（我只是阅读了gcc的代码并编写了上述函数，也就是说，我不确定它们是否就是您期望的函数。）

相关内容

最新更新

热门标签：