c语言 - 将十六进制数字映射到连续整数:GCC 的开关比我手写的 SSE2 内部函数快 1.5 倍 cmpeq / movemask / bsf?



我有一个函数,它获取一个字符并检查它,然后返回另一个字符(取决于收到的字符)。

我使用 switch 语句来检查提供的字符并返回我们想要的内容,但我需要更高的速度,所以我也使用了 SSE2。

我的 SSE2 函数比 switch 函数慢 1.5 倍。为什么?我的 SSE2 函数慢在哪里?gcc -O3 对 switch 做了什么,才能让它这么快?

/*
 * Map an ASCII hex digit to a byte in 0x40..0x4f.
 *
 * '0'..'9' -> 0x40..0x49 and 'a'..'f' -> 0x4a..0x4f; any other
 * character (including uppercase 'A'..'F') maps to 0x00.
 * Equivalent to the original 16-case switch: the two case ranges are
 * contiguous in ASCII, so simple range checks plus an offset suffice.
 */
char
switch_func(char c) {
    if (c >= '0' && c <= '9') {
        return (char)(c + 0x10);   /* '0' (0x30) -> 0x40, ..., '9' -> 0x49 */
    }
    if (c >= 'a' && c <= 'f') {
        return (char)(c - 0x17);   /* 'a' (0x61) -> 0x4a, ..., 'f' -> 0x4f */
    }
    return 0x00;                   /* not a lowercase hex digit */
}

和 SSE2 函数 ->

/*
 * Map an ASCII hex digit ('0'-'9', 'a'-'f') to 0x40..0x4f via SSE2,
 * returning 0x00 for any other character.
 *
 * Fix: the original applied __builtin_ctz twice -- it stored
 * x = __builtin_ctz(x) and then indexed list[__builtin_ctz(x)],
 * which returns the wrong entry for most inputs and is undefined
 * behavior when the first ctz yields 0 (c == '0'), since ctz(0) is UB.
 * The lane index from the first ctz must be used directly.
 */
char
SSE2_func(char c) {
    /* Byte lane i of vec0 holds the i-th hex digit: lane 0 = '0' ... lane 15 = 'f'
       (_mm_set_epi8 takes arguments from the highest lane down to lane 0). */
    const __m128i vec0 = _mm_set_epi8('f','e','d','c','b','a','9',
                                      '8','7','6','5','4','3','2','1','0');
    /* Result table, parallel to vec0's lanes: list[i] is the output for digit i. */
    static const char list[16] = {
        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
        0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f
    };

    /* Compare every lane against c; matching lanes become 0xff. */
    const __m128i eq = _mm_cmpeq_epi8(vec0, _mm_set1_epi8(c));
    /* One mask bit per lane; at most one bit can be set here. */
    const int x = _mm_movemask_epi8(eq);
    if (x != 0) {
        /* ctz gives the matching lane index (0..15); no range check needed
           because a 16-bit nonzero mask always yields a ctz below 16. */
        return list[__builtin_ctz(x)];
    }
    return 0x00;   /* c is not a lowercase hex digit */
}

GCC 编译器选项:(-O3 -msse2)

大多数编译器会将开关转换为查找表或跳转表,就好像它类似于以下代码一样:

/*
 * Table-driven equivalent of the switch: a 256-entry lookup keyed by the
 * character's unsigned value. Entries not listed are zero-initialized,
 * so every non-hex character maps to 0x00.
 */
char lut_func(char c){
    static const char table[256] = {
        ['0'] = 0x40, ['1'] = 0x41, ['2'] = 0x42, ['3'] = 0x43,
        ['4'] = 0x44, ['5'] = 0x45, ['6'] = 0x46, ['7'] = 0x47,
        ['8'] = 0x48, ['9'] = 0x49,
        ['a'] = 0x4a, ['b'] = 0x4b, ['c'] = 0x4c,
        ['d'] = 0x4d, ['e'] = 0x4e, ['f'] = 0x4f,
        /* all remaining entries default to 0 */
    };
    /* Cast through unsigned char so negative char values index 128..255
       instead of producing a negative subscript. */
    unsigned char idx = (unsigned char)c;
    return table[idx];
}

唯一的问题:

  • 无法矢量化
  • 常用的数据(0-9、a-f)跨越了 2 条 64 字节的数据缓存行

您可以通过正确对齐和偏移数据来避免这次缓存行未命中(如果您对代码进行性能分析,编译器可能能够自动执行此操作),如下所示:

/*
 * Cache-friendly variant of the lookup table: the backing array is
 * 64-byte aligned and the hot entries ('0'..'9','a'..'f') are shifted
 * up by 16 so they land inside a single cache line; the lookup then
 * indexes through a pointer offset by the same 16 bytes.
 *
 * Fix: the original declared the offset base as `char lut = lut_data+16;`,
 * assigning a pointer to a plain char -- a constraint violation (it does
 * not compile as C). It must be a pointer: `const char *lut`.
 */
char lut_func(char c){
    static const char __attribute__((aligned(64))) lut_data[256 + 16] = {
        ['0'+16] = 0x40, ['1'+16] = 0x41, ['2'+16] = 0x42, ['3'+16] = 0x43,
        ['4'+16] = 0x44, ['5'+16] = 0x45, ['6'+16] = 0x46, ['7'+16] = 0x47,
        ['8'+16] = 0x48, ['9'+16] = 0x49, ['a'+16] = 0x4a, ['b'+16] = 0x4b,
        ['c'+16] = 0x4c, ['d'+16] = 0x4d, ['e'+16] = 0x4e, ['f'+16] = 0x4f,
        /* everything else is set to 0 automatically */
    };
    /* Bias the base pointer so lut[(unsigned char)c] hits the shifted slots. */
    const char *lut = lut_data + 16;
    return lut[(unsigned char)c];
}

很难说这是否会有多大帮助,因为数据构成和基准都没有包括在内。

不幸的是,手写的 SSE2 代码(虽然很巧妙)包含非 SSE2 的标量代码(__builtin_ctz、if 判断和字符数组访问),这会拖慢速度,并且难以被自动矢量化,尤其是在仅限于 SSE2 的情况下。当数据已经在缓存中"热"着时,这比单次查表访问效率低。如果该函数调用得不频繁,SSE2 版本也许仍然可用,但如果是那样,就没有必要对它进行优化了。

如果可以按顺序访问数据,则可以使用矢量扩展来获取如下所示的 SIMD 代码:

//this vector extension syntax requires gcc or clang versions 5+
typedef __INT8_TYPE__ i8x16 __attribute__ ((__vector_size__ (16), aligned(16), __may_alias__));
/*
 * Branch-free SIMD mapping of 16 characters at once.
 * Per lane: '0'..'9' -> c + 16 (0x40..0x49), 'a'..'f' -> c - 23
 * (0x4a..0x4f), anything else -> 0. Comparison results are lane masks
 * of all-ones (-1) or all-zeros, so AND selects and the arithmetic
 * applies only where the corresponding range test matched.
 */
i8x16 vec_func(i8x16 c){
    i8x16 digit = (c >= '0') & (c <= '9');   /* mask: lane is a decimal digit */
    i8x16 hexlc = (c >= 'a') & (c <= 'f');   /* mask: lane is a lowercase hex letter */
    i8x16 keep  = digit | hexlc;             /* mask: lane maps to a nonzero value */
    /* Zero non-hex lanes, then add +16 to digits and -23 to letters. */
    return (c & keep) + (digit & 16) - (hexlc & 23);
}

在具有 SIMD 指令的架构上(x86_64、arm+neon、ppc+altivec 等)编译后,约为 20 条指令,访问大约 80 字节的数据即可一次处理 16 个连续字符(使用 AVX2,只需极少的修改就能一次处理 32 个)。

例如,使用泛型x86_64进行编译会产生:

vec_func:                                   # @lu16
movdqa  xmm1, xmm0
pcmpgtb xmm1, xmmword ptr [rip + .LCPI0_0]
movdqa  xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
pcmpgtb xmm2, xmm0
movdqa  xmm3, xmm0
pcmpgtb xmm3, xmmword ptr [rip + .LCPI0_2]
pand    xmm2, xmm1
movdqa  xmm1, xmmword ptr [rip + .LCPI0_3] # xmm1 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
pcmpgtb xmm1, xmm0
pand    xmm1, xmm3
movdqa  xmm3, xmm2
por     xmm3, xmm1
pand    xmm3, xmm0
pand    xmm2, xmmword ptr [rip + .LCPI0_4]
pand    xmm1, xmmword ptr [rip + .LCPI0_5]
por     xmm1, xmm2
paddb   xmm1, xmm3
movdqa  xmm0, xmm1
ret

或启用 AVX2

vec_func:
vpcmpgtb        xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
vpcmpgtb        xmm2, xmm2, xmm0
vpcmpgtb        xmm3, xmm0, xmmword ptr [rip + .LCPI0_2]
vpand   xmm1, xmm1, xmm2
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_3] # xmm2 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
vpcmpgtb        xmm2, xmm2, xmm0
vpand   xmm2, xmm3, xmm2
vpor    xmm3, xmm1, xmm2
vpand   xmm0, xmm3, xmm0
vpand   xmm1, xmm1, xmmword ptr [rip + .LCPI0_4]
vpand   xmm2, xmm2, xmmword ptr [rip + .LCPI0_5]
vpor    xmm1, xmm2, xmm1
vpaddb  xmm0, xmm1, xmm0
ret

和 aarch64

vec_func:
movi    v2.16b, 0x61
movi    v4.16b, 0x66
movi    v1.16b, 0x30
movi    v5.16b, 0x39
cmge    v3.16b, v0.16b, v2.16b
cmge    v2.16b, v4.16b, v0.16b
cmge    v1.16b, v0.16b, v1.16b
cmge    v5.16b, v5.16b, v0.16b
movi    v4.16b, 0x10
and     v2.16b, v3.16b, v2.16b
and     v1.16b, v1.16b, v5.16b
movi    v5.16b, 0x17
and     v3.16b, v1.16b, v4.16b
orr     v1.16b, v1.16b, v2.16b
and     v2.16b, v2.16b, v5.16b
and     v1.16b, v1.16b, v0.16b
add     v1.16b, v1.16b, v3.16b
sub     v0.16b, v1.16b, v2.16b
ret

或电源9

vec_func:
xxspltib 35, 47
xxspltib 36, 58
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 0, 35, 36
xxspltib 35, 96
xxspltib 36, 103
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 1, 35, 36
xxlor 2, 0, 1
xxlxor 3, 3, 3
xxsel 34, 3, 34, 2
xxspltib 2, 16
xxsel 35, 3, 2, 0
xxspltib 0, 233
xxsel 36, 3, 0, 1
xxlor 35, 36, 35
vaddubm 2, 3, 2
blr

编译器不擅长优化内部函数。
这绝对是过早优化的情况。
为什么这个功能太慢了?
这些优化级别的任何主流编译器都会将此 switch 语句转换为跳转表,并在编译时尽可能解析答案。
对于如此小的操作,您应该坚持使用 switch 语句以获得可读性、可移植性和性能。

最新更新