我正在英特尔至强®Phi®上实现超快速popcount,因为它是各种生物信息学软件的性能热点。
我已经实现了五段代码,
#if defined(__MIC__)
#include <zmmintrin.h>
__attribute__((align(64))) static const uint32_t POPCOUNT_4bit[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
__attribute__((align(64))) static const uint32_t MASK_4bit[16] = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
inline uint64_t vpu_popcount1(uint64_t* buf, size_t n) {
register size_t result = 0;
size_t i;
register const __m512i popcnt = _mm512_load_epi32((void*)POPCOUNT_4bit);
register const __m512i mask = _mm512_load_epi32((void*)MASK_4bit);
register __m512i total;
register __m512i shuf;
#pragma unroll(8)
for (i = 0; i < n; i+=8) {
shuf = _mm512_load_epi32(&buf[i]);
_mm_prefetch((const char *)&buf[i+256], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
total = _mm512_setzero_epi32();
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(shuf, mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 4), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 8), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 12), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 16), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 20), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 24), mask), popcnt), total);
total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 28), mask), popcnt), total);
/* Reduce add, which is analogous to SSSE3's PSADBW instruction,
is not implementated as a single instruction in VPUv1, thus
emulated by multiple instructions*/
result += _mm512_reduce_add_epi32(total);
}
return result;
}
__attribute__((align(64))) static const unsigned magic[] = {
0x55555555, 0x55555555, 0x55555555, 0x55555555,
0x55555555, 0x55555555, 0x55555555, 0x55555555,
0x55555555, 0x55555555, 0x55555555, 0x55555555,
0x55555555, 0x55555555, 0x55555555, 0x55555555,
0x33333333, 0x33333333, 0x33333333, 0x33333333,
0x33333333, 0x33333333, 0x33333333, 0x33333333,
0x33333333, 0x33333333, 0x33333333, 0x33333333,
0x33333333, 0x33333333, 0x33333333, 0x33333333,
0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
};
inline uint64_t vpu_popcount2(uint64_t* buf, size_t n) {
register size_t result = 0;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
register __m512i total;
register __m512i shuf;
#pragma unroll(8)
for (i = 0; i < n; i+=8) {
shuf = _mm512_load_epi32(&buf[i]);
_mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));
/* Reduce add, which is analogous to SSSE3's PSADBW instruction,
is not implementated as a single instruction in VPUv1, thus
emulated by multiple instructions*/
result += _mm512_reduce_add_epi32(total);
}
return result;
}
inline uint64_t vpu_popcount3(uint64_t* buf, size_t n) {
register size_t result = 0;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
register __m512i total;
register __m512i shuf;
#pragma unroll(4)
for (i = 0; i < n; i+=16) {
shuf = _mm512_load_epi32(&buf[i]);
result += _mm_countbits_64(buf[i+8]);
_mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+576], _MM_HINT_T1); // vprefetch1
result += _mm_countbits_64(buf[i+9]);
_mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+128], _MM_HINT_T0); // vprefetch0
total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
result += _mm_countbits_64(buf[i+10]);
total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
result += _mm_countbits_64(buf[i+11]);
total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
result += _mm_countbits_64(buf[i+12]);
total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
result += _mm_countbits_64(buf[i+13]);
total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));
result += _mm_countbits_64(buf[i+14]);
/* Reduce add, which is analogous to SSSE3's PSADBW instruction,
is not implementated as a single instruction in VPUv1, thus
emulated by multiple instructions*/
result += _mm512_reduce_add_epi32(total);
result += _mm_countbits_64(buf[i+15]);
}
return result;
}
/* Using VPU or SSE's machine intrinsic, CPUs not supporting SIMD
* will use compiler's implementation, the speed of which depends */
static inline size_t scalar_popcountu(unsigned *buf, size_t n) {
register size_t cnt = 0;
size_t i;
#pragma vector always
#pragma unroll(8)
for (i = 0; i < n; i++) {
cnt += _mm_countbits_32(buf[i]);
_mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
}
return cnt;
}
static inline size_t scalar_popcountlu(uint64_t *buf, size_t n) {
register size_t cnt = 0;
size_t i;
#pragma vector always
#pragma unroll(8)
for (i = 0; i < n; i++) {
cnt += _mm_countbits_64(buf[i]);
_mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
}
return cnt;
}
#endif
支持OpenMP的代码可以从https://www.dropbox.com/sh/b3sfqps19wa2oi4/iFQ9wQ1NTg
下载。代码是使用Intel C/c++编译器x13编译的,使用命令:
icc -debug inline-debug-info -O3 -mmic -fno-alias -ansi-alias -opt-streaming-stores always -ipo popcnt-mmic.cpp -o popcnt-mmic -vec-report=2 -openmp
代码本机运行在协同处理器(61核)上,具有"122个线程",并且使用exports将线程亲和性设置为"平衡":
export OMP_NUM_THREADS=122;export KMP_AFFINITY=balanced
我使用Xeon Phi SE10p, B1步进,CentOS6.4在28mb的垃圾(由rand()填充)上进行测试,迭代10000次,性能如下:
Buffer allocated at: 0x7f456b000000
OpenMP scalar_popcountu 4310169 us; cnt = 28439328
OpenMP scalar_popcountlu 1421139 us; cnt = 28439328
OpenMP vpu_popcount 1489992 us; cnt = 28439328
OpenMP vpu_popcount2 1109530 us; cnt = 28439328
OpenMP vpu_popcount3 951122 us; cnt = 28439328
"scalar_popcountu"one_answers"scalar_popcountlu"分别使用了"_mm_countbits_32"one_answers"_mm_countbits_64"的内在属性,它们利用了标量"popcnt"指令。设置"#pragma vector always"要求编译器一次将加载和求和矢量化为16个无符号整型或8个无符号长型,尽管popcount本身仍然是一个标量指令。
vpu_popcount1的实现类似于SSSE3的popcount实现http://wm.ite.pl/articles/sse-popcount.html。然而,1)Xeon Phi不支持整数上的打包字节操作(最小值是双字,即32位),2)它不实现"绝对差的打包和"指令(如SSSE3中的_mm_sad_epu8),因此减少添加是由四组"vpermf32x4","vpaddd"one_answers"movslq"的组合执行的。因此,该实现比原来的SSSE3版本生成了更多的指令。
vpu_popcount2的实现类似于SSE2的popcount实现(可以参考"Hacker’s Delight")。与vpu_popcount1相比,该实现生成的指令更少,速度大约快30%。然而,繁琐的"减加"仍然无法避免。
vpu_popcount3的实现非常特定于Xeon Phi。混合矢量和标量操作,它比vpu_popcount2快15%左右(在我的实现中,矢量操作中标量操作的分散是空闲的,可以根据编译器生成的汇编代码重新排列标量操作,但就我而言,预期的改进是有限的)。改进是基于以下观察:1)Xeon Phi是有序调度,2)每个时钟周期可以发出两个标量指令或"1向量+ 1标量"指令。我把展开次数从8次减少到4次,以避免寄存器文件饱和。
每个函数从内存预取到L2提前8个循环,从L2预取到L1提前1个循环,使得L1命中率从0.38提高到0.994。
展开确实使性能提高了大约15%。这是反直觉的,因为Xeon Phi是顺序调度。但是unroll使icc编译器可以尽可能多地调度编译时间。
我们有更多的技术来提高性能吗?
Brian Nickerson的两段更快的代码,
OpenMP vpu_popcount2 1110737 us; cnt = 28439328
OpenMP vpu_popcount3 951459 us; cnt = 28439328
OpenMP vpu_popcount3_r 815126 us; cnt = 28439328
OpenMP vpu_popcount5 746852 us; cnt = 28439328
vpu_popcount3_revised:
inline uint64_t vpu_popcount3_revised(uint64_t* buf, size_t n) {
_mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
register size_t result;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
register __m512i total0;
register __m512i total1;
register __m512i shuf0;
register __m512i shuf1;
register __m512i result0;
register __m512i result1;
result0 = _mm512_setzero_epi32();
result1 = _mm512_setzero_epi32();
for (i = 0; i < n; i+=16) {
shuf0 = _mm512_load_epi32(&buf[i ]);
shuf1 = _mm512_load_epi32(&buf[i+8]);
_mm_prefetch((const char *)&buf[i+128], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+136], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+16], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+24], _MM_HINT_T0); // vprefetch0
total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));
total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2)));
total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));
total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
total0 = _mm512_and_epi32(B3, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)));
total1 = _mm512_and_epi32(B3, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8)));
total0 = _mm512_and_epi32(B4, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));
total1 = _mm512_and_epi32(B4, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
result0 = _mm512_add_epi32(result0,total0);
result1 = _mm512_add_epi32(result1,total1);
}
result0 = _mm512_add_epi32(result0,result1);
result = _mm512_reduce_add_epi32(result0);
return result;
}
vpu_popcount5:
inline uint64_t vpu_popcount5(uint64_t* buf, size_t n) {
_mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[128], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[136], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[144], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[152], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[160], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[168], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[176], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[184], _MM_HINT_T1); // vprefetch1
register size_t result;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
register const __m512i B6 = _mm512_load_epi32((void*)(magic+80));
register __m512i total0;
register __m512i total1;
register __m512i total2;
register __m512i total3;
register __m512i shuf0;
register __m512i shuf1;
register __m512i shuf2;
register __m512i shuf3;
register __m512i result0;
register __m512i result1;
result0 = _mm512_setzero_epi32();
result1 = _mm512_setzero_epi32();
for (i = 0; i < n; i+=32) {
shuf0 = _mm512_load_epi32(&buf[i ]);
shuf1 = _mm512_load_epi32(&buf[i+ 8]);
shuf2 = _mm512_load_epi32(&buf[i+16]);
shuf3 = _mm512_load_epi32(&buf[i+24]);
_mm_prefetch((const char *)&buf[i+192], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+200], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+208], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+216], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+32], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+40], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+48], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+56], _MM_HINT_T0); // vprefetch0
total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1))); // max value in nn is 10
total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
total2 = _mm512_sub_epi32(shuf2, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf2,1)));
total3 = _mm512_sub_epi32(shuf3, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf3,1)));
total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2))); // max value in nnnn is 0100
total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
total2 = _mm512_add_epi32(_mm512_and_epi32(B1, total2), _mm512_and_epi32(B1,_mm512_srli_epi32(total2,2)));
total3 = _mm512_add_epi32(_mm512_and_epi32(B1, total3), _mm512_and_epi32(B1,_mm512_srli_epi32(total3,2)));
total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4))); // max value in 0000nnnn is 00001000
total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
total2 = _mm512_and_epi32(B2, _mm512_add_epi32(total2, _mm512_srli_epi32(total2,4)));
total3 = _mm512_and_epi32(B2, _mm512_add_epi32(total3, _mm512_srli_epi32(total3,4)));
total0 = _mm512_add_epi32(total0, total1); // max value in 000nnnnn is 00010000
total1 = _mm512_add_epi32(total2, total3);
total0 = _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)); // max value in xxxxxxxx00nnnnnn is 00100000
total1 = _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8));
total0 = _mm512_and_epi32(B6, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16))); // max value in each element is 01000000, i.e. 64
total1 = _mm512_and_epi32(B6, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
result0 = _mm512_add_epi32(result0,total0);
result1 = _mm512_add_epi32(result1,total1);
}
result0 = _mm512_add_epi32(result0,result1);
result = _mm512_reduce_add_epi32(result0);
return result;
}
自从昨天发布以来,我已经能够在我自己的卡片上运行您的代码和我的建议。我没有得到与您完全相同的计时,可能是由于我的硬件的步进,也可能与我的编译器的版本有关。但是这个趋势持续下去,我的建议似乎使性能提高了15%。
我得到了一个额外的小性能提升,在5%到10%之间,稍微调整一下,如下面的代码所示。请注意,在下面的代码片段中,B6将每个元素设置为0x000000FF。在这一点上,我认为算法可能会非常接近从GDDR到L2缓存的最大可持续带宽。
注意:如果我用一个重复了10次的for循环来包装popcount5函数的体(注意,这是输入数据的"chunk_size"的10次快速重复,所以其中9次它将在L2中非常热),那么测试的总时间只增加了大约5倍,而不是10倍。我提出这个问题是因为我认为你的目标是调整位计数逻辑的速度,但也许你希望部署它的应用程序实际上有一个更小和/或更热的工作集。如果是这样的话,DRAM->L2带宽引入的节流使情况变得模糊。但请注意,减少测试输入的大小,使其在L2中保持更热,似乎会导致其他开销(可能是openmp开销)变得相对更大。inline uint64_t vpu_popcount5(uint64_t* buf, size_t n) {
_mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[128], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[136], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[144], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[152], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[160], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[168], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[176], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[184], _MM_HINT_T1); // vprefetch1
register size_t result;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B6 = _mm512_load_epi32((void*)(magic+80));
register __m512i total0;
register __m512i total1;
register __m512i total2;
register __m512i total3;
register __m512i shuf0;
register __m512i shuf1;
register __m512i shuf2;
register __m512i shuf3;
register __m512i result0;
register __m512i result1;
result0 = _mm512_setzero_epi32();
result1 = _mm512_setzero_epi32();
for (i = 0; i < n; i+=32) {
shuf0 = _mm512_load_epi32(&buf[i ]);
shuf1 = _mm512_load_epi32(&buf[i+ 8]);
shuf2 = _mm512_load_epi32(&buf[i+16]);
shuf3 = _mm512_load_epi32(&buf[i+24]);
_mm_prefetch((const char *)&buf[i+192], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+200], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+208], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+216], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+32], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+40], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+48], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+56], _MM_HINT_T0); // vprefetch0
total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1))); // max value in nn is 10
total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
total2 = _mm512_sub_epi32(shuf2, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf2,1)));
total3 = _mm512_sub_epi32(shuf3, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf3,1)));
total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2))); // max value in nnnn is 0100
total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
total2 = _mm512_add_epi32(_mm512_and_epi32(B1, total2), _mm512_and_epi32(B1,_mm512_srli_epi32(total2,2)));
total3 = _mm512_add_epi32(_mm512_and_epi32(B1, total3), _mm512_and_epi32(B1,_mm512_srli_epi32(total3,2)));
total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4))); // max value in 0000nnnn is 00001000
total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
total2 = _mm512_and_epi32(B2, _mm512_add_epi32(total2, _mm512_srli_epi32(total2,4)));
total3 = _mm512_and_epi32(B2, _mm512_add_epi32(total3, _mm512_srli_epi32(total3,4)));
total0 = _mm512_add_epi32(total0, total1); // max value in 000nnnnn is 00010000
total1 = _mm512_add_epi32(total2, total3);
total0 = _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)); // max value in xxxxxxxx00nnnnnn is 00100000
total1 = _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8));
total0 = _mm512_and_epi32(B6, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16))); // max value in each element is 01000000, i.e. 64
total1 = _mm512_and_epi32(B6, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
result0 = _mm512_add_epi32(result0,total0);
result1 = _mm512_add_epi32(result1,total1);
/* Reduce add, which is analogous to SSSE3's PSADBW instruction,
is not implementated as a single instruction in VPUv1, thus
emulated by multiple instructions*/
}
result0 = _mm512_add_epi32(result0,result1);
result = _mm512_reduce_add_epi32(result0);
return result;
}
请尝试下面的变体,并报告这是否提高了您的性能?我正在解决我认为在你的编码中不是很理想的几个点:
- 我觉得你的预取距离不太对。在我看来,你可能一直在考虑字节偏移距离,而实际上索引是以uint64为单位的。
- 我认为没有理由在每次循环迭代时都执行缩减操作。您可以在16个SIMD元素中执行位计数的部分累积,然后在循环之外执行单个减少。
- 我不认为做标量端popcount指令和真正得到最好的VPU调度一样有利。专注于一个优秀的VPU时间表是最重要的。我也不认为标量popcount指令实际上与矢量操作配对;也就是说,我认为它只支持u型管。
inline uint64_t vpu_popcount3_revised(uint64_t* buf, size_t n) {
_mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
register size_t result;
size_t i;
register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
register __m512i total0;
register __m512i total1;
register __m512i shuf0;
register __m512i shuf1;
register __m512i result0;
register __m512i result1;
result0 = _mm512_setzero_epi32();
result1 = _mm512_setzero_epi32();
for (i = 0; i < n; i+=16) {
shuf0 = _mm512_load_epi32(&buf[i ]);
shuf1 = _mm512_load_epi32(&buf[i+8]);
_mm_prefetch((const char *)&buf[i+128], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+136], _MM_HINT_T1); // vprefetch1
_mm_prefetch((const char *)&buf[i+16], _MM_HINT_T0); // vprefetch0
_mm_prefetch((const char *)&buf[i+24], _MM_HINT_T0); // vprefetch0
total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));
total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2)));
total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));
total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
total0 = _mm512_and_epi32(B3, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)));
total1 = _mm512_and_epi32(B3, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8)));
total0 = _mm512_and_epi32(B4, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));
total1 = _mm512_and_epi32(B4, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
result0 = _mm512_add_epi32(result0,total0);
result1 = _mm512_add_epi32(result1,total1);
}
/* Reduce add, which is analogous to SSSE3's PSADBW instruction,
is not implementated as a single instruction in VPUv1, thus
emulated by multiple instructions*/
result0 = _mm512_add_epi32(result0,result1);
result = _mm512_reduce_add_epi32(result0);
return result;
}