这个问题是对此的更新:访问三个静态数组比一个包含3x数据的静态数组更快?但我有太多的更新,所以开始一个新的是有意义的。
我有700个"项目",每个项目都有三个属性——a
、b
和e
。我循环遍历每一项,并使用这三个atrribute来计算两个"全局"属性c
和d
的值。
我使用了两种技术来实现这一点:
1) 三个700元素的数组,三个属性各一个数组:
item0.a = array1[0]
item0.b = array2[0]
item0.e = array3[0]
2) 一个2100元素的数组,包含连续三个属性的数据:
item0.a = array[offset + 0]
item0.b = array[offset + 1]
item0.e = array[offset + 2]
现在,三个项属性a
、b
和e
在循环中一起使用,因此,如果将它们存储在一个数组中,则性能应该比使用三个数组技术更好。
这是因为在技术#1中,三个属性(每个相同项)中的每一个都相隔(700 x 4字节)。因此,我必须为一个项目检索三个不同的缓存行。
然而,使用技术#2,这三个属性在内存中相邻,因此一个缓存行将为我提供所有三个项属性。事实上,因为缓存行是64字节,而我使用的是整数——每个缓存行加载将有16个整数可用。这意味着我可以在每个缓存行中处理16/3=5个项目。
换句话说,技术#1应该需要至少3个高速缓存行加载来处理5个项目,而技术#2只需要一个高速缓存线加载。我原以为第二项技术会快很多。
结果(Win 7 64,MSVC 2012与桌面Ivy Bridge CPU):
- 技术#1:3000个CPU周期(平均)
- 技术#2:3400个CPU周期(平均)
技术#1 C++:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];
//I have left out code for simplicity. You can assume by now the arrays are populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned int a= array1[i]; //Array 1
unsigned int b= array2[i]; //Array 2
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array3[i]; //Array 3
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
if(data_for_all_items){
std::cout << finish - start << std::endl;
//This line prevents compiler over-optimizing
std::cout << c << d << std::endl;
}
技术#1装配:
unsigned int x;
unsigned int y;
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r8,[this]
for(int i=0; i < 700; i++){
sub rdi,rsi
or rax,rdx
lea r10,[rsi+4]
mov ebx,46h
mov dword ptr [r8],ecx
mov r11,rax
sub r9,rsi
unsigned int a = array1[i];
unsigned int b = array2[i];
all_instr_found = all_instr_found & (a != -1 & b != -1);
cmp dword ptr [r10-4],0FFFFFFFFh
xorps xmm0,xmm0
setne cl
cmp dword ptr [r10+rdi-4],0FFFFFFFFh
setne al
and cl,al
and bpl,cl
unsigned int e = array3[i];
mov ecx,dword ptr [r10+r9-4]
c += (a * e);
mov eax,ecx
d += (b * e);
imul ecx,dword ptr [r10-4]
imul eax,dword ptr [r10+rdi-4]
cmp dword ptr [r10],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r9+r10]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10]
cmp dword ptr [r10+4],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10+4],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r9+r10+4]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+4]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10+4]
cmp dword ptr [r10+8],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10+8],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+8]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+8]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10+8]
cmp dword ptr [r10+0Ch],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+0Ch],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+0Ch]
mov eax,ecx
d += (b * e);
addsd xmm7,xmm0
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+0Ch]
imul ecx,dword ptr [r10+0Ch]
cvtsi2sd xmm0,rax
addsd xmm6,xmm0
cmp dword ptr [r10+10h],0FFFFFFFFh
mov eax,ecx
xorps xmm0,xmm0
setne cl
cmp dword ptr [r10+rdi+10h],0FFFFFFFFh
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+10h]
addsd xmm7,xmm0
mov eax,ecx
xorps xmm0,xmm0
imul ecx,dword ptr [r10+10h]
imul eax,dword ptr [r10+rdi+10h]
cmp dword ptr [r10+14h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+14h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+14h]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+14h]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+14h]
cmp dword ptr [r10+18h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+18h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+18h]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+18h]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+18h]
cmp dword ptr [r10+1Ch],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+1Ch],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+1Ch]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+1Ch]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+1Ch]
cmp dword ptr [r10+20h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+20h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+20h]
mov eax,ecx
addsd xmm7,xmm0
imul eax,dword ptr [r10+rdi+20h]
imul ecx,dword ptr [r10+20h]
xorps xmm0,xmm0
add r10,28h
cvtsi2sd xmm0,rax
mov eax,ecx
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm7,xmm0
dec rbx
jne 013FC6DA71h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
(看起来编译器为技术#1展开了五次循环)
技术#2 C++:
unsigned int x;
unsigned int y;
unsigned short pos = 0;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];
//I have left out code for simplicity. You can assume by now the array is populated.
start = __rdtscp(&x);
while(pos < 2100){
unsigned int a = array[pos + 0];
unsigned int b = array[pos + 1];
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
pos = pos + 3;
}
finish = __rdtscp(&y);
if(data_for_all_items){
std::cout << finish - start << std::endl;
//This line prevents compiler over-optimizing
std::cout << c << d << std::endl;
}
装配技术#2:
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r9,[this]
mov r11d,2BCh
or rax,rdx
mov dword ptr [r9],ecx
add r10,8
mov rbx,rax
nop word ptr [rax+rax]
while(pos < 2100){
unsigned int a = array[pos];
unsigned int b = array[pos + 1];
all_instr_found = all_instr_found & (a != -1 & b != -1);
cmp dword ptr [r10-4],0FFFFFFFFh
xorps xmm0,xmm0
setne dl
cmp dword ptr [r10-8],0FFFFFFFFh
setne cl
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
pos = pos + 3;
add r10,0Ch
and dl,cl
mov ecx,dword ptr [r10-0Ch]
mov eax,ecx
and r15b,dl
imul ecx,dword ptr [r10-10h]
imul eax,dword ptr [r10-14h]
cvtsi2sd xmm0,rax
mov eax,ecx
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm7,xmm0
dec r11
jne 013F21DA30h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
所以没有循环展开?这一定是原因吧?我用这个C++:手动展开循环
while(pos < 2100){
unsigned int a1 = array[pos + 0];
unsigned int b1 = array[pos + 1];
unsigned int e1 = array[pos + 2];
unsigned int a2 = array[pos + 3];
unsigned int b2 = array[pos + 4];
unsigned int e2 = array[pos + 5];
unsigned int a3 = array[pos + 6];
unsigned int b3 = array[pos + 7];
unsigned int e3 = array[pos + 8];
unsigned int a4 = array[pos + 9];
unsigned int b4 = array[pos + 10];
unsigned int e4 = array[pos + 11];
unsigned int a5 = array[pos + 12];
unsigned int b5 = array[pos + 13];
unsigned int e5 = array[pos + 14];
c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
data_for_all_items = data_for_all_items & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
pos += 15;
}
if(data_for_all_items){
std::cout << finish - start << std::endl;
//This line prevents compiler over-optimizing
std::cout << c << d << std::endl;
}
生成此程序集:
start = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov qword ptr [rsp+108h],8Ch
mov dword ptr [r9],ecx
lea r9,[r8+8]
or rax,rdx
mov qword ptr [y],r9
mov qword ptr [rsp+20h],rax
nop
unsigned int a1 = array[pos + 0];
unsigned int b1 = array[pos + 1];
unsigned int e1 = array[pos + 2];
mov edi,dword ptr [r9]
unsigned int a2 = array[pos + 3];
mov r13d,dword ptr [r9+4]
unsigned int b2 = array[pos + 4];
mov r12d,dword ptr [r9+8]
xorps xmm0,xmm0
unsigned int e2 = array[pos + 5];
mov r10d,dword ptr [r9+0Ch]
unsigned int a3 = array[pos + 6];
mov r15d,dword ptr [r9+10h]
unsigned int b3 = array[pos + 7];
mov r14d,dword ptr [r9+14h]
unsigned int e3 = array[pos + 8];
mov r8d,dword ptr [r9+18h]
unsigned int a4 = array[pos + 9];
mov ebp,dword ptr [r9+1Ch]
unsigned int b4 = array[pos + 10];
mov esi,dword ptr [r9+20h]
unsigned int e4 = array[pos + 11];
mov edx,dword ptr [r9+24h]
unsigned int a5 = array[pos + 12];
mov ebx,dword ptr [r9+28h]
unsigned int b5 = array[pos + 13];
mov r11d,dword ptr [r9+2Ch]
unsigned int e5 = array[pos + 14];
mov r9d,dword ptr [r9+30h]
c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
mov eax,edx
mov dword ptr [x],r13d
d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
imul edx,esi
imul eax,ebp
mov ecx,r9d
imul r9d,r11d
imul ecx,ebx
add r9d,edx
add ecx,eax
mov eax,r8d
imul r8d,r14d
imul eax,r15d
add r9d,r8d
all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
movzx r8d,byte ptr [this]
add ecx,eax
mov eax,r10d
imul r10d,r12d
add r9d,r10d
imul eax,r13d
mov r13,qword ptr [y]
add ecx,eax
mov eax,edi
imul eax,dword ptr [r13-8]
add eax,ecx
cvtsi2sd xmm0,rax
mov rax,r13
mov edx,dword ptr [rax-4]
imul edi,edx
cmp r11d,0FFFFFFFFh
addsd xmm6,xmm0
setne cl
all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
cmp ebx,0FFFFFFFFh
lea eax,[rdi+r9]
mov r9,r13
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
cmp esi,0FFFFFFFFh
setne al
and cl,al
cmp ebp,0FFFFFFFFh
setne al
and cl,al
cmp r14d,0FFFFFFFFh
addsd xmm7,xmm0
setne al
and cl,al
cmp r15d,0FFFFFFFFh
setne al
and cl,al
cmp r12d,0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [x],0FFFFFFFFh
setne al
and cl,al
cmp edx,0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [r13-8],0FFFFFFFFh
setne al
and cl,al
and r8b,cl
pos += 15;
add r9,3Ch
mov qword ptr [y],r9
mov byte ptr [this],r8b
while(pos < 2100){
dec qword ptr [rsp+108h]
jne 013F31DA50h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r9,[y]
mov dword ptr [r9],ecx
mov r15,qword ptr [rsp+0A8h]
mov r13,qword ptr [rsp+0B0h]
mov r12,qword ptr [rsp+0B8h]
or rax,rdx
新的手动循环展开平均需要3500个CPU周期,比未滚动的原始版本3400个CPU周期慢。
有人能解释一下为什么从缓存中提取的代码要慢3倍吗?是否有办法帮助编译器或比技术#1更快地获得这些代码?
在#2中使用一些预取,您可能会达到这个崇高的性能目标
对于gcc,使用内联汇编程序和3dNowPrefetch指令(现在几乎只剩下该扩展了)。
您的示例1可能运行得更快,因为执行3个独立的内存请求(由于执行无序)意味着内存总线上的吞吐量更好(空闲时间更少)。
这是一个很好的例子,微观优化不仅会让事情变得更难,而且会弄巧成拙
处理器设计者和制造商以及编译器作者花费了惊人的金钱、时间和人才来优化常见情况,因此很难对其进行事后猜测。
一些简单的信封后运算。
从循环中的int数组abc[3N]中提取将需要(3N*4+63)/64次缓存提取。在处理5.33个三元组(5*4*3=60)之后,您将需要下一个缓存行。
从三个不连续的数组中提取第一次迭代需要3次提取,但这三条缓存线可以在接下来的15次迭代中保留在内存中(64/4=16)。因此,您将需要3*((N*4+63)/64)回迁。
使用700作为N:
F1 = (8400+63)/64 = 132
F2 = 3*((700*4+63)/64) = 132
考虑到CPU的缓存大小,当迭代在三个数组上并行进行时,不可能抛出行。
Intel Ivy Brige CPU可以预测获取多个缓存线,只要预测正确,您就不会注意到单个数据流和多达6个数据流之间的任何差异(流的确切数量取决于CPU设计)。
你看到的差异很可能更多地与组装输出的变化有关,而不是与L-疼痛行为有关。在每种情况下生成的asm显然都非常不同,上一个版本(你的手展开的版本)每次迭代的指令最多——这是一个非常安全的指示,表明它应该更慢,只是为了通过CPU的指令解码器和执行单元强制执行更多的"实际工作"。
此外,cvtsi2sd
尤其是Ivy Bridge上的一条慢指令,它可能会简单地根据它在指令流中相对于输入(依赖项)和输出(结果延迟)的位置来扭曲基准测试结果。如果您想隔离测试以单独测量内存性能。我强烈建议将该指令从基准中删除。