访问连续L1高速缓冲存储器的代码比访问3x非连续L1存储器的代码慢



这个问题是对此的更新:访问三个静态数组比一个包含3x数据的静态数组更快?但我有太多的更新,所以开始一个新的是有意义的。

我有700个"项目",每个项目都有三个属性——abe。我循环遍历每一项,并使用这三个atrribute来计算两个"全局"属性cd的值。

我使用了两种技术来实现这一点:

1) 三个700元素的数组,三个属性各一个数组:

 item0.a = array1[0]
 item0.b = array2[0]
 item0.e = array3[0]

2) 一个2100元素的数组,包含连续三个属性的数据:

 item0.a = array[offset + 0]
 item0.b = array[offset + 1]
 item0.e = array[offset + 2]

现在,三个项属性abe在循环中一起使用,因此,如果将它们存储在一个数组中,则性能应该比使用三个数组技术更好。

这是因为在技术#1中,三个属性(每个相同项)中的每一个都相隔(700 x 4字节)。因此,我必须为一个项目检索三个不同的缓存行。

然而,使用技术#2,这三个属性在内存中相邻,因此一个缓存行将为我提供所有三个项属性。事实上,因为缓存行是64字节,而我使用的是整数——每个缓存行加载将有16个整数可用。这意味着我可以在每个缓存行中处理16/3=5个项目。

换句话说,技术#1应该需要至少3个高速缓存行加载来处理5个项目,而技术#2只需要一个高速缓存线加载。我原以为第二项技术会快很多。

结果(Win 7 64,MSVC 2012与桌面Ivy Bridge CPU):

  • 技术#1:3000个CPU周期(平均)
  • 技术#2:3400个CPU周期(平均)

技术#1 C++:

unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];
//I have left out code for simplicity. You can assume by now the arrays are populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
    unsigned int a= array1[i];     //Array 1
    unsigned int b= array2[i];     //Array 2
    data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
    unsigned int e = array3[i];    //Array 3
    c += (a * e);
    d += (b * e);
}
finish = __rdtscp(&y);
if(data_for_all_items){
    std::cout << finish - start << std::endl;
    //This line prevents compiler over-optimizing
    std::cout << c << d << std::endl;
}

技术#1装配:

            unsigned int x;
            unsigned int y;
            start = __rdtscp(&x);
 rdtscp  
 shl         rdx,20h  
 lea         r8,[this]  
            for(int i=0; i < 700; i++){
 sub         rdi,rsi  
 or          rax,rdx  
 lea         r10,[rsi+4]  
 mov         ebx,46h  
 mov         dword ptr [r8],ecx  
 mov         r11,rax  
 sub         r9,rsi  
                unsigned int a = array1[i];
                unsigned int b = array2[i];
                all_instr_found = all_instr_found & (a != -1 & b != -1);
 cmp         dword ptr [r10-4],0FFFFFFFFh  
 xorps       xmm0,xmm0  
 setne       cl  
 cmp         dword ptr [r10+rdi-4],0FFFFFFFFh  
 setne       al  
 and         cl,al  
 and         bpl,cl  
                unsigned int e = array3[i];
 mov         ecx,dword ptr [r10+r9-4]  

                c += (a * e);
 mov         eax,ecx  
                d += (b * e);
 imul        ecx,dword ptr [r10-4]  
 imul        eax,dword ptr [r10+rdi-4]  
 cmp         dword ptr [r10],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [rdi+r10],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r9+r10]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [rdi+r10]  
 cmp         dword ptr [r10+4],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [rdi+r10+4],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r9+r10+4]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10+4]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [rdi+r10+4]  
 cmp         dword ptr [r10+8],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [rdi+r10+8],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+8]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10+8]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [rdi+r10+8]  
 cmp         dword ptr [r10+0Ch],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [r10+rdi+0Ch],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+0Ch]  
 mov         eax,ecx  
                d += (b * e);
 addsd       xmm7,xmm0  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [r10+rdi+0Ch]  
 imul        ecx,dword ptr [r10+0Ch]  
 cvtsi2sd    xmm0,rax  
 addsd       xmm6,xmm0  
 cmp         dword ptr [r10+10h],0FFFFFFFFh  
 mov         eax,ecx  
 xorps       xmm0,xmm0  
 setne       cl  
 cmp         dword ptr [r10+rdi+10h],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+10h]  
 addsd       xmm7,xmm0  
 mov         eax,ecx  
 xorps       xmm0,xmm0  
 imul        ecx,dword ptr [r10+10h]  
 imul        eax,dword ptr [r10+rdi+10h]  
 cmp         dword ptr [r10+14h],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [r10+rdi+14h],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+14h]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10+14h]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [r10+rdi+14h]  
 cmp         dword ptr [r10+18h],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [r10+rdi+18h],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+18h]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10+18h]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [r10+rdi+18h]  
 cmp         dword ptr [r10+1Ch],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [r10+rdi+1Ch],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+1Ch]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        ecx,dword ptr [r10+1Ch]  
 xorps       xmm0,xmm0  
 imul        eax,dword ptr [r10+rdi+1Ch]  
 cmp         dword ptr [r10+20h],0FFFFFFFFh  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 setne       cl  
 cmp         dword ptr [r10+rdi+20h],0FFFFFFFFh  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 and         bpl,cl  
 mov         ecx,dword ptr [r10+r9+20h]  
 mov         eax,ecx  
 addsd       xmm7,xmm0  
 imul        eax,dword ptr [r10+rdi+20h]  
 imul        ecx,dword ptr [r10+20h]  
 xorps       xmm0,xmm0  
 add         r10,28h  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 addsd       xmm7,xmm0  
 dec         rbx  
 jne         013FC6DA71h  
            }
            finish = __rdtscp(&y);
 rdtscp  
 shl         rdx,20h  
 lea         r8,[y]  
 or          rax,rdx  
 mov         dword ptr [r8],ecx

(看起来编译器为技术#1展开了五次循环)

技术#2 C++:

unsigned int x;
unsigned int y;
unsigned short pos = 0;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];
//I have left out code for simplicity. You can assume by now the array is populated.
start = __rdtscp(&x);
while(pos < 2100){
    unsigned int a = array[pos + 0];     
    unsigned int b = array[pos + 1];     
    data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
    unsigned int e = array[pos + 2];    
    c += (a * e);
    d += (b * e);
    pos = pos + 3;
}
finish = __rdtscp(&y);
if(data_for_all_items){
    std::cout << finish - start << std::endl;
    //This line prevents compiler over-optimizing
    std::cout << c << d << std::endl;
}

装配技术#2:

            start = __rdtscp(&x);
 rdtscp  
 shl         rdx,20h  
 lea         r9,[this]  
 mov         r11d,2BCh  
 or          rax,rdx  
 mov         dword ptr [r9],ecx  
 add         r10,8  
 mov         rbx,rax  
 nop         word ptr [rax+rax]  
            while(pos < 2100){
                unsigned int a = array[pos];
                unsigned int b = array[pos + 1];
                all_instr_found = all_instr_found & (a != -1 & b != -1);
 cmp         dword ptr [r10-4],0FFFFFFFFh  
 xorps       xmm0,xmm0  
 setne       dl  
 cmp         dword ptr [r10-8],0FFFFFFFFh  
 setne       cl  
                unsigned int e = array[pos + 2];
                c += (a * e);
                d += (b * e);
                pos = pos + 3;
 add         r10,0Ch  
 and         dl,cl  
 mov         ecx,dword ptr [r10-0Ch]  
 mov         eax,ecx  
 and         r15b,dl  
 imul        ecx,dword ptr [r10-10h]  
 imul        eax,dword ptr [r10-14h]  
 cvtsi2sd    xmm0,rax  
 mov         eax,ecx  
 addsd       xmm6,xmm0  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 addsd       xmm7,xmm0  
 dec         r11  
 jne         013F21DA30h 
            }

            finish = __rdtscp(&y);
 rdtscp  
 shl         rdx,20h  
 lea         r8,[y]  
 or          rax,rdx  
 mov         dword ptr [r8],ecx

所以没有循环展开?这一定是原因吧?我用这个C++:手动展开循环

    while(pos < 2100){
        unsigned int a1 = array[pos + 0];
        unsigned int b1 = array[pos + 1];
        unsigned int e1 = array[pos + 2];
        unsigned int a2 = array[pos + 3];
        unsigned int b2 = array[pos + 4];
        unsigned int e2 = array[pos + 5];
        unsigned int a3 = array[pos + 6];
        unsigned int b3 = array[pos + 7];
        unsigned int e3 = array[pos + 8];
        unsigned int a4 = array[pos + 9];
        unsigned int b4 = array[pos + 10];
        unsigned int e4 = array[pos + 11];
        unsigned int a5 = array[pos + 12];
        unsigned int b5 = array[pos + 13];
        unsigned int e5 = array[pos + 14];
        c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
        d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
        data_for_all_items = data_for_all_items & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
        pos += 15;
    }
    if(data_for_all_items){
        std::cout << finish - start << std::endl;
        //This line prevents compiler over-optimizing
        std::cout << c << d << std::endl;
    }

生成此程序集:

           start = __rdtscp(&x);
 rdtscp  
 lea         r9,[x]  
 shl         rdx,20h  
 mov         qword ptr [rsp+108h],8Ch  
 mov         dword ptr [r9],ecx  
 lea         r9,[r8+8]  
 or          rax,rdx  
 mov         qword ptr [y],r9  
 mov         qword ptr [rsp+20h],rax  
 nop  
                unsigned int a1 = array[pos + 0];
                unsigned int b1 = array[pos + 1];
                unsigned int e1 = array[pos + 2];
 mov         edi,dword ptr [r9]  
                unsigned int a2 = array[pos + 3];
 mov         r13d,dword ptr [r9+4]  
                unsigned int b2 = array[pos + 4];
 mov         r12d,dword ptr [r9+8]  
 xorps       xmm0,xmm0  
                unsigned int e2 = array[pos + 5];
 mov         r10d,dword ptr [r9+0Ch]  
                unsigned int a3 = array[pos + 6];
 mov         r15d,dword ptr [r9+10h]  
                unsigned int b3 = array[pos + 7];
 mov         r14d,dword ptr [r9+14h]  
                unsigned int e3 = array[pos + 8];
 mov         r8d,dword ptr [r9+18h]  
                unsigned int a4 = array[pos + 9];
 mov         ebp,dword ptr [r9+1Ch]  
                unsigned int b4 = array[pos + 10];
 mov         esi,dword ptr [r9+20h]  
                unsigned int e4 = array[pos + 11];
 mov         edx,dword ptr [r9+24h]  
                unsigned int a5 = array[pos + 12];
 mov         ebx,dword ptr [r9+28h]  
                unsigned int b5 = array[pos + 13];
 mov         r11d,dword ptr [r9+2Ch]  
                unsigned int e5 = array[pos + 14];
 mov         r9d,dword ptr [r9+30h]  
                c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
 mov         eax,edx  
 mov         dword ptr [x],r13d  
                d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
 imul        edx,esi  
 imul        eax,ebp  
 mov         ecx,r9d  
 imul        r9d,r11d  
 imul        ecx,ebx  
 add         r9d,edx  
 add         ecx,eax  
 mov         eax,r8d  
 imul        r8d,r14d  
 imul        eax,r15d  
 add         r9d,r8d  
                all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
 movzx       r8d,byte ptr [this]  
 add         ecx,eax  
 mov         eax,r10d  
 imul        r10d,r12d  
 add         r9d,r10d  
 imul        eax,r13d  
 mov         r13,qword ptr [y]  
 add         ecx,eax  
 mov         eax,edi  
 imul        eax,dword ptr [r13-8]  
 add         eax,ecx  
 cvtsi2sd    xmm0,rax  
 mov         rax,r13  
 mov         edx,dword ptr [rax-4]  
 imul        edi,edx  
 cmp         r11d,0FFFFFFFFh  
 addsd       xmm6,xmm0  
 setne       cl  
                all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
 cmp         ebx,0FFFFFFFFh  
 lea         eax,[rdi+r9]  
 mov         r9,r13  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 setne       al  
 and         cl,al  
 cmp         esi,0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         ebp,0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         r14d,0FFFFFFFFh  
 addsd       xmm7,xmm0  
 setne       al  
 and         cl,al  
 cmp         r15d,0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         r12d,0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         dword ptr [x],0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         edx,0FFFFFFFFh  
 setne       al  
 and         cl,al  
 cmp         dword ptr [r13-8],0FFFFFFFFh  
 setne       al  
 and         cl,al  
 and         r8b,cl  
                pos += 15;
 add         r9,3Ch  
 mov         qword ptr [y],r9  
 mov         byte ptr [this],r8b  

            while(pos < 2100){
 dec         qword ptr [rsp+108h]  
 jne         013F31DA50h  
            }

            finish = __rdtscp(&y);
 rdtscp  
 shl         rdx,20h  
 lea         r9,[y]  
 mov         dword ptr [r9],ecx  
 mov         r15,qword ptr [rsp+0A8h]  
 mov         r13,qword ptr [rsp+0B0h]  
 mov         r12,qword ptr [rsp+0B8h]  
 or          rax,rdx  

新的手动循环展开平均需要3500个CPU周期,比未滚动的原始版本3400个CPU周期慢。

有人能解释一下为什么从缓存中提取的代码要慢3倍吗?是否有办法帮助编译器或比技术#1更快地获得这些代码?

在#2中使用一些预取,您可能会达到这个崇高的性能目标
对于gcc,使用内联汇编程序和3dNowPrefetch指令(现在几乎只剩下该扩展了)。

您的示例1可能运行得更快,因为执行3个独立的内存请求(由于执行无序)意味着内存总线上的吞吐量更好(空闲时间更少)。

这是一个很好的例子,微观优化不仅会让事情变得更难,而且会弄巧成拙
处理器设计者和制造商以及编译器作者花费了惊人的金钱、时间和人才来优化常见情况,因此很难对其进行事后猜测。

一些简单的信封后运算。

从循环中的int数组abc[3N]中提取将需要(3N*4+63)/64次缓存提取。在处理5.33个三元组(5*4*3=60)之后,您将需要下一个缓存行。

从三个不连续的数组中提取第一次迭代需要3次提取,但这三条缓存线可以在接下来的15次迭代中保留在内存中(64/4=16)。因此,您将需要3*((N*4+63)/64)回迁。

使用700作为N:

F1 = (8400+63)/64 = 132
F2 = 3*((700*4+63)/64) = 132

考虑到CPU的缓存大小,当迭代在三个数组上并行进行时,不可能抛出行。

Intel Ivy Brige CPU可以预测获取多个缓存线,只要预测正确,您就不会注意到单个数据流和多达6个数据流之间的任何差异(流的确切数量取决于CPU设计)。

你看到的差异很可能更多地与组装输出的变化有关,而不是与L-疼痛行为有关。在每种情况下生成的asm显然都非常不同,上一个版本(你的手展开的版本)每次迭代的指令最多——这是一个非常安全的指示,表明它应该更慢,只是为了通过CPU的指令解码器和执行单元强制执行更多的"实际工作"。

此外,cvtsi2sd尤其是Ivy Bridge上的一条慢指令,它可能会简单地根据它在指令流中相对于输入(依赖项)和输出(结果延迟)的位置来扭曲基准测试结果。如果您想隔离测试以单独测量内存性能。我强烈建议将该指令从基准中删除。

相关内容

  • 没有找到相关文章

最新更新