C语言 我的CPU可以执行多个nop每个CPU周期



我写了一个简单的程序,在一个循环中执行一堆NOP指令,令我惊讶的是,它每秒执行大约10600000000条指令,或者大约10Ghz,而我的CPU只有2.2GHz。

这怎么可能?是CPU将它们视为单个的mega-NOP,还是我刚刚发现了"指令级并行"的含义?

每秒指令数的更好度量是什么?执行add指令的速度只有414900000/s,是我的CPU所报告速度的十分之一:4390.03

C代码:

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#define ten(a) a a a a a a a a a a
#define hundred(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) 
        ten(a) ten(a) ten(a)
#define ITER 10000000
int main(void) {
  uint64_t i=0;
  uint64_t t=time(NULL);
  while(1) {
    for(int j=0; j<ITER;j++) {
    hundred(asm volatile ("nop");)
    }
    i+=ITER*100;
    printf("%lu/%lun", i, time(NULL)-t);
  }
  return 0;
}

编制装配:

    .file   "gbloopinc.c"
    .section    .rodata
.LC0:
    .string "%lu/%lun"
    .text
    .globl  main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $32, %rsp
    movq    $0, -16(%rbp)
    movl    $0, %edi
    call    time
    movq    %rax, -8(%rbp)
.L4:
    movl    $0, -20(%rbp)
    jmp .L2
.L3:
#APP
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
#NO_APP
    addl    $1, -20(%rbp)
.L2:
    cmpl    $9999999, -20(%rbp)
    jle .L3
    addq    $1000000000, -16(%rbp)
    movl    $0, %edi
    call    time
    subq    -8(%rbp), %rax
    movq    %rax, %rdx
    movq    -16(%rbp), %rax
    movq    %rax, %rsi
    movl    $.LC0, %edi
    movl    $0, %eax
    call    printf
    jmp .L4
    .cfi_endproc
.LFE0:
    .size   main, .-main
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.2) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits

这与多核无关。核心不是"端口"。


每个时钟4 NOPs是超标量/乱序CPU的问题/退役管道宽度。nop甚至不需要执行单元/执行端口(ALU或加载或存储),因此您甚至不受整数执行单元数量的限制。即使是Core2 (Intel的首款4-wide x86 CPU)每个时钟也可以运行4个nop。

正如您所猜测的,这是指令级并行的一个示例。nop当然没有输入依赖。

在你的Sandybridge CPU(每核有3个ALU执行单元),你可以运行3个ADD和一个加载或存储指令每个时钟,因为它的管道宽度是4个上限。请参阅Agner Fog的microarch pdf和x86标签wiki中的其他链接。在一个独立的ADD指令流上,比如

add  eax, eax
add  ebx, ebx
add  ecx, ecx
add  edx, edx
...

您将看到SnB上的每时钟吞吐量约为3,在整数ALU执行端口上出现瓶颈。Haswell可以在每个时钟运行4个add,因为它有第4个ALU执行端口,可以处理非向量整数操作(和分支)。

乱序cpu通常具有比执行单元数量更宽的前端和发行/退役宽度。一旦有空闲的执行单元,就对更多的指令进行解码并准备好执行,这就增加了它们的利用率。否则,如果执行由于串行依赖而停滞或减慢,无序机器只能提前看到当前正在执行的内容。(例如,add eax,eax/add eax,eax需要第一个add的输出作为第二个add的输入,因此每个时钟只能运行一次。)

我将进一步阐述HansPassant的评论。

现代处理器既是超标量又是多核的。多核处理器很容易理解——它有多个核心。另一方面,Superscalar需要更多的硬件知识。这是一个堆栈交换问题,它解释了处理器是超标量意味着什么。超标量处理器在同一个核心中有许多功能单元,并且是高度流水线化的。这就是为什么多个指令可以在单个内核中同时调度和运行的原因。以下是处理器中的一些功能单元:整数加减、浮点乘法、浮点除法、整数乘法、整数除法。

我鼓励你去谷歌更多关于超标量处理器的信息,特别是查找更多关于你的处理器的信息。

相关内容

  • 没有找到相关文章

最新更新