如何在Godbolt中为rust禁用LLVM优化(用于asm教育目的)



我正在学习rust和asm,并使用godbolt。

我有一个程序看起来像:

/// Returns the sum of three local integer constants (1 + 2 + 3).
///
/// Every operand is a literal bound to a local, so nothing in this function
/// depends on runtime input.
pub fn test() -> i32 {
let a = 1;
let b = 2;
let c = 3;
a + b + c
}

我希望输出看起来像

example::test:
subq    $16, %rsp
movl    $1, (%rsp)
movl    $2, 4(%rsp)
movl    $3, 8(%rsp)
movl    (%rsp), %eax
addl    4(%rsp), %eax
addl    8(%rsp), %eax
addq    $16, %rsp
retq

但是我实际上得到:

example::test:
mov     eax, 6
ret

当试图演示堆栈分配,加法等时,这是无用的。

我正在使用编译器标志:-Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off

所以MIR并没有把这些加法优化掉。MIR输出为:

// WARNING: This output format is intended for human consumers only
// and is subject to change without notice. Knock yourself out.
fn test() -> i32 {
let mut _0: i32;                     // return place in scope 0 at /app/example.rs:2:18: 2:21
let _1: i32;                         // in scope 0 at /app/example.rs:3:9: 3:10
let mut _4: i32;                     // in scope 0 at /app/example.rs:6:5: 6:10
let mut _5: i32;                     // in scope 0 at /app/example.rs:6:5: 6:6
let mut _6: i32;                     // in scope 0 at /app/example.rs:6:9: 6:10
let mut _7: i32;                     // in scope 0 at /app/example.rs:6:13: 6:14
scope 1 {
debug a => _1;                   // in scope 1 at /app/example.rs:3:9: 3:10
let _2: i32;                     // in scope 1 at /app/example.rs:4:9: 4:10
scope 2 {
debug b => _2;               // in scope 2 at /app/example.rs:4:9: 4:10
let _3: i32;                 // in scope 2 at /app/example.rs:5:9: 5:10
scope 3 {
debug c => _3;           // in scope 3 at /app/example.rs:5:9: 5:10
}
}
}
bb0: {
StorageLive(_1);                 // scope 0 at /app/example.rs:3:9: 3:10
_1 = const 1_i32;                // scope 0 at /app/example.rs:3:13: 3:14
StorageLive(_2);                 // scope 1 at /app/example.rs:4:9: 4:10
_2 = const 2_i32;                // scope 1 at /app/example.rs:4:13: 4:14
StorageLive(_3);                 // scope 2 at /app/example.rs:5:9: 5:10
_3 = const 3_i32;                // scope 2 at /app/example.rs:5:13: 5:14
StorageLive(_4);                 // scope 3 at /app/example.rs:6:5: 6:10
StorageLive(_5);                 // scope 3 at /app/example.rs:6:5: 6:6
_5 = _1;                         // scope 3 at /app/example.rs:6:5: 6:6
StorageLive(_6);                 // scope 3 at /app/example.rs:6:9: 6:10
_6 = _2;                         // scope 3 at /app/example.rs:6:9: 6:10
_4 = Add(move _5, move _6);      // scope 3 at /app/example.rs:6:5: 6:10
StorageDead(_6);                 // scope 3 at /app/example.rs:6:9: 6:10
StorageDead(_5);                 // scope 3 at /app/example.rs:6:9: 6:10
StorageLive(_7);                 // scope 3 at /app/example.rs:6:13: 6:14
_7 = _3;                         // scope 3 at /app/example.rs:6:13: 6:14
_0 = Add(move _4, move _7);      // scope 3 at /app/example.rs:6:5: 6:14
StorageDead(_7);                 // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_4);                 // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_3);                 // scope 2 at /app/example.rs:7:1: 7:2
StorageDead(_2);                 // scope 1 at /app/example.rs:7:1: 7:2
StorageDead(_1);                 // scope 0 at /app/example.rs:7:1: 7:2
return;                          // scope 0 at /app/example.rs:7:2: 7:2
}
}

LLVM IR输出为:

define i32 @_ZN7example4test17h2e9277ab15e59fbdE() unnamed_addr #0 !dbg !5 {
start:
ret i32 6, !dbg !10
}
attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }

因此,这些加法是在MIR->LLVM这一阶段被优化掉的。

我该如何预防?

谢谢!

注意

如果我使用元组,优化不会发生。如

/// Returns the sum of the three fields of a local tuple `(1, 2, 3)`.
pub fn test() -> i32 {
// All three constants live in a single tuple value and are read back by field.
let a = (1,2,3);
a.0 + a.1 + a.2
}

就变成:

example::test:
subq    $16, %rsp
movl    $1, (%rsp)
movl    $2, 4(%rsp)
movl    $3, 8(%rsp)
movl    (%rsp), %eax
addl    4(%rsp), %eax
addl    8(%rsp), %eax
addq    $16, %rsp
retq

有一个black_box提示可以防止在编译时进行计算。

请注意,在撰写本文时,它仅在nightly版本中可用。

#![feature(bench_black_box)]
/// Returns the sum of three values, each obtained by passing an integer
/// literal (1, 2, 3) through `std::hint::black_box`.
///
/// `black_box` is an optimization-barrier hint: it is used here so the
/// operands are not treated as known compile-time constants.
pub fn test() -> i32 {
let a = std::hint::black_box(1);
let b = std::hint::black_box(2);
let c = std::hint::black_box(3);
a + b + c
}
example::test:
sub     rsp, 12
mov     dword ptr [rsp], 1
mov     rax, rsp
mov     eax, dword ptr [rsp]
mov     dword ptr [rsp + 4], 2
lea     rcx, [rsp + 4]
add     eax, dword ptr [rsp + 4]
mov     dword ptr [rsp + 8], 3
lea     rcx, [rsp + 8]
add     eax, dword ptr [rsp + 8]
add     rsp, 12
ret

使用Rust nightly和-C opt-level=3编译。

https://rust.godbolt.org/z/rMWhao11W

将对它们的可变引用传递给其他函数(或内联asm)以强制它们具有内存地址。声明函数而不定义函数的一种方法是extern "C"

// Declaration only: `ext` has no definition in this snippet, so what it does
// with the pointer is unknown from here.
extern "C" {
fn ext(x: &i32);   // void ext(const int *x);
}
/// Returns `a + b + c`, where `c` is the local constant 3, after passing a
/// reference to `b` to the external function `ext`.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
// SAFETY: NOTE(review) - `ext` is only declared, not defined, in this snippet;
// the call is assumed sound provided its real definition only reads through
// the pointer for the duration of the call. Confirm against the definition.
unsafe{ ext(&b); }
//dummy(&c, &a);   // alternative: declare as non-inline and use std::hint::black_box
a + b + c
}

在Godbolt上使用-C opt-level=0 -C overflow-checks=off时,编译器会在函数调用前后把两个参数都溢出到内存中。

example::test:
push    rax                          // align the stack and reserve 8 bytes
mov     dword ptr [rsp], edi
mov     dword ptr [rsp + 4], esi
lea     rdi, [rsp + 4]               // &b
call    qword ptr [rip + ext@GOTPCREL]  // function call  -fno-plt style
mov     eax, dword ptr [rsp]         // reload a and b
add     eax, dword ptr [rsp + 4]
add     eax, 3                       // constant-propagation for c
pop     rcx                          // dealloc stack space with a dummy pop
ret

在不禁用优化的情况下,LLVM如预期的那样保存/恢复一个调用保留(call-preserved)寄存器,以便在函数调用期间保存a的值:

example::test:
push    rbx                          // save a call-preserved reg
sub     rsp, 16
mov     ebx, edi                     // use it to hold a
mov     dword ptr [rsp + 12], esi    // spill b
lea     rdi, [rsp + 12]              // and pass a pointer to it
call    qword ptr [rip + ext@GOTPCREL]
mov     eax, dword ptr [rsp + 12]
add     eax, ebx
add     eax, 3
add     rsp, 16                      // epilogue
pop     rbx
ret

或者,如果只是想专门阻止常量折叠,可以使用函数参数而不是常量。例如参见问题:"如何从GCC/clang汇编输出中去除'噪音'?"

/// Returns `a + b + c`, where `c` is the local constant 3.
///
/// The first two operands are function arguments rather than literals; only
/// `c` is a compile-time constant.
pub fn test2(a: i32, b: i32) -> i32 {
let c = 3;
a + b + c
}

但是,即使在Godbolt上使用-C opt-level=0 -C overflow-checks=off,
rustc仍然不会像clang -O0那样把值溢出到栈空间再重新加载。

example::test2:
mov     eax, edi
add     eax, esi
add     eax, 3
ret

(opt-level=3当然会使用LEA而不是MOV+ADD,但仍然用一条单独的ADD来加常数3:在Skylake这类CPU上,三组件(基址+索引+位移)LEA的延迟是3个周期而不是1个,因此这样做是针对延迟而不是吞吐量进行优化。Alder Lake则不同,lea eax, [rsi+rdi+3]只有1个周期的延迟,使用缩放索引时则为2个周期。而在Zen或Alder Lake的E核上它是2个周期,所以与使用单独的LEA/ADD相比不赚不亏。https://uops.info/)


#[inline(never)]

这是在问题"如何声明一个函数而不实现它?"中被建议的做法,作为获得非内联函数调用的一种方式。我们可以使用@Finomnis建议的std::hint::black_box来实际使用这些参数,并强制调用者在传递引用时把值在内存中具体化。

在上面的Godbolt链接中取消对dummy调用的注释,即可尝试一下。

#![feature(bench_black_box)]
/// Returns `a + b + c` (with `c` = 3) after calling `dummy` with references
/// to `c` and `a`.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
// Only `c` and `a` have references taken here; `b` is not passed to `dummy`.
dummy(&c, &a);
a + b + c
}

/// Takes two `i32` references, passes each one to `std::hint::black_box`,
/// and returns nothing.
///
/// Marked `#[inline(never)]` to request that calls to it are not inlined.
#[inline(never)]
pub extern fn dummy(_a: &i32, _b: &i32) {
// Disabled alternative, kept for reference:
//use std::sync::atomic::*;
//compiler_fence(Ordering::Release);   // make the function non-empty even without args
std::hint::black_box(_a);
std::hint::black_box(_b);
}

相关内容

  • 没有找到相关文章

最新更新