我正在学习rust和asm,并使用godbolt。
我有一个程序看起来像:
pub fn test() -> i32 { // sums three local constants; the listings below show how it gets compiled
let a = 1;
let b = 2;
let c = 3;
a + b + c // every operand is a literal, so the result (6) is known at compile time
}
我希望输出看起来像
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq
但是我实际上得到:
example::test:
mov eax, 6
ret
当试图演示堆栈分配,加法等时,这是无用的。
我正在使用编译器标志:-Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off
所以MIR并没有把这些加法运算优化掉。MIR输出为:
// WARNING: This output format is intended for human consumers only
// and is subject to change without notice. Knock yourself out.
fn test() -> i32 {
let mut _0: i32; // return place in scope 0 at /app/example.rs:2:18: 2:21
let _1: i32; // in scope 0 at /app/example.rs:3:9: 3:10
let mut _4: i32; // in scope 0 at /app/example.rs:6:5: 6:10
let mut _5: i32; // in scope 0 at /app/example.rs:6:5: 6:6
let mut _6: i32; // in scope 0 at /app/example.rs:6:9: 6:10
let mut _7: i32; // in scope 0 at /app/example.rs:6:13: 6:14
scope 1 {
debug a => _1; // in scope 1 at /app/example.rs:3:9: 3:10
let _2: i32; // in scope 1 at /app/example.rs:4:9: 4:10
scope 2 {
debug b => _2; // in scope 2 at /app/example.rs:4:9: 4:10
let _3: i32; // in scope 2 at /app/example.rs:5:9: 5:10
scope 3 {
debug c => _3; // in scope 3 at /app/example.rs:5:9: 5:10
}
}
}
bb0: {
StorageLive(_1); // scope 0 at /app/example.rs:3:9: 3:10
_1 = const 1_i32; // scope 0 at /app/example.rs:3:13: 3:14
StorageLive(_2); // scope 1 at /app/example.rs:4:9: 4:10
_2 = const 2_i32; // scope 1 at /app/example.rs:4:13: 4:14
StorageLive(_3); // scope 2 at /app/example.rs:5:9: 5:10
_3 = const 3_i32; // scope 2 at /app/example.rs:5:13: 5:14
StorageLive(_4); // scope 3 at /app/example.rs:6:5: 6:10
StorageLive(_5); // scope 3 at /app/example.rs:6:5: 6:6
_5 = _1; // scope 3 at /app/example.rs:6:5: 6:6
StorageLive(_6); // scope 3 at /app/example.rs:6:9: 6:10
_6 = _2; // scope 3 at /app/example.rs:6:9: 6:10
_4 = Add(move _5, move _6); // scope 3 at /app/example.rs:6:5: 6:10
StorageDead(_6); // scope 3 at /app/example.rs:6:9: 6:10
StorageDead(_5); // scope 3 at /app/example.rs:6:9: 6:10
StorageLive(_7); // scope 3 at /app/example.rs:6:13: 6:14
_7 = _3; // scope 3 at /app/example.rs:6:13: 6:14
_0 = Add(move _4, move _7); // scope 3 at /app/example.rs:6:5: 6:14
StorageDead(_7); // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_4); // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_3); // scope 2 at /app/example.rs:7:1: 7:2
StorageDead(_2); // scope 1 at /app/example.rs:7:1: 7:2
StorageDead(_1); // scope 0 at /app/example.rs:7:1: 7:2
return; // scope 0 at /app/example.rs:7:2: 7:2
}
}
LLVM IR输出为:
define i32 @_ZN7example4test17h2e9277ab15e59fbdE() unnamed_addr #0 !dbg !5 {
start:
ret i32 6, !dbg !10
}
attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
因此,这些加法是在MIR->LLVM这一阶段被优化掉的。
我该如何预防?
谢谢!
注意
如果我使用元组,优化不会发生。如
// Same sum as the first example, but the three values are fields of one tuple.
// According to the listing that follows, this form is not folded into a
// constant: the values are stored on the stack and added from there.
pub fn test() -> i32 {
let a = (1,2,3);
a.0 + a.1 + a.2
}
就变成:
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq
有一个black_box
提示可以防止在编译时进行计算。
请注意,在撰写本文时,它仅在nightly版本的工具链中可用。
// Feature gate for `std::hint::black_box` (nightly-only when this was written).
#![feature(bench_black_box)]
// Routes each literal through `std::hint::black_box` so that the additions
// are not evaluated at compile time.
pub fn test() -> i32 {
let a = std::hint::black_box(1);
let b = std::hint::black_box(2);
let c = std::hint::black_box(3);
a + b + c
}
example::test:
sub rsp, 12
mov dword ptr [rsp], 1
mov rax, rsp
mov eax, dword ptr [rsp]
mov dword ptr [rsp + 4], 2
lea rcx, [rsp + 4]
add eax, dword ptr [rsp + 4]
mov dword ptr [rsp + 8], 3
lea rcx, [rsp + 8]
add eax, dword ptr [rsp + 8]
add rsp, 12
ret
用rust nightly
和-C opt-level=3
编译。
https://rust.godbolt.org/z/rMWhao11W
将对它们的可变引用传递给其他函数(或内联asm)以强制它们具有内存地址。声明函数而不定义函数的一种方法是extern "C"
。
// Declared here but not defined in this snippet, so the call below cannot be
// inlined or reasoned about by the compiler.
extern "C" {
fn ext(x: &i32); // void ext(const int *x);
}
// Adds two arguments and a local constant. `b` is first passed by reference
// to `ext`, which forces it to have a memory address.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
// SAFETY: not established by this snippet. `ext` is only declared, so what it
// does with the reference is unknown. TODO confirm against its definition.
unsafe{ ext(&b); }
//dummy(&c, &a); // alternative, declare as non-inline and use std::hint::black_box
a + b + c
}
使用-C opt-level=0 -C overflow-checks=off
在Godbolt上编译时,编译器会在函数调用前后把两个参数溢出(spill)到内存中。
example::test:
push rax // align the stack and reserve 8 bytes
mov dword ptr [rsp], edi
mov dword ptr [rsp + 4], esi
lea rdi, [rsp + 4] // &b
call qword ptr [rip + ext@GOTPCREL] // function call, -fno-plt style
mov eax, dword ptr [rsp] // reload a and b
add eax, dword ptr [rsp + 4]
add eax, 3 // constant-propagation for c
pop rcx // dealloc stack space with a dummy pop
ret
在不禁用优化的情况下,LLVM如预期的那样保存/恢复一个调用保留寄存器,以在函数调用期间保持a
。
example::test:
push rbx // save a call-preserved reg
sub rsp, 16
mov ebx, edi // use it to hold a
mov dword ptr [rsp + 12], esi // spill b
lea rdi, [rsp + 12] // and pass a pointer to it
call qword ptr [rip + ext@GOTPCREL]
mov eax, dword ptr [rsp + 12]
add eax, ebx
add eax, 3
add rsp, 16 // epilogue
pop rbx
ret
或者,如果只是想专门阻止常量传播,可以使用函数参数而不是常量。参见:如何从GCC/clang的汇编输出中去除"噪音"?
// Takes `a` and `b` as function arguments instead of constants, so only `c`
// is known at compile time.
pub fn test2(a: i32, b: i32) -> i32 {
let c = 3;
a + b + c
}
但是即使在Godbolt上使用-C opt-level=0 -C overflow-checks=off
,
Rustc仍然不会像clang -O0那样把变量溢出到栈空间再重新加载。
example::test2:
mov eax, edi
add eax, esi
add eax, 3
ret
(opt-level=3
当然会使用LEA而不是MOV+ADD,但仍然把常数3单独加上去,这是针对延迟而不是吞吐量做的优化:在Skylake这类CPU上,三组件LEA的延迟是3个周期而不是1个。Alder Lake则不同,在它上面lea eax, [rsi+rdi+3]
的延迟是1个周期,使用缩放索引时则是2个周期。而在Zen或Alder Lake的E核上它需要2个周期,因此与使用单独的LEA/ADD相比,延迟上持平。https://uops.info/)
#[inline(never)]
这是在"如何声明一个函数而不实现它?"这个问题中有人建议的做法,用来得到一个不会被内联的函数调用。我们可以使用@Finomnis建议的std::hint::black_box
来实际使用args,并强制调用者在传递引用时在内存中具体化一个值。
在上面的Godbolt链接中取消对它的注释,即可尝试。
#![feature(bench_black_box)]
// Passes references to `c` and `a` to the non-inlined `dummy`, so the caller
// has to materialize those values in memory before the call.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
dummy(&c, &a);
a + b + c
}
// Call target that is kept out of line and "uses" both references through
// `black_box`; it computes nothing and returns nothing.
#[inline(never)]
pub extern fn dummy(_a: &i32, _b: &i32) {
//use std::sync::atomic::*;
//compiler_fence(Ordering::Release); // make the function non-empty even without args
std::hint::black_box(_a);
std::hint::black_box(_b);
}