c -在寄存器机VM上为循环改进一个简单的自制JIT的性能



现在我正在学习汇编,我的项目是将字节码从一个幻想的架构转换成用字节码编写的真正的汇编,并使用JIT执行。

为了做到这一点,我必须实现来自另一个体系结构的指令。其中一些很简单,就像常见的汇编指令一样,其中有两个需要更多的字节来实现,就像这两个:

RX和RY - 32位寄存器内存-包含原始字节码

的以小端序排列的字节数组。mov RX, memory[RY] -读取内存中接下来的4个字节(从RY开始),右移这些字节并将它们连接到RX中。mov内存[RX], RY -逆操作。读取RY中的值,左移字节以小端序排列。

在C代码中,这些指令是(考虑到R和mem是全局的):
// mov RX, mem[RY]
void movRxMemRy(unsigned char x, unsigned char y) {
if (R[y]+3 > 128) endExecution = 1;
else R[x] = mem[R[y]+3] << 24 | mem[R[y]+2] << 16 | mem[R[y]+1] << 8 | mem[R[y]];
}
// mov mem[RX], RY
void movMemRxRy(unsigned char x, unsigned char y) {
if (R[x]+3 > 128) {
endExecution = 1;
} else {
mem[R[x]] = (R[y]);  
mem[R[x]+1] = (R[y]) >> 8;  
mem[R[x]+2] = (R[y]) >> 16;  
mem[R[x]+3] = (R[y]) >> 24;        
}
return;
}

这些指令是作为解释器的一部分实现的,它应该比汇编/jit实现慢5-10倍(或更多),但现在运行这些指令需要1/3的时间(大约1,7~1,8秒)。我们的指令实现必须运行教授在原始字节码上给我们的以下指令:

mov R0, 0x006C
mov R1, 0x0001
mov R2, [R0]       # start of huge the loop. [R0] contains the loop counter
cmp R15, R2        # R15 = 0
je 0x0030          # ends the loop execution
mov R14, R2
add R13, R14
sub R2, R1         # decrements the loop counter by 1
mov [R0], R2       # saves the loop counter
jmp 0xFFC8         # returns to the start of the loop

由于循环占用了99%以上的执行时间,所以我决定只粘贴这一部分。循环计数器是包含在[R0]中的值,它从0x03885533开始(每次迭代递减1)。一旦值达到零,它就退出循环。

最复杂的指令是那些负责大部分执行时间的指令,除了添加和子指令。我需要对它们进行优化,使其更快,如果可能的话,使用最少的字节,因为我认为它们可能有问题,因为它在4,5秒内运行。汇编/jit版本比解释版本快,必须在不到1秒的时间内运行(这个项目的时间限制)。我目前的实现是:

r15:包含16个32位"寄存器"的数组;从用于存储最终结果的幻想架构Rbx:包含原始字节码(和计数器值)的内存数组。

//Sub指令与add指令相同,但改变了操作码字节。

void add(unsigned char opcode, unsigned char x, unsigned char y) {
start = c;
// 0x09 - add rx, ry
// mov r14d, [r15+4*y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 4*y;
// add [r15+4x], r14d
machine[c++] = 0x45;
machine[c++] = 0x01;
machine[c++] = 0x77;
machine[c++] = 4*x;
end = c;
for (k= 0; k < (88 - (end-start)); k++) {
machine[c++] = 0x90;
}
end = c;
}
// mov RX, mem[RY]
void movRxMemRy(unsigned char opcode, unsigned char x, unsigned char y) {
// xor r14, r14
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xf6;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// xor r12, r12
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xe4;
// mov    r12d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x67;
machine[c++] = 0x4*y;
// mov    r13b,BYTE PTR [rbx+r12*1+0x3]    
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x03;
// shl    r13,0x18
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x18;
// or     r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov    r13b,BYTE PTR [rbx+r12*1+0x2]    
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x02;
// shl    r13,0x10
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x10;
// or     r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov    r13b,BYTE PTR [rbx+r12*1+0x1]    
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x01;
// shl    r13,0x18
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x08;
// or     r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov    r13b,BYTE PTR [rbx+r12*1]    
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x2c;
machine[c++] = 0x23;
// or     r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// mov    r13b,BYTE PTR [rbx+r12*1+0x3]    
machine[c++] = 0x45;
machine[c++] = 0x89;
machine[c++] = 0x77;
machine[c++] = x*4;

end = c;
}

void movMemRxRy(unsigned char opcode, unsigned char x, unsigned char y) {
start = c;
// xor r14, r14
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xf6;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// xor r12, r12
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xe4;
// r12d,DWORD PTR [r15+0xc] (atual)
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x67;
machine[c++] = 0x4*x;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// mov DWORD PTR [rbx+r12*1], r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x34;
machine[c++] = 0x23;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// shl r14d, 0x8
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x08;
// mov BYTE PTR [rbx+r12*1+0x1],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x01;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;

// shl r14d, 0x10
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x10;
// mov BYTE PTR [rbx+r12*1+0x2],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x02;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;

// shl r14d, 0x18
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x18;
// mov BYTE PTR [rbx+r12*1+0x2],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x03;
end = c;
for (k= 0; k < (88 - (end-start)); k++) {
machine[c++] = 0x90;
}
end = c;
}

因为对于第一个项目,我们不需要检查Rn+3>我决定先让它工作,而不必在汇编代码上实现条件。

关于我如何改进我的代码以低于15秒运行的任何想法?如有任何帮助,不胜感激。

我犯了一些菜鸟错误,通过实现mov操作将C代码直接转换为汇编代码,以逐个字节连接,而我本可以将它们视为双字。

通过这样做,指令变得清晰和快速。此外,我必须添加带有nop的填充来对齐指令并使跳跃更容易,这是不必要的执行。所以我在每条指令的末尾给下一条指令加了一个跳转。作为一个例子,mov指令现在是如何实现的:

mov Rx, mem[Ry]:
mov r12d, dword ptr [r15+4y]
mov r12d, dword ptr [rbx+r12]
mov DWORD PTR [r15+4x],r12d
jmp nextInstruction
Mov mem[Rx], Ry:
mov r12d,DWORD PTR [r15+4*y]
mov r14d,DWORD PTR [r15+4*x]
mov DWORD PTR [rbx+2*x],r12d
jmp nextInstruction
通过这样做,代码在0.6秒内执行。

感谢Peter Cordes和fuz帮助发现了这些问题。

相关内容

  • 没有找到相关文章

最新更新