我在C 项目中使用的Visual Studio中有2个函数(MASM(。它们是一个未签名的64位倍数功能,可产生128位的结果,而无符号的128位分隔函数产生了128位商的商品并返回剩余的32位。
我需要的是功能的签名版本,但我不确定如何做。
以下是带有未签名函数的.ASM文件的代码:
.MODEL flat, stdcall
.CODE
MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EAX
push EDX
push EBX
push ECX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
MUL EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
MUL EDX
ADD EAX,ECX
ADC EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
MUL EDX
ADD EAX,EBX
ADC ECX,EDX
PUSHFD ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
MUL EDX
POPFD ; Retrieve carry from above.
ADC EAX,ECX
ADC EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop ECX
pop EBX
pop EDX
pop EAX
ret 20
MUL64 ENDP
IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
; How to make this work?
ret 20
IMUL64 ENDP
DIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
push EDX
push EBX
push ESI
push EDI
MOV ESI,pDividend128
MOV EDI,pQuotient128
MOV EBX,Divisor
XOR EDX,EDX
MOV EAX,[ESI+12]
DIV EBX
MOV [EDI+12],EAX
MOV EAX,[ESI+8]
DIV EBX
MOV [EDI+8],EAX
MOV EAX,[ESI+4]
DIV EBX
MOV [EDI+4],EAX
MOV EAX,[ESI]
DIV EBX
MOV [EDI],EAX
MOV EAX,EDX
pop EDI
pop ESI
pop EBX
pop EDX
ret 12
DIV128 ENDP
IDIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
; How to make this work?
ret 12
IDIV128 ENDP
END
,如果您在任何情况下都发现了这一点,请通过帮助编码功能的签名版本来帮助该项目。
首先,mul64函数不起作用100%
为了解决此问题,应将POPFD
指令后的随身携带标志添加到EDX,这是结果的最高32位部分。现在遵循Peter Cordes的建议,删除EAX/ECX/EDX的推动和弹出。最后,使用setc BL
和movzx EBX,BL
保存标志。注意:您无法轻易使用xor EBX,EBX
将其归零,因为xor
会影响标志。我们使用movzx
,因为它的速度比add BL,0xFF
快,并且add
比基于Skylake Specs的adc
快。
结果:
MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EBX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20
MUL64 ENDP
现在,要制作签名的函数版本使用此公式:
my128.Hi -= (((A < 0) ? B : 0) + ((B < 0) ? A : 0));
结果:
IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
push EBX
push EDI
mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
cmp DWORD PTR A+4,0
jg zero_b
jl use_b
cmp DWORD PTR A,0
jae zero_b
use_b:
mov ECX,DWORD PTR B
mov EBX,DWORD PTR B+4
jmp test_b
zero_b:
xor ECX,ECX
mov EBX,ECX
test_b:
cmp DWORD PTR B+4,0
jg zero_a
jl use_a
cmp DWORD PTR B,0
jae zero_a
use_a:
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp do_last_op
zero_a:
xor EAX,EAX
mov EDX,EAX
do_last_op:
add EAX,ECX
adc EDX,EBX
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20
IMUL64 ENDP
DIV128功能应该很好(可能也可能是最快(,以便从32位除数获得128位的商品,但是如果您需要使用128位的除数,则请查看此代码https://www.codeproject.com/tips/785014/uint-Division-modulus,其示例是将二进制移位算法用于128位分区。如果在组装中写入可能会更快3倍。
要制作Div128的签名版本,首先确定除数和股息的符号是相同还是不同。如果它们是相同的,那么结果应该为正。如果它们不同,则结果应该为负。因此,如果股息为负,并致电Div128,则将股息和除数呈阳性,此后,如果符号不同,则将结果否定。
这是C
编写的一些示例代码VOID IDIV128(PSDQWORD Dividend, PSDQWORD Divisor, PSDQWORD Quotient, PSDQWORD Remainder)
{
BOOL Negate;
DQWORD DD, DV;
Negate = TRUE;
// Use local DD and DV so Dividend and Divisor dont get currupted.
DD.Lo = Dividend->Lo;
DD.Hi = Dividend->Hi;
DV.Lo = Divisor->Lo;
DV.Hi = Divisor->Hi;
// if the signs are the same then: Negate = FALSE;
if ((DD.Hi & 0x8000000000000000) == (DV.Hi & 0x8000000000000000)) Negate = FALSE;
// Covert Dividend and Divisor to possitive if negative: (negate)
if (DD.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DD);
if (DV.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DV);
DIV128(&DD, &DV, (PDQWORD)Quotient, (PDQWORD)Remainder);
if (Negate == TRUE)
{
NEG128(Quotient);
NEG128(Remainder);
}
}
编辑:
遵循Peter Cordes的建议,我们可以更优化MUL64/IMUL64。查看所做的具体更改的评论。我还用MUL64@20:
和IMUL64@20:
替换了MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
,以消除MASM添加的不必要的EBP。我还为IMUL64优化了固定工作。
MUL64/IMUL64的当前.ASM文件
.MODEL flat, stdcall
EXTERNDEF MUL64@20 :PROC
EXTERNDEF IMUL64@20 :PROC
.CODE
MUL64@20:
push EBX
push EDI
; -----------------
; | pu128 |
; |---------------|
; | B |
; |---------------|
; | A |
; |---------------|
; | ret address |
; |---------------|
; | EBX |
; |---------------|
; ESP---->| EDI |
; -----------------
A TEXTEQU <[ESP+12]>
B TEXTEQU <[ESP+20]>
pu128 TEXTEQU <[ESP+28]>
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20
IMUL64@20:
push EBX
push EDI
; -----------------
; | pi128 |
; |---------------|
; | B |
; |---------------|
; | A |
; |---------------|
; | ret address |
; |---------------|
; | EBX |
; |---------------|
; ESP---->| EDI |
; -----------------
A TEXTEQU <[ESP+12]>
B TEXTEQU <[ESP+20]>
pi128 TEXTEQU <[ESP+28]>
mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
mov BL,BYTE PTR B+7
and BL,80H
jz zero_a
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp test_a
zero_a:
xor EAX,EAX
mov EDX,EAX
test_a:
mov BL,BYTE PTR A+7
and BL,80H
jz do_last_op
add EAX,DWORD PTR B
adc EDX,DWORD PTR B+4
do_last_op:
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20
END