outerLoop:
skipCarry:
mov eax, 2
mov inCompare, eax ;Set our inCompare to 2
mov eax, outCompare
push ecx ;Save status of these registers before entering the innerLoop
push eax
mov ecx, innerLoopCount
isComposite:
mov eax, outCompare ;Value to be checked for composite
mov edx, 0 ;Make sure edx does not have a left over value
div inCompare ;Divide outCompare by inCompare to see if we get a remainder of 0 in edx
cmp edx, 0
jne skipPrint ;If the remainder is not 0, we have not proven that the number in question is composite, so do not print
;Print out a composite value if its been proven
mov eax, outCompare
call WriteDec
mov edx, OFFSET spaces
call WriteString ;Write spaces between the composites
mov ebx, writeCounter
inc ebx ;This will help to make sure we start a new line after writing 10 composites
mov writeCounter, ebx
cmp ebx, 10 ;If counter is at 10, we start a new line
jne exitInnerLoop ;exit the inner loop because we found that the number in question is composite
call CrLf
mov writeCounter, 0
jmp exitInnerLoop ;exit the inner loop because we found that the number in question is composite
skipPrint: ;Jumped to if the number in question was not confirmed to be composite
;This is to continue testing for composite of the number in question by incrementing the divisor
mov ebx, inCompare
inc ebx
mov eax, outCompare
cmp eax, ebx
je exitInnerLoop
mov inCompare, ebx
skipIncrement: ;Will be jumped to if there are no new divisors to be tested
loop isComposite ;Retest with new divisior if there is a possibility
exitInnerLoop:
pop eax ;restore outer loop status
pop ecx
inc eax ;Next number to check for being composite
mov outCompare, eax
loop outerLoop
我得到的错误代码说跳转太长了5个字节,引用了outerLoop。我知道只使用jmp命令并自己跟踪计数器会更简单,但我的家庭作业要求使用循环。有没有办法扩展循环的范围,或者有没有办法将循环中的内容缩小5个字节,而我没有看到?
循环应该检查一个数字是否是复合的,这样功能就已知了。
否,loop
/loopne
各只有一个编码,使用SHORT(带符号的8位rel8
)相对位移。
x86自386以来具有CCD_ 4(又称NEAR条件分支)以及CCD_。如果需要跳得比rel8
更远,那么对于小代码大小,loop
是错误的选择。dec ecx
/jnz
基本上是等价的,只是它破坏了一些标志。
代码大小是您应该使用loop
的唯一原因之一;除了最近的AMD,它在所有方面都很慢)。(speed~=8086上的代码大小,所以它在那里很有用。后来的CPU故意让它变慢,以与驱动程序中的(坏?)延迟循环兼容。
如果你真的想用loop
跳得更远,你可以这样做:
loop .stay_in_loop
jmp .leave_loop
.stay_in_loop:
jmp .far_away
.leave_loop:
.stay_in_loop
块可以在函数结束后,也可以在127B内从未到达的其他地方,这样可以省略jmp .leave_loop
。
您也可以"作弊"并将一些循环体放在线外。例如,您可以将Print
块移动到函数结束之后(或开始之前),并将jcc
移动到它,将jmp
移动回循环内部。
使用两个嵌套的loop
指令只是脑力劳动,但似乎需要使用它。push/pop保存/恢复外循环的ecx
不是必需的;您可以使用另一个寄存器和mov ecx, esi
之类的东西。无论哪种方式都有点糟糕,因为push或mov esi, ecx
必须位于循环的顶部。除非你通过自己做dec esi
而不是复制回loop
产生的值来伪造它。这使代码更干净,但可能被视为欺骗/违背了要求的目的P
也许您的任务的真正目标是让您优化代码大小,以便短跳可以达到。在您的案例中,这绝对是提高代码质量的正确方法。
初学者代码效率低下是很正常的,所以这里有很多简单的本地优化。例如CCD_ 23比5字节CCD_。使用test edx,edx
与零进行比较。它更小更好,设置的标志与cmp edx,0
相同
另外,将变量保存在寄存器中。像mov writeCounter, 0
这样的指令使用disp32寻址模式汇编为mov r/m32, imm32
。这是8个字节的地址和数据,加上2个字节的操作码+ModR/M。每个绝对地址而不是寄存器操作数的开销为4字节。
如果您需要在循环中保存更多的代码大小,您可以将内容复制到堆栈中,并使用[ebp+disp8]
寻址模式访问它们(ModR/M之外的1字节,而不是4字节)。
要释放更多寄存器用于循环,请在函数的开始/结束处按下/弹出它们。
问题中代码的清理版本
外侧loop
中的位移为0xc9
或-55。所以我的版本不到你版本的一半。(循环外ResetWriteCounter
中有12个字节的代码。)
push ebx
push ebp ; we're NOT using it as a frame pointer
push edi
push esi
;;; Important to keep in registers
; edi = inCompare = trial divisor
; ebx = outCompare = prime/composite candidate
;; nice to keep in registers
; ebp = WriteCounter (but done as a down-counter with dec instead of inc/cmp)
; esi = outer loop counter
;; these variable names are kinda bad, IMO.
;; candidate / divisor would be better names.
; ecx = inner loop counter
; eax,edx = scratch for DIV
mov ebx, outCompare ; first composite candidate
mov ebp, 10 ; down-counter to trigger newlines
push ' ' ; two spaces followed by 2 zero bytes (i.e. null-terminated string)
; Use mov edx, esp instead of mov edx, OFFSET spaces in the print block
;;; note that code before this isn't part of the size that LOOP has to jump over.
outerLoop:
test bl, 1
jz Print ;; multiple of 2
;; inner loop now only has to check the odd divisors
mov edi, 3 ; first trial divisor = first odd prime (other than 1)
mov ecx, innerLoopCount ; ideally set to sqrt(ebx), e.g. cvtsi2ss xmm0, ebx / sqrtss xmm0,xmm0 / cvtss2si xmm0, ecx might be correct :P
; could instead use ecx = ebx directly if you don't care about speed, so try divisors all the way up to the candidate.
;; inner loop
isComposite:
mov eax, ebx ; edx:eax = dividend = candidate
xor edx, edx
div edi ; trial division
add edi, 2 ; next trial divisor = next odd number
test edx, edx ; equivalent to cmp edx,0 to check remainder
jz Print ; you could maybe fold this into a LOOPNZ and check ZF after the loop
;If the remainder is not 0, we have not proven that the number in question is composite, so do not print
;;cmp edi, ebx ; divisor, candidate
;;jae exitInnerLoop ; what's the point of this? isn't ecx set so that loop falls through after this many iterations?
loop isComposite ;Retest with new divisor if there is a possibility
jmp NoPrint ; else it was prime
;;; I put this outside the inner loop in the first place instead of needing to multiple jumps out of this to exitInnerLoop
Print: ;Print out a composite value if its been proven
mov eax, ebx
call WriteDec
dec ebp ; new line when the down-counter reaches zero
jz ResetWriteCounter
; print spaces only if not printing a newline
mov edx, esp ; OFFSET spaces; Use stack memory we pushed earlier
call WriteString ;Write spaces between the composites
;; tail of outer loop. Prime numbers jump here without printing.
exitInnerLoop:
NoPrint:
inc ebx ;Next number to check for being composite
; TODO: increment by 2 and don't even check the even numbers, just print them after every odd prime/composite
mov ecx, esi ; ecx = outer loop counter for the LOOP insn
dec esi ; update outer counter instead of having mov esi, ecx at the top of the loop
loop outerLoop
add esp, 4 ; clear the ' ' we pushed earlier. pop esi is smaller and about as efficiently
pop esi
pop edi
pop ebp
pop ebx
ret
; after the end of the function is a good place for infrequently-use blocks that you jump to and then jump back.
ResetWriteCounter:
mov ebp, 10 ; reset counter
; start a new line after writing 10 composites
call CrLf
; could have done this branchlessly by selecting a string of ' ',0 or 13,10,0 based on ebp being zero
; but doing the ebp update branchlessly too is worse than this highly predictable branch. Maybe if the count was a power of 2, DEC/AND
jmp NoPrint ; back where we came from, skipping the write spaces
; alternative: duplicate inc / mov / dec / loop block here
在分支结构方面,您可以做更多的工作。例如,我想在掉出循环之后消除jmp NoPrint
。这看起来很笨拙。把Print
块放在其他地方就可以了,但Print必须跳起来,而不是掉下去。
TODO:使用ecx
作为试用除数。倒计时可能比倒计时慢得多,因为小素数更常见。还需要避免1
。
我本来打算只应用一些局部优化,但当我了解完你的分支结构时,我对代码大小很感兴趣。代码中的golf=获胜的字节数最少。
查看我的一些x86机器代码高尔夫答案,了解代码大小技巧(包括以牺牲速度为代价,有时会降低很多):
- 8字节循环中的GCD
- Chroma键在13个字节中混合了两个图像(数组)
- Extreme Fibonacci:105字节x86 32位代码中
Fib(10^9)
的前1000位数字(使用Linux系统调用),使用10次方小数分支进行扩展精度adc
,便于截断,只保留最有效的数字。这一个很巧妙,因为还有性能要求,所以内环针对速度进行了优化。(在我的i7-6700k上,整个过程大约需要70秒。) - 对数刻度适用于退出者:strlen和memset使用
rep
字符串,用基本^N填充字符替换N个字符的字符串;以21字节为单位 jcc rel32
0和n^3
是否具有相同的一组十进制数字,即40字节的机器代码。(使用bts
在寄存器中构建位图)