Differences

This shows you the differences between two versions of the page.

--- base:fast_sqrt [2019-07-27 18:20] – verz
+++ base:fast_sqrt [2019-08-04 02:03] – verz
@@ Line 390: / Line 390: @@
         }
 </code>
-where //p// is the number of bits of the result; or half the bits of the radicand (+1 if the radicand has an odd amount of bits).\\
+where //p// is the number of bits of the result; (or half the bits of the radicand, +1 if the radicand has an odd amount of bits).\\
 //p=ceil(#bits-of-radicand/2)//\\
 Incidentally the algorithm provides also the remainder via (M LSR p)\\
-//p// can be greater than necessary, just producing more iterations. A value of p=16 can be choosed to perform the calculations for 32bit numbers and smaller.
+//p// can be greater than necessary, just producing more iterations. A value of p=16 permits to perform the calculations for 32bit numbers and smaller.
 This is the code for a 32bit integer Sqrt. Provides the result and the remainder:
@@ Line 405: / Line 405: @@
 ;*      input:  square, the 4-byte source number
 ;*      output: sqrt, 16bit value
-;*              remnd, 16bit value
+;*              remnd, 17bit value
 ;*
@@ Line 411: / Line 411: @@
         sta sqrt        ; R=0
         sta sqrt+1
+        sta M+4
         ;sta T+1        ; (T+1) is zero until last iteration; (T+0) is always 0
         ldy #14         ; 15 iterations (14-->0) + last iteration
-loopsq  clc
+loopsq
-        lda stablo,y    ; (2*R+D) LSR 1; actually: R+(D LSR 1)
+        lda sqrt        ; (2*R+D) LSR 1; actually: R+(D LSR 1)
-        adc sqrt
+        ora stablo,y    ; using ORA instead of ADC is ok because the bit to be set
-        sta T+2
+        sta T+2         ;    will have not been affected yet
-        lda stabhi,y
+        lda sqrt+1
-        adc sqrt+1
+        ora stabhi,y
         sta T+3
+        bcs skip0       ; takes care of large numbers; if set, M>T
         lda M+3
@@ Line 436: / Line 438: @@
         sbc T+3
         sta M+3
-        clc             ; possibly unnecessary
         lda sqrt        ; R=R+D
-        adc stablo+1,y
+        ora stablo+1,y
         sta sqrt
         lda sqrt+1
-        adc stabhi+1,y
+        ora stabhi+1,y
         sta sqrt+1
 skip1
@@ Line 451: / Line 452: @@
         bpl loopsq
 lastiter                ; code for last iteration
+        bcs skp0        ; takes care of large numbers; if set, M>T
         ; during last iteration D=1, so [(2*R+D) LSR 1] makes D the MSB of T+1
         lda M+3
@@ Line 474: / Line 476: @@
         sta M+3
         inc sqrt        ; R=R+D with D=1
-        bne skp1
+skp1    asl M+1         ; M=M*2; location M+0=0
-        inc sqrt+1
-skp1    asl M           ; M=M*2
-        rol M+1
         rol M+2
         rol M+3
+        rol M+4
         rts
 ;**** Variables and Shift table
-       byte 0
 stabhi byte 0,0,0,0,0,0,0,0
 stablo BYTE $01,$02,$04,$08,$10,$20,$40,$80
        byte 0,0,0,0,0,0,0,0
-square  = $5B     ; 4 bytes: input value
+square = $57    ; 5 bytes: input value; during calculation needs the 5th byte
-sqrt    = $5F     ; 2 bytes: result
+sqrt   = $5F    ; 2 bytes: result
-remnd   = M+2     ; 2 bytes: is in fact the high bytes of M (M LSR 16)
+remnd  = M+2    ; 2 B + 1 b: is in the high bytes of M (M LSR 16); msb is in T+0 (the 5th byte of square)
-T       = $57     ; 4 bytes: could be 2 bytes: T+0 is always 0; T+1 is 0 until last iteration
+T      = $5B    ; 4 bytes: could be 2 bytes: T+0 is always 0; T+1 is 0 until last iteration
-M       = $5B     ; 4 bytes: over the input square
+M      = square ; 4 bytes: over the input square
 </code>
-The algorithm is pretty fast: it has a top cycles count of around 1700, but seems to average at 1.3ms with variables in page zero.\\
+The algorithm is pretty fast: it has a top cycles count of around 1700, but seems to average at 1.3ms (using variables in page zero).\\
-I think it could be convenient to add a control to check whether M=0 and exit earlier.