Differences

This shows you the differences between two versions of the page.

--- base:fast_sqrt [2019-07-23 00:16] – verz
+++ base:fast_sqrt [2019-08-18 10:20] – verz
@@ Line 372: / Line 372: @@
         RTS
 </code>
+----
+by Verz: the generic algorithm is:
+<code>
+        R= 0
+        M= N
+        D= 2^(p-1)
+        for n= 1 to p
+        {
+            T= (R+R+D) ASL (p-1)
+            if (T <= M) then M=M-T: R=R+D
+            M= M ASL 1
+            D= D LSR 1
+        }
+</code>
+where //p// is the number of bits of the result; (or half the bits of the radicand, +1 if the radicand has an odd amount of bits).\\
+//p=ceil(#bits-of-radicand/2)//\\
+Incidentally the algorithm provides also the remainder via (M LSR p)\\
+//p// can be greater than necessary, just producing more iterations. A value of p=16 permits to perform the calculations for 32bit numbers and smaller.
+This is the code for a 32bit integer Sqrt. Provides the result and the remainder:
+<code 6502acme>
+;********************************************
+;*    sqrt32
+;*
+;*   computes Sqrt of a 32bit number
+;********************************************
+;*   by Verz - Jul2019
+;********************************************
+;*
+;*  input:  square, 32bit source number
+;*  output: sqrt,   16bit value
+;*          remnd,  17bit value
+;********************************************
+sqrt32  lda #0
+        sta sqrt        ; R=0
+        sta sqrt+1
+        sta M+4
+        ;sta T+1        ; (T+1) is zero until last iteration; (T+0) is always 0
+        ldy #14         ; 15 iterations (14-->0) + last iteration
+loopsq
+        lda sqrt        ; (2*R+D) LSR 1; actually: R+(D LSR 1)
+        ora stablo,y    ; using ORA instead of ADC is ok because the bit to be set
+        sta T+2         ;    will have not been affected yet
+        lda sqrt+1
+        ora stabhi,y
+        sta T+3
+        bcs skip0       ; takes care of large numbers; if set, M>T
+        lda M+3
+        cmp T+3
+        bcc skip1       ; T <= M    (branch if T>M)
+        bne skip0
+        lda M+2
+        cmp T+2
+        bcc skip1
+skip0   ;sec
+        lda M+2         ; M=M-T
+        sbc T+2
+        sta M+2
+        lda M+3
+        sbc T+3
+        sta M+3
+        lda sqrt        ; R=R+D
+        ora stablo+1,y
+        sta sqrt
+        lda sqrt+1
+        ora stabhi+1,y
+        sta sqrt+1
+skip1
+        asl M           ; M=M*2
+        rol M+1
+        rol M+2
+        rol M+3
+        dey             ; implicit: D=D/2, by the decrement of .Y
+        bpl loopsq
+lastiter                ; code for last iteration
+        bcs skp0        ; takes care of large numbers; if set, M>T
+        ; during last iteration D=1, so [(2*R+D) LSR 1] makes D the MSB of T+1
+        lda M+3
+        cmp sqrt+1      ; (T+3) = sqrtHI
+        bcc skp1        ; T <= M    branch if T>M
+        bne skp0
+        lda M+2
+        cmp sqrt        ; (T+2) = sqrtLO
+        bcc skp1
+        bne skp0
+        lda M+1
+        cmp #$80        ; value of (T+1) during last iteration
+        bcc skp1
+skp0    ;sec
+        lda M+1
+        sbc #$80        ; (T+1) during last iteration
+        sta M+1
+        lda M+2         ; M=M-T
+        sbc sqrt        ; (T+2)
+        sta M+2
+        lda M+3
+        sbc sqrt+1      ; (T+3)
+        sta M+3
+        inc sqrt        ; R=R+D with D=1
+skp1    asl M+1         ; M=M*2; location M+0=0
+        rol M+2
+        rol M+3
+        rol M+4
+        rts
+;**** Variables and Shift table
+stabhi byte 0,0,0,0,0,0,0,0
+stablo BYTE $01,$02,$04,$08,$10,$20,$40,$80
+       byte 0,0,0,0,0,0,0,0
+square = $57    ; 5 bytes: input value; during calculation needs the 5th byte
+sqrt   = $5F    ; 2 bytes: result
+remnd  = M+2    ; 2 B + 1 b: is in the high bytes of M (M LSR 16); msb is in T+0 (the 5th byte of square)
+T      = $5B    ; 4 bytes: could be 2 bytes: T+0 is always 0; T+1 is 0 until last iteration
+M      = square ; 4 bytes: over the input square
+</code>
+The algorithm is pretty fast: it has a top cycles count of around 1700, but seems to average at 1.3ms (using variables in page zero).\\