Differences

This shows you the differences between two versions of the page.

--- base:fast_sqrt [2019-07-24 00:43] – verz
+++ base:fast_sqrt [2019-08-04 02:03] – verz
@@ Line 374: / Line 374: @@
-I'm still working on it, anyway this is based on one of the partial algorithms, and computes Sqrt of a 32bit number.
+----
-Put the number in _square and get the result in _sqrt and _remainder.
+by Verz: the generic algorithm is:
 <code>
-_square  = $5B          ; 4 bytes: $5b-$5e; input number
+        R= 0
-_sqrt    = $5F          ; 2 bytes: $5f-$60; result of the computation
+        M= N
-_remainder = _M+2       ; 2 bytes: $5d-$5e; is in fact the high bytes of _M
+        D= 2^(p-1)
-_T       = $57          ; 4 bytes: $57-$5a
+        for n= 1 to p
-_M       = $5B          ; 4 bytes: $5b-$5e, same as the input _square
+        {
-_D       = $61          ; 2 bytes: $61-$62
+            T= (R+R+D) ASL (p-1)
+            if (T <= M) then M=M-T: R=R+D
+            M= M ASL 1
+            D= D LSR 1
+        }
+</code>
+where //p// is the number of bits of the result; (or half the bits of the radicand, +1 if the radicand has an odd amount of bits).\\
+//p=ceil(#bits-of-radicand/2)//\\
-sqrt32
+Incidentally the algorithm provides also the remainder via (M LSR p)\\
-        ldx #0
+//p// can be greater than necessary, just producing more iterations. A value of p=16 permits to perform the calculations for 32bit numbers and smaller.
-        stx _sqrt               ;R
-        stx _sqrt+1
-        stx _D
-        ;stx _T         ; Tlo is always 0
-        ldx #$80
-        stx _D+1
-        ldy #16
+This is the code for a 32bit integer Sqrt. Provides the result and the remainder:
+<code>
+;********************************************
+;*    sqrt32
+;*
+;*   computes Sqrt of a 32bit number
+;*
+;*      input:  square, the 4-byte source number
+;*      output: sqrt, 16bit value
+;*              remnd, 17bit value
+;*
-_loopsq
+sqrt32  lda #0
-        lda _sqrt
+        sta sqrt        ; R=0
-        asl
+        sta sqrt+1
-        sta _T+2
+        sta M+4
-        lda _sqrt+1
+        ;sta T+1        ; (T+1) is zero until last iteration; (T+0) is always 0
-        rol
-        sta _T+3
-        clc
-        lda _D
-        adc _T+2
-        sta _T+2
-        lda _D+1
-        adc _T+3
-        lsr
-        sta _T+3
-        lda _T+2
-        ror
-        sta _T+2
-        lda #0
-;       sta _T          ; Tlo is always 0
-        ror
-        sta _T+1
-        LDA _M+3
+        ldy #14         ; 15 iterations (14-->0) + last iteration
-        CMP _T+3
+loopsq
-        BCC skip1       ; T <= M    branch if T>M
+        lda sqrt        ; (2*R+D) LSR 1; actually: R+(D LSR 1)
+        ora stablo,y    ; using ORA instead of ADC is ok because the bit to be set
+        sta T+2         ;    will have not been affected yet
+        lda sqrt+1
+        ora stabhi,y
+        sta T+3
+        bcs skip0       ; takes care of large numbers; if set, M>T
+        lda M+3
+        cmp T+3
+        bcc skip1       ; T <= M    (branch if T>M)
         bne skip0
-        lda _M+2
+        lda M+2
-        cmp _T+2
+        cmp T+2
         bcc skip1
-        bne skip0
+skip0   ;sec
-        lda _M+1
+        lda M+2         ; M=M-T
-        cmp _T+1
+        sbc T+2
-        bcc skip1
+        sta M+2
-;        bne skip0      ;_Tlo is always 0
+        lda M+3
-;        lda _reM
+        sbc T+3
-;        cmp _T
+        sta M+3
-;        bcc skip1
+        lda sqrt        ; R=R+D
+        ora stablo+1,y
-skip0   sec
+        sta sqrt
-;        lda _M         ; Tlo is always 0
+        lda sqrt+1
-;        sbc _T
+        ora stabhi+1,y
-;        sta _M
+        sta sqrt+1
-        lda _M+1        ; M=M-T
-        sbc _T+1
-        sta _M+1
-        lda _M+2
-        sbc _T+2
-        sta _M+2
-        lda _M+3
-        sbc _T+3
-        sta _M+3
-        clc            ; possibly unnecessary
-        lda _sqrt      ; R=R+D
-        adc _D
-        sta _sqrt
-        lda _sqrt+1
-        adc _D+1
-        sta _sqrt+1
 skip1
-        asl _M
+        asl M           ; M=M*2
-        rol _M+1
+        rol M+1
-        rol _M+2
+        rol M+2
-        rol _M+3
+        rol M+3
-        lsr _D+1
+        dey             ; implicit: D=D/2, by the decrement of .Y
-        ror _D
+        bpl loopsq
-        dey
+lastiter                ; code for last iteration
-        beq _endsqrt
+        bcs skp0        ; takes care of large numbers; if set, M>T
-        jmp _loopsq
+        ; during last iteration D=1, so [(2*R+D) LSR 1] makes D the MSB of T+1
-_endsqrt
+        lda M+3
+        cmp sqrt+1      ; (T+3) = sqrtHI
+        bcc skp1        ; T <= M    branch if T>M
+        bne skp0
+        lda M+2
+        cmp sqrt        ; (T+2) = sqrtLO
+        bcc skp1
+        bne skp0
+        lda M+1
+        cmp #$80        ; value of (T+1) during last iteration
+        bcc skp1
+skp0    ;sec
+        lda M+1
+        sbc #$80        ; (T+1) during last iteration
+        sta M+1
+        lda M+2         ; M=M-T
+        sbc sqrt        ; (T+2)
+        sta M+2
+        lda M+3
+        sbc sqrt+1      ; (T+3)
+        sta M+3
+        inc sqrt        ; R=R+D with D=1
+skp1    asl M+1         ; M=M*2; location M+0=0
+        rol M+2
+        rol M+3
+        rol M+4
         rts
-</code>
+;**** Variables and Shift table
+stabhi byte 0,0,0,0,0,0,0,0
+stablo BYTE $01,$02,$04,$08,$10,$20,$40,$80
+       byte 0,0,0,0,0,0,0,0
+square = $57    ; 5 bytes: input value; during calculation needs the 5th byte
+sqrt   = $5F    ; 2 bytes: result
+remnd  = M+2    ; 2 B + 1 b: is in the high bytes of M (M LSR 16); msb is in T+0 (the 5th byte of square)
+T      = $5B    ; 4 bytes: could be 2 bytes: T+0 is always 0; T+1 is 0 until last iteration
+M      = square ; 4 bytes: over the input square
+</code>
+The algorithm is pretty fast: it has a top cycles count of around 1700, but seems to average at 1.3ms (using variables in page zero).\\