base:fastest_multiplication
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
base:fastest_multiplication [2017-04-19 10:28] – repose | base:fastest_multiplication [2024-02-13 08:24] (current) – accurate timing if Jack Asser code repose | ||
---|---|---|---|
Line 5: | Line 5: | ||
Requires tables or a generator routine such as [[table_generator_routine_for_fast_8_bit_mul_table]] | Requires tables or a generator routine such as [[table_generator_routine_for_fast_8_bit_mul_table]] | ||
- | Jack Asser' | + | Jack Asser' |
- | Chris Jam's: 204.5 ref: http:// | + | Chris Jam's: 204.5* ref: http:// |
- | Mine: 196 zp variation: | + | Mine: 198.6 zp variation: |
- | Times above need to add 12 for jsr/rts \\ | + | Times above need to add 6 for rts \\ |
+ | Note: updated 2023; corrected code, typos and timings \\ | ||
+ | *Timing approximate for external code \\ | ||
+ | This is the winner out of 120 published algorithms as independently tested here: | ||
+ | https:// | ||
+ | Note: I have a new version which is 188.1 cycles, excepting the RTS. \\ | ||
< | < | ||
Line 21: | Line 26: | ||
;zero page ram required: minimum 8 bytes, ideally 14 | ;zero page ram required: minimum 8 bytes, ideally 14 | ||
;do_add: 30 bytes in zp, if used | ;do_add: 30 bytes in zp, if used | ||
- | ; | + | ; |
- | ; | + | ; |
;How to use: | ;How to use: | ||
- | ;put numbers in x/y and result is Y reg, X reg, z1, z0 | + | ;put numbers in x/y and result is Y reg (z3), A reg (z2), z1, z0 |
;tables of squares | ;tables of squares | ||
; | ; | ||
; | ; | ||
- | sqrlo=$c000;511 bytes | + | sqrlo=$c100;511 bytes |
- | sqrhi=$c200;511 bytes | + | sqrhi=$c300;511 bytes |
- | negsqrlo=$c400;511 bytes | + | negsqrlo=$c500;511 bytes |
- | negsqrhi=$c600;511 bytes | + | negsqrhi=$c700;511 bytes |
;pointers to square tables above | ;pointers to square tables above | ||
Line 48: | Line 53: | ||
z0=$80; | z0=$80; | ||
z1=$81 | z1=$81 | ||
- | ;z2=$82 returned in X reg | + | z2=$82 |
- | ;z3=$83 returned in Y reg | + | z3=$83 |
;Example showing use | ;Example showing use | ||
+ | *=$c000 | ||
lda #$ff | lda #$ff | ||
sta x0 | sta x0 | ||
Line 57: | Line 63: | ||
sta y0 | sta y0 | ||
sta y1 | sta y1 | ||
- | jsr maketables | + | jsr makesqrtables |
jsr umult16 | jsr umult16 | ||
+ | sta z2 | ||
+ | sty z3 | ||
+ | rts | ||
;result should be $fffe0001, e.g. as viewed with a typical m 0080 monitor command: | ;result should be $fffe0001, e.g. as viewed with a typical m 0080 monitor command: | ||
;0080 01 00 fe ff | ;0080 01 00 fe ff | ||
Line 121: | Line 130: | ||
sta p_invsqr_hi; | sta p_invsqr_hi; | ||
- | ldy y0 | ||
sec | sec | ||
+ | ldy y0 | ||
lda (p_sqr_lo), | lda (p_sqr_lo), | ||
- | sbc (p_invsqr_lo), | + | sbc (p_invsqr_lo), |
sta z0;x0*y0l | sta z0;x0*y0l | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c1a+1; | + | sta c1a+1; |
;c1a means column 1, row a (partial product to be added later) | ;c1a means column 1, row a (partial product to be added later) | ||
ldy y1 | ldy y1 | ||
- | ;sec ;notice that the high byte of sub above is always | + | ;sec ;notice that the high byte of subtraction |
lda (p_sqr_lo), | lda (p_sqr_lo), | ||
sbc (p_invsqr_lo), | sbc (p_invsqr_lo), | ||
Line 138: | Line 147: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c2a+1; | + | sta c2a+1; |
;set multiplier as x1 | ;set multiplier as x1 | ||
Line 155: | Line 164: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c2b+1;x1*y1h;31 | + | sta c2b+1;x1*y0h;32.992 |
ldy y1 | ldy y1 | ||
Line 164: | Line 173: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | tay;x1*y1h;Y=z3, 30 cycles | + | tay;Y=x1*y1h, 30.992 cycles |
- | ;17+33+31+17+31+30=159 cycles for main multiply part | + | ;17+34+33+17+33+31=164.97 |
;jmp do_adds; can put do_adds in zp for a slight speed increase | ;jmp do_adds; can put do_adds in zp for a slight speed increase | ||
Line 178: | Line 187: | ||
c2a: lda #0 | c2a: lda #0 | ||
c2b: adc #0 | c2b: adc #0 | ||
- | tax;X=z2, 6 cycles | + | tax |
- | bcc c1c;3/6 avg 4.5 | + | bcc c1c;9 |
iny;z3++ | iny;z3++ | ||
- | clc | + | clc;(+3) taken 7% of the time, 3*.07=+.21 |
;-add last number of column 1 | ;-add last number of column 1 | ||
Line 189: | Line 198: | ||
;-add last number of column 2 | ;-add last number of column 2 | ||
- | txa;A=z2 | + | txa |
c2c: adc #0 | c2c: adc #0 | ||
- | tax;X=z2, 6 | + | ;A=z2 |
- | bcc fin;3/4 avg 3.5 | + | bcc fin;7 |
- | iny;z3++ | + | iny;(+1) taken 42% of the time, 1*.42=.42 |
- | ; | + | ; |
- | ;9+6+4.5+8+6+3.5=37 | + | ;add partials part total cycles=33.63 (minimum=33, |
- | fin: rts | + | ;total time=164.97+33.63=198.6 |
+ | fin: rts;add 6 to include this (204.6) | ||
Diagram of the additions | Diagram of the additions |
base/fastest_multiplication.txt · Last modified: 2024-02-13 08:24 by repose