User Tools

Site Tools


base:fastest_multiplication

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Last revisionBoth sides next revision
base:fastest_multiplication [2017-04-17 12:10] reposebase:fastest_multiplication [2023-12-11 12:20] repose
Line 1: Line 1:
-Requires tables or a generator routine such as http://www.codebase64.org/doku.php?id=base:table_generator_routine_for_fast_8_bit_mul_table+====== Fastest 16x16 unsigned multiplication ======
  
 +By Repose
 +
 +Requires tables or a generator routine such as [[table_generator_routine_for_fast_8_bit_mul_table]]
 +
 +Jack Asser's: 233* cycles  ref: [[seriously_fast_multiplication]] \\
 +Chris Jam's: 204.5*  ref: http://csdb.dk/forums/?roomid=11&topicid=91766 \\
 +Mine: 198.6 zp variation: 194.6 \\
 +Times above need to add 6 for rts \\
 +Note: updated 2023; corrected code, typos and timings \\
 +*Timing approximate for external code \\
 +This is the winner out of 120 published algorithms as independently tested here:
 +https://github.com/TobyLobster/multiply_test \\
 +Note: I have a new version which is 188.1 cycles, excepting the RTS. \\
 +
 +<code>
 ;World's fastest 16x16 unsigned mult for 6502 ;World's fastest 16x16 unsigned mult for 6502
 ;you can go faster, but not without more code and/or data ;you can go faster, but not without more code and/or data
 ;and being less elegant and harder to follow. ;and being less elegant and harder to follow.
 ;by Repose 2017 ;by Repose 2017
 +;table generator by Graham
 +;addition improvement suggested by JackAsser
 +
 +;data: 2044 bytes
 +;zero page ram required: minimum 8 bytes, ideally 14
 +;do_add: 30 bytes in zp, if used
 +;time: 198.6 cycles, option for 194.6 if you use 30 more zp bytes for do_add
 +;measurement method: average timings over all input combinations
 +
 +;How to use:
 +;put numbers in x/y and result is Y reg (z3), A reg (z2), z1, z0
  
 ;tables of squares ;tables of squares
 ;sqr(x)=x^2/4 ;sqr(x)=x^2/4
 ;negsqr(x)=(255-x)^2/4 ;negsqr(x)=(255-x)^2/4
-sqrlo=$c000;511 bytes +sqrlo=$c100;511 bytes 
-sqrhi=$c200;511 bytes +sqrhi=$c300;511 bytes 
-negsqrlo=$c400;511 bytes +negsqrlo=$c500;511 bytes 
-negsqrhi=$c600;511 bytes+negsqrhi=$c700;511 bytes
  
 ;pointers to square tables above ;pointers to square tables above
Line 25: Line 51:
 y0=$fd;multiplicand, 2 bytes y0=$fd;multiplicand, 2 bytes
 y1=$fe y1=$fe
-z0=$80;product, bytes+z0=$80;product, bytes
 z1=$81 z1=$81
-z2=$82 +z2=$82 ;returned in A reg 
-z3=$83+z3=$83 ;returned in Y reg
  
-;not shown is routine to make the tables +;Example showing use 
-;also you need to init the pointers' high bytes to the tables+*=$c000 
 +lda #$ff 
 +sta x0 
 +sta x1 
 +sta y0 
 +sta y1 
 +jsr makesqrtables 
 +jsr umult16 
 +sta z2 
 +sty z3 
 +rts 
 +;result should be $fffe0001, e.g. as viewed with typical m 0080 monitor command: 
 +;0080 01 00 fe ff 
 + 
 +makesqrtables: 
 +;init zp square tables pointers 
 +lda #>sqrlo 
 +sta p_sqr_lo+1 
 +lda #>sqrhi 
 +sta p_sqr_hi+1 
 +lda #>negsqrlo 
 +sta p_invsqr_lo+1 
 +lda #>negsqrhi 
 +sta p_invsqr_hi+1 
 + 
 +;generate sqr(x)=x^2/
 +      ldx #$00 
 +      txa 
 +      !by $c9   ; CMP #immediate - skip TYA and clear carry flag 
 +lb1:  tya 
 +      adc #$00 
 +ml1:  sta sqrhi,x 
 +      tay 
 +      cmp #$40 
 +      txa 
 +      ror 
 +ml9:  adc #$00 
 +      sta ml9+1 
 +      inx 
 +ml0:  sta sqrlo,x 
 +      bne lb1 
 +      inc ml0+2 
 +      inc ml1+2 
 +      clc 
 +      iny 
 +      bne lb1 
 + 
 +;generate negsqr(x)=(255-x)^2/
 +      ldx #$00 
 +      ldy #$ff 
 +mt1: 
 +      lda sqrhi+1,x 
 +      sta negsqrhi+$100,
 +      lda sqrhi,x 
 +      sta negsqrhi,
 +      lda sqrlo+1,x 
 +      sta negsqrlo+$100,
 +      lda sqrlo,x 
 +      sta negsqrlo,
 +      dey 
 +      inx 
 +      bne mt1 
 +      rts
  
 umult16: umult16:
Line 42: Line 130:
 sta p_invsqr_hi;17 sta p_invsqr_hi;17
  
-ldy y0 
 sec sec
 +ldy y0
 lda (p_sqr_lo),y lda (p_sqr_lo),y
-sbc (p_invsqr_lo),y;note these two lines taken as 11 total+sbc (p_invsqr_lo),y;note these two lines taken as 10.996 total or 10+65280/65536
 sta z0;x0*y0l sta z0;x0*y0l
 lda (p_sqr_hi),y lda (p_sqr_hi),y
 sbc (p_invsqr_hi),y sbc (p_invsqr_hi),y
-sta c1a+1;x0*y0h;31+sta c1a+1;x0*y0h; 2+3+10.996+3+10.996+4=33.992
 ;c1a means column 1, row a (partial product to be added later) ;c1a means column 1, row a (partial product to be added later)
  
 ldy y1 ldy y1
-;sec  ;notice that the high byte of sub above is always +ve+;sec  ;notice that the high byte of subtraction above is always positive, leaving Carry set
 lda (p_sqr_lo),y lda (p_sqr_lo),y
 sbc (p_invsqr_lo),y sbc (p_invsqr_lo),y
Line 59: Line 147:
 lda (p_sqr_hi),y lda (p_sqr_hi),y
 sbc (p_invsqr_hi),y sbc (p_invsqr_hi),y
-sta c2a+1;x0*y1h;31+sta c2a+1;x0*y1h; 3+10.996+4+10.996+4=32.992
  
 ;set multiplier as x1 ;set multiplier as x1
Line 76: Line 164:
 lda (p_sqr_hi),y lda (p_sqr_hi),y
 sbc (p_invsqr_hi),y sbc (p_invsqr_hi),y
-sta c2b+1;x1*y1h;31+sta c2b+1;x1*y0h;32.992
  
 ldy y1 ldy y1
Line 85: Line 173:
 lda (p_sqr_hi),y lda (p_sqr_hi),y
 sbc (p_invsqr_hi),y sbc (p_invsqr_hi),y
-sta z3;x1*y1h;31+tay;Y=x1*y1h, 30.992 cycles 
 +;17+34+33+17+33+31=164.97 cycles for main multiply part (minimum=157, maximum=173)
  
-;4*31+2*17 so far=158 +;jmp do_adds; can put do_adds in zp for a slight speed increase
-;add partials +
-;-add first two numbers in column 1 +
-;jmp do_adds;put in zp to save 3 cycles :)+
 do_adds: do_adds:
-clc +;-add the first two numbers of column 1 
-c1a lda #0  + clc 
-c1b adc #0;add first two rows of column 1 +c1a: lda #0 
-sta z1;9 +c1b: adc #0 
-;-continue to first two numbers in column 2                    + sta z1;9 
-c2a lda #0 + 
-c2b adc #0  +;-continue to first two numbers of column 2 
-sta z2;7 +c2a: lda #0 
-bcc c1c;3 taken/not taken, avg 6  +c2b: adc #0 
-inc z3 + tax 
-clc + bcc c1c;9 
-;-add last number of column 1 (row c) + iny;z3++ 
-c1c lda #0  + clc;(+3) taken 7% of the time, 3*.07=+.21 
-adc z1 + 
-sta z1;8+;-add last number of column 1 
 +c1c: lda #0 
 + adc z1 
 + sta z1;8 
 ;-add last number of column 2 ;-add last number of column 2
-c2c lda #0  + txa 
-adc z2 +c2c: adc #0 
-sta z2;8 + ;A=z2 
-bcc fin;3/avg 5 + bcc fin;7 
-inc z3 + iny;(+1) taken 42% of the time, 1*.42=.42
-;9+7+6+8+8+5=43 +
-fin rts+
  
 +;Y=z3, A=z2
 +;add partials part total cycles=33.63 (minimum=33, maximum=37)
 +;total time=164.97+33.63=198.6
 +fin: rts;add 6 to include this (204.6)
  
 +Diagram of the additions
 +                y1    y0
 +              x1    x0
 +                --------
 +             x0y0h x0y0l
 ++      x0y1h x0y1l
 ++      x1y0h x1y0l
 ++x1y1h x1y1l
 +------------------------
 +    z3    z2    z1    z0     
 +</code>
base/fastest_multiplication.txt · Last modified: 2024-02-13 08:24 by repose