base:seriously_fast_multiplication
no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
— | base:seriously_fast_multiplication [2015-04-17 04:33] (current) – created - external edit 127.0.0.1 | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Seriously fast multiplication (8-bit and 16-bit) ====== | ||
+ | |||
+ | By Jackasser | ||
+ | |||
+ | Without further explanation, here's the code for really fast multiplications. They require 2k of tables which can be generated using [[table_generator_routine_for_fast_8_bit_mul_table]]. The article from C=Hacking 16 which is mentioned in the source is available at [[magazines:chacking16]]. | ||
+ | |||
+ | Here are four routines: signed/unsigned 8-bit and 16-bit multiplication. | ||
+ | |||
+ | <code> | ||
+ | ; Description: Unsigned 8-bit multiplication with unsigned 16-bit result. | ||
+ | ; | ||
+ | ; Method: quarter-square multiplication, | ||
+ | ;             T1*T2 = ((T1+T2)^2)/4 - ((T1-T2)^2)/4 | ||
+ | ;         Both squares are looked up in tables. T1 is patched into the | ||
+ | ;         low byte of the table operands (self-modifying code, so the | ||
+ | ;         routine must run from RAM) and T2 is used as the X index. | ||
+ | ; | ||
+ | ; Input: 8-bit unsigned value in T1 | ||
+ | ;        8-bit unsigned value in T2 | ||
+ | ;        Carry=0: Re-use T1 from previous multiplication (faster) | ||
+ | ;        Carry=1: Set T1 (slower) | ||
+ | ;        The very first call must have Carry=1: the operands as | ||
+ | ;        assembled have not been patched for any T1 yet. | ||
+ | ; | ||
+ | ; Output: 16-bit unsigned value in PRODUCT | ||
+ | ; | ||
+ | ; Clobbered: PRODUCT, X, A, C | ||
+ | ; | ||
+ | ; Allocation setup: T1,T2 and PRODUCT preferably on Zero-page. | ||
+ | ;                   square1_lo, square1_hi, square2_lo, square2_hi must be | ||
+ | ;                   page aligned. Each table is 512 bytes. Total 2kb. | ||
+ | ; | ||
+ | ; Table generation: I:0..511 | ||
+ | ;                   square1_lo = <((I*I)/4) | ||
+ | ;                   square1_hi = >((I*I)/4) | ||
+ | ;                   square2_lo = <(((I-255)*(I-255))/4) | ||
+ | ;                   square2_hi = >(((I-255)*(I-255))/4) | ||
+ | .proc multiply_8bit_unsigned | ||
+ |     bcc :+                  ; Carry=0: operands already patched for this T1 | ||
+ |         lda T1 | ||
+ |         sta sm1+1           ; square1 base + T1 (only the low byte is patched, | ||
+ |         sta sm3+1           ; hence the page alignment requirement) | ||
+ |         eor #$ff            ; 255-T1 | ||
+ |         sta sm2+1           ; square2 base + (255-T1) | ||
+ |         sta sm4+1 | ||
+ | : | ||
+ | |||
+ |     ldx T2 | ||
+ |     sec                     ; no borrow going into the 16-bit subtraction | ||
+ | sm1: lda square1_lo,x       ; <((T1+T2)^2/4) | ||
+ | sm2: sbc square2_lo,x       ; <((T2-T1)^2/4) | ||
+ |     sta PRODUCT+0 | ||
+ | sm3: lda square1_hi,x       ; >((T1+T2)^2/4) | ||
+ | sm4: sbc square2_hi,x       ; >((T2-T1)^2/4), borrow carried over from the low byte | ||
+ |     sta PRODUCT+1 | ||
+ | |||
+ |     rts | ||
+ | .endproc | ||
+ | |||
+ | |||
+ | |||
+ | ; Description: Signed 8-bit multiplication with signed 16-bit result. | ||
+ | ; | ||
+ | ; Input: 8-bit signed value in T1 | ||
+ | ;        8-bit signed value in T2 | ||
+ | ;        Carry=0: Re-use T1 from previous multiplication (faster) | ||
+ | ;        Carry=1: Set T1 (slower) | ||
+ | ;        T1 is read again for the sign correction, so it must still | ||
+ | ;        hold the multiplicand even when Carry=0. | ||
+ | ; | ||
+ | ; Output: 16-bit signed value in PRODUCT | ||
+ | ; | ||
+ | ; Clobbered: PRODUCT, X, A, C | ||
+ | .proc multiply_8bit_signed | ||
+ |     ; Multiply the raw bit patterns as unsigned numbers first. | ||
+ |     ; The caller's carry flag is passed through untouched. | ||
+ |     jsr multiply_8bit_unsigned | ||
+ | |||
+ |     ; Sign correction (see C=Hacking16 for details): a negative operand | ||
+ |     ; was treated as value+256, which added 256*other to the product. | ||
+ |     ; Undo that by subtracting the other operand from the high byte. | ||
+ |     lda T1 | ||
+ |     bpl t1_not_negative | ||
+ |         lda PRODUCT+1 | ||
+ |         sec | ||
+ |         sbc T2 | ||
+ |         sta PRODUCT+1 | ||
+ | t1_not_negative: | ||
+ | |||
+ |     lda T2 | ||
+ |     bpl t2_not_negative | ||
+ |         lda PRODUCT+1 | ||
+ |         sec | ||
+ |         sbc T1 | ||
+ |         sta PRODUCT+1 | ||
+ | t2_not_negative: | ||
+ | |||
+ |     rts | ||
+ | .endproc | ||
+ | |||
+ | |||
+ | |||
+ | ; Description: Unsigned 16-bit multiplication with unsigned 32-bit result. | ||
+ | ; | ||
+ | ; Method: four 8x8 quarter-square multiplications, | ||
+ | ;             a*b = ((a+b)^2)/4 - ((a-b)^2)/4 | ||
+ | ;         whose 16-bit partial products are then summed. The bytes of T1 | ||
+ | ;         are patched into the low byte of the table operands | ||
+ | ;         (self-modifying code, so the routine must run from RAM) and the | ||
+ | ;         bytes of T2 are used as the X index. | ||
+ | ; | ||
+ | ; Input: 16-bit unsigned value in T1 | ||
+ | ;        16-bit unsigned value in T2 | ||
+ | ;        Carry=0: Re-use T1 from previous multiplication (faster) | ||
+ | ;        Carry=1: Set T1 (slower) | ||
+ | ;        The very first call must have Carry=1: the operands as | ||
+ | ;        assembled have not been patched for any T1 yet. | ||
+ | ; | ||
+ | ; Output: 32-bit unsigned value in PRODUCT | ||
+ | ; | ||
+ | ; Clobbered: PRODUCT, X, A, C | ||
+ | ; | ||
+ | ; Allocation setup: T1,T2 and PRODUCT preferably on Zero-page. | ||
+ | ;                   square1_lo, square1_hi, square2_lo, square2_hi must be | ||
+ | ;                   page aligned. Each table is 512 bytes. Total 2kb. | ||
+ | ; | ||
+ | ; Table generation: I:0..511 | ||
+ | ;                   square1_lo = <((I*I)/4) | ||
+ | ;                   square1_hi = >((I*I)/4) | ||
+ | ;                   square2_lo = <(((I-255)*(I-255))/4) | ||
+ | ;                   square2_hi = >(((I-255)*(I-255))/4) | ||
+ | .proc multiply_16bit_unsigned | ||
+ |     ; <T1 * <T2 = AAaa | ||
+ |     ; <T1 * >T2 = BBbb | ||
+ |     ; >T1 * <T2 = CCcc | ||
+ |     ; >T1 * >T2 = DDdd | ||
+ |     ; | ||
+ |     ;       AAaa | ||
+ |     ;     BBbb | ||
+ |     ;     CCcc | ||
+ |     ; + DDdd | ||
+ |     ; ---------- | ||
+ |     ;   PRODUCT! | ||
+ | |||
+ |     ; Setup T1 if changed | ||
+ |     bcc :+ | ||
+ |         lda T1+0            ; low byte of T1 -> the "a" lookups | ||
+ |         sta sm1a+1 | ||
+ |         sta sm3a+1 | ||
+ |         sta sm5a+1 | ||
+ |         sta sm7a+1 | ||
+ |         eor #$ff            ; 255 - <T1 for the square2 lookups | ||
+ |         sta sm2a+1 | ||
+ |         sta sm4a+1 | ||
+ |         sta sm6a+1 | ||
+ |         sta sm8a+1 | ||
+ |         lda T1+1            ; high byte of T1 -> the "b" lookups | ||
+ |         sta sm1b+1 | ||
+ |         sta sm3b+1 | ||
+ |         sta sm5b+1 | ||
+ |         sta sm7b+1 | ||
+ |         eor #$ff            ; 255 - >T1 for the square2 lookups | ||
+ |         sta sm2b+1 | ||
+ |         sta sm4b+1 | ||
+ |         sta sm6b+1 | ||
+ |         sta sm8b+1 | ||
+ | : | ||
+ | |||
+ |     ; Perform <T1 * <T2 = AAaa | ||
+ |     ldx T2+0 | ||
+ |     sec | ||
+ | sm1a: lda square1_lo,x | ||
+ | sm2a: sbc square2_lo,x | ||
+ |     sta PRODUCT+0           ; aa is final: nothing else is added to byte 0 | ||
+ | sm3a: lda square1_hi,x | ||
+ | sm4a: sbc square2_hi,x | ||
+ |     sta _AA+1               ; stored as the immediate operand of the add below | ||
+ | |||
+ |     ; Perform >T1 * <T2 = CCcc (X still holds <T2) | ||
+ |     sec | ||
+ | sm1b: lda square1_lo,x | ||
+ | sm2b: sbc square2_lo,x | ||
+ |     sta _cc+1 | ||
+ | sm3b: lda square1_hi,x | ||
+ | sm4b: sbc square2_hi,x | ||
+ |     sta _CC+1 | ||
+ | |||
+ |     ; Perform <T1 * >T2 = BBbb | ||
+ |     ldx T2+1 | ||
+ |     sec | ||
+ | sm5a: lda square1_lo,x | ||
+ | sm6a: sbc square2_lo,x | ||
+ |     sta _bb+1 | ||
+ | sm7a: lda square1_hi,x | ||
+ | sm8a: sbc square2_hi,x | ||
+ |     sta _BB+1 | ||
+ | |||
+ |     ; Perform >T1 * >T2 = DDdd (X still holds >T2) | ||
+ |     sec | ||
+ | sm5b: lda square1_lo,x | ||
+ | sm6b: sbc square2_lo,x | ||
+ |     sta _dd+1 | ||
+ | sm7b: lda square1_hi,x | ||
+ | sm8b: sbc square2_hi,x | ||
+ |     sta PRODUCT+3           ; DD, incremented below for each carry out of byte 2 | ||
+ | |||
+ |     ; Add the separate multiplications together | ||
+ |     clc | ||
+ | _AA: lda #0                 ; operand patched with AA | ||
+ | _bb: adc #0                 ; operand patched with bb | ||
+ |     sta PRODUCT+1           ; AA + bb | ||
+ | _BB: lda #0                 ; operand patched with BB | ||
+ | _CC: adc #0                 ; operand patched with CC | ||
+ |     sta PRODUCT+2           ; BB + CC + carry | ||
+ |     bcc :+ | ||
+ |         inc PRODUCT+3       ; propagate the carry into the top byte | ||
+ |         clc                 ; inc leaves C set, clear it for the next add | ||
+ | : | ||
+ | _cc: lda #0                 ; operand patched with cc | ||
+ |     adc PRODUCT+1 | ||
+ |     sta PRODUCT+1           ; AA + bb + cc | ||
+ | _dd: lda #0                 ; operand patched with dd | ||
+ |     adc PRODUCT+2 | ||
+ |     sta PRODUCT+2           ; BB + CC + dd + carries | ||
+ |     bcc :+ | ||
+ |         inc PRODUCT+3 | ||
+ | : | ||
+ | |||
+ |     rts | ||
+ | .endproc | ||
+ | |||
+ | |||
+ | |||
+ | ; Description: Signed 16-bit multiplication with signed 32-bit result. | ||
+ | ; | ||
+ | ; Input: 16-bit signed value in T1 | ||
+ | ;        16-bit signed value in T2 | ||
+ | ;        Carry=0: Re-use T1 from previous multiplication (faster) | ||
+ | ;        Carry=1: Set T1 (slower) | ||
+ | ;        T1 is read again for the sign correction, so it must still | ||
+ | ;        hold the multiplicand even when Carry=0. | ||
+ | ; | ||
+ | ; Output: 32-bit signed value in PRODUCT | ||
+ | ; | ||
+ | ; Clobbered: PRODUCT, X, A, C | ||
+ | .proc multiply_16bit_signed | ||
+ |     ; Multiply the raw bit patterns as unsigned numbers first. | ||
+ |     ; The caller's carry flag is passed through untouched. | ||
+ |     jsr multiply_16bit_unsigned | ||
+ | |||
+ |     ; Sign correction (see C=Hacking16 for details): a negative operand | ||
+ |     ; was treated as value+65536, which added 65536*other to the product. | ||
+ |     ; Undo that by subtracting the other operand from the upper 16 bits. | ||
+ |     lda T1+1 | ||
+ |     bpl t1_not_negative | ||
+ |         lda PRODUCT+2 | ||
+ |         sec | ||
+ |         sbc T2+0 | ||
+ |         sta PRODUCT+2 | ||
+ |         lda PRODUCT+3 | ||
+ |         sbc T2+1            ; borrow carried over from the byte below | ||
+ |         sta PRODUCT+3 | ||
+ | t1_not_negative: | ||
+ | |||
+ |     lda T2+1 | ||
+ |     bpl t2_not_negative | ||
+ |         lda PRODUCT+2 | ||
+ |         sec | ||
+ |         sbc T1+0 | ||
+ |         sta PRODUCT+2 | ||
+ |         lda PRODUCT+3 | ||
+ |         sbc T1+1            ; borrow carried over from the byte below | ||
+ |         sta PRODUCT+3 | ||
+ | t2_not_negative: | ||
+ | |||
+ |     rts | ||
+ | .endproc | ||
+ | |||
+ | </code> | ||
+ | |||
base/seriously_fast_multiplication.txt · Last modified: 2015-04-17 04:33 by 127.0.0.1