517 lines
12 KiB
NASM
517 lines
12 KiB
NASM
// vim: filetype=kickass
|
|
|
|
*=* "Math Routines"
|
|
|
|
// Multiplication routines found on
|
|
// https://codebase64.org/doku.php?id=base:seriously_fast_multiplication
|
|
.const T1=$26
|
|
.const T2=$fd
|
|
.const PRODUCT=$22
|
|
|
|
|
|
// Description: Unsigned 8-bit multiplication with unsigned 16-bit result.
|
|
//
|
|
// Input: 8-bit unsigned value in T1
|
|
// 8-bit unsigned value in T2
|
|
// Carry=0: Re-use T1 from previous multiplication (faster)
|
|
// Carry=1: Set T1 (slower)
|
|
//
|
|
// Output: 16-bit unsigned value in PRODUCT
|
|
//
|
|
// Clobbered: PRODUCT, X, A, C
|
|
//
|
|
// Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
|
|
// square1_lo, square1_hi, square2_lo, square2_hi must be
|
|
// page aligned. Each table are 512 bytes. Total 2kb.
|
|
//
|
|
// Table generation: I:0..511
|
|
// square1_lo = <((I*I)/4)
|
|
// square1_hi = >((I*I)/4)
|
|
// square2_lo = <(((I-255)*(I-255))/4)
|
|
// square2_hi = >(((I-255)*(I-255))/4)
|
|
multiply_8bit_unsigned:
|
|
bcc !+
|
|
lda T1
|
|
sta sm1+1
|
|
sta sm3+1
|
|
eor #$ff
|
|
sta sm2+1
|
|
sta sm4+1
|
|
!:
|
|
|
|
ldx T2
|
|
sec
|
|
sm1: lda square1_lo,x
|
|
sm2: sbc square2_lo,x
|
|
sta PRODUCT+0
|
|
sm3: lda square1_hi,x
|
|
sm4: sbc square2_hi,x
|
|
sta PRODUCT+1
|
|
|
|
rts
|
|
|
|
|
|
// Description: Signed 8-bit multiplication with signed 16-bit result.
|
|
//
|
|
// Input: 8-bit signed value in T1
|
|
// 8-bit signed value in T2
|
|
// Carry=0: Re-use T1 from previous multiplication (faster)
|
|
// Carry=1: Set T1 (slower)
|
|
//
|
|
// Output: 16-bit signed value in PRODUCT
|
|
//
|
|
// Clobbered: PRODUCT, X, A, C
|
|
multiply_8bit_signed:
|
|
jsr multiply_8bit_unsigned
|
|
|
|
// Apply sign (See C=Hacking16 for details).
|
|
lda T1
|
|
bpl !+
|
|
sec
|
|
lda PRODUCT+1
|
|
sbc T2
|
|
sta PRODUCT+1
|
|
!:
|
|
lda T2
|
|
bpl !+
|
|
sec
|
|
lda PRODUCT+1
|
|
sbc T1
|
|
sta PRODUCT+1
|
|
!:
|
|
|
|
rts
|
|
|
|
|
|
// Description: Unsigned 16-bit multiplication with unsigned 32-bit result.
|
|
//
|
|
// Input: 16-bit unsigned value in T1
|
|
// 16-bit unsigned value in T2
|
|
// Carry=0: Re-use T1 from previous multiplication (faster)
|
|
// Carry=1: Set T1 (slower)
|
|
//
|
|
// Output: 32-bit unsigned value in PRODUCT
|
|
//
|
|
// Clobbered: PRODUCT, X, A, C
|
|
//
|
|
// Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
|
|
// square1_lo, square1_hi, square2_lo, square2_hi must be
|
|
// page aligned. Each table are 512 bytes. Total 2kb.
|
|
//
|
|
// Table generation: I:0..511
|
|
// square1_lo = <((I*I)/4)
|
|
// square1_hi = >((I*I)/4)
|
|
// square2_lo = <(((I-255)*(I-255))/4)
|
|
// square2_hi = >(((I-255)*(I-255))/4)
|
|
multiply_16bit_unsigned:
|
|
// <T1 * <T2 = AAaa
|
|
// <T1 * >T2 = BBbb
|
|
// >T1 * <T2 = CCcc
|
|
// >T1 * >T2 = DDdd
|
|
//
|
|
// AAaa
|
|
// BBbb
|
|
// CCcc
|
|
// + DDdd
|
|
// ----------
|
|
// PRODUCT!
|
|
|
|
// Setup T1 if changed
|
|
bcc !+
|
|
lda T1+0
|
|
sta sm1a+1
|
|
sta sm3a+1
|
|
sta sm5a+1
|
|
sta sm7a+1
|
|
eor #$ff
|
|
sta sm2a+1
|
|
sta sm4a+1
|
|
sta sm6a+1
|
|
sta sm8a+1
|
|
lda T1+1
|
|
sta sm1b+1
|
|
sta sm3b+1
|
|
sta sm5b+1
|
|
sta sm7b+1
|
|
eor #$ff
|
|
sta sm2b+1
|
|
sta sm4b+1
|
|
sta sm6b+1
|
|
sta sm8b+1
|
|
!:
|
|
|
|
// Perform <T1 * <T2 = AAaa
|
|
ldx T2+0
|
|
sec
|
|
sm1a: lda square1_lo,x
|
|
sm2a: sbc square2_lo,x
|
|
sta PRODUCT+0
|
|
sm3a: lda square1_hi,x
|
|
sm4a: sbc square2_hi,x
|
|
sta _AA+1
|
|
|
|
// Perform >T1_hi * <T2 = CCcc
|
|
sec
|
|
sm1b: lda square1_lo,x
|
|
sm2b: sbc square2_lo,x
|
|
sta _cc+1
|
|
sm3b: lda square1_hi,x
|
|
sm4b: sbc square2_hi,x
|
|
sta _CC+1
|
|
|
|
// Perform <T1 * >T2 = BBbb
|
|
ldx T2+1
|
|
sec
|
|
sm5a: lda square1_lo,x
|
|
sm6a: sbc square2_lo,x
|
|
sta _bb+1
|
|
sm7a: lda square1_hi,x
|
|
sm8a: sbc square2_hi,x
|
|
sta _BB+1
|
|
|
|
// Perform >T1 * >T2 = DDdd
|
|
sec
|
|
sm5b: lda square1_lo,x
|
|
sm6b: sbc square2_lo,x
|
|
sta _dd+1
|
|
sm7b: lda square1_hi,x
|
|
sm8b: sbc square2_hi,x
|
|
sta PRODUCT+3
|
|
|
|
// Add the separate multiplications together
|
|
clc
|
|
_AA: lda #0
|
|
_bb: adc #0
|
|
sta PRODUCT+1
|
|
_BB: lda #0
|
|
_CC: adc #0
|
|
sta PRODUCT+2
|
|
bcc !+
|
|
inc PRODUCT+3
|
|
clc
|
|
!:
|
|
_cc: lda #0
|
|
adc PRODUCT+1
|
|
sta PRODUCT+1
|
|
_dd: lda #0
|
|
adc PRODUCT+2
|
|
sta PRODUCT+2
|
|
bcc !+
|
|
inc PRODUCT+3
|
|
!:
|
|
|
|
rts
|
|
|
|
|
|
.const aa = $38
|
|
.const AA = $39
|
|
.const bb = $3a
|
|
.const BB = $3b
|
|
.const dd = $3c
|
|
.const DD = $3d
|
|
.const hh = $3e
|
|
.const HH = $3f
|
|
multiply_32bit_unsigned:
|
|
// BBbb AAaa
|
|
// DDdd CCcc
|
|
// FFff EEee
|
|
// + HHhh GGgg
|
|
// --------------------------
|
|
// 3e 3c 3a 38
|
|
// HHhh DDdd BBbb AAaa
|
|
// FFff CCcc
|
|
// GGgg EEee
|
|
|
|
u32_u32_move_imm(aa, 0)
|
|
u32_u32_move_imm(dd, 0)
|
|
|
|
// Perform <T1 * <T2 == BBbbAAaa
|
|
u16_u16_move(T1, $30)
|
|
u16_u16_move(T2, $34)
|
|
sec
|
|
jsr multiply_16bit_unsigned
|
|
u16_u16_move(aa, PRODUCT + 0) // AAaa
|
|
u16_u16_move(bb, PRODUCT + 2) // BBbb
|
|
|
|
// Perform >T1 * <T2 = DDddCCcc
|
|
u16_u16_move(T1, $32)
|
|
u16_u16_move(T2, $34)
|
|
sec
|
|
jsr multiply_16bit_unsigned
|
|
i32_i16_add(bb, PRODUCT) // CCcc
|
|
i16_i16_add(dd, PRODUCT + 2) // DDdd
|
|
|
|
// Perform <T1 * >T2 = FFffEEee
|
|
u16_u16_move(T1, $30)
|
|
u16_u16_move(T2, $36)
|
|
sec
|
|
jsr multiply_16bit_unsigned
|
|
i32_i16_add(bb, PRODUCT) // EEee
|
|
i32_i16_add(dd, PRODUCT + 2) // FFff
|
|
|
|
// Perform >T1 * >T2 = HHhhGGgg
|
|
u16_u16_move(T1, $32)
|
|
u16_u16_move(T2, $36)
|
|
sec
|
|
jsr multiply_16bit_unsigned
|
|
i32_i16_add(dd, PRODUCT) // GGgg
|
|
i16_i16_add(hh, PRODUCT + 2) // HHhh
|
|
|
|
rts
|
|
|
|
|
|
// Description: Signed 16-bit multiplication with signed 32-bit result.
|
|
//
|
|
// Input: 16-bit signed value in T1
|
|
// 16-bit signed value in T2
|
|
// Carry=0: Re-use T1 from previous multiplication (faster)
|
|
// Carry=1: Set T1 (slower)
|
|
//
|
|
// Output: 32-bit signed value in PRODUCT
|
|
//
|
|
// Clobbered: PRODUCT, X, A, C
|
|
multiply_16bit_signed:
|
|
jsr multiply_16bit_unsigned
|
|
|
|
// Apply sign (See C=Hacking16 for details).
|
|
lda T1+1
|
|
bpl !+
|
|
sec
|
|
lda PRODUCT+2
|
|
sbc T2+0
|
|
sta PRODUCT+2
|
|
lda PRODUCT+3
|
|
sbc T2+1
|
|
sta PRODUCT+3
|
|
!:
|
|
lda T2+1
|
|
bpl !+
|
|
sec
|
|
lda PRODUCT+2
|
|
sbc T1+0
|
|
sta PRODUCT+2
|
|
lda PRODUCT+3
|
|
sbc T1+1
|
|
sta PRODUCT+3
|
|
!:
|
|
|
|
rts
|
|
|
|
generate_multiplication_tables:
|
|
ldx #0
|
|
txa
|
|
.byte $c9
|
|
!lb1:
|
|
tya
|
|
adc #0
|
|
!ml1:
|
|
sta square1_hi, x
|
|
tay
|
|
cmp #$40
|
|
txa
|
|
ror
|
|
!ml9:
|
|
adc #0
|
|
sta !ml9- + 1
|
|
inx
|
|
!ml0:
|
|
sta square1_lo, x
|
|
bne !lb1-
|
|
inc !ml0- + 2
|
|
inc !ml1- + 2
|
|
clc
|
|
iny
|
|
bne !lb1-
|
|
|
|
ldx #$00
|
|
ldy #$ff
|
|
!:
|
|
lda square1_hi + 1, x
|
|
sta square2_hi + $0100, x
|
|
lda square1_hi, x
|
|
sta square2_hi, y
|
|
lda square1_lo + 1, x
|
|
sta square2_lo + $0100, x
|
|
lda square1_lo, x
|
|
sta square2_lo, y
|
|
dey
|
|
inx
|
|
|
|
bne !-
|
|
|
|
rts
|
|
|
|
|
|
//
|
|
// udivmod32
|
|
//
|
|
// 32-bit unsigned integer div/mod routine.
|
|
// Based on https://bisqwit.iki.fi/story/howto/bitmath/
|
|
//
|
|
// TODO consistent name
|
|
// TODO could probably be optimized further
|
|
//
|
|
// Input:
|
|
// - udivmod32_dividend ($10..$13): LSB of the dividend.
|
|
// - udivmod32_divisor ($14..$17): LSB of the divisor.
|
|
//
|
|
// Output:
|
|
// - udivmod32_result ($18..1b): LSB of the division result.
|
|
// - udivmod32_remainder ($1c..$1f): LSB of the modulo result.
|
|
//
|
|
// Destroys: a, $20..$2b
|
|
//
|
|
.const scaled_divisor = $20 // ..23
|
|
.const multiple = $24 // ..27
|
|
.const temp = $28 // ..2b
|
|
udivmod32:
|
|
lda #0
|
|
sta udivmod32_result + 0
|
|
sta udivmod32_result + 1
|
|
sta udivmod32_result + 2
|
|
sta udivmod32_result + 3
|
|
|
|
// if (divisor == 0) {
|
|
lda udivmod32_divisor + 0
|
|
bne !if_end+
|
|
lda udivmod32_divisor + 1
|
|
bne !if_end+
|
|
lda udivmod32_divisor + 2
|
|
bne !if_end+
|
|
lda udivmod32_divisor + 3
|
|
bne !if_end+
|
|
|
|
// return 0;
|
|
rts
|
|
// }
|
|
!if_end:
|
|
|
|
// uint32_t scaled_divisor = divisor;
|
|
lda udivmod32_divisor + 0
|
|
sta scaled_divisor + 0
|
|
lda udivmod32_divisor + 1
|
|
sta scaled_divisor + 1
|
|
lda udivmod32_divisor + 2
|
|
sta scaled_divisor + 2
|
|
lda udivmod32_divisor + 3
|
|
sta scaled_divisor + 3
|
|
|
|
// remainder = dividend;
|
|
lda udivmod32_dividend + 0
|
|
sta udivmod32_remainder + 0
|
|
lda udivmod32_dividend + 1
|
|
sta udivmod32_remainder + 1
|
|
lda udivmod32_dividend + 2
|
|
sta udivmod32_remainder + 2
|
|
lda udivmod32_dividend + 3
|
|
sta udivmod32_remainder + 3
|
|
|
|
|
|
// uint32_t multiple = 1;
|
|
lda #1
|
|
sta multiple + 0
|
|
lda #0
|
|
sta multiple + 1
|
|
sta multiple + 2
|
|
sta multiple + 3
|
|
|
|
|
|
// while (!(scaled_divisor & 0x80000000)) {
|
|
!while_start:
|
|
lda #$80
|
|
and scaled_divisor + 3
|
|
bne !while_end+
|
|
|
|
// scaled_divisor <<= 1
|
|
asl scaled_divisor + 0
|
|
rol scaled_divisor + 1
|
|
rol scaled_divisor + 2
|
|
rol scaled_divisor + 3
|
|
|
|
// multiple <<= 1
|
|
asl multiple + 0
|
|
rol multiple + 1
|
|
rol multiple + 2
|
|
rol multiple + 3
|
|
|
|
// }
|
|
jmp !while_start-
|
|
!while_end:
|
|
|
|
|
|
// do {
|
|
!do_start:
|
|
// if (remainder >= scaled_divisor) {
|
|
sec
|
|
lda udivmod32_remainder + 0
|
|
sbc scaled_divisor + 0
|
|
sta temp + 0
|
|
|
|
lda udivmod32_remainder + 1
|
|
sbc scaled_divisor + 1
|
|
sta temp + 1
|
|
|
|
lda udivmod32_remainder + 2
|
|
sbc scaled_divisor + 2
|
|
sta temp + 2
|
|
|
|
lda udivmod32_remainder + 3
|
|
sbc scaled_divisor + 3
|
|
sta temp + 3
|
|
|
|
bcc !if_end+
|
|
|
|
// remain -= scaled_divisor;
|
|
lda temp + 0
|
|
sta udivmod32_remainder + 0
|
|
lda temp + 1
|
|
sta udivmod32_remainder + 1
|
|
lda temp + 2
|
|
sta udivmod32_remainder + 2
|
|
lda temp + 3
|
|
sta udivmod32_remainder + 3
|
|
|
|
// result += multiple;
|
|
clc
|
|
lda udivmod32_result + 0
|
|
adc multiple + 0
|
|
sta udivmod32_result + 0
|
|
|
|
lda udivmod32_result + 1
|
|
adc multiple + 1
|
|
sta udivmod32_result + 1
|
|
|
|
lda udivmod32_result + 2
|
|
adc multiple + 2
|
|
sta udivmod32_result + 2
|
|
|
|
lda udivmod32_result + 3
|
|
adc multiple + 3
|
|
sta udivmod32_result + 3
|
|
|
|
// }
|
|
!if_end:
|
|
|
|
// scaled_divisor >>= 1;
|
|
lsr scaled_divisor + 3
|
|
ror scaled_divisor + 2
|
|
ror scaled_divisor + 1
|
|
ror scaled_divisor + 0
|
|
|
|
// multiple >>= 1;
|
|
lsr multiple + 3
|
|
ror multiple + 2
|
|
ror multiple + 1
|
|
ror multiple + 0
|
|
|
|
// } while (multiple != 0);
|
|
lda multiple + 0
|
|
bne !do_start-
|
|
lda multiple + 1
|
|
bne !do_start-
|
|
lda multiple + 2
|
|
bne !do_start-
|
|
lda multiple + 3
|
|
bne !do_start-
|
|
|
|
rts
|