diff --git a/c64/math.asm b/c64/math.asm
new file mode 100644
index 0000000..f8bb0c7
--- /dev/null
+++ b/c64/math.asm
@@ -0,0 +1,432 @@
+// Operand and result locations shared by the multiplication routines in
+// this file. The 8-bit routines use one byte of T1 and T2 and two bytes of
+// PRODUCT; the 16-bit routines use T1+0..1, T2+0..1 and PRODUCT+0..3.
+.const T1=$03
+.const T2=$fd
+.const PRODUCT=$22
+
+
+// Description: Unsigned 8-bit multiplication with unsigned 16-bit result.
+//
+// Input: 8-bit unsigned value in T1
+//        8-bit unsigned value in T2
+//        Carry=0: Re-use T1 from previous multiplication (faster)
+//        Carry=1: Set T1 (slower)
+//
+// Output: 16-bit unsigned value in PRODUCT
+//
+// Clobbered: PRODUCT, X, A, C
+//
+// Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
+//                   square1_lo, square1_hi, square2_lo, square2_hi must be
+//                   page aligned. Each table is 512 bytes. Total 2kb.
+//
+// Table generation: I:0..511
+//                   square1_lo = <((I*I)/4)
+//                   square1_hi = >((I*I)/4)
+//                   square2_lo = <(((I-255)*(I-255))/4)
+//                   square2_hi = >(((I-255)*(I-255))/4)
+//
+// Method: the routine is self-modifying. T1 is written into the low address
+//         byte of the two square1 lookups and (T1 eor $ff) = 255-T1 into the
+//         two square2 lookups. Indexing with X = T2 then reads
+//         square1[T1+T2] and square2[T2-T1+255], whose difference is
+//         (T1+T2)^2/4 - (T2-T1)^2/4 = T1*T2.
+//         Only the low address byte is patched, which is why the tables
+//         have to be page aligned.
+//         With Carry=0 the operands patched by the previous call of THIS
+//         routine are used; the 16-bit routine patches its own lookups.
+multiply_8bit_unsigned:
+    // Carry clear: keep the operands patched by the previous call.
+    bcc !+
+    lda T1
+    sta sm1+1               // square1 lookups start at entry T1
+    sta sm3+1
+    eor #$ff                // 255 - T1
+    sta sm2+1               // square2 lookups start at entry 255 - T1
+    sta sm4+1
+!:
+
+    ldx T2
+    sec                     // no borrow going into the 16-bit subtraction
+sm1: lda square1_lo,x
+sm2: sbc square2_lo,x
+    sta PRODUCT+0
+sm3: lda square1_hi,x
+sm4: sbc square2_hi,x      // borrow from the low byte carries over
+    sta PRODUCT+1
+
+    rts
+
+
+// Description: Signed 8-bit multiplication with signed 16-bit result.
+//
+// Input: 8-bit signed value in T1
+//        8-bit signed value in T2
+//        Carry=0: Re-use T1 from previous multiplication (faster)
+//        Carry=1: Set T1 (slower)
+//
+// Output: 16-bit signed value in PRODUCT
+//
+// Clobbered: PRODUCT, X, A, C
+//
+// Method: the operands are multiplied as unsigned values first. A negative
+//         operand read as unsigned is 256 too large, so the unsigned product
+//         contains 256 * (other operand) too much; that is removed by
+//         subtracting the other operand from the high byte of PRODUCT.
+//         The caller's carry flag reaches multiply_8bit_unsigned unchanged.
+//         T1 is read from memory for the sign fix even when Carry=0, so it
+//         must still hold the value that was patched in earlier.
+multiply_8bit_signed:
+    jsr multiply_8bit_unsigned
+
+    // Apply sign (See C=Hacking16 for details).
+    lda T1
+    bpl !+
+    // T1 < 0: PRODUCT -= T2 * 256
+    sec
+    lda PRODUCT+1
+    sbc T2
+    sta PRODUCT+1
+!:
+    lda T2
+    bpl !+
+    // T2 < 0: PRODUCT -= T1 * 256
+    sec
+    lda PRODUCT+1
+    sbc T1
+    sta PRODUCT+1
+!:
+
+    rts
+
+
+// Description: Unsigned 16-bit multiplication with unsigned 32-bit result.
+//
+// Input: 16-bit unsigned value in T1
+//        16-bit unsigned value in T2
+//        Carry=0: Re-use T1 from previous multiplication (faster)
+//        Carry=1: Set T1 (slower)
+//
+// Output: 32-bit unsigned value in PRODUCT
+//
+// Clobbered: PRODUCT, X, A, C
+//
+// Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
+//                   square1_lo, square1_hi, square2_lo, square2_hi must be
+//                   page aligned. Each table is 512 bytes. Total 2kb.
+//
+// Table generation: I:0..511
+//                   square1_lo = <((I*I)/4)
+//                   square1_hi = >((I*I)/4)
+//                   square2_lo = <(((I-255)*(I-255))/4)
+//                   square2_hi = >(((I-255)*(I-255))/4)
+//
+// Method: four 8x8 table multiplications (see multiply_8bit_unsigned for the
+//         lookup trick) are summed with the proper byte offsets. The routine
+//         is self-modifying: the lookups smNa use the low byte of T1, the
+//         lookups smNb the high byte, and the partial products are stored
+//         straight into the immediate operands of the final addition
+//         (_AA, _bb, _BB, _CC, _cc, _dd).
+multiply_16bit_unsigned:
+    // lo() / hi() denote the low / high byte of a 16-bit value.
+    //
+    // lo(T1) * lo(T2) = AAaa
+    // lo(T1) * hi(T2) = BBbb
+    // hi(T1) * lo(T2) = CCcc
+    // hi(T1) * hi(T2) = DDdd
+    //
+    //       AAaa
+    //     BBbb
+    //     CCcc
+    // + DDdd
+    // ----------
+    //   PRODUCT!
+
+    // Setup T1 if changed
+    bcc !+
+    lda T1+0
+    sta sm1a+1
+    sta sm3a+1
+    sta sm5a+1
+    sta sm7a+1
+    eor #$ff
+    sta sm2a+1
+    sta sm4a+1
+    sta sm6a+1
+    sta sm8a+1
+    lda T1+1
+    sta sm1b+1
+    sta sm3b+1
+    sta sm5b+1
+    sta sm7b+1
+    eor #$ff
+    sta sm2b+1
+    sta sm4b+1
+    sta sm6b+1
+    sta sm8b+1
+!:
+
+    // Perform lo(T1) * lo(T2) = AAaa
+    ldx T2+0
+    sec
+sm1a: lda square1_lo,x
+sm2a: sbc square2_lo,x
+    sta PRODUCT+0           // aa is the final low byte, nothing is added to it
+sm3a: lda square1_hi,x
+sm4a: sbc square2_hi,x
+    sta _AA+1
+
+    // Perform hi(T1) * lo(T2) = CCcc
+    sec
+sm1b: lda square1_lo,x
+sm2b: sbc square2_lo,x
+    sta _cc+1
+sm3b: lda square1_hi,x
+sm4b: sbc square2_hi,x
+    sta _CC+1
+
+    // Perform lo(T1) * hi(T2) = BBbb
+    ldx T2+1
+    sec
+sm5a: lda square1_lo,x
+sm6a: sbc square2_lo,x
+    sta _bb+1
+sm7a: lda square1_hi,x
+sm8a: sbc square2_hi,x
+    sta _BB+1
+
+    // Perform hi(T1) * hi(T2) = DDdd
+    sec
+sm5b: lda square1_lo,x
+sm6b: sbc square2_lo,x
+    sta _dd+1
+sm7b: lda square1_hi,x
+sm8b: sbc square2_hi,x
+    sta PRODUCT+3           // DD; carries from the lower bytes are added below
+
+    // Add the separate multiplications together
+    clc
+_AA: lda #0                 // PRODUCT+1 = AA + bb
+_bb: adc #0
+    sta PRODUCT+1
+_BB: lda #0                 // PRODUCT+2 = BB + CC + carry
+_CC: adc #0
+    sta PRODUCT+2
+    bcc !+
+    inc PRODUCT+3
+    clc
+!:
+_cc: lda #0                 // PRODUCT+1 += cc
+    adc PRODUCT+1
+    sta PRODUCT+1
+_dd: lda #0                 // PRODUCT+2 += dd + carry
+    adc PRODUCT+2
+    sta PRODUCT+2
+    bcc !+
+    inc PRODUCT+3
+!:
+
+    rts
+
+
+// Description: Signed 16-bit multiplication with signed 32-bit result.
+//
+// Input: 16-bit signed value in T1
+//        16-bit signed value in T2
+//        Carry=0: Re-use T1 from previous multiplication (faster)
+//        Carry=1: Set T1 (slower)
+//
+// Output: 32-bit signed value in PRODUCT
+//
+// Clobbered: PRODUCT, X, A, C
+//
+// Method: the operands are multiplied as unsigned values first. A negative
+//         operand read as unsigned is 65536 too large, so the other operand
+//         is subtracted from the upper 16 bits of PRODUCT to compensate.
+//         The caller's carry flag reaches multiply_16bit_unsigned unchanged.
+multiply_16bit_signed:
+    jsr multiply_16bit_unsigned
+
+    // Apply sign (See C=Hacking16 for details).
+    lda T1+1
+    bpl !+
+    // T1 < 0: PRODUCT -= T2 * 65536
+    sec
+    lda PRODUCT+2
+    sbc T2+0
+    sta PRODUCT+2
+    lda PRODUCT+3
+    sbc T2+1
+    sta PRODUCT+3
+!:
+    lda T2+1
+    bpl !+
+    // T2 < 0: PRODUCT -= T1 * 65536
+    sec
+    lda PRODUCT+2
+    sbc T1+0
+    sta PRODUCT+2
+    lda PRODUCT+3
+    sbc T1+1
+    sta PRODUCT+3
+!:
+
+    rts
+
+//
+// generate_multiplication_tables
+//
+// Fills square1_lo/square1_hi and then derives square2_lo/square2_hi from
+// them. Must run before any of the multiply routines is used.
+//
+// Destroys: a, x, y, flags
+//
+// NOTE: the first loop is self-modifying (immediate operand at !ml9, high
+//       address bytes at !ml0 and !ml1) and does not restore those operands,
+//       so the routine is meant to be called once only.
+// NOTE(review): the first loop appears to build I*I/4 incrementally for
+//       I = 0..511; the arithmetic was not re-derived here - confirm against
+//       the "Table generation" formulas of the multiply routines.
+//
+generate_multiplication_tables:
+    ldx #0
+    txa
+    // $c9 is the opcode of CMP #imm: on the first pass the TYA below is
+    // consumed as its operand, so the uninitialised Y is not read, and
+    // comparing A=0 with it leaves the carry clear.
+    .byte $c9
+!lb1:
+    tya
+    adc #0
+!ml1:
+    sta square1_hi, x
+    tay
+    cmp #$40
+    txa
+    ror
+!ml9:
+    adc #0
+    sta !ml9- + 1           // keep the running value in the ADC operand
+    inx
+!ml0:
+    sta square1_lo, x
+    bne !lb1-               // flags are from INX: loop until X wraps
+    inc !ml0- + 2           // advance both stores to the next table page
+    inc !ml1- + 2
+    clc
+    iny
+    bne !lb1-
+
+    // square2[I] = square1[|I - 255|]:
+    //   square2[256 + x] = square1[x + 1]
+    //   square2[255 - x] = square1[x]
+    ldx #$00
+    ldy #$ff
+!:
+    lda square1_hi + 1, x
+    sta square2_hi + $0100, x
+    lda square1_hi, x
+    sta square2_hi, y
+    lda square1_lo + 1, x
+    sta square2_lo + $0100, x
+    lda square1_lo, x
+    sta square2_lo, y
+    dey
+    inx
+
+    bne !-
+
+    rts
+
+
+//
+// udivmod32
+//
+// Unsigned 32-bit division with remainder (shift and subtract).
+// All values are stored least significant byte first.
+//
+// TODO consistent name
+//
+// Input:
+//  - udivmod32_dividend  ($10..$13) - 32-bit unsigned dividend
+//  - udivmod32_divisor   ($14..$17) - 32-bit unsigned divisor
+//
+// Output:
+//  - udivmod32_result    ($18..$1b) - dividend / divisor
+//  - udivmod32_remainder ($1c..$1f) - dividend % divisor
+//  - divisor == 0: result = 0 and remainder = dividend
+//
+// Destroys: a, flags, $20..$23, $28..$2f
+//           ($22..$23 are also PRODUCT+0..1 of the multiply routines)
+//
+.const udivmod32_dividend = $10 // 11,12,13
+.const udivmod32_divisor = $14 // 15,16,17
+.const udivmod32_result = $18 // 19,1a,1b
+.const udivmod32_remainder = $1c // 1d,1e,1f
+
+.const scaled_divisor = $20 // ..23
+.const multiple = $28 // ..2b
+.const temp = $2c // ..2f (must not overlap multiple)
+udivmod32:
+    // uint32_t result = 0;
+    lda #0
+    sta udivmod32_result + 0
+    sta udivmod32_result + 1
+    sta udivmod32_result + 2
+    sta udivmod32_result + 3
+
+    // remainder = dividend;
+    lda udivmod32_dividend + 0
+    sta udivmod32_remainder + 0
+    lda udivmod32_dividend + 1
+    sta udivmod32_remainder + 1
+    lda udivmod32_dividend + 2
+    sta udivmod32_remainder + 2
+    lda udivmod32_dividend + 3
+    sta udivmod32_remainder + 3
+
+    // if (divisor == 0) {
+    lda udivmod32_divisor + 0
+    ora udivmod32_divisor + 1
+    ora udivmod32_divisor + 2
+    ora udivmod32_divisor + 3
+    bne !+
+
+    // return; (result = 0, remainder = dividend)
+    rts
+!:
+    // }
+
+    // uint32_t scaled_divisor = divisor;
+    lda udivmod32_divisor + 0
+    sta scaled_divisor + 0
+    lda udivmod32_divisor + 1
+    sta scaled_divisor + 1
+    lda udivmod32_divisor + 2
+    sta scaled_divisor + 2
+    lda udivmod32_divisor + 3
+    sta scaled_divisor + 3
+
+    // uint32_t multiple = 1;
+    lda #1
+    sta multiple + 0
+    lda #0
+    sta multiple + 1
+    sta multiple + 2
+    sta multiple + 3
+
+    // while (scaled_divisor < dividend && !(scaled_divisor & $80000000)) {
+!while_start:
+    // Stop before the top bit would be shifted out: the scaled divisor
+    // would wrap around and the loop would never terminate.
+    lda scaled_divisor + 3
+    bmi !while_end+
+
+    // 32-bit compare; only the final carry is of interest.
+    sec
+    lda scaled_divisor + 0
+    sbc udivmod32_dividend + 0
+    lda scaled_divisor + 1
+    sbc udivmod32_dividend + 1
+    lda scaled_divisor + 2
+    sbc udivmod32_dividend + 2
+    lda scaled_divisor + 3
+    sbc udivmod32_dividend + 3
+    bcs !while_end+
+
+    // scaled_divisor <<= 1
+    asl scaled_divisor + 0
+    rol scaled_divisor + 1
+    rol scaled_divisor + 2
+    rol scaled_divisor + 3
+
+    // multiple <<= 1
+    asl multiple + 0
+    rol multiple + 1
+    rol multiple + 2
+    rol multiple + 3
+
+    // }
+    jmp !while_start-
+!while_end:
+
+
+    // do {
+!do_start:
+    // if (remainder >= scaled_divisor) {
+    sec
+    lda udivmod32_remainder + 0
+    sbc scaled_divisor + 0
+    sta temp + 0
+    lda udivmod32_remainder + 1
+    sbc scaled_divisor + 1
+    sta temp + 1
+    lda udivmod32_remainder + 2
+    sbc scaled_divisor + 2
+    sta temp + 2
+    lda udivmod32_remainder + 3
+    sbc scaled_divisor + 3
+    sta temp + 3
+    bcc !if_end+
+
+    // remainder -= scaled_divisor;
+    lda temp + 0
+    sta udivmod32_remainder + 0
+    lda temp + 1
+    sta udivmod32_remainder + 1
+    lda temp + 2
+    sta udivmod32_remainder + 2
+    lda temp + 3
+    sta udivmod32_remainder + 3
+
+    // result |= multiple;
+    lda udivmod32_result + 0
+    ora multiple + 0
+    sta udivmod32_result + 0
+    lda udivmod32_result + 1
+    ora multiple + 1
+    sta udivmod32_result + 1
+    lda udivmod32_result + 2
+    ora multiple + 2
+    sta udivmod32_result + 2
+    lda udivmod32_result + 3
+    ora multiple + 3
+    sta udivmod32_result + 3
+
+    // }
+!if_end:
+
+    // scaled_divisor >>= 1;
+    lsr scaled_divisor + 3
+    ror scaled_divisor + 2
+    ror scaled_divisor + 1
+    ror scaled_divisor + 0
+
+    // multiple >>= 1;
+    lsr multiple + 3
+    ror multiple + 2
+    ror multiple + 1
+    ror multiple + 0
+
+    lda multiple + 0
+    ora multiple + 1
+    ora multiple + 2
+    ora multiple + 3
+    bne !do_start-
+    // } while (multiple != 0);
+
+    rts
+
diff --git
a/c64/math.inc b/c64/math.inc
index 7bae2dc..a613bb0 100644
--- a/c64/math.inc
+++ b/c64/math.inc
@@ -27,6 +27,46 @@
     ror cursor_pointer_lo
 }
 
+//
+// u16_div10
+//
+// Divides an unsigned 16-bit integer by 10.
+//
+//   d = floor(n / 10)
+//     = floor(n * ceil(2^19 / 10) / 2^19)
+//     = floor(n * 52429 / 524288)
+//     = (n * $cccd) >> 19
+//
+// This is exact for every 16-bit n. A truncated 16.16 reciprocal
+// (6553 >> 16) is not: it yields 0 for n = 10.
+//
+// Input:
+//  - $03..$04 - 16-bit unsigned value
+//  - Generated multiplication table
+//
+// Output:
+//  - $24..$25 - 16-bit unsigned result
+//
+// Destroys: a, x, $22..$25, $fd, $fe
+//
+.macro u16_div10() {
+    lda #$cc
+    sta $fe
+    lda #$cd
+    sta $fd
+
+    sec
+    jsr multiply_16bit_unsigned
+
+    // Product bits 16..31 are in $24..$25; three more shifts give >> 19.
+    lsr $25
+    ror $24
+    lsr $25
+    ror $24
+    lsr $25
+    ror $24
+}
+
 .macro i8_mul5_a() {
     sta zp_temp
     asl
@@ -92,3 +132,62 @@
     lda #1
     i16_i8_add_a(lo, hi)
 }
+
+// Computes dst = src - imm (16-bit; imm is a constant expression).
+// Destroys: a
+.macro i16_i16_sub_imm(dst_lo, dst_hi, src_lo, src_hi, imm) {
+    sec
+
+    lda src_lo
+    sbc #<imm
+    sta dst_lo
+    lda src_hi
+    sbc #>imm
+    sta dst_hi
+}
+
+// Computes dst = a - b (16-bit).
+// Destroys: a
+.macro i16_i16_sub(dst_lo, dst_hi, a_lo, a_hi, b_lo, b_hi) {
+    sec
+
+    lda a_lo
+    sbc b_lo
+    sta dst_lo
+    lda a_hi
+    sbc b_hi
+    sta dst_hi
+}
+
+// Computes dst = imm - src (16-bit; imm is a constant expression).
+// Destroys: a
+.macro i16_imm_i16_sub(dst_lo, dst_hi, imm, src_lo, src_hi) {
+    sec
+
+    lda #<imm
+    sbc src_lo
+    sta dst_lo
+    lda #>imm
+    sbc src_hi
+    sta dst_hi
+}
+
+// Branches to target when the 16-bit values a and b differ.
+// Destroys: a
+.macro i16_i16_cmp_bne(a_lo, a_hi, b_lo, b_hi, target) {
+    lda a_lo
+    cmp b_lo
+    bne target
+    lda a_hi
+    cmp b_hi
+    bne target
+}