// vim: filetype=kickass

*=* "Math Routines"

// Multiplication routines found on
// https://codebase64.org/doku.php?id=base:seriously_fast_multiplication

// Zero-page operands shared by the multiplication routines.
.const T1      = $26            // first factor, low byte first
.const T2      = $fd            // second factor, low byte first
.const PRODUCT = $22            // result, low byte first

// multiply_8bit_unsigned
//
// Unsigned 8-bit * 8-bit multiplication with an unsigned 16-bit result.
//
// In:       T1     first factor
//           T2     second factor
//           Carry  clear: keep the T1 value patched in by the previous
//                         multiplication (faster)
//                  set:   patch the current T1 into the lookups (slower)
//
// Out:      PRODUCT+0 / PRODUCT+1 = low / high byte of the product
//
// Clobbers: PRODUCT, X, A, C
//
// Setup:    T1, T2 and PRODUCT preferably on zero page.
//           square1_lo, square1_hi, square2_lo and square2_hi must be page
//           aligned, because only the low byte of each lookup address is
//           patched. Each table is 512 bytes, 2kb in total.
//
// Tables:   I = 0..511
//           square1_lo = <((I*I)/4)
//           square1_hi = >((I*I)/4)
//           square2_lo = <(((I-255)*(I-255))/4)
//           square2_hi = >(((I-255)*(I-255))/4)
multiply_8bit_unsigned:
                bcc !t1_ready+

                // Self-modifying setup: T1 becomes the low address byte of
                // the square1 lookups, its one's complement that of the
                // square2 lookups.
                lda T1
                sta sm3+1
                sta sm1+1
                eor #$ff
                sta sm4+1
                sta sm2+1

!t1_ready:
                // 16-bit subtraction of the two table entries selected by T2.
                ldx T2
                sec
sm1:            lda square1_lo,x
sm2:            sbc square2_lo,x
                sta PRODUCT+0
sm3:            lda square1_hi,x
sm4:            sbc square2_hi,x
                sta PRODUCT+1

                rts

// multiply_8bit_signed
//
// Signed 8-bit * 8-bit multiplication with a signed 16-bit result.
//
// In:       T1     first factor (signed)
//           T2     second factor (signed)
//           Carry  clear: keep the T1 value patched in by the previous
//                         multiplication (faster)
//                  set:   patch the current T1 into the lookups (slower)
//
// Out:      PRODUCT = signed 16-bit product
//
// Clobbers: PRODUCT, X, A, C
multiply_8bit_signed:
                // The caller's carry flag is handed through unchanged.
                jsr multiply_8bit_unsigned

                // Sign correction (see C=Hacking16 for details): for each
                // negative factor, subtract the other factor from the high
                // byte of the unsigned product.
                lda T1
                bpl !t1_positive+
                lda PRODUCT+1
                sec
                sbc T2
                sta PRODUCT+1
!t1_positive:
                lda T2
                bpl !t2_positive+
                lda PRODUCT+1
                sec
                sbc T1
                sta PRODUCT+1
!t2_positive:
                rts

// Description: Unsigned 16-bit multiplication with unsigned 32-bit result.
//
// Input: 16-bit unsigned value in T1
//        16-bit unsigned value in T2
//        Carry=0: Re-use T1 from previous multiplication (faster)
//        Carry=1: Set T1 (slower)
//
// Output: 32-bit unsigned value in PRODUCT
//
// Clobbered: PRODUCT, X, A, C
//
// Allocation setup: T1,T2 and PRODUCT preferably on Zero-page.
//                   square1_lo, square1_hi, square2_lo, square2_hi must be
//                   page aligned. Each table is 512 bytes. Total 2kb.
//
// Table generation: I:0..511
//                   square1_lo = <((I*I)/4)
//                   square1_hi = >((I*I)/4)
//                   square2_lo = <(((I-255)*(I-255))/4)
//                   square2_hi = >(((I-255)*(I-255))/4)
//
// Implementation:   Self-modifying. The setup step writes T1+0 / T1+1 (and
//                   their one's complements) into the low address byte of
//                   the table lookups; the four partial products are then
//                   stored into the immediate operands of the final adder
//                   (_AA, _bb, _BB, _CC, _cc, _dd).
multiply_16bit_unsigned:
                // <T1 * <T2 = AAaa
                // <T1 * >T2 = BBbb
                // >T1 * <T2 = CCcc
                // >T1 * >T2 = DDdd
                //
                //       AAaa
                //     BBbb
                //     CCcc
                // + DDdd
                // ----------
                //   PRODUCT!

                // Setup T1 if changed
                bcc !+
                    // Low byte of T1 -> all "a" lookups
                    lda T1+0
                    sta sm1a+1
                    sta sm3a+1
                    sta sm5a+1
                    sta sm7a+1
                    eor #$ff
                    sta sm2a+1
                    sta sm4a+1
                    sta sm6a+1
                    sta sm8a+1
                    // High byte of T1 -> all "b" lookups
                    lda T1+1
                    sta sm1b+1
                    sta sm3b+1
                    sta sm5b+1
                    sta sm7b+1
                    eor #$ff
                    sta sm2b+1
                    sta sm4b+1
                    sta sm6b+1
                    sta sm8b+1
!:

                // Perform <T1 * <T2 = AAaa
                // (aa is already the final lowest byte of the product)
                ldx T2+0
                sec
sm1a:           lda square1_lo,x
sm2a:           sbc square2_lo,x
                sta PRODUCT+0
sm3a:           lda square1_hi,x
sm4a:           sbc square2_hi,x
                sta _AA+1

                // Perform >T1 * <T2 = CCcc (X still holds <T2)
                sec
sm1b:           lda square1_lo,x
sm2b:           sbc square2_lo,x
                sta _cc+1
sm3b:           lda square1_hi,x
sm4b:           sbc square2_hi,x
                sta _CC+1

                // Perform <T1 * >T2 = BBbb
                ldx T2+1
                sec
sm5a:           lda square1_lo,x
sm6a:           sbc square2_lo,x
                sta _bb+1
sm7a:           lda square1_hi,x
sm8a:           sbc square2_hi,x
                sta _BB+1

                // Perform >T1 * >T2 = DDdd (X still holds >T2)
                // (DD goes straight into the highest byte of the product)
                sec
sm5b:           lda square1_lo,x
sm6b:           sbc square2_lo,x
                sta _dd+1
sm7b:           lda square1_hi,x
sm8b:           sbc square2_hi,x
                sta PRODUCT+3

                // Add the separate multiplications together.
                // The #0 operands below are placeholders overwritten above.
                clc
_AA:            lda #0
_bb:            adc #0
                sta PRODUCT+1
_BB:            lda #0
_CC:            adc #0
                sta PRODUCT+2
                bcc !+
                    inc PRODUCT+3
                    clc
!:
_cc:            lda #0
                adc PRODUCT+1
                sta PRODUCT+1
_dd:            lda #0
                adc PRODUCT+2
                sta PRODUCT+2
                bcc !+
                    inc PRODUCT+3
!:

                rts

// Description: Signed 16-bit multiplication with signed 32-bit result.
//
// Input: 16-bit signed value in T1
//        16-bit signed value in T2
//        Carry=0: Re-use T1 from previous multiplication (faster)
//        Carry=1: Set T1 (slower)
//
// Output: 32-bit signed value in PRODUCT
//
// Clobbered: PRODUCT, X, A, C
multiply_16bit_signed:
                // The caller's carry flag is handed through unchanged.
                jsr multiply_16bit_unsigned

                // Apply sign (See C=Hacking16 for details).
                // For each negative factor, subtract the other factor from
                // the upper 16 bits of the unsigned product.
                lda T1+1
                bpl !+
                    sec
                    lda PRODUCT+2
                    sbc T2+0
                    sta PRODUCT+2
                    lda PRODUCT+3
                    sbc T2+1
                    sta PRODUCT+3
!:
                lda T2+1
                bpl !+
                    sec
                    lda PRODUCT+2
                    sbc T1+0
                    sta PRODUCT+2
                    lda PRODUCT+3
                    sbc T1+1
                    sta PRODUCT+3
!:

                rts

// Description: Fills the four square tables used by the multiplication
//              routines. The first loop writes square1_lo / square1_hi, the
//              second loop derives square2_lo / square2_hi from them.
//
// Input: none
//
// Clobbered: A, X, Y, flags
//
// Note: The routine modifies its own code (the high address bytes of the
//       stores at !ml0 / !ml1 and the immediate operand at !ml9) and does
//       not restore it before returning.
//       NOTE(review): presumably meant to be called exactly once - confirm.
generate_multiplication_tables:
                ldx #0
                txa
                // $c9 is the opcode of CMP #imm: on entry it takes the
                // one-byte TYA below as its operand, so TYA is skipped on
                // the first pass (Y has not been initialised yet).
                .byte $c9
!lb1:
                tya
                adc #0
!ml1:
                sta square1_hi, x
                tay
                cmp #$40
                txa
                ror
!ml9:
                adc #0                  // operand is rewritten by the next store
                sta !ml9- + 1
                inx
!ml0:
                sta square1_lo, x
                bne !lb1-
                // X wrapped: move both store targets on to the next page.
                inc !ml0- + 2
                inc !ml1- + 2
                clc
                iny
                bne !lb1-

                // square2[$100 + x] = square1[x + 1]
                // square2[y]        = square1[x]      (y runs $ff down to 0)
                ldx #$00
                ldy #$ff
!:
                lda square1_hi + 1, x
                sta square2_hi + $0100, x
                lda square1_hi, x
                sta square2_hi, y
                lda square1_lo + 1, x
                sta square2_lo + $0100, x
                lda square1_lo, x
                sta square2_lo, y
                dey
                inx
                bne !-

                rts

//
// udivmod32
//
// 32-bit unsigned integer div/mod routine.
// Based on https://bisqwit.iki.fi/story/howto/bitmath/
//
// TODO consistent name
// TODO could probably be optimized further
//
// Input:
// - udivmod32_dividend: LSB of the dividend.
// - udivmod32_divisor: LSB of the divisor.
//
// Output:
// - udivmod32_result: LSB of the division result.
// - udivmod32_remainder: LSB of the modulo result.
//
// Destroys: a, $20..$2b
//
// Notes:
// - When the divisor is 0 the routine returns early with udivmod32_result
//   set to 0; udivmod32_remainder is not written in that case.
// - NOTE(review): the scratch range $20..$2b contains $22 and $26, which
//   this file also assigns to PRODUCT and T1 of the multiply routines.
//
.const scaled_divisor = $20 // ..23
.const multiple = $24 // ..27
.const temp = $28 // ..2b

udivmod32:
    // result = 0;
    lda #0
    sta udivmod32_result + 3
    sta udivmod32_result + 2
    sta udivmod32_result + 1
    sta udivmod32_result + 0

    // if (divisor == 0) return;
    lda udivmod32_divisor + 0
    ora udivmod32_divisor + 1
    ora udivmod32_divisor + 2
    ora udivmod32_divisor + 3
    bne !divisor_ok+
    rts

!divisor_ok:
    // scaled_divisor = divisor;
    lda udivmod32_divisor + 0
    sta scaled_divisor + 0
    lda udivmod32_divisor + 1
    sta scaled_divisor + 1
    lda udivmod32_divisor + 2
    sta scaled_divisor + 2
    lda udivmod32_divisor + 3
    sta scaled_divisor + 3

    // remainder = dividend;
    lda udivmod32_dividend + 0
    sta udivmod32_remainder + 0
    lda udivmod32_dividend + 1
    sta udivmod32_remainder + 1
    lda udivmod32_dividend + 2
    sta udivmod32_remainder + 2
    lda udivmod32_dividend + 3
    sta udivmod32_remainder + 3

    // multiple = 1;
    lda #0
    sta multiple + 3
    sta multiple + 2
    sta multiple + 1
    lda #1
    sta multiple + 0

    // Shift scaled_divisor and multiple left together until bit 31 of
    // scaled_divisor is set.
!normalize:
    bit scaled_divisor + 3
    bmi !normalized+

    asl scaled_divisor + 0
    rol scaled_divisor + 1
    rol scaled_divisor + 2
    rol scaled_divisor + 3

    asl multiple + 0
    rol multiple + 1
    rol multiple + 2
    rol multiple + 3

    jmp !normalize-
!normalized:

    // One pass per bit position, from the highest position down to bit 0.
!next_bit:
    // temp = remainder - scaled_divisor; carry clear means it did not fit.
    sec
    lda udivmod32_remainder + 0
    sbc scaled_divisor + 0
    sta temp + 0
    lda udivmod32_remainder + 1
    sbc scaled_divisor + 1
    sta temp + 1
    lda udivmod32_remainder + 2
    sbc scaled_divisor + 2
    sta temp + 2
    lda udivmod32_remainder + 3
    sbc scaled_divisor + 3
    sta temp + 3
    bcc !no_fit+

    // remainder = temp;
    lda temp + 0
    sta udivmod32_remainder + 0
    lda temp + 1
    sta udivmod32_remainder + 1
    lda temp + 2
    sta udivmod32_remainder + 2
    lda temp + 3
    sta udivmod32_remainder + 3

    // result += multiple;
    clc
    lda udivmod32_result + 0
    adc multiple + 0
    sta udivmod32_result + 0
    lda udivmod32_result + 1
    adc multiple + 1
    sta udivmod32_result + 1
    lda udivmod32_result + 2
    adc multiple + 2
    sta udivmod32_result + 2
    lda udivmod32_result + 3
    adc multiple + 3
    sta udivmod32_result + 3

!no_fit:
    // scaled_divisor >>= 1;
    lsr scaled_divisor + 3
    ror scaled_divisor + 2
    ror scaled_divisor + 1
    ror scaled_divisor + 0

    // multiple >>= 1;
    lsr multiple + 3
    ror multiple + 2
    ror multiple + 1
    ror multiple + 0

    // Repeat until the single bit in multiple has been shifted out.
    lda multiple + 0
    ora multiple + 1
    ora multiple + 2
    ora multiple + 3
    bne !next_bit-

    rts