diff options
author | Michael Sevakis <jethead71@rockbox.org> | 2012-05-07 03:12:56 -0400 |
---|---|---|
committer | Michael Sevakis <jethead71@rockbox.org> | 2013-05-16 18:52:21 +0200 |
commit | a7dee7f4472d9ebc77d6917043dc3c59a38e40d5 (patch) | |
tree | 5aabd08e93a2f398d041c7994f395be365582f43 | |
parent | 91b850ec425545d52d25c5b0f9a2caf6f853dfb7 (diff) | |
download | rockbox-a7dee7f.tar.gz rockbox-a7dee7f.zip |
Introduce new hermite polynomial resampler.
Uses the Catmull-Rom case of Hermite cubic splines.
Vastly improves the quality and accuracy of audio resampling with a
rather minor additional overhead compared to the previous linear
implementation.
ARM and Coldfire assembly implementations included.
Change-Id: Ic45d84bc66c5b312ef373198297a952167a4be26
Reviewed-on: http://gerrit.rockbox.org/304
Reviewed-by: Michael Sevakis <jethead71@rockbox.org>
Tested-by: Michael Sevakis <jethead71@rockbox.org>
-rw-r--r-- | lib/rbcodec/dsp/dsp_arm.S | 310 | ||||
-rw-r--r-- | lib/rbcodec/dsp/dsp_cf.S | 385 | ||||
-rw-r--r-- | lib/rbcodec/dsp/resample.c | 108 |
3 files changed, 543 insertions, 260 deletions
diff --git a/lib/rbcodec/dsp/dsp_arm.S b/lib/rbcodec/dsp/dsp_arm.S index 136814d79a..5b899fa718 100644 --- a/lib/rbcodec/dsp/dsp_arm.S +++ b/lib/rbcodec/dsp/dsp_arm.S @@ -289,114 +289,224 @@ crossfeed_meier_process: ldmpc regs=r4-r10 @ restore non-volatile context, return .size crossfeed_meier_process, .-crossfeed_meier_process - /**************************************************************************** - * int resample_linear(struct resample_data *data, struct dsp_buffer *src, - * struct dsp_buffer *dst) + * int resample_hermite(struct resample_data *data, struct dsp_buffer *src, + * struct dsp_buffer *dst) */ .section .text, "ax",%progbits - .global resample_linear -resample_linear: + .global resample_hermite +resample_hermite: @input: r0 = data, r1 = src, r2 = dst - stmfd sp!, { r4-r11, lr } @ stack modified regs - ldr r4, [r0] @ r4 = data->delta - add r10, r0, #4 @ r10 = &data->phase - ldrb r3, [r1, #17] @ r3 = num_channels, - stmfd sp!, { r1, r10 } @ stack src, &data->phase -.lrs_channel_loop: - ldr r5, [r10] @ r5 = data->phase - ldr r6, [r1] @ r6 = srcrem = src->remcount - ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch] - ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch] - ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount - - cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000) - movgt r6, #0x8000 @ - mov r0, r5, lsr #16 @ pos = MIN(pos, srcrem) - cmp r0, r6 @ - movgt r0, r6 @ r0 = pos = phase >> 16 - cmp r0, #0 @ - ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch] - addgt r12, r7, r0, lsl #2 @ pos > 0? r1 = last = s[pos - 1] - ldrgt r11, [r12, #-4] @ - cmp r0, r6 @ - bge .lrs_channel_done @ pos >= count? channel complete - - cmp r4, #0x10000 @ delta >= 1.0? - ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos] - bhs .lrs_dsstart @ yes? is downsampling + stmfd sp!, { r0-r2, r4-r11, lr } @ stack parms, modified regs + ldr r9, [r1] @ r9 = srcrem = src->remcount + ldrb r10, [r1, #17] @ r10 = ch = num_channels + ldr r14, [r0] @ r14 = data->delta, r0 = data + + cmp r9, #0x8000 @ srcrem = MIN(srcrem, 0x8000) + movgt r9, #0x8000 @ + + @ Channels are processed high to low while history is saved low to high + @ It's really noone's business how we do this + add r12, r0, #8 @ r12 = h = data->history + +.hrs_channel_loop: + stmfd sp!, { r10, r12 } @ push ch, h + ldr r5, [r0, #4] @ r5 = data->phase + ldr r6, [r1, r10, lsl #2] @ r6 = src->p32[ch] + ldr r7, [r2, r10, lsl #2] @ r7 = dst->p32[ch] + ldr r8, [r2, #12] @ r8 = dstrem = dst->bufcount + + mov r0, r5, lsr #16 @ r0 = pos = phase >> 16 + cmp r0, r9 @ r0 = pos = MIN(pos, srcrem) + movgt r0, r9 @ + + add r6, r6, r0, lsl #2 @ r6 = &s[pos] + + cmp r0, #3 @ pos >= 3? history not needed + ldmgedb r6, { r1-r3 } @ x3..x1 = s[pos-3]..s[pos-1] + bge .hrs_loadhist_done @ + add r10, r0, r0, lsl #1 @ branch pc + pos*12 + add pc, pc, r10, lsl #2 @ + nop @ + + ldmia r12, { r1-r3 } @ x3..x1 = h[0]..h[2] + b .hrs_loadhist_done @ + nop @ + + ldmib r12, { r1-r2 } @ x3..x2 = h[1]..h[2] + ldr r3, [r6, #-4] @ x1 = s[0] + b .hrs_loadhist_done @ + + ldr r1, [r12, #8] @ x3 = h[2] + ldmdb r6, { r2-r3 } @ x2..x1 = s[0]..s[1] +.hrs_loadhist_done: + + cmp r0, r9 @ pos past end? + bge .hrs_channel_done + + cmp r14, #0x10000 @ delta >= 1.0? + bhs .hrs_dsstart @ yes? is downsampling /** Upsampling **/ - mov r5, r5, lsl #16 @ Move phase into high halfword - add r7, r7, r0, lsl #2 @ r7 = &s[pos] - sub r0, r6, r0 @ r0 = dte = srcrem - pos -.lrs_usloop_1: - ldr r12, [r7], #4 @ r12 = s[pos] - sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1] -.lrs_usloop_0: - mov r1, r5, lsr #16 @ r1 = frac = phase >> 16 - @ keep frac in Rs to take advantage of multiplier early termination - smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi) - add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff - add r1, r1, r10, lsl #16 @ - str r1, [r8], #4 @ *d++ = out - subs r9, r9, #1 @ destination full? - bls .lrs_usfull @ yes? channel is done - adds r5, r5, r4, lsl #16 @ phase += delta << 16 - bcc .lrs_usloop_0 @ if carry is set, pos is incremented - subs r0, r0, #1 @ if srcrem > 0, do another sample - mov r11, r12 @ r11 = last = s[pos-1] (pos changed) - bgt .lrs_usloop_1 - b .lrs_usdone - -.lrs_usfull: - adds r5, r5, r4, lsl #16 @ do missed phase increment - subcs r0, r0, #1 @ do missed srcrem decrement - movcs r11, r12 @ r11 = s[pos-1] (pos changed) - -.lrs_usdone: - sub r0, r6, r0 @ r0 = pos = srcrem - dte - orr r5, r5, r0 @ reconstruct swapped phase - mov r5, r5, ror #16 @ swap pos and frac for phase - b .lrs_channel_done @ + str r9, [sp, #-4]! @ push srcrem + mov r5, r5, lsl #16 @ r5 = phase << 16 + sub r0, r9, r0 @ r0 = dte = srcrem - pos + mov r14, r14, lsl #16 @ r14 = delta << 16 + + @ Register usage in loop: + @ r0 = dte + @ r1 = x3, r2 = x2, r3 = x1, r4 = x0 + @ r5 = phase << 16/frac, r6 = &s[pos], r7 = d, r8 = dstrem + @ r9 = scratch/acclo, r10 = scratch/acchi + @ r11 = c2, r12 = c3, c1 calculated in frac loop + @ r14 = delta << 16 + @ + @ Try to avoid overflow as much as possible and at the same time preserve + @ accuracy. Same formulas apply to downsampling but registers and + @ instruction order differ due to specific constraints. + @ c1 = -0.5*x3 + 0.5*x1 + @ = 0.5*(x1 - x3) <-- + @ + @ v = x1 - x2, -v = x2 - x1 + @ c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0 + @ = x3 + 2*(x1 - x2) - 0.5*(x0 + x2) + @ = x3 + 2*v - 0.5*(x0 + x2) <-- + @ + @ c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0 + @ = 0.5*(x0 - x3 + (x2 - x1)) + (x2 - x1) + @ = 0.5*(x0 - x3 - v) - v <-- +.hrs_usloop_carry: + ldr r4, [r6], #4 @ x0 = s[pos] + sub r9, r3, r2 @ r9 = v, r11 = c2, r12 = c3 + add r11, r1, r9, asl #1 @ + add r10, r4, r2 @ + sub r12, r4, r1 @ + sub r12, r12, r9 @ + sub r11, r11, r10, asr #1 @ + rsb r12, r9, r12, asr #1 @ +.hrs_usloop_frac: + mov r5, r5, lsr #16 @ r5 = phase -> frac + smull r9, r10, r12, r5 @ acc = frac * c3 + c2 + add r9, r11, r9, lsr #16 @ + add r9, r9, r10, asl #16 @ + smull r9, r10, r5, r9 @ acc = frac * acc + c1 + mov r9, r9, lsr #16 @ + orr r9, r9, r10, asl #16 @ + sub r10, r3, r1 @ + add r9, r9, r10, asr #1 @ + smull r9, r10, r5, r9 @ acc = frac * acc + x2 + subs r8, r8, #1 @ destination full? + add r9, r2, r9, lsr #16 @ + add r9, r9, r10, asl #16 @ + str r9, [r7], #4 @ *d++ = acc + bls .hrs_usfull @ yes? channel is done + adds r5, r14, r5, lsl #16 @ frac += delta + bcc .hrs_usloop_frac @ if carry is set, pos is incremented + + subs r0, r0, #1 @ if dte > 0, do another sample + mov r1, r2 @ x3 = x2 + mov r2, r3 @ x2 = x1 + mov r3, r4 @ x1 = x0 + bgt .hrs_usloop_carry + b .hrs_usdone + +.hrs_usfull: + adds r5, r14, r5, lsl #16 @ do missed phase increment + bcc .hrs_usdone @ + sub r0, r0, #1 @ do missed dte decrement + mov r1, r2 @ do missed history update + mov r2, r3 @ + mov r3, r4 @ + +.hrs_usdone: + ldr r9, [sp], #4 @ r9 = pop srcrem + mov r14, r14, lsr #16 @ restore delta for next round + sub r0, r9, r0 @ r0 = pos = srcrem - dte + orr r5, r5, r0 @ reconstruct swapped phase + mov r5, r5, ror #16 @ swap pos and frac for phase + b .hrs_channel_done /** Downsampling **/ -.lrs_dsloop: - add r10, r7, r0, lsl #2 @ r10 = &s[pos] - ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos] -.lrs_dsstart: - sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1] - @ keep frac in Rs to take advantage of multiplier early termination - bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff - smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi) - add r5, r5, r4 @ phase += delta - subs r9, r9, #1 @ destination full? ... - mov r0, r5, lsr #16 @ pos = phase >> 16 - add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff - add r1, r1, r10, lsl #16 @ - str r1, [r8], #4 @ *d++ = out - cmpgt r6, r0 @ ... || pos >= srcrem? ... - bgt .lrs_dsloop @ ... no, do more samples - - cmp r0, r6 @ pos = MIN(pos, srcrem) - movgt r0, r6 @ - sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0 - ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1] - -.lrs_channel_done: - ldmia sp, { r1, r10 } @ recover src, &data->phase - str r11, [r10, r3, lsl #2] @ last_sample[ch] = last - subs r3, r3, #1 @ - bgt .lrs_channel_loop @ - - ldr r6, [r2, #12] @ r6 = dst->bufcount - sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16) - str r5, [r10] @ data->phase = r5 - sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount - str r6, [r2] @ dst->remcount = dstcount - add sp, sp, #8 @ adjust stack for temp variables - ldmpc regs=r4-r11 @ ... and we're out - .size resample_linear, .-resample_linear + @ Register usage in loop: + @ r0 = pos/frac + @ r1 = x3, r2 = x2, r3 = x1, r4 = x0 + @ r5 = phase, r6 = &s[pos], r7 = d, r8 = dstrem + @ r9 = srcrem, r10 = scratch/acclo + @ r11 = c2/scratch, r12 = c3/acchi + @ r14 = delta +.hrs_dsloop_4: + ldmdb r6, { r1-r3 } @ x3..x0 = s[pos-3]..s[pos-1] + b .hrs_dsloop +.hrs_dsloop_3: + ldmdb r6, { r2-r3 } @ x2..x0 = s[pos-2]..s[pos-1] + mov r1, r4 @ x3 = x0 + b .hrs_dsloop +.hrs_dsloop_2: + mov r1, r3 @ x3 = x1 + ldr r3, [r6, #-4] @ x1 = s[pos-1] + mov r2, r4 @ x2 = x0 + b .hrs_dsloop +.hrs_dsloop_1: @ expected loop destination + mov r1, r2 @ x3 = x2 + mov r2, r3 @ x2 = x1 + mov r3, r4 @ x1 = x0 +.hrs_dsloop: + subs r8, r8, #1 @ destination full? + cmpgt r9, r0 @ ... || pos >= srcrem? + ble .hrs_channel_done +.hrs_dsstart: + ldr r4, [r6] @ x0 = s[pos] + sub r10, r3, r2 @ r10 = v, r11 = c2, r12 = c3 + add r11, r4, r2 @ + bic r0, r5, r0, lsl #16 @ r0 = frac = phase & 0xffff + sub r11, r1, r11, asr #1 @ + add r11, r11, r10, asl #1 @ + sub r12, r4, r1 @ + sub r12, r12, r10 @ + rsb r12, r10, r12, asr #1 @ + smull r10, r12, r0, r12 @ acc = frac * c3 + c2 + add r10, r11, r10, lsr #16 @ + add r10, r10, r12, asl #16 @ + sub r11, r3, r1 @ + smull r10, r12, r0, r10 @ acc = frac * acc + c1 + mov r11, r11, asr #1 @ + add r10, r11, r10, lsr #16 @ + add r10, r10, r12, asl #16 @ + smull r10, r12, r0, r10 @ acc = frac * acc + x2 + mov r11, r5, lsr #16 @ r11 = last_pos + add r5, r5, r14 @ phase += delta + mov r0, r5, lsr #16 @ r0 = pos = phase >> 16 + add r10, r2, r10, lsr #16 @ + add r10, r10, r12, asl #16 @ + str r10, [r7], #4 @ *d++ = acc + + cmp r0, r9 @ r0 = pos = MIN(pos, srcrem) + movgt r0, r9 @ + sub r11, r0, r11 @ shift = pos - last_pos + cmp r11, #4 @ + add r6, r6, r11, lsl #2 @ r6 += shift * 4 + bge .hrs_dsloop_4 @ + ldr pc, [pc, r11, lsl #2] @ branch to corresponding loop address + .word 0, 0 + .word .hrs_dsloop_1 + .word .hrs_dsloop_2 + .word .hrs_dsloop_3 + +.hrs_channel_done: + ldmfd sp!, { r10, r12 } @ recover ch, h + subs r10, r10, #1 @ --ch + stmia r12!, { r1-r3 } @ h[0..2] = x3..x1 + ldmgtia sp, { r0-r2 } @ load data, src, dst + bgt .hrs_channel_loop + + ldmfd sp!, { r1-r3 } @ pop data, src, dst + sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16) + ldr r2, [r3, #12] @ r2 = dst->bufcount + str r5, [r1, #4] @ data->phase = r5 + sub r2, r2, r8 @ r2 = dst->bufcount - dstrem + str r2, [r3] @ dst->remcount = r2 + ldmpc regs=r4-r11 @ ... and we're out + .size resample_hermite, .-resample_hermite /**************************************************************************** * void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p) diff --git a/lib/rbcodec/dsp/dsp_cf.S b/lib/rbcodec/dsp/dsp_cf.S index 7245fae4cd..4b0c6276e1 100644 --- a/lib/rbcodec/dsp/dsp_cf.S +++ b/lib/rbcodec/dsp/dsp_cf.S @@ -179,145 +179,286 @@ crossfeed_meier_process: .size crossfeed_meier_process, .-crossfeed_meier_process /**************************************************************************** - * int resample_linear(struct resample_data *data, struct dsp_buffer *src, - * struct dsp_buffer *dst) + * int resample_hermite(struct resample_data *data, struct dsp_buffer *src, + * struct dsp_buffer *dst) */ .section .text .align 2 - .global resample_linear -resample_linear: + .global resample_hermite +resample_hermite: | input: 4(sp) = data, 8(sp) = src, 12(sp) = dst - lea.l -44(%sp), %sp | save non-volatiles - movem.l %d2-%d7/%a2-%a6, (%sp) | - movem.l 48(%sp), %a0-%a2 | %a0 = data + lea.l -52(%sp), %sp | save non-volatiles, allocate temps + movem.l %d2-%d7/%a2-%a6, 8(%sp) | + movem.l 56(%sp), %a0-%a2 | %a0 = data | %a1 = src | %a2 = dst - clr.l %d1 | %d1 = ch = src->format.num_channels - move.b 17(%a1), %d1 | - moveq.l #16, %d7 | %d7 = shift -.lrs_channel_loop: | - movem.l (%a0), %d2-%d3 | %d2 = delta = data->delta, - | %d3 = phase = data->phase - move.l (%a1), %d4 | %d4 = srcrem = src->remcount - move.l 12(%a2), %d5 | %d5 = dstrem = dst->bufcount - cmp.l #0x8000, %d4 | %d4 = MIN(srcrem, 0x8000) - ble.b 10f | - move.l #0x8000, %d4 | -10: | - move.l (%a1, %d1.l*4), %a3 | %a3 = s = src->p32[ch] - move.l (%a2, %d1.l*4), %a4 | %a4 = d = dst->p32[ch] - move.l %d3, %d0 | %d0 = pos - lsr.l %d7, %d0 | ... - beq.b 11f | pos == 0? - cmp.l %d4, %d0 | pos = MIN(pos, srcrem) - blt.b 12f | - move.l %d4, %d0 | pos = srcrem - move.l -4(%a3, %d0.l*4), %d6 | %d6 = last = s[pos - 1] - bra.w .lrs_channel_complete | at limit; nothing to do but next -11: | - move.l 4(%a0, %d1.l*4), %d6 | %d6 = last = last_sample[ch] - tpf.l | trap next move.l (last = s[pos - 1]) -12: | - move.l -4(%a3, %d0.l*4), %d6 | %d6 = last = s[pos - 1] - cmp.l #0x10000, %d2 | delta >= 1.0? - bhs.b .lrs_downsample | yes? downsampling + clr.l %d5 | %d5 = ch = src->format.num_channels + move.b 17(%a1), %d5 | + lea.l 8(%a0), %a5 | %a5 = h = history[ch] + moveq.l #16, %d7 | %d7 = shift val +.hrs_channel_loop: | + movem.l %d5/%a5, (%sp) | store ch, h + movem.l (%a0), %d1-%d2 | %d1 = delta = data->delta, + | %d2 = phase = data->phase + move.l (%a1), %d3 | %d3 = srcrem = src->remcount + move.l 12(%a2), %d4 | %d4 = dstrem = dst->bufcount + + cmp.l #0x8000, %d3 | %d4 = MIN(srcrem, 0x8000) + ble.b 1f | + move.l #0x8000, %d3 | +1: | + + move.l (%a1, %d5.l*4), %a1 | %a1 = s = src->p32[ch] + move.l (%a2, %d5.l*4), %a2 | %a2 = d = dst->p32[ch] + + move.l %d2, %d0 | %d0 = pos = phase >> 16 + lsr.l %d7, %d0 | + + cmp.l %d3, %d0 | pos = MIN(pos, srcrem) + ble.b 1f | + move.l %d3, %d0 | +1: + + lea.l (%a1, %d0.l*4), %a1 | %a1 = &s[pos] + + cmp.l #3, %d0 | + bge.b 1f | + move.l %d0, %a0 | + lea.l (%a0, %a0.l*2), %a0 | + jmp 2(%pc, %a0.l*4) | 4b | + | 0 + movem.l (%a5), %a3-%a5 | 4b | x3..x1 = h[0]..h[2] + bra.b 2f | 2b | + .dcb.w 3,0 | 6b | filler + | 1 + movem.l 4(%a5), %a3-%a4 | 6b | x3..x2 = h[1]..h[2] + move.l -4(%a1), %a5 | 4b | x1 = s[0] + bra.b 2f | 2b | + | 2 + move.l 8(%a5), %a3 | 4b | x3 = h[2] + movem.l -8(%a1), %a4-%a5 | 6b | x2..x1 = s[0]..s[1] + bra.b 2f | 2b | +1: | 3 + + movem.l -12(%a1), %a3-%a5 | x3...x1 = s[pos-3]..s[pos-1] +2: + + cmp.l %d3, %d0 | pos past end? + bge.w .hrs_channel_done | + + cmp.l #0x10000, %d1 | delta >= 1.0? + bhs.w .hrs_dsstart | yes? downsampling | /** Upsampling **/ | - lea.l (%a3, %d0.l*4), %a3 | %a3 = &s[pos] - sub.l %d4, %d0 | %d0 = pos - srcrem = -dte - lsl.l %d7, %d2 | move delta to bits 30..15 + sub.l %d3, %d0 | %d0 = pos - srcrem = -dte + lsl.l %d7, %d1 | move delta to bits 30..15 + lsr.l #1, %d1 | + lsl.l %d7, %d2 | move phase to bits 30..15 lsr.l #1, %d2 | - lsl.l %d7, %d3 | move phase to bits 30..15 - lsr.l #1, %d3 | - move.l (%a3)+, %a5 | %a5 = s[pos] - move.l %a5, %a6 | %a6 = diff = s[pos] - last - sub.l %d6, %a6 | - bra.b 22f | - /* Funky loop structure is to avoid emac latency stalls */ -20: | - move.l (%a3)+, %a5 | %a5 = s[pos] - move.l %a5, %a6 | %a6 = diff = s[pos] - last - sub.l %d6, %a6 | -21: | - movclr.l %acc0, %d7 | *d++ = %d7 = result - move.l %d7, (%a4)+ | -22: | - move.l %d6, %acc0 | %acc0 = last - mac.l %d3, %a6, %acc0 | %acc0 += frac * diff - subq.l #1, %d5 | dstrem <= 0? - ble.b 23f | yes? stop - add.l %d2, %d3 | phase += delta - bpl.b 21b | load next values? - move.l %a5, %d6 | - bclr.l #31, %d3 | clear sign bit + | + | Register usage in loop: + | r0 = dte, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem + | d5 = scratch, d6 = c3, d7 = scratch + | a0 = c2, a1 = &s[pos], a2 = d, + | a3 = x3, a4 = x2, a5 = x1, a6 = x0 + | + | Try to avoid overflow as much as possible and at the same time preserve + | accuracy. Same formulas apply to downsampling but registers and + | instruction order differ due to specific constraints. + | c1 = -0.5*x3 + 0.5*x1 + | = 0.5*(x1 - x3) <-- + | + | v = x1 - x2, -v = x2 - x1 + | c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0 + | = x3 + 2*(x1 - x2) - 0.5*(x0 + x2) + | = x3 + 2*v - 0.5*(x0 + x2) <-- + | + | c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0 + | = 0.5*x0 - 0.5*x3 + 0.5*(x2 - x1) + (x2 - x1) + | = 0.5*(x0 - x3 - v) - v <-- + | +.hrs_usloop_carry: + move.l (%a1)+, %a6 | %a6 = s[pos] + + move.l %a5, %d5 | v + sub.l %a4, %d5 | + + move.l %a6, %d6 | c3 + sub.l %a3, %d6 | + sub.l %d5, %d6 | + asr.l #1, %d6 | + sub.l %d5, %d6 | + + lea.l (%a3, %d5.l*2), %a0 | c2 + move.l %a6, %d5 | + add.l %a4, %d5 | + asr.l #1, %d5 | + sub.l %d5, %a0 | + +.hrs_usloop_frac: + move.l %a0, %acc0 | %acc0 = frac * c3 + c2 + mac.l %d2, %d6, %acc0 | + + move.l %a5, %d5 | c1 + sub.l %a3, %d5 | + asr.l #1, %d5 | + + movclr.l %acc0, %d7 | %acc1 = frac * acc + c1 + move.l %d5, %acc1 | + mac.l %d2, %d7, %acc1 | + + move.l %a4, %acc0 | %acc0 = frac * acc + x2 + movclr.l %acc1, %d5 | + mac.l %d2, %d5, %acc0 | + + subq.l #1, %d4 | dstrem <= 0? + ble.b .hrs_usfull | yes? stop + + movclr.l %acc0, %d5 | *d++ = d5 = result + move.l %d5, (%a2)+ | + + add.l %d1, %d2 | phase += delta + bpl.b .hrs_usloop_frac | load next values? + + move.l %a4, %a3 | x3 = x2 + move.l %a5, %a4 | x2 = x1 + move.l %a6, %a5 | x1 = x0 + + bclr.l #31, %d2 | clear sign bit addq.l #1, %d0 | dte > 0? - bmi.b 20b | yes? continue resampling - tpf.w | trap next add.l (phase += delta) -23: | - add.l %d2, %d3 | phase += delta - lsl.l #1, %d3 | frac -> phase - bcs.b 24f | was sign bit set? - tpf.l | -24: | - move.l %a5, %d6 | yes? was going to move to new s[pos] - addq.l #1, %d0 | - movclr.l %acc0, %d7 | *d = %d7 = result - move.l %d7, (%a4) | - add.l %d4, %d0 | %d0 = -dte + srcrem = pos - or.l %d0, %d3 | restore phase - swap.w %d3 | - moveq.l #16, %d7 | %d7 = shift - bra.b .lrs_channel_complete | - | - /** Downsampling **/ | -.lrs_downsample: | - move.l (%a3, %d0.l*4), %a5 | %a5 = s[pos] - bra.b 31f | -30: | - lea.l -4(%a3, %d0.l*4), %a5 | %d6 = s[pos - 1], %a5 = s[pos] - movem.l (%a5), %d6/%a5 | -31: | - move.l %d6, %acc0 | %acc0 = last - sub.l %d6, %a5 | %a5 = diff = s[pos] - s[pos - 1] - move.l %d3, %d0 | frac = (phase << 16) >> 1 - lsl.l %d7, %d0 | - lsr.l #1, %d0 | - mac.l %d0, %a5, %acc0 | %acc0 += frac * diff - add.l %d2, %d3 | phase += delta - move.l %d3, %d0 | pos = phase >> 16 + bmi.b .hrs_usloop_carry | yes? continue resampling + bra.b .hrs_usdone + +.hrs_usfull: + movclr.l %acc0, %d5 | *d++ = d5 = result + move.l %d5, (%a2) | + + add.l %d1, %d2 | do missed phase increment + bpl.b .hrs_usdone | was sign bit set? + + move.l %a4, %a3 | do missed history update + move.l %a5, %a4 | + move.l %a6, %a5 | + + addq.l #1, %d0 | do missed dte decrement + +.hrs_usdone: + moveq.l #16, %d7 | restore shift + lsl.l #1, %d2 | frac -> phase + add.l %d3, %d0 | %d0 = -dte + srcrem = pos + or.l %d0, %d2 | restore phase + swap.w %d2 | + + bra.w .hrs_channel_done | + + /** Downsampling **/ + | + | Register usage in loop: + | r0 = pos, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem + | d5 = scratch, d6 = scratch, d7 = 16 (shift value) + | a0 = scratch, a1 = &s[pos], a2 = d, + | a3 = x3, a4 = x2, a5 = x1, a6 = x0 + | +.hrs_dsloop: + movclr.l %acc0, %d5 | *d++ = acc + move.l %d5, (%a2)+ | + + sub.l %d0, %a0 | %a0 = -shift = last_pos - pos + move.l %a0, %d5 | + asl.l #2, %d5 | -shift -> -bytes + sub.l %d5, %a1 | %a1 = s = s - -bytes + cmp.l #-4, %a0 | >= 4? + ble.b 1f | + add.l %d5, %a0 | %a0 = 5 * -shift + jmp 40(%pc, %a0.l*2) | 4b | +1: | +4 + + movem.l -12(%a1), %a3-%a5 | 6b | x3..x0 = s[pos-3]..s[pos-1] + bra.b 1f | 2b | + | +3 + move.l %a6, %a3 | 2b | x3 = x0 + movem.l -8(%a1), %a4-%a5 | 6b | x2..x0 = s[pos-2]..s[pos-1] + bra.b 1f | 2b | 10 + | +2 + move.l %a5, %a3 | 2b | x3 = x1 + move.l %a6, %a4 | 2b | x2 = x0 + move.l -4(%a1), %a5 | 4b | x1 = s[pos-1] + bra.b 1f | 2b | 10 + | +1 + move.l %a4, %a3 | 2b | x3 = x2 | expected loop destination + move.l %a5, %a4 | 2b | x2 = x1 + move.l %a6, %a5 | 2b | x1 = x0 +1: + + subq.l #1, %d4 | 2b | dstrem <= 0? + ble.b .hrs_channel_done | 2b | yes? stop + cmp.l %d3, %d0 | + bge.b .hrs_channel_done | + +.hrs_dsstart: + move.l (%a1), %a6 | %a6 = s[pos] + move.l %a5, %d5 | v + sub.l %a4, %d5 | + + move.l %a6, %d6 | c3 + sub.l %a3, %d6 | + sub.l %d5, %d6 | + asr.l #1, %d6 | + sub.l %d5, %d6 | + + lea.l (%a3, %d5.l*2), %a0 | c2 + move.l %a6, %d5 | + add.l %a4, %d5 | + asr.l #1, %d5 | + sub.l %d5, %a0 | + + move.l %d2, %d5 | phase -> frac + lsl.l %d7, %d5 | + lsr.l #1, %d5 | + + move.l %a0, %acc0 | %acc0 = frac * c3 + c2 + mac.l %d5, %d6, %acc0 | + + move.l %a5, %d6 | c1 + sub.l %a3, %d6 | + asr.l #1, %d6 | + + movclr.l %acc0, %a0 | %acc1 = frac * acc + c1 + move.l %d6, %acc1 | + mac.l %d5, %a0, %acc1 | + + move.l %d0, %a0 | %a0 = last_pos + add.l %d1, %d2 | phase += delta + move.l %d2, %d0 | pos = phase >> 16 lsr.l %d7, %d0 | - movclr.l %acc0, %a5 | - move.l %a5, (%a4)+ | *d++ = %d0 - subq.l #1, %d5 | dst full? - ble.b 32f | yes? stop - cmp.l %d4, %d0 | pos < srcrem? - blt.b 30b | yes? continue resampling - tpf.l | trap cmp.l and ble.b -32: | - cmp.l %d4, %d0 | pos = MIN(pos, srcrem) - ble.b 33f | - move.l %d4, %d0 | -33: | - move.l -4(%a3, %d0.l*4), %d6 | %d6 = s[pos - 1] - | -.lrs_channel_complete: | - move.l %d6, 4(%a0, %d1.l*4) | last_sample[ch] = last - subq.l #1, %d1 | ch > 0? - bgt.w .lrs_channel_loop | yes? process next channel - | + + movclr.l %acc1, %d6 | %acc0 = frac * acc + x2 + move.l %a4, %acc0 | + mac.l %d5, %d6, %acc0 | + + cmp.l %d3, %d0 | %d0 = MIN(pos, srcrem) + ble.w .hrs_dsloop | + move.l %d3, %d0 | + bra.w .hrs_dsloop | + +.hrs_channel_done: | + movem.l (%sp), %d5/%a0 | restore ch, h + movem.l %a3-%a5, (%a0) | h[0..2] = x3..x1 + lea.l 12(%a0), %a5 | h++ + movem.l 56(%sp), %a0-%a2 | load data, src, dst + subq.l #1, %d5 | ch > 0? + bgt.w .hrs_channel_loop | yes? process next channel + move.l 12(%a2), %d1 | %d1 = dst->bufcount - sub.l %d5, %d1 | written = dst->bufcount - dstrem + sub.l %d4, %d1 | written = dst->bufcount - dstrem move.l %d1, (%a2) | dst->remcount = written move.l %d0, %d1 | wrap phase to position in next frame lsl.l %d7, %d1 | data->phase = phase - (pos << 16) - sub.l %d1, %d3 | ... - move.l %d3, 4(%a0) | ... - movem.l (%sp), %d2-%d7/%a2-%a6 | restore non-volatiles - lea.l 44(%sp), %sp | cleanup stack + sub.l %d1, %d2 | + move.l %d2, 4(%a0) | + movem.l 8(%sp), %d2-%d7/%a2-%a6 | restore non-volatiles + lea.l 52(%sp), %sp | cleanup stack rts | buh-bye - .size resample_linear, .-resample_linear - + .size resample_hermite, .-resample_hermite /**************************************************************************** * void channel_mode_proc_mono(struct dsp_proc_entry *this, diff --git a/lib/rbcodec/dsp/resample.c b/lib/rbcodec/dsp/resample.c index 4a8997cc44..176fedc742 100644 --- a/lib/rbcodec/dsp/resample.c +++ b/lib/rbcodec/dsp/resample.c @@ -9,6 +9,7 @@ * * Copyright (C) 2005 Miika Pekkarinen * Copyright (C) 2012 Michael Sevakis + * Copyright (C) 2013 Michael Giacomelli * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -31,9 +32,7 @@ * of our inability to look into the future at the end of a frame. */ -#if 0 /* Set to '1' to enable debug messages */ -#include <debug.h> -#else +#if 1 /* Set to '0' to enable debug messages */ #undef DEBUGF #define DEBUGF(...) #endif @@ -46,24 +45,24 @@ static int32_t resample_out_bufs[3][RESAMPLE_BUF_COUNT] IBSS_ATTR; /* Data for each resampler on each DSP */ static struct resample_data { - uint32_t delta; /* 00h: Phase delta for each step */ - uint32_t phase; /* 04h: Current phase [pos16|frac16] */ - int32_t last_sample[2]; /* 08h: Last samples for interpolation (L+R) */ - /* 10h */ - int32_t frequency; /* Virtual samplerate */ + uint32_t delta; /* 00h: Phase delta for each step in s15.16*/ + uint32_t phase; /* 04h: Current phase [pos16|frac16] */ + int32_t history[2][3]; /* 08h: Last samples for interpolation (L+R) + 0 = oldest, 2 = newest */ + /* 20h */ + int32_t frequency; /* Virtual samplerate */ struct dsp_buffer resample_buf; /* Buffer descriptor for resampled data */ - int32_t *resample_out_p[2]; /* Actual output buffer pointers */ + int32_t *resample_out_p[2]; /* Actual output buffer pointers */ } resample_data[DSP_COUNT] IBSS_ATTR; /* Actual worker function. Implemented here or in target assembly code. */ -int resample_linear(struct resample_data *data, struct dsp_buffer *src, - struct dsp_buffer *dst); +int resample_hermite(struct resample_data *data, struct dsp_buffer *src, + struct dsp_buffer *dst); static void resample_flush_data(struct resample_data *data) { data->phase = 0; - data->last_sample[0] = 0; - data->last_sample[1] = 0; + memset(&data->history, 0, sizeof (data->history)); } static void resample_flush(struct dsp_proc_entry *this) @@ -84,8 +83,8 @@ static bool resample_new_delta(struct resample_data *data, if (frequency == NATIVE_FREQUENCY) { /* NOTE: If fully glitch-free transistions from no resampling to - resampling are desired, last_sample history should be maintained - even when not resampling. */ + resampling are desired, history should be maintained even when + not resampling. */ resample_flush_data(data); return false; } @@ -94,9 +93,8 @@ static bool resample_new_delta(struct resample_data *data, } #if !defined(CPU_COLDFIRE) && !defined(CPU_ARM) -/* Where the real work is done */ -int resample_linear(struct resample_data *data, struct dsp_buffer *src, - struct dsp_buffer *dst) +int resample_hermite(struct resample_data *data, struct dsp_buffer *src, + struct dsp_buffer *dst) { int ch = src->format.num_channels - 1; uint32_t count = MIN(src->remcount, 0x8000); @@ -111,35 +109,70 @@ int resample_linear(struct resample_data *data, struct dsp_buffer *src, d = dst->p32[ch]; int32_t *dmax = d + dst->bufcount; + /* Restore state */ phase = data->phase; pos = phase >> 16; pos = MIN(pos, count); - int32_t last = pos > 0 ? s[pos - 1] : data->last_sample[ch]; - - if (pos < count) + while (pos < count && d < dmax) { - while (1) - { - *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last); - phase += delta; - pos = phase >> 16; - - if (pos >= count || d >= dmax) - break; + int x0, x1, x2, x3; - if (pos > 0) - last = s[pos - 1]; + if (pos < 3) + { + x3 = data->history[ch][pos+0]; + x2 = pos < 2 ? data->history[ch][pos+1] : s[pos-2]; + x1 = pos < 1 ? data->history[ch][pos+2] : s[pos-1]; } - - if (pos > 0) + else { - pos = MIN(pos, count); - last = s[pos - 1]; + x3 = s[pos-3]; + x2 = s[pos-2]; + x1 = s[pos-1]; } + + x0 = s[pos]; + + int32_t frac = (phase & 0xffff) << 15; + + /* 4-point, 3rd-order Hermite/Catmull-Rom spline (x-form): + * c1 = -0.5*x3 + 0.5*x1 + * = 0.5*(x1 - x3) <-- + * + * v = x1 - x2, -v = x2 - x1 + * c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0 + * = x3 + 2*(x1 - x2) - 0.5*(x0 + x2) + * = x3 + 2*v - 0.5*(x0 + x2) <-- + * + * c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0 + * = 0.5*x0 - 0.5*x3 + 0.5*(x2 - x1) + (x2 - x1) + * = 0.5*(x0 - x3 - v) - v <-- + * + * polynomial coefficients */ + int32_t c1 = (x1 - x3) >> 1; + int32_t v = x1 - x2; + int32_t c2 = x3 + 2*v - ((x0 + x2) >> 1); + int32_t c3 = ((x0 - x3 - v) >> 1) - v; + + /* Evaluate polynomial at time 'frac'; Horner's rule. */ + int32_t acc; + acc = FRACMUL(c3, frac) + c2; + acc = FRACMUL(acc, frac) + c1; + acc = FRACMUL(acc, frac) + x2; + + *d++ = acc; + + phase += delta; + pos = phase >> 16; } - data->last_sample[ch] = last; + pos = MIN(pos, count); + + /* Save delay samples for next time. Must do this even if pos was + * clamped before loop in order to keep record up to date. */ + data->history[ch][0] = pos < 3 ? data->history[ch][pos+0] : s[pos-3]; + data->history[ch][1] = pos < 2 ? data->history[ch][pos+1] : s[pos-2]; + data->history[ch][2] = pos < 1 ? data->history[ch][pos+2] : s[pos-1]; } while (--ch >= 0); @@ -147,7 +180,6 @@ int resample_linear(struct resample_data *data, struct dsp_buffer *src, data->phase = phase - (pos << 16); dst->remcount = d - dst->p32[0]; - return pos; } #endif /* CPU */ @@ -175,7 +207,7 @@ static void resample_process(struct dsp_proc_entry *this, { dst->bufcount = RESAMPLE_BUF_COUNT; - int consumed = resample_linear(data, src, dst); + int consumed = resample_hermite(data, src, dst); /* Advance src by consumed amount */ if (consumed > 0) |