summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Sevakis <jethead71@rockbox.org>2012-05-07 03:12:56 -0400
committerMichael Sevakis <jethead71@rockbox.org>2013-05-16 18:52:21 +0200
commita7dee7f4472d9ebc77d6917043dc3c59a38e40d5 (patch)
tree5aabd08e93a2f398d041c7994f395be365582f43
parent91b850ec425545d52d25c5b0f9a2caf6f853dfb7 (diff)
downloadrockbox-a7dee7f.tar.gz
rockbox-a7dee7f.zip
Introduce new hermite polynomial resampler.
Uses the Catmull-Rom case of Hermite cubic splines. Vastly improves the quality and accuracy of audio resampling with a rather minor additional overhead compared to the previous linear implementation. ARM and Coldfire assembly implementations included. Change-Id: Ic45d84bc66c5b312ef373198297a952167a4be26 Reviewed-on: http://gerrit.rockbox.org/304 Reviewed-by: Michael Sevakis <jethead71@rockbox.org> Tested-by: Michael Sevakis <jethead71@rockbox.org>
-rw-r--r--lib/rbcodec/dsp/dsp_arm.S310
-rw-r--r--lib/rbcodec/dsp/dsp_cf.S385
-rw-r--r--lib/rbcodec/dsp/resample.c108
3 files changed, 543 insertions, 260 deletions
diff --git a/lib/rbcodec/dsp/dsp_arm.S b/lib/rbcodec/dsp/dsp_arm.S
index 136814d79a..5b899fa718 100644
--- a/lib/rbcodec/dsp/dsp_arm.S
+++ b/lib/rbcodec/dsp/dsp_arm.S
@@ -289,114 +289,224 @@ crossfeed_meier_process:
ldmpc regs=r4-r10 @ restore non-volatile context, return
.size crossfeed_meier_process, .-crossfeed_meier_process
-
/****************************************************************************
- * int resample_linear(struct resample_data *data, struct dsp_buffer *src,
- * struct dsp_buffer *dst)
+ * int resample_hermite(struct resample_data *data, struct dsp_buffer *src,
+ * struct dsp_buffer *dst)
*/
.section .text, "ax",%progbits
- .global resample_linear
-resample_linear:
+ .global resample_hermite
+resample_hermite:
@input: r0 = data, r1 = src, r2 = dst
- stmfd sp!, { r4-r11, lr } @ stack modified regs
- ldr r4, [r0] @ r4 = data->delta
- add r10, r0, #4 @ r10 = &data->phase
- ldrb r3, [r1, #17] @ r3 = num_channels,
- stmfd sp!, { r1, r10 } @ stack src, &data->phase
-.lrs_channel_loop:
- ldr r5, [r10] @ r5 = data->phase
- ldr r6, [r1] @ r6 = srcrem = src->remcount
- ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch]
- ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch]
- ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount
-
- cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
- movgt r6, #0x8000 @
- mov r0, r5, lsr #16 @ pos = MIN(pos, srcrem)
- cmp r0, r6 @
- movgt r0, r6 @ r0 = pos = phase >> 16
- cmp r0, #0 @
- ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch]
- addgt r12, r7, r0, lsl #2 @ pos > 0? r1 = last = s[pos - 1]
- ldrgt r11, [r12, #-4] @
- cmp r0, r6 @
- bge .lrs_channel_done @ pos >= count? channel complete
-
- cmp r4, #0x10000 @ delta >= 1.0?
- ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos]
- bhs .lrs_dsstart @ yes? is downsampling
+ stmfd sp!, { r0-r2, r4-r11, lr } @ stack parms, modified regs
+ ldr r9, [r1] @ r9 = srcrem = src->remcount
+ ldrb r10, [r1, #17] @ r10 = ch = num_channels
+ ldr r14, [r0] @ r14 = data->delta, r0 = data
+
+ cmp r9, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
+ movgt r9, #0x8000 @
+
+ @ Channels are processed high to low while history is saved low to high
+ @ It's really noone's business how we do this
+ add r12, r0, #8 @ r12 = h = data->history
+
+.hrs_channel_loop:
+ stmfd sp!, { r10, r12 } @ push ch, h
+ ldr r5, [r0, #4] @ r5 = data->phase
+ ldr r6, [r1, r10, lsl #2] @ r6 = src->p32[ch]
+ ldr r7, [r2, r10, lsl #2] @ r7 = dst->p32[ch]
+ ldr r8, [r2, #12] @ r8 = dstrem = dst->bufcount
+
+ mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
+ cmp r0, r9 @ r0 = pos = MIN(pos, srcrem)
+ movgt r0, r9 @
+
+ add r6, r6, r0, lsl #2 @ r6 = &s[pos]
+
+ cmp r0, #3 @ pos >= 3? history not needed
+ ldmgedb r6, { r1-r3 } @ x3..x1 = s[pos-3]..s[pos-1]
+ bge .hrs_loadhist_done @
+ add r10, r0, r0, lsl #1 @ branch pc + pos*12
+ add pc, pc, r10, lsl #2 @
+ nop @
+
+ ldmia r12, { r1-r3 } @ x3..x1 = h[0]..h[2]
+ b .hrs_loadhist_done @
+ nop @
+
+ ldmib r12, { r1-r2 } @ x3..x2 = h[1]..h[2]
+ ldr r3, [r6, #-4] @ x1 = s[0]
+ b .hrs_loadhist_done @
+
+ ldr r1, [r12, #8] @ x3 = h[2]
+ ldmdb r6, { r2-r3 } @ x2..x1 = s[0]..s[1]
+.hrs_loadhist_done:
+
+ cmp r0, r9 @ pos past end?
+ bge .hrs_channel_done
+
+ cmp r14, #0x10000 @ delta >= 1.0?
+ bhs .hrs_dsstart @ yes? is downsampling
/** Upsampling **/
- mov r5, r5, lsl #16 @ Move phase into high halfword
- add r7, r7, r0, lsl #2 @ r7 = &s[pos]
- sub r0, r6, r0 @ r0 = dte = srcrem - pos
-.lrs_usloop_1:
- ldr r12, [r7], #4 @ r12 = s[pos]
- sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
-.lrs_usloop_0:
- mov r1, r5, lsr #16 @ r1 = frac = phase >> 16
- @ keep frac in Rs to take advantage of multiplier early termination
- smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
- add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
- add r1, r1, r10, lsl #16 @
- str r1, [r8], #4 @ *d++ = out
- subs r9, r9, #1 @ destination full?
- bls .lrs_usfull @ yes? channel is done
- adds r5, r5, r4, lsl #16 @ phase += delta << 16
- bcc .lrs_usloop_0 @ if carry is set, pos is incremented
- subs r0, r0, #1 @ if srcrem > 0, do another sample
- mov r11, r12 @ r11 = last = s[pos-1] (pos changed)
- bgt .lrs_usloop_1
- b .lrs_usdone
-
-.lrs_usfull:
- adds r5, r5, r4, lsl #16 @ do missed phase increment
- subcs r0, r0, #1 @ do missed srcrem decrement
- movcs r11, r12 @ r11 = s[pos-1] (pos changed)
-
-.lrs_usdone:
- sub r0, r6, r0 @ r0 = pos = srcrem - dte
- orr r5, r5, r0 @ reconstruct swapped phase
- mov r5, r5, ror #16 @ swap pos and frac for phase
- b .lrs_channel_done @
+ str r9, [sp, #-4]! @ push srcrem
+ mov r5, r5, lsl #16 @ r5 = phase << 16
+ sub r0, r9, r0 @ r0 = dte = srcrem - pos
+ mov r14, r14, lsl #16 @ r14 = delta << 16
+
+ @ Register usage in loop:
+ @ r0 = dte
+ @ r1 = x3, r2 = x2, r3 = x1, r4 = x0
+ @ r5 = phase << 16/frac, r6 = &s[pos], r7 = d, r8 = dstrem
+ @ r9 = scratch/acclo, r10 = scratch/acchi
+ @ r11 = c2, r12 = c3, c1 calculated in frac loop
+ @ r14 = delta << 16
+ @
+ @ Try to avoid overflow as much as possible and at the same time preserve
+ @ accuracy. Same formulas apply to downsampling but registers and
+ @ instruction order differ due to specific constraints.
+ @ c1 = -0.5*x3 + 0.5*x1
+ @ = 0.5*(x1 - x3) <--
+ @
+ @ v = x1 - x2, -v = x2 - x1
+ @ c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0
+ @ = x3 + 2*(x1 - x2) - 0.5*(x0 + x2)
+ @ = x3 + 2*v - 0.5*(x0 + x2) <--
+ @
+ @ c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0
+ @ = 0.5*(x0 - x3 + (x2 - x1)) + (x2 - x1)
+ @ = 0.5*(x0 - x3 - v) - v <--
+.hrs_usloop_carry:
+ ldr r4, [r6], #4 @ x0 = s[pos]
+ sub r9, r3, r2 @ r9 = v, r11 = c2, r12 = c3
+ add r11, r1, r9, asl #1 @
+ add r10, r4, r2 @
+ sub r12, r4, r1 @
+ sub r12, r12, r9 @
+ sub r11, r11, r10, asr #1 @
+ rsb r12, r9, r12, asr #1 @
+.hrs_usloop_frac:
+ mov r5, r5, lsr #16 @ r5 = phase -> frac
+ smull r9, r10, r12, r5 @ acc = frac * c3 + c2
+ add r9, r11, r9, lsr #16 @
+ add r9, r9, r10, asl #16 @
+ smull r9, r10, r5, r9 @ acc = frac * acc + c1
+ mov r9, r9, lsr #16 @
+ orr r9, r9, r10, asl #16 @
+ sub r10, r3, r1 @
+ add r9, r9, r10, asr #1 @
+ smull r9, r10, r5, r9 @ acc = frac * acc + x2
+ subs r8, r8, #1 @ destination full?
+ add r9, r2, r9, lsr #16 @
+ add r9, r9, r10, asl #16 @
+ str r9, [r7], #4 @ *d++ = acc
+ bls .hrs_usfull @ yes? channel is done
+ adds r5, r14, r5, lsl #16 @ frac += delta
+ bcc .hrs_usloop_frac @ if carry is set, pos is incremented
+
+ subs r0, r0, #1 @ if dte > 0, do another sample
+ mov r1, r2 @ x3 = x2
+ mov r2, r3 @ x2 = x1
+ mov r3, r4 @ x1 = x0
+ bgt .hrs_usloop_carry
+ b .hrs_usdone
+
+.hrs_usfull:
+ adds r5, r14, r5, lsl #16 @ do missed phase increment
+ bcc .hrs_usdone @
+ sub r0, r0, #1 @ do missed dte decrement
+ mov r1, r2 @ do missed history update
+ mov r2, r3 @
+ mov r3, r4 @
+
+.hrs_usdone:
+ ldr r9, [sp], #4 @ r9 = pop srcrem
+ mov r14, r14, lsr #16 @ restore delta for next round
+ sub r0, r9, r0 @ r0 = pos = srcrem - dte
+ orr r5, r5, r0 @ reconstruct swapped phase
+ mov r5, r5, ror #16 @ swap pos and frac for phase
+ b .hrs_channel_done
/** Downsampling **/
-.lrs_dsloop:
- add r10, r7, r0, lsl #2 @ r10 = &s[pos]
- ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos]
-.lrs_dsstart:
- sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
- @ keep frac in Rs to take advantage of multiplier early termination
- bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff
- smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
- add r5, r5, r4 @ phase += delta
- subs r9, r9, #1 @ destination full? ...
- mov r0, r5, lsr #16 @ pos = phase >> 16
- add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
- add r1, r1, r10, lsl #16 @
- str r1, [r8], #4 @ *d++ = out
- cmpgt r6, r0 @ ... || pos >= srcrem? ...
- bgt .lrs_dsloop @ ... no, do more samples
-
- cmp r0, r6 @ pos = MIN(pos, srcrem)
- movgt r0, r6 @
- sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0
- ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1]
-
-.lrs_channel_done:
- ldmia sp, { r1, r10 } @ recover src, &data->phase
- str r11, [r10, r3, lsl #2] @ last_sample[ch] = last
- subs r3, r3, #1 @
- bgt .lrs_channel_loop @
-
- ldr r6, [r2, #12] @ r6 = dst->bufcount
- sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
- str r5, [r10] @ data->phase = r5
- sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount
- str r6, [r2] @ dst->remcount = dstcount
- add sp, sp, #8 @ adjust stack for temp variables
- ldmpc regs=r4-r11 @ ... and we're out
- .size resample_linear, .-resample_linear
+ @ Register usage in loop:
+ @ r0 = pos/frac
+ @ r1 = x3, r2 = x2, r3 = x1, r4 = x0
+ @ r5 = phase, r6 = &s[pos], r7 = d, r8 = dstrem
+ @ r9 = srcrem, r10 = scratch/acclo
+ @ r11 = c2/scratch, r12 = c3/acchi
+ @ r14 = delta
+.hrs_dsloop_4:
+ ldmdb r6, { r1-r3 } @ x3..x0 = s[pos-3]..s[pos-1]
+ b .hrs_dsloop
+.hrs_dsloop_3:
+ ldmdb r6, { r2-r3 } @ x2..x0 = s[pos-2]..s[pos-1]
+ mov r1, r4 @ x3 = x0
+ b .hrs_dsloop
+.hrs_dsloop_2:
+ mov r1, r3 @ x3 = x1
+ ldr r3, [r6, #-4] @ x1 = s[pos-1]
+ mov r2, r4 @ x2 = x0
+ b .hrs_dsloop
+.hrs_dsloop_1: @ expected loop destination
+ mov r1, r2 @ x3 = x2
+ mov r2, r3 @ x2 = x1
+ mov r3, r4 @ x1 = x0
+.hrs_dsloop:
+ subs r8, r8, #1 @ destination full?
+ cmpgt r9, r0 @ ... || pos >= srcrem?
+ ble .hrs_channel_done
+.hrs_dsstart:
+ ldr r4, [r6] @ x0 = s[pos]
+ sub r10, r3, r2 @ r10 = v, r11 = c2, r12 = c3
+ add r11, r4, r2 @
+ bic r0, r5, r0, lsl #16 @ r0 = frac = phase & 0xffff
+ sub r11, r1, r11, asr #1 @
+ add r11, r11, r10, asl #1 @
+ sub r12, r4, r1 @
+ sub r12, r12, r10 @
+ rsb r12, r10, r12, asr #1 @
+ smull r10, r12, r0, r12 @ acc = frac * c3 + c2
+ add r10, r11, r10, lsr #16 @
+ add r10, r10, r12, asl #16 @
+ sub r11, r3, r1 @
+ smull r10, r12, r0, r10 @ acc = frac * acc + c1
+ mov r11, r11, asr #1 @
+ add r10, r11, r10, lsr #16 @
+ add r10, r10, r12, asl #16 @
+ smull r10, r12, r0, r10 @ acc = frac * acc + x2
+ mov r11, r5, lsr #16 @ r11 = last_pos
+ add r5, r5, r14 @ phase += delta
+ mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
+ add r10, r2, r10, lsr #16 @
+ add r10, r10, r12, asl #16 @
+ str r10, [r7], #4 @ *d++ = acc
+
+ cmp r0, r9 @ r0 = pos = MIN(pos, srcrem)
+ movgt r0, r9 @
+ sub r11, r0, r11 @ shift = pos - last_pos
+ cmp r11, #4 @
+ add r6, r6, r11, lsl #2 @ r6 += shift * 4
+ bge .hrs_dsloop_4 @
+ ldr pc, [pc, r11, lsl #2] @ branch to corresponding loop address
+ .word 0, 0
+ .word .hrs_dsloop_1
+ .word .hrs_dsloop_2
+ .word .hrs_dsloop_3
+
+.hrs_channel_done:
+ ldmfd sp!, { r10, r12 } @ recover ch, h
+ subs r10, r10, #1 @ --ch
+ stmia r12!, { r1-r3 } @ h[0..2] = x3..x1
+ ldmgtia sp, { r0-r2 } @ load data, src, dst
+ bgt .hrs_channel_loop
+
+ ldmfd sp!, { r1-r3 } @ pop data, src, dst
+ sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
+ ldr r2, [r3, #12] @ r2 = dst->bufcount
+ str r5, [r1, #4] @ data->phase = r5
+ sub r2, r2, r8 @ r2 = dst->bufcount - dstrem
+ str r2, [r3] @ dst->remcount = r2
+ ldmpc regs=r4-r11 @ ... and we're out
+ .size resample_hermite, .-resample_hermite
/****************************************************************************
* void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
diff --git a/lib/rbcodec/dsp/dsp_cf.S b/lib/rbcodec/dsp/dsp_cf.S
index 7245fae4cd..4b0c6276e1 100644
--- a/lib/rbcodec/dsp/dsp_cf.S
+++ b/lib/rbcodec/dsp/dsp_cf.S
@@ -179,145 +179,286 @@ crossfeed_meier_process:
.size crossfeed_meier_process, .-crossfeed_meier_process
/****************************************************************************
- * int resample_linear(struct resample_data *data, struct dsp_buffer *src,
- * struct dsp_buffer *dst)
+ * int resample_hermite(struct resample_data *data, struct dsp_buffer *src,
+ * struct dsp_buffer *dst)
*/
.section .text
.align 2
- .global resample_linear
-resample_linear:
+ .global resample_hermite
+resample_hermite:
| input: 4(sp) = data, 8(sp) = src, 12(sp) = dst
- lea.l -44(%sp), %sp | save non-volatiles
- movem.l %d2-%d7/%a2-%a6, (%sp) |
- movem.l 48(%sp), %a0-%a2 | %a0 = data
+ lea.l -52(%sp), %sp | save non-volatiles, allocate temps
+ movem.l %d2-%d7/%a2-%a6, 8(%sp) |
+ movem.l 56(%sp), %a0-%a2 | %a0 = data
| %a1 = src
| %a2 = dst
- clr.l %d1 | %d1 = ch = src->format.num_channels
- move.b 17(%a1), %d1 |
- moveq.l #16, %d7 | %d7 = shift
-.lrs_channel_loop: |
- movem.l (%a0), %d2-%d3 | %d2 = delta = data->delta,
- | %d3 = phase = data->phase
- move.l (%a1), %d4 | %d4 = srcrem = src->remcount
- move.l 12(%a2), %d5 | %d5 = dstrem = dst->bufcount
- cmp.l #0x8000, %d4 | %d4 = MIN(srcrem, 0x8000)
- ble.b 10f |
- move.l #0x8000, %d4 |
-10: |
- move.l (%a1, %d1.l*4), %a3 | %a3 = s = src->p32[ch]
- move.l (%a2, %d1.l*4), %a4 | %a4 = d = dst->p32[ch]
- move.l %d3, %d0 | %d0 = pos
- lsr.l %d7, %d0 | ...
- beq.b 11f | pos == 0?
- cmp.l %d4, %d0 | pos = MIN(pos, srcrem)
- blt.b 12f |
- move.l %d4, %d0 | pos = srcrem
- move.l -4(%a3, %d0.l*4), %d6 | %d6 = last = s[pos - 1]
- bra.w .lrs_channel_complete | at limit; nothing to do but next
-11: |
- move.l 4(%a0, %d1.l*4), %d6 | %d6 = last = last_sample[ch]
- tpf.l | trap next move.l (last = s[pos - 1])
-12: |
- move.l -4(%a3, %d0.l*4), %d6 | %d6 = last = s[pos - 1]
- cmp.l #0x10000, %d2 | delta >= 1.0?
- bhs.b .lrs_downsample | yes? downsampling
+ clr.l %d5 | %d5 = ch = src->format.num_channels
+ move.b 17(%a1), %d5 |
+ lea.l 8(%a0), %a5 | %a5 = h = history[ch]
+ moveq.l #16, %d7 | %d7 = shift val
+.hrs_channel_loop: |
+ movem.l %d5/%a5, (%sp) | store ch, h
+ movem.l (%a0), %d1-%d2 | %d1 = delta = data->delta,
+ | %d2 = phase = data->phase
+ move.l (%a1), %d3 | %d3 = srcrem = src->remcount
+ move.l 12(%a2), %d4 | %d4 = dstrem = dst->bufcount
+
+ cmp.l #0x8000, %d3 | %d4 = MIN(srcrem, 0x8000)
+ ble.b 1f |
+ move.l #0x8000, %d3 |
+1: |
+
+ move.l (%a1, %d5.l*4), %a1 | %a1 = s = src->p32[ch]
+ move.l (%a2, %d5.l*4), %a2 | %a2 = d = dst->p32[ch]
+
+ move.l %d2, %d0 | %d0 = pos = phase >> 16
+ lsr.l %d7, %d0 |
+
+ cmp.l %d3, %d0 | pos = MIN(pos, srcrem)
+ ble.b 1f |
+ move.l %d3, %d0 |
+1:
+
+ lea.l (%a1, %d0.l*4), %a1 | %a1 = &s[pos]
+
+ cmp.l #3, %d0 |
+ bge.b 1f |
+ move.l %d0, %a0 |
+ lea.l (%a0, %a0.l*2), %a0 |
+ jmp 2(%pc, %a0.l*4) | 4b |
+ | 0
+ movem.l (%a5), %a3-%a5 | 4b | x3..x1 = h[0]..h[2]
+ bra.b 2f | 2b |
+ .dcb.w 3,0 | 6b | filler
+ | 1
+ movem.l 4(%a5), %a3-%a4 | 6b | x3..x2 = h[1]..h[2]
+ move.l -4(%a1), %a5 | 4b | x1 = s[0]
+ bra.b 2f | 2b |
+ | 2
+ move.l 8(%a5), %a3 | 4b | x3 = h[2]
+ movem.l -8(%a1), %a4-%a5 | 6b | x2..x1 = s[0]..s[1]
+ bra.b 2f | 2b |
+1: | 3 +
+ movem.l -12(%a1), %a3-%a5 | x3...x1 = s[pos-3]..s[pos-1]
+2:
+
+ cmp.l %d3, %d0 | pos past end?
+ bge.w .hrs_channel_done |
+
+ cmp.l #0x10000, %d1 | delta >= 1.0?
+ bhs.w .hrs_dsstart | yes? downsampling
|
/** Upsampling **/ |
- lea.l (%a3, %d0.l*4), %a3 | %a3 = &s[pos]
- sub.l %d4, %d0 | %d0 = pos - srcrem = -dte
- lsl.l %d7, %d2 | move delta to bits 30..15
+ sub.l %d3, %d0 | %d0 = pos - srcrem = -dte
+ lsl.l %d7, %d1 | move delta to bits 30..15
+ lsr.l #1, %d1 |
+ lsl.l %d7, %d2 | move phase to bits 30..15
lsr.l #1, %d2 |
- lsl.l %d7, %d3 | move phase to bits 30..15
- lsr.l #1, %d3 |
- move.l (%a3)+, %a5 | %a5 = s[pos]
- move.l %a5, %a6 | %a6 = diff = s[pos] - last
- sub.l %d6, %a6 |
- bra.b 22f |
- /* Funky loop structure is to avoid emac latency stalls */
-20: |
- move.l (%a3)+, %a5 | %a5 = s[pos]
- move.l %a5, %a6 | %a6 = diff = s[pos] - last
- sub.l %d6, %a6 |
-21: |
- movclr.l %acc0, %d7 | *d++ = %d7 = result
- move.l %d7, (%a4)+ |
-22: |
- move.l %d6, %acc0 | %acc0 = last
- mac.l %d3, %a6, %acc0 | %acc0 += frac * diff
- subq.l #1, %d5 | dstrem <= 0?
- ble.b 23f | yes? stop
- add.l %d2, %d3 | phase += delta
- bpl.b 21b | load next values?
- move.l %a5, %d6 |
- bclr.l #31, %d3 | clear sign bit
+ |
+ | Register usage in loop:
+ | r0 = dte, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem
+ | d5 = scratch, d6 = c3, d7 = scratch
+ | a0 = c2, a1 = &s[pos], a2 = d,
+ | a3 = x3, a4 = x2, a5 = x1, a6 = x0
+ |
+ | Try to avoid overflow as much as possible and at the same time preserve
+ | accuracy. Same formulas apply to downsampling but registers and
+ | instruction order differ due to specific constraints.
+ | c1 = -0.5*x3 + 0.5*x1
+ | = 0.5*(x1 - x3) <--
+ |
+ | v = x1 - x2, -v = x2 - x1
+ | c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0
+ | = x3 + 2*(x1 - x2) - 0.5*(x0 + x2)
+ | = x3 + 2*v - 0.5*(x0 + x2) <--
+ |
+ | c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0
+ | = 0.5*x0 - 0.5*x3 + 0.5*(x2 - x1) + (x2 - x1)
+ | = 0.5*(x0 - x3 - v) - v <--
+ |
+.hrs_usloop_carry:
+ move.l (%a1)+, %a6 | %a6 = s[pos]
+
+ move.l %a5, %d5 | v
+ sub.l %a4, %d5 |
+
+ move.l %a6, %d6 | c3
+ sub.l %a3, %d6 |
+ sub.l %d5, %d6 |
+ asr.l #1, %d6 |
+ sub.l %d5, %d6 |
+
+ lea.l (%a3, %d5.l*2), %a0 | c2
+ move.l %a6, %d5 |
+ add.l %a4, %d5 |
+ asr.l #1, %d5 |
+ sub.l %d5, %a0 |
+
+.hrs_usloop_frac:
+ move.l %a0, %acc0 | %acc0 = frac * c3 + c2
+ mac.l %d2, %d6, %acc0 |
+
+ move.l %a5, %d5 | c1
+ sub.l %a3, %d5 |
+ asr.l #1, %d5 |
+
+ movclr.l %acc0, %d7 | %acc1 = frac * acc + c1
+ move.l %d5, %acc1 |
+ mac.l %d2, %d7, %acc1 |
+
+ move.l %a4, %acc0 | %acc0 = frac * acc + x2
+ movclr.l %acc1, %d5 |
+ mac.l %d2, %d5, %acc0 |
+
+ subq.l #1, %d4 | dstrem <= 0?
+ ble.b .hrs_usfull | yes? stop
+
+ movclr.l %acc0, %d5 | *d++ = d5 = result
+ move.l %d5, (%a2)+ |
+
+ add.l %d1, %d2 | phase += delta
+ bpl.b .hrs_usloop_frac | load next values?
+
+ move.l %a4, %a3 | x3 = x2
+ move.l %a5, %a4 | x2 = x1
+ move.l %a6, %a5 | x1 = x0
+
+ bclr.l #31, %d2 | clear sign bit
addq.l #1, %d0 | dte > 0?
- bmi.b 20b | yes? continue resampling
- tpf.w | trap next add.l (phase += delta)
-23: |
- add.l %d2, %d3 | phase += delta
- lsl.l #1, %d3 | frac -> phase
- bcs.b 24f | was sign bit set?
- tpf.l |
-24: |
- move.l %a5, %d6 | yes? was going to move to new s[pos]
- addq.l #1, %d0 |
- movclr.l %acc0, %d7 | *d = %d7 = result
- move.l %d7, (%a4) |
- add.l %d4, %d0 | %d0 = -dte + srcrem = pos
- or.l %d0, %d3 | restore phase
- swap.w %d3 |
- moveq.l #16, %d7 | %d7 = shift
- bra.b .lrs_channel_complete |
- |
- /** Downsampling **/ |
-.lrs_downsample: |
- move.l (%a3, %d0.l*4), %a5 | %a5 = s[pos]
- bra.b 31f |
-30: |
- lea.l -4(%a3, %d0.l*4), %a5 | %d6 = s[pos - 1], %a5 = s[pos]
- movem.l (%a5), %d6/%a5 |
-31: |
- move.l %d6, %acc0 | %acc0 = last
- sub.l %d6, %a5 | %a5 = diff = s[pos] - s[pos - 1]
- move.l %d3, %d0 | frac = (phase << 16) >> 1
- lsl.l %d7, %d0 |
- lsr.l #1, %d0 |
- mac.l %d0, %a5, %acc0 | %acc0 += frac * diff
- add.l %d2, %d3 | phase += delta
- move.l %d3, %d0 | pos = phase >> 16
+ bmi.b .hrs_usloop_carry | yes? continue resampling
+ bra.b .hrs_usdone
+
+.hrs_usfull:
+ movclr.l %acc0, %d5 | *d++ = d5 = result
+ move.l %d5, (%a2) |
+
+ add.l %d1, %d2 | do missed phase increment
+ bpl.b .hrs_usdone | was sign bit set?
+
+ move.l %a4, %a3 | do missed history update
+ move.l %a5, %a4 |
+ move.l %a6, %a5 |
+
+ addq.l #1, %d0 | do missed dte decrement
+
+.hrs_usdone:
+ moveq.l #16, %d7 | restore shift
+ lsl.l #1, %d2 | frac -> phase
+ add.l %d3, %d0 | %d0 = -dte + srcrem = pos
+ or.l %d0, %d2 | restore phase
+ swap.w %d2 |
+
+ bra.w .hrs_channel_done |
+
+ /** Downsampling **/
+ |
+ | Register usage in loop:
+ | r0 = pos, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem
+ | d5 = scratch, d6 = scratch, d7 = 16 (shift value)
+ | a0 = scratch, a1 = &s[pos], a2 = d,
+ | a3 = x3, a4 = x2, a5 = x1, a6 = x0
+ |
+.hrs_dsloop:
+ movclr.l %acc0, %d5 | *d++ = acc
+ move.l %d5, (%a2)+ |
+
+ sub.l %d0, %a0 | %a0 = -shift = last_pos - pos
+ move.l %a0, %d5 |
+ asl.l #2, %d5 | -shift -> -bytes
+ sub.l %d5, %a1 | %a1 = s = s - -bytes
+ cmp.l #-4, %a0 | >= 4?
+ ble.b 1f |
+ add.l %d5, %a0 | %a0 = 5 * -shift
+ jmp 40(%pc, %a0.l*2) | 4b |
+1: | +4 +
+ movem.l -12(%a1), %a3-%a5 | 6b | x3..x0 = s[pos-3]..s[pos-1]
+ bra.b 1f | 2b |
+ | +3
+ move.l %a6, %a3 | 2b | x3 = x0
+ movem.l -8(%a1), %a4-%a5 | 6b | x2..x0 = s[pos-2]..s[pos-1]
+ bra.b 1f | 2b | 10
+ | +2
+ move.l %a5, %a3 | 2b | x3 = x1
+ move.l %a6, %a4 | 2b | x2 = x0
+ move.l -4(%a1), %a5 | 4b | x1 = s[pos-1]
+ bra.b 1f | 2b | 10
+ | +1
+ move.l %a4, %a3 | 2b | x3 = x2 | expected loop destination
+ move.l %a5, %a4 | 2b | x2 = x1
+ move.l %a6, %a5 | 2b | x1 = x0
+1:
+
+ subq.l #1, %d4 | 2b | dstrem <= 0?
+ ble.b .hrs_channel_done | 2b | yes? stop
+ cmp.l %d3, %d0 |
+ bge.b .hrs_channel_done |
+
+.hrs_dsstart:
+ move.l (%a1), %a6 | %a6 = s[pos]
+ move.l %a5, %d5 | v
+ sub.l %a4, %d5 |
+
+ move.l %a6, %d6 | c3
+ sub.l %a3, %d6 |
+ sub.l %d5, %d6 |
+ asr.l #1, %d6 |
+ sub.l %d5, %d6 |
+
+ lea.l (%a3, %d5.l*2), %a0 | c2
+ move.l %a6, %d5 |
+ add.l %a4, %d5 |
+ asr.l #1, %d5 |
+ sub.l %d5, %a0 |
+
+ move.l %d2, %d5 | phase -> frac
+ lsl.l %d7, %d5 |
+ lsr.l #1, %d5 |
+
+ move.l %a0, %acc0 | %acc0 = frac * c3 + c2
+ mac.l %d5, %d6, %acc0 |
+
+ move.l %a5, %d6 | c1
+ sub.l %a3, %d6 |
+ asr.l #1, %d6 |
+
+ movclr.l %acc0, %a0 | %acc1 = frac * acc + c1
+ move.l %d6, %acc1 |
+ mac.l %d5, %a0, %acc1 |
+
+ move.l %d0, %a0 | %a0 = last_pos
+ add.l %d1, %d2 | phase += delta
+ move.l %d2, %d0 | pos = phase >> 16
lsr.l %d7, %d0 |
- movclr.l %acc0, %a5 |
- move.l %a5, (%a4)+ | *d++ = %d0
- subq.l #1, %d5 | dst full?
- ble.b 32f | yes? stop
- cmp.l %d4, %d0 | pos < srcrem?
- blt.b 30b | yes? continue resampling
- tpf.l | trap cmp.l and ble.b
-32: |
- cmp.l %d4, %d0 | pos = MIN(pos, srcrem)
- ble.b 33f |
- move.l %d4, %d0 |
-33: |
- move.l -4(%a3, %d0.l*4), %d6 | %d6 = s[pos - 1]
- |
-.lrs_channel_complete: |
- move.l %d6, 4(%a0, %d1.l*4) | last_sample[ch] = last
- subq.l #1, %d1 | ch > 0?
- bgt.w .lrs_channel_loop | yes? process next channel
- |
+
+ movclr.l %acc1, %d6 | %acc0 = frac * acc + x2
+ move.l %a4, %acc0 |
+ mac.l %d5, %d6, %acc0 |
+
+ cmp.l %d3, %d0 | %d0 = MIN(pos, srcrem)
+ ble.w .hrs_dsloop |
+ move.l %d3, %d0 |
+ bra.w .hrs_dsloop |
+
+.hrs_channel_done: |
+ movem.l (%sp), %d5/%a0 | restore ch, h
+ movem.l %a3-%a5, (%a0) | h[0..2] = x3..x1
+ lea.l 12(%a0), %a5 | h++
+ movem.l 56(%sp), %a0-%a2 | load data, src, dst
+ subq.l #1, %d5 | ch > 0?
+ bgt.w .hrs_channel_loop | yes? process next channel
+
move.l 12(%a2), %d1 | %d1 = dst->bufcount
- sub.l %d5, %d1 | written = dst->bufcount - dstrem
+ sub.l %d4, %d1 | written = dst->bufcount - dstrem
move.l %d1, (%a2) | dst->remcount = written
move.l %d0, %d1 | wrap phase to position in next frame
lsl.l %d7, %d1 | data->phase = phase - (pos << 16)
- sub.l %d1, %d3 | ...
- move.l %d3, 4(%a0) | ...
- movem.l (%sp), %d2-%d7/%a2-%a6 | restore non-volatiles
- lea.l 44(%sp), %sp | cleanup stack
+ sub.l %d1, %d2 |
+ move.l %d2, 4(%a0) |
+ movem.l 8(%sp), %d2-%d7/%a2-%a6 | restore non-volatiles
+ lea.l 52(%sp), %sp | cleanup stack
rts | buh-bye
- .size resample_linear, .-resample_linear
-
+ .size resample_hermite, .-resample_hermite
/****************************************************************************
* void channel_mode_proc_mono(struct dsp_proc_entry *this,
diff --git a/lib/rbcodec/dsp/resample.c b/lib/rbcodec/dsp/resample.c
index 4a8997cc44..176fedc742 100644
--- a/lib/rbcodec/dsp/resample.c
+++ b/lib/rbcodec/dsp/resample.c
@@ -9,6 +9,7 @@
*
* Copyright (C) 2005 Miika Pekkarinen
* Copyright (C) 2012 Michael Sevakis
+ * Copyright (C) 2013 Michael Giacomelli
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -31,9 +32,7 @@
* of our inability to look into the future at the end of a frame.
*/
-#if 0 /* Set to '1' to enable debug messages */
-#include <debug.h>
-#else
+#if 1 /* Set to '0' to enable debug messages */
#undef DEBUGF
#define DEBUGF(...)
#endif
@@ -46,24 +45,24 @@ static int32_t resample_out_bufs[3][RESAMPLE_BUF_COUNT] IBSS_ATTR;
/* Data for each resampler on each DSP */
static struct resample_data
{
- uint32_t delta; /* 00h: Phase delta for each step */
- uint32_t phase; /* 04h: Current phase [pos16|frac16] */
- int32_t last_sample[2]; /* 08h: Last samples for interpolation (L+R) */
- /* 10h */
- int32_t frequency; /* Virtual samplerate */
+ uint32_t delta; /* 00h: Phase delta for each step in s15.16*/
+ uint32_t phase; /* 04h: Current phase [pos16|frac16] */
+ int32_t history[2][3]; /* 08h: Last samples for interpolation (L+R)
+ 0 = oldest, 2 = newest */
+ /* 20h */
+ int32_t frequency; /* Virtual samplerate */
struct dsp_buffer resample_buf; /* Buffer descriptor for resampled data */
- int32_t *resample_out_p[2]; /* Actual output buffer pointers */
+ int32_t *resample_out_p[2]; /* Actual output buffer pointers */
} resample_data[DSP_COUNT] IBSS_ATTR;
/* Actual worker function. Implemented here or in target assembly code. */
-int resample_linear(struct resample_data *data, struct dsp_buffer *src,
- struct dsp_buffer *dst);
+int resample_hermite(struct resample_data *data, struct dsp_buffer *src,
+ struct dsp_buffer *dst);
static void resample_flush_data(struct resample_data *data)
{
data->phase = 0;
- data->last_sample[0] = 0;
- data->last_sample[1] = 0;
+ memset(&data->history, 0, sizeof (data->history));
}
static void resample_flush(struct dsp_proc_entry *this)
@@ -84,8 +83,8 @@ static bool resample_new_delta(struct resample_data *data,
if (frequency == NATIVE_FREQUENCY)
{
/* NOTE: If fully glitch-free transistions from no resampling to
- resampling are desired, last_sample history should be maintained
- even when not resampling. */
+ resampling are desired, history should be maintained even when
+ not resampling. */
resample_flush_data(data);
return false;
}
@@ -94,9 +93,8 @@ static bool resample_new_delta(struct resample_data *data,
}
#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
-/* Where the real work is done */
-int resample_linear(struct resample_data *data, struct dsp_buffer *src,
- struct dsp_buffer *dst)
+int resample_hermite(struct resample_data *data, struct dsp_buffer *src,
+ struct dsp_buffer *dst)
{
int ch = src->format.num_channels - 1;
uint32_t count = MIN(src->remcount, 0x8000);
@@ -111,35 +109,70 @@ int resample_linear(struct resample_data *data, struct dsp_buffer *src,
d = dst->p32[ch];
int32_t *dmax = d + dst->bufcount;
+ /* Restore state */
phase = data->phase;
pos = phase >> 16;
pos = MIN(pos, count);
- int32_t last = pos > 0 ? s[pos - 1] : data->last_sample[ch];
-
- if (pos < count)
+ while (pos < count && d < dmax)
{
- while (1)
- {
- *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
- phase += delta;
- pos = phase >> 16;
-
- if (pos >= count || d >= dmax)
- break;
+ int x0, x1, x2, x3;
- if (pos > 0)
- last = s[pos - 1];
+ if (pos < 3)
+ {
+ x3 = data->history[ch][pos+0];
+ x2 = pos < 2 ? data->history[ch][pos+1] : s[pos-2];
+ x1 = pos < 1 ? data->history[ch][pos+2] : s[pos-1];
}
-
- if (pos > 0)
+ else
{
- pos = MIN(pos, count);
- last = s[pos - 1];
+ x3 = s[pos-3];
+ x2 = s[pos-2];
+ x1 = s[pos-1];
}
+
+ x0 = s[pos];
+
+ int32_t frac = (phase & 0xffff) << 15;
+
+ /* 4-point, 3rd-order Hermite/Catmull-Rom spline (x-form):
+ * c1 = -0.5*x3 + 0.5*x1
+ * = 0.5*(x1 - x3) <--
+ *
+ * v = x1 - x2, -v = x2 - x1
+ * c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0
+ * = x3 + 2*(x1 - x2) - 0.5*(x0 + x2)
+ * = x3 + 2*v - 0.5*(x0 + x2) <--
+ *
+ * c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0
+ * = 0.5*x0 - 0.5*x3 + 0.5*(x2 - x1) + (x2 - x1)
+ * = 0.5*(x0 - x3 - v) - v <--
+ *
+ * polynomial coefficients */
+ int32_t c1 = (x1 - x3) >> 1;
+ int32_t v = x1 - x2;
+ int32_t c2 = x3 + 2*v - ((x0 + x2) >> 1);
+ int32_t c3 = ((x0 - x3 - v) >> 1) - v;
+
+ /* Evaluate polynomial at time 'frac'; Horner's rule. */
+ int32_t acc;
+ acc = FRACMUL(c3, frac) + c2;
+ acc = FRACMUL(acc, frac) + c1;
+ acc = FRACMUL(acc, frac) + x2;
+
+ *d++ = acc;
+
+ phase += delta;
+ pos = phase >> 16;
}
- data->last_sample[ch] = last;
+ pos = MIN(pos, count);
+
+ /* Save delay samples for next time. Must do this even if pos was
+ * clamped before loop in order to keep record up to date. */
+ data->history[ch][0] = pos < 3 ? data->history[ch][pos+0] : s[pos-3];
+ data->history[ch][1] = pos < 2 ? data->history[ch][pos+1] : s[pos-2];
+ data->history[ch][2] = pos < 1 ? data->history[ch][pos+2] : s[pos-1];
}
while (--ch >= 0);
@@ -147,7 +180,6 @@ int resample_linear(struct resample_data *data, struct dsp_buffer *src,
data->phase = phase - (pos << 16);
dst->remcount = d - dst->p32[0];
-
return pos;
}
#endif /* CPU */
@@ -175,7 +207,7 @@ static void resample_process(struct dsp_proc_entry *this,
{
dst->bufcount = RESAMPLE_BUF_COUNT;
- int consumed = resample_linear(data, src, dst);
+ int consumed = resample_hermite(data, src, dst);
/* Advance src by consumed amount */
if (consumed > 0)