summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Thom Johansen <thomj@rockbox.org> 2007-10-24 22:39:08 +0000
committer Thom Johansen <thomj@rockbox.org> 2007-10-24 22:39:08 +0000
commit cd9fc7a2b95204f0169e20409583278a13fe1ded (patch)
tree 23f66a76141a583b747785d16a891691c5358916
parent fb709522283bfb7558bf2b824a4143a919d59e97 (diff)
downloadrockbox-cd9fc7a2b95204f0169e20409583278a13fe1ded.tar.gz
rockbox-cd9fc7a2b95204f0169e20409583278a13fe1ded.zip
Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/libspeex/filters.c 3
-rw-r--r-- apps/codecs/libspeex/filters_cf.S 182
2 files changed, 168 insertions, 17 deletions
diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c
index 02f93a27b1..e64f087a5d 100644
--- a/apps/codecs/libspeex/filters.c
+++ b/apps/codecs/libspeex/filters.c
@@ -47,6 +47,7 @@
#include "filters_arm4.h"
#elif defined (COLDFIRE_ASM)
#define OVERRIDE_IIR_MEM16
+#define OVERRIDE_QMF_SYNTH
#elif defined (BFIN_ASM)
#include "filters_bfin.h"
#endif
@@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
}
}
+#ifndef OVERRIDE_QMF_SYNTH
/* Re-synthesised a signal from the QMF low-band and high-band signals */
void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
/* assumptions:
@@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
for (i = 0; i < M2; i++)
mem2[2*i+1] = xx2[i];
}
+#endif
#ifdef FIXED_POINT
#if 0
diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S
index 579af11581..dd650844c8 100644
--- a/apps/codecs/libspeex/filters_cf.S
+++ b/apps/codecs/libspeex/filters_cf.S
@@ -31,7 +31,6 @@
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-
.text
/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
.global iir_mem16
@@ -59,14 +58,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
- move.l #32768, %d1
- add.l %d1, %d0 | Bias result to [0..65535]
- cmp.l #65535, %d0 | Clip to [0..65535] range
- jle 1f
- spl.b %d0
- ext.w %d0
+ move.l #32767, %d1
+ move.l #65534, %a6
+ add.l %d1, %d0 | Bias result to [-1..65534]
+ cmp.l %a6, %d0 | Now do clip to [0..65534] range
+ jls 2f
+ jpl 1f
+ clr.l %d0 | Clip low
+ .word 0x51fa | trapf.w, shadow next insn
1:
- sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
+ move.l %a6, %d0 | Clip high
+2:
+ sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@@ -111,14 +114,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
- move.l #32768, %d1
- add.l %d1, %d0 | Bias result to [0..65535]
- cmp.l #65535, %d0 | Clip to [0..65535] range
- jle 1f
- spl.b %d0
- ext.w %d0
+ move.l #32767, %d1
+ move.l #65534, %a6
+ add.l %d1, %d0 | Bias result to [-1..65534]
+ cmp.l %a6, %d0 | Now do clip to [0..65534] range
+ jls 2f
+ jpl 1f
+ clr.l %d0 | Clip low
+ .word 0x51fa | trapf.w, shadow next insn
1:
- sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
+ move.l %a6, %d0 | Clip high
+2:
+ sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@@ -159,7 +166,148 @@ iir_mem16:
movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
.exit:
- movem.l (%sp), %d2-%d7/%a2-%a6
- lea.l (44, %sp), %sp
+ movem.l (%sp), %d2-%d7/%a2-%a6
+ lea.l (44, %sp), %sp
+ rts
+
+/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
+ .global qmf_synth
+qmf_synth:
+ lea.l (-44, %sp), %sp | Reserve 11 longwords for the register save area
+ movem.l %d2-%d7/%a2-%a6, (%sp) | Save d2-d7/a2-a6, restored before rts
+ movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y
+ movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1, a5 = mem2
+ move.l #0x80, %macsr | Enable saturation (macsr is written back to 0 on exit)
+
+ | Comments make more sense when compared to the reference C version
+ move.l %a2, %d6 | Backup a, since a2 is reused as the xx1 pointer below
+ lsr.l #1, %d0 | N2 = N >> 1
+ lsr.l #1, %d1 | M2 = M >> 1
+ move.l %d1, %d7 | Backup M2
+ clr.l %d2
+ sub.l %d0, %d2 | d2 = -N2
+ sub.l %d1, %d2 | d2 = -(N2 + M2)
+ lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts
+ lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2 (xx2 sits directly below xx1)
+ move.l %sp, %d3 | d3 = sp pointing at the register save area
+ move.l %a6, %sp | Update sp so it lies below both buffers
+ move.l %d3, -(%sp) | Stack old %sp, reloaded just before the register restore
+
+ | Backwards copy x1 and x2 arrays to xx1 and xx2
+ | TODO: these copying loops probably have more potential for optimization
+ lea.l (%a0, %d0.l*2), %a0 | x1 += N2
+ lea.l (%a1, %d0.l*2), %a1 | x2 += N2
+ move.l %d0, %d2 | Loop counter is N2
+0:
+ move.w -(%a0), (%a2)+ | xx1[i] = x1[N2 - 1 - i]
+ move.w -(%a1), (%a6)+ | xx2[i] = x2[N2 - 1 - i]
+ subq.l #1, %d2
+ jne 0b
+
+ | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
+ move.l %d1, %d2 | Loop counter is M2
+ addq.l #4, %a4 | a4 = &mem1[1]
+ addq.l #4, %a5 | a5 = &mem2[1]
+ move.l %a4, %d3 | Backup mem1 and mem2
+ move.l %a5, %d4
+0:
+ move.l (%a4), %d5 | d5 = mem1[2*i + 1]
+ move.w %d5, (%a2)+ | xx1[N2 + i] = low word of d5
+ move.l (%a5), %d5 | d5 = mem2[2*i + 1]
+ move.w %d5, (%a6)+ | xx2[N2 + i] = low word of d5
+ addq.l #8, %a4 | Step over two 32-bit entries to the next odd index
+ addq.l #8, %a5
+ subq.l #1, %d2
+ jne 0b
+ move.l %d3, %a4 | a4 = &mem1[1]
+ move.l %d4, %a5 | a5 = &mem2[1]
+
+ clr.l %d2
+ sub.l %d1, %d2 | d2 = -M2
+ lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
+ lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
+ move.l %d6, %a2 | a2 = a
+
+ | Main loop, register usage:
+ | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup (both counters step down by 2)
+ | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = coef pair [a0, a1] (upper, lower word)
+ | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = &mem1[1], a5 = &mem2[1]
+0: | Outerloop, produces four y[] samples per pass
+ move.l #32768, %d2 | Rounding constant, half of the low word dropped after the swaps
+ move.l %d2, %acc0
+ move.l %d2, %acc1
+ move.l %d2, %acc2
+ move.l %d2, %acc3
+ move.w (%a0)+, %d2 | d2 = x10
+ move.w (%a1)+, %d4 | d4 = x20
+ move.l (%a2)+, %d6 | d6 = [a0, a1]
+1: | Innerloop, consumes two coef pairs per pass
+ move.w (%a0)+, %d3 | d3 = x11
+ move.w (%a1)+, %d5 | d5 = x21
+ mac.w %d6u, %d3l, #1, %acc0 | acc0 += a0*x11
+ msac.w %d6u, %d5l, #1, %acc0 | acc0 -= a0*x21
+ mac.w %d6l, %d3l, #1, %acc1 | acc1 += a1*x11
+ mac.w %d6l, %d5l, #1, %acc1 | acc1 += a1*x21
+ mac.w %d6u, %d2l, #1, %acc2 | acc2 += a0*x10
+ msac.w %d6u, %d4l, #1, %acc2 | acc2 -= a0*x20
+ mac.w %d6l, %d2l, #1, %acc3 | acc3 += a1*x10
+ mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20, and fetch next [a0, a1] into d6
+
+ move.w (%a0)+, %d2 | d2 = x10
+ move.w (%a1)+, %d4 | d4 = x20
+ mac.w %d6u, %d2l, #1, %acc0 | acc0 += a0*x10
+ msac.w %d6u, %d4l, #1, %acc0 | acc0 -= a0*x20
+ mac.w %d6l, %d2l, #1, %acc1 | acc1 += a1*x10
+ mac.w %d6l, %d4l, #1, %acc1 | acc1 += a1*x20
+ mac.w %d6u, %d3l, #1, %acc2 | acc2 += a0*x11
+ msac.w %d6u, %d5l, #1, %acc2 | acc2 -= a0*x21
+ mac.w %d6l, %d3l, #1, %acc3 | acc3 += a1*x11
+ mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21, and fetch next [a0, a1]; the pair fetched on the last pass is never used (NOTE(review): it lies one longword past the coefs consumed - confirm a[] sizing)
+ subq.l #2, %d1 | Two coef pairs done; loop exits only when d1 hits exactly 0, so M2 is expected to be even and non-zero
+ jne 1b
+
+ sub.l %d7, %d1 | d1 = -M2 (d1 is 0 after the inner loop)
+ lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0]
+ lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i]
+ lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i]
+ neg.l %d1 | d1 = M2
+ movclr.l %acc0, %d2 | Fetch the four sums, clearing the accumulators
+ movclr.l %acc1, %d3
+ movclr.l %acc2, %d4
+ movclr.l %acc3, %d5
+ swap.w %d2 | Shift 16 right
+ swap.w %d3
+ swap.w %d4
+ swap.w %d5
+ | Thanks to the extra shift in the mac chain, we get clipping for free.
+ | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
+ | but since qmf_synth() is called so late in the signal chain, it should
+ | work fine.
+ move.w %d2, (%a3)+ | Write results to y[]
+ move.w %d3, (%a3)+
+ move.w %d4, (%a3)+
+ move.w %d5, (%a3)+
+ subq.l #2, %d0 | Loop exits only when d0 hits exactly 0, so N2 is expected to be even and non-zero
+ jne 0b
+
+ | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
+ addq.l #4, %a0 | a0 = &xx1[0]
+ addq.l #4, %a1 | a1 = &xx2[0]
+0:
+ move.w (%a0)+, %d2 | d2 = xx1[i]
+ move.w (%a1)+, %d3 | d3 = xx2[i]
+ ext.l %d2 | Sign extend both to 32 bits
+ ext.l %d3
+ move.l %d2, (%a4) | mem1[2*i + 1] = xx1[i]
+ move.l %d3, (%a5) | mem2[2*i + 1] = xx2[i]
+ addq.l #8, %a4 | Advance to the next odd-indexed entry
+ addq.l #8, %a5
+ subq.l #1, %d1 | d1 was reset to M2 after the last inner loop
+ jne 0b
+
+ move.l #0, %macsr | Clear macsr; NOTE(review): writes 0, not the value macsr held on entry
+ move.l (%sp), %sp | Reload the sp stacked in the prologue, releasing xx1/xx2
+ movem.l (%sp), %d2-%d7/%a2-%a6 | Restore saved registers
+ lea.l (44, %sp), %sp | Release the register save area
rts