summaryrefslogtreecommitdiffstats
path: root/apps/codecs
diff options
context:
space:
mode:
authorDave Chapman <dave@dchapman.com>2007-07-28 15:21:25 +0000
committerDave Chapman <dave@dchapman.com>2007-07-28 15:21:25 +0000
commit66b51909c09bda9910b5c891b241cb9cf8556970 (patch)
treecffa940b47e71edd3370159eba80a551d4cfe36b /apps/codecs
parent488b3db547a09b25eac212e77ccb64ef81f8ce3f (diff)
downloadrockbox-66b51909c09bda9910b5c891b241cb9cf8556970.tar.gz
rockbox-66b51909c09bda9910b5c891b241cb9cf8556970.zip
FS #6705 - ARM optimisations for libmad by Tomasz Malesinski. Modified slightly by me to not put code in IRAM for PP502x (it's slower), and for the mpegplayer version of libmad for PP5002 (there isn't enough room). On my ipod Color, it increases a 320kbps MP3 test file from 169% realtime to 188% realtime. Reported speedup on the ipod 3G was from 118% to 155% realtime for a 192kbps MP3.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14041 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r--apps/codecs/libmad/SOURCES2
-rw-r--r--apps/codecs/libmad/bit.c28
-rw-r--r--apps/codecs/libmad/dct32_arm.S332
-rw-r--r--apps/codecs/libmad/layer3.c32
-rw-r--r--apps/codecs/libmad/synth.c375
-rw-r--r--apps/codecs/libmad/synth_full_arm.S343
6 files changed, 1091 insertions, 21 deletions
diff --git a/apps/codecs/libmad/SOURCES b/apps/codecs/libmad/SOURCES
index 74b8f2889d..a4b6e8ca9a 100644
--- a/apps/codecs/libmad/SOURCES
+++ b/apps/codecs/libmad/SOURCES
@@ -14,4 +14,6 @@ imdct_mcf5249.S
#endif
#if defined(CPU_ARM) && !defined(SIMULATOR)
imdct_l_arm.S
+dct32_arm.S
+synth_full_arm.S
#endif
diff --git a/apps/codecs/libmad/bit.c b/apps/codecs/libmad/bit.c
index 85c5baadd7..6c984ef078 100644
--- a/apps/codecs/libmad/bit.c
+++ b/apps/codecs/libmad/bit.c
@@ -128,13 +128,7 @@ void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
* NAME: bit->read()
* DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value
*/
-unsigned long bmask[] ICONST_ATTR =
-{ 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f,
- 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff,
- 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff,
- 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
- 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff,
- 0x3fffffff, 0x7fffffff, 0xffffffff };
+
unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR;
unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
{
@@ -142,19 +136,13 @@ unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
if(len)
{
- if((bitptr->readbit ^ (bitptr->readbit + len - 1)) < 32)
- {
- bitptr->readbit += len;
-
- return (betoh32(curr[0]) >> (-bitptr->readbit & 31)) & bmask[len];
- }
- else
- {
- bitptr->readbit += len;
-
- return ((betoh32(curr[0]) << ( bitptr->readbit & 31))
- + (betoh32(curr[1]) >> (-bitptr->readbit & 31))) & bmask[len];
- }
+ unsigned long r = betoh32(curr[0]) << (bitptr->readbit & 31);
+
+ if((bitptr->readbit & 31) + len > 32)
+ r += betoh32(curr[1]) >> (-bitptr->readbit & 31);
+
+ bitptr->readbit += len;
+ return r >> (32 - len);
}
return 0;
diff --git a/apps/codecs/libmad/dct32_arm.S b/apps/codecs/libmad/dct32_arm.S
new file mode 100644
index 0000000000..4d94896b0b
--- /dev/null
+++ b/apps/codecs/libmad/dct32_arm.S
@@ -0,0 +1,332 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2007 by Tomasz Malesinski
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+ .global dct32
+
+/* This performs slower in IRAM on PP502x and there is no space in
+ mpegplayer on the PP5002 */
+#if defined(CPU_PP502x) || (CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
+ .section .text,"ax",%progbits
+#else
+ .section .icode,"ax",%progbits
+#endif
+
+dct32: /* void dct32(in = r0, slot = r1, lo = r2, hi = r3), matching the prototype added to synth.c */
+ stmdb r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} /* save work registers and return address */
+ sub r13, r13, #144 /* frame: 16 bytes of saved arguments + 128-byte (32-word) scratch buffer */
+ str r0, [r13, #12] /* [sp+12] = in */
+ str r1, [r13, #8] /* [sp+8] = slot */
+ str r2, [r13, #4] /* [sp+4] = lo */
+ str r3, [r13] /* [sp+0] = hi */
+ add r0, r13, #16 /* r0 = write cursor into scratch */
+ add r1, r0, #128 /* r1 = end of scratch */
+ ldr r2, =bitrev /* r2 walks the 4-entry bitrev table */
+.shuffle: /* stage 1: 4 passes, each reads 8 inputs and writes 8 scratch words */
+ ldr r5, [r13, #12] /* r5 = in */
+ ldr r3, [r2], #4 /* r3 = next bitrev entry k (0, 2, 1, 3) */
+ sub r4, r5, r3, lsl #4 /* r4 = in - 16*k bytes, used for the mirrored element */
+ add r3, r5, r3, lsl #4 /* r3 = in + 16*k bytes */
+ ldr r6, [r3] /* in[4k] */
+ ldr r8, [r4, #124] /* in[31-4k] */
+ add r6, r6, r8 /* butterfly idiom used throughout: r6 = a + b ... */
+ sub r8, r6, r8, lsl #1 /* ... r8 = (a + b) - 2b = a - b */
+ ldr r7, [r3, #8] /* in[4k+2] */
+ ldr lr, [r4, #116] /* in[29-4k] */
+ add r7, r7, lr /* r7 = sum */
+ sub lr, r7, lr, lsl #1 /* lr = difference */
+ ldr r10, [r3, #64] /* in[4k+16] */
+ ldr r9, [r4, #60] /* in[15-4k] */
+ add r10, r10, r9 /* r10 = sum */
+ sub r9, r10, r9, lsl #1 /* r9 = difference */
+ ldr r11, [r3, #72] /* in[4k+18] */
+ ldr r12, [r4, #52] /* in[13-4k] */
+ add r11, r11, r12 /* r11 = sum */
+ sub r12, r11, r12, lsl #1 /* r12 = difference */
+ add r6, r6, r10 /* second level of butterflies on the four sums/differences */
+ sub r10, r6, r10, lsl #1
+ add r7, r7, r11
+ sub r11, r7, r11, lsl #1
+ add r8, r8, r12
+ sub r12, r8, r12, lsl #1
+ add lr, lr, r9
+ sub r9, lr, r9, lsl #1
+ stmia r0!, {r6, r7, r8, r9, r10, r11, r12, lr} /* store 8 results, advance scratch cursor by 32 bytes */
+ cmp r0, r1 /* scratch full after 4 passes */
+ bne .shuffle
+ ldr r0, =189812531 /* 0x0B504F33 = round(2^28 / sqrt(2)), i.e. cos(pi/4) in Q28 */
+ add r1, r13, #16 /* r1 = scratch start */
+ add r3, r1, #128 /* r3 = scratch end */
+.l2: /* stage 2: 2 passes, each pairing a 32-byte block at r1 with the block 32 bytes above it */
+ add r2, r1, #32 /* r2 = partner block */
+ ldmia r2, {r4, r5, r8, r9} /* partner words 0..3 */
+ ldmia r1, {r6, r7, r10, r11} /* own words 0..3 */
+ add r6, r6, r4 /* word 0: sum / difference */
+ sub r4, r6, r4, lsl #1
+ add r7, r7, r5 /* word 1: sum / difference */
+ sub r5, r7, r5, lsl #1
+ stmia r2!, {r4, r5} /* differences go to the partner block */
+ stmia r1!, {r6, r7} /* sums stay in the own block */
+ add r9, r9, r8 /* r9 = partner[3] + partner[2] */
+ sub r8, r9, r8, lsl #1 /* r8 = partner[3] - partner[2] */
+ smull r4, r6, r9, r0 /* Q28 multiply by the constant in r0: r6:r4 = 64-bit product */
+ movs r4, r4, lsr #28 /* low part >> 28; carry = last bit shifted out (bit 27) */
+ adc r4, r4, r6, lsl #4 /* r4 = (product >> 28) + carry, i.e. rounded */
+ smull r5, r6, r8, r0 /* same rounded Q28 multiply for the difference */
+ movs r5, r5, lsr #28
+ adc r5, r5, r6, lsl #4
+ add r10, r10, r4 /* butterflies of own words 2..3 with the scaled values */
+ sub r4, r10, r4, lsl #1
+ add r11, r11, r5
+ sub r5, r11, r5, lsl #1
+ stmia r2!, {r4, r5}
+ stmia r1!, {r10, r11}
+ ldmia r2, {r5, r6, r8, r11} /* partner words 4..7 (note the register order) */
+ ldmia r1, {r4, r7, r9, r10} /* own words 4..7 */
+ add r4, r4, r6 /* own[4] paired with partner[5] */
+ sub r6, r4, r6, lsl #1
+ add r7, r7, r5 /* own[5] paired with partner[4] */
+ sub r5, r7, r5, lsl #1
+ stmia r2!, {r6, r7} /* partner gets {difference, sum} */
+ stmia r1!, {r4, r5} /* own block gets {sum, difference} */
+ add r11, r11, r8 /* r11 = partner[7] + partner[6] */
+ sub r8, r11, r8, lsl #1 /* r8 = partner[7] - partner[6] */
+ smull r5, r4, r8, r0 /* rounded Q28 multiply of the difference by r0 */
+ movs r5, r5, lsr #28
+ adc r5, r5, r4, lsl #4
+ smull r6, r4, r11, r0 /* rounded Q28 multiply of the sum by r0 */
+ movs r6, r6, lsr #28
+ adc r6, r6, r4, lsl #4
+ add r9, r9, r5
+ sub r5, r9, r5, lsl #1
+ sub r10, r10, r6 /* reversed order here: r10 = own[7] - scaled sum ... */
+ add r6, r10, r6, lsl #1 /* ... r6 = own[7] + scaled sum */
+ stmia r2!, {r5, r6}
+ stmia r1!, {r9, r10}
+ add r1, r1, #32 /* skip the partner block: 64 bytes consumed per pass */
+ cmp r1, r3
+ bne .l2
+ add r2, r13, #16 /* r2 = lower half of scratch */
+ add r3, r2, #64 /* r3 = upper half of scratch */
+ ldr r0, =sincos /* r0 = coefficient cursor */
+ add r1, r0, #128 /* r1 = end of the sincos table (8 passes of 16 bytes) */
+.lbut8: /* stage 3: 8 passes, rotate a pair from the upper half and butterfly it with the lower half */
+ ldmia r3, {r7, r8} /* pair (x, y) from the upper half */
+ ldmia r0, {r9, r10} /* coefficient pair (c0, c1) from sincos */
+ add r0, r0, #16 /* advance two pairs: only every other sincos entry is used here */
+ smull r6, r5, r7, r9 /* r5:r6 = x*c0 */
+ smlal r6, r5, r10, r8 /* += y*c1 */
+ movs r6, r6, lsr #28 /* rounded >> 28, as above */
+ adc r6, r6, r5, lsl #4 /* r6 = Q28(x*c0 + y*c1) */
+ smull r10, r5, r7, r10 /* r5:r10 = x*c1 */
+ rsb r9, r9, #0 /* negate c0 */
+ smlal r10, r5, r8, r9 /* -= y*c0 */
+ movs r10, r10, lsr #28
+ adc r5, r10, r5, lsl #4 /* r5 = Q28(x*c1 - y*c0) */
+ ldmia r2, {r7, r8} /* pair from the lower half */
+ add r7, r7, r5 /* sum / difference with the rotated pair */
+ sub r5, r7, r5, lsl #1
+ add r8, r8, r6
+ sub r6, r8, r6, lsl #1
+ stmia r3!, {r5, r6} /* differences to the upper half */
+ stmia r2!, {r7, r8} /* sums to the lower half */
+ cmp r0, r1
+ bne .lbut8
+ add r1, r13, #16 /* r1 = forward cursor over scratch pairs */
+ ldr r2, =sincos /* r2 = sincos cursor, consecutive pairs this time */
+ ldr r3, =sincos2 /* r3 = sincos2 cursor */
+ ldr r0, [r13, #8] /* r0 = slot */
+ mov r0, r0, lsl #2 /* slot as a byte offset */
+ ldr r4, [r13, #4] /* r4 = lo */
+ add r4, r4, r0 /* r4 = &lo[0][slot] (rows are 8 words = 32 bytes) */
+ ldr r5, [r13] /* r5 = hi */
+ add r5, r5, #480 /* 480 = 15 rows * 32 bytes */
+ add r5, r5, r0 /* r5 = &hi[15][slot] */
+ mov r0, #0 /* r0 = pass counter k = 0..8 */
+.l4: /* final stage: 9 passes producing the 16 lo and 16 hi outputs for this slot */
+ rsb r12, r0, #16
+ and r12, r12, #15 /* r12 = (16 - k) & 15 */
+ add lr, r13, #16
+ add r12, lr, r12, lsl #3 /* r12 = address of scratch pair (16 - k) & 15 */
+ ldmia r1!, {r10, r11} /* scratch pair k */
+ ldmia r12, {r6, r7} /* mirrored scratch pair */
+ add r6, r6, r10 /* sum / difference of first words */
+ sub r10, r6, r10, lsl #1
+ add r11, r11, r7 /* sum / difference of second words */
+ sub r7, r11, r7, lsl #1
+ ldmia r2!, {r12, lr} /* next sincos pair (c0, c1) */
+ smull r9, r8, r11, r12
+ smlal r9, r8, lr, r10
+ movs r9, r9, lsr #28
+ adc r9, r9, r8, lsl #4 /* r9 = Q28(r11*c0 + r10*c1) */
+ smull lr, r8, r11, lr
+ rsb r12, r12, #0
+ smlal lr, r8, r10, r12
+ movs lr, lr, lsr #28
+ adc r8, lr, r8, lsl #4 /* r8 = Q28(r11*c1 - r10*c0) */
+ add r6, r6, r8 /* butterflies with the rotated values */
+ sub r8, r6, r8, lsl #1
+ add r7, r7, r9
+ sub r9, r7, r9, lsl #1
+ add lr, r3, #128 /* sincos2 pair 16 entries ahead of the cursor */
+ ldmia lr, {r10, r11}
+ smull lr, r12, r8, r11
+ smlal lr, r12, r9, r10
+ movs lr, lr, lsr #28
+ adc r12, lr, r12, lsl #4 /* r12 = Q28(r8*r11 + r9*r10) */
+ str r12, [r4], #32 /* lo[k][slot], then advance one row */
+ cmp r0, #0
+ cmpne r0, #8
+ beq .skip1 /* the companion output is not produced when k is 0 or 8 */
+ smull lr, r12, r8, r10
+ rsb r9, r9, #0
+ smlal lr, r12, r9, r11
+ movs lr, lr, lsr #28
+ adc r12, lr, r12, lsl #4 /* r12 = Q28(r8*r10 - r9*r11) */
+ add lr, r5, r0, lsl #6
+ str r12, [lr, #-512] /* address works out to hi[k-1][slot] */
+.skip1:
+ ldmia r3!, {r10, r11} /* sincos2 pair at the cursor, then advance */
+ smull lr, r12, r7, r10
+ smlal lr, r12, r6, r11
+ movs lr, lr, lsr #28
+ adc r12, lr, r12, lsl #4 /* r12 = Q28(r7*r10 + r6*r11) */
+ str r12, [r5], #-32 /* hi[15-k][slot], then step back one row */
+ cmp r0, #0
+ cmpne r0, #8
+ beq .skip2 /* again no companion output when k is 0 or 8 */
+ smull lr, r12, r6, r10
+ rsb r7, r7, #0
+ smlal lr, r12, r7, r11
+ movs lr, lr, lsr #28
+ adc r12, lr, r12, lsl #4 /* r12 = Q28(r6*r10 - r7*r11) */
+ sub lr, r4, r0, lsl #6
+ str r12, [lr, #480] /* address works out to lo[16-k][slot] */
+.skip2:
+ add r0, r0, #1
+ cmp r0, #9 /* 9 passes: k = 0..8 */
+ bne .l4
+ add r13, r13, #144 /* drop the local frame */
+ ldmia r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} /* restore registers and return */
+bitrev: /* order in which the first dct32 loop visits its 4 input groups: 2-bit bit-reversed 0..3 */
+ .word 0x0
+ .word 0x2
+ .word 0x1
+ .word 0x3
+
+sincos: /* 16 pairs; values correspond to {-sin(k*pi/16), cos(k*pi/16)}, k = 0..15, in Q28 (0x10000000 = 1.0) */
+ .word 0x0
+ .word 0x10000000
+ .word -0x31f1708
+ .word 0xfb14be8
+ .word -0x61f78aa
+ .word 0xec835e8
+ .word -0x8e39d9d
+ .word 0xd4db315
+ .word -0xb504f33
+ .word 0xb504f33
+ .word -0xd4db315
+ .word 0x8e39d9d
+ .word -0xec835e8
+ .word 0x61f78aa
+ .word -0xfb14be8
+ .word 0x31f1708
+ .word -0x10000000
+ .word 0x0
+ .word -0xfb14be8
+ .word -0x31f1708
+ .word -0xec835e8
+ .word -0x61f78aa
+ .word -0xd4db315
+ .word -0x8e39d9d
+ .word -0xb504f33
+ .word -0xb504f33
+ .word -0x8e39d9d
+ .word -0xd4db315
+ .word -0x61f78aa
+ .word -0xec835e8
+ .word -0x31f1708
+ .word -0xfb14be8
+
+sincos2: /* 32 pairs; values correspond to {sin(k*pi/64)/2, cos(k*pi/64)/2}, k = 0..31, in Q28 (0x8000000 = 0.5) */
+ .word 0x0
+ .word 0x8000000
+ .word 0x647d98
+ .word 0x7fd8879
+ .word 0xc8bd36
+ .word 0x7f62369
+ .word 0x12c8107
+ .word 0x7e9d560
+ .word 0x18f8b84
+ .word 0x7d8a5f4
+ .word 0x1f19f98
+ .word 0x7c29fbf
+ .word 0x25280c6
+ .word 0x7a7d056
+ .word 0x2b1f34f
+ .word 0x7884841
+ .word 0x30fbc55
+ .word 0x7641af4
+ .word 0x36ba201
+ .word 0x73b5ebd
+ .word 0x3c56ba7
+ .word 0x70e2cbc
+ .word 0x41ce1e6
+ .word 0x6dca0d1
+ .word 0x471cece
+ .word 0x6a6d98a
+ .word 0x4c3fdff
+ .word 0x66cf812
+ .word 0x5133cc9
+ .word 0x62f201b
+ .word 0x55f5a4d
+ .word 0x5ed77c9
+ .word 0x5a8279a
+ .word 0x5a8279a
+ .word 0x5ed77c9
+ .word 0x55f5a4d
+ .word 0x62f201b
+ .word 0x5133cc9
+ .word 0x66cf812
+ .word 0x4c3fdff
+ .word 0x6a6d98a
+ .word 0x471cece
+ .word 0x6dca0d1
+ .word 0x41ce1e6
+ .word 0x70e2cbc
+ .word 0x3c56ba7
+ .word 0x73b5ebd
+ .word 0x36ba201
+ .word 0x7641af4
+ .word 0x30fbc55
+ .word 0x7884841
+ .word 0x2b1f34f
+ .word 0x7a7d056
+ .word 0x25280c6
+ .word 0x7c29fbf
+ .word 0x1f19f98
+ .word 0x7d8a5f4
+ .word 0x18f8b84
+ .word 0x7e9d560
+ .word 0x12c8107
+ .word 0x7f62369
+ .word 0xc8bd36
+ .word 0x7fd8879
+ .word 0x647d98
diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c
index 38e488ddbb..a95927e10f 100644
--- a/apps/codecs/libmad/layer3.c
+++ b/apps/codecs/libmad/layer3.c
@@ -922,8 +922,19 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp)
}
/* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */
+# if defined(CPU_ARM) /* ARM: build the mask and extract the bit field in three instructions */
+# define MASK(cache, sz, bits) \
+ ({ unsigned long res; \
+ asm ("mov %0, #1\n\t" /* res = 1 */ \
+ "rsb %0, %0, %0, lsl %3\n\t" /* res = (1 << bits) - 1 */ \
+ "and %0, %0, %1, lsr %2" /* res &= cache >> (sz - bits) */ \
+ : "=&r" (res) : "r" (cache), "r" ((sz) - (bits)), "r" (bits)); \
+ res; \
+ })
+#else /* generic C form of the same expression */
# define MASK(cache, sz, bits) \
(((cache) >> ((sz) - (bits))) & ((1 << (bits)) - 1))
+#endif
# define MASK1BIT(cache, sz) \
((cache) & (1 << ((sz) - 1)))
@@ -1546,6 +1557,9 @@ enum mad_error III_stereo(mad_fixed_t xr[2][576],
return MAD_ERROR_NONE;
}
+#if defined(CPU_ARM)
+void III_aliasreduce(mad_fixed_t xr[576], int lines); /* CPU_ARM: defined in synth_full_arm.S instead of the C version below */
+#else
/*
* NAME: III_aliasreduce()
* DESCRIPTION: perform frequency line alias reduction
@@ -1600,6 +1614,7 @@ void III_aliasreduce(mad_fixed_t xr[576], int lines)
}
}
}
+#endif
# if defined(ASO_IMDCT)
void III_imdct_l(mad_fixed_t const [18], mad_fixed_t [36], unsigned int);
@@ -2894,6 +2909,11 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
#endif
+#ifdef CPU_ARM
+void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18], /* CPU_ARM: defined in synth_full_arm.S */
+ mad_fixed_t sample[18][32], unsigned int sb);
+#else
+
/*
* NAME: III_overlap()
* DESCRIPTION: perform overlap-add of windowed IMDCT outputs
@@ -2941,6 +2961,7 @@ void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18],
}
# endif
}
+#endif
/*
* NAME: III_overlap_z()
@@ -3142,10 +3163,21 @@ enum mad_error III_decode(struct mad_bitptr *ptr, struct mad_frame *frame,
/* (nonzero) subbands 2-31 */
+/*
i = 576;
while (i > 36 && xr[ch][i - 1] == 0)
--i;
+*/
+ {
+ /* saves ~600k cycles */
+ mad_fixed_t *p = &xr[ch][576];
+ mad_fixed_t tmp = xr[ch][35];
+ xr[ch][35] = 1;
+ while (!*--p);
+ xr[ch][35] = tmp;
+ i = p - &xr[ch][0] + 1;
+ }
sblimit = 32 - (576 - i) / 18;
if (channel->block_type != 2) {
diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c
index 8613f77f79..c3a868a0dc 100644
--- a/apps/codecs/libmad/synth.c
+++ b/apps/codecs/libmad/synth.c
@@ -67,6 +67,13 @@ void mad_synth_mute(struct mad_synth *synth)
}
}
+#ifdef FPM_ARM
+
+void dct32(mad_fixed_t const in[32], unsigned int slot, /* FPM_ARM: defined in dct32_arm.S, replacing the C dct32 */
+ mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]);
+
+#else
+
/*
* An optional optimization called here the Subband Synthesis Optimization
* (SSO) improves the performance of subband synthesis at the expense of
@@ -533,6 +540,8 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
# undef MUL
# undef SHIFT
+#endif
+
/* third SSO shift and/or D[] optimization preshift */
# if defined(OPT_SSO)
@@ -816,7 +825,370 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
}
}
-#else
+#elif defined(FPM_ARM)
+
+#define PROD_ODD_0(hi, lo, f, ptr) /* hi:lo = sum of f[0..7] * ptr[1,15,13,11,9,7,5,3] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #4]\n\t" \
+ "smull %0, %1, r0, r4\n\t" /* smull starts a fresh accumulator */ \
+ "ldr r4, [%3, #60]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #52]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #44]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #36]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #28]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #20]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #12]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "=&r" (lo), "=&r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_ODD_A(hi, lo, f, ptr) /* hi:lo += sum of f[0..7] * ptr[1,15,13,11,9,7,5,3] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #4]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" /* smlal throughout: accumulates onto the incoming hi:lo */ \
+ "ldr r4, [%3, #60]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #52]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #44]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #36]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #28]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #20]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #12]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "+r" (lo), "+r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_EVEN_0(hi, lo, f, ptr) /* hi:lo = sum of f[0..7] * ptr[0,14,12,10,8,6,4,2] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #0]\n\t" \
+ "smull %0, %1, r0, r4\n\t" /* smull starts a fresh accumulator */ \
+ "ldr r4, [%3, #56]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #48]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #40]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #32]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #24]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #16]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #8]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "=&r" (lo), "=&r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_EVEN_A(hi, lo, f, ptr) /* hi:lo += sum of f[0..7] * ptr[0,14,12,10,8,6,4,2] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #0]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" /* smlal throughout: accumulates onto the incoming hi:lo */ \
+ "ldr r4, [%3, #56]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #48]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #40]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #32]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #24]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #16]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #8]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "+r" (lo), "+r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_EVENBACK_0(hi, lo, f, ptr) /* hi:lo = sum of f[0..7] * ptr[15,17,19,21,23,25,27,29] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #60]\n\t" \
+ "smull %0, %1, r0, r4\n\t" /* smull starts a fresh accumulator */ \
+ "ldr r4, [%3, #68]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #76]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #84]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #92]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #100]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #108]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #116]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "=&r" (lo), "=&r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_EVENBACK_A(hi, lo, f, ptr) /* hi:lo += sum of f[0..7] * ptr[15,17,19,21,23,25,27,29] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #60]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" /* smlal throughout: accumulates onto the incoming hi:lo */ \
+ "ldr r4, [%3, #68]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #76]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #84]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #92]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #100]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #108]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #116]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "+r" (lo), "+r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_ODDBACK_0(hi, lo, f, ptr) /* hi:lo = sum of f[0..7] * ptr[30,16,18,20,22,24,26,28] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #120]\n\t" \
+ "smull %0, %1, r0, r4\n\t" /* smull starts a fresh accumulator */ \
+ "ldr r4, [%3, #64]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #72]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #80]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #88]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #96]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #104]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #112]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "=&r" (lo), "=&r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+#define PROD_ODDBACK_A(hi, lo, f, ptr) /* hi:lo += sum of f[0..7] * ptr[30,16,18,20,22,24,26,28] (64-bit) */ \
+ do { \
+ mad_fixed_t *__p = (f); \
+ asm("ldmia %2!, {r0, r1, r2, r3}\n\t" /* f[0..3] */ \
+ "ldr r4, [%3, #120]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" /* smlal throughout: accumulates onto the incoming hi:lo */ \
+ "ldr r4, [%3, #64]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #72]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #80]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ "ldmia %2, {r0, r1, r2, r3}\n\t" /* f[4..7] */ \
+ "ldr r4, [%3, #88]\n\t" \
+ "smlal %0, %1, r0, r4\n\t" \
+ "ldr r4, [%3, #96]\n\t" \
+ "smlal %0, %1, r1, r4\n\t" \
+ "ldr r4, [%3, #104]\n\t" \
+ "smlal %0, %1, r2, r4\n\t" \
+ "ldr r4, [%3, #112]\n\t" \
+ "smlal %0, %1, r3, r4\n\t" \
+ : "+r" (lo), "+r" (hi), "+r" (__p) \
+ : "r" (ptr) \
+ : "r0", "r1", "r2", "r3", "r4", "memory"); /* "memory": the asm reads *f and *ptr, which are not operands */ \
+ } while (0)
+
+void synth_full1(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], /* defined in synth_full_arm.S; used by synth_full when (s & 1) is set */
+ mad_fixed_t const (*D0ptr)[32],
+ mad_fixed_t const (*D1ptr)[32]);
+void synth_full2(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], /* defined in synth_full_arm.S; used by synth_full when (s & 1) is clear */
+ mad_fixed_t const (*D0ptr)[32],
+ mad_fixed_t const (*D1ptr)[32]);
+
+/* This performs slower in IRAM on PP502x and there is no space in
+ mpegplayer on the PP5002 */
+#if !defined(CPU_PP502x) && !(CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
+static
+void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
+ unsigned int nch, unsigned int ns) ICODE_ATTR; /* forward declaration only carries the ICODE_ATTR placement */
+#endif
+static
+void synth_full(struct mad_synth *synth, struct mad_frame const *frame, /* FPM_ARM variant: per channel, turns ns rows of subband samples into 32 PCM samples each */
+ unsigned int nch, unsigned int ns)
+{
+ int p;
+ unsigned int phase, ch, s;
+ mad_fixed_t *pcm, (*filter)[2][2][16][8];
+ mad_fixed_t const (*sbsample)[36][32];
+ mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
+ mad_fixed_t const (*D0ptr)[32], *ptr;
+ mad_fixed_t const (*D1ptr)[32];
+ mad_fixed64hi_t hi; /* high word of the 64-bit accumulator filled by the PROD_* macros */
+ mad_fixed64lo_t lo; /* low word of the same accumulator */
+
+ for (ch = 0; ch < nch; ++ch) {
+ sbsample = &frame->sbsample[ch];
+ filter = &synth->filter[ch];
+ phase = synth->phase; /* every channel starts from the same stored phase; synth->phase is not written here */
+ pcm = synth->pcm.samples[ch];
+
+ for (s = 0; s < ns; ++s) {
+ dct32((*sbsample)[s], phase >> 1, /* writes column (phase >> 1) of the two filter halves selected by (phase & 1) */
+ (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
+
+ p = (phase - 1) & 0xf; /* window offset: previous phase, modulo 16 */
+
+ /* calculate 32 samples */
+ fe = &(*filter)[0][ phase & 1][0];
+ fx = &(*filter)[0][~phase & 1][0];
+ fo = &(*filter)[1][~phase & 1][0];
+
+ D0ptr = (void*)&D[0][ p]; /* window row 0 shifted forward by p entries */
+ D1ptr = (void*)&D[0][-p]; /* window row 0 shifted back by p entries; NOTE(review): relies on memory before D[0][0] being addressable */
+
+ if(s & 1) /* NOTE(review): branches on the parity of s, not of phase; presumably the two always agree - confirm */
+ {
+ ptr = *D0ptr;
+/*
+ ML0(hi, lo, (*fx)[0], ptr[ 1]);
+ MLA(hi, lo, (*fx)[1], ptr[15]);
+ MLA(hi, lo, (*fx)[2], ptr[13]);
+ MLA(hi, lo, (*fx)[3], ptr[11]);
+ MLA(hi, lo, (*fx)[4], ptr[ 9]);
+ MLA(hi, lo, (*fx)[5], ptr[ 7]);
+ MLA(hi, lo, (*fx)[6], ptr[ 5]);
+ MLA(hi, lo, (*fx)[7], ptr[ 3]);
+*/
+ PROD_ODD_0(hi, lo, *fx, ptr); /* asm form of the commented-out ML0/MLA chain above */
+ MLN(hi, lo);
+/*
+ MLA(hi, lo, (*fe)[0], ptr[ 0]);
+ MLA(hi, lo, (*fe)[1], ptr[14]);
+ MLA(hi, lo, (*fe)[2], ptr[12]);
+ MLA(hi, lo, (*fe)[3], ptr[10]);
+ MLA(hi, lo, (*fe)[4], ptr[ 8]);
+ MLA(hi, lo, (*fe)[5], ptr[ 6]);
+ MLA(hi, lo, (*fe)[6], ptr[ 4]);
+ MLA(hi, lo, (*fe)[7], ptr[ 2]);
+*/
+ PROD_EVEN_A(hi, lo, *fe, ptr); /* asm form of the commented-out MLA chain above */
+ pcm[0] = SHIFT(MLZ(hi, lo)); /* first of the 32 samples for this row */
+ pcm += 16; /* centre pointer: synth_full1 stores to pcm[-15..-1] and pcm[1..15] */
+
+ synth_full1(pcm, fo, fe, D0ptr, D1ptr); /* 30 middle samples, computed in assembly */
+ D0ptr += 15; /* the callee received copies, so the 15-row advance is repeated here */
+ D1ptr += 15;
+ fo += 15;
+ fe += 15;
+
+ ptr = *(D0ptr + 1); /* window row 16 */
+ PROD_ODD_0(hi, lo, *fo, ptr); /* asm form of the commented-out chain below */
+/*
+ ML0(hi, lo, (*fo)[0], ptr[ 1]);
+ MLA(hi, lo, (*fo)[1], ptr[15]);
+ MLA(hi, lo, (*fo)[2], ptr[13]);
+ MLA(hi, lo, (*fo)[3], ptr[11]);
+ MLA(hi, lo, (*fo)[4], ptr[ 9]);
+ MLA(hi, lo, (*fo)[5], ptr[ 7]);
+ MLA(hi, lo, (*fo)[6], ptr[ 5]);
+ MLA(hi, lo, (*fo)[7], ptr[ 3]);
+*/
+ pcm[0] = SHIFT(-MLZ(hi, lo)); /* centre sample (index 16 of this row), negated */
+ }
+ else
+ {
+ ptr = *D0ptr;
+/*
+ ML0(hi, lo, (*fx)[0], ptr[ 0]);
+ MLA(hi, lo, (*fx)[1], ptr[14]);
+ MLA(hi, lo, (*fx)[2], ptr[12]);
+ MLA(hi, lo, (*fx)[3], ptr[10]);
+ MLA(hi, lo, (*fx)[4], ptr[ 8]);
+ MLA(hi, lo, (*fx)[5], ptr[ 6]);
+ MLA(hi, lo, (*fx)[6], ptr[ 4]);
+ MLA(hi, lo, (*fx)[7], ptr[ 2]);
+*/
+ PROD_EVEN_0(hi, lo, *fx, ptr); /* asm form of the commented-out ML0/MLA chain above */
+ MLN(hi, lo);
+/*
+ MLA(hi, lo, (*fe)[0], ptr[ 1]);
+ MLA(hi, lo, (*fe)[1], ptr[15]);
+ MLA(hi, lo, (*fe)[2], ptr[13]);
+ MLA(hi, lo, (*fe)[3], ptr[11]);
+ MLA(hi, lo, (*fe)[4], ptr[ 9]);
+ MLA(hi, lo, (*fe)[5], ptr[ 7]);
+ MLA(hi, lo, (*fe)[6], ptr[ 5]);
+ MLA(hi, lo, (*fe)[7], ptr[ 3]);
+*/
+ PROD_ODD_A(hi, lo, *fe, ptr); /* asm form of the commented-out MLA chain above */
+ pcm[0] = SHIFT(MLZ(hi, lo)); /* first of the 32 samples for this row */
+ pcm += 16; /* centre pointer: synth_full2 stores to pcm[-15..-1] and pcm[1..15] */
+
+ synth_full2(pcm, fo, fe, D0ptr, D1ptr); /* 30 middle samples, computed in assembly */
+ D0ptr += 15; /* the callee received copies, so the 15-row advance is repeated here */
+ D1ptr += 15;
+ fo += 15;
+ fe += 15;
+
+ ptr = *(D0ptr + 1); /* window row 16 */
+/*
+ ML0(hi, lo, (*fo)[0], ptr[ 0]);
+ MLA(hi, lo, (*fo)[1], ptr[14]);
+ MLA(hi, lo, (*fo)[2], ptr[12]);
+ MLA(hi, lo, (*fo)[3], ptr[10]);
+ MLA(hi, lo, (*fo)[4], ptr[ 8]);
+ MLA(hi, lo, (*fo)[5], ptr[ 6]);
+ MLA(hi, lo, (*fo)[6], ptr[ 4]);
+ MLA(hi, lo, (*fo)[7], ptr[ 2]);
+*/
+ PROD_EVEN_0(hi, lo, *fo, ptr); /* asm form of the commented-out chain above */
+ pcm[0] = SHIFT(-MLZ(hi, lo)); /* centre sample (index 16 of this row), negated */
+ }
+
+ pcm += 16; /* move to the start of the next 32-sample row */
+ phase = (phase + 1) % 16;
+ }
+ }
+}
+
+# else
static
void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
@@ -1020,6 +1392,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
}
}
}
+
# endif
# endif
diff --git a/apps/codecs/libmad/synth_full_arm.S b/apps/codecs/libmad/synth_full_arm.S
new file mode 100644
index 0000000000..b880a7b3c6
--- /dev/null
+++ b/apps/codecs/libmad/synth_full_arm.S
@@ -0,0 +1,343 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2007 by Tomasz Malesinski
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/* This performs slower in IRAM on PP502x and there is no space in
+ mpegplayer on the PP5002 */
+#if defined(CPU_PP502x) || (CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
+ .section .text,"ax",%progbits
+#else
+ .section .icode,"ax",%progbits
+#endif
+
+ .global synth_full1
+ .global synth_full2
+
+ ;; r0 = pcm
+ ;; r1 = fo
+ ;; r2 = fe
+ ;; r3 = D0ptr
+ ;; r4 = D1ptr
+synth_full1: /* r0 = pcm, r1 = fo, r2 = fe, r3 = D0ptr, fifth argument D1ptr on the stack */
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} /* 10 registers = 40 bytes pushed */
+ ldr r4, [sp, #40] /* r4 = D1ptr, the first stack argument, now 40 bytes above sp */
+ ldr r5, =synth_full_sp
+ str sp, [r5] /* park sp in a global so sp can serve as a data register below; NOTE(review): single shared slot, so not reentrant */
+ mov r5, #15 /* r5 = loop counter and pcm index, 15 down to 1 */
+ add r2, r2, #32 /* fe starts one row (8 words) ahead */
+.l: /* one pass yields pcm[-r5] (accumulator r7:r6) and pcm[r5] (accumulator r9:r8) */
+ add r3, r3, #128 /* next D0 row (32 words) */
+ add r4, r4, #128 /* next D1 row */
+ ldmia r1!, {r10, r11, r12, lr} /* fo[0..3] of the current row */
+ ldr r7, [r3, #4]
+ smull r6, r7, r10, r7 /* r7:r6 = fo[0] * D0[1]; D0 taps 1,15,13,11,9,7,5,3 as in PROD_ODD */
+ ldr r9, [r4, #120]
+ smull r8, r9, r10, r9 /* r9:r8 = fo[0] * D1[30]; D1 taps 30,16,18,...,28 as in PROD_ODDBACK */
+
+ ldr r10, [r3, #60]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #52]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #44]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #64]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #72]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #80]
+ smlal r8, r9, lr, r10
+
+ ldmia r1!, {r11, r12, sp, lr} /* fo[4..7]; sp is used as a plain data register from here on */
+ ldr r10, [r3, #36]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #28]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #20]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #12]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #88]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #96]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #104]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #112]
+ smlal r8, r9, lr, r10
+
+ rsbs r6, r6, #0 /* 64-bit negate of r7:r6 (low word, sets carry) ... */
+ rsc r7, r7, #0 /* ... high word with borrow */
+
+ ldmia r2!, {r11, r12, sp, lr} /* fe[0..3] */
+
+ ldr r10, [r3, #0] /* D0 taps 0,14,12,10,8,6,4,2 as in PROD_EVEN */
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #56]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #48]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #40]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #60] /* D1 taps 15,17,...,29 as in PROD_EVENBACK */
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #68]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #76]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #84]
+ smlal r8, r9, lr, r10
+
+ ldmia r2!, {r11, r12, sp, lr} /* fe[4..7] */
+ ldr r10, [r3, #32]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #24]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #16]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #8]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #92]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #100]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #108]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #116]
+ smlal r8, r9, lr, r10
+
+ movs r6, r6, lsr #16 /* low word >> 16; carry = last bit shifted out */
+ adc r6, r6, r7, lsl #16 /* r6 = (r7:r6 >> 16) + carry, i.e. rounded */
+ str r6, [r0, -r5, lsl #2] /* pcm[-r5] */
+
+ movs r8, r8, lsr #16 /* same rounded 16-bit shift for the second accumulator */
+ adc r8, r8, r9, lsl #16
+ str r8, [r0, r5, lsl #2] /* pcm[r5] */
+
+ subs r5, r5, #1
+ bne .l /* 15 passes */
+
+ ldr r5, =synth_full_sp
+ ldr sp, [r5] /* recover the real stack pointer */
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} /* restore registers and return */
+
+synth_full2: /* same arguments and structure as synth_full1, with the even/odd tap sets exchanged */
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} /* 10 registers = 40 bytes pushed */
+ ldr r4, [sp, #40] /* r4 = D1ptr, the first stack argument */
+ ldr r5, =synth_full_sp
+ str sp, [r5] /* park sp in a global so sp can serve as a data register below; NOTE(review): single shared slot, so not reentrant */
+ mov r5, #15 /* r5 = loop counter and pcm index, 15 down to 1 */
+ add r2, r2, #32 /* fe starts one row (8 words) ahead */
+.l2: /* one pass yields pcm[-r5] (accumulator r7:r6) and pcm[r5] (accumulator r9:r8) */
+ add r3, r3, #128 /* next D0 row (32 words) */
+ add r4, r4, #128 /* next D1 row */
+ ldmia r1!, {r10, r11, r12, lr} /* fo[0..3] of the current row */
+ ldr r7, [r3, #0]
+ smull r6, r7, r10, r7 /* r7:r6 = fo[0] * D0[0]; D0 taps 0,14,12,10,8,6,4,2 as in PROD_EVEN */
+ ldr r9, [r4, #60]
+ smull r8, r9, r10, r9 /* r9:r8 = fo[0] * D1[15]; D1 taps 15,17,...,29 as in PROD_EVENBACK */
+
+ ldr r10, [r3, #56]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #48]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #40]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #68]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #76]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #84]
+ smlal r8, r9, lr, r10
+
+ ldmia r1!, {r11, r12, sp, lr} /* fo[4..7]; sp is used as a plain data register from here on */
+ ldr r10, [r3, #32]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #24]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #16]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #8]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #92]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #100]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #108]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #116]
+ smlal r8, r9, lr, r10
+
+ rsbs r6, r6, #0 /* 64-bit negate of r7:r6 (low word, sets carry) ... */
+ rsc r7, r7, #0 /* ... high word with borrow */
+
+ ldmia r2!, {r11, r12, sp, lr} /* fe[0..3] */
+
+ ldr r10, [r3, #4] /* D0 taps 1,15,13,11,9,7,5,3 as in PROD_ODD */
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #60]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #52]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #44]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #120] /* D1 taps 30,16,18,...,28 as in PROD_ODDBACK */
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #64]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #72]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #80]
+ smlal r8, r9, lr, r10
+
+ ldmia r2!, {r11, r12, sp, lr} /* fe[4..7] */
+ ldr r10, [r3, #36]
+ smlal r6, r7, r11, r10
+ ldr r10, [r3, #28]
+ smlal r6, r7, r12, r10
+ ldr r10, [r3, #20]
+ smlal r6, r7, sp, r10
+ ldr r10, [r3, #12]
+ smlal r6, r7, lr, r10
+
+ ldr r10, [r4, #88]
+ smlal r8, r9, r11, r10
+ ldr r10, [r4, #96]
+ smlal r8, r9, r12, r10
+ ldr r10, [r4, #104]
+ smlal r8, r9, sp, r10
+ ldr r10, [r4, #112]
+ smlal r8, r9, lr, r10
+
+ movs r6, r6, lsr #16 /* low word >> 16; carry = last bit shifted out */
+ adc r6, r6, r7, lsl #16 /* r6 = (r7:r6 >> 16) + carry, i.e. rounded */
+ str r6, [r0, -r5, lsl #2] /* pcm[-r5] */
+
+ movs r8, r8, lsr #16 /* same rounded 16-bit shift for the second accumulator */
+ adc r8, r8, r9, lsl #16
+ str r8, [r0, r5, lsl #2] /* pcm[r5] */
+
+ subs r5, r5, #1
+ bne .l2 /* 15 passes */
+
+ ldr r5, =synth_full_sp
+ ldr sp, [r5] /* recover the real stack pointer */
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} /* restore registers and return */
+
+ .global III_aliasreduce
+
+III_aliasreduce: /* r0 = xr, r1 = lines; matches the prototype added to layer3.c */
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+ add r1, r0, r1, lsl #2 /* r1 = &xr[lines], the end bound */
+ add r0, r0, #72 /* first boundary at xr + 18 words */
+.arl1: /* one pass per 18-word boundary */
+ mov r2, #8 /* 8 coefficient pairs per boundary, consumed two at a time */
+ mov r3, r0 @ a
+ mov r4, r0 @ b
+ ldr r5, =csa @ cs/ca
+.arl2:
+ ldmdb r3, {r6, r12} /* r6 = a[-2], r12 = a[-1] (just below the boundary) */
+ ldmia r4, {r7, lr} /* r7 = b[0], lr = b[1] (just above the boundary) */
+
+ ldmia r5!, {r8, r9} /* r8 = cs, r9 = ca for the first pair */
+ smull r10, r11, r7, r8
+ smlal r10, r11, r12, r9
+ movs r10, r10, lsr #28 /* rounded >> 28: carry holds the last bit shifted out */
+ adc r10, r10, r11, lsl #4 /* r10 = Q28(b[0]*cs + a[-1]*ca), the new b[0] */
+
+ rsb r7, r7, #0 /* -b[0] */
+ smull r11, r8, r12, r8
+ smlal r11, r8, r7, r9
+ movs r11, r11, lsr #28
+ adc r11, r11, r8, lsl #4 /* r11 = Q28(a[-1]*cs - b[0]*ca), the new a[-1] */
+
+ ldmia r5!, {r8, r9} /* cs, ca for the second pair */
+ smull r12, r7, lr, r8
+ smlal r12, r7, r6, r9
+ movs r12, r12, lsr #28
+ adc r12, r12, r7, lsl #4 /* r12 = Q28(b[1]*cs + a[-2]*ca), the new b[1] */
+ stmia r4!, {r10, r12} /* write b[0], b[1]; b advances two words */
+
+ rsb lr, lr, #0 /* -b[1] */
+ smull r7, r10, r6, r8
+ smlal r7, r10, lr, r9
+ movs r7, r7, lsr #28
+ adc r7, r7, r10, lsl #4 /* r7 = Q28(a[-2]*cs - b[1]*ca), the new a[-2] */
+ stmdb r3!, {r7, r11} /* write a[-2], a[-1]; a retreats two words */
+
+ subs r2, r2, #2
+ bne .arl2 /* 4 passes cover all 8 pairs */
+ add r0, r0, #72 /* next boundary, 18 words further */
+ cmp r0, r1
+ blo .arl1 /* bound is tested after the pass: the first boundary is always processed, even if lines <= 18 */
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+csa: /* 8 interleaved {cs, ca} pairs in Q28 (0x10000000 = 1.0), read two pairs per inner pass by III_aliasreduce */
+ .word +0x0db84a81
+ .word -0x083b5fe7
+ .word +0x0e1b9d7f
+ .word -0x078c36d2
+ .word +0x0f31adcf
+ .word -0x05039814
+ .word +0x0fbba815
+ .word -0x02e91dd1
+ .word +0x0feda417
+ .word -0x0183603a
+ .word +0x0ffc8fc8
+ .word -0x00a7cb87
+ .word +0x0fff964c
+ .word -0x003a2847
+ .word +0x0ffff8d3
+ .word -0x000f27b4
+
+ .global III_overlap
+III_overlap: /* r0 = output[36], r1 = overlap[18], r2 = sample[18][32], r3 = sb */
+ stmdb sp!, {r4, r5, r6, r7, r8, lr}
+ add r2, r2, r3, lsl #2 /* r2 = &sample[0][sb] */
+ mov r3, #6 /* 6 passes of 3 samples = 18 */
+.ol:
+ ldmia r0!, {r4, r5, r6} /* next three output values */
+ ldmia r1!, {r7, r8, lr} /* next three saved overlap values */
+ add r4, r4, r7 /* output + overlap */
+ add r5, r5, r8
+ add r6, r6, lr
+ str r4, [r2], #128 /* sample[i][sb]; 128 bytes = one row of 32 words */
+ str r5, [r2], #128
+ str r6, [r2], #128
+ subs r3, r3, #1
+ bne .ol
+ sub r1, r1, #72 /* rewind overlap to its start (18 words) */
+ ldmia r0!, {r4, r5, r6, r7, r8, lr} /* copy output[18..35] into overlap[0..17], 6 words at a time */
+ stmia r1!, {r4, r5, r6, r7, r8, lr}
+ ldmia r0!, {r4, r5, r6, r7, r8, lr}
+ stmia r1!, {r4, r5, r6, r7, r8, lr}
+ ldmia r0!, {r4, r5, r6, r7, r8, lr}
+ stmia r1!, {r4, r5, r6, r7, r8, lr}
+ ldmia sp!, {r4, r5, r6, r7, r8, pc}
+
+ .section .ibss,"aw",%nobits
+synth_full_sp: /* one-word slot where synth_full1/synth_full2 keep the real sp while they use sp as a data register */
+ .space 4