authorJens Arnold <amiconn@rockbox.org>2008-11-05 00:10:05 +0000
committerJens Arnold <amiconn@rockbox.org>2008-11-05 00:10:05 +0000
commitfe04e40be7a26c758a82e410e58be63c1f3d571c (patch)
tree955b1557f3da7cd8362bc05d96302cac08a72ff2 /apps/codecs
parent7a835ee0c64bb941f205a2eb915cf0aaf460f1bc (diff)
Further optimised (vs. libgcc) unsigned 32-bit division for ARMv4 (based on the ARMv5(+) version from libgcc), placed in IRAM on PP for better performance on PP5002, and added to the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime; they're on par now). Gigabeat F/X should also see an APE speedup.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r--  apps/codecs/demac/libdemac/rangecoding.h  |  14
-rw-r--r--  apps/codecs/lib/SOURCES                   |   3
-rw-r--r--  apps/codecs/lib/codeclib.h                |   9
-rw-r--r--  apps/codecs/lib/udiv32_armv4.S            | 114
4 files changed, 137 insertions(+), 3 deletions(-)
diff --git a/apps/codecs/demac/libdemac/rangecoding.h b/apps/codecs/demac/libdemac/rangecoding.h
index c96886e32b..645fd1ad92 100644
--- a/apps/codecs/demac/libdemac/rangecoding.h
+++ b/apps/codecs/demac/libdemac/rangecoding.h
@@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
*/
+#ifdef ROCKBOX
+#include "../lib/codeclib.h"
+/* for UDIV32() */
+#endif
+
+#ifndef UDIV32
+#define UDIV32(a, b) ((a) / (b))
+#endif
/* BITSTREAM READING FUNCTIONS */
@@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
static inline int range_decode_culfreq(int tot_f)
{
range_dec_normalize();
- rc.help = rc.range / tot_f;
- return rc.low / rc.help;
+ rc.help = UDIV32(rc.range, tot_f);
+ return UDIV32(rc.low, rc.help);
}
static inline int range_decode_culshift(int shift)
{
range_dec_normalize();
rc.help = rc.range >> shift;
- return rc.low / rc.help;
+ return UDIV32(rc.low, rc.help);
}
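
The hunks above only swap the two divisions in the range decoder for the UDIV32() macro; outside Rockbox the #ifndef fallback keeps the file building with a plain division. A minimal standalone sketch of that indirection, in the spirit of the fallback added above (the values and the main() harness are illustrative, not from the tree):

    #include <stdio.h>

    /* Fallback: a port that provides an optimised routine defines UDIV32
     * before this point; otherwise plain C division is used. */
    #ifndef UDIV32
    #define UDIV32(a, b) ((a) / (b))
    #endif

    int main(void)
    {
        /* Toy stand-ins for rc.range, rc.low and tot_f, exercising the
         * macro the way range_decode_culfreq() does. */
        unsigned range = 0x80000000u, low = 0x12345678u, tot_f = 5000u;
        unsigned help  = UDIV32(range, tot_f);
        printf("help=%u cum_freq=%u\n", help, UDIV32(low, help));
        return 0;
    }
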
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index 9c6d4e7ff6..8099620098 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -5,6 +5,9 @@ codeclib.c
mdct2.c
#ifdef CPU_ARM
mdct_arm.S
+#if ARM_ARCH == 4
+udiv32_armv4.S
+#endif
#endif
#elif defined(SIMULATOR) && defined(__APPLE__)
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 744accb8aa..477818a23d 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
extern void mdct_backward(int n, int32_t *in, int32_t *out);
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+/* optimised unsigned integer division for ARMv4, in IRAM */
+unsigned udiv32_arm(unsigned a, unsigned b);
+#define UDIV32(a, b) udiv32_arm(a, b)
+#else
+/* default */
+#define UDIV32(a, b) ((a) / (b))
+#endif
+
/* Various codec helper functions */
int codec_init(void);
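
With the dispatch above in codeclib.h, any codec that includes the header can write UDIV32(n, d) and get the IRAM routine on ARMv4 builds and a plain division everywhere else. A hedged usage sketch (avg_frame_size() is a hypothetical helper, not a function from the tree):

    #include "codeclib.h"

    /* Hypothetical helper: average frame size in bytes. UDIV32() resolves
     * to udiv32_arm() on ARMv4 targets and to a plain '/' elsewhere. */
    static unsigned avg_frame_size(unsigned total_bytes, unsigned frames)
    {
        return (frames != 0) ? UDIV32(total_bytes, frames) : 0;
    }
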
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
new file mode 100644
index 0000000000..a659a9eb8e
--- /dev/null
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -0,0 +1,114 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ *
+ * Optimised unsigned integer division for ARMv4
+ *
+ * Based on: libgcc routines for ARM cpu.
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+
+ mov \result, \dividend
+ mov \curbit, #90 @ 3 * 30, (calculating branch dest)
+ cmp \divisor, \result, lsr #16
+ movls \result,\result, lsr #16
+ subls \curbit, \curbit, #48
+ cmp \divisor, \result, lsr #8
+ movls \result,\result, lsr #8
+ subls \curbit, \curbit, #24
+ cmp \divisor, \result, lsr #4
+ movls \result,\result, lsr #4
+ subls \curbit, \curbit, #12
+ cmp \divisor, \result, lsr #2
+ subls \curbit, \curbit, #6
+ @ calculation is only done down to shift=2, because the shift=1 step
+ @ would need 3 more cycles, but would only gain 1.5 cycles on average
+ mov \result, #0
+ add pc, pc, \curbit, lsl #2
+ nop
+ .set shift, 32
+ .rept 32
+ .set shift, shift - 1
+ cmp \dividend, \divisor, lsl #shift
+ adc \result, \result, \result
+ subcs \dividend, \dividend, \divisor, lsl #shift
+ .endr
+.endm
+
+.macro ARM_DIV2_ORDER divisor, order
+
+ cmp \divisor, #(1 << 16)
+ movhs \divisor, \divisor, lsr #16
+ movhs \order, #16
+ movlo \order, #0
+
+ cmp \divisor, #(1 << 8)
+ movhs \divisor, \divisor, lsr #8
+ addhs \order, \order, #8
+
+ cmp \divisor, #(1 << 4)
+ movhs \divisor, \divisor, lsr #4
+ addhs \order, \order, #4
+
+ cmp \divisor, #(1 << 2)
+ addhi \order, \order, #3
+ addls \order, \order, \divisor, lsr #1
+.endm
+
+
+#ifdef USE_IRAM
+ .section .icode,"ax",%progbits
+#else
+ .text
+#endif
+ .align
+ .global udiv32_arm
+ .type udiv32_arm,%function
+
+udiv32_arm:
+ subs r2, r1, #1
+ bxeq lr
+ bcc 20f
+ cmp r0, r1
+ bls 10f
+ tst r1, r2
+ beq 30f
+
+ ARM_DIV_BODY r0, r1, r2, r3
+ mov r0, r2
+ bx lr
+
+10:
+ moveq r0, #1
+20:
+ movne r0, #0
+ bx lr
+
+30:
+ ARM_DIV2_ORDER r1, r2
+ mov r0, r0, lsr r2
+ bx lr
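
For reference, a plain C model of what the routine above computes, useful for host-side spot checks; this reflects my reading of the assembly and is not code from the commit. It keeps the same fast paths (divisor of 1, divisor of 0, dividend not larger than the divisor, power-of-two divisor) and uses a restoring shift-and-subtract loop for the general case, which the assembly unrolls 32 times and enters part-way through via the computed branch:

    #include <stdint.h>
    #include <stdio.h>

    /* C model of the ARMv4 routine (assumption: division by zero returns 0,
     * matching the early-out path in the assembly above). */
    static uint32_t udiv32_model(uint32_t a, uint32_t b)
    {
        uint32_t quot = 0, shift = 0;

        if (b == 1) return a;          /* divisor 1: quotient is the dividend */
        if (b == 0) return 0;          /* divide-by-zero early out            */
        if (a <  b) return 0;          /* small dividend: quotient is 0       */
        if (a == b) return 1;
        if ((b & (b - 1)) == 0) {      /* power-of-two divisor: just shift    */
            while (b > 1) { b >>= 1; shift++; }
            return a >> shift;
        }

        /* Align the divisor just below the dividend, comparing against the
         * shifted dividend so nothing can overflow. */
        while (b <= (a >> (shift + 1)))
            shift++;

        /* Produce one quotient bit per step, like the unrolled loop. */
        do {
            quot <<= 1;
            if ((a >> shift) >= b) {
                a -= b << shift;
                quot |= 1;
            }
        } while (shift--);

        return quot;
    }

    int main(void)
    {
        /* Spot check against the compiler's own division. */
        uint32_t a = 0xCAFEBABEu, b = 0x9DBu;
        printf("model=%u compiler=%u\n",
               (unsigned)udiv32_model(a, b), (unsigned)(a / b));
        return 0;
    }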