summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2010-01-03 04:30:13 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2010-01-03 04:30:13 +0000
commitc1f4d4037a8be88ebb94a5c28eba0f394efe623a (patch)
tree591d0cdc236ee6f11561501191e22d3a6e86c9a8
parent1d469590122767c86af7bb0503e7e62cc62c4f6c (diff)
downloadrockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.tar.gz
rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.tar.bz2
rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.zip
More comments for udiv32_armv4.S. Reduce the zero-divisor test to one cycle for the skipped branch by setting flags when inverting the divisor. 32-bit numerators are now handled by calling the 31-bit divider and fixing up the results.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24151 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/lib/udiv32_armv4.S107
1 files changed, 54 insertions, 53 deletions
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
index 491cf43862..b54156809c 100644
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -36,11 +36,14 @@
iteration by storing quotient and remainder together and adding the previous
quotient bit during trial subtraction. Modified to work with any dividend
and divisor both less than 1 << 30, and skipping trials by calculating bits
- in output.
-*/
-.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
+ in output. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
mov \bits, #1
+ /* Shift the divisor left until it aligns with the numerator. If it already
+ has the high bit set, this is fine: everything inside .rept will be
+ skipped, and the adds before it and the adc after it will set the one-bit
+ result to zero. */
cmp \divisor, \dividend, lsr #16
movls \divisor, \divisor, lsl #16
addls \bits, \bits, #16
@@ -56,7 +59,8 @@
cmp \divisor, \dividend, lsr #1
movls \divisor, \divisor, lsl #1
addls \bits, \bits, #1
- rsb \divisor, \divisor, #0
+ rsbs \divisor, \divisor, #0
+ bcs .L_div0
adds \result, \dividend, \divisor
subcc \result, \result, \divisor
rsb \curbit, \bits, #31
@@ -64,44 +68,14 @@
nop
.rept 30
adcs \result, \divisor, \result, lsl #1
+ /* Fix the remainder portion of the result. This must be done because the
+ handler for 32-bit numerators needs the remainder. */
subcc \result, \result, \divisor
.endr
- /* shift remainder/quotient left one, add final quotient bit */
+ /* Shift remainder/quotient left one, add final quotient bit */
adc \result, \result, \result
- mov \dividend, \result, lsr \bits
- eor \quotient, \result, \dividend, lsl \bits
-.endm
-
-.macro ARM_DIV_32_BODY dividend, divisor, result, curbit
-
- mov \result, \dividend
- mov \curbit, #90 @ 3 * 30, (calculating branch dest)
- cmp \divisor, \result, lsr #16
- movls \result,\result, lsr #16
- subls \curbit, \curbit, #48
- cmp \divisor, \result, lsr #8
- movls \result,\result, lsr #8
- subls \curbit, \curbit, #24
- cmp \divisor, \result, lsr #4
- movls \result,\result, lsr #4
- subls \curbit, \curbit, #12
- cmp \divisor, \result, lsr #2
- subls \curbit, \curbit, #6
- @ Calculation is only done down to shift=2, because the shift=1 step
- @ would need 3 more cycles, but would only gain 1.5 cycles on average.
- mov \result, #0
- add pc, pc, \curbit, lsl #2
- nop
- .set shift, 32
- .rept 31
- .set shift, shift - 1
- cmp \divisor, \dividend, lsr #shift
- orrls \result, \result, #(1 << shift)
- subls \dividend, \dividend, \divisor, lsl #shift
- .endr @ shift==0 in the .rept would cause a warning for lsr #0
- cmp \divisor, \dividend
- orrls \result, \result, #1
- @subls \dividend, \dividend, \divisor @ correct remainder not needed
+ mov \remainder, \result, lsr \bits
+ eor \quotient, \result, \remainder, lsl \bits
.endm
#ifdef USE_IRAM
@@ -114,21 +88,48 @@
.type udiv32_arm,%function
udiv32_arm:
- cmp r1, #0
- beq 20f
tst r0, r0
- /* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of
- divisor is also unset dividend has been tested to be >= divisor.
- */
- bmi 10f
- ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
- bx lr
-
-10:
- ARM_DIV_32_BODY r0, r1, r2, r3
- mov r0, r2
+ /* High bit must be unset, otherwise shift numerator right, calculate,
+ and correct results. As this case is very uncommon we want to avoid
+ any other delays on the main path in handling it, so the long divide
+ calls the short divide as a function. */
+ bmi .L_udiv32
+.L_udiv31:
+ ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
bx lr
-20:
- movne r0, #0
+.L_udiv32:
+ /* Store the original numerator and divisor; we'll need them to correct the
+ result. */
+ stmdb sp, { r0, r1, lr }
+ /* A zero divisor is not tested here: the 31-bit divider branches to
+ .L_div0, which checks lr to find the original caller's return address. */
+ mov r0, r0, lsr #1
+ bl .L_udiv31
+ /* This address is never a branch target, but is used to test lr before
+ calling __div0. */
+.L_udiv32_div0_trap:
+ ldmdb sp, { r2, r3, lr }
+ /* Move the low bit of the original numerator to the carry bit */
+ movs r2, r2, lsr #1
+ /* Shift the remainder left one and add in the carry bit */
+ adc r1, r1, r1
+ /* Subtract the original divisor from the remainder, setting carry if the
+ result is non-negative */
+ subs r1, r1, r3
+ /* Shift quotient left one and add carry bit */
+ adc r0, r0, r0
bx lr
+.L_div0:
+ /* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit
+ divider. If the return address is at .L_udiv32_div0_trap, then the
+ the return address of the original caller is at sp - 4
+ */
+ adr r2, .L_udiv32_div0_trap
+ cmp r2, lr
+ subeq sp, sp, #4
+ bleq __div0
+ /* Otherwise, push lr to the stack before calling __div0 */
+ stmdb sp!, { lr }
+ bl __div0
+ .size udiv32_arm, . - udiv32_arm