Add more comments to udiv32_armv4.S. Reduce the zero-divisor test to one cycle for the skipped branch by setting flags when inverting the divisor. Handle 32-bit numerators by calling the 31-bit divider and fixing up the results.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24151 a1c6a512-1295-4272-9138-f99709370657
author: Andrew Mahone <andrew.mahone@gmail.com> 2010-01-03 04:30:13 +0000
committer: Andrew Mahone <andrew.mahone@gmail.com> 2010-01-03 04:30:13 +0000
commit: c1f4d4037a8be88ebb94a5c28eba0f394efe623a (patch)
tree: 591d0cdc236ee6f11561501191e22d3a6e86c9a8
parent: 1d469590122767c86af7bb0503e7e62cc62c4f6c (diff)
download: rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.tar.gz
rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.zip
1 files changed, 54 insertions, 53 deletions
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
index 491cf43862..b54156809c 100644
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -36,11 +36,14 @@
    iteration by storing quotient and remainder together and adding the previous
    quotient bit during trial subtraction. Modified to work with any dividend
    and divisor both less than 1 << 30, and skipping trials by calculating bits
-   in output.
-*/
-.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
+   in output. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
 
     mov     \bits, #1
+    /* Shift the divisor left until it aligns with the numerator. If it already
+       has the high bit set, this is fine, everything inside .rept will be
+       skipped, and the add before and adcs after will set the one-bit result
+       to zero. */
     cmp     \divisor, \dividend, lsr #16
     movls   \divisor, \divisor, lsl #16
     addls   \bits, \bits, #16
@@ -56,7 +59,8 @@
     cmp     \divisor, \dividend, lsr #1
     movls   \divisor, \divisor, lsl #1
     addls   \bits, \bits, #1
-    rsb     \divisor, \divisor, #0
+    rsbs    \divisor, \divisor, #0
+    bcs     .L_div0
     adds    \result, \dividend, \divisor
     subcc   \result, \result, \divisor
     rsb     \curbit, \bits, #31
@@ -64,44 +68,14 @@
     nop
     .rept   30
     adcs    \result, \divisor, \result, lsl #1
+    /* Fix the remainder portion of the result. This must be done because the
+       handler for 32-bit numerators needs the remainder. */
     subcc   \result, \result, \divisor
     .endr
-    /* shift remainder/quotient left one, add final quotient bit */
+    /* Shift remainder/quotient left one, add final quotient bit */
     adc     \result, \result, \result
-    mov     \dividend, \result, lsr \bits
-    eor     \quotient, \result, \dividend, lsl \bits
-.endm
-
-.macro ARM_DIV_32_BODY dividend, divisor, result, curbit
-
-    mov     \result, \dividend
-    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
-    cmp     \divisor, \result, lsr #16
-    movls   \result,\result, lsr #16
-    subls   \curbit, \curbit, #48
-    cmp     \divisor, \result, lsr #8
-    movls   \result,\result, lsr #8
-    subls   \curbit, \curbit, #24
-    cmp     \divisor, \result, lsr #4
-    movls   \result,\result, lsr #4
-    subls   \curbit, \curbit, #12
-    cmp     \divisor, \result, lsr #2
-    subls   \curbit, \curbit, #6
-    @ Calculation is only done down to shift=2, because the shift=1 step
-    @ would need 3 more cycles, but would only gain 1.5 cycles on average.
-    mov     \result, #0
-    add     pc, pc, \curbit, lsl #2
-    nop
-    .set    shift, 32
-    .rept   31
-    .set    shift, shift - 1
-    cmp     \divisor, \dividend, lsr #shift
-    orrls   \result, \result, #(1 << shift)
-    subls   \dividend, \dividend, \divisor, lsl #shift
-    .endr   @ shift==0 in the .rept would cause a warning  for lsr #0
-    cmp     \divisor, \dividend
-    orrls   \result, \result, #1
-    @subls  \dividend, \dividend, \divisor  @ correct remainder not needed
+    mov     \remainder, \result, lsr \bits
+    eor     \quotient, \result, \remainder, lsl \bits
 .endm
 
 #ifdef USE_IRAM
@@ -114,21 +88,48 @@
     .type   udiv32_arm,%function
 
 udiv32_arm:
-    cmp     r1, #0
-    beq     20f
     tst     r0, r0
-    /* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of
-       divisor is also unset dividend has been tested to be >= divisor.
-    */
-    bmi     10f
-    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
-    bx      lr
-
-10:
-    ARM_DIV_32_BODY r0, r1, r2, r3
-    mov     r0, r2
+    /* High bit must be unset, otherwise shift numerator right, calculate,
+       and correct results. As this case is very uncommon we want to avoid
+       any other delays on the main path in handling it, so the long divide
+       calls the short divide as a function. */
+    bmi     .L_udiv32
+.L_udiv31:
+    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
     bx      lr
 
-20:
-    movne   r0, #0
+.L_udiv32:
+    /* Store the original numerator and divisor (and lr); they are needed to
+       correct the result. */
+    stmdb   sp, { r0, r1, lr }
+    /* Halve the numerator so it fits in 31 bits. A zero divisor is caught in
+       the divider itself, which branches to .L_div0. */
+    mov     r0, r0, lsr #1
+    bl      .L_udiv31
+    /* This address is never a branch target, but is used to test lr before
+       calling __div0. */
+.L_udiv32_div0_trap:
+    ldmdb   sp, { r2, r3, lr }
+    /* Move the low bit of the original numerator to the carry bit */
+    movs    r2, r2, lsr #1
+    /* Shift the remainder left one and add in the carry bit */
+    adc     r1, r1, r1
+    /* Subtract the original divisor from the remainder, setting carry if the
+       result is non-negative */
+    subs    r1, r1, r3
+    /* Shift quotient left one and add carry bit */
+    adc     r0, r0, r0
     bx      lr
+.L_div0:
+    /* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit
+       divider. If the return address is at .L_udiv32_div0_trap, then the
+       return address of the original caller is at sp - 4.
+    */
+    adr     r2, .L_udiv32_div0_trap
+    cmp     r2, lr
+    subeq     sp, sp, #4
+    bleq    __div0
+    /* Otherwise, push lr to the stack before calling __div0 */
+    stmdb sp!, { lr }
+    bl      __div0
+    .size udiv32_arm, . - udiv32_arm
author	Andrew Mahone <andrew.mahone@gmail.com>	2010-01-03 04:30:13 +0000
committer	Andrew Mahone <andrew.mahone@gmail.com>	2010-01-03 04:30:13 +0000
commit	c1f4d4037a8be88ebb94a5c28eba0f394efe623a (patch)
tree	591d0cdc236ee6f11561501191e22d3a6e86c9a8
parent	1d469590122767c86af7bb0503e7e62cc62c4f6c (diff)
download	rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.tar.gz rockbox-c1f4d4037a8be88ebb94a5c28eba0f394efe623a.zip