summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Andrew Mahone <andrew.mahone@gmail.com> 2009-06-28 02:32:43 +0000
committer: Andrew Mahone <andrew.mahone@gmail.com> 2009-06-28 02:32:43 +0000
commit: 815dcfdd3502bd23c4f2705ff2b044755dd512cc (patch)
tree: 004612371e0b949a02410b4b60fbba5acd132371
parent: 99ae7bcc438495d468322b0a81864a12a782f37b (diff)
download: rockbox-815dcfdd3502bd23c4f2705ff2b044755dd512cc.tar.gz
rockbox-815dcfdd3502bd23c4f2705ff2b044755dd512cc.zip
Use a hand-written constants table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed and the limited range of ldrsh would force multiple copies of the table.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/recorder/jpeg_idct_arm.S 162
1 files changed, 86 insertions, 76 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index 01b08c4b5a..d84e5e7962 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -113,7 +113,11 @@ jpeg_idct2h:
results can not be stored merged.
*/
stmdb sp!, { r4-r5, lr }
+#if ARM_ARCH < 5
ldr r14, =4112
+#else
+ ldrsh r14, .Lpool4+2
+#endif
1:
ldrsh r12, [r0]
ldrsh r4, [r0, #2]
@@ -140,7 +144,7 @@ jpeg_idct2h:
ldmia sp!, { r4-r5, pc }
#else
stmdb sp!, { r4, lr }
- ldr r14, =4112
+ ldrsh r14, .Lpool4+2
1:
ldr r12, [r0]
sadd16 r12, r12, r14
@@ -198,27 +202,26 @@ jpeg_idct4v:
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
- ldr r8, =1024
- ldr r14, =4433
- ldr r12, =3302955134
+ mov r8, #1024
+ ldrd r4, .Lpool4
1:
- ldrsh r5, [r0, #48]
+ ldrsh r14, [r0, #48]
ldrsh r3, [r0, #16]
- ldrsh r4, [r0, #32]
+ ldrsh r12, [r0, #32]
ldrsh r2, [r0]
- add r6, r3, r5 /* r6 = z1 = d1 + d3 */
- add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
- smlabb r6, r14, r6, r8 /* z1 *= 4433 */
- sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
- smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
- smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
+ add r6, r3, r14 /* r6 = z1 = d1 + d3 */
+ add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
+ smlabb r6, r5, r6, r8 /* z1 *= 4433 */
+ sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */
+ smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
+ smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
- add r4, r7, r3, asr #11 /* r4 = o0 */
- sub r7, r7, r3, asr #11 /* r7 = o3 */
- add r3, r2, r5, asr #11 /* r3 = o1 */
- sub r2, r2, r5, asr #11 /* r2 = o2 */
- strh r4, [r0]
+ add r12, r7, r3, asr #11 /* r12 = o0 */
+ sub r7, r7, r3, asr #11 /* r7 = o3 */
+ add r3, r2, r14, asr #11 /* r3 = o1 */
+ sub r2, r2, r14, asr #11 /* r2 = o2 */
+ strh r12, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
@@ -228,9 +231,8 @@ jpeg_idct4v:
ldmia sp!, { r4-r8, pc }
#else
stmdb sp!, { r4-r10, lr }
- ldr r2, =1024
- ldr r3, =4433
- ldr r12, =3302955134
+ ldrd r2, .Lpool4
+ mov r12, #1024
1:
ldr r6, [r0, #32]
ldr r4, [r0]
@@ -247,12 +249,12 @@ jpeg_idct4v:
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
- smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
- smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
- smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
- smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
- smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
- smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
+ smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
+ smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
+ smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
+ smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
+ smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
+ smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
@@ -276,6 +278,17 @@ jpeg_idct4v:
#endif
.size jpeg_idct4v, .-jpeg_idct4v
+#if ARM_ARCH > 4
+ .align 4 /* table is fetched with ldrd by jpeg_idct4v/jpeg_idct4h; NOTE(review): GNU as on ARM presumably reads this as 2^4 = 16-byte alignment, more than a doubleword load needs - confirm */
+.Lpool4: /* four 16-bit IDCT constants: one ldrd fills a register pair, ldrsh .Lpool4+2 fetches 4112 alone */
+ .short -15137 /* halfword 0: z3 multiplier in tmp0 = z1 - z3 * 15137; consumers read it as the bottom half of the first ldrd register */
+ .short 4112 /* halfword 1: bias added to d0 (r4, lsr #16 in jpeg_idct4h; ldrsh .Lpool4+2 in jpeg_idct2h) */
+ .short 4433 /* halfword 2: z1 multiplier, z1 = (d1 + d3) * 4433; bottom half of the second ldrd register */
+ .short 6270 /* halfword 3: z2 multiplier in tmp2 = z1 + z2 * 6270; top half of the second ldrd register */
+
+ .align 2 /* NOTE(review): presumably restores word alignment for the code that follows - confirm */
+#endif
+
jpeg_idct4h:
#if ARM_ARCH < 5
stmdb sp!, { r4-r10, lr }
@@ -328,88 +341,85 @@ jpeg_idct4h:
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r10, pc }
-#elif ARM_ARCH < 6
- stmdb sp!, { r4-r10, lr }
- ldr r10, =4433
- ldr r14, =4112
- ldr r12, =3302955134
+#elif ARM_ARCH < 6 || 1
+ stmdb sp!, { r4-r9, lr }
+ ldrd r4, .Lpool4
1:
ldrsh r7, [r0, #6]
- ldrsh r5, [r0, #2]
- ldrsh r4, [r0]
+ ldrsh r14, [r0, #2]
+ ldrsh r12, [r0]
ldrsh r6, [r0, #4]
- add r8, r5, r7 /* r8 = z1 = d1 + d3 */
- add r4, r4, r14
- smulbb r8, r10, r8 /* z1 *= 4433 */
- add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
- smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
- smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
- sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
- add r6, r5, r9, lsl #13 /* r6 = o0 */
- rsb r9, r5, r9, lsl #13 /* r9 = o3 */
- add r5, r7, r4, lsl #13 /* r5 = o1 */
- rsb r4, r7, r4, lsl #13 /* r4 = o2 */
+ add r8, r14, r7 /* r8 = z1 = d1 + d3 */
+ add r12, r12, r4, lsr #16
+ smulbb r8, r5, r8 /* z1 *= 4433 */
+ add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
+ smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
+ smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
+ sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
+ add r6, r14, r9, lsl #13 /* r6 = o0 */
+ rsb r9, r14, r9, lsl #13 /* r9 = o3 */
+ add r14, r7, r12, lsl #13 /* r14= o1 */
+ rsb r12, r7, r12, lsl #13 /* r12= o2 */
mov r6, r6, asr #18
- mov r5, r5, asr #18
- mov r4, r4, asr #18
+ mov r14, r14, asr #18
+ mov r12, r12, asr #18
mov r9, r9, asr #18
cmp r6, #255
mvnhi r6, r6, asr #31
- cmp r5, #255
- mvnhi r5, r5, asr #31
- cmp r4, #255
- mvnhi r4, r4, asr #31
+ cmp r14, #255
+ mvnhi r14, r14, asr #31
+ cmp r12, #255
+ mvnhi r12, r12, asr #31
cmp r9, #255
mvnhi r9, r9, asr #31
#ifdef HAVE_LCD_COLOR
strb r6, [r1]
- strb r5, [r1, #4]
- strb r4, [r1, #8]
+ strb r14, [r1, #4]
+ strb r12, [r1, #8]
strb r9, [r1, #12]
#else
strb r6, [r1]
- strb r5, [r1, #1]
- strb r4, [r1, #2]
+ strb r14, [r1, #1]
+ strb r12, [r1, #2]
strb r9, [r1, #3]
#endif
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
- ldmia sp!, { r4-r10, pc }
+ ldmia sp!, { r4-r9, pc }
#else
stmdb sp!, { r4-r9, lr }
- ldr r9, =4433
- ldr r14, =4112
- ldr r12, =3302955134
+ ldrd r4, .Lpool4
+ mov r9, r4, lsr #16
1:
- ldmia r0, { r4-r5 }
- sadd16 r4, r4, r14
- sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
- ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
- smulbt r8, r9, r6
+ ldmia r0, { r12, r14 }
+ sadd16 r12, r12, r9
+ sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
+ ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
+ smulbt r8, r5, r6
sxth r6, r6
- smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
- smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
+ smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
+ smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
sxth r7, r7
- add r8, r4, r6, lsl #13 /* r8 = o0 */
- rsb r6, r4, r6, lsl #13 /* r6 = o3 */
- add r4, r5, r7, lsl #13 /* r4 = o1 */
- rsb r5, r5, r7, lsl #13 /* r5 = o2 */
+ add r8, r12, r6, lsl #13 /* r8 = o0 */
+ rsb r6, r12, r6, lsl #13 /* r6 = o3 */
+ add r12, r14, r7, lsl #13 /* r12= o1 */
+ rsb r14, r14, r7, lsl #13 /* r14= o2 */
usat r8, #8, r8, asr #18
usat r6, #8, r6, asr #18
- usat r4, #8, r4, asr #18
- usat r5, #8, r5, asr #18
+ usat r12, #8, r12, asr #18
+ usat r14, #8, r14, asr #18
#ifdef HAVE_LCD_COLOR
strb r8, [r1]
strb r6, [r1, #12]
- strb r4, [r1, #4]
- strb r5, [r1, #8]
+ strb r12, [r1, #4]
+ strb r14, [r1, #8]
#else
strb r8, [r1]
strb r6, [r1, #3]
- strb r4, [r1, #1]
- strb r5, [r1, #2]
+ strb r12, [r1, #1]
+ strb r14, [r1, #2]
#endif
add r0, r0, #16
add r1, r1, r3
@@ -450,7 +460,7 @@ jpeg_idct8v:
mov r11, r11, asr #16 /* r11 = z3 = d6 */
add r8, r8, #8192
add r9, r10, r11
- mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
+ mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
ldr r14, =6270
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */