summary | refs | log | tree | commit | diff | stats
path: root/apps
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2008-10-03 12:30:18 +0000
committer Jens Arnold <amiconn@rockbox.org> 2008-10-03 12:30:18 +0000
commit d456460707f79ec48d08baf5d8f28c88c9641e64 (patch)
tree a922a1829e3a90886f4eacc698f92c6b749dcd93 /apps
parent 7fc446263f99aad5f0b2f9f674fde02e6eac4d5c (diff)
download rockbox-d456460707f79ec48d08baf5d8f28c88c9641e64.tar.gz
rockbox-d456460707f79ec48d08baf5d8f28c88c9641e64.tar.bz2
rockbox-d456460707f79ec48d08baf5d8f28c88c9641e64.zip
Further speedup for ARMv6 by better pipelining in scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18697 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- apps/codecs/demac/libdemac/vector_math16_armv6.h 80
1 files changed, 53 insertions, 27 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
index e963e10ff0..bf50d9cabd 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv6.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -217,54 +217,80 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
"beq 20f \n"
"10: \n"
- "ldrh r4, [%[v2]], #2 \n"
- "mov r4, r4, lsl #16 \n"
+ "ldrh r2, [%[v2]], #2 \n"
+ "ldr r0, [%[v1]], #4 \n"
+ "ldr r3, [%[v2]], #4 \n"
+ "mov r2, r2, lsl #16 \n"
"1: \n"
- "ldmia %[v1]!, {r0-r3} \n"
- "ldmia %[v2]!, {r5-r8} \n"
+ "ldr r1, [%[v1]], #4 \n"
+ "smlabt %[res], r0, r2, %[res] \n"
+ "ldr r4, [%[v2]], #4 \n"
+ "smlatb %[res], r0, r3, %[res] \n"
+ "ldr r0, [%[v1]], #4 \n"
+ "smlabt %[res], r1, r3, %[res] \n"
+ "ldr r5, [%[v2]], #4 \n"
+ "smlatb %[res], r1, r4, %[res] \n"
+ "ldr r1, [%[v1]], #4 \n"
"smlabt %[res], r0, r4, %[res] \n"
+ "ldr r6, [%[v2]], #4 \n"
"smlatb %[res], r0, r5, %[res] \n"
+ "ldr r0, [%[v1]], #4 \n"
"smlabt %[res], r1, r5, %[res] \n"
+ "ldr r3, [%[v2]], #4 \n"
"smlatb %[res], r1, r6, %[res] \n"
- "smlabt %[res], r2, r6, %[res] \n"
- "smlatb %[res], r2, r7, %[res] \n"
- "smlabt %[res], r3, r7, %[res] \n"
- "smlatb %[res], r3, r8, %[res] \n"
- "mov r4, r8 \n"
- "ldmia %[v1]!, {r0-r3} \n"
- "ldmia %[v2]!, {r5-r8} \n"
+ "mov r2, r6 \n"
+ "ldr r1, [%[v1]], #4 \n"
+ "smlabt %[res], r0, r2, %[res] \n"
+ "ldr r4, [%[v2]], #4 \n"
+ "smlatb %[res], r0, r3, %[res] \n"
+ "ldr r0, [%[v1]], #4 \n"
+ "smlabt %[res], r1, r3, %[res] \n"
+ "ldr r5, [%[v2]], #4 \n"
+ "smlatb %[res], r1, r4, %[res] \n"
+ "ldr r1, [%[v1]], #4 \n"
"smlabt %[res], r0, r4, %[res] \n"
+ "ldr r6, [%[v2]], #4 \n"
"smlatb %[res], r0, r5, %[res] \n"
- "smlabt %[res], r1, r5, %[res] \n"
- "smlatb %[res], r1, r6, %[res] \n"
- "smlabt %[res], r2, r6, %[res] \n"
- "smlatb %[res], r2, r7, %[res] \n"
- "smlabt %[res], r3, r7, %[res] \n"
- "smlatb %[res], r3, r8, %[res] \n"
#if ORDER > 16
- "mov r4, r8 \n"
"subs %[cnt], %[cnt], #1 \n"
+ "ldrne r0, [%[v1]], #4 \n"
+ "smlabt %[res], r1, r5, %[res] \n"
+ "ldrne r3, [%[v2]], #4 \n"
+ "smlatb %[res], r1, r6, %[res] \n"
+ "mov r2, r6 \n"
"bne 1b \n"
+#else
+ "smlabt %[res], r1, r5, %[res] \n"
+ "smlatb %[res], r1, r6, %[res] \n"
#endif
"b 99f \n"
"20: \n"
+ "ldmia %[v1]!, {r0-r1} \n"
+ "ldmia %[v2]!, {r4-r5} \n"
"1: \n"
- "ldmia %[v1]!, {r0-r3} \n"
- "ldmia %[v2]!, {r4-r7} \n"
+ "ldmia %[v1]!, {r2-r3} \n"
"smlad %[res], r0, r4, %[res] \n"
+ "ldmia %[v2]!, {r6-r7} \n"
"smlad %[res], r1, r5, %[res] \n"
+ "ldmia %[v1]!, {r0-r1} \n"
"smlad %[res], r2, r6, %[res] \n"
+ "ldmia %[v2]!, {r4-r5} \n"
"smlad %[res], r3, r7, %[res] \n"
- "ldmia %[v1]!, {r0-r3} \n"
- "ldmia %[v2]!, {r4-r7} \n"
+ "ldmia %[v1]!, {r2-r3} \n"
"smlad %[res], r0, r4, %[res] \n"
+ "ldmia %[v2]!, {r6-r7} \n"
"smlad %[res], r1, r5, %[res] \n"
- "smlad %[res], r2, r6, %[res] \n"
- "smlad %[res], r3, r7, %[res] \n"
#if ORDER > 16
"subs %[cnt], %[cnt], #1 \n"
- "bne 1b \n"
+ "ldmneia %[v1]!, {r0-r1} \n"
+ "smlad %[res], r2, r6, %[res] \n"
+ "ldmneia %[v2]!, {r4-r5} \n"
+ "smlad %[res], r3, r7, %[res] \n"
+ "bne 1b \n"
+#else
+ "smlad %[res], r2, r6, %[res] \n"
+ "smlad %[res], r3, r7, %[res] \n"
#endif
"99: \n"
@@ -277,8 +303,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
[res]"+r"(res)
: /* inputs */
: /* clobbers */
- "r0", "r1", "r2", "r3", "r4",
- "r5", "r6", "r7", "r8"
+ "r0", "r1", "r2", "r3",
+ "r4", "r5", "r6", "r7"
);
return res;
}