author     Jens Arnold <amiconn@rockbox.org>   2007-10-25 18:58:16 +0000
committer  Jens Arnold <amiconn@rockbox.org>   2007-10-25 18:58:16 +0000
commit     35f23267bfc97d070284a03e4adaa2c6b7bb6852 (patch)
tree       c42fe719f16e68512b0575bfa581105cfa8170bc
parent     3ea3caf34165ddc8114ecf3cd39ed0016192b1d7 (diff)
Further optimised the filter vector math assembly for coldfire, and added
assembly filter vector math for ARM. Both make use of the fact that the first
argument of the vector functions is longword aligned.

* The ARM version is tailored for ARM7TDMI, and would slow down ARM9 or
  higher. Introduced a new CPU_ macro for ARM7TDMI.

Speedup for coldfire: -c3000 104%->109%, -c4000 43%->46%, -c5000 1.7%->2.0%.
Speedup for PP502x:   -c2000 66%->75%, -c3000 37%->48%, -c4000 11%->18%,
                      -c5000 2.5%->3.7%
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15302 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--  apps/codecs/demac/libdemac/filter.c              |   2
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_arm7.h  | 293
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h    | 230
3 files changed, 485 insertions(+), 40 deletions(-)
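
For reference, each routine touched below operates element-wise on int16_t
vectors of length ORDER. A minimal plain-C sketch of what they compute (this
is the role filled by the generic vector_math16.h fallback; its exact contents
are assumed here, only the semantics are taken from the assembly in this diff):

    /* Plain-C reference sketch, not the shipped generic implementation. */
    static inline void vector_add(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)
            v1[i] += v2[i];            /* element-wise 16-bit add */
    }

    static inline void vector_sub(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)
            v1[i] -= v2[i];            /* element-wise 16-bit subtract */
    }

    static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < ORDER; i++)
            res += v1[i] * v2[i];      /* 16x16 -> 32 bit multiply-accumulate */
        return res;
    }
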
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index e85e42fb00..92d86edd7d 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -31,6 +31,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#ifdef CPU_COLDFIRE
#include "vector_math16_cf.h"
+#elif defined CPU_ARM7TDMI
+#include "vector_math16_arm7.h"
#else
#include "vector_math16.h"
#endif
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
new file mode 100644
index 0000000000..1565ca9602
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_arm7.h
@@ -0,0 +1,293 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+ARM7 vector math copyright (C) 2007 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_add(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+ int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(sum, s1) /* Adds register */ \
+ "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \
+ "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \
+ "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \
+ "mov " #sum ", " #sum ", lsl #16 \n" \
+ "orr " #sum ", " #sum ", r8 , lsr #16 \n"
+
+#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \
+ "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \
+ "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
+ "mov " #sum ", " #sum ", lsl #16 \n" \
+ "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
+
+ asm volatile (
+ "tst %[v2], #2 \n"
+ "beq 20f \n"
+
+ "10: \n"
+ "ldrh r4, [%[v2]], #2 \n"
+ "mov r4, r4, lsl #16 \n"
+ "1: \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r5-r8} \n"
+ ADDHALFXREGS(r0, r4, r5)
+ ADDHALFXREGS(r1, r5, r6)
+ ADDHALFXREGS(r2, r6, r7)
+ ADDHALFXREGS(r3, r7, r8)
+ "stmia %[v1]!, {r0-r3} \n"
+ "mov r4, r8 \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r5-r8} \n"
+ ADDHALFXREGS(r0, r4, r5)
+ ADDHALFXREGS(r1, r5, r6)
+ ADDHALFXREGS(r2, r6, r7)
+ ADDHALFXREGS(r3, r7, r8)
+ "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+ "mov r4, r8 \n"
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+ "b 99f \n"
+
+ "20: \n"
+ "1: \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ ADDHALFREGS(r0, r4)
+ ADDHALFREGS(r1, r5)
+ ADDHALFREGS(r2, r6)
+ ADDHALFREGS(r3, r7)
+ "stmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ ADDHALFREGS(r0, r4)
+ ADDHALFREGS(r1, r5)
+ ADDHALFREGS(r2, r6)
+ ADDHALFREGS(r3, r7)
+ "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+
+ "99: \n"
+ : /* outputs */
+#if ORDER > 16
+ [cnt]"+r"(cnt),
+#endif
+ [v1] "+r"(v1),
+ [v2] "+r"(v2)
+ : /* inputs */
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4",
+ "r5", "r6", "r7", "r8", "memory"
+ );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_sub(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+ int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(dif, s1) /* Subtracts register */ \
+ "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \
+ "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \
+ "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \
+ "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \
+ "orr " #dif ", r8 , " #dif ", lsl #16 \n"
+
+#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \
+ "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \
+ "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \
+ "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \
+ "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"
+
+ asm volatile (
+ "mov r9, #0xff \n"
+ "orr r9, r9, #0xff00 \n"
+ "tst %[v2], #2 \n"
+ "beq 20f \n"
+
+ "10: \n"
+ "ldrh r4, [%[v2]], #2 \n"
+ "mov r4, r4, lsl #16 \n"
+ "1: \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r5-r8} \n"
+ SUBHALFXREGS(r0, r4, r5)
+ SUBHALFXREGS(r1, r5, r6)
+ SUBHALFXREGS(r2, r6, r7)
+ SUBHALFXREGS(r3, r7, r8)
+ "stmia %[v1]!, {r0-r3} \n"
+ "mov r4, r8 \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r5-r8} \n"
+ SUBHALFXREGS(r0, r4, r5)
+ SUBHALFXREGS(r1, r5, r6)
+ SUBHALFXREGS(r2, r6, r7)
+ SUBHALFXREGS(r3, r7, r8)
+ "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+ "mov r4, r8 \n"
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+ "b 99f \n"
+
+ "20: \n"
+ "1: \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ SUBHALFREGS(r0, r4)
+ SUBHALFREGS(r1, r5)
+ SUBHALFREGS(r2, r6)
+ SUBHALFREGS(r3, r7)
+ "stmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ SUBHALFREGS(r0, r4)
+ SUBHALFREGS(r1, r5)
+ SUBHALFREGS(r2, r6)
+ SUBHALFREGS(r3, r7)
+ "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+
+ "99: \n"
+ : /* outputs */
+#if ORDER > 16
+ [cnt]"+r"(cnt),
+#endif
+ [v1] "+r"(v1),
+ [v2] "+r"(v2)
+ : /* inputs */
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4", "r5",
+ "r6", "r7", "r8", "r9", "memory"
+ );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). It is optimised
+ * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
+ * than the C version. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+ int res = 0;
+#if ORDER > 16
+ int cnt = ORDER>>4;
+#endif
+
+#define MLABLOCK2(f1, f2) \
+ "mov r8, " #f1 ", lsl #16 \n" \
+ "mov r8, r8 , asr #16 \n" \
+ "mov r9, " #f2 ", lsl #16 \n" \
+ "mov r9, r9 , asr #16 \n" \
+ "mla %[res], r8, r9, %[res] \n" \
+ "mov r8, " #f1 ", asr #16 \n" \
+ "mov r9, " #f2 ", asr #16 \n" \
+ "mla %[res], r8, r9, %[res] \n"
+
+#define MLABLOCK2_U2(f1, f2) \
+ "mov r8, " #f1 ", lsl #16 \n" \
+ "mov r8, r8 , asr #16 \n" \
+ "mla %[res], r8, r9, %[res] \n" \
+ "mov r8, " #f1 ", asr #16 \n" \
+ "mov r9, " #f2 ", lsl #16 \n" \
+ "mov r9, r9 , asr #16 \n" \
+ "mla %[res], r8, r9, %[res] \n" \
+ "mov r9, " #f2 ", asr #16 \n"
+
+ asm volatile (
+ "tst %[v2], #2 \n"
+ "beq 20f \n"
+
+ "10: \n"
+ "ldrsh r9, [%[v2]], #2 \n"
+ "1: \n"
+ "ldmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ MLABLOCK2_U2(r0, r4)
+ MLABLOCK2_U2(r1, r5)
+ MLABLOCK2_U2(r2, r6)
+ MLABLOCK2_U2(r3, r7)
+ "ldmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ MLABLOCK2_U2(r0, r4)
+ MLABLOCK2_U2(r1, r5)
+ MLABLOCK2_U2(r2, r6)
+ MLABLOCK2_U2(r3, r7)
+#if ORDER > 16
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+ "b 99f \n"
+
+ "20: \n"
+ "1: \n"
+ "ldmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ MLABLOCK2(r0, r4)
+ MLABLOCK2(r1, r5)
+ MLABLOCK2(r2, r6)
+ MLABLOCK2(r3, r7)
+ "ldmia %[v1]!, {r0-r3} \n"
+ "ldmia %[v2]!, {r4-r7} \n"
+ MLABLOCK2(r0, r4)
+ MLABLOCK2(r1, r5)
+ MLABLOCK2(r2, r6)
+ MLABLOCK2(r3, r7)
+#if ORDER > 16
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
+
+ "99: \n"
+ : /* outputs */
+#if ORDER > 16
+ [cnt]"+r"(cnt),
+#endif
+ [v1] "+r"(v1),
+ [v2] "+r"(v2),
+ [res]"+r"(res)
+ : /* inputs */
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4",
+ "r5", "r6", "r7", "r8", "r9"
+ );
+ return res;
+}
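
The 32-bit alignment requirement documented in the comments above is the
caller's responsibility; the routines neither check nor fix up v1 at run time.
A hedged sketch of how a caller could guarantee it (the buffer name and the
explicit attribute are illustrative, not taken from filter.c):

    /* Illustrative only: force 32-bit alignment of an int16_t buffer that
     * will be passed as v1 to vector_add/vector_sub/scalarproduct. */
    static int16_t filter_coeffs[ORDER] __attribute__((aligned(4)));
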
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 937462c293..0c3aaca223 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
-#define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \
- "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \
- "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \
- "clr.w %%d4 \n" \
- "add.l %%d4 , " #sum "\n" \
+#if ORDER > 16
+ int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \
+ "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \
+ "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \
+ "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \
+ "add.l %%d4 , " #sum "\n" \
+ "move.w " #s1 ", " #sum "\n"
+
+#define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \
+ "clr.w " #sum " \n" /* Needs 'sum' pre-swapped, swaps */ \
+ "add.l " #s1 ", " #sum "\n" /* 's2', and clobbers 's1'. */ \
+ "swap " #s2 " \n" /* 's1' can be an A or D reg. */ \
+ "add.l " #s2 ", " #s1 "\n" /* 'sum' and 's2' must be D regs. */ \
"move.w " #s1 ", " #sum "\n"
asm volatile (
-#if ORDER > 16
- "moveq.l %[cnt], %%d5 \n"
+ "move.l %[v2], %%d0 \n"
+ "and.l #2, %%d0 \n"
+ "jeq 20f \n"
+
+ "10: \n"
+ "move.w (%[v2])+, %%d0 \n"
+ "swap %%d0 \n"
"1: \n"
+ "movem.l (%[v1]), %%a0-%%a3 \n"
+ "movem.l (%[v2]), %%d1-%%d4 \n"
+ ADDHALFXREGS(%%a0, %%d1, %%d0)
+ "move.l %%d0, (%[v1])+ \n"
+ ADDHALFXREGS(%%a1, %%d2, %%d1)
+ "move.l %%d1, (%[v1])+ \n"
+ ADDHALFXREGS(%%a2, %%d3, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFXREGS(%%a3, %%d4, %%d3)
+ "move.l %%d3, (%[v1])+ \n"
+ "lea.l (16, %[v2]), %[v2] \n"
+ "move.l %%d4, %%d0 \n"
+
+ "movem.l (%[v1]), %%a0-%%a3 \n"
+ "movem.l (%[v2]), %%d1-%%d4 \n"
+ ADDHALFXREGS(%%a0, %%d1, %%d0)
+ "move.l %%d0, (%[v1])+ \n"
+ ADDHALFXREGS(%%a1, %%d2, %%d1)
+ "move.l %%d1, (%[v1])+ \n"
+ ADDHALFXREGS(%%a2, %%d3, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFXREGS(%%a3, %%d4, %%d3)
+ "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+ "lea.l (16, %[v2]), %[v2] \n"
+ "move.l %%d4, %%d0 \n"
+
+ "subq.l #1, %[cnt] \n"
+ "jne 1b \n"
#endif
+ "jra 99f \n"
+
+ "20: \n"
+ "1: \n"
"movem.l (%[v2]), %%a0-%%a3 \n"
"movem.l (%[v1]), %%d0-%%d3 \n"
ADDHALFREGS(%%a0, %%d0)
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
"move.l %%d2, (%[v1])+ \n"
ADDHALFREGS(%%a3, %%d3)
"move.l %%d3, (%[v1])+ \n"
-
"lea.l (16, %[v2]), %[v2] \n"
"movem.l (%[v2]), %%a0-%%a3 \n"
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
#if ORDER > 16
"lea.l (16, %[v2]), %[v2] \n"
- "subq.l #1, %%d5 \n"
- "bne.w 1b \n"
+ "subq.l #1, %[cnt] \n"
+ "jne 1b \n"
#endif
+ "99: \n"
: /* outputs */
- [v1]"+a"(v1),
- [v2]"+a"(v2)
+#if ORDER > 16
+ [cnt]"+d"(cnt),
+#endif
+ [v1] "+a"(v1),
+ [v2] "+a"(v2)
: /* inputs */
- [cnt]"n"(ORDER>>4)
: /* clobbers */
- "d0", "d1", "d2", "d3", "d4", "d5",
+ "d0", "d1", "d2", "d3", "d4",
"a0", "a1", "a2", "a3", "memory"
);
}
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
-#define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \
- "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \
- "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \
- "clr.w " #sub "\n" \
- "sub.l " #sub ", " #dif "\n" \
+#if ORDER > 16
+ int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(min, sub, dif) /* Subtract register halves straight. */ \
+ "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */ \
+ "sub.l " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \
+ "clr.w " #sub "\n" /* 'min' and 'sub' are clobbered! */ \
+ "sub.l " #sub ", " #dif "\n" \
"move.w " #min ", " #dif "\n"
+
+#define SUBHALFXREGS(min, s2, s1d) /* Subtract register halves across. */ \
+ "clr.w " #s1d "\n" /* Needs 's1d' pre-swapped, swaps */ \
+ "sub.l " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */ \
+ "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */ \
+ "swap " #s2 "\n" /* 's2' and 's1d' must be D regs. */ \
+ "sub.l " #s2 ", " #min "\n" \
+ "move.w " #min ", " #s1d "\n"
asm volatile (
-#if ORDER > 16
- "moveq.l %[cnt], %%d5 \n"
+ "move.l %[v2], %%d0 \n"
+ "and.l #2, %%d0 \n"
+ "jeq 20f \n"
+
+ "10: \n"
+ "move.w (%[v2])+, %%d0 \n"
+ "swap %%d0 \n"
"1: \n"
+ "movem.l (%[v2]), %%d1-%%d4 \n"
+ "movem.l (%[v1]), %%a0-%%a3 \n"
+ SUBHALFXREGS(%%a0, %%d1, %%d0)
+ "move.l %%d0, (%[v1])+ \n"
+ SUBHALFXREGS(%%a1, %%d2, %%d1)
+ "move.l %%d1, (%[v1])+ \n"
+ SUBHALFXREGS(%%a2, %%d3, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFXREGS(%%a3, %%d4, %%d3)
+ "move.l %%d3, (%[v1])+ \n"
+ "lea.l (16, %[v2]), %[v2] \n"
+ "move.l %%d4, %%d0 \n"
+
+ "movem.l (%[v2]), %%d1-%%d4 \n"
+ "movem.l (%[v1]), %%a0-%%a3 \n"
+ SUBHALFXREGS(%%a0, %%d1, %%d0)
+ "move.l %%d0, (%[v1])+ \n"
+ SUBHALFXREGS(%%a1, %%d2, %%d1)
+ "move.l %%d1, (%[v1])+ \n"
+ SUBHALFXREGS(%%a2, %%d3, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFXREGS(%%a3, %%d4, %%d3)
+ "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+ "lea.l (16, %[v2]), %[v2] \n"
+ "move.l %%d4, %%d0 \n"
+
+ "subq.l #1, %[cnt] \n"
+ "bne.w 1b \n"
#endif
+ "jra 99f \n"
+
+ "20: \n"
+ "1: \n"
"movem.l (%[v2]), %%d1-%%d4 \n"
"movem.l (%[v1]), %%a0-%%a3 \n"
SUBHALFREGS(%%a0, %%d1, %%d0)
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
#if ORDER > 16
"lea.l (16, %[v2]), %[v2] \n"
- "subq.l #1, %%d5 \n"
+ "subq.l #1, %[cnt] \n"
"bne.w 1b \n"
#endif
+
+ "99: \n"
: /* outputs */
- [v1]"+a"(v1),
- [v2]"+a"(v2)
+#if ORDER > 16
+ [cnt]"+d"(cnt),
+#endif
+ [v1] "+a"(v1),
+ [v2] "+a"(v2)
: /* inputs */
- [cnt]"n"(ORDER>>4)
: /* clobbers */
- "d0", "d1", "d2", "d3", "d4", "d5",
+ "d0", "d1", "d2", "d3", "d4",
"a0", "a1", "a2", "a3", "memory"
);
}
#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
-/* Needs EMAC in signed integer mode! */
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
+ * in signed integer mode - call above macro before use. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
int res = 0;
+#if ORDER > 32
+ int cnt = ORDER>>5;
+#endif
#define MACBLOCK4 \
"mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
- "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \
- "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \
- "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+ "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+ "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+
+#define MACBLOCK4_U2 \
+ "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+ "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+ "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+ "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
asm volatile (
+ "move.l %[v2], %%d0 \n"
+ "and.l #2, %%d0 \n"
+ "jeq 20f \n"
+
+ "10: \n"
+ "move.l (%[v1])+, %%d0 \n"
+ "move.w (%[v2])+, %%d1 \n"
+ "1: \n"
+#if ORDER > 16
+ MACBLOCK4_U2
+ MACBLOCK4_U2
+ MACBLOCK4_U2
+ MACBLOCK4_U2
+#endif
+ MACBLOCK4_U2
+ MACBLOCK4_U2
+ MACBLOCK4_U2
+ "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 32
- "moveq.l %[cnt], %[res] \n"
+ "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+ "subq.l #1, %[res] \n"
+ "bne.w 1b \n"
+#else
+ "mac.w %%d0l, %%d1u, %%acc0 \n"
#endif
+ "jra 99f \n"
+
+ "20: \n"
"move.l (%[v1])+, %%d0 \n"
"move.l (%[v2])+, %%d1 \n"
"1: \n"
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
MACBLOCK4
MACBLOCK4
"mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
- "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n"
+ "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 32
- "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n"
- "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
-
+ "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+ "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
"subq.l #1, %[res] \n"
"bne.w 1b \n"
#else
- "mac.w %%d2u, %%d3u, %%acc0 \n"
- "mac.w %%d2l, %%d3l, %%acc0 \n"
+ "mac.w %%d2u, %%d1u, %%acc0 \n"
+ "mac.w %%d2l, %%d1l, %%acc0 \n"
#endif
+
+ "99: \n"
"movclr.l %%acc0, %[res] \n"
: /* outputs */
[v1]"+a"(v1),
[v2]"+a"(v2),
- [res]"=&d"(res)
+ [res]"=d"(res)
: /* inputs */
- [cnt]"n"(ORDER>>5)
+#if ORDER > 32
+ [cnt]"[res]"(cnt)
+#endif
: /* clobbers */
- "d0", "d1", "d2", "d3"
+ "d0", "d1", "d2"
);
return res;
}
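
Since the coldfire scalarproduct() relies on the EMAC being in signed integer
mode, callers on coldfire are expected to run PREPARE_SCALARPRODUCT first, as
the comment above notes. A minimal usage sketch (variable names are
illustrative, not taken from filter.c):

    /* Illustrative only: switch the EMAC to signed integer mode once,
     * then accumulate the dot product of the two int16_t vectors. */
    int32_t sum;
    PREPARE_SCALARPRODUCT
    sum = scalarproduct(filter_coeffs, delay_line);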