summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2008-11-24 23:09:09 +0000
committer Jens Arnold <amiconn@rockbox.org> 2008-11-24 23:09:09 +0000
commit d7e4e54bcb03b19411caa1329795c3a1db608478 (patch)
tree bb10bb9f4e278126b2de4ed550746dff68a730fe
parent 7f521d3b1fb69f91608f27705f82e69318bab932 (diff)
download rockbox-d7e4e54bcb03b19411caa1329795c3a1db608478.tar.gz
rockbox-d7e4e54bcb03b19411caa1329795c3a1db608478.tar.bz2
rockbox-d7e4e54bcb03b19411caa1329795c3a1db608478.zip
Reorder instructions to avoid pipeline stalls on ARMv6 wherever possible (sometimes using different registers to allow this). Speeds up the predictor by almost 20% on ARMv6 (overall speedup for -c1000 is 5%), and might also help a bit on ARMv5. ARMv4 speed is unaffected.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19210 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/demac/libdemac/predictor-arm.S 164
1 files changed, 80 insertions, 84 deletions
diff --git a/apps/codecs/demac/libdemac/predictor-arm.S b/apps/codecs/demac/libdemac/predictor-arm.S
index d62b6186f8..ca8a3f4736 100644
--- a/apps/codecs/demac/libdemac/predictor-arm.S
+++ b/apps/codecs/demac/libdemac/predictor-arm.S
@@ -89,29 +89,29 @@ loop:
@ Predictor Y, Filter A
ldr r10, [r12, #YlastA] @ r10 := p->YlastA
- add r11, r14, #YDELAYA-12 @ r11 := &p->buf[YDELAYA-3]
+ add r11, r14, #YDELAYA-12 @ r11 := &p->buf[YDELAYA-3]
ldmia r11, { r2 - r4 } @ r2 := p->buf[YDELAYA-3]
@ r3 := p->buf[YDELAYA-2]
@ r4 := p->buf[YDELAYA-1]
- subs r4, r10, r4 @ r4 := r10 - r4
-
- add r1, r12, #YcoeffsA
- ldmia r1, {r6 - r9} @ r6 := p->YcoeffsA[0]
+ add r11, r12, #YcoeffsA
+ ldmia r11, {r6 - r9} @ r6 := p->YcoeffsA[0]
@ r7 := p->YcoeffsA[1]
@ r8 := p->YcoeffsA[2]
@ r9 := p->YcoeffsA[3]
+ subs r4, r10, r4 @ r4 := r10 - r4
+
+ add r11, r14, #YDELAYA-4 @ r11 := &p->buf[YDELAYA-1]
+ stmia r11, { r4, r10 } @ p->buf[YDELAYA-1] = r4
+ @ p->buf[YDELAYA] = r10
+
mul r0, r10, r6 @ r0 := p->buf[YDELAYA] * p->YcoeffsA[0]
mla r0, r4, r7, r0 @ r0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
mla r0, r3, r8, r0 @ r0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
mla r0, r2, r9, r0 @ r0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
- add r11, r14, #YDELAYA-4
- stmia r11, { r4, r10 } @ p->buf[YDELAYA-1] = r4
- @ p->buf[YDELAYA] = r10
-
@ flags were set above, in the subs instruction
mvngt r4, #0
movlt r4, #1 @ r4 := SIGN(r4) (see .c for SIGN macro)
@@ -128,23 +128,20 @@ loop:
@ Predictor Y, Filter B
- add r2, r12, #YfilterB
- ldmia r2, {r2, r11} @ r2 := p->YfilterB
- @ r11 := p->XfilterA
-
- rsb r2, r2, r2, lsl #5 @ r2 := r2 * 32 - r2 ( == r2*31)
- sub r10, r11, r2, asr #5 @ r10 (p->buf[YDELAYB]) := r11 - (r2 >> 5)
-
- str r11, [r12, #YfilterB] @ p->YfilterB := r11 (p->XfilterA)
+ add r11, r12, #YfilterB
+ ldmia r11, {r6, r7} @ r6 := p->YfilterB
+ @ r7 := p->XfilterA
add r11, r14, #YDELAYB-16 @ r11 := &p->buf[YDELAYB-4]
-
ldmia r11, { r2 - r5 } @ r2 := p->buf[YDELAYB-4]
@ r3 := p->buf[YDELAYB-3]
@ r4 := p->buf[YDELAYB-2]
@ r5 := p->buf[YDELAYB-1]
- subs r5, r10, r5 @ r5 := r10 - r5
+ rsb r6, r6, r6, lsl #5 @ r6 := r6 * 32 - r6 ( == r6*31)
+ sub r10, r7, r6, asr #5 @ r10 (p->buf[YDELAYB]) := r7 - (r6 >> 5)
+
+ str r7, [r12, #YfilterB] @ p->YfilterB := r7 (p->XfilterA)
add r1, r12, #YcoeffsB
ldmia r1, {r6,r7,r8,r9,r11} @ r6 := p->YcoeffsB[0]
@@ -153,16 +150,18 @@ loop:
@ r9 := p->YcoeffsB[3]
@ r11 := p->YcoeffsB[4]
+ subs r5, r10, r5 @ r5 := r10 - r5
+
+ add r1, r14, #YDELAYB-4 @ r1 := &p->buf[YDELAYB-1]
+ stmia r1, { r5, r10 } @ p->buf[YDELAYB-1] = r5
+ @ p->buf[YDELAYB] = r10
+
mul r1, r10, r6 @ r1 := p->buf[YDELAYB] * p->YcoeffsB[0]
mla r1, r5, r7, r1 @ r1 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
mla r1, r4, r8, r1 @ r1 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
mla r1, r3, r9, r1 @ r1 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
mla r1, r2, r11, r1 @ r1 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
- add r2, r14, #YDELAYB-4 @ r2 := &p->buf[YDELAYB-1]
- stmia r2, { r5, r10 } @ p->buf[YDELAYB-1] = r5
- @ p->buf[YDELAYB] = r10
-
@ flags were set above, in the subs instruction
mvngt r5, #0
movlt r5, #1 @ r5 := SIGN(r5) (see .c for SIGN macro)
@@ -182,12 +181,11 @@ loop:
ldr r2, [sp] @ r2 := decoded0
add r0, r0, r1, asr #1 @ r0 := r0 + (r1 >> 1)
+ ldr r4, [r12, #YfilterA] @ r4 := p->YfilterA
ldr r3, [r2] @ r3 := *decoded0
+ rsb r4, r4, r4, lsl #5 @ r4 := r4 * 32 - r4 ( == r4*31)
add r1, r3, r0, asr #10 @ r1 := r3 + (r0 >> 10)
str r1, [r12, #YlastA] @ p->YlastA := r1
-
- ldr r4, [r12, #YfilterA] @ r4 := p->YfilterA
- rsb r4, r4, r4, lsl #5 @ r4 := r4 * 32 - r4 ( == r4*31)
add r1, r1, r4, asr #5 @ r1 := r1 + (r4 >> 5)
str r1, [r12, #YfilterA] @ p->YfilterA := r1
@@ -198,10 +196,10 @@ loop:
@ r6, r7, r8, r9, r11 contain p->YcoeffsB[0..4]
@ r5, r10 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
- cmp r3, #0
- stmia r2!, {r1} @ *(decoded0++) := r1 (p->YfilterA)
+ str r1, [r2], #4 @ *(decoded0++) := r1 (p->YfilterA)
str r2, [sp] @ save decoded0
- beq 2f
+ cmp r3, #0
+ beq 3f
add r1, r14, #YADAPTCOEFFSB-16
ldmia r1, { r2, r3, r4 } @ r2 := p->buf[YADAPTCOEFFSB-4]
@@ -213,9 +211,9 @@ loop:
sub r6, r6, r10 @ r6 := p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
sub r7, r7, r5 @ r7 := p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
- sub r8, r8, r4 @ r8 := p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
- sub r9, r9, r3 @ r9 := p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
sub r11, r11, r2 @ r11 := p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
+ sub r9, r9, r3 @ r9 := p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
+ sub r8, r8, r4 @ r8 := p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
add r0, r12, #YcoeffsB
stmia r0, {r6,r7,r8,r9,r11} @ Save p->YcoeffsB[]
@@ -232,12 +230,11 @@ loop:
@ r8 := p->buf[YADAPTCOEFFSA-1]
@ r9 := p->buf[YADAPTCOEFFSA]
- sub r2, r2, r9 @ r2 := p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
- sub r3, r3, r8 @ r3 := p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
- sub r4, r4, r7 @ r4 := p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
sub r5, r5, r6 @ r5 := p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+ sub r4, r4, r7 @ r4 := p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+ sub r3, r3, r8 @ r3 := p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+ sub r2, r2, r9 @ r2 := p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
- stmia r1, {r2-r5} @ Save p->YcoeffsA
b 2f
@@ -245,9 +242,9 @@ loop:
add r6, r6, r10 @ r6 := p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
add r7, r7, r5 @ r7 := p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
- add r8, r8, r4 @ r8 := p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
- add r9, r9, r3 @ r9 := p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
add r11, r11, r2 @ r11 := p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
+ add r9, r9, r3 @ r9 := p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
+ add r8, r8, r4 @ r8 := p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
add r0, r12, #YcoeffsB
stmia r0, {r6,r7,r8,r9,r11} @ Save p->YcoeffsB[]
@@ -264,43 +261,44 @@ loop:
@ r8 := p->buf[YADAPTCOEFFSA-1]
@ r9 := p->buf[YADAPTCOEFFSA]
- add r2, r2, r9 @ r2 := p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
- add r3, r3, r8 @ r3 := p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
- add r4, r4, r7 @ r4 := p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
add r5, r5, r6 @ r5 := p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+ add r4, r4, r7 @ r4 := p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+ add r3, r3, r8 @ r3 := p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+ add r2, r2, r9 @ r2 := p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+2:
stmia r1, {r2-r5} @ Save p->YcoeffsA
-2:
+3:
@@@@@@@@@@@@@@@@@@@@@@@@@@@ PREDICTOR X
@ Predictor X, Filter A
ldr r10, [r12, #XlastA] @ r10 := p->XlastA
- add r11, r14, #XDELAYA-12 @ r11 := &p->buf[XDELAYA-3]
+ add r11, r14, #XDELAYA-12 @ r11 := &p->buf[XDELAYA-3]
ldmia r11, { r2 - r4 } @ r2 := p->buf[XDELAYA-3]
@ r3 := p->buf[XDELAYA-2]
@ r4 := p->buf[XDELAYA-1]
- subs r4, r10, r4 @ r4 := r10 - r4
-
- add r1, r12, #XcoeffsA
- ldmia r1, {r6 - r9} @ r6 := p->XcoeffsA[0]
+ add r11, r12, #XcoeffsA
+ ldmia r11, {r6 - r9} @ r6 := p->XcoeffsA[0]
@ r7 := p->XcoeffsA[1]
@ r8 := p->XcoeffsA[2]
@ r9 := p->XcoeffsA[3]
+ subs r4, r10, r4 @ r4 := r10 - r4
+
+ add r11, r14, #XDELAYA-4 @ r11 := &p->buf[XDELAYA-1]
+ stmia r11, { r4, r10 } @ p->buf[XDELAYA-1] = r4
+ @ p->buf[XDELAYA] = r10
+
mul r0, r10, r6 @ r0 := p->buf[XDELAYA] * p->XcoeffsA[0]
mla r0, r4, r7, r0 @ r0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
mla r0, r3, r8, r0 @ r0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
mla r0, r2, r9, r0 @ r0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
- add r11, r14, #XDELAYA-4
- stmia r11, { r4, r10 } @ p->buf[XDELAYA-1] = r4
- @ p->buf[XDELAYA] = r10
-
@ flags were set above, in the subs instruction
mvngt r4, #0
movlt r4, #1 @ r4 := SIGN(r4) (see .c for SIGN macro)
@@ -317,23 +315,20 @@ loop:
@ Predictor X, Filter B
- add r2, r12, #XfilterB
- ldmia r2, {r2, r11} @ r2 := p->XfilterB
- @ r11 := p->YfilterA
-
- rsb r2, r2, r2, lsl #5 @ r2 := r2 * 32 - r2 ( == r2*31)
- sub r10, r11, r2, asr #5 @ r10 (p->buf[XDELAYB]) := r11 - (r2 >> 5)
-
- str r11, [r12, #XfilterB] @ p->XfilterB := r11 (p->YfilterA)
+ add r11, r12, #XfilterB
+ ldmia r11, {r6, r7} @ r6 := p->XfilterB
+ @ r7 := p->YfilterA
add r11, r14, #XDELAYB-16 @ r11 := &p->buf[XDELAYB-4]
-
ldmia r11, { r2 - r5 } @ r2 := p->buf[XDELAYB-4]
@ r3 := p->buf[XDELAYB-3]
@ r4 := p->buf[XDELAYB-2]
@ r5 := p->buf[XDELAYB-1]
- subs r5, r10, r5 @ r5 := r10 - r5
+ rsb r6, r6, r6, lsl #5 @ r6 := r6 * 32 - r6 ( == r6*31)
+ sub r10, r7, r6, asr #5 @ r10 (p->buf[XDELAYB]) := r7 - (r6 >> 5)
+
+ str r7, [r12, #XfilterB] @ p->XfilterB := r7 (p->YfilterA)
add r1, r12, #XcoeffsB
ldmia r1, {r6,r7,r8,r9,r11} @ r6 := p->XcoeffsB[0]
@@ -342,16 +337,18 @@ loop:
@ r9 := p->XcoeffsB[3]
@ r11 := p->XcoeffsB[4]
+ subs r5, r10, r5 @ r5 := r10 - r5
+
+ add r1, r14, #XDELAYB-4 @ r1 := &p->buf[XDELAYB-1]
+ stmia r1, { r5, r10 } @ p->buf[XDELAYB-1] = r5
+ @ p->buf[XDELAYB] = r10
+
mul r1, r10, r6 @ r1 := p->buf[XDELAYB] * p->XcoeffsB[0]
mla r1, r5, r7, r1 @ r1 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
mla r1, r4, r8, r1 @ r1 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
mla r1, r3, r9, r1 @ r1 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
mla r1, r2, r11, r1 @ r1 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
- add r2, r14, #XDELAYB-4 @ r2 := &p->buf[XDELAYB-1]
- stmia r2, { r5, r10 } @ p->buf[XDELAYB-1] = r5
- @ p->buf[XDELAYB] = r10
-
@ flags were set above, in the subs instruction
mvngt r5, #0
movlt r5, #1 @ r5 := SIGN(r5) (see .c for SIGN macro)
@@ -371,12 +368,11 @@ loop:
ldr r2, [sp, #4] @ r2 := decoded1
add r0, r0, r1, asr #1 @ r0 := r0 + (r1 >> 1)
+ ldr r4, [r12, #XfilterA] @ r4 := p->XfilterA
ldr r3, [r2] @ r3 := *decoded1
+ rsb r4, r4, r4, lsl #5 @ r4 := r4 * 32 - r4 ( == r4*31)
add r1, r3, r0, asr #10 @ r1 := r3 + (r0 >> 10)
str r1, [r12, #XlastA] @ p->XlastA := r1
-
- ldr r4, [r12, #XfilterA] @ r4 := p->XfilterA
- rsb r4, r4, r4, lsl #5 @ r4 := r4 * 32 - r4 ( == r4*31)
add r1, r1, r4, asr #5 @ r1 := r1 + (r4 >> 5)
str r1, [r12, #XfilterA] @ p->XfilterA := r1
@@ -387,10 +383,10 @@ loop:
@ r6, r7, r8, r9, r11 contain p->XcoeffsB[0..4]
@ r5, r10 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
- cmp r3, #0
- stmia r2!, {r1} @ *(decoded1++) := r1 (p->XfilterA)
+ str r1, [r2], #4 @ *(decoded1++) := r1 (p->XfilterA)
str r2, [sp, #4] @ save decoded1
- beq 2f
+ cmp r3, #0
+ beq 3f
add r1, r14, #XADAPTCOEFFSB-16
ldmia r1, { r2, r3, r4 } @ r2 := p->buf[XADAPTCOEFFSB-4]
@@ -402,9 +398,9 @@ loop:
sub r6, r6, r10 @ r6 := p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
sub r7, r7, r5 @ r7 := p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
- sub r8, r8, r4 @ r8 := p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
+ sub r11, r11, r2 @ r11 := p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
sub r9, r9, r3 @ r9 := p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
- sub r11, r11, r2 @ r11 := p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
+ sub r8, r8, r4 @ r8 := p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
add r0, r12, #XcoeffsB
stmia r0, {r6,r7,r8,r9,r11} @ Save p->XcoeffsB[]
@@ -421,12 +417,11 @@ loop:
@ r8 := p->buf[XADAPTCOEFFSA-1]
@ r9 := p->buf[XADAPTCOEFFSA]
- sub r2, r2, r9 @ r2 := p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
- sub r3, r3, r8 @ r3 := p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
- sub r4, r4, r7 @ r4 := p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
sub r5, r5, r6 @ r5 := p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
+ sub r4, r4, r7 @ r4 := p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
+ sub r3, r3, r8 @ r3 := p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
+ sub r2, r2, r9 @ r2 := p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
- stmia r1, {r2-r5} @ Save p->XcoeffsA
b 2f
@@ -434,9 +429,9 @@ loop:
add r6, r6, r10 @ r6 := p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
add r7, r7, r5 @ r7 := p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
- add r8, r8, r4 @ r8 := p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
- add r9, r9, r3 @ r9 := p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
add r11, r11, r2 @ r11 := p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
+ add r9, r9, r3 @ r9 := p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
+ add r8, r8, r4 @ r8 := p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
add r0, r12, #XcoeffsB
stmia r0, {r6,r7,r8,r9,r11} @ Save p->XcoeffsB[]
@@ -453,14 +448,15 @@ loop:
@ r8 := p->buf[XADAPTCOEFFSA-1]
@ r9 := p->buf[XADAPTCOEFFSA]
- add r2, r2, r9 @ r2 := p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
- add r3, r3, r8 @ r3 := p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
- add r4, r4, r7 @ r4 := p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
add r5, r5, r6 @ r5 := p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
+ add r4, r4, r7 @ r4 := p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
+ add r3, r3, r8 @ r3 := p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
+ add r2, r2, r9 @ r2 := p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
+2:
stmia r1, {r2-r5} @ Save p->XcoeffsA
-2:
+3:
@@@@@@@@@@@@@@@@@@@@@@@@@@@ COMMON
@@ -471,11 +467,11 @@ loop:
sub r10, r14, #PREDICTOR_HISTORY_SIZE*4
@ r10 := p->buf - PREDICTOR_HISTORY_SIZE
+ ldr r0, [sp, #8]
cmp r10, r11
beq move_hist @ The history buffer is full, we need to do a memmove
@ Check loop count
- ldr r0, [sp, #8]
subs r0, r0, #1
strne r0, [sp, #8]
bne loop
@@ -501,10 +497,10 @@ move_hist:
ldmia r14!, {r0-r9} @ 40 bytes
stmia r11!, {r0-r9}
+ ldr r0, [sp, #8]
add r14, r12, #historybuffer @ p->buf = &p->historybuffer[0]
@ Check loop count
- ldr r0, [sp, #8]
subs r0, r0, #1
strne r0, [sp, #8]
bne loop