author     Thom Johansen <thomj@rockbox.org>  2005-11-05 17:54:37 +0000
committer  Thom Johansen <thomj@rockbox.org>  2005-11-05 17:54:37 +0000
commit     97a21a3b36aa74d133af5bf5411cbf6d576f8a86 (patch)
tree       85e235134730fdb1c8bc3bcc6f5e5f1b70d12c6a
parent     63fbc0729f66ad55413579da4cb93b9ea51db223 (diff)
Unrolled loops up to order 10, plus a slight optimisation of the default case.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7759 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--  apps/codecs/libffmpegFLAC/coldfire.S  212
1 file changed, 132 insertions, 80 deletions
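
For orientation, here is a minimal C sketch of the LPC synthesis that lpc_decode_emac performs (the function name, prototype, and 32-bit accumulator are assumptions for illustration, not the exact libffmpegFLAC interface): each output sample adds a prediction, the dot product of pred_order coefficients with the preceding samples shifted right by qlevel, to the residual already stored in the buffer. The assembly below fully unrolls this inner dot product for orders 1 through 10 and keeps a generic path for anything longer.

#include <stdint.h>

/* Hypothetical C analogue: data points at the first residual to be
   converted, and the pred_order samples immediately before it are the
   already-decoded history (the asm rewinds %a0 by pred_order samples to
   reach the same memory).  EMAC precision is simplified to 32 bits. */
static void lpc_decode_ref(int blocksize, int qlevel, int pred_order,
                           int32_t *data, const int32_t *coeffs)
{
    for (int i = 0; i < blocksize; i++) {
        int32_t sum = 0;
        /* coeffs[0] weights the newest history sample, matching the
           register order used by the unrolled loops. */
        for (int j = 0; j < pred_order; j++)
            sum += coeffs[j] * data[i - 1 - j];
        data[i] += sum >> qlevel;   /* arithmetic shift, as asr.l does */
    }
}

Each unrolled .orderN block in the diff is this loop with the inner dot product flattened into a chain of mac.l instructions, one coefficient held per register.
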
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
index 33964cdbc1..1d144ecc76 100644
--- a/apps/codecs/libffmpegFLAC/coldfire.S
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -20,16 +20,16 @@
/* The following is an assembler optimised version of the LPC filtering
routines needed for FLAC decoding. It is optimised for use with the
MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
- All LPC filtering up to order 8 is done in specially optimised unrolled
+ All LPC filtering up to order 10 is done in specially optimised unrolled
loops, while every order above this is handled by a slower default routine.
*/
.section .icode,"ax",@progbits
.global lpc_decode_emac
.align 2
lpc_decode_emac:
- lea.l (-40, %sp), %sp
- movem.l %d2-%d7/%a2-%a5, (%sp)
- movem.l (40+4, %sp), %d0-%d2/%a0-%a1
+ lea.l (-44, %sp), %sp
+ movem.l %d2-%d7/%a2-%a6, (%sp)
+ movem.l (44+4, %sp), %d0-%d2/%a0-%a1
/* d0 = blocksize, d1 = qlevel, d2 = pred_order
a0 = data, a1 = coeffs
*/
@@ -39,17 +39,17 @@ lpc_decode_emac:
and free a register by not saving data pointer.
*/
move.l %d2, %d3
- neg.l %d3
+ neg.l %d3
lea.l (%a0, %d3.l*4), %a0 | history
clr.l %d3
move.l %d3, %macsr | we'll need integer mode for this
tst.l %d0
jeq .exit | zero samples to process, exit
- moveq.l #8, %d3
+ moveq.l #10, %d3
cmp.l %d3, %d2
- jgt .default | order is over 8, jump to default case
+ jgt .default | order is over 10, jump to default case
jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order
-.jumptable:
+| jumptable:
bra.w .exit | zero order filter isn't possible, exit function
bra.w .order1
bra.w .order2
@@ -58,39 +58,84 @@ lpc_decode_emac:
bra.w .order5
bra.w .order6
bra.w .order7
+ bra.w .order8
+ bra.w .order9
| last jump table entry coincides with target, so leave it out
-.order8:
- movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs
- move.l (%a0)+, %a5 | load first history sample
-.loop8:
- mac.l %a5, %a4, (%a0)+, %a5, %acc0
- mac.l %a5, %a3, (%a0)+, %a5, %acc0
- mac.l %a5, %a2, (%a0)+, %a5, %acc0
- mac.l %a5, %d7, (%a0)+, %a5, %acc0
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+.order10:
+ movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
+ move.l (%a0)+, %a6 | load first history sample
+.loop10:
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (%a0)+, %a6, %acc0
+ mac.l %a6, %d7, (%a0)+, %a6, %acc0
+ mac.l %a6, %d6, (%a0)+, %a6, %acc0
+ mac.l %a6, %d5, (%a0)+, %a6, %acc0
+ mac.l %a6, %d4, (%a0)+, %a6, %acc0
+ mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
movclr.l %acc0, %d2 | get sum
- asr.l %d1, %d2 | shift sum by lp_quantization bits
+ asr.l %d1, %d2 | shift sum by qlevel bits
add.l %d2, (%a0) | add residual and save
- lea.l (-6*4, %a0), %a0 | point history back at second element
- subq.l #1, %d0 | decrement counter
- jne .loop8 | are we done?
+ lea.l (-8*4, %a0), %a0 | point history back at second element
+ subq.l #1, %d0 | decrement sample count
+ jne .loop10 | are we done?
+ jra .exit
+
+.order9:
+ movem.l (%a1), %d4-%d7/%a1-%a5
+ move.l (%a0)+, %a6
+.loop9:
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (%a0)+, %a6, %acc0
+ mac.l %a6, %d7, (%a0)+, %a6, %acc0
+ mac.l %a6, %d6, (%a0)+, %a6, %acc0
+ mac.l %a6, %d5, (%a0)+, %a6, %acc0
+ mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0
+ movclr.l %acc0, %d2
+ asr.l %d1, %d2
+ add.l %d2, (%a0)
+ lea.l (-7*4, %a0), %a0
+ subq.l #1, %d0
+ jne .loop9
+ jra .exit
+
+.order8:
+ movem.l (%a1), %d5-%d7/%a1-%a5
+ move.l (%a0)+, %a6
+.loop8:
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (%a0)+, %a6, %acc0
+ mac.l %a6, %d7, (%a0)+, %a6, %acc0
+ mac.l %a6, %d6, (%a0)+, %a6, %acc0
+ mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0
+ movclr.l %acc0, %d2
+ asr.l %d1, %d2
+ add.l %d2, (%a0)
+ lea.l (-6*4, %a0), %a0
+ subq.l #1, %d0
+ jne .loop8
jra .exit
.order7:
- movem.l (%a1), %d3-%d7/%a2-%a3
- move.l (%a0)+, %a5
+ movem.l (%a1), %d6-%d7/%a1-%a5
+ move.l (%a0)+, %a6
.loop7:
- mac.l %a5, %a3, (%a0)+, %a5, %acc0
- mac.l %a5, %a2, (%a0)+, %a5, %acc0
- mac.l %a5, %d7, (%a0)+, %a5, %acc0
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (%a0)+, %a6, %acc0
+ mac.l %a6, %d7, (%a0)+, %a6, %acc0
+ mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -100,15 +145,15 @@ lpc_decode_emac:
jra .exit
.order6:
- movem.l (%a1), %d3-%d7/%a2
- move.l (%a0)+, %a5
+ movem.l (%a1), %d7/%a1-%a5
+ move.l (%a0)+, %a6
.loop6:
- mac.l %a5, %a2, (%a0)+, %a5, %acc0
- mac.l %a5, %d7, (%a0)+, %a5, %acc0
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (%a0)+, %a6, %acc0
+ mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -118,14 +163,14 @@ lpc_decode_emac:
jra .exit
.order5:
- movem.l (%a1), %d3-%d7
- move.l (%a0)+, %a5
+ movem.l (%a1), %a1-%a5
+ move.l (%a0)+, %a6
.loop5:
- mac.l %a5, %d7, (%a0)+, %a5, %acc0
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (%a0)+, %a6, %acc0
+ mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -135,13 +180,13 @@ lpc_decode_emac:
jra .exit
.order4:
- movem.l (%a1), %d3-%d6
- move.l (%a0)+, %a5
+ movem.l (%a1), %a2-%a5
+ move.l (%a0)+, %a6
.loop4:
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (%a0)+, %a6, %acc0
+ mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -151,12 +196,12 @@ lpc_decode_emac:
jra .exit
.order3:
- movem.l (%a1), %d3-%d5
- move.l (%a0)+, %a5
+ movem.l (%a1), %a3-%a5
+ move.l (%a0)+, %a6
.loop3:
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, (%a0)+, %a6, %acc0
+ mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -166,11 +211,11 @@ lpc_decode_emac:
jra .exit
.order2:
- movem.l (%a1), %d3-%d4
- move.l (%a0)+, %a5
+ movem.l (%a1), %a4-%a5
+ move.l (%a0)+, %a6
.loop2:
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, %acc0 | data for next iteration is already loaded
+ mac.l %a6, %a5, (%a0)+, %a6, %acc0
+ mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -180,9 +225,9 @@ lpc_decode_emac:
.order1:
| no point in using mac here
- move.l (%a1), %d3
+ move.l (%a1), %a5
.loop1:
- move.l %d3, %d2
+ move.l %a5, %d2
muls.l (%a0)+, %d2
asr.l %d1, %d2
add.l %d2, (%a0)
@@ -192,8 +237,7 @@ lpc_decode_emac:
.default:
/* we do the filtering in an unrolled by 4 loop as far as we can, and then
- do the rest in an ordinary one by one sample loop.
- */
+ do the rest by jump table. */
lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
move.l %a0, %a3 | working copy of history pointer
move.l %d2, %d3
@@ -209,26 +253,34 @@ lpc_decode_emac:
subq.l #1, %d3 | any more unrolled loop operations left?
jne .dloop1
- move.l %d2, %d3
- moveq.l #3, %d4 | mask 0x00000003
- and.l %d4, %d3 | get the remaining samples to be filtered
- jeq .dsave | no remaining samples
-.dloop2:
- move.l -(%a2), %d4 | get lpc coef
+ moveq.l #3, %d3 | mask 0x00000003
+ and.l %d2, %d3 | get the remaining samples to be filtered
+ jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
+| jumptable:
+ bra.b .dsave
+ bra.b .oneleft
+ bra.b .twoleft
+| implicit .threeleft
+ move.l -(%a2), %d4
+ mac.l %a5, %d4, (%a3)+, %a5, %acc0
+.twoleft:
+ move.l -(%a2), %d4
mac.l %a5, %d4, (%a3)+, %a5, %acc0
- subq.l #1, %d3 | any more iterations left?
- jne .dloop2
+.oneleft:
+ move.l -(%a2), %d4
+ mac.l %a5, %d4, (%a3)+, %a5, %acc0 | need this fetch to not break line below
+
.dsave:
- movclr.l %acc0, %d3 | get result
- asr.l %d1, %d3 | shift lp_quantization bits right
subq.l #4, %a3 | we're one past the save location
+ movclr.l %acc0, %d3 | get result
+ asr.l %d1, %d3 | shift qlevel bits right
add.l %d3, (%a3) | add residual and save
addq.l #4, %a0 | increment history pointer
- subq.l #1, %d0 | decrement data_len
+ subq.l #1, %d0 | decrement sample count
jne .default | are we done?
| if so, fall through to exit
.exit:
- movem.l (%sp), %d2-%d7/%a2-%a5
- lea.l (40, %sp), %sp
+ movem.l (%sp), %d2-%d7/%a2-%a6
+ lea.l (44, %sp), %sp
rts
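
For orders above 10, the reworked .default path computes each sample's dot product with an unrolled-by-4 loop and then, instead of the old counted one-tap-at-a-time loop, jumps directly into a chain of mac.l instructions for the 0-3 leftover taps. A hedged C analogue of that per-sample inner product, with a fall-through switch standing in for the computed jump (function and variable names are invented for this sketch):

#include <stdint.h>

/* Generic-order dot product: c walks backwards from one past the last
   coefficient, h walks forwards from the oldest history sample. */
static int32_t lpc_dot_generic(const int32_t *coeffs, const int32_t *hist,
                               int order)
{
    const int32_t *c = coeffs + order;
    const int32_t *h = hist;
    int32_t sum = 0;

    for (int n = order >> 2; n > 0; n--) {  /* unrolled-by-4 main loop */
        sum += *--c * *h++;
        sum += *--c * *h++;
        sum += *--c * *h++;
        sum += *--c * *h++;
    }
    switch (order & 3) {          /* remaining taps fall through, like */
    case 3: sum += *--c * *h++;   /* the mac.l chain after .dloop1     */
    case 2: sum += *--c * *h++;
    case 1: sum += *--c * *h++;
    }
    return sum;
}

The caller would then add sum >> qlevel to the current sample and step the history forward by one element, which is what the .dsave block does before looping back to .default for the next sample.
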