summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTomasz Malesinski <tomal@rockbox.org>2007-09-27 21:58:51 +0000
committerTomasz Malesinski <tomal@rockbox.org>2007-09-27 21:58:51 +0000
commitc13eba29ff5615cc74a7818e42cc9d464a7c7075 (patch)
treeeef1dfc0d4ed2b69e16b119b0d47052801ef827f
parent1aaf5dbdb660d29ef384674f25c916f23da505bb (diff)
downloadrockbox-c13eba29ff5615cc74a7818e42cc9d464a7c7075.tar.gz
rockbox-c13eba29ff5615cc74a7818e42cc9d464a7c7075.zip
FS #7833: Optimizations to the Vorbis codec:
- ARM assembly version of parts of mdct,
- special case for vorbis_book_decodevv_add for 2 channels and even book->dim,
- store the output in vb->pcm if possible, as it is usually in IRAM as opposed to v->pcm.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14875 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/Tremor/SOURCES3
-rw-r--r--apps/codecs/Tremor/block.c20
-rw-r--r--apps/codecs/Tremor/codebook.c54
-rw-r--r--apps/codecs/Tremor/ivorbiscodec.h1
-rw-r--r--apps/codecs/Tremor/mdct.c15
-rw-r--r--apps/codecs/Tremor/mdct_arm.S419
6 files changed, 502 insertions, 10 deletions
diff --git a/apps/codecs/Tremor/SOURCES b/apps/codecs/Tremor/SOURCES
index 0877941808..9b8c05e340 100644
--- a/apps/codecs/Tremor/SOURCES
+++ b/apps/codecs/Tremor/SOURCES
@@ -7,6 +7,9 @@ framing.c
info.c
mapping0.c
mdct.c
+#ifdef CPU_ARM
+mdct_arm.S
+#endif
registry.c
res012.c
sharedbook.c
diff --git a/apps/codecs/Tremor/block.c b/apps/codecs/Tremor/block.c
index 80cbb7809c..e609fc44f7 100644
--- a/apps/codecs/Tremor/block.c
+++ b/apps/codecs/Tremor/block.c
@@ -171,6 +171,7 @@ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){
v->pcm_storage=ci->blocksizes[1];
v->pcm=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcm));
+ v->pcmb=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmb));
v->pcmret=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmret));
for(i=0;i<vi->channels;i++)
@@ -308,25 +309,28 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){
/* large/large */
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
ogg_int32_t *p=vb->pcm[j];
- vect_add(pcm, p, n1);
+ vect_add(p, pcm, n1);
+ v->pcmb[j]=p;
}else{
/* large/small */
ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
ogg_int32_t *p=vb->pcm[j];
vect_add(pcm, p, n0);
+ v->pcmb[j]=v->pcm[j]+prevCenter;
}
}else{
if(v->W){
/* small/large */
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2;
- vect_add(pcm, p, n0);
- vect_copy(&pcm[n0], &p[n0], n1/2-n0/2);
+ vect_add(p, pcm, n0);
+ v->pcmb[j]=p;
}else{
/* small/small */
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
ogg_int32_t *p=vb->pcm[j];
- vect_add(pcm, p, n0);
+ vect_add(p, pcm, n0);
+ v->pcmb[j]=p;
}
}
@@ -351,10 +355,8 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){
v->pcm_returned=thisCenter;
v->pcm_current=thisCenter;
}else{
- v->pcm_returned=prevCenter;
- v->pcm_current=prevCenter+
- ci->blocksizes[v->lW]/4+
- ci->blocksizes[v->W]/4;
+ v->pcm_returned=0;
+ v->pcm_current=ci->blocksizes[v->lW]/4+ci->blocksizes[v->W]/4;
}
}
@@ -436,7 +438,7 @@ int vorbis_synthesis_pcmout(vorbis_dsp_state *v,ogg_int32_t ***pcm){
if(pcm){
int i;
for(i=0;i<vi->channels;i++)
- v->pcmret[i]=v->pcm[i]+v->pcm_returned;
+ v->pcmret[i]=v->pcmb[i]+v->pcm_returned;
*pcm=v->pcmret;
}
return(v->pcm_current-v->pcm_returned);
diff --git a/apps/codecs/Tremor/codebook.c b/apps/codecs/Tremor/codebook.c
index 1287a95011..8c319ab49e 100644
--- a/apps/codecs/Tremor/codebook.c
+++ b/apps/codecs/Tremor/codebook.c
@@ -199,7 +199,7 @@ STIN long decode_packed_entry_number(codebook *book,
return(-1);
}
-static inline long decode_packed_block(codebook *book, oggpack_buffer *b,
+static long decode_packed_block(codebook *book, oggpack_buffer *b,
long *buf, int n){
long *bufptr = buf;
long *bufend = buf + n;
@@ -399,6 +399,55 @@ long vorbis_book_decodev_set(codebook *book,ogg_int32_t *a,
return(0);
}
+static long vorbis_book_decodevv_add_2ch_even(codebook *book,ogg_int32_t **a, /* decodes entries from b and adds their values */
+ long offset,oggpack_buffer *b, /* alternately into a[0] and a[1], starting at index offset */
+ int n,int point){ /* returns 0, or -1 when fewer entries than requested were decoded */
+ long i,k,chunk,read;
+ int shift=point-book->binarypoint; /* >=0: values are shifted right; <0: shifted left */
+ long entries[32]; /* entries are decoded in batches of at most 32 */
+ ogg_int32_t *p0 = &(a[0][offset]); /* write cursor for channel 0 */
+ ogg_int32_t *p1 = &(a[1][offset]); /* write cursor for channel 1 */
+ /* i counts positions filled per channel; one entry supplies book->dim values, i.e. dim/2 per channel */
+ if(shift>=0){
+ /* right-shift variant */
+ for(i=0;i<n;){
+ chunk=32;
+ if (chunk*book->dim>(n-i)*2) /* a full batch would exceed the (n-i)*2 values still wanted */
+ chunk=((n-i)*2+book->dim-1)/book->dim; /* so request ceil(remaining values / dim) entries */
+ read = decode_packed_block(book,b,entries,chunk);
+ for(k=0;k<read;k++){
+ const ogg_int32_t *t = book->valuelist+entries[k]*book->dim; /* first value of entry k */
+ const ogg_int32_t *u = t+book->dim; /* one past its last value */
+ do{ /* two values per pass: relies on book->dim being even */
+ *p0++ += *t++>>shift;
+ *p1++ += *t++>>shift;
+ }while(t<u);
+ }
+ if (read<chunk)return-1; /* short read: the entries obtained were still applied above */
+ i += read*book->dim/2;
+ }
+ }else{
+ shift = -shift; /* make the count positive for the left shifts below */
+ for(i=0;i<n;){
+ chunk=32;
+ if (chunk*book->dim>(n-i)*2) /* same batch sizing as in the branch above */
+ chunk=((n-i)*2+book->dim-1)/book->dim;
+ read = decode_packed_block(book,b,entries,chunk);
+ for(k=0;k<read;k++){
+ const ogg_int32_t *t = book->valuelist+entries[k]*book->dim;
+ const ogg_int32_t *u = t+book->dim;
+ do{
+ *p0++ += *t++<<shift;
+ *p1++ += *t++<<shift;
+ }while(t<u);
+ }
+ if (read<chunk)return-1;
+ i += read*book->dim/2;
+ }
+ }
+ return(0);
+}
+
long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a,
long offset,int ch,
oggpack_buffer *b,int n,int point){
@@ -408,6 +457,9 @@ long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a,
int shift=point-book->binarypoint;
long entries[32];
+ if (!(book->dim&1) && ch==2)
+ return vorbis_book_decodevv_add_2ch_even(book,a,offset,b,n,point);
+
if(shift>=0){
for(i=offset;i<offset+n;){
diff --git a/apps/codecs/Tremor/ivorbiscodec.h b/apps/codecs/Tremor/ivorbiscodec.h
index b3e63226ee..2574a11f2a 100644
--- a/apps/codecs/Tremor/ivorbiscodec.h
+++ b/apps/codecs/Tremor/ivorbiscodec.h
@@ -59,6 +59,7 @@ typedef struct vorbis_dsp_state{
vorbis_info *vi;
ogg_int32_t **pcm;
+ ogg_int32_t **pcmb;
ogg_int32_t **pcmret;
int pcm_storage;
int pcm_current;
diff --git a/apps/codecs/Tremor/mdct.c b/apps/codecs/Tremor/mdct.c
index 8334cdf3c4..20abdb47f4 100644
--- a/apps/codecs/Tremor/mdct.c
+++ b/apps/codecs/Tremor/mdct.c
@@ -38,6 +38,19 @@
#include "mdct.h"
#include "mdct_lookup.h"
+#ifdef CPU_ARM
+
+extern void mdct_butterfly_32(DATA_TYPE *x); /* defined in mdct_arm.S */
+extern void mdct_butterfly_generic_loop(DATA_TYPE *x1, DATA_TYPE *x2, /* defined in mdct_arm.S */
+ LOOKUP_T *T0, int step,
+ LOOKUP_T *Ttop);
+/* ARM build: the whole butterfly loop runs in assembly; this wrapper only sets up its pointer arguments. */
+STIN void mdct_butterfly_generic(DATA_TYPE *x,int points, int step){
+ mdct_butterfly_generic_loop(x + points, x + (points>>1), /* x1 = x + points, x2 = x + points/2 */
+ sincos_lookup0, step, sincos_lookup0+1024); /* T0 and Ttop bound the table walk */
+}
+
+#else
/* 8 point butterfly (in place) */
STIN void mdct_butterfly_8(DATA_TYPE *x){
@@ -225,6 +238,8 @@ void mdct_butterfly_generic(DATA_TYPE *x,int points, int step){
}while(T>sincos_lookup0);
}
+#endif /* CPU_ARM */
+
STIN void mdct_butterflies(DATA_TYPE *x,int points,int shift) {
int stages=8-shift;
diff --git a/apps/codecs/Tremor/mdct_arm.S b/apps/codecs/Tremor/mdct_arm.S
new file mode 100644
index 0000000000..495e6a17c9
--- /dev/null
+++ b/apps/codecs/Tremor/mdct_arm.S
@@ -0,0 +1,419 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id: $
+ *
+ * Copyright (C) 2007 by Tomasz Malesinski
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#define cPI3_8 (0x30fbc54d)
+#define cPI2_8 (0x5a82799a)
+#define cPI1_8 (0x7641af3d)
+
+ .section .icode,"ax",%progbits
+ .align
+
+ .global mdct_butterfly_32
+ .global mdct_butterfly_generic_loop
+
+mdct_butterfly_8: @ file-local helper (not .global): 8-point butterfly on values already held in registers
+ add r9, r5, r1 @ x4 + x0 (inputs: r1-r4 = x0..x3; r5, r6, r10, r11 = x4..x7)
+ sub r5, r5, r1 @ x4 - x0
+ add r7, r6, r2 @ x5 + x1
+ sub r6, r6, r2 @ x5 - x1
+ add r8, r10, r3 @ x6 + x2
+ sub r10, r10, r3 @ x6 - x2
+ add r12, r11, r4 @ x7 + x3
+ sub r11, r11, r4 @ x7 - x3
+ @ second stage: combine the sums (r9, r7, r8, r12) and the differences (r5, r6, r10, r11)
+ add r1, r10, r6 @ y0 = (x6 - x2) + (x5 - x1)
+ sub r2, r11, r5 @ y1 = (x7 - x3) - (x4 - x0)
+ sub r3, r10, r6 @ y2 = (x6 - x2) - (x5 - x1)
+ add r4, r11, r5 @ y3 = (x7 - x3) + (x4 - x0)
+ sub r5, r8, r9 @ y4 = (x6 + x2) - (x4 + x0)
+ sub r6, r12, r7 @ y5 = (x7 + x3) - (x5 + x1)
+ add r10, r8, r9 @ y6 = (x6 + x2) + (x4 + x0)
+ add r11, r12, r7 @ y7 = (x7 + x3) + (x5 + x1)
+ stmia r0, {r1, r2, r3, r4, r5, r6, r10, r11} @ store y0..y7 to the 8 words at r0 (no writeback)
+ @ returns with r1-r12 overwritten; nothing is saved or restored here
+ mov pc, lr
+
+mdct_butterfly_16: @ file-local helper: in-place butterfly over the 16 words at r0 (x)
+ str lr, [sp, #-4]! @ push lr, since mdct_butterfly_8 is called with bl below
+ add r1, r0, #8*4 @ r1 = &x[8]
+ @ r2-r5 = x0..x3, r6-r9 = x8..x11
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y8 = x8 + x0
+ rsb r2, r6, r2, asl #1 @ x0 - x8 (computed as 2*x0 - y8)
+ add r7, r7, r3 @ y9 = x9 + x1
+ rsb r3, r7, r3, asl #1 @ x1 - x9
+ add r8, r8, r4 @ y10 = x10 + x2
+ sub r11, r8, r4, asl #1 @ x10 - x2
+ add r9, r9, r5 @ y11 = x11 + x3
+ rsb r10, r9, r5, asl #1 @ x3 - x11
+ @ sums go to x[8..11]; writeback leaves r1 = &x[12]
+ stmia r1!, {r6, r7, r8, r9}
+
+ add r2, r2, r3 @ (x0 - x8) + (x1 - x9)
+ rsb r3, r2, r3, asl #1 @ (x1 - x9) - (x0 - x8)
+ @ scale both by cPI2_8: keep the high word of the 64-bit product, shifted left by 1
+ ldr r12, =cPI2_8
+ smull r8, r5, r2, r12
+ mov r5, r5, asl #1
+ smull r8, r6, r3, r12
+ mov r6, r6, asl #1
+ @ x[0..3] = the two scaled values, x3 - x11, x10 - x2; writeback leaves r0 = &x[4]
+ stmia r0!, {r5, r6, r10, r11}
+ @ r2-r5 = x4..x7, r6-r9 = x12..x15
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y12 = x12 + x4
+ sub r2, r6, r2, asl #1 @ x12 - x4
+ add r7, r7, r3 @ y13 = x13 + x5
+ sub r3, r7, r3, asl #1 @ x13 - x5
+ add r8, r8, r4 @ y14 = x14 + x6
+ sub r10, r8, r4, asl #1 @ x14 - x6
+ add r9, r9, r5 @ y15 = x15 + x7
+ sub r11, r9, r5, asl #1 @ x15 - x7
+ @ sums go to x[12..15] (no writeback this time)
+ stmia r1, {r6, r7, r8, r9}
+
+ sub r2, r2, r3 @ (x12 - x4) - (x13 - x5)
+ add r3, r2, r3, asl #1 @ (x12 - x4) + (x13 - x5)
+ @ scale by cPI2_8 as above (r12 still holds the constant)
+ smull r8, r5, r2, r12
+ mov r5, r5, asl #1
+ smull r8, r6, r3, r12
+ mov r6, r6, asl #1
+ @ no stmia here, r5, r6, r10, r11 are passed to mdct_butterfly_8
+
+ sub r0, r0, #4*4 @ r0 = &x[0]
+ ldmia r0, {r1, r2, r3, r4} @ x[0..3] stored above become x0..x3 of the first call
+ bl mdct_butterfly_8
+ add r0, r0, #8*4 @ r0 = &x[8]
+ ldmia r0, {r1, r2, r3, r4, r5, r6, r10, r11} @ x[8..15] are the inputs of the second call
+ bl mdct_butterfly_8
+ @ pop the saved lr straight into pc
+ ldr pc, [sp], #4
+
+mdct_butterfly_32: @ exported (.global): in-place butterfly over the 32 words at r0 (x)
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+ @ r0 walks x[0..15] and r1 walks x[16..31], four words per group
+ add r1, r0, #16*4
+ @ group 1: r2-r5 = x0..x3, r6-r9 = x16..x19
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y16 = x16 + x0
+ rsb r2, r6, r2, asl #1 @ x0 - x16
+ add r7, r7, r3 @ y17 = x17 + x1
+ rsb r3, r7, r3, asl #1 @ x1 - x17
+ add r8, r8, r4 @ y18 = x18 + x2
+ rsb r4, r8, r4, asl #1 @ x2 - x18
+ add r9, r9, r5 @ y19 = x19 + x3
+ rsb r5, r9, r5, asl #1 @ x3 - x19
+ @ sums go to x[16..19]; writeback leaves r1 = &x[20]
+ stmia r1!, {r6, r7, r8, r9}
+ @ combine the first two differences with cPI1_8/cPI3_8; results are the high word << 1
+ ldr r12, =cPI1_8
+ ldr lr, =cPI3_8
+ smull r10, r6, r2, r12
+ smlal r10, r6, r3, lr @ (x0 - x16)*cPI1_8 + (x1 - x17)*cPI3_8
+ rsb r2, r2, #0
+ smull r10, r7, r3, r12
+ smlal r10, r7, r2, lr @ (x1 - x17)*cPI1_8 - (x0 - x16)*cPI3_8
+ mov r6, r6, asl #1
+ mov r7, r7, asl #1
+
+ add r4, r4, r5 @ (x3 - x19) + (x2 - x18)
+ rsb r5, r4, r5, asl #1 @ (x3 - x19) - (x2 - x18)
+ @ scale both by cPI2_8 (high word << 1)
+ ldr r11, =cPI2_8
+ smull r10, r8, r4, r11
+ mov r8, r8, asl #1
+ smull r10, r9, r5, r11
+ mov r9, r9, asl #1
+ @ results go to x[0..3]; writeback leaves r0 = &x[4]
+ stmia r0!, {r6, r7, r8, r9}
+ @ group 2: r2-r5 = x4..x7, r6-r9 = x20..x23
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y20 = x20 + x4
+ rsb r2, r6, r2, asl #1 @ x4 - x20
+ add r7, r7, r3 @ y21 = x21 + x5
+ rsb r3, r7, r3, asl #1 @ x5 - x21
+ add r8, r8, r4 @ y22 = x22 + x6
+ sub r4, r8, r4, asl #1 @ x22 - x6
+ add r9, r9, r5 @ y23 = x23 + x7
+ rsb r5, r9, r5, asl #1 @ x7 - x23
+ @ sums go to x[20..23]; r1 = &x[24] afterwards
+ stmia r1!, {r6, r7, r8, r9}
+ @ same combination as group 1 with the two constants swapped
+ smull r10, r6, r2, lr
+ smlal r10, r6, r3, r12 @ (x4 - x20)*cPI3_8 + (x5 - x21)*cPI1_8
+ rsb r2, r2, #0
+ smull r10, r7, r3, lr
+ smlal r10, r7, r2, r12 @ (x5 - x21)*cPI3_8 - (x4 - x20)*cPI1_8
+ mov r6, r6, asl #1
+ mov r7, r7, asl #1
+ @ the last two differences are stored unscaled, in swapped order
+ mov r8, r5
+ mov r9, r4
+ stmia r0!, {r6, r7, r8, r9}
+ @ group 3: r2-r5 = x8..x11, r6-r9 = x24..x27
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y24 = x24 + x8
+ sub r2, r6, r2, asl #1 @ x24 - x8
+ add r7, r7, r3 @ y25 = x25 + x9
+ sub r3, r7, r3, asl #1 @ x25 - x9
+ add r8, r8, r4 @ y26 = x26 + x10
+ sub r4, r8, r4, asl #1 @ x26 - x10
+ add r9, r9, r5 @ y27 = x27 + x11
+ sub r5, r9, r5, asl #1 @ x27 - x11
+ @ sums go to x[24..27]; r1 = &x[28] afterwards
+ stmia r1!, {r6, r7, r8, r9}
+ @ note the destination registers: r7 is computed first, then r6
+ smull r10, r7, r2, r12
+ smlal r10, r7, r3, lr @ (x24 - x8)*cPI1_8 + (x25 - x9)*cPI3_8
+ rsb r3, r3, #0
+ smull r10, r6, r3, r12
+ smlal r10, r6, r2, lr @ (x24 - x8)*cPI3_8 - (x25 - x9)*cPI1_8
+ mov r6, r6, asl #1
+ mov r7, r7, asl #1
+
+ sub r4, r4, r5 @ (x26 - x10) - (x27 - x11)
+ add r5, r4, r5, asl #1 @ (x26 - x10) + (x27 - x11)
+ @ scale both by cPI2_8 (high word << 1)
+ ldr r11, =cPI2_8
+ smull r10, r8, r4, r11
+ mov r8, r8, asl #1
+ smull r10, r9, r5, r11
+ mov r9, r9, asl #1
+ @ results go to x[8..11]; r0 = &x[12] afterwards
+ stmia r0!, {r6, r7, r8, r9}
+ @ group 4: r2-r5 = x12..x15, r6-r9 = x28..x31
+ ldmia r0, {r2, r3, r4, r5}
+ ldmia r1, {r6, r7, r8, r9}
+ add r6, r6, r2 @ y28 = x28 + x12
+ sub r2, r6, r2, asl #1 @ x28 - x12
+ add r7, r7, r3 @ y29 = x29 + x13
+ sub r3, r7, r3, asl #1 @ x29 - x13
+ add r8, r8, r4 @ y30 = x30 + x14
+ sub r4, r8, r4, asl #1 @ x30 - x14
+ add r9, r9, r5 @ y31 = x31 + x15
+ sub r5, r9, r5, asl #1 @ x31 - x15
+ @ sums go to x[28..31] (no writeback)
+ stmia r1, {r6, r7, r8, r9}
+ @ as in group 3 with the two constants swapped
+ smull r10, r7, r2, lr
+ smlal r10, r7, r3, r12 @ (x28 - x12)*cPI3_8 + (x29 - x13)*cPI1_8
+ rsb r3, r3, #0
+ smull r10, r6, r3, lr
+ smlal r10, r6, r2, r12 @ (x28 - x12)*cPI1_8 - (x29 - x13)*cPI3_8
+ mov r6, r6, asl #1
+ mov r7, r7, asl #1
+ @ the last two differences are stored unscaled
+ mov r8, r4
+ mov r9, r5
+ stmia r0, {r6, r7, r8, r9}
+ @ finish with a 16-point butterfly on each half
+ sub r0, r0, #12*4 @ r0 = &x[0]
+ str r0, [sp, #-4]! @ keep x across the call; mdct_butterfly_16 changes r0
+ bl mdct_butterfly_16
+
+ ldr r0, [sp], #4
+ add r0, r0, #16*4 @ r0 = &x[16]
+ bl mdct_butterfly_16
+ @ restore the saved registers and return
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+ @ mdct_butterfly_generic_loop(x1, x2, T0, step, Ttop)
+mdct_butterfly_generic_loop: @ r0 = x1, r1 = x2, r2 = T0, r3 = step, Ttop is passed on the stack
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+ str r2, [sp, #-4]! @ push T0 with writeback so the slot is inside the stack, not below sp
+ ldr r4, [sp, #44] @ r4 = Ttop: the stack argument, now above the 11 words pushed here
+1: @ pass 1: T (r2) steps upward until it reaches Ttop
+ ldmdb r0, {r6, r7, r8, r9} @ the four words just below x1
+ ldmdb r1, {r10, r11, r12, r14} @ the four words just below x2
+ @ r6-r9 become the sums; r10-r14 the differences (sub gives x1 - x2, rsb gives x2 - x1)
+ add r6, r6, r10
+ sub r10, r6, r10, asl #1
+ add r7, r7, r11
+ rsb r11, r7, r11, asl #1
+ add r8, r8, r12
+ sub r12, r8, r12, asl #1
+ add r9, r9, r14
+ rsb r14, r9, r14, asl #1
+ @ sums are written back below x1; r0 moves down four words
+ stmdb r0!, {r6, r7, r8, r9}
+ @ combine r12/r14 with the coefficient pair at T; the result kept is the high word << 1
+ ldmia r2, {r6, r7}
+ smull r5, r8, r14, r6
+ smlal r5, r8, r12, r7
+ rsb r14, r14, #0
+ smull r5, r9, r12, r6
+ smlal r5, r9, r14, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9} @ two results written below x2; r1 moves down two words
+ add r2, r2, r3, asl #2 @ T += step entries (4 bytes each)
+ @ same again for r10/r11 with the next coefficient pair
+ ldmia r2, {r6, r7}
+ smull r5, r8, r11, r6
+ smlal r5, r8, r10, r7
+ rsb r11, r11, #0
+ smull r5, r9, r10, r6
+ smlal r5, r9, r11, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ add r2, r2, r3, asl #2
+
+ cmp r2, r4
+ blo 1b
+ @ pass 2: T steps back down until it reaches T0
+ ldr r4, [sp] @ r4 = T0 saved at entry
+1:
+ ldmdb r0, {r6, r7, r8, r9}
+ ldmdb r1, {r10, r11, r12, r14}
+ @ all four differences are x1 - x2 in this pass
+ add r6, r6, r10
+ sub r10, r6, r10, asl #1
+ add r7, r7, r11
+ sub r11, r7, r11, asl #1
+ add r8, r8, r12
+ sub r12, r8, r12, asl #1
+ add r9, r9, r14
+ sub r14, r9, r14, asl #1
+
+ stmdb r0!, {r6, r7, r8, r9}
+ @ here the first product lands in r9 and the second in r8
+ ldmia r2, {r6, r7}
+ smull r5, r9, r14, r6
+ smlal r5, r9, r12, r7
+ rsb r14, r14, #0
+ smull r5, r8, r12, r6
+ smlal r5, r8, r14, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ sub r2, r2, r3, asl #2 @ T -= step entries
+
+ ldmia r2, {r6, r7}
+ smull r5, r9, r11, r6
+ smlal r5, r9, r10, r7
+ rsb r11, r11, #0
+ smull r5, r8, r10, r6
+ smlal r5, r8, r11, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ sub r2, r2, r3, asl #2
+
+ cmp r2, r4
+ bhi 1b
+ @ pass 3: T steps upward again until it reaches Ttop
+ ldr r4, [sp, #44] @ r4 = Ttop
+1:
+ ldmdb r0, {r6, r7, r8, r9}
+ ldmdb r1, {r10, r11, r12, r14}
+ @ all four differences are x2 - x1 in this pass
+ add r6, r6, r10
+ rsb r10, r6, r10, asl #1
+ add r7, r7, r11
+ rsb r11, r7, r11, asl #1
+ add r8, r8, r12
+ rsb r12, r8, r12, asl #1
+ add r9, r9, r14
+ rsb r14, r9, r14, asl #1
+
+ stmdb r0!, {r6, r7, r8, r9}
+
+ ldmia r2, {r6, r7}
+ smull r5, r8, r12, r6
+ smlal r5, r8, r14, r7
+ rsb r12, r12, #0
+ smull r5, r9, r14, r6
+ smlal r5, r9, r12, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ add r2, r2, r3, asl #2
+
+ ldmia r2, {r6, r7}
+ smull r5, r8, r10, r6
+ smlal r5, r8, r11, r7
+ rsb r10, r10, #0
+ smull r5, r9, r11, r6
+ smlal r5, r9, r10, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ add r2, r2, r3, asl #2
+
+ cmp r2, r4
+ blo 1b
+ @ pass 4: T steps back down until it reaches T0
+ ldr r4, [sp] @ r4 = T0 saved at entry
+1:
+ ldmdb r0, {r6, r7, r8, r9}
+ ldmdb r1, {r10, r11, r12, r14}
+ @ differences alternate as in pass 1 (sub: x1 - x2, rsb: x2 - x1)
+ add r6, r6, r10
+ sub r10, r6, r10, asl #1
+ add r7, r7, r11
+ rsb r11, r7, r11, asl #1
+ add r8, r8, r12
+ sub r12, r8, r12, asl #1
+ add r9, r9, r14
+ rsb r14, r9, r14, asl #1
+
+ stmdb r0!, {r6, r7, r8, r9}
+
+ ldmia r2, {r6, r7}
+ smull r5, r9, r12, r6
+ smlal r5, r9, r14, r7
+ rsb r12, r12, #0
+ smull r5, r8, r14, r6
+ smlal r5, r8, r12, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ sub r2, r2, r3, asl #2
+
+ ldmia r2, {r6, r7}
+ smull r5, r9, r10, r6
+ smlal r5, r9, r11, r7
+ rsb r10, r10, #0
+ smull r5, r8, r11, r6
+ smlal r5, r8, r10, r7
+
+ mov r8, r8, asl #1
+ mov r9, r9, asl #1
+ stmdb r1!, {r8, r9}
+ sub r2, r2, r3, asl #2
+
+ cmp r2, r4
+ bhi 1b
+ add sp, sp, #4 @ release the T0 slot pushed at entry
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+