From 6b9350b4d355a7598b737c00a2a3c02dd99bb1ec Mon Sep 17 00:00:00 2001 From: Pedro Vasconcelos Date: Fri, 10 Jun 2005 22:34:57 +0000 Subject: A little improvement on Vorbis block synthesis. Added myself to the list of contributors. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6664 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/Tremor/asm_mcf5249.h | 117 +++++++++++++++++++++----- apps/codecs/Tremor/block.c | 163 +++++++++++++++++++++---------------- apps/codecs/Tremor/mapping0.c | 12 +-- apps/codecs/Tremor/mdct.c | 4 - apps/codecs/Tremor/synthesis.c | 6 +- apps/codecs/Tremor/window_lookup.h | 8 +- docs/CREDITS | 1 + 7 files changed, 204 insertions(+), 107 deletions(-) diff --git a/apps/codecs/Tremor/asm_mcf5249.h b/apps/codecs/Tremor/asm_mcf5249.h index 811148a8c8..9844cc05a4 100644 --- a/apps/codecs/Tremor/asm_mcf5249.h +++ b/apps/codecs/Tremor/asm_mcf5249.h @@ -21,6 +21,9 @@ #if CONFIG_CPU == MCF5249 && !defined(SIMULATOR) +/* attribute for 16-byte alignment */ +#define LINE_ATTR __attribute__ ((aligned (16))) + #ifndef _V_WIDE_MATH #define _V_WIDE_MATH @@ -107,15 +110,14 @@ void XNPROD31(ogg_int32_t a, ogg_int32_t b, } - - -#if 1 /* Canonical definition */ +#if 1 +/* canonical definition */ #define XPROD32(_a, _b, _t, _v, _x, _y) \ { (_x)=MULT32(_a,_t)+MULT32(_b,_v); \ (_y)=MULT32(_b,_t)-MULT32(_a,_v); } #else -/* Thom Johansen suggestion; this could loose the lsb by overflow - but does it matter in practice? */ +/* Thom Johansen's suggestion; this could lose the LSB by overflow; + Does it matter in practice? 
*/ #define XPROD32(_a, _b, _t, _v, _x, _y) \ asm volatile ("mac.l %[a], %[t], %%acc0;" \ "mac.l %[b], %[v], %%acc0;" \ @@ -129,14 +131,82 @@ void XNPROD31(ogg_int32_t a, ogg_int32_t b, : [a] "r" (_a), [b] "r" (_b), \ [t] "r" (_t), [v] "r" (_v) \ : "cc"); -#endif +#endif -/* asm versions of vector multiplication for window.c */ +/* asm versions of vector operations for block.c, window.c */ /* assumes MAC is initialized & accumulators cleared */ +static inline +void mcf5249_vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + /* x[i] += y[i] for n elements; first peel single elements until x is 16-byte aligned (low four address bits clear); y is not checked */ + while(n>0 && ((int)x&15)) { + *x++ += *y++; + n--; + } + asm volatile ("bra 1f;" + "0:" /* loop start: handles 4 elements per pass */ + "movem.l (%[x]), %%d0-%%d3;" /* fetch values */ + "movem.l (%[y]), %%a0-%%a3;" + /* add */ + "add.l %%a0, %%d0;" + "add.l %%a1, %%d1;" + "add.l %%a2, %%d2;" + "add.l %%a3, %%d3;" + /* store and advance */ + "movem.l %%d0-%%d3, (%[x]);" + "lea.l (4*4, %[x]), %[x];" + "lea.l (4*4, %[y]), %[y];" + "subq.l #4, %[n];" /* done 4 elements */ + "1: cmpi.l #4, %[n];" + "bge 0b;" + : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y) + : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3", + "cc", "memory"); + /* add the remaining (fewer than 4) elements */ + while (n>0) { + *x++ += *y++; + n--; + } +} + +static inline +void mcf5249_vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + /* x[i] = y[i] for n elements (x is the destination); first peel single elements until x is 16-byte aligned (low four address bits clear); y is not checked */ + while(n>0 && ((int)x&15)) { + *x++ = *y++; + n--; + } + asm volatile ("bra 1f;" + "0:" /* loop start: handles 4 elements per pass */ + "movem.l (%[y]), %%d0-%%d3;" /* fetch values */ + "movem.l %%d0-%%d3, (%[x]);" /* store */ + "lea.l (4*4, %[x]), %[x];" /* advance */ + "lea.l (4*4, %[y]), %[y];" + "subq.l #4, %[n];" /* done 4 elements */ + "1: cmpi.l #4, %[n];" + "bge 0b;" + : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y) + : : "%d0", "%d1", "%d2", "%d3", "cc", "memory"); + /* copy the remaining (fewer than 4) elements */ + while (n>0) { + *x++ = *y++; + n--; + } +} + + static inline void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) { + /* ensure data is aligned to 16-bytes */ + 
while(n>0 && (int)data%16) { + *data = MULT31(*data, *window); + data++; + window++; + n--; + } asm volatile ("movem.l (%[d]), %%d0-%%d3;" /* loop start */ "movem.l (%[w]), %%a0-%%a3;" /* pre-fetch registers */ "lea.l (4*4, %[w]), %[w];" @@ -184,6 +254,13 @@ void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) static inline void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) { + /* ensure at least data is aligned to 16-bytes */ + while(n>0 && (int)data%16) { + *data = MULT31(*data, *window); + data++; + window--; + n--; + } asm volatile ("lea.l (-3*4, %[w]), %[w];" /* loop start */ "movem.l (%[d]), %%d0-%%d3;" /* pre-fetch registers */ "movem.l (%[w]), %%a0-%%a3;" @@ -232,6 +309,11 @@ void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) static inline void mcf5249_vect_zero(ogg_int32_t *ptr, int n) { + /* ensure ptr is aligned to 16-bytes */ + while(n>0 && (int)ptr%16) { + *ptr++ = 0; + n--; + } asm volatile ("clr.l %%d0;" "clr.l %%d1;" "clr.l %%d2;" @@ -241,23 +323,16 @@ void mcf5249_vect_zero(ogg_int32_t *ptr, int n) "bra 1f;" "0: movem.l %%d0-%%d3, (%[ptr]);" "lea (4*4, %[ptr]), %[ptr];" - "subq.l #4, %[n];" + "subq.l #4, %[n];" /* done 4 elements */ "1: bgt 0b;" - /* remaing elements */ - "tst.l %[n];" - "beq 1f;" /* n=0 */ - "clr.l (%[ptr])+;" - "subq.l #1, %[n];" - "beq 1f;" /* n=1 */ - "clr.l (%[ptr])+;" - "subq.l #1, %[n];" - "beq 1f;" /* n=2 */ - /* otherwise n = 3 */ - "clr.l (%[ptr])+;" - "1:" : [n] "+d" (n), [ptr] "+a" (ptr) : : "%d0","%d1","%d2","%d3","cc","memory"); + /* clear remaining elements */ + while(n>0) { + *ptr++ = 0; + n--; + } } #endif @@ -272,4 +347,6 @@ static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) { } #endif +#else +#define LINE_ATTR #endif diff --git a/apps/codecs/Tremor/block.c b/apps/codecs/Tremor/block.c index 6f88fb812c..f51622b5ed 100644 --- a/apps/codecs/Tremor/block.c +++ b/apps/codecs/Tremor/block.c @@ -70,8 +70,8 @@ static int ilog(unsigned int v){ | | |endSr | 
|beginSr | |endSl - |beginSl - |beginW + |beginSl + |beginW */ /* block abstraction setup *********************************************/ @@ -173,10 +173,8 @@ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){ v->pcm=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcm)); v->pcmret=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmret)); - // pbv: allow for extra padding for windowing for(i=0;ichannels;i++) v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i])); - // v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i])); /* all 1 (large block) or 0 (small block) */ @@ -190,7 +188,7 @@ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){ int mapnum=ci->mode_param[i]->mapping; int maptype=ci->map_type[mapnum]; b->mode[i]=_mapping_P[maptype]->look(v,ci->mode_param[i], - ci->map_param[mapnum]); + ci->map_param[mapnum]); } return(0); } @@ -231,7 +229,7 @@ void vorbis_dsp_clear(vorbis_dsp_state *v){ if(v->pcm){ for(i=0;ichannels;i++) - if(v->pcm[i])_ogg_free(v->pcm[i]); + if(v->pcm[i])_ogg_free(v->pcm[i]); _ogg_free(v->pcm); if(v->pcmret)_ogg_free(v->pcmret); } @@ -239,9 +237,9 @@ void vorbis_dsp_clear(vorbis_dsp_state *v){ /* free mode lookups; these are actually vorbis_look_mapping structs */ if(ci){ for(i=0;imodes;i++){ - int mapnum=ci->mode_param[i]->mapping; - int maptype=ci->map_type[mapnum]; - if(b && b->mode)_mapping_P[maptype]->free_look(b->mode[i]); + int mapnum=ci->mode_param[i]->mapping; + int maptype=ci->map_type[mapnum]; + if(b && b->mode)_mapping_P[maptype]->free_look(b->mode[i]); } } @@ -262,7 +260,11 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ vorbis_info *vi=v->vi; codec_setup_info *ci=(codec_setup_info *)vi->codec_setup; private_state *b=v->backend_state; +#if CONFIG_CPU == MCF5249 + int j; +#else int i,j; +#endif if(v->pcm_current>v->pcm_returned && v->pcm_returned!=-1)return(OV_EINVAL); @@ -304,43 +306,64 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block 
*vb){ for(j=0;jchannels;j++){ /* the overlap/add section */ if(v->lW){ - if(v->W){ - /* large/large */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]; - for(i=0;ipcm[j]+prevCenter+n1/2-n0/2; - ogg_int32_t *p=vb->pcm[j]; - for(i=0;iW){ + /* large/large */ + ogg_int32_t *pcm=v->pcm[j]+prevCenter; + ogg_int32_t *p=vb->pcm[j]; +#if CONFIG_CPU == MCF5249 + mcf5249_vect_add(pcm, p, n1); +#else + for(i=0;ipcm[j]+prevCenter+n1/2-n0/2; + ogg_int32_t *p=vb->pcm[j]; +#if CONFIG_CPU == MCF5249 + mcf5249_vect_add(pcm, p, n0); +#else + for(i=0;iW){ - /* small/large */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2; - for(i=0;ipcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]; - for(i=0;iW){ + /* small/large */ + ogg_int32_t *pcm=v->pcm[j]+prevCenter; + ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2; +#if CONFIG_CPU == MCF5249 + mcf5249_vect_add(pcm, p, n0); + mcf5249_vect_copy(&pcm[n0], &p[n0], n1/2-n0/2); +#else + for(i=0;ipcm[j]+prevCenter; + ogg_int32_t *p=vb->pcm[j]; +#if CONFIG_CPU == MCF5249 + mcf5249_vect_add(pcm, p, n0); +#else + for(i=0;ipcm[j]+thisCenter; - ogg_int32_t *p=vb->pcm[j]+n; - for(i=0;ipcm[j]+thisCenter; + ogg_int32_t *p=vb->pcm[j]+n; +#if CONFIG_CPU == MCF5249 + mcf5249_vect_copy(pcm, p, n); +#else + for(i=0;ipcm_returned=prevCenter; v->pcm_current=prevCenter+ - ci->blocksizes[v->lW]/4+ - ci->blocksizes[v->W]/4; + ci->blocksizes[v->lW]/4+ + ci->blocksizes[v->W]/4; } } @@ -389,23 +412,23 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ /* is this a short page? 
*/ if(b->sample_count>v->granulepos){ - /* corner case; if this is both the first and last audio page, - then spec says the end is cut, not beginning */ - if(vb->eofflag){ - /* trim the end */ - /* no preceeding granulepos; assume we started at zero (we'd - have to in a short single-page stream) */ - /* granulepos could be -1 due to a seek, but that would result - in a long coun`t, not short count */ - - v->pcm_current-=(b->sample_count-v->granulepos); - }else{ - /* trim the beginning */ - v->pcm_returned+=(b->sample_count-v->granulepos); - if(v->pcm_returned>v->pcm_current) - v->pcm_returned=v->pcm_current; - } - + /* corner case; if this is both the first and last audio page, + then spec says the end is cut, not beginning */ + if(vb->eofflag){ + /* trim the end */ + /* no preceeding granulepos; assume we started at zero (we'd + have to in a short single-page stream) */ + /* granulepos could be -1 due to a seek, but that would result + in a long coun`t, not short count */ + + v->pcm_current-=(b->sample_count-v->granulepos); + }else{ + /* trim the beginning */ + v->pcm_returned+=(b->sample_count-v->granulepos); + if(v->pcm_returned>v->pcm_current) + v->pcm_returned=v->pcm_current; + } + } } @@ -414,16 +437,16 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ if(vb->granulepos!=-1 && v->granulepos!=vb->granulepos){ if(v->granulepos>vb->granulepos){ - long extra=v->granulepos-vb->granulepos; - - if(extra) - if(vb->eofflag){ - /* partial last frame. Strip the extra samples off */ - v->pcm_current-=extra; - } /* else {Shouldn't happen *unless* the bitstream is out of - spec. Either way, believe the bitstream } */ + long extra=v->granulepos-vb->granulepos; + + if(extra) + if(vb->eofflag){ + /* partial last frame. Strip the extra samples off */ + v->pcm_current-=extra; + } /* else {Shouldn't happen *unless* the bitstream is out of + spec. Either way, believe the bitstream } */ } /* else {Shouldn't happen *unless* the bitstream is out of - spec. 
Either way, believe the bitstream } */ + spec. Either way, believe the bitstream } */ v->granulepos=vb->granulepos; } } @@ -441,7 +464,7 @@ int vorbis_synthesis_pcmout(vorbis_dsp_state *v,ogg_int32_t ***pcm){ if(pcm){ int i; for(i=0;ichannels;i++) - v->pcmret[i]=v->pcm[i]+v->pcm_returned; + v->pcmret[i]=v->pcm[i]+v->pcm_returned; *pcm=v->pcmret; } return(v->pcm_current-v->pcm_returned); diff --git a/apps/codecs/Tremor/mapping0.c b/apps/codecs/Tremor/mapping0.c index c53383de8f..6154f5de6b 100644 --- a/apps/codecs/Tremor/mapping0.c +++ b/apps/codecs/Tremor/mapping0.c @@ -202,10 +202,6 @@ static int mapping0_inverse(vorbis_block *vb,vorbis_look_mapping *l){ int nonzero[CHANNELS]; void *floormemo[CHANNELS]; - /* test for too many channels; - (maybe this is can be checked at the stream level?) */ - if (vi->channels > CHANNELS) return (-1); - /* time domain information decode (note that applying the information would have to happen later; we'll probably add a function entry to the harness for that later */ @@ -286,13 +282,14 @@ static int mapping0_inverse(vorbis_block *vb,vorbis_look_mapping *l){ //_analysis_output("residue",seq+j,vb->pcm[j],-8,n/2,0,0); /* compute and apply spectral envelope */ +#if 0 for(i=0;ichannels;i++){ ogg_int32_t *pcm=vb->pcm[i]; int submap=info->chmuxlist[i]; look->floor_func[submap]-> inverse2(vb,look->floor_look[submap],floormemo[i],pcm); } - +#endif //for(j=0;jchannels;j++) //_analysis_output("mdct",seq+j,vb->pcm[j],-24,n/2,0,1); @@ -301,8 +298,11 @@ static int mapping0_inverse(vorbis_block *vb,vorbis_look_mapping *l){ for(i=0;ichannels;i++){ ogg_int32_t *pcm=vb->pcm[i]; + int submap=info->chmuxlist[i]; - if(nonzero[i]) { + if(nonzero[i]) { + look->floor_func[submap]-> + inverse2(vb,look->floor_look[submap],floormemo[i],pcm); mdct_backward(n, pcm, pcm); /* window the data */ _vorbis_apply_window(pcm,b->window,ci->blocksizes,vb->lW,vb->W,vb->nW); diff --git a/apps/codecs/Tremor/mdct.c b/apps/codecs/Tremor/mdct.c index 27a340bcad..9bdfdce2e2 
100644 --- a/apps/codecs/Tremor/mdct.c +++ b/apps/codecs/Tremor/mdct.c @@ -341,10 +341,6 @@ void mdct_backward(int n, DATA_TYPE *in, DATA_TYPE *out) { int shift; int step; -#if CONFIG_CPU == MCF5249 - /* mcf5249_init_mac(); */ /* should be redundant */ -#endif - for (shift=6;!(n&(1<vd; @@ -73,10 +73,10 @@ int vorbis_synthesis(vorbis_block *vb,ogg_packet *op,int decodep){ vb->sequence=op->packetno-3; /* first block is third packet */ vb->eofflag=op->e_o_s; - if(decodep){ + if(decodep && vi->channels<=CHANNELS){ /* alloc pcm passback storage */ vb->pcmend=ci->blocksizes[vb->W]; - if (vi->channels <= CHANNELS && vb->pcmend<=IRAM_PCM_END) { + if (vb->pcmend<=IRAM_PCM_END) { /* use statically allocated iram buffer */ vb->pcm = ipcm_vect; for(i=0; i