author     Jens Arnold <amiconn@rockbox.org>   2006-08-11 14:13:01 +0000
committer  Jens Arnold <amiconn@rockbox.org>   2006-08-11 14:13:01 +0000
commit     71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch)
tree       b9a97081ec04d4d311a7b45747393e68837912a2
parent     bcd94a9b01d19d87a437cd8158a758f206b30825 (diff)
download   rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.tar.gz
           rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.zip
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--  apps/plugins/lib/gray_core.c  1417
-rw-r--r--  apps/plugins/lib/gray_draw.c  1156
2 files changed, 1858 insertions, 715 deletions
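
In short: the old inner loops rebuilt every bitplane byte bit by bit (a shift-and-carry pair per pixel, repeated for each plane), so the cost grew with depth times eight bit operations per output byte. The new code first transposes the eight pixel patterns with three masked-XOR "comb" stages (4-bit, 2-bit, 1-bit), after which each register byte already is a finished bitplane byte, and every plane then costs just one byte store. Below is a minimal C sketch of that transpose, not part of the patch; the names and the uint32_t typing are illustrative -- the real code keeps everything in registers (r1..r8 on ARM and SH1, d1..d7/a0 on Coldfire), as documented by the comments in the hunks below.

#include <stdint.h>

/* Exchange the bit groups selected by 'mask' between *lo and (*hi << shift)
 * (the delta-swap idiom the asm performs with eor/and/eor). */
static inline void delta_swap(uint32_t *lo, uint32_t *hi,
                              unsigned shift, uint32_t mask)
{
    uint32_t t = (*lo ^ (*hi << shift)) & mask;
    *lo ^= t;
    *hi ^= t >> shift;
}

/* Transpose the four 8x8 bit blocks held in pat[0..7].
 * Input:  one (rotated) bit pattern per pixel, one bit per bitplane.
 * Output: byte n of pat[k] collects one bit from each of the eight
 *         patterns -- the finished byte for bitplane 8*n + k. */
static void transpose_blocks(uint32_t pat[8])
{
    int i;

    for (i = 0; i < 4; i++)            /* Stage 1: 4 bit "comb" */
        delta_swap(&pat[i], &pat[i + 4], 4, 0xF0F0F0F0u);

    for (i = 0; i < 8; i += 4)         /* Stage 2: 2 bit "comb" */
    {
        delta_swap(&pat[i],     &pat[i + 2], 2, 0xCCCCCCCCu);
        delta_swap(&pat[i + 1], &pat[i + 3], 2, 0xCCCCCCCCu);
    }

    for (i = 0; i < 8; i += 2)         /* Stage 3: 1 bit "comb" */
        delta_swap(&pat[i], &pat[i + 1], 1, 0xAAAAAAAAu);
}

The transpose costs a fixed number of instructions regardless of depth, which is why the gain grows with the buffer depth.
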
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c
index e65a7f259e..809e88dba1 100644
--- a/apps/plugins/lib/gray_core.c
+++ b/apps/plugins/lib/gray_core.c
@@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height)
bbuf = _gray_info.back_buffer + srcofs_row;
#ifdef CPU_ARM
- asm volatile (
+ asm volatile
+ (
"ldr r0, [%[cbuf]] \n"
"ldr r1, [%[bbuf]] \n"
"eor r1, r0, r1 \n"
@@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height)
if (change != 0)
{
- unsigned char *addr, *end;
- unsigned mask, trash;
+ unsigned char *addr;
+ unsigned mask, depth, trash;
pat_ptr = &pat_stack[8];
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
- asm volatile (
- "mov r3, #8 \n" /* loop count */
- "mov %[mask], #0 \n"
-
- ".ur_pre_loop: \n"
- "mov %[mask], %[mask], lsl #1 \n" /* shift mask */
- "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
- "ldrb r1, [%[bbuf]] \n" /* read back buffer */
- "strb r0, [%[bbuf]], #1 \n" /* update back buffer */
- "mov r2, #0 \n" /* preset for skipped pixel */
- "cmp r0, r1 \n" /* no change? */
- "beq .ur_skip \n" /* -> skip */
-
- "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
-
- "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */
- "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n"
- "add %[rnd], %[rnd], #74 \n" /* add another 74 */
- /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
- "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
-
- "cmp r1, %[dpth] \n" /* random >= depth ? */
- "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
-
- "mov r0, r2, lsl r1 \n" /** rotate pattern **/
- "sub r1, %[dpth], r1 \n"
- "orr r2, r0, r2, lsr r1 \n"
-
- "orr %[mask], %[mask], #1 \n" /* set mask bit */
+ asm volatile
+ (
+ "mov r3, #8 \n" /* loop count */
+ "mov %[mask], #0 \n"
+
+ ".ur_pre_loop: \n"
+ "mov %[mask], %[mask], lsl #1 \n" /* shift mask */
+ "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
+ "ldrb r1, [%[bbuf]] \n" /* read back buffer */
+ "strb r0, [%[bbuf]], #1 \n" /* update back buffer */
+ "mov r2, #0 \n" /* preset for skipped pixel */
+ "cmp r0, r1 \n" /* no change? */
+ "beq .ur_skip \n" /* -> skip */
+
+ "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
+
+ "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */
+ "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n"
+ "add %[rnd], %[rnd], #74 \n" /* add another 74 */
+ /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
+ "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
+
+ "cmp r1, %[dpth] \n" /* random >= depth ? */
+ "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
+
+ "mov r0, r2, lsl r1 \n" /** rotate pattern **/
+ "sub r1, %[dpth], r1 \n"
+ "orr r2, r0, r2, lsr r1 \n"
+
+ "orr %[mask], %[mask], #1 \n" /* set mask bit */
- ".ur_skip: \n"
- "str r2, [%[patp], #-4]! \n" /* push on pattern stack */
-
- "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
- "bne .ur_pre_loop \n"
- : /* outputs */
- [cbuf]"+r"(cbuf),
- [bbuf]"+r"(bbuf),
- [patp]"+r"(pat_ptr),
- [rnd] "+r"(_gray_random_buffer),
- [mask]"=&r"(mask)
- : /* inputs */
- [bpat]"r"(_gray_info.bitpattern),
- [dpth]"r"(_gray_info.depth),
- [rmsk]"r"(_gray_info.randmask)
- : /* clobbers */
- "r0", "r1", "r2", "r3"
+ ".ur_skip: \n"
+ "str r2, [%[patp], #-4]! \n" /* push on pattern stack */
+
+ "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
+ "bne .ur_pre_loop \n"
+ : /* outputs */
+ [cbuf]"+r"(cbuf),
+ [bbuf]"+r"(bbuf),
+ [patp]"+r"(pat_ptr),
+ [rnd] "+r"(_gray_random_buffer),
+ [mask]"=&r"(mask)
+ : /* inputs */
+ [bpat]"r"(_gray_info.bitpattern),
+ [dpth]"r"(_gray_info.depth),
+ [rmsk]"r"(_gray_info.randmask)
+ : /* clobbers */
+ "r0", "r1", "r2", "r3"
);
addr = dst_row;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
+ asm volatile
+ (
+ "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
+
+ /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
+
+ "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
+ "eor r0, r1, r5, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
+ "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
+ "eor r0, r2, r6, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
+ "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
+ "eor r0, r3, r7, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
+ "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
+ "eor r0, r4, r8, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
+ "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
+
+ "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
+ "eor r0, r1, r3, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
+ "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
+ "eor r0, r2, r4, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
+ "eor r0, r5, r7, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
+ "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
+ "eor r0, r6, r8, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
+ "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
+
+ "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
+ "eor r0, r1, r2, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
+ "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
+ "eor r0, r3, r4, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
+ "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
+ "eor r0, r5, r6, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
+ "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
+ "eor r0, r7, r8, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
+ "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
+
+ "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
+ "ands %[mask], %[mask], #0xff \n"
+ "beq .ur_sloop \n" /* short loop if no bits to keep */
+
+ ".ur_floop: \n" /** full loop (bits to keep)**/
+ "cmp %[dpth], #8 \n" /* 8 planes or more left? */
+ "bhs .ur_f8 \n"
+
+ "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
+ "add %[addr], %[addr], r0 \n" /* for this round */
+
+ "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
+ "add pc, pc, r0 \n"
+ ".ur_ftable: \n"
+ ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */
+ ".byte .ur_f1 - .ur_ftable - 4 \n"
+ ".byte .ur_f2 - .ur_ftable - 4 \n"
+ ".byte .ur_f3 - .ur_ftable - 4 \n"
+ ".byte .ur_f4 - .ur_ftable - 4 \n"
+ ".byte .ur_f5 - .ur_ftable - 4 \n"
+ ".byte .ur_f6 - .ur_ftable - 4 \n"
+ ".byte .ur_f7 - .ur_ftable - 4 \n"
+
+ ".ur_f8: \n"
+ "add %[addr], %[addr], %[psiz], lsl #3 \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go thru the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
+ "and r0, r0, %[mask] \n" /* mask out replaced bits */
+ "orr r0, r0, r8 \n" /* set new bits */
+ "strb r0, [%[addr]] \n" /* store byte */
+ "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
+ ".ur_f7: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r7 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r7, r7, lsr #8 \n"
+ ".ur_f6: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r6 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r6, r6, lsr #8 \n"
+ ".ur_f5: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r5 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r5, r5, lsr #8 \n"
+ ".ur_f4: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r4 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r4, r4, lsr #8 \n"
+ ".ur_f3: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r3 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r3, r3, lsr #8 \n"
+ ".ur_f2: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r2 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r2, r2, lsr #8 \n"
+ ".ur_f1: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r1 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r1, r1, lsr #8 \n"
+ ".ur_f0: \n"
+
+ "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
+ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
+ "bhi .ur_floop \n"
+
+ "b .ur_end \n"
+
+ ".ur_sloop: \n" /** short loop (nothing to keep) **/
+ "cmp %[dpth], #8 \n" /* 8 planes or more left? */
+ "bhs .ur_s8 \n"
+
+ "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
+ "add %[addr], %[addr], r0 \n" /* for this round */
- "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
- "ands %[mask], %[mask], #0xff \n"
- "beq .ur_sloop \n" /* short loop if nothing to keep */
-
- ".ur_floop: \n" /** full loop (there are bits to keep)**/
- "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
- "adc r0, r0, r0 \n" /* put bit into LSB for byte */
- "movs r8, r8, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r7, r7, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r6, r6, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r5, r5, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r4, r4, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r3, r3, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r2, r2, lsr #1 \n"
- "adc r0, r0, r0 \n"
-
- "ldrb r1, [%[addr]] \n" /* read old value */
- "and r1, r1, %[mask] \n" /* mask out replaced bits */
- "orr r1, r1, r0 \n" /* set new bits */
- "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
-
- "cmp %[end], %[addr] \n" /* loop for all bitplanes */
- "bne .ur_floop \n"
-
- "b .ur_end \n"
-
- ".ur_sloop: \n" /** short loop (nothing to keep) **/
- "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
- "adc r0, r0, r0 \n" /* put bit into LSB for byte */
- "movs r8, r8, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r7, r7, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r6, r6, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r5, r5, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r4, r4, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r3, r3, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r2, r2, lsr #1 \n"
- "adc r0, r0, r0 \n"
-
- "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
-
- "cmp %[end], %[addr] \n" /* loop for all bitplanes */
- "bne .ur_sloop \n"
-
- ".ur_end: \n"
- : /* outputs */
- [addr]"+r"(addr),
- [mask]"+r"(mask),
- [rx] "=&r"(trash)
- : /* inputs */
- [psiz]"r"(_gray_info.plane_size),
- [end] "r"(end),
- [patp]"[rx]"(pat_ptr)
- : /* clobbers */
- "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
- );
+ "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
+ "add pc, pc, r0 \n"
+ ".ur_stable: \n"
+ ".byte .ur_s0 - .ur_stable - 4 \n"
+ ".byte .ur_s1 - .ur_stable - 4 \n"
+ ".byte .ur_s2 - .ur_stable - 4 \n"
+ ".byte .ur_s3 - .ur_stable - 4 \n"
+ ".byte .ur_s4 - .ur_stable - 4 \n"
+ ".byte .ur_s5 - .ur_stable - 4 \n"
+ ".byte .ur_s6 - .ur_stable - 4 \n"
+ ".byte .ur_s7 - .ur_stable - 4 \n"
+
+ ".ur_s8: \n"
+ "add %[addr], %[addr], %[psiz], lsl #3 \n"
+ /* Point behind the last plane for this round. See above. */
+ "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
+ "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
+ ".ur_s7: \n"
+ "strb r7, [%[addr], -%[psiz]]! \n"
+ "mov r7, r7, lsr #8 \n"
+ ".ur_s6: \n"
+ "strb r6, [%[addr], -%[psiz]]! \n"
+ "mov r6, r6, lsr #8 \n"
+ ".ur_s5: \n"
+ "strb r5, [%[addr], -%[psiz]]! \n"
+ "mov r5, r5, lsr #8 \n"
+ ".ur_s4: \n"
+ "strb r4, [%[addr], -%[psiz]]! \n"
+ "mov r4, r4, lsr #8 \n"
+ ".ur_s3: \n"
+ "strb r3, [%[addr], -%[psiz]]! \n"
+ "mov r3, r3, lsr #8 \n"
+ ".ur_s2: \n"
+ "strb r2, [%[addr], -%[psiz]]! \n"
+ "mov r2, r2, lsr #8 \n"
+ ".ur_s1: \n"
+ "strb r1, [%[addr], -%[psiz]]! \n"
+ "mov r1, r1, lsr #8 \n"
+ ".ur_s0: \n"
+
+ "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
+ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
+ "bhi .ur_sloop \n"
+
+ ".ur_end: \n"
+ : /* outputs */
+ [addr]"+r"(addr),
+ [mask]"+r"(mask),
+ [dpth]"+r"(depth),
+ [rx] "=&r"(trash)
+ : /* inputs */
+ [psiz]"r"(_gray_info.plane_size),
+ [patp]"[rx]"(pat_ptr)
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
+ );
}
#else /* C version, for reference*/
#warning C version of gray_update_rect() used
@@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height)
for (i = 7; i >= 0; i--)
data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
-
+
*addr = (*addr & mask) | data;
addr += _gray_info.plane_size;
test <<= 1;
@@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height)
#if CONFIG_CPU == SH7034
asm volatile (
- "mov.l @%[cbuf],r1 \n"
- "mov.l @%[bbuf],r2 \n"
- "xor r1,r2 \n"
- "mov.l @(4,%[cbuf]),r1 \n"
- "mov.l @(4,%[bbuf]),%[chg] \n"
- "xor r1,%[chg] \n"
- "or r2,%[chg] \n"
+ "mov.l @%[cbuf], r1 \n"
+ "mov.l @%[bbuf], r2 \n"
+ "xor r1, r2 \n"
+ "mov.l @(4,%[cbuf]), r1 \n"
+ "mov.l @(4,%[bbuf]), %[chg]\n"
+ "xor r1, %[chg] \n"
+ "or r2, %[chg] \n"
: /* outputs */
[chg] "=r"(change)
: /* inputs */
@@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height)
if (change != 0)
{
- unsigned char *addr, *end;
- unsigned mask, trash;
+ unsigned char *addr;
+ unsigned mask, depth, trash;
pat_ptr = &pat_stack[8];
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
- asm volatile (
- "mov #8,r3 \n" /* loop count */
-
- ".ur_pre_loop: \n"
- "mov.b @%[cbuf]+,r0\n" /* read current buffer */
- "mov.b @%[bbuf],r1 \n" /* read back buffer */
- "mov #0,r2 \n" /* preset for skipped pixel */
- "mov.b r0,@%[bbuf] \n" /* update back buffer */
- "add #1,%[bbuf] \n"
- "cmp/eq r0,r1 \n" /* no change? */
- "bt .ur_skip \n" /* -> skip */
-
- "shll2 r0 \n" /* pixel value -> pattern offset */
- "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */
-
- "mov #75,r0 \n"
- "mulu r0,%[rnd] \n" /* multiply by 75 */
- "sts macl,%[rnd] \n"
- "add #74,%[rnd] \n" /* add another 74 */
- /* Since the lower bits are not very random: */
- "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */
- "and %[rmsk],r1 \n" /* mask out unneeded bits */
-
- "cmp/hs %[dpth],r1 \n" /* random >= depth ? */
- "bf .ur_ntrim \n"
- "sub %[dpth],r1 \n" /* yes: random -= depth; */
- ".ur_ntrim: \n"
+ asm volatile
+ (
+ "mov #8, r3 \n" /* loop count */
+
+ ".ur_pre_loop: \n"
+ "mov.b @%[cbuf]+, r0 \n" /* read current buffer */
+ "mov.b @%[bbuf], r1 \n" /* read back buffer */
+ "mov #0, r2 \n" /* preset for skipped pixel */
+ "mov.b r0, @%[bbuf] \n" /* update back buffer */
+ "add #1, %[bbuf] \n"
+ "cmp/eq r0, r1 \n" /* no change? */
+ "bt .ur_skip \n" /* -> skip */
+
+ "shll2 r0 \n" /* pixel value -> pattern offset */
+ "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
+
+ "mov #75, r0 \n"
+ "mulu r0, %[rnd] \n" /* multiply by 75 */
+ "sts macl, %[rnd] \n"
+ "add #74, %[rnd] \n" /* add another 74 */
+ /* Since the lower bits are not very random: */
+ "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
+ "and %[rmsk], r1 \n" /* mask out unneeded bits */
+
+ "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
+ "bf .ur_ntrim \n"
+ "sub %[dpth], r1 \n" /* yes: random -= depth; */
+ ".ur_ntrim: \n"
- "mov.l .ashlsi3,r0 \n" /** rotate pattern **/
- "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
- "mov r1,r5 \n"
-
- "mov %[dpth],r5 \n"
- "sub r1,r5 \n" /* r5 = depth - r1 */
- "mov.l .lshrsi3,r1 \n"
- "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
- "mov r0,r2 \n" /* store previous result in r2 */
+ "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
+ "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
+ "mov r1, r5 \n"
+
+ "mov %[dpth], r5 \n"
+ "sub r1, r5 \n" /* r5 = depth - r1 */
+ "mov.l .lshrsi3, r1 \n"
+ "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
+ "mov r0, r2 \n" /* store previous result in r2 */
- "or r0,r2 \n" /* rotated_pattern = r2 | r0 */
- "clrt \n" /* mask bit = 0 (replace) */
-
- ".ur_skip: \n" /* T == 1 if skipped */
- "rotcr %[mask] \n" /* get mask bit */
- "mov.l r2,@-%[patp]\n" /* push on pattern stack */
-
- "add #-1,r3 \n" /* loop 8 times (pixel block) */
- "cmp/pl r3 \n"
- "bt .ur_pre_loop\n"
-
- "shlr8 %[mask] \n" /* shift mask to low byte */
- "shlr16 %[mask] \n"
- : /* outputs */
- [cbuf]"+r"(cbuf),
- [bbuf]"+r"(bbuf),
- [rnd] "+r"(_gray_random_buffer),
- [patp]"+r"(pat_ptr),
- [mask]"=&r"(mask)
- : /* inputs */
- [dpth]"r"(_gray_info.depth),
- [bpat]"r"(_gray_info.bitpattern),
- [rmsk]"r"(_gray_info.randmask)
- : /* clobbers */
- "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr"
+ "or r0, r2 \n" /* rotated_pattern = r2 | r0 */
+ "clrt \n" /* mask bit = 0 (replace) */
+
+ ".ur_skip: \n" /* T == 1 if skipped */
+ "rotcr %[mask] \n" /* get mask bit */
+ "mov.l r2, @-%[patp] \n" /* push on pattern stack */
+
+ "add #-1, r3 \n" /* loop 8 times (pixel block) */
+ "cmp/pl r3 \n"
+ "bt .ur_pre_loop \n"
+
+ "shlr8 %[mask] \n" /* shift mask to low byte */
+ "shlr16 %[mask] \n"
+ : /* outputs */
+ [cbuf]"+r"(cbuf),
+ [bbuf]"+r"(bbuf),
+ [rnd] "+r"(_gray_random_buffer),
+ [patp]"+r"(pat_ptr),
+ [mask]"=&r"(mask)
+ : /* inputs */
+ [dpth]"r"(_gray_info.depth),
+ [bpat]"r"(_gray_info.bitpattern),
+ [rmsk]"r"(_gray_info.randmask)
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr"
);
addr = dst_row;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
- "mov.l @%[patp]+,r2 \n"
- "mov.l @%[patp]+,r3 \n"
- "mov.l @%[patp]+,r6 \n"
- "mov.l @%[patp]+,r7 \n"
- "mov.l @%[patp]+,r8 \n"
- "mov.l @%[patp]+,r9 \n"
- "mov.l @%[patp],r10 \n"
-
- "tst %[mask],%[mask] \n"
- "bt .ur_sloop \n" /* short loop if nothing to keep */
-
- ".ur_floop: \n" /** full loop (there are bits to keep)**/
- "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
- "rotcl r0 \n" /* rotate t bit into r0 */
- "shlr r2 \n"
- "rotcl r0 \n"
- "shlr r3 \n"
- "rotcl r0 \n"
- "shlr r6 \n"
- "rotcl r0 \n"
- "shlr r7 \n"
- "rotcl r0 \n"
- "shlr r8 \n"
- "rotcl r0 \n"
- "shlr r9 \n"
- "rotcl r0 \n"
- "shlr r10 \n"
- "mov.b @%[addr],%[rx] \n" /* read old value */
- "rotcl r0 \n"
- "and %[mask],%[rx] \n" /* mask out replaced bits */
- "or %[rx],r0 \n" /* set new bits */
- "mov.b r0,@%[addr] \n" /* store value to bitplane */
- "add %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
- "bt .ur_floop \n"
-
- "bra .ur_end \n"
- "nop \n"
-
- /* References to C library routines used in the precalc block */
- ".align 2 \n"
- ".ashlsi3: \n" /* C library routine: */
- ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */
- ".lshrsi3: \n" /* C library routine: */
- ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */
- /* both routines preserve r4, destroy r5 and take ~16 cycles */
-
- ".ur_sloop: \n" /** short loop (nothing to keep) **/
- "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
- "rotcl r0 \n" /* rotate t bit into r0 */
- "shlr r2 \n"
- "rotcl r0 \n"
- "shlr r3 \n"
- "rotcl r0 \n"
- "shlr r6 \n"
- "rotcl r0 \n"
- "shlr r7 \n"
- "rotcl r0 \n"
- "shlr r8 \n"
- "rotcl r0 \n"
- "shlr r9 \n"
- "rotcl r0 \n"
- "shlr r10 \n"
- "rotcl r0 \n"
- "mov.b r0,@%[addr] \n" /* store byte to bitplane */
- "add %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
- "bt .ur_sloop \n"
-
- ".ur_end: \n"
- : /* outputs */
- [addr]"+r"(addr),
- [mask]"+r"(mask),
- [rx] "=&r"(trash)
- : /* inputs */
- [psiz]"r"(_gray_info.plane_size),
- [end] "r"(end),
- [patp]"[rx]"(pat_ptr)
- : /* clobbers */
- "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10"
+ asm volatile
+ (
+ "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
+ "mov.l @%[patp]+, r7 \n"
+ "mov.l @%[patp]+, r6 \n"
+ "mov.l @%[patp]+, r5 \n"
+ "mov.l @%[patp]+, r4 \n"
+ "mov.l @%[patp]+, r3 \n"
+ "mov.l @%[patp]+, r2 \n"
+ "mov.l @%[patp], r1 \n"
+
+ /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
+
+ "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */
+ "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
+ "mov r6, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r2, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
+ "mov r7, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r3, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
+ "mov r8, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r4, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
+
+ "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */
+ "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
+ "shll2 r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
+ "shlr2 r0 \n"
+ "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
+ "mov r4, r0 \n"
+ "shll2 r0 \n"
+ "xor r2, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "shlr2 r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
+ "mov r7, r0 \n"
+ "shll2 r0 \n"
+ "xor r5, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
+ "shlr2 r0 \n"
+ "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
+ "mov r8, r0 \n"
+ "shll2 r0 \n"
+ "xor r6, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
+ "shlr2 r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
+
+ "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */
+ "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
+ "shll r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
+ "shlr r0 \n"
+ "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
+ "mov r4, r0 \n"
+ "shll r0 \n"
+ "xor r3, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
+ "shlr r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
+ "mov r6, r0 \n"
+ "shll r0 \n"
+ "xor r5, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
+ "shlr r0 \n"
+ "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
+ "mov r8, r0 \n"
+ "shll r0 \n"
+ "xor r7, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
+ "shlr r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
+
+ "tst %[mask], %[mask] \n"
+ "bt .ur_sloop \n" /* short loop if nothing to keep */
+
+ ".ur_floop: \n" /** full loop (there are bits to keep)**/
+ "mov #8, r0 \n"
+ "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
+ "bt .ur_f8 \n"
+
+ "mulu %[psiz], %[dpth] \n"
+ "mova .ur_ftable, r0 \n"
+ "mov.b @(r0, %[dpth]), %[rx] \n"
+ "add %[rx], r0 \n"
+ "sts macl, %[rx] \n" /* point behind the last plane.. */
+ "jmp @r0 \n" /* jump into streak */
+ "add %[rx], %[addr] \n" /* ..for this round */
+
+ ".align 2 \n"
+ ".ur_ftable: \n"
+ ".byte .ur_f0 - .ur_ftable \n"
+ ".byte .ur_f1 - .ur_ftable \n"
+ ".byte .ur_f2 - .ur_ftable \n"
+ ".byte .ur_f3 - .ur_ftable \n"
+ ".byte .ur_f4 - .ur_ftable \n"
+ ".byte .ur_f5 - .ur_ftable \n"
+ ".byte .ur_f6 - .ur_ftable \n"
+ ".byte .ur_f7 - .ur_ftable \n"
+
+ ".ur_f8: \n"
+ "mov %[psiz], %[rx] \n"
+ "shll2 %[rx] \n"
+ "add %[rx], %[rx] \n"
+ "add %[rx], %[addr] \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go thru the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n" /* load old byte */
+ "and %[mask], r0 \n" /* mask out replaced bits */
+ "or r8, r0 \n" /* set new bits */
+ "mov.b r0, @%[addr] \n" /* store byte */
+ "shlr8 r8 \n" /* shift out used-up byte */
+ ".ur_f7: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r7, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r7 \n"
+ ".ur_f6: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r6, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r6 \n"
+ ".ur_f5: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r5, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r5 \n"
+ ".ur_f4: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r4, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r4 \n"
+ ".ur_f3: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r3, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r3 \n"
+ ".ur_f2: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r2, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r2 \n"
+ ".ur_f1: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r1, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r1 \n"
+ ".ur_f0: \n"
+
+ "add %[rx], %[addr] \n" /* correct address */
+ "add #-8, %[dpth] \n"
+ "cmp/pl %[dpth] \n" /* next round if anything left */
+ "bt .ur_floop \n"
+
+ "bra .ur_end \n"
+ "nop \n"
+
+ /* References to C library routines used in the precalc block */
+ ".align 2 \n"
+ ".ashlsi3: \n" /* C library routine: */
+ ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
+ ".lshrsi3: \n" /* C library routine: */
+ ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
+ /* both routines preserve r4, destroy r5 and take ~16 cycles */
+
+ /* Bitmasks for the bit block rotation */
+ ".ur_mask4: \n"
+ ".long 0xF0F0F0F0 \n"
+ ".ur_mask2: \n"
+ ".long 0xCCCCCCCC \n"
+ ".ur_mask1: \n"
+ ".long 0xAAAAAAAA \n"
+
+ ".ur_sloop: \n" /** short loop (nothing to keep) **/
+ "mov #8, r0 \n"
+ "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
+ "bt .ur_s8 \n"
+
+ "mulu %[psiz], %[dpth] \n"
+ "mova .ur_stable, r0 \n"
+ "mov.b @(r0, %[dpth]), %[rx] \n"
+ "add %[rx], r0 \n"
+ "sts macl, %[rx] \n" /* point behind the last plane.. */
+ "jmp @r0 \n" /* jump into streak */
+ "add %[rx], %[addr] \n" /* ..for this round */
+
+ ".align 2 \n"
+ ".ur_stable: \n"
+ ".byte .ur_s0 - .ur_stable \n"
+ ".byte .ur_s1 - .ur_stable \n"
+ ".byte .ur_s2 - .ur_stable \n"
+ ".byte .ur_s3 - .ur_stable \n"
+ ".byte .ur_s4 - .ur_stable \n"
+ ".byte .ur_s5 - .ur_stable \n"
+ ".byte .ur_s6 - .ur_stable \n"
+ ".byte .ur_s7 - .ur_stable \n"
+
+ ".ur_s8: \n"
+ "mov %[psiz], %[rx] \n" /* Point behind the last plane */
+ "shll2 %[rx] \n" /* for this round. */
+ "add %[rx], %[rx] \n" /* See above. */
+ "add %[rx], %[addr] \n"
+
+ "sub %[psiz], %[addr] \n"
+ "mov.b r8, @%[addr] \n" /* store byte */
+ "shlr8 r8 \n" /* shift out used-up byte */
+ ".ur_s7: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r7, @%[addr] \n"
+ "shlr8 r7 \n"
+ ".ur_s6: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r6, @%[addr] \n"
+ "shlr8 r6 \n"
+ ".ur_s5: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r5, @%[addr] \n"
+ "shlr8 r5 \n"
+ ".ur_s4: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r4, @%[addr] \n"
+ "shlr8 r4 \n"
+ ".ur_s3: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r3, @%[addr] \n"
+ "shlr8 r3 \n"
+ ".ur_s2: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r2, @%[addr] \n"
+ "shlr8 r2 \n"
+ ".ur_s1: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r1, @%[addr] \n"
+ "shlr8 r1 \n"
+ ".ur_s0: \n"
+
+ "add %[rx], %[addr] \n" /* correct address */
+ "add #-8, %[dpth] \n"
+ "cmp/pl %[dpth] \n" /* next round if anything left */
+ "bt .ur_sloop \n"
+
+ ".ur_end: \n"
+ : /* outputs */
+ [addr]"+r"(addr),
+ [dpth]"+r"(depth),
+ [rx] "=&r"(trash)
+ : /* inputs */
+ [mask]"r"(mask),
+ [psiz]"r"(_gray_info.plane_size),
+ [patp]"[rx]"(pat_ptr)
+ : /* clobbers */
+ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
);
}
#elif defined(CPU_COLDFIRE)
asm volatile (
- "move.l (%[cbuf]),%%d0 \n"
- "move.l (%[bbuf]),%%d1 \n"
- "eor.l %%d0,%%d1 \n"
- "move.l (4,%[cbuf]),%%d0 \n"
- "move.l (4,%[bbuf]),%[chg] \n"
- "eor.l %%d0,%[chg] \n"
- "or.l %%d1,%[chg] \n"
+ "move.l (%[cbuf]), %%d0 \n"
+ "move.l (%[bbuf]), %%d1 \n"
+ "eor.l %%d0, %%d1 \n"
+ "move.l (4,%[cbuf]), %%d0 \n"
+ "move.l (4,%[bbuf]), %[chg] \n"
+ "eor.l %%d0, %[chg] \n"
+ "or.l %%d1, %[chg] \n"
: /* outputs */
[chg] "=&d"(change)
: /* inputs */
@@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height)
if (change != 0)
{
- unsigned char *addr, *end;
- unsigned mask, trash;
+ unsigned char *addr;
+ unsigned mask, depth, trash;
pat_ptr = &pat_stack[8];
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
- asm volatile (
- "moveq.l #8,%%d3 \n" /* loop count */
- "clr.l %[mask] \n"
-
- ".ur_pre_loop: \n"
- "clr.l %%d0 \n"
- "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */
- "clr.l %%d1 \n"
- "move.b (%[bbuf]),%%d1 \n" /* read back buffer */
- "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */
- "clr.l %%d2 \n" /* preset for skipped pixel */
- "cmp.l %%d0,%%d1 \n" /* no change? */
- "beq.b .ur_skip \n" /* -> skip */
-
- "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */
-
- "mulu.w #75,%[rnd] \n" /* multiply by 75 */
- "add.l #74,%[rnd] \n" /* add another 74 */
- /* Since the lower bits are not very random: */
- "move.l %[rnd],%%d1 \n"
- "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
- "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
-
- "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
- "blo.b .ur_ntrim \n"
- "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
- ".ur_ntrim: \n"
-
- "move.l %%d2,%%d0 \n" /** rotate pattern **/
- "lsl.l %%d1,%%d0 \n"
- "sub.l %[dpth],%%d1 \n"
- "neg.l %%d1 \n" /* d1 = depth - d1 */
- "lsr.l %%d1,%%d2 \n"
- "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */
-
- "or.l #0x0100,%[mask] \n" /* set mask bit */
-
- ".ur_skip: \n"
- "lsr.l #1,%[mask] \n" /* shift mask */
- "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
-
- "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
- "bne.b .ur_pre_loop \n"
- : /* outputs */
- [cbuf]"+a"(cbuf),
- [bbuf]"+a"(bbuf),
- [patp]"+a"(pat_ptr),
- [rnd] "+d"(_gray_random_buffer),
- [mask]"=&d"(mask)
- : /* inputs */
- [bpat]"a"(_gray_info.bitpattern),
- [dpth]"d"(_gray_info.depth),
- [rmsk]"d"(_gray_info.randmask)
- : /* clobbers */
- "d0", "d1", "d2", "d3"
+ asm volatile
+ (
+ "moveq.l #8, %%d3 \n" /* loop count */
+ "clr.l %[mask] \n"
+
+ ".ur_pre_loop: \n"
+ "clr.l %%d0 \n"
+ "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */
+ "clr.l %%d1 \n"
+ "move.b (%[bbuf]), %%d1 \n" /* read back buffer */
+ "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */
+ "clr.l %%d2 \n" /* preset for skipped pixel */
+ "cmp.l %%d0, %%d1 \n" /* no change? */
+ "beq.b .ur_skip \n" /* -> skip */
+
+ "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
+
+ "mulu.w #75, %[rnd] \n" /* multiply by 75 */
+ "add.l #74, %[rnd] \n" /* add another 74 */
+ /* Since the lower bits are not very random: */
+ "move.l %[rnd], %%d1 \n"
+ "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
+ "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
+
+ "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
+ "blo.b .ur_ntrim \n"
+ "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
+ ".ur_ntrim: \n"
+
+ "move.l %%d2, %%d0 \n" /** rotate pattern **/
+ "lsl.l %%d1, %%d0 \n"
+ "sub.l %[dpth], %%d1 \n"
+ "neg.l %%d1 \n" /* d1 = depth - d1 */
+ "lsr.l %%d1, %%d2 \n"
+ "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */
+
+ "or.l #0x0100, %[mask] \n" /* set mask bit */
+
+ ".ur_skip: \n"
+ "lsr.l #1, %[mask] \n" /* shift mask */
+ "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
+
+ "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
+ "bne.b .ur_pre_loop \n"
+ : /* outputs */
+ [cbuf]"+a"(cbuf),
+ [bbuf]"+a"(bbuf),
+ [patp]"+a"(pat_ptr),
+ [rnd] "+d"(_gray_random_buffer),
+ [mask]"=&d"(mask)
+ : /* inputs */
+ [bpat]"a"(_gray_info.bitpattern),
+ [dpth]"d"(_gray_info.depth),
+ [rmsk]"d"(_gray_info.randmask)
+ : /* clobbers */
+ "d0", "d1", "d2", "d3"
);
addr = dst_row;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
+ mask = ~mask & 0xff;
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
- /* pop all 8 patterns */
- "not.l %[mask] \n" /* "set" mask -> "keep" mask */
- "and.l #0xFF,%[mask] \n"
- "beq.b .ur_sstart \n" /* short loop if nothing to keep */
-
- ".ur_floop: \n" /** full loop (there are bits to keep)**/
- "clr.l %%d0 \n"
- "lsr.l #1,%%d2 \n" /* shift out pattern bit */
- "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
- "lsr.l #1,%%d3 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d4 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d5 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d6 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%a0,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a0 \n"
- "move.l %%a1,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a1 \n"
- "move.l %[ax],%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%[ax] \n"
-
- "move.b (%[addr]),%%d1 \n" /* read old value */
- "and.l %[mask],%%d1 \n" /* mask out replaced bits */
- "or.l %%d0,%%d1 \n" /* set new bits */
- "move.b %%d1,(%[addr]) \n" /* store value to bitplane */
-
- "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
- "bhi.b .ur_floop \n"
-
- "bra.b .ur_end \n"
-
- ".ur_sstart: \n"
- "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
-
- ".ur_sloop: \n" /** short loop (nothing to keep) **/
- "clr.l %%d0 \n"
- "lsr.l #1,%%d2 \n" /* shift out pattern bit */
- "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
- "lsr.l #1,%%d3 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d4 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d5 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d6 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%[mask] \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%a1,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a1 \n"
- "move.l %[ax],%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%[ax] \n"
-
- "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
- "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
- "bhi.b .ur_sloop \n"
-
- ".ur_end: \n"
- : /* outputs */
- [addr]"+a"(addr),
- [mask]"+d"(mask),
- [ax] "=&a"(trash)
- : /* inputs */
- [psiz]"a"(_gray_info.plane_size),
- [end] "a"(end),
- [patp]"[ax]"(pat_ptr)
- : /* clobbers */
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1"
+ asm volatile
+ (
+ "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
+ /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
+
+ /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
+
+ "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
+ "lsl.l #4, %%d0 \n"
+ /* move.l %[ax], %%d5 */ /* already in d5 */
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
+ "move.l %%d2, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "eor.l %%d6, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
+ "move.l %%d3, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "eor.l %%d7, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */
+ "move.l %%d4, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "move.l %%a0, %%d5 \n"
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
+ /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
+
+ "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
+ "lsl.l #2, %%d0 \n"
+ /* move.l %%a0, %%d5 */ /* still in d5 */
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
+ "move.l %[ax], %%d5 \n"
+ "move.l %%d5, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d7, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
+ /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
+ "move.l %%d2, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d4, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
+ "move.l %%d1, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d3, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
+
+ "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d2, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
+ "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
+ "move.l %%d3, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d4, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
+ /* move.l %[ax], %%d5 */ /* still in d5 */
+ "move.l %%d5, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d6, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
+ "move.l %%d7, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "move.l %%a0, %%d5 \n"
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
+
+ "tst.l %[mask] \n"
+ "jeq .ur_sloop \n" /* short loop if nothing to keep */
+
+ "move.l %[mask], %%d5 \n" /* need mask in data reg. */
+ "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
+
+ ".ur_floop: \n" /** full loop (there are bits to keep)**/
+ "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
+ "bhs.s .ur_f8 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "move.l %[dpth], %%d1 \n"
+ "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
+ "add.l %%d0, %[addr] \n" /* for this round */
+ "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
+ "bra.s .ur_f1 \n" /* dpth == 0 should never happen */
+ "bra.s .ur_f2 \n"
+ "bra.s .ur_f3 \n"
+ "bra.s .ur_f4 \n"
+ "bra.s .ur_f5 \n"
+ "bra.s .ur_f6 \n"
+ "bra.s .ur_f7 \n"
+
+ ".ur_f8: \n"
+ "move.l %[psiz], %%d0 \n"
+ "lsl.l #3, %%d0 \n"
+ "add.l %%d0, %[addr] \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go thru the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n" /* load old byte */
+ "and.l %%d5, %%d0 \n" /* mask out replaced bits */
+ "move.l %[mask], %%d1 \n"
+ "or.l %%d1, %%d0 \n" /* set new bits */
+ "move.b %%d0, (%[addr]) \n" /* store byte */
+ "lsr.l #8, %%d1 \n" /* shift out used-up byte */
+ "move.l %%d1, %[mask] \n"
+ ".ur_f7: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d2, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d2 \n"
+ ".ur_f6: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d3, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d3 \n"
+ ".ur_f5: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d4, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d4 \n"
+ ".ur_f4: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "move.l %[ax], %%d1 \n"
+ "or.l %%d1, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d1 \n"
+ "move.l %%d1, %[ax] \n"
+ ".ur_f3: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d6, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d6 \n"
+ ".ur_f2: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d7, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d7 \n"
+ ".ur_f1: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "move.l %%a0, %%d1 \n"
+ "or.l %%d1, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d1 \n"
+ "move.l %%d1, %%a0 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "lsl.l #3, %%d0 \n"
+ "add.l %%d0, %[addr] \n" /* correct address */
+ "subq.l #8, %[dpth] \n"
+ "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
+ "jgt .ur_floop \n" /* next round if anything left */
+
+ "jra .ur_end \n"
+
+ ".ur_sloop: \n" /** short loop (nothing to keep) **/
+ "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
+ "bhs.s .ur_s8 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "move.l %[dpth], %%d5 \n"
+ "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
+ "add.l %%d0, %[addr] \n" /* for this round */
+ "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
+ "bra.s .ur_s1 \n" /* dpth == 0 should never happen */
+ "bra.s .ur_s2 \n"
+ "bra.s .ur_s3 \n"
+ "bra.s .ur_s4 \n"
+ "bra.s .ur_s5 \n"
+ "bra.s .ur_s6 \n"
+ "bra.s .ur_s7 \n"
+
+ ".ur_s8: \n"
+ "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
+ "lsl.l #3, %%d0 \n" /* for this round. */
+ "add.l %%d0, %[addr] \n" /* See above. */
+
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d1, (%[addr]) \n" /* store byte */
+ "lsr.l #8, %%d1 \n" /* shift out used-up byte */
+ ".ur_s7: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d2, (%[addr]) \n"
+ "lsr.l #8, %%d2 \n"
+ ".ur_s6: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d3, (%[addr]) \n"
+ "lsr.l #8, %%d3 \n"
+ ".ur_s5: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d4, (%[addr]) \n"
+ "lsr.l #8, %%d4 \n"
+ ".ur_s4: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.l %[ax], %%d5 \n"
+ "move.b %%d5, (%[addr]) \n"
+ "lsr.l #8, %%d5 \n"
+ "move.l %%d5, %[ax] \n"
+ ".ur_s3: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d6, (%[addr]) \n"
+ "lsr.l #8, %%d6 \n"
+ ".ur_s2: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d7, (%[addr]) \n"
+ "lsr.l #8, %%d7 \n"
+ ".ur_s1: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.l %%a0, %%d5 \n"
+ "move.b %%d5, (%[addr]) \n"
+ "lsr.l #8, %%d5 \n"
+ "move.l %%d5, %%a0 \n"
+
+ "add.l %%d0, %[addr] \n" /* correct address */
+ "subq.l #8, %[dpth] \n"
+ "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
+ "jgt .ur_sloop \n" /* next round if anything left */
+
+ ".ur_end: \n"
+ : /* outputs */
+ [addr]"+a"(addr),
+ [dpth]"+a"(depth),
+ [mask]"+a"(mask),
+ [ax] "=&a"(trash)
+ : /* inputs */
+ [psiz]"a"(_gray_info.plane_size),
+ [patp]"[ax]"(pat_ptr)
+ : /* clobbers */
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
);
}
#else /* C version, for reference*/
@@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd)
}
#endif /* HAVE_LCD_BITMAP */
-
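
The second file applies the same scheme to _writearray() in gray_draw.c. For reference, the new store loops consume the transposed words in rounds of up to eight bitplanes: when fewer than eight planes remain, the code jumps into the middle of an unrolled store "streak" via a jump table, and each round walks its planes backwards so the same streak also serves the final, shorter round without disturbing the bit order (see the comments in the hunks above). The following is a hedged C sketch of the "nothing to keep" case with illustrative names; the "bits to keep" variant additionally ANDs the old byte with the keep mask before OR-ing in the new bits.

#include <stdint.h>

/* Write 'depth' bitplane bytes for one 8-pixel block, taking one byte per
 * plane from the transposed patterns in pat[] (low byte first).  'addr'
 * points into the first bitplane, 'plane_size' is the distance between
 * bitplanes.  Sketch of the asm "short loop" only. */
static void write_block_sketch(unsigned char *addr, uint32_t pat[8],
                               int depth, long plane_size)
{
    while (depth > 0)
    {
        int n = (depth < 8) ? depth : 8;            /* planes this round   */
        unsigned char *p = addr + n * plane_size;   /* behind last plane   */
        int i;

        for (i = n; i > 0; i--)                     /* planes backwards    */
        {
            p -= plane_size;
            *p = (unsigned char)pat[i - 1];         /* store plane byte    */
            pat[i - 1] >>= 8;                       /* shift out used byte */
        }
        addr += 8 * plane_size;                     /* first plane of next round */
        depth -= 8;
    }
}

With 32-bit patterns, one pass covers up to 32 bitplanes (four rounds of eight).
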
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c
index 9406664ea2..dcc65bdd09 100644
--- a/apps/plugins/lib/gray_draw.c
+++ b/apps/plugins/lib/gray_draw.c
@@ -868,24 +868,24 @@ void gray_ub_clear_display(void)
/* Write a pixel block, defined by their brightnesses in a greymap.
Address is the byte in the first bitplane, src is the greymap start address,
- stride is the increment for the greymap to get to the next pixel, mask
- determines which pixels of the destination block are changed. */
+ mask determines which pixels of the destination block are changed. */
static void _writearray(unsigned char *address, const unsigned char *src,
unsigned mask)
{
unsigned long pat_stack[8];
unsigned long *pat_ptr = &pat_stack[8];
- unsigned char *addr, *end;
+ unsigned char *addr;
#ifdef CPU_ARM
const unsigned char *_src;
- unsigned _mask, trash;
+ unsigned _mask, depth, trash;
_mask = mask;
_src = src;
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
- asm volatile (
+ asm volatile
+ (
"mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */
"mov r3, #8 \n" /* loop count */
@@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src,
);
addr = address;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
_mask = mask;
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
-
- "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
+ asm volatile
+ (
+ "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
+
+ /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
+
+ "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
+ "eor r0, r1, r5, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
+ "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
+ "eor r0, r2, r6, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
+ "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
+ "eor r0, r3, r7, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
+ "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
+ "eor r0, r4, r8, lsl #4 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
+ "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
+
+ "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
+ "eor r0, r1, r3, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
+ "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
+ "eor r0, r2, r4, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
+ "eor r0, r5, r7, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
+ "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
+ "eor r0, r6, r8, lsl #2 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
+ "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
+
+ "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
+ "orr %[rx], %[rx], %[rx], lsl #8 \n"
+ "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
+ "eor r0, r1, r2, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
+ "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
+ "eor r0, r3, r4, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
+ "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
+ "eor r0, r5, r6, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
+ "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
+ "eor r0, r7, r8, lsl #1 \n"
+ "and r0, r0, %[rx] \n"
+ "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
+ "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
+
+ "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
"ands %[mask], %[mask], #0xff \n"
- "beq .wa_sloop \n" /* short loop if nothing to keep */
-
- ".wa_floop: \n" /** full loop (there are bits to keep)**/
- "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
- "adc r0, r0, r0 \n" /* put bit into LSB of byte */
- "movs r8, r8, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r7, r7, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r6, r6, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r5, r5, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r4, r4, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r3, r3, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r2, r2, lsr #1 \n"
- "adc r0, r0, r0 \n"
-
- "ldrb r1, [%[addr]] \n" /* read old value */
- "and r1, r1, %[mask] \n" /* mask out replaced bits */
- "orr r1, r1, r0 \n" /* set new bits */
- "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
-
- "cmp %[end], %[addr] \n" /* loop through all bitplanes */
- "bne .wa_floop \n"
-
+ "beq .wa_sloop \n" /* short loop if no bits to keep */
+
+ ".wa_floop: \n" /** full loop (bits to keep)**/
+ "cmp %[dpth], #8 \n" /* 8 planes or more left? */
+ "bhs .wa_f8 \n"
+
+ "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
+ "add %[addr], %[addr], r0 \n" /* for this round */
+
+
+ "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
+ "add pc, pc, r0 \n"
+ ".wa_ftable: \n"
+ ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */
+ ".byte .wa_f1 - .wa_ftable - 4 \n"
+ ".byte .wa_f2 - .wa_ftable - 4 \n"
+ ".byte .wa_f3 - .wa_ftable - 4 \n"
+ ".byte .wa_f4 - .wa_ftable - 4 \n"
+ ".byte .wa_f5 - .wa_ftable - 4 \n"
+ ".byte .wa_f6 - .wa_ftable - 4 \n"
+ ".byte .wa_f7 - .wa_ftable - 4 \n"
+
+ ".wa_f8: \n"
+ "add %[addr], %[addr], %[psiz], lsl #3 \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go thru the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
+ "and r0, r0, %[mask] \n" /* mask out replaced bits */
+ "orr r0, r0, r8 \n" /* set new bits */
+ "strb r0, [%[addr]] \n" /* store byte */
+ "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
+ ".wa_f7: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r7 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r7, r7, lsr #8 \n"
+ ".wa_f6: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r6 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r6, r6, lsr #8 \n"
+ ".wa_f5: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r5 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r5, r5, lsr #8 \n"
+ ".wa_f4: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r4 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r4, r4, lsr #8 \n"
+ ".wa_f3: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r3 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r3, r3, lsr #8 \n"
+ ".wa_f2: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r2 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r2, r2, lsr #8 \n"
+ ".wa_f1: \n"
+ "ldrb r0, [%[addr], -%[psiz]]! \n"
+ "and r0, r0, %[mask] \n"
+ "orr r0, r0, r1 \n"
+ "strb r0, [%[addr]] \n"
+ "mov r1, r1, lsr #8 \n"
+ ".wa_f0: \n"
+
+ "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
+ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
+ "bhi .wa_floop \n"
+
"b .wa_end \n"
- ".wa_sloop: \n" /** short loop (nothing to keep) **/
- "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
- "adc r0, r0, r0 \n" /* put bit into LSB of byte */
- "movs r8, r8, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r7, r7, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r6, r6, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r5, r5, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r4, r4, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r3, r3, lsr #1 \n"
- "adc r0, r0, r0 \n"
- "movs r2, r2, lsr #1 \n"
- "adc r0, r0, r0 \n"
-
- "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
+ ".wa_sloop: \n" /** short loop (nothing to keep) **/
+ "cmp %[dpth], #8 \n" /* 8 planes or more left? */
+ "bhs .wa_s8 \n"
- "cmp %[end], %[addr] \n" /* loop through all bitplanes */
- "bne .wa_sloop \n"
+ "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
+ "add %[addr], %[addr], r0 \n" /* for this round */
+ "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
+ "add pc, pc, r0 \n"
+ ".wa_stable: \n"
+ ".byte .wa_s0 - .wa_stable - 4 \n"
+ ".byte .wa_s1 - .wa_stable - 4 \n"
+ ".byte .wa_s2 - .wa_stable - 4 \n"
+ ".byte .wa_s3 - .wa_stable - 4 \n"
+ ".byte .wa_s4 - .wa_stable - 4 \n"
+ ".byte .wa_s5 - .wa_stable - 4 \n"
+ ".byte .wa_s6 - .wa_stable - 4 \n"
+ ".byte .wa_s7 - .wa_stable - 4 \n"
+
+ ".wa_s8: \n"
+ "add %[addr], %[addr], %[psiz], lsl #3 \n"
+ /* Point behind the last plane for this round. See above. */
+ "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
+ "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
+ ".wa_s7: \n"
+ "strb r7, [%[addr], -%[psiz]]! \n"
+ "mov r7, r7, lsr #8 \n"
+ ".wa_s6: \n"
+ "strb r6, [%[addr], -%[psiz]]! \n"
+ "mov r6, r6, lsr #8 \n"
+ ".wa_s5: \n"
+ "strb r5, [%[addr], -%[psiz]]! \n"
+ "mov r5, r5, lsr #8 \n"
+ ".wa_s4: \n"
+ "strb r4, [%[addr], -%[psiz]]! \n"
+ "mov r4, r4, lsr #8 \n"
+ ".wa_s3: \n"
+ "strb r3, [%[addr], -%[psiz]]! \n"
+ "mov r3, r3, lsr #8 \n"
+ ".wa_s2: \n"
+ "strb r2, [%[addr], -%[psiz]]! \n"
+ "mov r2, r2, lsr #8 \n"
+ ".wa_s1: \n"
+ "strb r1, [%[addr], -%[psiz]]! \n"
+ "mov r1, r1, lsr #8 \n"
+ ".wa_s0: \n"
+
+ "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
+ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
+ "bhi .wa_sloop \n"
+
".wa_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(_mask),
+ [dpth]"+r"(depth),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
- [end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
);
#else /* C version, for reference*/
#warning C version of _writearray() used
+ unsigned char *end;
unsigned test = 0x80;
int i;
@@ -1143,67 +1288,70 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y,
stride is the increment for the greymap to get to the next pixel, mask
determines which pixels of the destination block are changed. */
static void _writearray(unsigned char *address, const unsigned char *src,
+ int stride, unsigned mask) __attribute__((noinline));
+static void _writearray(unsigned char *address, const unsigned char *src,
int stride, unsigned mask)
{
unsigned long pat_stack[8];
unsigned long *pat_ptr = &pat_stack[8];
- unsigned char *addr, *end;
+ unsigned char *addr;
#if CONFIG_CPU == SH7034
const unsigned char *_src;
- unsigned _mask, trash;
+ unsigned _mask, depth, trash;
_mask = mask;
_src = src;
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
- asm volatile (
- "mov #8,r3 \n" /* loop count */
-
- ".wa_loop: \n" /** load pattern for pixel **/
- "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */
- "shlr %[mask] \n" /* shift out lsb of mask */
- "bf .wa_skip \n" /* skip this pixel */
-
- "mov.b @%[src],r0 \n" /* load src byte */
- "extu.b r0,r0 \n" /* extend unsigned */
- "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */
- "extu.b r0,r0 \n" /* extend unsigned */
- "shll2 r0 \n"
- "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */
-
- "mov #75,r0 \n"
- "mulu r0,%[rnd] \n" /* multiply by 75 */
- "sts macl,%[rnd] \n"
- "add #74,%[rnd] \n" /* add another 74 */
+ asm volatile
+ (
+ "mov #8, r3 \n" /* loop count */
+
+ ".wa_loop: \n" /** load pattern for pixel **/
+ "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */
+ "shlr %[mask] \n" /* shift out lsb of mask */
+ "bf .wa_skip \n" /* skip this pixel */
+
+ "mov.b @%[src], r0 \n" /* load src byte */
+ "extu.b r0, r0 \n" /* extend unsigned */
+ "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */
+ "extu.b r0, r0 \n" /* extend unsigned */
+ "shll2 r0 \n"
+ "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
+
+ "mov #75, r0 \n"
+ "mulu r0, %[rnd] \n" /* multiply by 75 */
+ "sts macl, %[rnd] \n"
+ "add #74, %[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */
- "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */
- "and %[rmsk],r1 \n" /* mask out unneeded bits */
+ "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
+ "and %[rmsk], r1 \n" /* mask out unneeded bits */
- "cmp/hs %[dpth],r1 \n" /* random >= depth ? */
- "bf .wa_ntrim \n"
- "sub %[dpth],r1 \n" /* yes: random -= depth; */
- ".wa_ntrim: \n"
+ "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
+ "bf .wa_ntrim \n"
+ "sub %[dpth], r1 \n" /* yes: random -= depth; */
+ ".wa_ntrim: \n"
- "mov.l .ashlsi3,r0 \n" /** rotate pattern **/
- "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
- "mov r1,r5 \n"
+ "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
+ "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
+ "mov r1, r5 \n"
- "mov %[dpth],r5 \n"
- "sub r1,r5 \n" /* r5 = depth - r1 */
- "mov.l .lshrsi3,r1 \n"
- "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
- "mov r0,r1 \n" /* store previous result in r1 */
+ "mov %[dpth], r5 \n"
+ "sub r1, r5 \n" /* r5 = depth - r1 */
+ "mov.l .lshrsi3, r1 \n"
+ "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
+ "mov r0, r1 \n" /* store previous result in r1 */
- "or r1,r0 \n" /* rotated_pattern = r0 | r1 */
+ "or r1, r0 \n" /* rotated_pattern = r0 | r1 */
- ".wa_skip: \n"
- "mov.l r0,@-%[patp] \n" /* push on pattern stack */
+ ".wa_skip: \n"
+ "mov.l r0, @-%[patp] \n" /* push on pattern stack */
- "add %[stri],%[src] \n" /* src += stride; */
- "add #-1,r3 \n" /* loop 8 times (pixel block) */
- "cmp/pl r3 \n"
- "bt .wa_loop \n"
+ "add %[stri], %[src] \n" /* src += stride; */
+ "add #-1, r3 \n" /* loop 8 times (pixel block) */
+ "cmp/pl r3 \n"
+ "bt .wa_loop \n"
: /* outputs */
[src] "+r"(_src),
[rnd] "+r"(_gray_random_buffer),
@@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src,
);
addr = address;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
_mask = mask;
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
- "mov.l @%[patp]+,r2 \n"
- "mov.l @%[patp]+,r3 \n"
- "mov.l @%[patp]+,r6 \n"
- "mov.l @%[patp]+,r7 \n"
- "mov.l @%[patp]+,r8 \n"
- "mov.l @%[patp]+,r9 \n"
- "mov.l @%[patp],r10 \n"
-
- "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */
- "extu.b %[mask],%[mask] \n" /* mask out high bits */
- "tst %[mask],%[mask] \n"
- "bt .wa_sloop \n" /* short loop if nothing to keep */
-
- ".wa_floop: \n" /** full loop (there are bits to keep)**/
- "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
- "rotcl r0 \n" /* rotate t bit into r0 */
- "shlr r2 \n"
- "rotcl r0 \n"
- "shlr r3 \n"
- "rotcl r0 \n"
- "shlr r6 \n"
- "rotcl r0 \n"
- "shlr r7 \n"
- "rotcl r0 \n"
- "shlr r8 \n"
- "rotcl r0 \n"
- "shlr r9 \n"
- "rotcl r0 \n"
- "shlr r10 \n"
- "mov.b @%[addr],%[rx] \n" /* read old value */
- "rotcl r0 \n"
- "and %[mask],%[rx] \n" /* mask out replaced bits */
- "or %[rx],r0 \n" /* set new bits */
- "mov.b r0,@%[addr] \n" /* store value to bitplane */
- "add %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
- "bt .wa_floop \n"
-
- "bra .wa_end \n"
- "nop \n"
+ asm volatile
+ (
+ "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
+ "mov.l @%[patp]+, r7 \n"
+ "mov.l @%[patp]+, r6 \n"
+ "mov.l @%[patp]+, r5 \n"
+ "mov.l @%[patp]+, r4 \n"
+ "mov.l @%[patp]+, r3 \n"
+ "mov.l @%[patp]+, r2 \n"
+ "mov.l @%[patp], r1 \n"
+
+ /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
+
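+ /* On entry, each of r1..r8 holds the complete pattern of one pixel (one
+  * bit per bitplane). The three "comb" stages below transpose these four
+  * 8x8 bit blocks, so that afterwards the low byte of r1 holds bitplane 0
+  * of all 8 pixels, r2 bitplane 1, ..., r8 bitplane 7, with bitplanes
+  * 8..15, 16..23 and 24..31 in the higher bytes. The store loops below
+  * can then write whole bytes per plane, shifting the used-up byte out
+  * after each round of 8 planes. */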
+ "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */
+ "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
+ "mov r6, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r2, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
+ "mov r7, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r3, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
+ "mov r8, r0 \n"
+ "shll2 r0 \n"
+ "shll2 r0 \n"
+ "xor r4, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
+ "shlr2 r0 \n"
+ "shlr2 r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
+
+ "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */
+ "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
+ "shll2 r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
+ "shlr2 r0 \n"
+ "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
+ "mov r4, r0 \n"
+ "shll2 r0 \n"
+ "xor r2, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "shlr2 r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
+ "mov r7, r0 \n"
+ "shll2 r0 \n"
+ "xor r5, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
+ "shlr2 r0 \n"
+ "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
+ "mov r8, r0 \n"
+ "shll2 r0 \n"
+ "xor r6, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
+ "shlr2 r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
+
+ "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */
+ "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
+ "shll r0 \n"
+ "xor r1, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
+ "shlr r0 \n"
+ "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
+ "mov r4, r0 \n"
+ "shll r0 \n"
+ "xor r3, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
+ "shlr r0 \n"
+ "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
+ "mov r6, r0 \n"
+ "shll r0 \n"
+ "xor r5, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
+ "shlr r0 \n"
+ "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
+ "mov r8, r0 \n"
+ "shll r0 \n"
+ "xor r7, r0 \n"
+ "and %[rx], r0 \n"
+ "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
+ "shlr r0 \n"
+ "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
+
+ "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
+ "extu.b %[mask], %[mask] \n" /* mask out high bits */
+ "tst %[mask], %[mask] \n"
+ "bt .wa_sloop \n" /* short loop if nothing to keep */
+
+ ".wa_floop: \n" /** full loop (there are bits to keep)**/
+ "mov #8, r0 \n"
+ "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
+ "bt .wa_f8 \n"
+
+ "mulu %[psiz], %[dpth] \n"
+ "mova .wa_ftable, r0 \n"
+ "mov.b @(r0, %[dpth]), %[rx] \n"
+ "add %[rx], r0 \n"
+ "sts macl, %[rx] \n" /* point behind the last plane.. */
+ "jmp @r0 \n" /* jump into streak */
+ "add %[rx], %[addr] \n" /* ..for this round */
+
+ ".align 2 \n"
+ ".wa_ftable: \n"
+ ".byte .wa_f0 - .wa_ftable \n"
+ ".byte .wa_f1 - .wa_ftable \n"
+ ".byte .wa_f2 - .wa_ftable \n"
+ ".byte .wa_f3 - .wa_ftable \n"
+ ".byte .wa_f4 - .wa_ftable \n"
+ ".byte .wa_f5 - .wa_ftable \n"
+ ".byte .wa_f6 - .wa_ftable \n"
+ ".byte .wa_f7 - .wa_ftable \n"
+
+ ".wa_f8: \n"
+ "mov %[psiz], %[rx] \n"
+ "shll2 %[rx] \n"
+ "add %[rx], %[rx] \n"
+ "add %[rx], %[addr] \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go through the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n" /* load old byte */
+ "and %[mask], r0 \n" /* mask out replaced bits */
+ "or r8, r0 \n" /* set new bits */
+ "mov.b r0, @%[addr] \n" /* store byte */
+ "shlr8 r8 \n" /* shift out used-up byte */
+ ".wa_f7: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r7, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r7 \n"
+ ".wa_f6: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r6, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r6 \n"
+ ".wa_f5: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r5, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r5 \n"
+ ".wa_f4: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r4, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r4 \n"
+ ".wa_f3: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r3, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r3 \n"
+ ".wa_f2: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r2, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r2 \n"
+ ".wa_f1: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b @%[addr], r0 \n"
+ "and %[mask], r0 \n"
+ "or r1, r0 \n"
+ "mov.b r0, @%[addr] \n"
+ "shlr8 r1 \n"
+ ".wa_f0: \n"
+
+ "add %[rx], %[addr] \n" /* correct address */
+ "add #-8, %[dpth] \n"
+ "cmp/pl %[dpth] \n" /* next round if anything left */
+ "bt .wa_floop \n"
+
+ "bra .wa_end \n"
+ "nop \n"
/* References to C library routines used in the precalc block */
- ".align 2 \n"
- ".ashlsi3: \n" /* C library routine: */
- ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
- ".lshrsi3: \n" /* C library routine: */
- ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
+ ".align 2 \n"
+ ".ashlsi3: \n" /* C library routine: */
+ ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
+ ".lshrsi3: \n" /* C library routine: */
+ ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
/* both routines preserve r4, destroy r5 and take ~16 cycles */
- ".wa_sloop: \n" /** short loop (nothing to keep) **/
- "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
- "rotcl r0 \n" /* rotate t bit into r0 */
- "shlr r2 \n"
- "rotcl r0 \n"
- "shlr r3 \n"
- "rotcl r0 \n"
- "shlr r6 \n"
- "rotcl r0 \n"
- "shlr r7 \n"
- "rotcl r0 \n"
- "shlr r8 \n"
- "rotcl r0 \n"
- "shlr r9 \n"
- "rotcl r0 \n"
- "shlr r10 \n"
- "rotcl r0 \n"
- "mov.b r0,@%[addr] \n" /* store byte to bitplane */
- "add %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
- "bt .wa_sloop \n"
-
- ".wa_end: \n"
+ /* Bitmasks for the bit block rotation */
+ ".wa_mask4: \n"
+ ".long 0xF0F0F0F0 \n"
+ ".wa_mask2: \n"
+ ".long 0xCCCCCCCC \n"
+ ".wa_mask1: \n"
+ ".long 0xAAAAAAAA \n"
+
+ ".wa_sloop: \n" /** short loop (nothing to keep) **/
+ "mov #8, r0 \n"
+ "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
+ "bt .wa_s8 \n"
+
+ "mulu %[psiz], %[dpth] \n"
+ "mova .wa_stable, r0 \n"
+ "mov.b @(r0, %[dpth]), %[rx] \n"
+ "add %[rx], r0 \n"
+ "sts macl, %[rx] \n" /* point behind the last plane.. */
+ "jmp @r0 \n" /* jump into streak */
+ "add %[rx], %[addr] \n" /* ..for this round */
+
+ ".align 2 \n"
+ ".wa_stable: \n"
+ ".byte .wa_s0 - .wa_stable \n"
+ ".byte .wa_s1 - .wa_stable \n"
+ ".byte .wa_s2 - .wa_stable \n"
+ ".byte .wa_s3 - .wa_stable \n"
+ ".byte .wa_s4 - .wa_stable \n"
+ ".byte .wa_s5 - .wa_stable \n"
+ ".byte .wa_s6 - .wa_stable \n"
+ ".byte .wa_s7 - .wa_stable \n"
+
+ ".wa_s8: \n"
+ "mov %[psiz], %[rx] \n" /* Point behind the last plane */
+ "shll2 %[rx] \n" /* for this round. */
+ "add %[rx], %[rx] \n" /* See above. */
+ "add %[rx], %[addr] \n"
+
+ "sub %[psiz], %[addr] \n"
+ "mov.b r8, @%[addr] \n" /* store byte */
+ "shlr8 r8 \n" /* shift out used-up byte */
+ ".wa_s7: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r7, @%[addr] \n"
+ "shlr8 r7 \n"
+ ".wa_s6: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r6, @%[addr] \n"
+ "shlr8 r6 \n"
+ ".wa_s5: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r5, @%[addr] \n"
+ "shlr8 r5 \n"
+ ".wa_s4: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r4, @%[addr] \n"
+ "shlr8 r4 \n"
+ ".wa_s3: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r3, @%[addr] \n"
+ "shlr8 r3 \n"
+ ".wa_s2: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r2, @%[addr] \n"
+ "shlr8 r2 \n"
+ ".wa_s1: \n"
+ "sub %[psiz], %[addr] \n"
+ "mov.b r1, @%[addr] \n"
+ "shlr8 r1 \n"
+ ".wa_s0: \n"
+
+ "add %[rx], %[addr] \n" /* correct address */
+ "add #-8, %[dpth] \n"
+ "cmp/pl %[dpth] \n" /* next round if anything left */
+ "bt .wa_sloop \n"
+
+ ".wa_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(_mask),
+ [dpth]"+r"(depth),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
- [end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
- "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10"
+ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
);
#elif defined(CPU_COLDFIRE)
const unsigned char *_src;
- unsigned _mask, trash;
+ unsigned _mask, depth, trash;
_mask = mask;
_src = src;
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
- asm volatile (
- "moveq.l #8,%%d3 \n" /* loop count */
-
- ".wa_loop: \n" /** load pattern for pixel **/
- "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
- "lsr.l #1,%[mask] \n" /* shift out lsb of mask */
- "bcc.b .wa_skip \n" /* skip this pixel */
-
- "clr.l %%d0 \n"
- "move.b (%[src]),%%d0 \n" /* load src byte */
- "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */
- "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */
-
- "mulu.w #75,%[rnd] \n" /* multiply by 75 */
- "add.l #74,%[rnd] \n" /* add another 74 */
+ asm volatile
+ (
+ "moveq.l #8, %%d3 \n" /* loop count */
+
+ ".wa_loop: \n" /** load pattern for pixel **/
+ "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
+ "lsr.l #1, %[mask] \n" /* shift out lsb of mask */
+ "bcc.b .wa_skip \n" /* skip this pixel */
+
+ "clr.l %%d0 \n"
+ "move.b (%[src]), %%d0 \n" /* load src byte */
+ "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */
+ "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
+
+ "mulu.w #75, %[rnd] \n" /* multiply by 75 */
+ "add.l #74, %[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */
- "move.l %[rnd],%%d1 \n"
- "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
- "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
-
- "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
- "blo.b .wa_ntrim \n"
- "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
- ".wa_ntrim: \n"
-
- "move.l %%d2,%%d0 \n" /** rotate pattern **/
- "lsl.l %%d1,%%d0 \n"
- "sub.l %[dpth],%%d1 \n"
- "neg.l %%d1 \n" /* d1 = depth - d1 */
- "lsr.l %%d1,%%d2 \n"
- "or.l %%d0,%%d2 \n"
-
- ".wa_skip: \n"
- "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
-
- "add.l %[stri],%[src] \n" /* src += stride; */
- "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
- "bne.b .wa_loop \n"
+ "move.l %[rnd], %%d1 \n"
+ "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
+ "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
+
+ "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
+ "blo.b .wa_ntrim \n"
+ "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
+ ".wa_ntrim: \n"
+
+ "move.l %%d2, %%d0 \n" /** rotate pattern **/
+ "lsl.l %%d1, %%d0 \n"
+ "sub.l %[dpth], %%d1 \n"
+ "neg.l %%d1 \n" /* d1 = depth - d1 */
+ "lsr.l %%d1, %%d2 \n"
+ "or.l %%d0, %%d2 \n"
+
+ ".wa_skip: \n"
+ "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
+
+ "add.l %[stri], %[src] \n" /* src += stride; */
+ "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
+ "bne.b .wa_loop \n"
: /* outputs */
[src] "+a"(_src),
[patp]"+a"(pat_ptr),
@@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src,
);
addr = address;
- end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
- _mask = mask;
+ _mask = ~mask & 0xff;
+ depth = _gray_info.depth;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
- asm volatile (
- "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
- /* pop all 8 patterns */
- "not.l %[mask] \n" /* "set" mask -> "keep" mask */
- "and.l #0xFF,%[mask] \n"
- "beq.b .wa_sstart \n" /* short loop if nothing to keep */
-
- ".wa_floop: \n" /** full loop (there are bits to keep)**/
- "lsr.l #1,%%d2 \n" /* shift out pattern bit */
- "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
- "lsr.l #1,%%d3 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d4 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d5 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d6 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%a0,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a0 \n"
- "move.l %%a1,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a1 \n"
- "move.l %[ax],%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%[ax] \n"
-
- "move.b (%[addr]),%%d1 \n" /* read old value */
- "and.l %[mask],%%d1 \n" /* mask out replaced bits */
- "or.l %%d0,%%d1 \n" /* set new bits */
- "move.b %%d1,(%[addr]) \n" /* store value to bitplane */
-
- "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
- "bhi.b .wa_floop \n"
-
- "bra.b .wa_end \n"
-
- ".wa_sstart: \n"
- "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
-
- ".wa_sloop: \n" /** short loop (nothing to keep) **/
- "lsr.l #1,%%d2 \n" /* shift out pattern bit */
- "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
- "lsr.l #1,%%d3 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d4 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d5 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%%d6 \n"
- "addx.l %%d0,%%d0 \n"
- "lsr.l #1,%[mask] \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%a1,%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%%a1 \n"
- "move.l %[ax],%%d1 \n"
- "lsr.l #1,%%d1 \n"
- "addx.l %%d0,%%d0 \n"
- "move.l %%d1,%[ax] \n"
-
- "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
- "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
- "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
- "bhi.b .wa_sloop \n"
-
- ".wa_end: \n"
+ asm volatile
+ (
+ "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
+ /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
+
+ /** Rotate the four 8x8 bit "blocks" within d1..d7/a0 **/
+
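+ /* Same three-stage transpose as in the SH version above: on exit the
+  * low byte of each pattern register holds one bitplane of all 8 pixels,
+  * with planes 8 apart in the higher bytes. d5 doubles as scratch, so
+  * values logically belonging to a0 / [ax] are parked in d5 between
+  * stages (see the commented-out moves). */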
+ "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
+ "lsl.l #4, %%d0 \n"
+ /* move.l %[ax], %%d5 */ /* already in d5 */
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
+ "move.l %%d2, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "eor.l %%d6, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
+ "move.l %%d3, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "eor.l %%d7, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */
+ "move.l %%d4, %%d0 \n"
+ "lsl.l #4, %%d0 \n"
+ "move.l %%a0, %%d5 \n"
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xF0F0F0F0, %%d0 \n"
+ "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
+ /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
+ "lsr.l #4, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
+
+ "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
+ "lsl.l #2, %%d0 \n"
+ /* move.l %%a0, %%d5 */ /* still in d5 */
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
+ "move.l %[ax], %%d5 \n"
+ "move.l %%d5, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d7, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
+ /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
+ "move.l %%d2, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d4, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
+ "move.l %%d1, %%d0 \n"
+ "lsl.l #2, %%d0 \n"
+ "eor.l %%d3, %%d0 \n"
+ "and.l #0xCCCCCCCC, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
+ "lsr.l #2, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
+
+ "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d2, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
+ "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
+ "move.l %%d3, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d4, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
+ /* move.l %[ax], %%d5 */ /* still in d5 */
+ "move.l %%d5, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "eor.l %%d6, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
+ "move.l %%d7, %%d0 \n"
+ "lsl.l #1, %%d0 \n"
+ "move.l %%a0, %%d5 \n"
+ "eor.l %%d5, %%d0 \n"
+ "and.l #0xAAAAAAAA, %%d0 \n"
+ "eor.l %%d0, %%d5 \n"
+ "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
+ "lsr.l #1, %%d0 \n"
+ "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
+
+ "tst.l %[mask] \n"
+ "jeq .wa_sloop \n" /* short loop if nothing to keep */
+
+ "move.l %[mask], %%d5 \n" /* need mask in data reg. */
+ "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
+
+ ".wa_floop: \n" /** full loop (there are bits to keep)**/
+ "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
+ "bhs.s .wa_f8 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "move.l %[dpth], %%d1 \n"
+ "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
+ "add.l %%d0, %[addr] \n" /* for this round */
+ "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
+ "bra.s .wa_f1 \n" /* dpth == 0 should never happen */
+ "bra.s .wa_f2 \n"
+ "bra.s .wa_f3 \n"
+ "bra.s .wa_f4 \n"
+ "bra.s .wa_f5 \n"
+ "bra.s .wa_f6 \n"
+ "bra.s .wa_f7 \n"
+
+ ".wa_f8: \n"
+ "move.l %[psiz], %%d0 \n"
+ "lsl.l #3, %%d0 \n"
+ "add.l %%d0, %[addr] \n"
+ /* Point behind the last plane for this round. Note: We're using the
+ * registers backwards in order to reuse the streak for the last round.
+ * Therefore we need to go through the bitplanes backwards too, otherwise
+ * the bit order would be destroyed which results in more flicker. */
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n" /* load old byte */
+ "and.l %%d5, %%d0 \n" /* mask out replaced bits */
+ "move.l %[mask], %%d1 \n"
+ "or.l %%d1, %%d0 \n" /* set new bits */
+ "move.b %%d0, (%[addr]) \n" /* store byte */
+ "lsr.l #8, %%d1 \n" /* shift out used-up byte */
+ "move.l %%d1, %[mask] \n"
+ ".wa_f7: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d2, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d2 \n"
+ ".wa_f6: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d3, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d3 \n"
+ ".wa_f5: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d4, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d4 \n"
+ ".wa_f4: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "move.l %[ax], %%d1 \n"
+ "or.l %%d1, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d1 \n"
+ "move.l %%d1, %[ax] \n"
+ ".wa_f3: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d6, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d6 \n"
+ ".wa_f2: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "or.l %%d7, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d7 \n"
+ ".wa_f1: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b (%[addr]), %%d0 \n"
+ "and.l %%d5, %%d0 \n"
+ "move.l %%a0, %%d1 \n"
+ "or.l %%d1, %%d0 \n"
+ "move.b %%d0, (%[addr]) \n"
+ "lsr.l #8, %%d1 \n"
+ "move.l %%d1, %%a0 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "lsl.l #3, %%d0 \n"
+ "add.l %%d0, %[addr] \n" /* correct address */
+ "subq.l #8, %[dpth] \n"
+ "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
+ "jgt .wa_floop \n" /* next round if anything left */
+
+ "jra .wa_end \n"
+
+ ".wa_sloop: \n" /** short loop (nothing to keep) **/
+ "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
+ "bhs.s .wa_s8 \n"
+
+ "move.l %[psiz], %%d0 \n"
+ "move.l %[dpth], %%d5 \n"
+ "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
+ "add.l %%d0, %[addr] \n" /* for this round */
+ "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
+ "bra.s .wa_s1 \n" /* dpth == 0 should never happen */
+ "bra.s .wa_s2 \n"
+ "bra.s .wa_s3 \n"
+ "bra.s .wa_s4 \n"
+ "bra.s .wa_s5 \n"
+ "bra.s .wa_s6 \n"
+ "bra.s .wa_s7 \n"
+
+ ".wa_s8: \n"
+ "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
+ "lsl.l #3, %%d0 \n" /* for this round. */
+ "add.l %%d0, %[addr] \n" /* See above. */
+
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d1, (%[addr]) \n" /* store byte */
+ "lsr.l #8, %%d1 \n" /* shift out used-up byte */
+ ".wa_s7: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d2, (%[addr]) \n"
+ "lsr.l #8, %%d2 \n"
+ ".wa_s6: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d3, (%[addr]) \n"
+ "lsr.l #8, %%d3 \n"
+ ".wa_s5: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d4, (%[addr]) \n"
+ "lsr.l #8, %%d4 \n"
+ ".wa_s4: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.l %[ax], %%d5 \n"
+ "move.b %%d5, (%[addr]) \n"
+ "lsr.l #8, %%d5 \n"
+ "move.l %%d5, %[ax] \n"
+ ".wa_s3: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d6, (%[addr]) \n"
+ "lsr.l #8, %%d6 \n"
+ ".wa_s2: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.b %%d7, (%[addr]) \n"
+ "lsr.l #8, %%d7 \n"
+ ".wa_s1: \n"
+ "sub.l %[psiz], %[addr] \n"
+ "move.l %%a0, %%d5 \n"
+ "move.b %%d5, (%[addr]) \n"
+ "lsr.l #8, %%d5 \n"
+ "move.l %%d5, %%a0 \n"
+
+ "add.l %%d0, %[addr] \n" /* correct address */
+ "subq.l #8, %[dpth] \n"
+ "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
+ "jgt .wa_sloop \n" /* next round if anything left */
+
+ ".wa_end: \n"
: /* outputs */
[addr]"+a"(addr),
- [mask]"+d"(_mask),
+ [dpth]"+a"(depth),
+ [mask]"+a"(_mask),
[ax] "=&a"(trash)
: /* inputs */
[psiz]"a"(_gray_info.plane_size),
- [end] "a"(end),
[patp]"[ax]"(pat_ptr)
: /* clobbers */
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1"
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
);
#else /* C version, for reference*/
#warning C version of _writearray() used
+ unsigned char *end;
unsigned test = 1;
int i;