From 989021ed3cca4a76a14062bb2b64109cf77027b6 Mon Sep 17 00:00:00 2001
From: Jens Arnold
Date: Wed, 15 Jul 2009 22:14:21 +0000
Subject: Apply some ARMv6 optimisations to YUV blitting. Speeds up mpegplayer on Gigabeat S by ~2% in undithered and ~7.5% in dithered mode.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21889 a1c6a512-1295-4272-9138-f99709370657
---
 firmware/target/arm/lcd-as-memframe.S | 68 ++++++++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

(limited to 'firmware/target/arm')

diff --git a/firmware/target/arm/lcd-as-memframe.S b/firmware/target/arm/lcd-as-memframe.S
index 2d0aff937f..a6f9145fab 100644
--- a/firmware/target/arm/lcd-as-memframe.S
+++ b/firmware/target/arm/lcd-as-memframe.S
@@ -170,6 +170,11 @@ lcd_write_yuv420_lines:
     add     lr, r9, r7, asr #8          @ lr = r = (Y >> 9) + rv
     add     r7, r10, r7, asr #7         @ r7 = g = (Y >> 8) + guv
                                         @
+#if ARM_ARCH >= 6
+    usat    r1, #5, r1                  @ clamp b
+    usat    lr, #5, lr                  @ clamp r
+    usat    r7, #6, r7                  @ clamp g
+#else
     orr     r12, r1, lr                 @ check if clamping is needed...
     orr     r12, r12, r7, asr #1        @ ...at all
     cmp     r12, #31                    @
@@ -184,6 +189,7 @@ lcd_write_yuv420_lines:
     mvnhi   r7, r7, asr #31             @
     andhi   r7, r7, #63                 @
 15: @ no clamp                          @
+#endif
                                         @
     ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
                                         @
@@ -206,6 +212,11 @@ lcd_write_yuv420_lines:
     add     lr, r9, r7, asr #8          @ lr = r = (Y >> 9) + rv
     add     r7, r10, r7, asr #7         @ r7 = g = (Y >> 8) + guv
                                         @
+#if ARM_ARCH >= 6
+    usat    r1, #5, r1                  @ clamp b
+    usat    lr, #5, lr                  @ clamp r
+    usat    r7, #6, r7                  @ clamp g
+#else
     orr     r12, r1, lr                 @ check if clamping is needed...
     orr     r12, r12, r7, asr #1        @ ...at all
     cmp     r12, #31                    @
@@ -220,6 +231,7 @@ lcd_write_yuv420_lines:
     mvnhi   r7, r7, asr #31             @
     andhi   r7, r7, #63                 @
 15: @ no clamp                          @
+#endif
                                         @
     ldrb    r12, [r4], #1               @ r12 = Y' = *(Y'_p++)
                                         @
@@ -245,6 +257,11 @@ lcd_write_yuv420_lines:
     add     lr, r9, r7, asr #8          @ lr = r = (Y >> 9) + rv
     add     r7, r10, r7, asr #7         @ r7 = g = (Y >> 8) + guv
                                         @
+#if ARM_ARCH >= 6
+    usat    r1, #5, r1                  @ clamp b
+    usat    lr, #5, lr                  @ clamp r
+    usat    r7, #6, r7                  @ clamp g
+#else
     orr     r12, r1, lr                 @ check if clamping is needed...
     orr     r12, r12, r7, asr #1        @ ...at all
     cmp     r12, #31                    @
@@ -259,6 +276,7 @@ lcd_write_yuv420_lines:
     mvnhi   r7, r7, asr #31             @
     andhi   r7, r7, #63                 @
 15: @ no clamp                          @
+#endif
                                         @
     ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
                                         @
@@ -281,6 +299,11 @@ lcd_write_yuv420_lines:
     add     lr, r9, r7, asr #8          @ lr = r = (Y >> 9) + rv
     add     r7, r10, r7, asr #7         @ r7 = g = (Y >> 8) + guv
                                         @
+#if ARM_ARCH >= 6
+    usat    r1, #5, r1                  @ clamp b
+    usat    lr, #5, lr                  @ clamp r
+    usat    r7, #6, r7                  @ clamp g
+#else
     orr     r12, r1, lr                 @ check if clamping is needed...
     orr     r12, r12, r7, asr #1        @ ...at all
     cmp     r12, #31                    @
@@ -295,6 +318,7 @@ lcd_write_yuv420_lines:
     mvnhi   r7, r7, asr #31             @
     andhi   r7, r7, #63                 @
 15: @ no clamp                          @
+#endif
                                         @
     orr     r12, r1, lr, lsl #11        @ r12 = b | (r << 11)
     orr     r12, r12, r7, lsl #5        @ r12 |= (g << 5)
@@ -425,6 +449,16 @@ lcd_write_yuv420_lines_odither:
     add     r11, r11, r12, lsl #1       @ r = r11 + delta*2
     add     r7, r7, r12, lsr #1         @ g = r7 + delta/2
                                         @
+#if ARM_ARCH >= 6
+    usat    r11, #5, r11, asr #11       @ clamp r
+    usat    r7, #6, r7, asr #9          @ clamp g
+    usat    r1, #5, r1, asr #10         @ clamp b
+                                        @
+    ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
+                                        @
+    orr     r1, r1, r11, lsl #11        @ r1 = b | (r << 11)
+    orr     r1, r1, r7, lsl #5          @ r1 |= (g << 5)
+#else
     orr     r12, r1, r11, asr #1        @ check if clamping is needed...
     orr     r12, r12, r7                @ ...at all
     movs    r12, r12, asr #15           @
@@ -444,6 +478,7 @@ lcd_write_yuv420_lines_odither:
     and     r7, r7, #0x7e00             @ r1 = pixel = (r & 0xf800) |
     orr     r11, r11, r7, lsr #4        @ ((g & 0x7e00) >> 4) |
     orr     r1, r11, r1, lsr #10        @ (b >> 10)
+#endif
                                         @
 #if LCD_WIDTH >= LCD_HEIGHT
     strh    r1, [r0]                    @
@@ -477,6 +512,16 @@ lcd_write_yuv420_lines_odither:
     add     r11, r11, r12, lsl #1       @ r = r11 + delta*2
     add     r7, r7, r12, lsr #1         @ g = r7 + delta/2
                                         @
+#if ARM_ARCH >= 6
+    usat    r11, #5, r11, asr #11       @ clamp r
+    usat    r7, #6, r7, asr #9          @ clamp g
+    usat    r1, #5, r1, asr #10         @ clamp b
+                                        @
+    ldrb    r12, [r4], #1               @ r12 = Y' = *(Y'_p++)
+                                        @
+    orr     r1, r1, r11, lsl #11        @ r1 = b | (r << 11)
+    orr     r1, r1, r7, lsl #5          @ r1 |= (g << 5)
+#else
     orr     r12, r1, r11, asr #1        @ check if clamping is needed...
     orr     r12, r12, r7                @ ...at all
     movs    r12, r12, asr #15           @
@@ -496,6 +541,7 @@ lcd_write_yuv420_lines_odither:
     and     r7, r7, #0x7e00             @ r1 = pixel = (r & 0xf800) |
     orr     r11, r11, r7, lsr #4        @ ((g & 0x7e00) >> 4) |
     orr     r1, r11, r1, lsr #10        @ (b >> 10)
+#endif
                                         @
 #if LCD_WIDTH >= LCD_HEIGHT
     add     r0, r0, #2*LCD_WIDTH        @
@@ -534,6 +580,16 @@ lcd_write_yuv420_lines_odither:
     add     r11, r11, r12, lsl #1       @ r = r11 + delta*2
     add     r7, r7, r12, lsr #1         @ g = r7 + delta/2
                                         @
+#if ARM_ARCH >= 6
+    usat    r11, #5, r11, asr #11       @ clamp r
+    usat    r7, #6, r7, asr #9          @ clamp g
+    usat    r1, #5, r1, asr #10         @ clamp b
+                                        @
+    ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
+                                        @
+    orr     r1, r1, r11, lsl #11        @ r1 = b | (r << 11)
+    orr     r1, r1, r7, lsl #5          @ r1 |= (g << 5)
+#else
     orr     r12, r1, r11, asr #1        @ check if clamping is needed...
     orr     r12, r12, r7                @ ...at all
     movs    r12, r12, asr #15           @
@@ -547,12 +603,13 @@ lcd_write_yuv420_lines_odither:
     mvnne   r7, r12, lsr #15            @
 15: @ no clamp                          @
                                         @
-    ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
+    ldrb    r12, [r4, r3]               @ r12 = Y' = *(Y'_p + stride)
                                         @
     and     r11, r11, #0xf800           @ pack pixel
     and     r7, r7, #0x7e00             @ r1 = pixel = (r & 0xf800) |
     orr     r11, r11, r7, lsr #4        @ ((g & 0x7e00) >> 4) |
     orr     r1, r11, r1, lsr #10        @ (b >> 10)
+#endif
                                         @
 #if LCD_WIDTH >= LCD_HEIGHT
     strh    r1, [r0, #2]
@@ -586,6 +643,14 @@ lcd_write_yuv420_lines_odither:
     add     r11, r11, r14, lsl #1       @ r = r11 + delta*2
     add     r7, r7, r14, lsr #1         @ g = r7 + delta/2
                                         @
+#if ARM_ARCH >= 6
+    usat    r11, #5, r11, asr #11       @ clamp r
+    usat    r7, #6, r7, asr #9          @ clamp g
+    usat    r1, #5, r1, asr #10         @ clamp b
+                                        @
+    orr     r1, r1, r11, lsl #11        @ r1 = b | (r << 11)
+    orr     r1, r1, r7, lsl #5          @ r1 |= (g << 5)
+#else
     orr     r12, r1, r11, asr #1        @ check if clamping is needed...
     orr     r12, r12, r7                @ ...at all
     movs    r12, r12, asr #15           @
@@ -603,6 +668,7 @@ lcd_write_yuv420_lines_odither:
     and     r7, r7, #0x7e00             @ r1 = pixel = (r & 0xf800) |
     orr     r11, r11, r7, lsr #4        @ ((g & 0x7e00) >> 4) |
     orr     r1, r11, r1, lsr #10        @ (b >> 10)
+#endif
                                         @
 #if LCD_WIDTH >= LCD_HEIGHT
     add     r0, r0, #2*LCD_WIDTH
-- 
cgit