380 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			380 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
; YUV-> RGB conversion code Copyright (C) 2008 Robin Watts (robin@wss.co.uk).
 | 
						|
;
 | 
						|
; Licensed under the GPL. If you need it under another license, contact me
 | 
						|
; and ask.
 | 
						|
;
 | 
						|
;  This program is free software ; you can redistribute it and/or modify
 | 
						|
;  it under the terms of the GNU General Public License as published by
 | 
						|
;  the Free Software Foundation ; either version 2 of the License, or
 | 
						|
;  (at your option) any later version.
 | 
						|
;
 | 
						|
;  This program is distributed in the hope that it will be useful,
 | 
						|
;  but WITHOUT ANY WARRANTY ; without even the implied warranty of
 | 
						|
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
;  GNU General Public License for more details.
 | 
						|
;
 | 
						|
;  You should have received a copy of the GNU General Public License
 | 
						|
;  along with this program ; if not, write to the Free Software
 | 
						|
;  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 | 
						|
;
 | 
						|
;
 | 
						|
; The algorithm used here is based heavily on one created by Sophie Wilson
 | 
						|
; of Acorn/e-14/Broadcom. Many thanks.
 | 
						|
;
 | 
						|
; Additional tweaks (in the fast fixup code) are from Paul Gardiner.
 | 
						|
;
 | 
						|
; The old implementation of YUV -> RGB did:
 | 
						|
;
 | 
						|
; R = CLAMP((Y-16)*1.164 +           1.596*V)
 | 
						|
; G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
 | 
						|
; B = CLAMP((Y-16)*1.164 + 2.018*U          )
 | 
						|
;
 | 
						|
; We're going to bend that here as follows:
 | 
						|
;
 | 
						|
; R = CLAMP(y +           1.596*V)
 | 
						|
; G = CLAMP(y - 0.383*U - 0.813*V)
 | 
						|
; B = CLAMP(y + 1.976*U          )
 | 
						|
;
 | 
						|
; where y = 0               for       Y <=  16,
 | 
						|
;       y = (  Y-16)*1.164, for  16 < Y <= 239,
 | 
						|
;       y = (239-16)*1.164, for 239 < Y
 | 
						|
;
 | 
						|
; i.e. We clamp Y to the 16 to 239 range (which it is supposed to be in
 | 
						|
; anyway). We then pick the B_U factor so that B never exceeds 511. We then
 | 
						|
; shrink the G_U factor in line with that to avoid a colour shift as much as
 | 
						|
; possible.
 | 
						|
;
 | 
						|
; We're going to use tables to do it faster, but rather than doing it using
 | 
						|
; 5 tables as the above suggests, we're going to do it using just 3.
 | 
						|
;
 | 
						|
; We do this by working in parallel within a 32 bit word, and using one
 | 
						|
; table each for Y U and V.
 | 
						|
;
 | 
						|
; Source Y values are    0 to 255, so    0.. 260 after scaling
 | 
						|
; Source U values are -128 to 127, so  -49.. 49(G), -253..251(B) after scaling
 | 
						|
; Source V values are -128 to 127, so -204..203(R), -104..103(G) after scaling
 | 
						|
;
 | 
						|
; So total summed values:
 | 
						|
; -223 <= R <= 481, -173 <= G <= 431, -253 <= B < 511
 | 
						|
;
 | 
						|
; We need to pack R G and B into a 32 bit word, and because of Bs range we
 | 
						|
; need 2 bits above the valid range of B to detect overflow, and another one
 | 
						|
; to detect the sense of the overflow. We therefore adopt the following
 | 
						|
; representation:
 | 
						|
;
 | 
						|
; osGGGGGgggggosBBBBBbbbosRRRRRrrr
 | 
						|
;
 | 
						|
; Each such word breaks down into 3 ranges.
 | 
						|
;
 | 
						|
; osGGGGGggggg   osBBBBBbbb   osRRRRRrrr
 | 
						|
;
 | 
						|
; Thus we have 8 bits for each B and R table entry, and 10 bits for G (good
 | 
						|
; as G is the most noticeable one). The s bit for each represents the sign,
 | 
						|
; and o represents the overflow.
 | 
						|
;
 | 
						|
; For R and B we pack the table by taking the 11 bit representation of their
 | 
						|
; values, and toggling bit 10 in the U and V tables.
 | 
						|
;
 | 
						|
; For the green case we calculate 4*G (thus effectively using 10 bits for the
 | 
						|
; valid range) truncate to 12 bits. We toggle bit 11 in the Y table.
 | 
						|
 | 
						|
; Theorarm library
 | 
						|
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
 | 
						|
 | 
						|
	AREA	|.text|, CODE, READONLY
 | 
						|
 | 
						|
	EXPORT	yuv420_2_rgb888
 | 
						|
	EXPORT	yuv420_2_rgb888_PROFILE
 | 
						|
 | 
						|
; void yuv420_2_rgb888
 | 
						|
;  uint8_t *dst_ptr
 | 
						|
;  uint8_t *y_ptr
 | 
						|
;  uint8_t *u_ptr
 | 
						|
;  uint8_t *v_ptr
 | 
						|
;  int      width
 | 
						|
;  int      height
 | 
						|
;  int      y_span
 | 
						|
;  int      uv_span
 | 
						|
;  int      dst_span
 | 
						|
;  int     *tables
 | 
						|
;  int      dither
 | 
						|
 | 
						|
;-----------------------------------------------------------------------
; yuv420_2_rgb888
;
; Converts planar YUV 4:2:0 into packed 3-byte pixels, written to memory
; in the order R, G, B.  Two source rows are converted per pass so that
; each U/V sample is fetched once and shared by a 2x2 block of pixels.
;
; In (registers):
;   r0 = dst_ptr   r1 = y_ptr   r2 = u_ptr   r3 = v_ptr
; In (stack, offsets as seen after the 9-register push below):
;   [r13,#9*4]  = width      [r13,#10*4] = height
;   [r13,#11*4] = y_span     [r13,#12*4] = uv_span
;   [r13,#13*4] = dst_span   [r13,#14*4] = tables
;   [r13,#15*4] = dither     (never read by this routine)
; Saved/restored: r4-r11; every exit pops them together with pc.
;
; Register roles once the prologue has run:
;   r4  = tables + 256 words      (table indexed by U)
;   r5  = CONST_flags             (one out-of-range flag bit per channel)
;   r8  = rows left in the low 16 bits; while a row is being walked the
;         top 16 bits hold a negative column counter
;   r9  = dst_span, r10 = y_span
;   r14 = tables                  (indexed by Y, and by V + 512)
;
; Packed pixel word as it is consumed below: the low byte is stored as
; R, bits 22-29 as G (ROR #22) and bits 11-18 as B (a further ROR #21).
; CONST_flags selects bits 8, 19 and 30, i.e. the bit directly above
; each of those 8-bit fields.  A set flag bit sends the pixel through
; one of the fixNNN stubs before it is stored.
;
; The packed counter in r8 presumably limits width and height to what
; fits in 16 bits - TODO confirm against callers.
;
; NOTE(review): yuv420_2_rgb888_PROFILE is EXPORTed earlier in this file
; but no label of that name is defined in the visible source - confirm.
;-----------------------------------------------------------------------
CONST_flags
	DCD	0x40080100		; bits 8, 19 and 30
yuv420_2_rgb888
	; r0 = dst_ptr
	; r1 = y_ptr
	; r2 = u_ptr
	; r3 = v_ptr
	; <> = width
	; <> = height
	; <> = y_span
	; <> = uv_span
	; <> = dst_span
	; <> = y_table
	; <> = dither
	STMFD	r13!,{r4-r11,r14}

	LDR	r8, [r13,#10*4]		; r8 = height
	LDR	r10,[r13,#11*4]		; r10= y_span
	LDR	r9, [r13,#13*4]		; r9 = dst_span
	LDR	r14,[r13,#14*4]		; r14= y_table
	LDR	r5, CONST_flags		; r5 = per-channel flag mask
	LDR	r11,[r13,#9*4]		; r11= width
	ADD	r4, r14, #256*4		; r4 = u_table (256 words past y_table)
	SUBS	r8, r8, #1		; r8 = height-1
	BLT	end			; height < 1: nothing to do
	BEQ	trail_row1		; height == 1: single-row path
yloop1
	; Here r11 = width and the low 16 bits of r8 = rows left minus one.
	SUB	r8, r8, r11,LSL #16	; top half of r8 = -width
	ADDS	r8, r8, #1<<16		; top half = 1-width
	BGE	trail_pair1		; width <= 1: no full 2x2 block
xloop1
	LDRB	r11,[r2], #1		; r11 = u  = *u_ptr++
	LDRB	r12,[r3], #1		; r12 = v  = *v_ptr++
	LDRB	r7, [r1, r10]		; r7  = y2 = y_ptr[y_span]
	LDRB	r6, [r1], #1		; r6  = y0 = *y_ptr++
	ADD	r12,r12,#512		; v_table starts 512 words into tables
	LDR	r11,[r4, r11,LSL #2]	; r11 = u  = u_table[u]
	LDR	r12,[r14,r12,LSL #2]	; r12 = v  = v_table[v]
	LDR	r7, [r14,r7, LSL #2]	; r7  = y2 = y_table[y2]
	LDR	r6, [r14,r6, LSL #2]	; r6  = y0 = y_table[y0]
	ADD	r11,r11,r12		; r11 = uv = u+v (reused for y1/y3)

	ADD	r7, r7, r11		; r7  = y2 + uv
	ADD	r6, r6, r11		; r6  = y0 + uv
	ANDS	r12,r7, r5		; r12 = flags raised by r7
	TSTEQ	r6, r5			; only if r7 was clean: test r6
	BNE	fix101			; either pixel out of range
return101
	; Store the bottom one first
	ADD	r12,r0, r9		; r12 = dst_ptr + dst_span
	STRB	r7,[r12],#1		; Store R
	MOV	r7, r7, ROR #22
	STRB	r7,[r12],#1		; Store G
	MOV	r7, r7, ROR #21
	STRB	r7,[r12],#1		; Store B

	; Then store the top one (interleaved with the next pair of loads)
	STRB	r6,[r0], #1		; Store R
	MOV	r6, r6, ROR #22
	STRB	r6,[r0], #1		; Store G

	LDRB	r7, [r1, r10]		; r7 = y3 = y_ptr[y_span]
	LDRB	r12,[r1], #1		; r12= y1 = *y_ptr++
	MOV	r6, r6, ROR #21
	LDR	r7, [r14, r7, LSL #2]	; r7 = y3 = y_table[y3]
	LDR	r12,[r14, r12,LSL #2]	; r12= y1 = y_table[y1]
	STRB	r6,[r0], #1		; Store B

	ADD	r7, r7, r11		; r7  = y3 + uv
	ADD	r6, r12,r11		; r6  = y1 + uv
	ANDS	r12,r7, r5		; r12 = flags raised by r7
	TSTEQ	r6, r5			; only if r7 was clean: test r6
	BNE	fix102			; either pixel out of range
return102
	; Store the bottom one first
	ADD	r12,r0, r9		; r12 = dst_ptr + dst_span
	STRB	r7,[r12],#1		; Store R
	MOV	r7, r7, ROR #22
	STRB	r7,[r12],#1		; Store G
	MOV	r7, r7, ROR #21
	STRB	r7,[r12],#1		; Store B

	; Then store the top one
	STRB	r6,[r0], #1		; Store R
	MOV	r6, r6, ROR #22
	STRB	r6,[r0], #1		; Store G
	MOV	r6, r6, ROR #21
	STRB	r6,[r0], #1		; Store B

	ADDS	r8, r8, #2<<16		; two more columns done
	BLT	xloop1			; column counter still negative
	MOVS	r8, r8, LSL #16		; C = old bit 16 (column counter ended on 1)
	MOV	r8, r8, LSR #16		; r8 = row counter only; flags untouched
	BCC	trail_pair1		; counter ended on 0: one column remains
end_xloop1
	; Step every pointer from the end of this row pair to the start of
	; the next one.
	LDR	r11,[r13,#9*4]		; r11= width
	LDR	r12,[r13,#12*4]		; r12= uv_span
	ADD	r0, r0, r9, LSL #1	; dst += 2*dst_span
	SUB	r0, r0, r11,LSL #1	; dst -= 2*width
	SUB	r0, r0, r11		; dst -= width   (3 bytes per pixel)
	ADD	r1, r1, r10,LSL #1	; y   += 2*y_span
	SUB	r1, r1, r11		; y   -= width
	SUB	r2, r2, r11,LSR #1	; u   -= width>>1 (trail_pair1 does not
	SUB	r3, r3, r11,LSR #1	; v   -= width>>1  advance u/v)
	ADD	r2, r2, r12		; u   += uv_span
	ADD	r3, r3, r12		; v   += uv_span

	SUBS	r8, r8, #2		; two rows consumed
	BGT	yloop1			; at least two rows still to do

	LDMLTFD	r13!,{r4-r11,pc}	; no rows left: return
	; Otherwise exactly one row is left; fall through.
trail_row1
	; We have a row of pixels left to do
	; r11 = width here on both entry paths (prologue or end_xloop1).
	SUB	r8, r8, r11,LSL #16	; top half of r8 = -width
	ADDS	r8, r8, #1<<16		; top half = 1-width
	BGE	trail_pix1		; width <= 1: just do 1 pixel
xloop12
	LDRB	r11,[r2], #1		; r11 = u  = *u_ptr++
	LDRB	r12,[r3], #1		; r12 = v  = *v_ptr++
	LDRB	r6, [r1], #1		; r6  = y0 = *y_ptr++
	LDRB	r7, [r1], #1		; r7  = y1 = *y_ptr++
	ADD	r12,r12,#512		; v_table starts 512 words into tables
	LDR	r11,[r4, r11,LSL #2]	; r11 = u  = u_table[u]
	LDR	r12,[r14,r12,LSL #2]	; r12 = v  = v_table[v]
	LDR	r7, [r14,r7, LSL #2]	; r7  = y1 = y_table[y1]
	LDR	r6, [r14,r6, LSL #2]	; r6  = y0 = y_table[y0]
	ADD	r11,r11,r12		; r11 = uv = u+v

	ADD	r6, r6, r11		; r6  = y0 + uv
	ADD	r7, r7, r11		; r7  = y1 + uv
	ANDS	r12,r7, r5		; r12 = flags raised by r7
	TSTEQ	r6, r5			; only if r7 was clean: test r6
	BNE	fix104			; either pixel out of range
return104
	; Store the left pixel (y0) first
	STRB	r6,[r0], #1		; Store R
	MOV	r6, r6, ROR #22
	STRB	r6,[r0], #1		; Store G
	MOV	r6, r6, ROR #21
	STRB	r6,[r0], #1		; Store B

	; Then store the right pixel (y1)
	STRB	r7,[r0], #1		; Store R
	MOV	r7, r7, ROR #22
	STRB	r7,[r0], #1		; Store G
	MOV	r7, r7, ROR #21
	STRB	r7,[r0], #1		; Store B

	ADDS	r8, r8, #2<<16		; two more columns done
	BLT	xloop12			; column counter still negative
	MOVS	r8, r8, LSL #16		; C = old bit 16 (column counter ended on 1)
	MOV	r8, r8, LSR #16		; r8 = row counter only; flags untouched
	BCC	trail_pix1		; counter ended on 0: one pixel remains
end
	LDMFD	r13!,{r4-r11,pc}
;-----------------------------------------------------------------------
; trail_pix1
; Converts the single remaining pixel of the last row, then returns to
; the caller of yuv420_2_rgb888 by popping r4-r11 and pc.
; Expects: r0 = dst, r1 = y, r2 = u, r3 = v, r4 = u_table,
;          r5 = flag mask, r14 = y_table (v_table at +512 words).
;-----------------------------------------------------------------------
trail_pix1
	; We have a single extra pixel to do
	LDRB	r11,[r2], #1		; r11 = u  = *u_ptr++
	LDRB	r12,[r3], #1		; r12 = v  = *v_ptr++
	LDRB	r6, [r1], #1		; r6  = y0 = *y_ptr++
	ADD	r12,r12,#512		; v_table starts 512 words into tables
	LDR	r11,[r4, r11,LSL #2]	; r11 = u  = u_table[u]
	LDR	r12,[r14,r12,LSL #2]	; r12 = v  = v_table[v]
	LDR	r6, [r14,r6, LSL #2]	; r6  = y0 = y_table[y0]
	ADD	r11,r11,r12		; r11 = uv = u+v

	ADD	r6, r6, r11		; r6  = y0 + uv
	ANDS	r12,r6, r5		; r12 = flags raised by r6
	BNE	fix105			; pixel out of range
return105
	STRB	r6,[r0], #1		; Store R
	MOV	r6, r6, ROR #22
	STRB	r6,[r0], #1		; Store G
	MOV	r6, r6, ROR #21
	STRB	r6,[r0], #1		; Store B

	LDMFD	r13!,{r4-r11,pc}

;-----------------------------------------------------------------------
; trail_pair1
; Converts the last column of a row pair when one column is left over:
; one pixel from the current row and the one below it, sharing a single
; U/V sample.  Rejoins the main routine at end_xloop1.
; Expects: r0 = dst, r1 = y, r2 = u, r3 = v, r4 = u_table,
;          r5 = flag mask, r9 = dst_span, r10 = y_span,
;          r14 = y_table (v_table at +512 words).
; u_ptr and v_ptr are deliberately NOT advanced here: end_xloop1 rewinds
; them by width>>1, which counts only the increments made in xloop1.
;-----------------------------------------------------------------------
trail_pair1
	; We have a pair of pixels left to do
	LDRB	r11,[r2]		; r11 = u  = *u_ptr   (no increment)
	LDRB	r12,[r3]		; r12 = v  = *v_ptr   (no increment)
	LDRB	r7, [r1, r10]		; r7  = y2 = y_ptr[y_span]
	LDRB	r6, [r1], #1		; r6  = y0 = *y_ptr++
	ADD	r12,r12,#512		; v_table starts 512 words into tables
	LDR	r11,[r4, r11,LSL #2]	; r11 = u  = u_table[u]
	LDR	r12,[r14,r12,LSL #2]	; r12 = v  = v_table[v]
	LDR	r7, [r14,r7, LSL #2]	; r7  = y2 = y_table[y2]
	LDR	r6, [r14,r6, LSL #2]	; r6  = y0 = y_table[y0]
	ADD	r11,r11,r12		; r11 = uv = u+v

	ADD	r7, r7, r11		; r7  = y2 + uv
	ADD	r6, r6, r11		; r6  = y0 + uv
	ANDS	r12,r7, r5		; r12 = flags raised by r7
	TSTEQ	r6, r5			; only if r7 was clean: test r6
	BNE	fix103			; either pixel out of range
return103
	; Store the bottom one first
	ADD	r12,r0, r9		; r12 = dst_ptr + dst_span
	STRB	r7,[r12],#1		; Store R
	MOV	r7, r7, ROR #22
	STRB	r7,[r12],#1		; Store G
	MOV	r7, r7, ROR #21
	STRB	r7,[r12],#1		; Store B

	; Then store the top one
	STRB	r6,[r0], #1		; Store R
	MOV	r6, r6, ROR #22
	STRB	r6,[r0], #1		; Store G
	MOV	r6, r6, ROR #21
	STRB	r6,[r0], #1		; Store B
	B	end_xloop1
;-----------------------------------------------------------------------
; fix101 - out-of-range fix-up for the first pixel pair in xloop1.
; In:  r7, r6 = packed pixels, at least one with a flag bit set
;      r12    = r7 & r5 (left by the ANDS before the branch)
;      r5     = flag mask (bits 8, 19, 30)
; Out: r7, r6 with every flagged 8-bit field forced to 0x00 or 0xFF;
;      r12 is clobbered.  Resumes at return101.
; For each flagged channel the field is first filled with ones; if the
; bit directly above the flag is clear, 1 is then added at the field's
; lowest bit so the field wraps round to zero.
; Assumes the tables keep that upper bit set in every channel that is
; in range, otherwise the final ADD would disturb it - TODO confirm
; against the table generator.
;-----------------------------------------------------------------------
fix101
	; r7 and r6 are the values, at least one of which has overflowed
	; r12 = r7 & mask = flag bits raised by r7
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r7, r7, r12		; flagged fields of r7 become all ones
	BIC	r12,r5, r7, LSR #1	; r12 = flag where bit above it is clear
	ADD	r7, r7, r12,LSR #8	; r7  = fixed value (those fields wrap to 0)

	AND	r12, r6, r5		; r12 = flag bits raised by r6
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r6, r6, r12		; flagged fields of r6 become all ones
	BIC	r12,r5, r6, LSR #1	; r12 = flag where bit above it is clear
	ADD	r6, r6, r12,LSR #8	; r6  = fixed value
	B	return101
;-----------------------------------------------------------------------
; fix102 - out-of-range fix-up for the second pixel pair in xloop1.
; In:  r7, r6 = packed pixels, at least one with a flag bit set
;      r12    = r7 & r5 (left by the ANDS before the branch)
;      r5     = flag mask (bits 8, 19, 30)
; Out: r7, r6 with every flagged 8-bit field forced to 0x00 or 0xFF;
;      r12 is clobbered.  Resumes at return102.
; For each flagged channel the field is first filled with ones; if the
; bit directly above the flag is clear, 1 is then added at the field's
; lowest bit so the field wraps round to zero.
; Assumes the tables keep that upper bit set in every channel that is
; in range, otherwise the final ADD would disturb it - TODO confirm
; against the table generator.
;-----------------------------------------------------------------------
fix102
	; r7 and r6 are the values, at least one of which has overflowed
	; r12 = r7 & mask = flag bits raised by r7
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r7, r7, r12		; flagged fields of r7 become all ones
	BIC	r12,r5, r7, LSR #1	; r12 = flag where bit above it is clear
	ADD	r7, r7, r12,LSR #8	; r7  = fixed value (those fields wrap to 0)

	AND	r12, r6, r5		; r12 = flag bits raised by r6
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r6, r6, r12		; flagged fields of r6 become all ones
	BIC	r12,r5, r6, LSR #1	; r12 = flag where bit above it is clear
	ADD	r6, r6, r12,LSR #8	; r6  = fixed value
	B	return102
;-----------------------------------------------------------------------
; fix103 - out-of-range fix-up for the trailing column (trail_pair1).
; In:  r7, r6 = packed pixels, at least one with a flag bit set
;      r12    = r7 & r5 (left by the ANDS before the branch)
;      r5     = flag mask (bits 8, 19, 30)
; Out: r7, r6 with every flagged 8-bit field forced to 0x00 or 0xFF;
;      r12 is clobbered.  Resumes at return103.
; For each flagged channel the field is first filled with ones; if the
; bit directly above the flag is clear, 1 is then added at the field's
; lowest bit so the field wraps round to zero.
; Assumes the tables keep that upper bit set in every channel that is
; in range, otherwise the final ADD would disturb it - TODO confirm
; against the table generator.
;-----------------------------------------------------------------------
fix103
	; r7 and r6 are the values, at least one of which has overflowed
	; r12 = r7 & mask = flag bits raised by r7
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r7, r7, r12		; flagged fields of r7 become all ones
	BIC	r12,r5, r7, LSR #1	; r12 = flag where bit above it is clear
	ADD	r7, r7, r12,LSR #8	; r7  = fixed value (those fields wrap to 0)

	AND	r12, r6, r5		; r12 = flag bits raised by r6
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r6, r6, r12		; flagged fields of r6 become all ones
	BIC	r12,r5, r6, LSR #1	; r12 = flag where bit above it is clear
	ADD	r6, r6, r12,LSR #8	; r6  = fixed value
	B	return103
;-----------------------------------------------------------------------
; fix104 - out-of-range fix-up for the single-row loop (xloop12).
; In:  r7, r6 = packed pixels, at least one with a flag bit set
;      r12    = r7 & r5 (left by the ANDS before the branch)
;      r5     = flag mask (bits 8, 19, 30)
; Out: r7, r6 with every flagged 8-bit field forced to 0x00 or 0xFF;
;      r12 is clobbered.  Resumes at return104.
; For each flagged channel the field is first filled with ones; if the
; bit directly above the flag is clear, 1 is then added at the field's
; lowest bit so the field wraps round to zero.
; Assumes the tables keep that upper bit set in every channel that is
; in range, otherwise the final ADD would disturb it - TODO confirm
; against the table generator.
;-----------------------------------------------------------------------
fix104
	; r7 and r6 are the values, at least one of which has overflowed
	; r12 = r7 & mask = flag bits raised by r7
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r7, r7, r12		; flagged fields of r7 become all ones
	BIC	r12,r5, r7, LSR #1	; r12 = flag where bit above it is clear
	ADD	r7, r7, r12,LSR #8	; r7  = fixed value (those fields wrap to 0)

	AND	r12, r6, r5		; r12 = flag bits raised by r6
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r6, r6, r12		; flagged fields of r6 become all ones
	BIC	r12,r5, r6, LSR #1	; r12 = flag where bit above it is clear
	ADD	r6, r6, r12,LSR #8	; r6  = fixed value
	B	return104
;-----------------------------------------------------------------------
; fix105 - out-of-range fix-up for the final single pixel (trail_pix1).
; In:  r6  = packed pixel with at least one flag bit set
;      r12 = r6 & r5 (left by the ANDS before the branch)
;      r5  = flag mask (bits 8, 19, 30)
; Out: r6 with every flagged 8-bit field forced to 0x00 or 0xFF;
;      r12 is clobbered.  Resumes at return105.
; For each flagged channel the field is first filled with ones; if the
; bit directly above the flag is clear, 1 is then added at the field's
; lowest bit so the field wraps round to zero.
; Assumes the tables keep that upper bit set in every channel that is
; in range, otherwise the final ADD would disturb it - TODO confirm
; against the table generator.
;-----------------------------------------------------------------------
fix105
	; r6 is the value, which has overflowed
	; r12 = r6 & mask = flag bits raised by r6
	SUB	r12,r12,r12,LSR #8	; r12 = 0xFF in each flagged field
	ORR	r6, r6, r12		; flagged fields of r6 become all ones
	BIC	r12,r5, r6, LSR #1	; r12 = flag where bit above it is clear
	ADD	r6, r6, r12,LSR #8	; r6  = fixed value (those fields wrap to 0)
	B	return105

	END
 |