3a7d467da9
Change-Id: I38dad398844ee424a7a92a745ab703645018d02b
147 lines
3.7 KiB
NASM
147 lines
3.7 KiB
NASM
;
|
|
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
%ifdef __YASM_VERSION_ID__
|
|
%if __YASM_VERSION_ID__ < 01020000h
|
|
%error AVX2 is supported only by yasm 1.2.0 or later.
|
|
%endif
|
|
%endif
|
|
%include "x86inc.asm"
|
|
|
|
SECTION .text
|
|
|
|
; The numeric arguments to cglobal are, in order: the number of parameters,
; the number of general-purpose registers, and the number of mm registers.
|
|
|
|
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
|
|
|
|
; YUY2TOYROW format, mov_suffix [, name_suffix]
;   %1  source format. YUY2 keeps the even bytes of the source as Y; any
;       other value (UYVY in this file) keeps the odd bytes.
;   %2  text pasted onto "mov" for every vector load and store.
;   %3  optional text appended to the generated function name.
;
; Each pass of the loop reads 2 * mmsize source bytes, writes mmsize Y bytes
; and lowers pix by mmsize; the loop repeats while pix is still above zero.
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2                  ; every bit set ...
    psrlw      m2, m2, 8                   ; ... then 0x00ff in each word
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2                  ; Y lives in the even bytes
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8                   ; Y lives in the odd bytes
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1                  ; narrow the words back to bytes
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8                ; qword order 0,2,1,3: undo the
                                           ; per-128-bit-lane packing
%endif
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]   ; lea leaves the flags alone
    sub        pixd, mmsize                ; sets the flags tested by jg
    jg         .convertloop
    REP_RET
%endmacro
|
|
|
|
; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
|
|
; Expand YUY2TOYROW once per source format and load/store flavour.
; The trailing comma on the "a" lines passes an empty name suffix.

; MMX variants.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned

; SSE2 variants.
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned

; AVX2 variants ("a" flavour only).
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,
|
|
|
|
; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
|
|
|
|
; SplitUVRow mov_suffix [, name_suffix]
;   %1  text pasted onto "mov" for every vector load and store.
;   %2  optional text appended to the generated function name.
;
; De-interleaves src_uv: the even source bytes are stored to dst_u and the
; odd source bytes to dst_v. Each pass of the loop reads 2 * mmsize source
; bytes, stores mmsize bytes to each destination and lowers pix by mmsize;
; the loop repeats while pix is still above zero.
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4                  ; every bit set ...
    psrlw      m4, m4, 8                   ; ... then 0x00ff in each word
    sub        dst_vq, dst_uq              ; dst_v becomes an offset from
                                           ; dst_u, so only dst_u advances

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8                   ; odd bytes of the first vector
    pand       m0, m0, m4                  ; even bytes of the first vector
    psrlw      m3, m1, 8                   ; odd bytes of the second vector
    pand       m1, m1, m4                  ; even bytes of the second vector
    packuswb   m0, m0, m1                  ; even bytes, narrowed to bytes
    packuswb   m2, m2, m3                  ; odd bytes, narrowed to bytes
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8                ; qword order 0,2,1,3: undo the
    vpermq     m2, m2, 0xd8                ; per-128-bit-lane packing
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2       ; dst_u + (dst_v - dst_u)
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize                ; sets the flags tested by jg
    jg         .convertloop
    REP_RET
%endmacro
|
|
|
|
; Expand SplitUVRow once per load/store flavour.
; The trailing comma on the "a" lines passes an empty name suffix.

; MMX variants.
INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned

; SSE2 variants.
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned

; AVX2 variant ("a" flavour only).
INIT_YMM AVX2
SplitUVRow a,
|
|
|
|
; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
|
; int width);
|
|
|
|
; MergeUVRow_ mov_suffix [, name_suffix]
;   %1  text pasted onto "mov" for every vector load and store.
;   %2  optional text appended to the generated function name.
;
; Interleaves bytes from src_u and src_v into dst_uv as U,V,U,V,...
; Each pass of the loop reads mmsize bytes from each source, writes
; 2 * mmsize bytes to dst_uv and lowers pix by mmsize; the loop repeats
; while pix is still above zero.
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq              ; src_v becomes an offset from
                                           ; src_u, so only src_u advances

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]       ; src_u + (src_v - src_u)
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1                  ; first 8 UV pairs
    punpckhbw  m0, m0, m1                  ; next 8 UV pairs
%if cpuflag(AVX2)
    ; The unpacks interleave within each 128-bit lane, so the lanes are
    ; regrouped to put the output back in source order.
    vperm2i128 m1, m2, m0, 0x20            ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31            ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize                ; sets the flags tested by jg
    jg         .convertloop
    REP_RET
%endmacro
|
|
|
|
; Expand MergeUVRow_ once per load/store flavour.
; The trailing comma on the "a" lines passes an empty name suffix.

; MMX variants.
INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned

; SSE2 variants.
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned

; AVX2 variant ("a" flavour only).
INIT_YMM AVX2
MergeUVRow_ a,
|
|
|