Merge "combine loopfilter data access"
This commit is contained in:
commit
48b1917112
813
vp8/common/x86/loopfilter_block_sse2.asm
Normal file
813
vp8/common/x86/loopfilter_block_sse2.asm
Normal file
@ -0,0 +1,813 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%macro LF_ABS 2
    ; Per-byte absolute difference of two xmm registers of unsigned bytes.
    ; Uses the saturating-subtract identity: for unsigned a, b
    ;   (a -us b) | (b -us a) == |a - b|
    ; because the subtraction in the "wrong" direction saturates to zero.
    ; %1 value not preserved (receives the result)
    ; %2 value preserved
    ; output in %1
    ; Clobbers: scratch1
    movdqa      scratch1, %2            ; v2

    psubusb     scratch1, %1            ; v2 - v1 (0 where v1 >= v2)
    psubusb     %1, %2                  ; v1 - v2 (0 where v2 >= v1)
    por         %1, scratch1            ; abs(v2 - v1)
%endmacro
|
||||
|
||||
%macro LF_FILTER_HEV_MASK 8-9
    ; Builds the loop-filter apply mask and the inverted high-edge-variance
    ; (hev) mask for one 16-pixel edge.
    ; %1..%8: p3 p2 p1 p0 q0 q1 q2 q3 (all are clobbered)
    ; %9 (optional): a previously computed abs(p1 - p0); the 8-argument form
    ;                computes it here, the 9-argument form reuses the
    ;                caller-saved copy from the preceding invocation.
    ; Outputs: %1 = mask (0xFF per byte where the filter may be applied)
    ;          %5 = ~hev (0xFF per byte where variance is at or below thresh)
    ; Reads [blimit], [limit], [thresh]; clobbers scratch1, scratch2.

    LF_ABS      %1, %2                  ; abs(p3 - p2)
    LF_ABS      %2, %3                  ; abs(p2 - p1)
    pmaxub      %1, %2                  ; accumulate mask
%if %0 == 8
    movdqa      scratch2, %3            ; save p1
    LF_ABS      scratch2, %4            ; abs(p1 - p0)
%endif
    LF_ABS      %4, %5                  ; abs(p0 - q0)
    LF_ABS      %5, %6                  ; abs(q0 - q1)
%if %0 == 8
    pmaxub      %5, scratch2            ; accumulate hev
%else
    pmaxub      %5, %9                  ; accumulate hev from saved abs(p1 - p0)
%endif
    pmaxub      %1, %5                  ; accumulate mask

    LF_ABS      %3, %6                  ; abs(p1 - q1)
    LF_ABS      %6, %7                  ; abs(q1 - q2)
    pmaxub      %1, %6                  ; accumulate mask
    LF_ABS      %7, %8                  ; abs(q2 - q3)
    pmaxub      %1, %7                  ; accumulate mask

    ; edge strength = 2 * abs(p0 - q0) + abs(p1 - q1) / 2
    ; There is no per-byte shift in SSE2, so the low bit of every byte is
    ; masked off (tfe) before a 16-bit psrlw emulates a per-byte >> 1.
    paddusb     %4, %4                  ; 2 * abs(p0 - q0)
    pand        %3, [GLOBAL(tfe)]
    psrlw       %3, 1                   ; abs(p1 - q1) / 2
    paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    ; mask = 0xFF where every interior delta <= limit AND edge <= blimit
    psubusb     %1, [limit]
    psubusb     %4, [blimit]
    por         %1, %4
    pcmpeqb     %1, zero                ; mask

    ; ~hev = 0xFF where max(abs(p1-p0), abs(q1-q0)) <= thresh
    psubusb     %5, [thresh]
    pcmpeqb     %5, zero                ; ~hev
%endmacro
|
||||
|
||||
%macro LF_FILTER 6
    ; Applies the VP8 4-tap loop filter across one edge (16 pixels at once).
    ; %1-%4: p1 p0 q0 q1 — updated in place with the filtered pixels
    ; %5: mask (consumed)
    ; %6: ~hev (from LF_FILTER_HEV_MASK)
    ; Pixels are biased by 0x80 (t80) to work in signed-byte arithmetic,
    ; then biased back before the results are returned in %1-%4.
    ; Clobbers scratch1, scratch2.

    movdqa      scratch2, %6            ; save ~hev

    ; vp8_filter = clamp(ps1 - qs1), kept only where hev is set
    pxor        %1, [GLOBAL(t80)]       ; ps1
    pxor        %4, [GLOBAL(t80)]       ; qs1
    movdqa      scratch1, %1
    psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
    pandn       scratch2, scratch1      ; vp8_filter &= hev  (andnot of ~hev)

    pxor        %2, [GLOBAL(t80)]       ; ps0
    pxor        %3, [GLOBAL(t80)]       ; qs0
    movdqa      scratch1, %3
    psubsb      scratch1, %2            ; qs0 - ps0
    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0); 3x total
    pand        %5, scratch2            ; &= mask

    movdqa      scratch2, %5
    paddsb      %5, [GLOBAL(t4)]        ; Filter1 = clamp(filter + 4)
    paddsb      scratch2, [GLOBAL(t3)]  ; Filter2 = clamp(filter + 3)

    ; Filter1 >> 3 — arithmetic shift emulated per byte: psrlw shifts words,
    ; t1f keeps the valid bits, te0 re-inserts sign bits for negative bytes.
    movdqa      scratch1, zero
    pcmpgtb     scratch1, %5            ; 0xFF for negative bytes
    psrlw       %5, 3
    pand        scratch1, [GLOBAL(te0)]
    pand        %5, [GLOBAL(t1f)]
    por         %5, scratch1

    psubsb      %3, %5                  ; qs0 - Filter1
    pxor        %3, [GLOBAL(t80)]       ; back to unsigned

    ; Filter2 >> 3 (same per-byte arithmetic-shift emulation)
    movdqa      scratch1, zero
    pcmpgtb     scratch1, scratch2
    psrlw       scratch2, 3
    pand        scratch1, [GLOBAL(te0)]
    pand        scratch2, [GLOBAL(t1f)]
    por         scratch2, scratch1

    paddsb      %2, scratch2            ; ps0 + Filter2
    pxor        %2, [GLOBAL(t80)]       ; back to unsigned

    ; outer tap adjustments: (Filter1 + 1) >> 1, applied only where !hev
    ; (per-byte arithmetic >> 1 via t7f/t80 sign fixup)
    paddsb      %5, [GLOBAL(t1)]
    movdqa      scratch1, zero
    pcmpgtb     scratch1, %5
    psrlw       %5, 1
    pand        scratch1, [GLOBAL(t80)]
    pand        %5, [GLOBAL(t7f)]
    por         %5, scratch1
    pand        %5, %6                  ; vp8_filter &= ~hev

    psubsb      %4, %5                  ; qs1 - vp8_filter
    pxor        %4, [GLOBAL(t80)]       ; back to unsigned

    paddsb      %1, %5                  ; ps1 + vp8_filter
    pxor        %1, [GLOBAL(t80)]       ; back to unsigned
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------
;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
; Filters the three internal horizontal edges of a 16-wide luma block
; (the edges at 4, 8 and 12 rows below src_ptr) in a single pass,
; reusing abs(p1 - p0) between consecutive edges via the 9-arg form of
; LF_FILTER_HEV_MASK.
;
; FIX: on Win64 (__OUTPUT_FORMAT__ == x64) xmm6-xmm15 are callee-saved.
; This function clobbers xmm6-xmm11 (scratch2, zero, and xmm8-xmm11),
; so they must be preserved with SAVE_XMM/RESTORE_XMM, exactly as
; vp8_loop_filter_bv_y_sse2 below already does. On SysV all xmm
; registers are volatile, so the %else path needs no save.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_bh_y_sse2)
sym(vp8_loop_filter_bh_y_sse2):

%ifidn __OUTPUT_FORMAT__,x64
    %define     src     rcx             ; src_ptr
    %define     stride  rdx             ; src_pixel_step
    %define     blimit  r8
    %define     limit   r9
    %define     thresh  r10

    %define     spp     rax
    %define     stride3 r11
    %define     stride5 r12
    %define     stride7 r13

    push        rbp
    mov         rbp, rsp
    SAVE_XMM 11                         ; xmm6-xmm11 are callee-saved on Win64
    push        r12
    push        r13
    mov         thresh, arg(4)          ; 5th argument arrives on the stack
%else
    %define     src     rdi             ; src_ptr
    %define     stride  rsi             ; src_pixel_step
    %define     blimit  rdx
    %define     limit   rcx
    %define     thresh  r8

    %define     spp     rax
    %define     stride3 r9
    %define     stride5 r10
    %define     stride7 r11
%endif

    %define     scratch1 xmm5
    %define     scratch2 xmm6
    %define     zero     xmm7

    ; i0-i15: the 16 rows of the block. Even rows address off src, odd rows
    ; off spp (= src + stride), so every row is reachable with a single
    ; scaled-index addressing mode and the stride3/5/7 helpers.
    %define     i0  [src]
    %define     i1  [spp]
    %define     i2  [src + 2 * stride]
    %define     i3  [spp + 2 * stride]
    %define     i4  [src + 4 * stride]
    %define     i5  [spp + 4 * stride]
    %define     i6  [src + 2 * stride3]
    %define     i7  [spp + 2 * stride3]
    %define     i8  [src + 8 * stride]
    %define     i9  [spp + 8 * stride]
    %define     i10 [src + 2 * stride5]
    %define     i11 [spp + 2 * stride5]
    %define     i12 [src + 4 * stride3]
    %define     i13 [spp + 4 * stride3]
    %define     i14 [src + 2 * stride7]
    %define     i15 [spp + 2 * stride7]

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]
    pxor        zero, zero

    ; first edge (between rows 3 and 4): load rows 0-7 into registers
    movdqa      xmm0, i0
    movdqa      xmm1, i1
    movdqa      xmm2, i2
    movdqa      xmm3, i3
    movdqa      xmm4, i4
    movdqa      xmm8, i5
    movdqa      xmm9, i6                ; q2, will contain abs(p1-p0)
    movdqa      xmm10, i7
    LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

    ; reload p1..q1 (clobbered by the mask computation) and filter
    movdqa      xmm1, i2
    movdqa      xmm2, i3
    movdqa      xmm3, i4
    movdqa      xmm8, i5
    LF_FILTER   xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
    movdqa      i2, xmm1
    movdqa      i3, xmm2

    ; second edge (between rows 7 and 8)
    movdqa      i4, xmm3
    movdqa      i5, xmm8

    movdqa      xmm0, i6
    movdqa      xmm1, i7
    movdqa      xmm2, i8
    movdqa      xmm4, i9
    movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
    movdqa      xmm11, i11
    ; xmm9 still holds abs(p1-p0) from the first edge -> 9-arg form
    LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

    movdqa      xmm0, i6
    movdqa      xmm1, i7
    movdqa      xmm4, i8
    movdqa      xmm8, i9
    LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
    movdqa      i6, xmm0
    movdqa      i7, xmm1

    ; last edge (between rows 11 and 12)
    movdqa      i8, xmm4
    movdqa      i9, xmm8

    movdqa      xmm0, i10
    movdqa      xmm1, i11
    movdqa      xmm2, i12
    movdqa      xmm3, i13
    movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
    movdqa      xmm11, i15
    LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

    movdqa      xmm0, i10
    movdqa      xmm1, i11
    movdqa      xmm3, i12
    movdqa      xmm8, i13
    LF_FILTER   xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
    movdqa      i10, xmm0
    movdqa      i11, xmm1
    movdqa      i12, xmm3
    movdqa      i13, xmm8

%ifidn __OUTPUT_FORMAT__,x64
    pop         r13
    pop         r12
    RESTORE_XMM                         ; matches SAVE_XMM 11 above
    pop         rbp
%endif

    ret
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------
;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
; Filters the three internal vertical edges of a 16x16 luma block.
; Strategy: transpose the 16x16 block of bytes onto a 256-byte stack
; buffer (i0-i15), run the same horizontal-edge filter used by
; vp8_loop_filter_bh_y_sse2 on the transposed rows, then transpose the
; result back and store it to the source. Uses all of xmm0-xmm15, so
; the Win64 path saves xmm6-xmm15 with SAVE_XMM 15.
;-----------------------------------------------------------------------

global sym(vp8_loop_filter_bv_y_sse2)
sym(vp8_loop_filter_bv_y_sse2):

%ifidn __OUTPUT_FORMAT__,x64
    %define     src     rcx             ; src_ptr
    %define     stride  rdx             ; src_pixel_step
    %define     blimit  r8
    %define     limit   r9
    %define     thresh  r10

    %define     spp     rax
    %define     stride3 r11
    %define     stride5 r12
    %define     stride7 r13

    push        rbp
    mov         rbp, rsp
    SAVE_XMM 15                         ; xmm6-xmm15 are callee-saved on Win64
    push        r12
    push        r13
    mov         thresh, arg(4)          ; 5th argument arrives on the stack
%else
    %define     src     rdi
    %define     stride  rsi
    %define     blimit  rdx
    %define     limit   rcx
    %define     thresh  r8

    %define     spp     rax
    %define     stride3 r9
    %define     stride5 r10
    %define     stride7 r11
%endif

    %define     scratch1 xmm5
    %define     scratch2 xmm6
    %define     zero     xmm7

    ; s0-s15: the 16 source rows (same addressing scheme as bh_y above)
    %define     s0  [src]
    %define     s1  [spp]
    %define     s2  [src + 2 * stride]
    %define     s3  [spp + 2 * stride]
    %define     s4  [src + 4 * stride]
    %define     s5  [spp + 4 * stride]
    %define     s6  [src + 2 * stride3]
    %define     s7  [spp + 2 * stride3]
    %define     s8  [src + 8 * stride]
    %define     s9  [spp + 8 * stride]
    %define     s10 [src + 2 * stride5]
    %define     s11 [spp + 2 * stride5]
    %define     s12 [src + 4 * stride3]
    %define     s13 [spp + 4 * stride3]
    %define     s14 [src + 2 * stride7]
    %define     s15 [spp + 2 * stride7]

    ; i0-i15: the 16 transposed rows, held in the stack scratch buffer
    %define     i0  [rsp]
    %define     i1  [rsp + 16]
    %define     i2  [rsp + 32]
    %define     i3  [rsp + 48]
    %define     i4  [rsp + 64]
    %define     i5  [rsp + 80]
    %define     i6  [rsp + 96]
    %define     i7  [rsp + 112]
    %define     i8  [rsp + 128]
    %define     i9  [rsp + 144]
    %define     i10 [rsp + 160]
    %define     i11 [rsp + 176]
    %define     i12 [rsp + 192]
    %define     i13 [rsp + 208]
    %define     i14 [rsp + 224]
    %define     i15 [rsp + 240]

    ALIGN_STACK 16, rax                 ; movdqa to i* requires 16B alignment

    ; reserve stack space
    %define     temp_storage 0          ; size is 256 (16*16)
    %define     stack_size   256
    sub         rsp, stack_size

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]

    ; --- TRANSPOSE: rows 8-f, byte->word->dword interleave ---
    movdqa      xmm0, s8
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, s9                ; 80 90
    punpckhbw   xmm1, s9                ; 88 98

    movdqa      xmm2, s10
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, s11               ; a0 b0
    punpckhbw   xmm3, s11               ; a8 b8

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
    punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
    punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

    ; using xmm[0124]
    ; work on next 4 rows

    movdqa      xmm3, s12
    movdqa      xmm5, xmm3
    punpcklbw   xmm3, s13               ; c0 d0
    punpckhbw   xmm5, s13               ; c8 d8

    movdqa      xmm6, s14
    movdqa      xmm7, xmm6
    punpcklbw   xmm6, s15               ; e0 f0
    punpckhbw   xmm7, s15               ; e8 f8

    movdqa      xmm8, xmm3
    punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
    punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

    movdqa      xmm6, xmm5
    punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
    punpckhwd   xmm6, xmm7              ; cc dc ec fc

    ; pull the third and fourth sets together

    movdqa      xmm7, xmm0
    punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
    punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

    movdqa      xmm3, xmm4
    punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
    punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

    movdqa      xmm8, xmm1
    punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
    punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
    punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

    ; save the calculations. we only have 15 registers ...
    movdqa      i0, xmm0
    movdqa      i1, xmm7
    movdqa      i2, xmm4
    movdqa      i3, xmm3
    movdqa      i4, xmm1
    movdqa      i5, xmm8
    movdqa      i6, xmm2
    movdqa      i7, xmm5

    ; --- TRANSPOSE: rows 0-7, same interleave pattern ---
    movdqa      xmm0, s0
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, s1                ; 00 10
    punpckhbw   xmm1, s1                ; 08 18

    movdqa      xmm2, s2
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, s3                ; 20 30
    punpckhbw   xmm3, s3                ; 28 38

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm2              ; 00 10 20 30
    punpckhwd   xmm4, xmm2              ; 04 14 24 34

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm3              ; 08 18 28 38
    punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

    ; using xmm[0124]
    ; work on next 4 rows

    movdqa      xmm3, s4
    movdqa      xmm5, xmm3
    punpcklbw   xmm3, s5                ; 40 50
    punpckhbw   xmm5, s5                ; 48 58

    movdqa      xmm6, s6
    movdqa      xmm7, xmm6
    punpcklbw   xmm6, s7                ; 60 70
    punpckhbw   xmm7, s7                ; 68 78

    movdqa      xmm8, xmm3
    punpcklwd   xmm3, xmm6              ; 40 50 60 70
    punpckhwd   xmm8, xmm6              ; 44 54 64 74

    movdqa      xmm6, xmm5
    punpcklwd   xmm5, xmm7              ; 48 58 68 78
    punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

    ; pull the first two sets together

    movdqa      xmm7, xmm0
    punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
    punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

    movdqa      xmm3, xmm4
    punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
    punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

    movdqa      xmm8, xmm1
    punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
    punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
    punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

    ; final combination: join the 0-7 halves with the saved 8-f halves
    movdqa      xmm6, xmm0
    punpcklqdq  xmm0, i0
    punpckhqdq  xmm6, i0

    movdqa      xmm9, xmm7
    punpcklqdq  xmm7, i1
    punpckhqdq  xmm9, i1

    movdqa      xmm10, xmm4
    punpcklqdq  xmm4, i2
    punpckhqdq  xmm10, i2

    movdqa      xmm11, xmm3
    punpcklqdq  xmm3, i3
    punpckhqdq  xmm11, i3

    movdqa      xmm12, xmm1
    punpcklqdq  xmm1, i4
    punpckhqdq  xmm12, i4

    movdqa      xmm13, xmm8
    punpcklqdq  xmm8, i5
    punpckhqdq  xmm13, i5

    movdqa      xmm14, xmm2
    punpcklqdq  xmm2, i6
    punpckhqdq  xmm14, i6

    movdqa      xmm15, xmm5
    punpcklqdq  xmm5, i7
    punpckhqdq  xmm15, i7

    movdqa      i0, xmm0
    movdqa      i1, xmm6
    movdqa      i2, xmm7
    movdqa      i3, xmm9
    movdqa      i4, xmm4
    movdqa      i5, xmm10
    movdqa      i6, xmm3
    movdqa      i7, xmm11
    movdqa      i8, xmm1
    movdqa      i9, xmm12
    movdqa      i10, xmm8
    movdqa      i11, xmm13
    movdqa      i12, xmm2
    movdqa      i13, xmm14
    movdqa      i14, xmm5
    movdqa      i15, xmm15

    ; TRANSPOSED DATA AVAILABLE ON THE STACK
    ; From here the structure mirrors bh_y: filter the edges at
    ; (transposed) rows 4, 8 and 12, i.e. source columns 4, 8 and 12.

    ; i1/i2 contents are still live in xmm6/xmm7; copy them aside because
    ; scratch2 (xmm6) and zero (xmm7) are about to be clobbered.
    movdqa      xmm12, xmm6
    movdqa      xmm13, xmm7

    pxor        zero, zero

    LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

    movdqa      xmm1, i2
    movdqa      xmm2, i3
    movdqa      xmm8, i4
    movdqa      xmm9, i5
    LF_FILTER   xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
    movdqa      i2, xmm1
    movdqa      i3, xmm2

    ; second set
    movdqa      i4, xmm8
    movdqa      i5, xmm9

    movdqa      xmm0, i6
    movdqa      xmm1, i7
    movdqa      xmm2, i8
    movdqa      xmm4, i9
    movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
    movdqa      xmm11, i11
    LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

    movdqa      xmm0, i6
    movdqa      xmm1, i7
    movdqa      xmm3, i8
    movdqa      xmm4, i9
    LF_FILTER   xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
    movdqa      i6, xmm0
    movdqa      i7, xmm1

    ; last set
    movdqa      i8, xmm3
    movdqa      i9, xmm4

    movdqa      xmm0, i10
    movdqa      xmm1, i11
    movdqa      xmm2, i12
    movdqa      xmm8, i13
    movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
    movdqa      xmm11, i15
    LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

    movdqa      xmm0, i10
    movdqa      xmm1, i11
    movdqa      xmm4, i12
    movdqa      xmm8, i13
    LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
    movdqa      i10, xmm0
    movdqa      i11, xmm1
    movdqa      i12, xmm4
    movdqa      i13, xmm8

    ; RESHUFFLE AND WRITE OUT
    ; Inverse transpose: identical interleave pattern, reading the
    ; filtered stack rows i* and writing the source rows s*.
    ; 8-f
    movdqa      xmm0, i8
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, i9                ; 80 90
    punpckhbw   xmm1, i9                ; 88 98

    movdqa      xmm2, i10
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, i11               ; a0 b0
    punpckhbw   xmm3, i11               ; a8 b8

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
    punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
    punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

    ; using xmm[0124]
    ; work on next 4 rows

    movdqa      xmm3, i12
    movdqa      xmm5, xmm3
    punpcklbw   xmm3, i13               ; c0 d0
    punpckhbw   xmm5, i13               ; c8 d8

    movdqa      xmm6, i14
    movdqa      xmm7, xmm6
    punpcklbw   xmm6, i15               ; e0 f0
    punpckhbw   xmm7, i15               ; e8 f8

    movdqa      xmm8, xmm3
    punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
    punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

    movdqa      xmm6, xmm5
    punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
    punpckhwd   xmm6, xmm7              ; cc dc ec fc

    ; pull the third and fourth sets together

    movdqa      xmm7, xmm0
    punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
    punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

    movdqa      xmm3, xmm4
    punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
    punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

    movdqa      xmm8, xmm1
    punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
    punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
    punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

    ; save the calculations. we only have 15 registers ...
    movdqa      i8, xmm0
    movdqa      i9, xmm7
    movdqa      i10, xmm4
    movdqa      i11, xmm3
    movdqa      i12, xmm1
    movdqa      i13, xmm8
    movdqa      i14, xmm2
    movdqa      i15, xmm5

    ; 0-7
    movdqa      xmm0, i0
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, i1                ; 00 10
    punpckhbw   xmm1, i1                ; 08 18

    movdqa      xmm2, i2
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, i3                ; 20 30
    punpckhbw   xmm3, i3                ; 28 38

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm2              ; 00 10 20 30
    punpckhwd   xmm4, xmm2              ; 04 14 24 34

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm3              ; 08 18 28 38
    punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

    ; using xmm[0124]
    ; work on next 4 rows

    movdqa      xmm3, i4
    movdqa      xmm5, xmm3
    punpcklbw   xmm3, i5                ; 40 50
    punpckhbw   xmm5, i5                ; 48 58

    movdqa      xmm6, i6
    movdqa      xmm7, xmm6
    punpcklbw   xmm6, i7                ; 60 70
    punpckhbw   xmm7, i7                ; 68 78

    movdqa      xmm8, xmm3
    punpcklwd   xmm3, xmm6              ; 40 50 60 70
    punpckhwd   xmm8, xmm6              ; 44 54 64 74

    movdqa      xmm6, xmm5
    punpcklwd   xmm5, xmm7              ; 48 58 68 78
    punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

    ; pull the first two sets together

    movdqa      xmm7, xmm0
    punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
    punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

    movdqa      xmm3, xmm4
    punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
    punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

    movdqa      xmm8, xmm1
    punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
    punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
    punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
    ; final combination

    movdqa      xmm6, xmm0
    punpcklqdq  xmm0, i8
    punpckhqdq  xmm6, i8

    movdqa      xmm9, xmm7
    punpcklqdq  xmm7, i9
    punpckhqdq  xmm9, i9

    movdqa      xmm10, xmm4
    punpcklqdq  xmm4, i10
    punpckhqdq  xmm10, i10

    movdqa      xmm11, xmm3
    punpcklqdq  xmm3, i11
    punpckhqdq  xmm11, i11

    movdqa      xmm12, xmm1
    punpcklqdq  xmm1, i12
    punpckhqdq  xmm12, i12

    movdqa      xmm13, xmm8
    punpcklqdq  xmm8, i13
    punpckhqdq  xmm13, i13

    movdqa      xmm14, xmm2
    punpcklqdq  xmm2, i14
    punpckhqdq  xmm14, i14

    movdqa      xmm15, xmm5
    punpcklqdq  xmm5, i15
    punpckhqdq  xmm15, i15

    ; write the fully re-transposed rows back to the source
    movdqa      s0, xmm0
    movdqa      s1, xmm6
    movdqa      s2, xmm7
    movdqa      s3, xmm9
    movdqa      s4, xmm4
    movdqa      s5, xmm10
    movdqa      s6, xmm3
    movdqa      s7, xmm11
    movdqa      s8, xmm1
    movdqa      s9, xmm12
    movdqa      s10, xmm8
    movdqa      s11, xmm13
    movdqa      s12, xmm2
    movdqa      s13, xmm14
    movdqa      s14, xmm5
    movdqa      s15, xmm15

    ; free stack space
    add         rsp, stack_size

    ; un-ALIGN_STACK
    pop         rsp

%ifidn __OUTPUT_FORMAT__,x64
    pop         r13
    pop         r12
    RESTORE_XMM
    pop         rbp
%endif

    ret
|
||||
|
||||
SECTION_RODATA
; 16-byte broadcast constants used by LF_FILTER / LF_FILTER_HEV_MASK.
; The teX/t1f/t7f/tfe masks exist because SSE2 has no per-byte shift:
; byte shifts are emulated with psrlw plus bit masking and sign fixup.
align 16
te0:                        ; 0xe0: sign bits re-inserted after emulated byte >> 3
    times 16 db 0xe0
align 16
t7f:                        ; 0x7f: valid-bit mask after emulated byte >> 1
    times 16 db 0x7f
align 16
tfe:                        ; 0xfe: clears each byte's low bit before psrlw >> 1
    times 16 db 0xfe
align 16
t1f:                        ; 0x1f: valid-bit mask after emulated byte >> 3
    times 16 db 0x1f
align 16
t80:                        ; 0x80: bias for unsigned <-> signed byte conversion
    times 16 db 0x80
align 16
t1:                         ; 0x01: rounding add for the outer-tap (filter+1)>>1
    times 16 db 0x01
align 16
t3:                         ; 0x03: Filter2 = clamp(vp8_filter + 3)
    times 16 db 0x03
align 16
t4:                         ; 0x04: Filter1 = clamp(vp8_filter + 4)
    times 16 db 0x04
|
@ -271,6 +271,7 @@
|
||||
|
||||
%endmacro
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
|
||||
;void vp8_loop_filter_horizontal_edge_sse2
|
||||
;(
|
||||
@ -321,6 +322,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
%endif
|
||||
|
||||
;void vp8_loop_filter_horizontal_edge_uv_sse2
|
||||
;(
|
||||
@ -1005,6 +1007,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
||||
movd [rdi+2*rcx+2], %2
|
||||
%endmacro
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
|
||||
;void vp8_loop_filter_vertical_edge_sse2
|
||||
;(
|
||||
@ -1072,6 +1075,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
%endif
|
||||
|
||||
;void vp8_loop_filter_vertical_edge_uv_sse2
|
||||
;(
|
||||
|
@ -17,8 +17,13 @@ prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
|
||||
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
|
||||
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
|
||||
|
||||
#if HAVE_SSE2 && ARCH_X86_64
|
||||
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
|
||||
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
|
||||
#else
|
||||
prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
|
||||
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
|
||||
#endif
|
||||
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
|
||||
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
|
||||
|
||||
@ -132,9 +137,13 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
|
||||
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi)
|
||||
{
|
||||
#if ARCH_X86_64
|
||||
vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
#else
|
||||
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
#endif
|
||||
|
||||
if (u_ptr)
|
||||
vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
|
||||
@ -153,9 +162,13 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned
|
||||
void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi)
|
||||
{
|
||||
#if ARCH_X86_64
|
||||
vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
#else
|
||||
vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
#endif
|
||||
|
||||
if (u_ptr)
|
||||
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
|
||||
|
@ -99,6 +99,9 @@ ifeq ($(CONFIG_POSTPROC),yes)
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
|
||||
endif
|
||||
ifeq ($(ARCH_X86_64),yes)
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
|
||||
endif
|
||||
|
||||
# common (c)
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
|
||||
|
Loading…
x
Reference in New Issue
Block a user