; Source revision: 429dc676b1
;
; Commit message:
;   A large number of functions were defined with external linkage, even
;   though they were only used from within one file. This patch changes
;   their linkage to static and removes the vp8_ prefix from their names,
;   which should make it more obvious to the reader that the function is
;   contained within the current translation unit. Functions that were
;   not referenced were removed.
;
;   These symbols were identified by:
;     $ nm -A libvpx.a | sort -k3 | uniq -c -f2 | grep ' [A-Z] ' \
;         | sort | grep '^ *1 '
;
;   Change-Id: I59609f58ab65312012c047036ae1e0634f795779
;
; File info: 728 lines, 23 KiB, NASM
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; ABI helper macros used throughout this file (sym, arg, GLOBAL, GET_GOT,
; SHADOW_ARGS_TO_STACK, SECTION_RODATA, ...) come from this include.
%include "vpx_ports/x86_abi_support.asm"

; NOTE(review): BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced
; by any code visible in this file; they are kept for compatibility.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128

; Right shift applied after adding the rounding constant `rd` (0x40).
; Every filter in the tables at the end of this file sums to 128 (1 << 7).
%define VP8_FILTER_SHIFT  7

;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter. For each of output_height rows it reads the source
; bytes p[-2]..p[6], forms four filtered values
;     out[i] = clamp_u8((sum_k tap[k] * p[i - 2 + k] + 64) >> 7),  i = 0..3
; and stores them as four 16-bit words (8 bytes) at output_ptr.
;
; Notes grounded in the code below:
;   * vp8_filter points at six taps spaced 16 bytes apart; only the first
;     four words of each tap are loaded.
;   * output_width is added to output_ptr after every row, i.e. it is used
;     as the output row advance in bytes.
;   * pixel_step (arg 3) is never read.
;   * Products are accumulated with signed saturation (paddsw).
;   * Clobbers rax, rcx, rdx, mm0-mm7 and flags (plus r8 on 64-bit builds).
;     No emms is executed here.
global sym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                 ; rdx = vp8_filter

    ; Taps 1..4 live in registers; taps 0 and 5 are read from memory in the loop.
    movq        mm1, [rdx + 16]             ; mm1 = tap 1
    movq        mm2, [rdx + 32]             ; mm2 = tap 2
    movq        mm6, [rdx + 48]             ; mm6 = tap 3
    movq        mm7, [rdx + 64]             ; mm7 = tap 4

    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(5)       ; rax = output_width (output advance per row)
    pxor        mm0, mm0                    ; mm0 = 0, used for byte<->word (un)packing

.nextrow:
    movq        mm3, [rsi-2]                ; mm3 = p-2..p5 (bytes)
    movq        mm4, mm3                    ; mm4 = p-2..p5
    psrlq       mm3, 8                      ; mm3 = p-1..p5
    punpcklbw   mm3, mm0                    ; mm3 = p-1..p2 (words)
    pmullw      mm3, mm1                    ; mm3 = tap 1 products

    movq        mm5, mm4                    ; mm5 = p-2..p5
    punpckhbw   mm4, mm0                    ; mm4 = p2..p5 (words)
    pmullw      mm4, mm7                    ; mm4 = tap 4 products
    paddsw      mm3, mm4                    ; mm3 += tap 4 products

    movq        mm4, mm5                    ; mm4 = p-2..p5
    psrlq       mm5, 16                     ; mm5 = p0..p5
    punpcklbw   mm5, mm0                    ; mm5 = p0..p3 (words)
    pmullw      mm5, mm2                    ; mm5 = tap 2 products
    paddsw      mm3, mm5                    ; mm3 += tap 2 products

    movq        mm5, mm4                    ; mm5 = p-2..p5
    psrlq       mm4, 24                     ; mm4 = p1..p5
    punpcklbw   mm4, mm0                    ; mm4 = p1..p4 (words)
    pmullw      mm4, mm6                    ; mm4 = tap 3 products
    paddsw      mm3, mm4                    ; mm3 += tap 3 products

    ; outer taps (0 and 5), multiplied straight from the filter table
    movd        mm4, [rsi+3]                ; mm4 = p3..p6 (bytes)
    punpcklbw   mm4, mm0                    ; mm4 = p3..p6 (words)
    pmullw      mm4, [rdx+80]               ; mm4 = tap 5 products
    paddsw      mm3, mm4                    ; mm3 += tap 5 products

    punpcklbw   mm5, mm0                    ; mm5 = p-2..p1 (words)
    pmullw      mm5, [rdx]                  ; mm5 = tap 0 products
    paddsw      mm3, mm5                    ; mm3 += tap 0 products

    paddsw      mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; saturate to 0..255 ...
    punpcklbw   mm3, mm0                    ; ... and widen back to words

    movq        [rdi], mm3                  ; store four 16-bit results

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(2)       ; src += src_pixels_per_line
    add         rdi, rax                    ; output += output_width
%else
    movsxd      r8, dword ptr arg(2)        ; r8 = src_pixels_per_line
    add         rdi, rax                    ; output += output_width

    add         rsi, r8                     ; src += src_pixels_per_line
%endif

    dec         rcx                         ; one row done
    jnz         .nextrow

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit intermediate rows. For each of
; output_height rows it combines six source rows (row -2 .. row +3 relative
; to src_ptr), four words wide, as
;     out[i] = clamp_u8((sum_k tap[k] * row[k - 2][i] + 64) >> 7),  i = 0..3
; and stores the four resulting bytes at output_ptr.
;
; Notes grounded in the code below:
;   * pixels_per_line is added to the source address unscaled, so it acts
;     as the byte distance between consecutive source rows.
;   * output_pitch is the byte advance of output_ptr per row.
;   * pixel_step (arg 4) and output_width (arg 6) are never read.
;   * Products are accumulated with signed saturation (paddsw).
;   * rbx is saved/restored around its use as the filter pointer.
;   * Clobbers rax, rcx, rdx, mm0-mm7 and flags. No emms is executed here.
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The rounding constant is fetched before rbx is repurposed below.
    ; NOTE(review): presumably because GLOBAL() may address through the
    ; register given to GET_GOT (rbx) - confirm in x86_abi_support.asm.
    movq        mm5, [GLOBAL(rd)]           ; mm5 = rounding constant (64)
    push        rbx
    mov         rbx, arg(7)                 ; rbx = vp8_filter
    movq        mm1, [rbx + 16]             ; mm1 = tap 1
    movq        mm2, [rbx + 32]             ; mm2 = tap 2
    movq        mm6, [rbx + 48]             ; mm6 = tap 3
    movq        mm7, [rbx + 64]             ; mm7 = tap 4

    movsxd      rdx, dword ptr arg(3)       ; rdx = pixels_per_line (row stride)
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi -> row -2
    movsxd      rcx, DWORD PTR arg(5)       ; rcx = output_height (row counter)
    movsxd      rax, DWORD PTR arg(2)       ; rax = output_pitch
    pxor        mm0, mm0                    ; mm0 = 0

.nextrow_cv:
    movq        mm3, [rsi+rdx]              ; mm3 = p0..p3 of row -1
    pmullw      mm3, mm1                    ; mm3 = tap 1 products

    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 of row 2
    pmullw      mm4, mm7                    ; mm4 = tap 4 products
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 of row 0
    pmullw      mm4, mm2                    ; mm4 = tap 2 products
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi]                  ; mm4 = p0..p3 of row -2
    pmullw      mm4, [rbx]                  ; mm4 = tap 0 products
    paddsw      mm3, mm4                    ; mm3 += mm4

    ; Step the source down one row here: it avoids needing 3*stride and
    ; 5*stride addressing, and doubles as the per-iteration advance.
    add         rsi, rdx
    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 of row 1
    pmullw      mm4, mm6                    ; mm4 = tap 3 products
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 of row 3
    pmullw      mm4, [rbx +80]              ; mm4 = tap 5 products
    paddsw      mm3, mm4                    ; mm3 += mm4

    paddsw      mm3, mm5                    ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; saturate to 0..255 bytes

    movd        [rdi],mm3                   ; store four result bytes
    ; Each iteration re-reads most of the rows loaded by the previous one.
    ; The original author notes the data should already be in cache, and
    ; that the repeated reads are avoidable.
    lea         rdi, [rdi+rax]              ; output += output_pitch
    dec         rcx                         ; one row done
    jnz         .nextrow_cv

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 8 bytes per row for 8 rows.
;   horizontal:  H(r)[i] = (src[r][i]*HF[0] + src[r][i+1]*HF[1] + 64) >> 7
;   vertical:    dst[r][i] = (H(r)[i]*VF[0] + H(r+1)[i]*VF[1] + 64) >> 7
; where HF / VF are the entries of vp8_bilinear_filters_mmx selected by
; xoffset / yoffset (32 bytes per entry, second tap at +16).
;
; Notes grounded in the code below:
;   * The previous row's horizontal result is carried in mm7 as bytes packed
;     with unsigned saturation.
;   * Each source row is read as 8 bytes at [rsi] and 8 bytes at [rsi+1];
;     one more source row than output rows is read.
;   * The loop ends when dst reaches dst_ptr + 8*dst_pitch.
;   * Clobbers rax, rcx, rdx, mm0-mm7 and flags (plus r8 on 64-bit builds).
;     No emms is executed here.
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    shl         rax, 5                      ; xoffset * 32 bytes per filter entry
    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ; rsi = src_ptr

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter tap 0

    movq        mm2, [rax+16]               ; mm2 = HFilter tap 1
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0

    shl         rax, 5                      ; yoffset * 32
    add         rax, rcx                    ; rax = VFilter

    lea         rcx, [rdi+rdx*8]            ; rcx = dst_ptr + 8*dst_pitch (loop end)
    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; Horizontal pass for the first source row; result is kept in mm7.
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy of the current line

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] (words)
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] (words)

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] (words)
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] (words)

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = H(row 0) as 8 bytes

    add         rsi, rdx                    ; next source line
.next_row_8x8:
    ; Horizontal pass for the current source row -> mm3 (low), mm4 (high).
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy of the current line

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] (words)
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] (words)

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    ; Previous row's horizontal result times the first vertical tap.
    movq        mm5, mm7
    movq        mm6, mm7

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]
    pmullw      mm6, [rax]

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    ; Save this row's horizontal result for the next iteration.
    movq        mm7, mm3
    packuswb    mm7, mm4

    ; Current row times the second vertical tap, then combine.
    pmullw      mm3, [rax+16]
    pmullw      mm4, [rax+16]

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    packuswb    mm3, mm4

    movq        [rdi], mm3                  ; store 8 result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source line
    add         rdi, dword ptr arg(5)       ; dst += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source line
    add         rdi, r8                     ; dst += dst_pitch
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 8*dst_pitch?
    jne         .next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 8 bytes per row for 4 rows.
;   horizontal:  H(r)[i] = (src[r][i]*HF[0] + src[r][i+1]*HF[1] + 64) >> 7
;   vertical:    dst[r][i] = (H(r)[i]*VF[0] + H(r+1)[i]*VF[1] + 64) >> 7
; where HF / VF are the entries of vp8_bilinear_filters_mmx selected by
; xoffset / yoffset (32 bytes per entry, second tap at +16).
;
; Notes grounded in the code below:
;   * The previous row's horizontal result is carried in mm7 as bytes packed
;     with unsigned saturation.
;   * Each source row is read as 8 bytes at [rsi] and 8 bytes at [rsi+1];
;     one more source row than output rows is read.
;   * The loop ends when dst reaches dst_ptr + 4*dst_pitch.
;   * Clobbers rax, rcx, rdx, mm0-mm7 and flags (plus r8 on 64-bit builds).
;     No emms is executed here.
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 bytes per filter entry

    mov         rsi, arg(0)                 ; rsi = src_ptr
    add         rax, rcx                    ; rax = HFilter

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter tap 0

    movq        mm2, [rax+16]               ; mm2 = HFilter tap 1
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4*dst_pitch (loop end)

    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; Horizontal pass for the first source row; result is kept in mm7.
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy of the current line

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] (words)
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] (words)

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] (words)
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] (words)

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = H(row 0) as 8 bytes

    add         rsi, rdx                    ; next source line
.next_row_8x4:
    ; Horizontal pass for the current source row -> mm3 (low), mm4 (high).
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy of the current line

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] (words)
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] (words)

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    ; Previous row's horizontal result times the first vertical tap.
    movq        mm5, mm7
    movq        mm6, mm7

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]
    pmullw      mm6, [rax]

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    ; Save this row's horizontal result for the next iteration.
    movq        mm7, mm3
    packuswb    mm7, mm4

    ; Current row times the second vertical tap, then combine.
    pmullw      mm3, [rax+16]
    pmullw      mm4, [rax+16]

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    packuswb    mm3, mm4

    movq        [rdi], mm3                  ; store 8 result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source line
    add         rdi, dword ptr arg(5)       ; dst += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source line
    add         rdi, r8                     ; dst += dst_pitch
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 4*dst_pitch?
    jne         .next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 4 bytes per row for 4 rows.
;   horizontal:  H(r)[i] = (src[r][i]*HF[0] + src[r][i+1]*HF[1] + 64) >> 7
;   vertical:    dst[r][i] = (H(r)[i]*VF[0] + H(r+1)[i]*VF[1] + 64) >> 7
; where HF / VF are the entries of vp8_bilinear_filters_mmx selected by
; xoffset / yoffset (32 bytes per entry, second tap at +16).
;
; Notes grounded in the code below:
;   * The previous row's horizontal result is carried in mm7 as bytes packed
;     with unsigned saturation.
;   * Each source row is read as 4 bytes at [rsi] and 4 bytes at [rsi+1];
;     one more source row than output rows is read.
;   * The loop ends when dst reaches dst_ptr + 4*dst_pitch.
;   * Clobbers rax, rcx, rdx, mm0, mm1, mm2, mm3, mm5, mm7 and flags
;     (plus r8 on 64-bit builds). No emms is executed here.
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 bytes per filter entry

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ; rsi = src_ptr

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter tap 0

    movq        mm2, [rax+16]               ; mm2 = HFilter tap 1
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4*dst_pitch (loop end)

    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; Horizontal pass for the first source row; result is kept in mm7.
    movd        mm3, [rsi]                  ; mm3 = src[0..3]
    punpcklbw   mm3, mm0                    ; widen to words

    pmullw      mm3, mm1
    movd        mm5, [rsi+1]                ; mm5 = src[1..4]

    punpcklbw   mm5, mm0
    pmullw      mm5, mm2

    paddw       mm3, mm5
    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value

    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    movq        mm7, mm3
    packuswb    mm7, mm0                    ; mm7 = H(row 0) as 4 bytes

    add         rsi, rdx                    ; next source line
.next_row_4x4:
    ; Horizontal pass for the current source row -> mm3.
    movd        mm3, [rsi]                  ; mm3 = src[0..3]
    punpcklbw   mm3, mm0                    ; widen to words

    pmullw      mm3, mm1
    movd        mm5, [rsi+1]                ; mm5 = src[1..4]

    punpcklbw   mm5, mm0
    pmullw      mm5, mm2

    paddw       mm3, mm5

    ; Previous row's horizontal result times the first vertical tap.
    movq        mm5, mm7
    punpcklbw   mm5, mm0

    pmullw      mm5, [rax]
    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value

    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7
    movq        mm7, mm3                    ; save this row for the next iteration

    packuswb    mm7, mm0

    ; Current row times the second vertical tap, then combine.
    pmullw      mm3, [rax+16]
    paddw       mm3, mm5

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    packuswb    mm3, mm0
    movd        [rdi], mm3                  ; store 4 result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source line
    add         rdi, dword ptr arg(5)       ; dst += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source line
    add         rdi, r8                     ; dst += dst_pitch
%endif

    cmp         rdi, rcx                    ; reached dst_ptr + 4*dst_pitch?
    jne         .next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding constant added before the VP8_FILTER_SHIFT right shift:
; four words of 0x40 (64), i.e. half of 128.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters, each made of 6 taps, each tap replicated
; into 8 words (16 bytes). One filter therefore occupies 96 bytes and tap k
; sits at byte offset 16*k, which is how the filter routines in this file
; index their vp8_filter argument. Every filter's taps sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0

; Bilinear filter table: 8 entries of 2 taps, each tap replicated into
; 8 words (16 bytes), so one entry is 32 bytes. The bilinear predictors in
; this file index it as table + offset*32, with the second tap at +16.
; Both taps of every entry sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112