;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Two-pass thresholded 5-tap smoothing filter, 8 pixels per iteration.
;
; For every row:
;   1. "down" pass: for each group of 8 pixels, read source rows r-2, r-1,
;      r0, r+1, r+2, compute (4*r0 + r-2 + r-1 + r+1 + r+2 + RD42) >> 3 in
;      16-bit lanes, and write the result to dst_ptr.  A lane keeps its
;      original r0 value instead if any of the four neighbours differs from
;      r0 by more than flimit.
;   2. "across" pass: the same filter applied horizontally, in place, on the
;      destination row just written (neighbours x-2, x-1, x+1, x+2).
;
; Register roles:
;   rsi = source pointer          rdi = destination pointer
;   rax = source pitch (negated temporarily inside the down pass)
;   rcx = rows remaining          rdx = column offset within the row
;   xmm0 = zero (used for byte->word unpack and for packing)
;   xmm2 = low 16 bits of flimit broadcast to all 8 word lanes
;   mm0  = previous 8 filtered pixels of the across pass (store is delayed)
;
; Notes grounded in the code below:
;   - Both column loops and the row loop test their counter after the body,
;     so at least one 8-pixel group / one row is always processed, and the
;     column count is effectively rounded up to a multiple of 8.
;   - The down pass reads two rows above and two rows below the current
;     source row.  The across pass reads dst bytes from 2 before the row
;     start up to 9 past the last group start, and rewrites the 8 bytes
;     preceding the row start with the value it loaded from there.
;   - RD42 is the rounding constant rd42, which is defined outside this
;     excerpt.
;   - NOTE(review): mm0 is used and no emms is executed before ret;
;     presumably callers clear the MMX state themselves -- confirm.
global sym(vp8_post_proc_down_and_across_xmm)
sym(vp8_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0,       [rd42 GLOBAL]
    sub         rsp,        16
    movdqa      [rsp],      xmm0
%define RD42 [rsp]
%else
%define RD42 [rd42 GLOBAL]
%endif

    ; broadcast the low word of flimit to all eight word lanes of xmm2
    movd        xmm2,       dword ptr arg(6) ;flimit
    punpcklwd   xmm2,       xmm2
    punpckldq   xmm2,       xmm2
    punpcklqdq  xmm2,       xmm2

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(1) ;dst_ptr

    movsxd      rcx,        DWORD PTR arg(4) ;rows
    movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
    pxor        xmm0,       xmm0              ; xmm0 = 0

nextrow:

    xor         rdx,        rdx       ; rdx = column offset, used as loop counter

nextcol:
    movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
    punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
    movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, kept as centre
    psllw       xmm3,       2                       ; xmm3 = 4 * r0

    movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
    paddusw     xmm3,       xmm5                    ; sum += r1

    ; thresholding
    movdqa      xmm7,       xmm1                    ; xmm7 = r0
    psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
    paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
    pcmpgtw     xmm7,       xmm2                    ; mask = abs(r0 - r1) > flimit

    movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
    paddusw     xmm3,       xmm5                    ; sum += r2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    neg         rax                                 ; rax = -pitch, to address rows above
    movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
    paddusw     xmm3,       xmm5                    ; sum += r-2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
    punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
    paddusw     xmm3,       xmm4                    ; sum += r-1

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
    psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
    paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    paddusw     xmm3,       RD42                    ; sum += round value
    psraw       xmm3,       3                       ; sum /= 8

    pand        xmm1,       xmm7                    ; keep source where a threshold was exceeded
    pandn       xmm7,       xmm3                    ; take blurred result everywhere else
    paddusw     xmm1,       xmm7                    ; combine the two (disjoint lanes)
    packuswb    xmm1,       xmm0                    ; pack to bytes
    movq        QWORD PTR [rdi], xmm1               ;

    neg         rax                                 ; pitch is positive again
    add         rsi,        8
    add         rdi,        8

    add         rdx,        8
    cmp         edx,        dword arg(5) ;cols

    jl          nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi,        rdx                     ; rewind both pointers to row start
    sub         rdi,        rdx

    xor         rdx,        rdx
    movq        mm0,        QWORD PTR [rdi-8];      ; seed the delayed store with the
                                                    ; 8 bytes preceding the row

acrossnextcol:
    ; gather dst[x-2 .. x+9] (12 bytes) into xmm4
    movq        xmm7,       QWORD PTR [rdi +rdx -2]
    movd        xmm4,       DWORD PTR [rdi +rdx +6]

    pslldq      xmm4,       8
    por         xmm4,       xmm7                    ; xmm4 = p-2..p9

    movdqa      xmm3,       xmm4
    psrldq      xmm3,       2
    punpcklbw   xmm3,       xmm0                    ; xmm3 = p0..p7 (words)
    movdqa      xmm1,       xmm3                    ; xmm1 = p0..p7, kept as centre
    psllw       xmm3,       2                       ; xmm3 = 4 * centre

    movdqa      xmm5,       xmm4
    psrldq      xmm5,       3
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p1..p8
    paddusw     xmm3,       xmm5                    ; sum += p1..p8

    ; thresholding
    movdqa      xmm7,       xmm1                    ; xmm7 = p0..p7
    psubusw     xmm7,       xmm5                    ; xmm7 = sat(p0..p7 - p1..p8)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(p1..p8 - p0..p7)
    paddusw     xmm7,       xmm5                    ; xmm7 = abs(p0..p7 - p1..p8)
    pcmpgtw     xmm7,       xmm2                    ; mask = abs(...) > flimit

    movdqa      xmm5,       xmm4
    psrldq      xmm5,       4
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p2..p9
    paddusw     xmm3,       xmm5                    ; sum += p2..p9

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = p0..p7
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(p0..p7 - p2..p9)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(p2..p9 - p0..p7)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(p0..p7 - p2..p9)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    movdqa      xmm5,       xmm4                    ; xmm5 = p-2..p9 (bytes)
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p-2..p5
    paddusw     xmm3,       xmm5                    ; sum += p-2..p5

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = p0..p7
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(p0..p7 - p-2..p5)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(p-2..p5 - p0..p7)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(p0..p7 - p-2..p5)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    psrldq      xmm4,       1                       ; xmm4 = p-1..p9 (bytes)
    punpcklbw   xmm4,       xmm0                    ; xmm4 = p-1..p6
    paddusw     xmm3,       xmm4                    ; sum += p-1..p6

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = p0..p7
    psubusw     xmm6,       xmm4                    ; xmm6 = sat(p0..p7 - p-1..p6)
    psubusw     xmm4,       xmm1                    ; xmm4 = sat(p-1..p6 - p0..p7)
    paddusw     xmm6,       xmm4                    ; xmm6 = abs(p0..p7 - p-1..p6)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    paddusw     xmm3,       RD42                    ; sum += round value
    psraw       xmm3,       3                       ; sum /= 8

    pand        xmm1,       xmm7                    ; keep source where a threshold was exceeded
    pandn       xmm7,       xmm3                    ; take blurred result everywhere else
    paddusw     xmm1,       xmm7                    ; combine the two (disjoint lanes)

    packuswb    xmm1,       xmm0                    ; pack to bytes
    ; The store lags one iteration behind the computation, so the loads at the
    ; top of the next iteration still see this group's unfiltered pixels.
    movq        QWORD PTR [rdi+rdx-8],  mm0         ; store previous eight bytes
    movdq2q     mm0,        xmm1                    ; hold current result for next store

    add         rdx,        8
    cmp         edx,        dword arg(5) ;cols
    jl          acrossnextcol;

    ; last 8 pixels
    movq        QWORD PTR [rdi+rdx-8],  mm0

    ; done with this row
    add         rsi,        rax                     ; next source line
    ; Sign-extend the reloaded pitches exactly as the prolog does; a plain
    ; 32-bit mov would zero-extend and break negative strides on 64-bit.
    movsxd      rax,        DWORD PTR arg(3) ;dst_pixels_per_line
    add         rdi,        rax                     ; next destination line
    movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line

    dec         rcx                                 ; decrement count
    jnz         nextrow                             ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp,        16                      ; drop the stack copy of rd42
    pop         rsp                                 ; undo ALIGN_STACK
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42

;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm)
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [sym(vp8_rv) GLOBAL]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c