;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;
; SSE2 variance / sum-of-squares kernels.
;
; Syntax: NASM/yasm, Intel operand order (op dst, src).
; ABI:    arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; the
;         32-bit and 64-bit variants are selected with ABI_IS_32BIT.  Those
;         helpers, together with sym(), GET_GOT/RESTORE_GOT, UNSHADOW_ARGS,
;         GLOBAL and SECTION_RODATA, come from x86_abi_support.asm.
;
; NOTE(review): several routines below overwrite xmm6/xmm7 without saving
; them.  These registers are callee-saved under the Microsoft x64 calling
; convention -- confirm how that target is handled before relying on it.
;

%include "vpx_ports/x86_abi_support.asm"

; The bilinear filter output is rounded with xmm_bi_rd (64) and then shifted
; right by this many bits.
%define xmm_filter_shift            7

;-----------------------------------------------------------------------------
; VAR_STORE_SUM_AND_SSE sum_ptr_reg, sse_ptr_reg
;
; Horizontal reduction shared by the filtered / half-pel variance routines.
; Uses SSE2 registers only, so no MMX state is touched and no emms is needed
; afterwards.
;
;   In:    xmm6 = eight signed 16-bit partial sums of differences
;          xmm7 = four 32-bit partial sums of squared differences
;          %1   = register holding the address of the 32-bit sum output
;          %2   = register holding the address of the 32-bit sse output
;   Out:   dword [%1] = 16-bit total of the xmm6 lanes, sign-extended
;          dword [%2] = 32-bit total of the xmm7 lanes
;   Clobb: xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------------
%macro VAR_STORE_SUM_AND_SSE 2
    movdqa      xmm5, xmm6
    psrldq      xmm5, 8
    paddw       xmm6, xmm5              ; words 0-3 = lane[i] + lane[i+4]
    movdqa      xmm5, xmm6
    psrldq      xmm5, 4
    paddw       xmm6, xmm5              ; words 0-1 = lane[i] + lane[i+2]
    movdqa      xmm5, xmm6
    psrldq      xmm5, 2
    paddw       xmm6, xmm5              ; word 0 = total of all eight lanes

    pxor        xmm5, xmm5
    punpcklwd   xmm5, xmm6              ; word 0 -> upper half of dword 0
    psrad       xmm5, 16                ; arithmetic shift sign-extends it

    movdqa      xmm4, xmm7
    psrldq      xmm4, 8
    paddd       xmm7, xmm4              ; dwords 0-1 = lane[i] + lane[i+2]
    movdqa      xmm4, xmm7
    psrldq      xmm4, 4
    paddd       xmm7, xmm4              ; dword 0 = total of all four lanes

    movd        DWORD PTR [%1], xmm5    ; sum
    movd        DWORD PTR [%2], xmm7    ; sum of squares
%endmacro

;-----------------------------------------------------------------------------
; VAR8_ACCUM_ROW src_address, ref_address
;
; Processes one 8-pixel row for vp8_get8x8var_sse2.
;
;   In:    xmm0 = zero, %1 / %2 = effective addresses of the two rows
;   Out:   xmm7 += per-column differences (words)
;          xmm1 += pairwise sums of squared differences (dwords)
;   Clobb: xmm2, xmm3
;-----------------------------------------------------------------------------
%macro VAR8_ACCUM_ROW 2
    movq        xmm2, QWORD PTR [%1]
    movq        xmm3, QWORD PTR [%2]

    punpcklbw   xmm2, xmm0              ; bytes -> words
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2
%endmacro


;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of 256 consecutive 16-bit values (8 iterations
; of 32 values each).  src_ptr is read with movdqa and therefore has to be
; 16-byte aligned.
global sym(vp8_get_mb_ss_sse2)
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 blocks of 64 bytes
    pxor        xmm4, xmm4              ; four dword accumulators

NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    pmaddwd     xmm0, xmm0              ; square and add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    jnz         NEXTROW                 ; dec leaves CF untouched, so test ZF only

    ; fold the four dword lanes into lane 0
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3

    movd        eax, xmm4               ; 32-bit result only

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; Rows are loaded with movdqu, so no alignment is required.  On exit rax
; still holds the Sum pointer; no separate return value is produced.
global sym(vp8_get16x16var_sse2)
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs
    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse

    mov         rcx, 16                 ; row counter

var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0              ; low 8 pixels -> words
    punpckhbw   xmm3, xmm0              ; high 8 pixels -> words
    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1
    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         var16loop

    movdqa      xmm1, xmm6              ; xmm1 = sse lanes
    pxor        xmm6, xmm6
    pxor        xmm5, xmm5

    ; sign-extend the eight word sums: place each word in the upper half of
    ; a dword, then shift right arithmetically
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16
    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0

    movdqa      xmm7, xmm6
    paddd       xmm1, xmm2

    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; dword 0 = Sum
    paddd       xmm1, xmm2              ; dword 0 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16pred_error_sse2
;(
;    unsigned char   *src_ptr,
;    int              src_stride,
;    unsigned char   *ref_ptr,
;    int              ref_stride
;)
;
; Same 16-row x 16-byte accumulation as vp8_get16x16var_sse2, but the result
; is returned in rax as  SSE - ((Sum * Sum) >> 8).  Sum and SSE are spilled
; to the 16 bytes of stack reserved in the prolog.
global sym(vp8_get16x16pred_error_sse2)
sym(vp8_get16x16pred_error_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[src_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[ref_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs
    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse

    mov         rcx, 16                 ; row counter

var16peloop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0
    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1
    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         var16peloop

    movdqa      xmm1, xmm6              ; xmm1 = sse lanes
    pxor        xmm6, xmm6
    pxor        xmm5, xmm5

    ; sign-extend the eight word sums to dwords
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16
    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0

    movdqa      xmm7, xmm6
    paddd       xmm1, xmm2

    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6
    paddd       xmm1, xmm2

    movd        DWORD PTR [rsp], xmm7   ;Sum
    movd        DWORD PTR [rsp+4], xmm1 ;SSE

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx
    sar         rdx, 8
    movsxd      rax, dword ptr [rsp + 4]
    sub         rax, rdx

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Fully unrolled 8-row x 8-byte version:
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; rsi/rdi advance two rows at a time; the odd row is addressed as
; [ptr + stride] and the following even row as [ptr + stride * 2].
; On exit rax still holds the Sum pointer; no separate return value is
; produced.
global sym(vp8_get8x8var_sse2)
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; row 0 initialises the sse accumulator (xmm1)
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    VAR8_ACCUM_ROW rsi + rax, rdi + rdx             ; row 1
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2     ; row 2

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    VAR8_ACCUM_ROW rsi + rax, rdi + rdx             ; row 3
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2     ; row 4

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    VAR8_ACCUM_ROW rsi + rax, rdi + rdx             ; row 5
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2     ; row 6

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    VAR8_ACCUM_ROW rsi + rax, rdi + rdx             ; row 7

    ; The sum is reduced with 16-bit adds (paddw); only the low word of the
    ; result is meaningful and it is sign-extended just before the store.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0

    movdqa      xmm2, xmm1
    paddw       xmm6, xmm7

    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0

    movdqa      xmm7, xmm6
    paddd       xmm1, xmm2

    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6              ; word 0 = Sum
    paddd       xmm1, xmm2              ; dword 0 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit sum

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal filter followed by a two-tap vertical filter
; to an 8-pixel-wide column of ref_ptr and accumulates the difference from
; src_ptr:
;   *sum        = sum of (filtered - src)
;   *sumsquared = sum of (filtered - src)^2
;
; Each filter is read as two 8-word vectors, [ptr] and [ptr+16], used as
; direct memory operands of pmullw (so they must be 16-byte aligned).
; Height + 1 rows of ref_ptr are read, 9 bytes from each; Height must be
; non-zero because the loop tests its counter only after the first pass.
global sym(vp8_filter_block2d_bil_var_sse2)
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        xmm6, xmm6              ; difference accumulator (words)
    pxor        xmm7, xmm7              ; squared difference accumulator (dwords)

    mov         rax, arg(5)             ;HFilter
    mov         rdx, arg(6)             ;VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr

    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero for byte -> word unpack

    ; horizontally filter the first row; it is kept in xmm5 as the
    ; "previous row" input of the vertical filter
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, [xmm_bi_rd GLOBAL]    ; rounding
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm5, xmm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
%else
    ; strides are loop-invariant: load them once
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
    add         rsi, r8
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontal pass on the current row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, [xmm_bi_rd GLOBAL]
    psraw       xmm1, xmm_filter_shift

    ; vertical pass: previous row (xmm5) with the current row (xmm1)
    movdqa      xmm3, xmm5
    movdqa      xmm5, xmm1              ; current row becomes the previous one

    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]

    paddw       xmm1, xmm3
    paddw       xmm1, [xmm_bi_rd GLOBAL]
    psraw       xmm1, xmm_filter_shift

    ; difference from the source row
    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         filter_block2d_bil_var_sse2_loop

    mov         rsi, arg(7)             ; sum
    mov         rdi, arg(8)             ; sumsquared

    VAR_STORE_SUM_AND_SSE rsi, rdi      ; [rsi] = xsum, [rdi] = xxsum

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each reference pixel with its right neighbour (pavgb), then
; averages that row with the equally treated row below, and accumulates the
; difference from src_ptr over Height rows:
;   *sum        = sum of (avg - src)
;   *sumsquared = sum of (avg - src)^2
; Eight pixels are processed per row (movq loads).  Height + 1 rows of
; ref_ptr are read; Height must be non-zero.
global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero for byte -> word unpack

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
%else
    add         rsi, r8
%endif

vp8_half_horiz_vert_variance16x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the two lines
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += squared column differences

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance16x_h_1

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    VAR_STORE_SUM_AND_SSE rsi, rdi

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each reference row with the row below it (pavgb) and accumulates
; the difference from src_ptr over Height rows:
;   *sum        = sum of (avg - src)
;   *sumsquared = sum of (avg - src)^2
; Eight pixels are processed per row (movq loads).  Height + 1 rows of
; ref_ptr are read; Height must be non-zero.
global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line (offset to the row below)

    pxor        xmm0, xmm0              ; zero for byte -> word unpack

vp8_half_vert_variance16x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; xmm5 = current row, s0..s7
    movq        xmm3, QWORD PTR [rsi+rax]   ; xmm3 = same columns, next row

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += squared column differences

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         vp8_half_vert_variance16x_h_1

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    VAR_STORE_SUM_AND_SSE rsi, rdi

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each reference pixel with its right neighbour (pavgb) and
; accumulates the difference from src_ptr over Height rows:
;   *sum        = sum of (avg - src)
;   *sumsquared = sum of (avg - src)^2
; Eight pixels are processed per row (movq loads); 9 bytes of each ref_ptr
; row are read.  Height must be non-zero.
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero for byte -> word unpack

vp8_half_horiz_variance16x16_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += squared column differences

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         vp8_half_horiz_variance16x16_1

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    VAR_STORE_SUM_AND_SSE rsi, rdi

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; rounding term added before the shift by xmm_filter_shift
align 16
xmm_bi_rd:
    times 8 dw 64