216 lines
6.1 KiB
NASM
216 lines
6.1 KiB
NASM
|
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
|
||
|
|
||
|
%include "vpx_ports/x86_abi_support.asm"
|
||
|
|
||
|
; TABULATE_SSIM - fold one batch of eight source/reference samples into the
; running sums sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;
; In:    xmm3 = eight source samples as 16-bit words
;        xmm4 = eight reference samples as 16-bit words
;        (the callers in this file produce these by unpacking pixel bytes
;        against the zero register)
; Acc:   xmm15 += s      sum_s     word lanes, unsigned saturating add
;        xmm14 += r      sum_r     word lanes, unsigned saturating add
;        xmm13 += s*s    sum_sq_s  dword lanes
;        xmm12 += r*r    sum_sq_r  dword lanes
;        xmm11 += s*r    sum_sxr   dword lanes
; Clobb: xmm1, xmm2, xmm3
;
; pmaddwd yields four 32-bit pair sums, so the products are accumulated with
; paddd to match that lane width. A 64-bit paddq would let a carry out of an
; even dword spill into the odd dword sitting above it.
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3             ; sum_s
        paddusw         xmm14, xmm4             ; sum_r

        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1              ; s*s, adjacent pairs summed
        paddd           xmm13, xmm1             ; sum_sq_s

        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2              ; r*r, adjacent pairs summed
        paddd           xmm12, xmm2             ; sum_sq_r

        pmaddwd         xmm3, xmm4              ; s*r, adjacent pairs summed
        paddd           xmm11, xmm3             ; sum_sxr
%endmacro
|
||
|
|
||
|
; SUM_ACROSS_Q - horizontal sum of the four 32-bit lanes of register %1.
; The lanes are zero-extended to qwords first, so the total is 64 bits wide.
;
; In:    %1   = xmm register holding four dword partial sums
;        xmm0 = all zero (used as the unpack partner for zero-extension)
; Out:   %1   = total in the low qword, zero in the high qword
; Clobb: xmm2 (so %1 must be neither xmm0 nor xmm2)
%macro SUM_ACROSS_Q 1
        ; widen: split the four dwords into two registers of two qwords each
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0              ; xmm2 = { d2, d3 }
        punpckldq       %1, xmm0                ; %1   = { d0, d1 }
        paddq           %1, xmm2                ; %1   = { d0+d2, d1+d3 }

        ; fold: add the high qword onto the low one
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0              ; xmm2 = { d1+d3, 0 }
        punpcklqdq      %1, xmm0                ; %1   = { d0+d2, 0 }
        paddq           %1, xmm2                ; %1   = { d0+d1+d2+d3, 0 }
%endmacro
|
||
|
|
||
|
; SUM_ACROSS_W - horizontal sum of the eight 16-bit lanes of register %1.
; The words are zero-extended to dwords and paired up, then SUM_ACROSS_Q
; finishes the reduction of the resulting four dwords.
;
; In:    %1   = xmm register holding eight word partial sums
;        xmm0 = all zero (used as the unpack partner for zero-extension)
; Out:   %1   = total in the low qword, zero in the high qword
; Clobb: xmm1, and xmm2 through SUM_ACROSS_Q
;        (so %1 must be none of xmm0, xmm1, xmm2)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0              ; xmm1 = { w4, w5, w6, w7 }
        punpcklwd       %1, xmm0                ; %1   = { w0, w1, w2, w3 }
        paddd           %1, xmm1                ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
|
||
|
;void vp8_ssim_parms_16x16_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 pixels from s (row stride sp) and r (row stride rp) and
; stores five totals through the output pointers: sum of s, sum of r, sum of
; s*s, sum of r*r and sum of s*r. The outputs are overwritten, not added to.
;
; Register use:
;    rsi = current s row          rdi = current r row (later: output pointer)
;    rcx = sp                     rax = rp
;    rdx = rows remaining
;    xmm0 = zero                  xmm5/xmm6 = raw s/r pixels of the row
;    xmm3/xmm4 = widened s/r      xmm1/xmm2 = scratch inside the macros
;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
;
; NOTE(review): xmm6 and xmm11-xmm15 are overwritten and never restored. The
;   Microsoft x64 convention treats xmm6-xmm15 as callee-saved - confirm
;   whether a save/restore is needed when this is built for Win64.
; NOTE(review): every result is written with a 64-bit movq. That matches an
;   8-byte unsigned long; confirm the caller's buffers on targets where
;   unsigned long is 4 bytes.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; The strides are declared int: only the low 32 bits of the argument
        ; slot are defined, so sign-extend them rather than loading 64 bits.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0, xmm0              ; zero, unpack partner
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 16                 ; row counter
.next_row:
        ; grab source and reference pixels (no alignment assumed)
        movdqu          xmm5, [rsi]
        movdqu          xmm6, [rdi]

        ; upper eight pixels of the row, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpckhbw       xmm3, xmm0              ; high_s
        punpckhbw       xmm4, xmm0              ; high_r
        TABULATE_SSIM

        ; lower eight pixels of the row, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r
        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row

        dec             rdx                     ; counter
        jnz             .next_row

        ; collapse each accumulator to a single total in its low qword
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        mov             rdi, arg(4)
        movq            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movq            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movq            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movq            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movq            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
||
|
|
||
|
;void vp8_ssim_parms_8x8_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 8 rows of 8 pixels from s (row stride sp) and r (row stride rp) and
; stores five totals through the output pointers: sum of s, sum of r, sum of
; s*s, sum of r*r and sum of s*r. The outputs are overwritten, not added to.
;
; Register use:
;    rsi = current s row          rdi = current r row (later: output pointer)
;    rcx = sp                     rax = rp
;    rdx = rows remaining
;    xmm0 = zero                  xmm5/xmm6 = raw s/r pixels of the row
;    xmm3/xmm4 = widened s/r      xmm1/xmm2 = scratch inside the macros
;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
;
; NOTE(review): xmm6 and xmm11-xmm15 are overwritten and never restored. The
;   Microsoft x64 convention treats xmm6-xmm15 as callee-saved - confirm
;   whether a save/restore is needed when this is built for Win64.
; NOTE(review): every result is written with a 64-bit movq. That matches an
;   8-byte unsigned long; confirm the caller's buffers on targets where
;   unsigned long is 4 bytes.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; The strides are declared int: only the low 32 bits of the argument
        ; slot are defined, so sign-extend them rather than loading 64 bits.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0, xmm0              ; zero, unpack partner
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 8                  ; row counter
.next_row:
        ; grab source and reference pixels: 8 bytes each, upper half zeroed
        movq            xmm5, [rsi]
        movq            xmm6, [rdi]

        ; widen the eight pixels of the row to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r
        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row

        dec             rdx                     ; counter
        jnz             .next_row

        ; collapse each accumulator to a single total in its low qword
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        mov             rdi, arg(4)
        movq            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movq            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movq            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movq            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movq            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|