375 lines
6.8 KiB
NASM
375 lines
6.8 KiB
NASM
|
;
|
||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||
|
;
|
||
|
; Use of this source code is governed by a BSD-style license and patent
|
||
|
; grant that can be found in the LICENSE file in the root of the source
|
||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||
|
; file in the root of the source tree.
|
||
|
;
|
||
|
|
||
|
|
||
|
;# Symbols exported by this file.
;# Raw sum / sse getters:
    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
;# Mean squared error:
    .globl vp8_mse16x16_ppc
;# Variance, one entry point per block size:
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc
|
||
|
|
||
|
;# load_aligned_16 V R O
;#
;# Loads 16 bytes from the (possibly unaligned) address in GPR R into
;# vector register V.
;#   V   destination vector register
;#   R   GPR holding the byte address
;#   O   GPR holding the offset of the second quadword; every caller in
;#       this file passes a register containing 16
;# Clobbers v1, v2 and v3.
;# NOTE(review): the second lvx is issued even when R is already aligned,
;# so up to 16 bytes past the requested data are touched.
.macro load_aligned_16 V R O
    lvx     v1,  0, \R          ;# aligned quadword that contains the first byte
    lvx     v2, \O, \R          ;# aligned quadword that follows it
    lvsl    v3,  0, \R          ;# permute control derived from the low address bits

    vperm   \V, v1, v2, v3      ;# pick the 16 wanted bytes out of v1:v2
.endm
|
||
|
|
||
|
;# prologue
;#
;# Common entry sequence for every function in this file.
;#   - opens a 32-byte stack frame (released again by the epilogue macro)
;#   - r10 = 16: used as the second-quadword offset for load_aligned_16
;#     and as the row count by the 16-row functions
;#   - r11 = caller VRSAVE (kept for the epilogue), r12 = scratch
;#   - marks v0-v9 as live in VRSAVE (0xffc0 in the upper halfword)
;#   - v7 = 0 (merge/zero source), v8 = 0 (sum), v9 = 0 (sum of squares)
.macro prologue
    stwu    r1, -32(r1)         ;# allocate the frame, storing the back chain

    li      r10, 16             ;# quadword offset, also the 16-row loop count

    mfspr   r11, 256            ;# r11 = VRSAVE on entry
    oris    r12, r11, 0xffc0    ;# add v0-v9 to the live set
    mtspr   256, r12            ;# publish the new VRSAVE before touching vectors

    vspltisw v9, 0              ;# running total of squared differences
    vspltisw v8, 0              ;# running total of differences
    vspltisw v7, 0              ;# constant zero used when merging
.endm
|
||
|
|
||
|
;# epilogue
;#
;# Undoes the prologue macro: restores the VRSAVE value that the prologue
;# left in r11 and releases the 32-byte stack frame.
.macro epilogue
    mtspr   256, r11            ;# put the caller VRSAVE back

    addi    r1, r1, 32          ;# drop the frame
.endm
|
||
|
|
||
|
;# compute_sum_sse
;#
;# Folds one pair of 16-byte vectors into the running totals.
;#   in:     v4, v5   16 unsigned bytes each (source and reference)
;#           v7       all zero
;#   in/out: v8       per-word partial sums of (v4 - v5)
;#           v9       per-word partial sums of (v4 - v5)^2
;# Clobbers v2 and v3.
.macro compute_sum_sse
    ;# Sum of differences.  Only a halfword signed subtract is
    ;#  available, so the bytes are widened to halfwords by merging
    ;#  with zero.  Low eight bytes first, then the high eight.
    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Sum of squared differences.  The two saturating unsigned
    ;#  subtracts give max(a-b, 0) and max(b-a, 0); or-ing them
    ;#  yields the absolute difference of every byte.
    vsububs v3, v5, v4
    vsububs v2, v4, v5
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9     ;# v9 += products of each byte with itself
.endm
|
||
|
|
||
|
;# variance_16 DS loop_label store_sum
;#
;# Walks CTR rows of 16 pixels, accumulating the sum of differences and the
;# sum of squared differences, then leaves
;#       sse - ((sum * sum) >> DS)
;# in r3.
;#   DS          shift count applied to sum*sum
;#   loop_label  unique label name for the row loop
;#   store_sum   non-zero: also store the sum through r8
;#
;# Expects on entry (register setup done by the caller and the prologue):
;#   r3, r4   source pointer and source stride
;#   r5, r6   reference pointer and reference stride
;#   r7       address that receives sse
;#   r8       address that receives sum (only used when store_sum is set)
;#   r10      16 (second-quadword offset for load_aligned_16)
;#   CTR      number of rows
;#   v7 = 0,  v8 = 0,  v9 = 0
;# Clobbers r4, CTR, v1-v5, v8, v9 and the 16 bytes at 0(r1).
;# NOTE(review): 0(r1) holds the back chain written by stwu in the
;# prologue; using it as scratch overwrites that word.
.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# Either pointer may be unaligned, so both rows use the
    ;#  two-load-and-permute sequence.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    ;# Collapse the four partial words of each total into word 3.
    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    ;# Move word 3 of each vector to a GPR through the stack.
    stvx    v8, 0, r1
    lwz     r3, 12(r1)          ;# r3 = sum

    stvx    v9, 0, r1
    lwz     r4, 12(r1)          ;# r4 = sse

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    ;# sum*sum is never negative, but for 16 rows it can reach
    ;#  65280^2 = 0xFE010000, which needs all 32 bits.  The shift must
    ;#  therefore be logical: an arithmetic shift would sign-extend
    ;#  the product and corrupt the result.
    mullw   r3, r3, r3          ;# sum*sum (low 32 bits, exact as unsigned)
    srwi    r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
|
||
|
|
||
|
;# variance_8 DS loop_label store_sum
;#
;# Same job as variance_16 but for blocks 8 pixels wide.  Each loop
;# iteration consumes TWO rows: the first eight bytes of both rows are
;# interleaved into one 16-byte vector before being accumulated, so CTR
;# must hold half the number of rows.  Leaves
;#       sse - ((sum * sum) >> DS)
;# in r3.
;#   DS          shift count applied to sum*sum
;#   loop_label  unique label name for the row loop
;#   store_sum   non-zero: also store the sum through r8
;#
;# Expects on entry:
;#   r3, r4   source pointer and source stride
;#   r5, r6   reference pointer and reference stride
;#   r7       address that receives sse
;#   r8       address that receives sum (only used when store_sum is set)
;#   r10      16 (second-quadword offset for load_aligned_16)
;#   CTR      number of row pairs
;#   v7 = 0,  v8 = 0,  v9 = 0
;# Clobbers r4, CTR, v0-v6, v8, v9 and the 16 bytes at 0(r1).
.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# Two consecutive source rows -> v4, v6
    load_aligned_16 v4, r3, r10
    add     r3, r3, r4
    load_aligned_16 v6, r3, r10
    add     r3, r3, r4

    ;# Two consecutive reference rows -> v5, v0
    load_aligned_16 v5, r5, r10
    add     r5, r5, r6
    load_aligned_16 v0, r5, r10
    add     r5, r5, r6

    ;# Interleave the first eight bytes of each row pair so that one
    ;#  vector carries the 16 pixels of interest.
    vmrghb  v5, v5, v0
    vmrghb  v4, v4, v6

    compute_sum_sse

    bdnz    \loop_label

    ;# Collapse the four partial words of each total into word 3.
    vsumsws v9, v9, v7
    vsumsws v8, v8, v7

    ;# Move word 3 of each vector to a GPR through the stack.
    stvx    v8, 0, r1
    lwz     r3, 12(r1)          ;# r3 = sum

    stvx    v9, 0, r1
    lwz     r4, 12(r1)          ;# r4 = sse

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
|
||
|
|
||
|
.align 2
;# vp8_get8x8var_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *SSE
;#   r8  int           *Sum
;#
;# Result:
;#   r3  return value (as left by variance_8); *SSE and *Sum are written
vp8_get8x8var_ppc:
    li      r9, 4               ;# 8 rows, handled two per iteration
    prologue
    mtctr   r9

    variance_8 6, get8x8var_loop, 1     ;# shift by 6 = log2(8*8)

    epilogue
    blr
|
||
|
|
||
|
.align 2
;# vp8_get16x16var_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *SSE
;#   r8  int           *Sum
;#
;# Result:
;#   r3  return value (as left by variance_16); *SSE and *Sum are written
vp8_get16x16var_ppc:
    prologue
    mtctr   r10                 ;# 16 rows; the prologue leaves 16 in r10

    variance_16 8, get16x16var_loop, 1  ;# shift by 8 = log2(16*16)

    epilogue
    blr
|
||
|
|
||
|
.align 2
;# vp8_mse16x16_ppc
;#
;# Sum of squared differences over a 16x16 block.
;#
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value (the same total that is stored through r7)
vp8_mse16x16_ppc:
    prologue

    mtctr   r10                 ;# 16 rows; the prologue leaves 16 in r10

mse16x16_loop:
    ;# Either pointer may be unaligned, so both rows use the
    ;#  two-load-and-permute sequence.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Absolute difference of every byte: one of the two saturating
    ;#  subtracts is zero, the other holds the magnitude.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    ;# v9 += products of each byte difference with itself
    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    ;# Collapse the four partial words into word 3 (v7 is zero).
    vsumsws v9, v9, v7

    ;# Move word 3 to r3 through the stack.  One store/load pair is
    ;#  sufficient; the original sequence issued it twice.
    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr
|
||
|
|
||
|
.align 2
;# vp8_variance16x16_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *sse
;#
;# Result:
;#   r3  return value (as left by variance_16); *sse is written
vp8_variance16x16_ppc:
    prologue
    mtctr   r10                 ;# 16 rows; the prologue leaves 16 in r10

    variance_16 8, variance16x16_loop, 0    ;# shift by 8 = log2(16*16)

    epilogue
    blr
|
||
|
|
||
|
.align 2
;# vp8_variance16x8_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *sse
;#
;# Result:
;#   r3  return value (as left by variance_16); *sse is written
vp8_variance16x8_ppc:
    li      r9, 8               ;# 8 rows, one per iteration
    prologue
    mtctr   r9

    variance_16 7, variance16x8_loop, 0     ;# shift by 7 = log2(16*8)

    epilogue
    blr
|
||
|
|
||
|
.align 2
;# vp8_variance8x16_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *sse
;#
;# Result:
;#   r3  return value (as left by variance_8); *sse is written
vp8_variance8x16_ppc:
    li      r9, 8               ;# 16 rows, handled two per iteration
    prologue
    mtctr   r9

    variance_8 7, variance8x16_loop, 0      ;# shift by 7 = log2(8*16)

    epilogue
    blr
|
||
|
|
||
|
.align 2
;# vp8_variance8x8_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *sse
;#
;# Result:
;#   r3  return value (as left by variance_8); *sse is written
vp8_variance8x8_ppc:
    li      r9, 4               ;# 8 rows, handled two per iteration
    prologue
    mtctr   r9

    variance_8 6, variance8x8_loop, 0       ;# shift by 6 = log2(8*8)

    epilogue
    blr
|
||
|
|
||
|
;# transfer_4x4 I P
;#
;# Gathers one 32-bit word from each of four consecutive rows and packs
;# them into the 16 bytes at 0(r1), ready to be fetched with a single lvx.
;#   I   GPR holding the address of the first row; left pointing at the
;#       fourth row on exit
;#   P   GPR holding the distance between rows
;# Clobbers r0, r8, r9, r10 and the 16 bytes at 0(r1).
.macro transfer_4x4 I P
    lwz     r0,  0(\I)          ;# row 0
    lwzux   r10, \I, \P         ;# row 1, pointer advanced by the load
    lwzux   r8,  \I, \P         ;# row 2
    lwzux   r9,  \I, \P         ;# row 3

    stw     r0,  0(r1)
    stw     r10, 4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm
|
||
|
|
||
|
.align 2
;# vp8_variance4x4_ppc
;#
;# Arguments:
;#   r3  unsigned char *src_ptr
;#   r4  int            source_stride
;#   r5  unsigned char *ref_ptr
;#   r6  int            recon_stride
;#   r7  unsigned int  *sse
;#
;# Result:
;#   r3  return value: sse - ((sum*sum) >> 4); *sse is written
vp8_variance4x4_ppc:
    prologue

    ;# Pack the four 4-byte source rows into one vector via the stack.
    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    ;# Same for the reference block.
    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    ;# All 16 pixels fit in one vector, so a single pass is enough.
    compute_sum_sse

    ;# Collapse the four partial words of each total into word 3.
    vsumsws v9, v9, v7
    vsumsws v8, v8, v7

    ;# Move word 3 of each vector to a GPR through the stack.
    stvx    v8, 0, r1
    lwz     r3, 12(r1)          ;# r3 = sum

    stvx    v9, 0, r1
    lwz     r4, 12(r1)          ;# r4 = sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    stw     r4, 0(r7)           ;# sse

    epilogue
    blr
|