189 lines
4.6 KiB
NASM
189 lines
4.6 KiB
NASM
;
|
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license and patent
|
|
; grant that can be found in the LICENSE file in the root of the source
|
|
; tree. All contributing project authors may be found in the AUTHORS
|
|
; file in the root of the source tree.
|
|
;
|
|
|
|
|
|
;# Make the routine's entry label visible to the linker.
.globl short_idct4x4llm_ppc
|
|
|
|
;# load_c V, LABEL, OFF, R0, R1
;#   Load the 16 bytes found at LABEL into vector register V.
;#     V      destination vector register
;#     LABEL  symbol of the constant; its address is formed with the
;#            @ha/@l halves, i.e. absolute (non-PIC) addressing
;#     OFF    first address operand of lvx: 0 for none, or a GPR index
;#     R0     scratch GPR, receives the high-adjusted half of the address
;#     R1     scratch GPR, receives the complete address
;#   Both R0 and R1 are overwritten. lvx ignores the low four bits of the
;#   effective address, so LABEL is expected to be 16-byte aligned.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    addi    \R1, \R0, \LABEL@l          ;# long form of the "la" mnemonic
    lvx     \V, \OFF, \R1
.endm
|
|
|
|
;# ---------------------------------------------------------------------
;# short_idct4x4llm_ppc
;#
;# 4x4 inverse transform on 16 shorts, computed with AltiVec as two
;# one-dimensional passes with a transpose between them.  The result is
;# rounded with (x + 4) >> 3 and written as four rows of four shorts.
;#
;# In:
;#   r3  short *input    16 coefficients, fetched with two lvx loads
;#                       (lvx drops the low 4 address bits, so the
;#                       buffer is assumed to be 16-byte aligned)
;#   r4  short *output   destination of the first output row
;#   r5  int    pitch    added to r4 between rows (byte offset)
;#
;# Scratch: r4 (advanced row by row), r6, r9, r10, r11, r12, v0-v12.
;# VRSAVE is extended to cover v0-v12 on entry and restored on exit.
;# A temporary 416-byte stack frame is used to move vector results to
;# GPRs; r1 is assumed 16-byte aligned so the stvx spill lands where the
;# lwz loads read it back.
;#
;# Multiply idiom used throughout (x = low halfword of each word):
;#   vmulosh is a signed multiply.  sinpi8sqrt2 (35468) does not fit in
;#   a signed halfword and is seen as 35468 - 65536, so
;#   ((x * k) >> 16) + x yields (x * 35468) >> 16.
;#   cospi8sqrt2minus1 (20091) fits, so the same sequence yields
;#   x + ((x * 20091) >> 16).
;# ---------------------------------------------------------------------
    .align 2
short_idct4x4llm_ppc:
    mfspr   r11, 256                    ;# r11 = caller's VRSAVE
    oris    r12, r11, 0xfff8            ;# add v0-v12 to the live set
    mtspr   256, r12                    ;# install new VRSAVE

    ;# constants (r9/r10 are scratch for the address computation)
    load_c  v8,  sinpi8sqrt2,       0, r9, r10
    load_c  v9,  cospi8sqrt2minus1, 0, r9, r10
    load_c  v10, hi_hi,             0, r9, r10
    load_c  v11, lo_lo,             0, r9, r10
    load_c  v12, shift_16,          0, r9, r10

    li      r10, 16
    lvx     v0, 0, r3                   ;# ip[0..3]  | ip[4..7]
    lvx     v1, r10, r3                 ;# ip[8..11] | ip[12..15]

    ;# ---------------- first pass ----------------
    vupkhsh v2, v0                      ;# ip[0..3]  widened to words
    vupkhsh v3, v1                      ;# ip[8..11] widened to words
    vaddsws v6, v2, v3                  ;# a1 = ip[0] + ip[8]
    vsubsws v7, v2, v3                  ;# b1 = ip[0] - ip[8]

    vupklsh v0, v0                      ;# ip[4..7]   widened to words
    vmulosh v4, v0, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v0                  ;# ip[4] * sin(pi/8) * sqrt(2)

    vupklsh v1, v1                      ;# ip[12..15] widened to words
    vmulosh v5, v1, v9
    vsraw   v5, v5, v12
    vaddsws v5, v5, v1                  ;# ip[12] * cos(pi/8) * sqrt(2)

    vsubsws v4, v4, v5                  ;# c1

    vmulosh v3, v1, v8
    vsraw   v3, v3, v12
    vaddsws v3, v3, v1                  ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v0, v9
    vsraw   v5, v5, v12
    vaddsws v5, v5, v0                  ;# ip[4] * cos(pi/8) * sqrt(2)

    vaddsws v3, v3, v5                  ;# d1

    vaddsws v0, v6, v3                  ;# a = a1 + d1
    vsubsws v3, v6, v3                  ;# d = a1 - d1

    vaddsws v1, v7, v4                  ;# b = b1 + c1
    vsubsws v2, v7, v4                  ;# c = b1 - c1

    ;# ------- transpose the 4x4 block of words (a,b,c,d = v0..v3) -------
    vmrghw  v4, v0, v1                  ;# a0 b0 a1 b1
    vmrghw  v5, v2, v3                  ;# c0 d0 c1 d1

    vmrglw  v6, v0, v1                  ;# a2 b2 a3 b3
    vmrglw  v7, v2, v3                  ;# c2 d2 c3 d3

    vperm   v0, v4, v5, v10             ;# a0 b0 c0 d0
    vperm   v1, v4, v5, v11             ;# a1 b1 c1 d1

    vperm   v2, v6, v7, v10             ;# a2 b2 c2 d2
    vperm   v3, v6, v7, v11             ;# a3 b3 c3 d3

    ;# ---------------- second pass ----------------
    ;# Same butterfly as the first pass, now on the transposed vectors:
    ;# v0/v2 feed a1/b1, v1/v3 feed c1/d1.
    vaddsws v6, v0, v2                  ;# a1 = v0 + v2
    vsubsws v7, v0, v2                  ;# b1 = v0 - v2

    vmulosh v4, v1, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v1                  ;# v1 * sin(pi/8) * sqrt(2)

    vmulosh v5, v3, v9
    vsraw   v5, v5, v12
    vaddsws v5, v5, v3                  ;# v3 * cos(pi/8) * sqrt(2)

    vsubsws v4, v4, v5                  ;# c1

    vmulosh v2, v3, v8
    vsraw   v2, v2, v12
    vaddsws v2, v2, v3                  ;# v3 * sin(pi/8) * sqrt(2)

    vmulosh v5, v1, v9
    vsraw   v5, v5, v12
    vaddsws v5, v5, v1                  ;# v1 * cos(pi/8) * sqrt(2)

    vaddsws v3, v2, v5                  ;# d1

    vaddsws v0, v6, v3                  ;# a1 + d1
    vsubsws v3, v6, v3                  ;# a1 - d1

    vaddsws v1, v7, v4                  ;# b1 + c1
    vsubsws v2, v7, v4                  ;# b1 - c1

    ;# ------- narrow to shorts and round: (x + 4) >> 3 -------
    vspltish v6, 4                      ;# rounding term
    vspltish v7, 3                      ;# shift count

    vpkswss v0, v0, v1                  ;# words -> saturated shorts
    vpkswss v1, v2, v3

    vaddshs v0, v0, v6
    vaddshs v1, v1, v6

    vsrah   v0, v0, v7
    vsrah   v1, v1, v7

    ;# ------- transpose the 4x4 block of shorts (a,b | c,d) -------
    vmrghh  v2, v0, v1                  ;# a0 c0 a1 c1 a2 c2 a3 c3
    vmrglh  v3, v0, v1                  ;# b0 d0 b1 d1 b2 d2 b3 d3

    vmrghh  v0, v2, v3                  ;# a0 b0 c0 d0 a1 b1 c1 d1
    vmrglh  v1, v2, v3                  ;# a2 b2 c2 d2 a3 b3 c3 d3

    ;# ------- write four rows of 8 bytes via a stack spill slot -------
    ;# The slot lives at 32(r1), not 0(r1): stwu stores the back chain
    ;# at 0(r1) and that word must keep pointing at the caller's frame
    ;# for as long as this frame exists.
    stwu    r1, -416(r1)                ;# create space on the stack
    li      r10, 32                     ;# r10 = spill slot offset

    stvx    v0, r10, r1                 ;# rows 0 and 1
    lwz     r6, 32(r1)
    stw     r6, 0(r4)
    lwz     r6, 36(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5                  ;# next output row

    lwz     r6, 40(r1)
    stw     r6, 0(r4)
    lwz     r6, 44(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5                  ;# next output row

    stvx    v1, r10, r1                 ;# rows 2 and 3
    lwz     r6, 32(r1)
    stw     r6, 0(r4)
    lwz     r6, 36(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5                  ;# next output row

    lwz     r6, 40(r1)
    stw     r6, 0(r4)
    lwz     r6, 44(r1)
    stw     r6, 4(r4)

    addi    r1, r1, 416                 ;# recover stack

    mtspr   256, r11                    ;# reset old VRSAVE

    blr
|
|
|
|
;# Constant tables read by short_idct4x4llm_ppc through load_c/lvx.
;# Every table is 16 bytes long and preceded by ".align 4", which this
;# file relies on to give the 16-byte alignment that lvx needs.

;# Multiplier applied with vmulosh; 35468 is stored as a 16-bit pattern.
    .align 4
sinpi8sqrt2:        .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

;# Multiplier applied with vmulosh.
    .align 4
cospi8sqrt2minus1:  .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

;# Per-word shift count for vsraw (drops the low 16 bits of each product).
    .align 4
shift_16:           .long 16, 16, 16, 16

;# vperm selector: bytes 0-7 of the first source, then bytes 0-7 of the second.
    .align 4
hi_hi:              .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

;# vperm selector: bytes 8-15 of the first source, then bytes 8-15 of the second.
    .align 4
lo_lo:              .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|