;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

    .globl short_idct4x4llm_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
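
;# load_c pulls the 16-byte constant stored at LABEL into vector register V:
;# lis/la build the label's absolute address from its high-adjusted and low
;# halves, then lvx fetches the aligned quadword OFF bytes past it.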

;# r3 short *input
;# r4 short *output
;# r5 int pitch
    .align 2
short_idct4x4llm_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff8
    mtspr   256, r12            ;# set VRSAVE
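;# (the 0xfff8 mask in the upper half of VRSAVE marks v0-v12, the vector
;# registers used below, as live)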

    load_c v8, sinpi8sqrt2, 0, r9, r10
    load_c v9, cospi8sqrt2minus1, 0, r9, r10
    load_c v10, hi_hi, 0, r9, r10
    load_c v11, lo_lo, 0, r9, r10
    load_c v12, shift_16, 0, r9, r10

    li      r10, 16
    lvx     v0, 0, r3           ;# input ip[0], ip[ 4]
    lvx     v1, r10, r3         ;# input ip[8], ip[12]
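
;# each lvx picks up two rows of the 4x4 coefficient block:
;# v0 = ip[0..7] (rows 0-1), v1 = ip[8..15] (rows 2-3)
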
;# first pass
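;# 1-D transform applied down the columns (all four columns in parallel,
;# widened to 32-bit words by the unpacks):
;#   a1 = ip[0] + ip[8]            b1 = ip[0] - ip[8]
;#   c1 = ip[4]*S - ip[12]*C       d1 = ip[12]*S + ip[4]*C
;# where S = sin(pi/8)*sqrt(2) and C = cos(pi/8)*sqrt(2); the outputs are
;# a1+d1, b1+c1, b1-c1, a1-d1.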
    vupkhsh v2, v0
    vupkhsh v3, v1

    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

    vupklsh v0, v0
    vmulosh v4, v0, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
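;# (35468 = sin(pi/8)*sqrt(2) in Q16 does not fit in a signed halfword, so
;# vmulosh sees it as 35468-65536; adding the input back after the shift
;# restores x*35468>>16.  vmulosh reads the odd (low-order) halfword of each
;# word, which is where the coefficient sits after the unpack.)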
    vupklsh v1, v1
    vmulosh v5, v1, v9
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1
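;# (cospi8sqrt2minus1 holds cos(pi/8)*sqrt(2) - 1 in Q16 for the same reason;
;# adding ip[12] back after the shift gives ip[12]*cos(pi/8)*sqrt(2))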
    vsubsws v4, v4, v5          ;# c1

    vmulosh v3, v1, v8
    vsraw   v3, v3, v12
    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v0, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v0

    vaddsws v3, v3, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1
    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

;# transpose input
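;# the word merges interleave pairs of result rows, then the hi_hi/lo_lo
;# permute masks splice matching doublewords together, leaving v0..v3 as
;# the transposed rows ready for the second (horizontal) pass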
    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
;# second pass
    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]

    vmulosh v4, v1, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)

    vmulosh v5, v3, v9
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v3

    vsubsws v4, v4, v5          ;# c1

    vmulosh v2, v3, v8
    vsraw   v2, v2, v12
    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v1, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1

    vaddsws v3, v2, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1
    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1
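
;# pack the 32-bit results to saturated 16-bit values, then round and
;# scale each lane: out = (x + 4) >> 3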
    vspltish v6, 4
    vspltish v7, 3

    vpkswss v0, v0, v1
    vpkswss v1, v2, v3

    vaddshs v0, v0, v6
    vaddshs v1, v1, v6

    vsrah   v0, v0, v7
    vsrah   v1, v1, v7
;# transpose output
    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
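
;# v0 now holds output rows 0-1 and v1 rows 2-3 as packed shorts.  AltiVec
;# stores are 16-byte aligned, but each output row is only 8 bytes at a
;# pitch-separated (possibly unaligned) address, so the vectors are bounced
;# through the stack and copied out with scalar word loads/stores.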
    stwu    r1,-416(r1)         ;# create space on the stack

    stvx    v0, 0, r1
    lwz     r6, 0(r1)
    stw     r6, 0(r4)
    lwz     r6, 4(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5

    lwz     r6, 8(r1)
    stw     r6, 0(r4)
    lwz     r6, 12(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5

    stvx    v1, 0, r1
    lwz     r6, 0(r1)
    stw     r6, 0(r4)
    lwz     r6, 4(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5

    lwz     r6, 8(r1)
    stw     r6, 0(r4)
    lwz     r6, 12(r1)
    stw     r6, 4(r4)

    addi    r1, r1, 416         ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr
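
;# constants: sinpi8sqrt2 = sin(pi/8)*sqrt(2) and cospi8sqrt2minus1 =
;# cos(pi/8)*sqrt(2) - 1, both in Q16 (35468/65536 and 20091/65536);
;# shift_16 is the per-word shift count for vsraw; hi_hi and lo_lo are
;# vperm masks selecting the first or last eight bytes of each source vector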

    .align 4
sinpi8sqrt2:
    .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

    .align 4
cospi8sqrt2minus1:
    .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

    .align 4
shift_16:
    .long  16, 16, 16, 16

    .align 4
hi_hi:
    .byte  0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
lo_lo:
    .byte  8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31