
This minimizes code differences between AOM master and nextgenv2. Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550
151 lines
4.4 KiB
NASM
151 lines
4.4 KiB
NASM
;
|
|
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
|
;
|
|
; This source code is subject to the terms of the BSD 2 Clause License and
|
|
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
|
; was not distributed with this source code in the LICENSE file, you can
|
|
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
|
; Media Patent License 1.0 was not distributed with this source code in the
|
|
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
|
;
|
|
|
|
;
|
|
|
|
%include "third_party/x86inc/x86inc.asm"
|
|
|
|
SECTION .text
|
|
|
|
; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Function entry and width dispatch.
;
; The cglobal line asks x86inc to load the first seven arguments into
; registers and to make eight vector registers available. The eighth
; argument, pred_stride, is not loaded here; every width case fetches it
; itself through pred_stridemp.
;
; Control transfers to .case_N when cols == N for N in {4, 8, 16, 32}, and
; for N == 64 when CONFIG_EXT_PARTITION is enabled. No other value of cols is
; checked: it simply falls through to whatever code follows this dispatch.

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride

; cols is not read again once the dispatch is done, so the width cases reuse
; its register to hold pred_stride.
%define pred_str colsq

; dispatch_width width, label: branch to label when cols == width.
%macro dispatch_width 2
  cmp       colsd, %1
  je        %2
%endmacro

  ; m7 is the all-zero operand used to widen bytes to words. This clears the
  ; register that m7 names under INIT_XMM (xmm7).
  ; NOTE(review): code assembled after a later INIT_MMX sees m7 as mm7, which
  ; this instruction does not touch.
  pxor      m7, m7

  dispatch_width 4,  .case_4
  dispatch_width 8,  .case_8
  dispatch_width 16, .case_16
  dispatch_width 32, .case_32
%if CONFIG_EXT_PARTITION
  dispatch_width 64, .case_64
%endif
|
|
|
|
; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
;
; Computes src - pred for two groups of mmsize pixels.
; For group k (k = 0, 1):
;   - loads one vector of bytes from [srcq+src_offk] and [predq+pred_offk],
;   - widens both to 16-bit words by interleaving with m7 (must be zero),
;   - stores the word differences as two consecutive vectors starting at
;     [diffq+diff_offk] (low half first, then high half).
; Every load and store uses mova, so each address is expected to be aligned
; to mmsize. All loads are issued before any store.
; Clobbers m0-m5; m7 is only read.
%macro loop16 6
  ; fetch source and prediction for both groups
  mova      m0, [srcq+%1]
  mova      m1, [predq+%3]
  mova      m2, [srcq+%2]
  mova      m3, [predq+%4]

  ; group 0: m0 = low-half differences, m4 = high-half differences
  punpckhbw m4, m0, m7
  punpckhbw m5, m1, m7
  punpcklbw m0, m7
  punpcklbw m1, m7
  psubw     m4, m5
  psubw     m0, m1

  ; group 1: m2 = low-half differences, m1 = high-half differences
  punpckhbw m1, m2, m7
  punpckhbw m5, m3, m7
  punpcklbw m2, m7
  punpcklbw m3, m7
  psubw     m1, m5
  psubw     m2, m3

  ; write both groups out
  mova      [diffq+mmsize*0+%5], m0
  mova      [diffq+mmsize*1+%5], m4
  mova      [diffq+mmsize*0+%6], m2
  mova      [diffq+mmsize*1+%6], m1
%endmacro
|
|
|
|
; Width 128 (only assembled when CONFIG_EXT_PARTITION is enabled).
; This is the fall-through target of the width dispatch: it is reached when
; cols matched none of the explicitly tested widths.
;
; One row is processed per iteration as four loop16 expansions; each
; expansion covers two vectors of source/prediction bytes and writes four
; vectors of 16-bit differences.
;
; The loop ends on "jg", like every other row loop in this function, so a
; non-positive row count leaves after a single pass instead of wrapping the
; counter and running on.
%if CONFIG_EXT_PARTITION
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
.loop_128:
  loop16    0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
  loop16    2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
  loop16    4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
  loop16    6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea       diffq, [diffq+diff_strideq*2] ; diff holds 16-bit values: stride * 2 bytes
  add       predq, pred_str
  add       srcq, src_strideq
  dec       rowsd
  jg        .loop_128
  RET

; Entry for cols == 64; the row loop itself follows the %endif so that it is
; also assembled (as the fall-through path) when CONFIG_EXT_PARTITION is off.
.case_64:
%endif
|
|
; Width 64 row loop. Reached through .case_64 when CONFIG_EXT_PARTITION is
; enabled; otherwise it is the code the width dispatch falls through to.
; One row per iteration, handled as two loop16 expansions (four vectors of
; source/prediction bytes in, eight vectors of 16-bit differences out).
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
.loop_64:
  loop16    0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16    2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  ; step all three pointers down one row
  lea       srcq,  [srcq+src_strideq]
  lea       predq, [predq+pred_str]
  lea       diffq, [diffq+diff_strideq*2] ; diff holds 16-bit values: stride * 2 bytes
  sub       rowsd, 1
  jg        .loop_64                      ; continue while rows remain (> 0)
  RET
|
|
|
|
; Width 32: one row per iteration, a single loop16 expansion covering two
; consecutive vectors of the row.
.case_32:
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
.loop_32:
  loop16    0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  ; step all three pointers down one row
  lea       srcq,  [srcq+src_strideq]
  lea       predq, [predq+pred_str]
  lea       diffq, [diffq+diff_strideq*2] ; diff holds 16-bit values: stride * 2 bytes
  sub       rowsd, 1
  jg        .loop_32                      ; continue while rows remain (> 0)
  RET
|
|
|
|
; Width 16: two rows per iteration. loop16's two groups are pointed at the
; current row (offset 0) and at the next row (offset of one stride; for diff
; the stride is doubled because its elements are 16 bits wide).
; The counter drops by 2 each pass, so the loop is written for an even
; number of rows.
.case_16:
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
.loop_16:
  loop16    0, src_strideq, 0, pred_str, 0, diff_strideq*2
  ; step all three pointers down two rows
  lea       srcq,  [srcq+src_strideq*2]
  lea       predq, [predq+pred_str*2]
  lea       diffq, [diffq+diff_strideq*4]
  sub       rowsd, 2
  jg        .loop_16                      ; continue while rows remain (> 0)
  RET
|
|
|
|
; loop_h (no arguments)
;
; Computes src - pred for two adjacent rows of a block that is half a vector
; of bytes wide.
; For the row at srcq/predq and the row one stride below it:
;   - movh loads half a vector of bytes from src and from pred,
;   - the bytes are widened to 16-bit words by interleaving with m7, which
;     must be zero in the register set that is active where the macro is
;     expanded,
;   - the word differences are stored with mova as one full vector at diffq
;     (first row) and at diffq + diff_stride*2 bytes (second row).
; Both rows are loaded before anything is stored.
; Clobbers m0-m3; m7 is only read.
%macro loop_h 0
  ; fetch source and prediction for both rows
  movh      m0, [srcq]
  movh      m1, [predq]
  movh      m2, [srcq+src_strideq]
  movh      m3, [predq+pred_str]

  ; first row: widen and subtract
  punpcklbw m0, m7
  punpcklbw m1, m7
  psubw     m0, m1

  ; second row: widen and subtract
  punpcklbw m2, m7
  punpcklbw m3, m7
  psubw     m2, m3

  mova      [diffq], m0
  mova      [diffq+diff_strideq*2], m2
%endmacro
|
|
|
|
; Width 8: two rows per iteration through loop_h, still in the XMM register
; set, so each row is an 8-byte load and a full-vector store of 8 words.
; NOTE(review): the store in loop_h is mova, so every diff row is presumably
; expected to be 16-byte aligned here - confirm against callers.
; The counter drops by 2 each pass, so the loop is written for an even
; number of rows.
.case_8:
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
.loop_8:
  loop_h
  ; step all three pointers down two rows
  lea       predq, [predq+pred_str*2]
  lea       srcq,  [srcq+src_strideq*2]
  lea       diffq, [diffq+diff_strideq*4]
  sub       rowsd, 2
  jg        .loop_8                       ; continue while rows remain (> 0)
  RET
|
|
|
|
; Width 4: two rows per iteration through loop_h, assembled with the MMX
; register set so that each row is a 4-byte load and an 8-byte store of
; 4 words.
;
; After INIT_MMX the names m0-m7 refer to mm0-mm7. The zero register that
; was cleared at function entry was cleared under INIT_XMM (xmm7), so mm7
; has to be cleared separately here before loop_h uses m7 to widen bytes;
; otherwise the high byte of every output word would come from whatever mm7
; last held.
;
; MMX registers alias the x87 register stack, so emms is issued before
; returning to leave the x87 state usable for the caller.
;
; The counter drops by 2 each pass, so the loop is written for an even
; number of rows.
INIT_MMX
.case_4:
  mov       pred_str, pred_stridemp       ; cols register now holds pred_stride
  pxor      m7, m7                        ; zero register for the MMX path (mm7)
.loop_4:
  loop_h
  ; step all three pointers down two rows
  lea       diffq, [diffq+diff_strideq*4]
  lea       srcq,  [srcq+src_strideq*2]
  lea       predq, [predq+pred_str*2]
  sub       rowsd, 2
  jg        .loop_4                       ; continue while rows remain (> 0)
  emms                                    ; release MMX state before returning
  RET
|