vpx/vpx_scale/x86_64/scaleopt.c
John Koleszar 09202d8071 LICENSE: update with latest text
Change-Id: Ieebea089095d9073b3a94932791099f614ce120c
2010-06-04 16:19:40 -04:00

1751 lines
55 KiB
C

/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : scaleopt.cpp
*
* Description : Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"
/****************************************************************************
* Module Statics
****************************************************************************/
__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"
/****************************************************************************
*
* ROUTINE : horizontal_line_3_5_scale_mmx
*
* INPUTS : const unsigned char *source :
* unsigned int source_width :
* unsigned char *dest :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
(void) dest_width;
__asm
{
push rbx
mov rsi, source
mov rdi, dest
mov ecx, source_width
lea rdx, [rsi+rcx-3];
movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
pxor mm7, mm7 // clear mm7
horiz_line_3_5_loop:
mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
mov ebx, eax
and ebx, 0xffff00 // ebx = xx 01 02 xx
mov ecx, eax // ecx = 00 01 02 03
and eax, 0xffff0000 // eax = xx xx 02 03
xor ecx, eax // ecx = 00 01 xx xx
shr ebx, 8 // ebx = 01 02 xx xx
or eax, ebx // eax = 01 02 02 03
shl ebx, 16 // ebx = xx xx 01 02
movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
or ebx, ecx // ebx = 00 01 01 02
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
movd mm0, ebx // mm0 = 00 01 01 02
pmullw mm1, mm6 //
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
pmullw mm0, mm5 //
mov [rdi], ebx // writeoutput 00 xx xx xx
add rsi, 3
add rdi, 5
paddw mm0, mm1
paddw mm0, mm4
psrlw mm0, 8
cmp rsi, rdx
packuswb mm0, mm7
movd DWORD Ptr [rdi-4], mm0
jl horiz_line_3_5_loop
//Exit:
mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
mov ebx, eax
and ebx, 0xffff00 // ebx = xx 01 02 xx
mov ecx, eax // ecx = 00 01 02 03
and eax, 0xffff0000 // eax = xx xx 02 03
xor ecx, eax // ecx = 00 01 xx xx
shr ebx, 8 // ebx = 01 02 xx xx
or eax, ebx // eax = 01 02 02 03
shl eax, 8 // eax = xx 01 02 02
and eax, 0xffff0000 // eax = xx xx 02 02
or eax, ebx // eax = 01 02 02 02
shl ebx, 16 // ebx = xx xx 01 02
movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
or ebx, ecx // ebx = 00 01 01 02
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
movd mm0, ebx // mm0 = 00 01 01 02
pmullw mm1, mm6 //
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
pmullw mm0, mm5 //
mov [rdi], ebx // writeoutput 00 xx xx xx
paddw mm0, mm1
paddw mm0, mm4
psrlw mm0, 8
packuswb mm0, mm7
movd DWORD Ptr [rdi+1], mm0
pop rbx
}
}
/****************************************************************************
*
* ROUTINE : horizontal_line_4_5_scale_mmx
*
* INPUTS : const unsigned char *source :
* unsigned int source_width :
* unsigned char *dest :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
(void)dest_width;
__asm
{
mov rsi, source
mov rdi, dest
mov ecx, source_width
lea rdx, [rsi+rcx-8];
movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
pxor mm7, mm7 // clear mm7
horiz_line_4_5_loop:
movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08
movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
paddw mm0, mm1 // added round values
paddw mm0, mm4
psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
packuswb mm0, mm7
movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
add rdi, 10
add rsi, 8
paddw mm2, mm3 //
paddw mm2, mm4 // added round values
cmp rsi, rdx
psrlw mm2, 8
packuswb mm2, mm7
movd DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
jl horiz_line_4_5_loop
//Exit:
movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
movq mm3, mm1
movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
paddw mm0, mm1 // added round values
paddw mm0, mm4
psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
paddw mm2, mm3 //
paddw mm2, mm4 // added round values
psrlw mm2, 8
packuswb mm2, mm7
movd DWORD PTR [rdi+6], mm2 // writeoutput 06 07 08 09
}
}
/****************************************************************************
*
* ROUTINE : vertical_band_4_5_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has a "C" only
* version.
*
****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
lea rdi, [rsi+rcx*2] // tow lines below
add rdi, rcx // three lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_width // Loop counter
vs_4_5_loop:
movq mm0, QWORD ptr [rsi] // src[0];
movq mm1, QWORD ptr [rsi+rcx] // src[1];
movq mm2, mm0 // Make a copy
punpcklbw mm0, mm7 // unpack low to word
movq mm5, one_fifth
punpckhbw mm2, mm7 // unpack high to word
pmullw mm0, mm5 // a * 1/5
movq mm3, mm1 // make a copy
punpcklbw mm1, mm7 // unpack low to word
pmullw mm2, mm5 // a * 1/5
movq mm6, four_fifths // constan
movq mm4, mm1 // copy of low b
pmullw mm4, mm6 // b * 4/5
punpckhbw mm3, mm7 // unpack high to word
movq mm5, mm3 // copy of high b
pmullw mm5, mm6 // b * 4/5
paddw mm0, mm4 // a * 1/5 + b * 4/5
paddw mm2, mm5 // a * 1/5 + b * 4/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des [1]
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
movq mm0, [rsi+rcx*2] // mm0 = src[2]
// mm1, mm3 --- Src[1]
// mm0 --- Src[2]
// mm7 for unpacking
movq mm5, two_fifths
movq mm2, mm0 // make a copy
pmullw mm1, mm5 // b * 2/5
movq mm6, three_fifths
punpcklbw mm0, mm7 // unpack low to word
pmullw mm3, mm5 // b * 2/5
movq mm4, mm0 // make copy of c
punpckhbw mm2, mm7 // unpack high to word
pmullw mm4, mm6 // c * 3/5
movq mm5, mm2
pmullw mm5, mm6 // c * 3/5
paddw mm1, mm4 // b * 2/5 + c * 3/5
paddw mm3, mm5 // b * 2/5 + c * 3/5
paddw mm1, round_values // + 128
paddw mm3, round_values // + 128
psrlw mm1, 8
psrlw mm3, 8
packuswb mm1, mm3 // des[2]
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
movq mm1, [rdi] // mm1=Src[3];
// mm0, mm2 --- Src[2]
// mm1 --- Src[3]
// mm6 --- 3/5
// mm7 for unpacking
pmullw mm0, mm6 // c * 3/5
movq mm5, two_fifths // mm5 = 2/5
movq mm3, mm1 // make a copy
pmullw mm2, mm6 // c * 3/5
punpcklbw mm1, mm7 // unpack low
movq mm4, mm1 // make a copy
punpckhbw mm3, mm7 // unpack high
pmullw mm4, mm5 // d * 2/5
movq mm6, mm3 // make a copy
pmullw mm6, mm5 // d * 2/5
paddw mm0, mm4 // c * 3/5 + d * 2/5
paddw mm2, mm6 // c * 3/5 + d * 2/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des[3]
movq QWORD ptr [rdi], mm0 // write des[3]
// mm1, mm3 --- Src[3]
// mm7 -- cleared for unpacking
movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group
movq mm5, four_fifths // mm5 = 4/5
pmullw mm1, mm5 // d * 4/5
movq mm6, one_fifth // mm6 = 1/5
movq mm2, mm0 // make a copy
pmullw mm3, mm5 // d * 4/5
punpcklbw mm0, mm7 // unpack low
pmullw mm0, mm6 // an * 1/5
punpckhbw mm2, mm7 // unpack high
paddw mm1, mm0 // d * 4/5 + an * 1/5
pmullw mm2, mm6 // an * 1/5
paddw mm3, mm2 // d * 4/5 + an * 1/5
paddw mm1, round_values // + 128
paddw mm3, round_values // + 128
psrlw mm1, 8
psrlw mm3, 8
packuswb mm1, mm3 // des[4]
movq QWORD ptr [rdi+rcx], mm1 // write des[4]
add rdi, 8
add rsi, 8
sub rdx, 8
jg vs_4_5_loop
}
}
/****************************************************************************
*
* ROUTINE : last_vertical_band_4_5_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : None
*
* FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has an "C" only
* version.
*
****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
lea rdi, [rsi+rcx*2] // tow lines below
add rdi, rcx // three lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_width // Loop counter
last_vs_4_5_loop:
movq mm0, QWORD ptr [rsi] // src[0];
movq mm1, QWORD ptr [rsi+rcx] // src[1];
movq mm2, mm0 // Make a copy
punpcklbw mm0, mm7 // unpack low to word
movq mm5, one_fifth
punpckhbw mm2, mm7 // unpack high to word
pmullw mm0, mm5 // a * 1/5
movq mm3, mm1 // make a copy
punpcklbw mm1, mm7 // unpack low to word
pmullw mm2, mm5 // a * 1/5
movq mm6, four_fifths // constan
movq mm4, mm1 // copy of low b
pmullw mm4, mm6 // b * 4/5
punpckhbw mm3, mm7 // unpack high to word
movq mm5, mm3 // copy of high b
pmullw mm5, mm6 // b * 4/5
paddw mm0, mm4 // a * 1/5 + b * 4/5
paddw mm2, mm5 // a * 1/5 + b * 4/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des [1]
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
movq mm0, [rsi+rcx*2] // mm0 = src[2]
// mm1, mm3 --- Src[1]
// mm0 --- Src[2]
// mm7 for unpacking
movq mm5, two_fifths
movq mm2, mm0 // make a copy
pmullw mm1, mm5 // b * 2/5
movq mm6, three_fifths
punpcklbw mm0, mm7 // unpack low to word
pmullw mm3, mm5 // b * 2/5
movq mm4, mm0 // make copy of c
punpckhbw mm2, mm7 // unpack high to word
pmullw mm4, mm6 // c * 3/5
movq mm5, mm2
pmullw mm5, mm6 // c * 3/5
paddw mm1, mm4 // b * 2/5 + c * 3/5
paddw mm3, mm5 // b * 2/5 + c * 3/5
paddw mm1, round_values // + 128
paddw mm3, round_values // + 128
psrlw mm1, 8
psrlw mm3, 8
packuswb mm1, mm3 // des[2]
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
movq mm1, [rdi] // mm1=Src[3];
movq QWORD ptr [rdi+rcx], mm1 // write des[4];
// mm0, mm2 --- Src[2]
// mm1 --- Src[3]
// mm6 --- 3/5
// mm7 for unpacking
pmullw mm0, mm6 // c * 3/5
movq mm5, two_fifths // mm5 = 2/5
movq mm3, mm1 // make a copy
pmullw mm2, mm6 // c * 3/5
punpcklbw mm1, mm7 // unpack low
movq mm4, mm1 // make a copy
punpckhbw mm3, mm7 // unpack high
pmullw mm4, mm5 // d * 2/5
movq mm6, mm3 // make a copy
pmullw mm6, mm5 // d * 2/5
paddw mm0, mm4 // c * 3/5 + d * 2/5
paddw mm2, mm6 // c * 3/5 + d * 2/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des[3]
movq QWORD ptr [rdi], mm0 // write des[3]
// mm1, mm3 --- Src[3]
// mm7 -- cleared for unpacking
add rdi, 8
add rsi, 8
sub rdx, 8
jg last_vs_4_5_loop
}
}
/****************************************************************************
*
* ROUTINE : vertical_band_3_5_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has an "C" only
* version.
*
****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
lea rdi, [rsi+rcx*2] // two lines below
add rdi, rcx // three lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_width // Loop counter
vs_3_5_loop:
movq mm0, QWORD ptr [rsi] // src[0];
movq mm1, QWORD ptr [rsi+rcx] // src[1];
movq mm2, mm0 // Make a copy
punpcklbw mm0, mm7 // unpack low to word
movq mm5, two_fifths // mm5 = 2/5
punpckhbw mm2, mm7 // unpack high to word
pmullw mm0, mm5 // a * 2/5
movq mm3, mm1 // make a copy
punpcklbw mm1, mm7 // unpack low to word
pmullw mm2, mm5 // a * 2/5
movq mm6, three_fifths // mm6 = 3/5
movq mm4, mm1 // copy of low b
pmullw mm4, mm6 // b * 3/5
punpckhbw mm3, mm7 // unpack high to word
movq mm5, mm3 // copy of high b
pmullw mm5, mm6 // b * 3/5
paddw mm0, mm4 // a * 2/5 + b * 3/5
paddw mm2, mm5 // a * 2/5 + b * 3/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des [1]
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
movq mm0, [rsi+rcx*2] // mm0 = src[2]
// mm1, mm3 --- Src[1]
// mm0 --- Src[2]
// mm7 for unpacking
movq mm4, mm1 // b low
pmullw mm1, four_fifths // b * 4/5 low
movq mm5, mm3 // b high
pmullw mm3, four_fifths // b * 4/5 high
movq mm2, mm0 // c
pmullw mm4, one_fifth // b * 1/5
punpcklbw mm0, mm7 // c low
pmullw mm5, one_fifth // b * 1/5
movq mm6, mm0 // make copy of c low
punpckhbw mm2, mm7 // c high
pmullw mm6, one_fifth // c * 1/5 low
movq mm7, mm2 // make copy of c high
pmullw mm7, one_fifth // c * 1/5 high
paddw mm1, mm6 // b * 4/5 + c * 1/5 low
paddw mm3, mm7 // b * 4/5 + c * 1/5 high
movq mm6, mm0 // make copy of c low
pmullw mm6, four_fifths // c * 4/5 low
movq mm7, mm2 // make copy of c high
pmullw mm7, four_fifths // c * 4/5 high
paddw mm4, mm6 // b * 1/5 + c * 4/5 low
paddw mm5, mm7 // b * 1/5 + c * 4/5 high
paddw mm1, round_values // + 128
paddw mm3, round_values // + 128
psrlw mm1, 8
psrlw mm3, 8
packuswb mm1, mm3 // des[2]
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
paddw mm4, round_values // + 128
paddw mm5, round_values // + 128
psrlw mm4, 8
psrlw mm5, 8
packuswb mm4, mm5 // des[3]
movq QWORD ptr [rdi], mm4 // write des[3]
// mm0, mm2 --- Src[3]
pxor mm7, mm7 // clear mm7 for unpacking
movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group
movq mm5, three_fifths // mm5 = 3/5
pmullw mm0, mm5 // d * 3/5
movq mm6, two_fifths // mm6 = 2/5
movq mm3, mm1 // make a copy
pmullw mm2, mm5 // d * 3/5
punpcklbw mm1, mm7 // unpack low
pmullw mm1, mm6 // an * 2/5
punpckhbw mm3, mm7 // unpack high
paddw mm0, mm1 // d * 3/5 + an * 2/5
pmullw mm3, mm6 // an * 2/5
paddw mm2, mm3 // d * 3/5 + an * 2/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des[4]
movq QWORD ptr [rdi+rcx], mm0 // write des[4]
add rdi, 8
add rsi, 8
sub rdx, 8
jg vs_3_5_loop
}
}
/****************************************************************************
*
* ROUTINE : last_vertical_band_3_5_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has an "C" only
* version.
*
****************************************************************************/
static
void last_vertical_band_3_5_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
lea rdi, [rsi+rcx*2] // tow lines below
add rdi, rcx // three lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_width // Loop counter
last_vs_3_5_loop:
movq mm0, QWORD ptr [rsi] // src[0];
movq mm1, QWORD ptr [rsi+rcx] // src[1];
movq mm2, mm0 // Make a copy
punpcklbw mm0, mm7 // unpack low to word
movq mm5, two_fifths // mm5 = 2/5
punpckhbw mm2, mm7 // unpack high to word
pmullw mm0, mm5 // a * 2/5
movq mm3, mm1 // make a copy
punpcklbw mm1, mm7 // unpack low to word
pmullw mm2, mm5 // a * 2/5
movq mm6, three_fifths // mm6 = 3/5
movq mm4, mm1 // copy of low b
pmullw mm4, mm6 // b * 3/5
punpckhbw mm3, mm7 // unpack high to word
movq mm5, mm3 // copy of high b
pmullw mm5, mm6 // b * 3/5
paddw mm0, mm4 // a * 2/5 + b * 3/5
paddw mm2, mm5 // a * 2/5 + b * 3/5
paddw mm0, round_values // + 128
paddw mm2, round_values // + 128
psrlw mm0, 8
psrlw mm2, 8
packuswb mm0, mm2 // des [1]
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
movq mm0, [rsi+rcx*2] // mm0 = src[2]
// mm1, mm3 --- Src[1]
// mm0 --- Src[2]
// mm7 for unpacking
movq mm4, mm1 // b low
pmullw mm1, four_fifths // b * 4/5 low
movq QWORD ptr [rdi+rcx], mm0 // write des[4]
movq mm5, mm3 // b high
pmullw mm3, four_fifths // b * 4/5 high
movq mm2, mm0 // c
pmullw mm4, one_fifth // b * 1/5
punpcklbw mm0, mm7 // c low
pmullw mm5, one_fifth // b * 1/5
movq mm6, mm0 // make copy of c low
punpckhbw mm2, mm7 // c high
pmullw mm6, one_fifth // c * 1/5 low
movq mm7, mm2 // make copy of c high
pmullw mm7, one_fifth // c * 1/5 high
paddw mm1, mm6 // b * 4/5 + c * 1/5 low
paddw mm3, mm7 // b * 4/5 + c * 1/5 high
movq mm6, mm0 // make copy of c low
pmullw mm6, four_fifths // c * 4/5 low
movq mm7, mm2 // make copy of c high
pmullw mm7, four_fifths // c * 4/5 high
paddw mm4, mm6 // b * 1/5 + c * 4/5 low
paddw mm5, mm7 // b * 1/5 + c * 4/5 high
paddw mm1, round_values // + 128
paddw mm3, round_values // + 128
psrlw mm1, 8
psrlw mm3, 8
packuswb mm1, mm3 // des[2]
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
paddw mm4, round_values // + 128
paddw mm5, round_values // + 128
psrlw mm4, 8
psrlw mm5, 8
packuswb mm4, mm5 // des[3]
movq QWORD ptr [rdi], mm4 // write des[3]
// mm0, mm2 --- Src[3]
add rdi, 8
add rsi, 8
sub rdx, 8
jg last_vs_3_5_loop
}
}
/****************************************************************************
*
* ROUTINE : vertical_band_1_2_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 1 to 2 up-scaling of a band of pixels.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has an "C" only
* version.
*
****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
pxor mm7, mm7 // clear out mm7
mov edx, dest_width // Loop counter
vs_1_2_loop:
movq mm0, [rsi] // get Src[0]
movq mm1, [rsi + rcx * 2] // get Src[1]
movq mm2, mm0 // make copy before unpack
movq mm3, mm1 // make copy before unpack
punpcklbw mm0, mm7 // low Src[0]
movq mm6, four_ones // mm6= 1, 1, 1, 1
punpcklbw mm1, mm7 // low Src[1]
paddw mm0, mm1 // low (a + b)
punpckhbw mm2, mm7 // high Src[0]
paddw mm0, mm6 // low (a + b + 1)
punpckhbw mm3, mm7
paddw mm2, mm3 // high (a + b )
psraw mm0, 1 // low (a + b +1 )/2
paddw mm2, mm6 // high (a + b + 1)
psraw mm2, 1 // high (a + b + 1)/2
packuswb mm0, mm2 // pack results
movq [rsi+rcx], mm0 // write out eight bytes
add rsi, 8
sub rdx, 8
jg vs_1_2_loop
}
}
/****************************************************************************
*
* ROUTINE : last_vertical_band_1_2_scale_mmx
*
* INPUTS : unsigned char *dest :
* unsigned int dest_pitch :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 1 to 2 up-scaling of band of pixels.
*
* SPECIAL NOTES : The routine uses the first line of the band below
* the current band. The function also has an "C" only
* version.
*
****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, dest // Get the source and destination pointer
mov ecx, dest_pitch // Get the pitch size
mov edx, dest_width // Loop counter
last_vs_1_2_loop:
movq mm0, [rsi] // get Src[0]
movq [rsi+rcx], mm0 // write out eight bytes
add rsi, 8
sub rdx, 8
jg last_vs_1_2_loop
}
}
/****************************************************************************
*
* ROUTINE : horizontal_line_1_2_scale
*
* INPUTS : const unsigned char *source :
* unsigned int source_width :
* unsigned char *dest :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_1_2_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
(void) dest_width;
__asm
{
mov rsi, source
mov rdi, dest
pxor mm7, mm7
movq mm6, four_ones
mov ecx, source_width
hs_1_2_loop:
movq mm0, [rsi]
movq mm1, [rsi+1]
movq mm2, mm0
movq mm3, mm1
movq mm4, mm0
punpcklbw mm0, mm7
punpcklbw mm1, mm7
paddw mm0, mm1
paddw mm0, mm6
punpckhbw mm2, mm7
punpckhbw mm3, mm7
paddw mm2, mm3
paddw mm2, mm6
psraw mm0, 1
psraw mm2, 1
packuswb mm0, mm2
movq mm2, mm4
punpcklbw mm2, mm0
movq [rdi], mm2
punpckhbw mm4, mm0
movq [rdi+8], mm4
add rsi, 8
add rdi, 16
sub rcx, 8
cmp rcx, 8
jg hs_1_2_loop
// last eight pixel
movq mm0, [rsi]
movq mm1, mm0
movq mm2, mm0
movq mm3, mm1
psrlq mm1, 8
psrlq mm3, 56
psllq mm3, 56
por mm1, mm3
movq mm3, mm1
movq mm4, mm0
punpcklbw mm0, mm7
punpcklbw mm1, mm7
paddw mm0, mm1
paddw mm0, mm6
punpckhbw mm2, mm7
punpckhbw mm3, mm7
paddw mm2, mm3
paddw mm2, mm6
psraw mm0, 1
psraw mm2, 1
packuswb mm0, mm2
movq mm2, mm4
punpcklbw mm2, mm0
movq [rdi], mm2
punpckhbw mm4, mm0
movq [rdi+8], mm4
}
}
__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
/****************************************************************************
*
* ROUTINE : horizontal_line_5_4_scale_mmx
*
* INPUTS : const unsigned char *source : Pointer to source data.
* unsigned int source_width : Stride of source.
* unsigned char *dest : Pointer to destination data.
* unsigned int dest_width : Stride of destination (NOT USED).
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Copies horizontal line of pixels from source to
* destination scaling up by 4 to 5.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_5_4_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
/*
unsigned i;
unsigned int a, b, c, d, e;
unsigned char *des = dest;
const unsigned char *src = source;
(void) dest_width;
for ( i=0; i<source_width; i+=5 )
{
a = src[0];
b = src[1];
c = src[2];
d = src[3];
e = src[4];
des[0] = a;
des[1] = ((b*192 + c* 64 + 128)>>8);
des[2] = ((c*128 + d*128 + 128)>>8);
des[3] = ((d* 64 + e*192 + 128)>>8);
src += 5;
des += 4;
}
*/
__asm
{
mov rsi, source ;
mov rdi, dest ;
mov ecx, source_width ;
movq mm5, const54_1 ;
pxor mm7, mm7 ;
movq mm6, const54_2 ;
movq mm4, round_values ;
lea rdx, [rsi+rcx] ;
horizontal_line_5_4_loop:
movq mm0, QWORD PTR [rsi] ;
00 01 02 03 04 05 06 07
movq mm1, mm0 ;
00 01 02 03 04 05 06 07
psrlq mm0, 8 ;
01 02 03 04 05 06 07 xx
punpcklbw mm1, mm7 ;
xx 00 xx 01 xx 02 xx 03
punpcklbw mm0, mm7 ;
xx 01 xx 02 xx 03 xx 04
pmullw mm1, mm5
pmullw mm0, mm6
add rsi, 5
add rdi, 4
paddw mm1, mm0
paddw mm1, mm4
psrlw mm1, 8
cmp rsi, rdx
packuswb mm1, mm7
movd DWORD PTR [rdi-4], mm1
jl horizontal_line_5_4_loop
}
}
__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
static
void vertical_band_5_4_scale_mmx
(
unsigned char *source,
unsigned int src_pitch,
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, source // Get the source and destination pointer
mov ecx, src_pitch // Get the pitch size
mov rdi, dest // tow lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_pitch // Loop counter
mov ebx, dest_width
vs_5_4_loop:
movd mm0, DWORD ptr [rsi] // src[0];
movd mm1, DWORD ptr [rsi+rcx] // src[1];
movd mm2, DWORD ptr [rsi+rcx*2]
lea rax, [rsi+rcx*2] //
punpcklbw mm1, mm7
punpcklbw mm2, mm7
movq mm3, mm2
pmullw mm1, three_fourths
pmullw mm2, one_fourths
movd mm4, [rax+rcx]
pmullw mm3, two_fourths
punpcklbw mm4, mm7
movq mm5, mm4
pmullw mm4, two_fourths
paddw mm1, mm2
movd mm6, [rax+rcx*2]
pmullw mm5, one_fourths
paddw mm1, round_values;
paddw mm3, mm4
psrlw mm1, 8
punpcklbw mm6, mm7
paddw mm3, round_values
pmullw mm6, three_fourths
psrlw mm3, 8
packuswb mm1, mm7
packuswb mm3, mm7
movd DWORD PTR [rdi], mm0
movd DWORD PTR [rdi+rdx], mm1
paddw mm5, mm6
movd DWORD PTR [rdi+rdx*2], mm3
lea rax, [rdi+rdx*2]
paddw mm5, round_values
psrlw mm5, 8
add rdi, 4
packuswb mm5, mm7
movd DWORD PTR [rax+rdx], mm5
add rsi, 4
sub rbx, 4
jg vs_5_4_loop
}
}
__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
static
void horizontal_line_5_3_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
__asm
{
mov rsi, source ;
mov rdi, dest ;
mov ecx, source_width ;
movq mm5, const53_1 ;
pxor mm7, mm7 ;
movq mm6, const53_2 ;
movq mm4, round_values ;
lea rdx, [rsi+rcx-5] ;
horizontal_line_5_3_loop:
movq mm0, QWORD PTR [rsi] ;
00 01 02 03 04 05 06 07
movq mm1, mm0 ;
00 01 02 03 04 05 06 07
psllw mm0, 8 ;
xx 00 xx 02 xx 04 xx 06
psrlw mm1, 8 ;
01 xx 03 xx 05 xx 07 xx
psrlw mm0, 8 ;
00 xx 02 xx 04 xx 06 xx
psllq mm1, 16 ;
xx xx 01 xx 03 xx 05 xx
pmullw mm0, mm6
pmullw mm1, mm5
add rsi, 5
add rdi, 3
paddw mm1, mm0
paddw mm1, mm4
psrlw mm1, 8
cmp rsi, rdx
packuswb mm1, mm7
movd DWORD PTR [rdi-3], mm1
jl horizontal_line_5_3_loop
//exit condition
movq mm0, QWORD PTR [rsi] ;
00 01 02 03 04 05 06 07
movq mm1, mm0 ;
00 01 02 03 04 05 06 07
psllw mm0, 8 ;
xx 00 xx 02 xx 04 xx 06
psrlw mm1, 8 ;
01 xx 03 xx 05 xx 07 xx
psrlw mm0, 8 ;
00 xx 02 xx 04 xx 06 xx
psllq mm1, 16 ;
xx xx 01 xx 03 xx 05 xx
pmullw mm0, mm6
pmullw mm1, mm5
paddw mm1, mm0
paddw mm1, mm4
psrlw mm1, 8
packuswb mm1, mm7
movd rax, mm1
mov rdx, rax
shr rdx, 16
mov WORD PTR[rdi], ax
mov BYTE PTR[rdi+2], dl
}
}
__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
static
void vertical_band_5_3_scale_mmx
(
unsigned char *source,
unsigned int src_pitch,
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, source // Get the source and destination pointer
mov ecx, src_pitch // Get the pitch size
mov rdi, dest // tow lines below
pxor mm7, mm7 // clear out mm7
mov edx, dest_pitch // Loop counter
movq mm5, one_thirds
movq mm6, two_thirds
mov ebx, dest_width;
vs_5_3_loop:
movd mm0, DWORD ptr [rsi] // src[0];
movd mm1, DWORD ptr [rsi+rcx] // src[1];
movd mm2, DWORD ptr [rsi+rcx*2]
lea rax, [rsi+rcx*2] //
punpcklbw mm1, mm7
punpcklbw mm2, mm7
pmullw mm1, mm5
pmullw mm2, mm6
movd mm3, DWORD ptr [rax+rcx]
movd mm4, DWORD ptr [rax+rcx*2]
punpcklbw mm3, mm7
punpcklbw mm4, mm7
pmullw mm3, mm6
pmullw mm4, mm5
movd DWORD PTR [rdi], mm0
paddw mm1, mm2
paddw mm1, round_values
psrlw mm1, 8
packuswb mm1, mm7
paddw mm3, mm4
paddw mm3, round_values
movd DWORD PTR [rdi+rdx], mm1
psrlw mm3, 8
packuswb mm3, mm7
movd DWORD PTR [rdi+rdx*2], mm3
add rdi, 4
add rsi, 4
sub rbx, 4
jg vs_5_3_loop
}
}
/****************************************************************************
*
* ROUTINE : horizontal_line_2_1_scale
*
* INPUTS : const unsigned char *source :
* unsigned int source_width :
* unsigned char *dest :
* unsigned int dest_width :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_2_1_scale_mmx
(
const unsigned char *source,
unsigned int source_width,
unsigned char *dest,
unsigned int dest_width
)
{
(void) dest_width;
__asm
{
mov rsi, source
mov rdi, dest
pxor mm7, mm7
mov ecx, dest_width
xor rdx, rdx
hs_2_1_loop:
movq mm0, [rsi+rdx*2]
psllw mm0, 8
psrlw mm0, 8
packuswb mm0, mm7
movd DWORD Ptr [rdi+rdx], mm0;
add rdx, 4
cmp rdx, rcx
jl hs_2_1_loop
}
}
static
void vertical_band_2_1_scale_mmx
(
unsigned char *source,
unsigned int src_pitch,
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width)
{
vpx_memcpy(dest, source, dest_width);
}
__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
static
void vertical_band_2_1_scale_i_mmx
(
unsigned char *source,
unsigned int src_pitch,
unsigned char *dest,
unsigned int dest_pitch,
unsigned int dest_width
)
{
__asm
{
mov rsi, source
mov rdi, dest
mov eax, src_pitch
mov edx, dest_width
pxor mm7, mm7
sub rsi, rax //back one line
lea rcx, [rsi+rdx];
movq mm6, round_values;
movq mm5, three_sixteenths;
movq mm4, ten_sixteenths;
vs_2_1_i_loop:
movd mm0, [rsi] //
movd mm1, [rsi+rax] //
movd mm2, [rsi+rax*2] //
punpcklbw mm0, mm7
pmullw mm0, mm5
punpcklbw mm1, mm7
pmullw mm1, mm4
punpcklbw mm2, mm7
pmullw mm2, mm5
paddw mm0, round_values
paddw mm1, mm2
paddw mm0, mm1
psrlw mm0, 8
packuswb mm0, mm7
movd DWORD PTR [rdi], mm0
add rsi, 4
add rdi, 4;
cmp rsi, rcx
jl vs_2_1_i_loop
}
}
void
register_mmxscalers(void)
{
vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
}