1750 lines
55 KiB
C
1750 lines
55 KiB
C
|
/*
|
||
|
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||
|
*
|
||
|
* Use of this source code is governed by a BSD-style license and patent
|
||
|
* grant that can be found in the LICENSE file in the root of the source
|
||
|
* tree. All contributing project authors may be found in the AUTHORS
|
||
|
* file in the root of the source tree.
|
||
|
*/
|
||
|
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* Module Title : scaleopt.cpp
|
||
|
*
|
||
|
* Description : Optimized scaling functions
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
#include "pragmas.h"
|
||
|
|
||
|
|
||
|
|
||
|
/****************************************************************************
|
||
|
* Module Statics
|
||
|
****************************************************************************/
|
||
|
__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
|
||
|
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
|
||
|
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
|
||
|
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
|
||
|
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
|
||
|
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
|
||
|
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
|
||
|
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
|
||
|
__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
|
||
|
__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
|
||
|
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
|
||
|
|
||
|
|
||
|
|
||
|
#include "vpx_scale/vpxscale.h"
|
||
|
#include "vpx_mem/vpx_mem.h"
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : horizontal_line_3_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : const unsigned char *source :
|
||
|
* unsigned int source_width :
|
||
|
* unsigned char *dest :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : None.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void horizontal_line_3_5_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
(void) dest_width;
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
push rbx
|
||
|
|
||
|
mov rsi, source
|
||
|
mov rdi, dest
|
||
|
|
||
|
mov ecx, source_width
|
||
|
lea rdx, [rsi+rcx-3];
|
||
|
|
||
|
movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
|
||
|
movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
|
||
|
|
||
|
movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
|
||
|
pxor mm7, mm7 // clear mm7
|
||
|
|
||
|
horiz_line_3_5_loop:
|
||
|
|
||
|
mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
|
||
|
mov ebx, eax
|
||
|
|
||
|
and ebx, 0xffff00 // ebx = xx 01 02 xx
|
||
|
mov ecx, eax // ecx = 00 01 02 03
|
||
|
|
||
|
and eax, 0xffff0000 // eax = xx xx 02 03
|
||
|
xor ecx, eax // ecx = 00 01 xx xx
|
||
|
|
||
|
shr ebx, 8 // ebx = 01 02 xx xx
|
||
|
or eax, ebx // eax = 01 02 02 03
|
||
|
|
||
|
shl ebx, 16 // ebx = xx xx 01 02
|
||
|
movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
|
||
|
|
||
|
or ebx, ecx // ebx = 00 01 01 02
|
||
|
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
|
||
|
|
||
|
movd mm0, ebx // mm0 = 00 01 01 02
|
||
|
pmullw mm1, mm6 //
|
||
|
|
||
|
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
|
||
|
pmullw mm0, mm5 //
|
||
|
|
||
|
mov [rdi], ebx // writeoutput 00 xx xx xx
|
||
|
add rsi, 3
|
||
|
|
||
|
add rdi, 5
|
||
|
paddw mm0, mm1
|
||
|
|
||
|
paddw mm0, mm4
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
cmp rsi, rdx
|
||
|
packuswb mm0, mm7
|
||
|
|
||
|
movd DWORD Ptr [rdi-4], mm0
|
||
|
jl horiz_line_3_5_loop
|
||
|
|
||
|
//Exit:
|
||
|
mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
|
||
|
mov ebx, eax
|
||
|
|
||
|
and ebx, 0xffff00 // ebx = xx 01 02 xx
|
||
|
mov ecx, eax // ecx = 00 01 02 03
|
||
|
|
||
|
and eax, 0xffff0000 // eax = xx xx 02 03
|
||
|
xor ecx, eax // ecx = 00 01 xx xx
|
||
|
|
||
|
shr ebx, 8 // ebx = 01 02 xx xx
|
||
|
or eax, ebx // eax = 01 02 02 03
|
||
|
|
||
|
shl eax, 8 // eax = xx 01 02 02
|
||
|
and eax, 0xffff0000 // eax = xx xx 02 02
|
||
|
|
||
|
or eax, ebx // eax = 01 02 02 02
|
||
|
|
||
|
shl ebx, 16 // ebx = xx xx 01 02
|
||
|
movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
|
||
|
|
||
|
or ebx, ecx // ebx = 00 01 01 02
|
||
|
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
|
||
|
|
||
|
movd mm0, ebx // mm0 = 00 01 01 02
|
||
|
pmullw mm1, mm6 //
|
||
|
|
||
|
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
|
||
|
pmullw mm0, mm5 //
|
||
|
|
||
|
mov [rdi], ebx // writeoutput 00 xx xx xx
|
||
|
paddw mm0, mm1
|
||
|
|
||
|
paddw mm0, mm4
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
packuswb mm0, mm7
|
||
|
movd DWORD Ptr [rdi+1], mm0
|
||
|
|
||
|
pop rbx
|
||
|
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : horizontal_line_4_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : const unsigned char *source :
|
||
|
* unsigned int source_width :
|
||
|
* unsigned char *dest :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : None.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void horizontal_line_4_5_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
(void)dest_width;
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, source
|
||
|
mov rdi, dest
|
||
|
|
||
|
mov ecx, source_width
|
||
|
lea rdx, [rsi+rcx-8];
|
||
|
|
||
|
movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
|
||
|
movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
|
||
|
|
||
|
movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
|
||
|
pxor mm7, mm7 // clear mm7
|
||
|
|
||
|
horiz_line_4_5_loop:
|
||
|
|
||
|
movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
|
||
|
movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08
|
||
|
|
||
|
movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
|
||
|
movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
|
||
|
|
||
|
movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
|
||
|
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
|
||
|
|
||
|
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
|
||
|
pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
|
||
|
|
||
|
pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
|
||
|
punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
|
||
|
|
||
|
movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
|
||
|
pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
|
||
|
|
||
|
punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
|
||
|
pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
|
||
|
|
||
|
paddw mm0, mm1 // added round values
|
||
|
paddw mm0, mm4
|
||
|
|
||
|
psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
|
||
|
packuswb mm0, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
|
||
|
add rdi, 10
|
||
|
|
||
|
add rsi, 8
|
||
|
paddw mm2, mm3 //
|
||
|
|
||
|
paddw mm2, mm4 // added round values
|
||
|
cmp rsi, rdx
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm2, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
|
||
|
jl horiz_line_4_5_loop
|
||
|
|
||
|
//Exit:
|
||
|
movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
|
||
|
movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
|
||
|
|
||
|
movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
|
||
|
psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
|
||
|
|
||
|
movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
|
||
|
pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
|
||
|
|
||
|
psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
|
||
|
por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
|
||
|
|
||
|
movq mm3, mm1
|
||
|
|
||
|
movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
|
||
|
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
|
||
|
|
||
|
punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
|
||
|
pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
|
||
|
|
||
|
pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
|
||
|
punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
|
||
|
|
||
|
movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
|
||
|
pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
|
||
|
|
||
|
punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
|
||
|
pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
|
||
|
|
||
|
paddw mm0, mm1 // added round values
|
||
|
paddw mm0, mm4
|
||
|
|
||
|
psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
|
||
|
packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
|
||
|
|
||
|
movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
|
||
|
paddw mm2, mm3 //
|
||
|
|
||
|
paddw mm2, mm4 // added round values
|
||
|
psrlw mm2, 8
|
||
|
|
||
|
packuswb mm2, mm7
|
||
|
movd DWORD PTR [rdi+6], mm2 // writeoutput 06 07 08 09
|
||
|
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : vertical_band_4_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has a "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void vertical_band_4_5_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
lea rdi, [rsi+rcx*2] // tow lines below
|
||
|
add rdi, rcx // three lines below
|
||
|
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
vs_4_5_loop:
|
||
|
|
||
|
movq mm0, QWORD ptr [rsi] // src[0];
|
||
|
movq mm1, QWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movq mm2, mm0 // Make a copy
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
|
||
|
movq mm5, one_fifth
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm0, mm5 // a * 1/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
punpcklbw mm1, mm7 // unpack low to word
|
||
|
|
||
|
pmullw mm2, mm5 // a * 1/5
|
||
|
movq mm6, four_fifths // constan
|
||
|
|
||
|
movq mm4, mm1 // copy of low b
|
||
|
pmullw mm4, mm6 // b * 4/5
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high to word
|
||
|
movq mm5, mm3 // copy of high b
|
||
|
|
||
|
pmullw mm5, mm6 // b * 4/5
|
||
|
paddw mm0, mm4 // a * 1/5 + b * 4/5
|
||
|
|
||
|
paddw mm2, mm5 // a * 1/5 + b * 4/5
|
||
|
paddw mm0, round_values // + 128
|
||
|
|
||
|
paddw mm2, round_values // + 128
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm0, mm2 // des [1]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
|
||
|
movq mm0, [rsi+rcx*2] // mm0 = src[2]
|
||
|
|
||
|
// mm1, mm3 --- Src[1]
|
||
|
// mm0 --- Src[2]
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
movq mm5, two_fifths
|
||
|
movq mm2, mm0 // make a copy
|
||
|
|
||
|
pmullw mm1, mm5 // b * 2/5
|
||
|
movq mm6, three_fifths
|
||
|
|
||
|
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
pmullw mm3, mm5 // b * 2/5
|
||
|
|
||
|
movq mm4, mm0 // make copy of c
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm4, mm6 // c * 3/5
|
||
|
movq mm5, mm2
|
||
|
|
||
|
pmullw mm5, mm6 // c * 3/5
|
||
|
paddw mm1, mm4 // b * 2/5 + c * 3/5
|
||
|
|
||
|
paddw mm3, mm5 // b * 2/5 + c * 3/5
|
||
|
paddw mm1, round_values // + 128
|
||
|
|
||
|
paddw mm3, round_values // + 128
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
psrlw mm3, 8
|
||
|
packuswb mm1, mm3 // des[2]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
|
||
|
movq mm1, [rdi] // mm1=Src[3];
|
||
|
|
||
|
// mm0, mm2 --- Src[2]
|
||
|
// mm1 --- Src[3]
|
||
|
// mm6 --- 3/5
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
pmullw mm0, mm6 // c * 3/5
|
||
|
movq mm5, two_fifths // mm5 = 2/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
pmullw mm2, mm6 // c * 3/5
|
||
|
|
||
|
punpcklbw mm1, mm7 // unpack low
|
||
|
movq mm4, mm1 // make a copy
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high
|
||
|
pmullw mm4, mm5 // d * 2/5
|
||
|
|
||
|
movq mm6, mm3 // make a copy
|
||
|
pmullw mm6, mm5 // d * 2/5
|
||
|
|
||
|
paddw mm0, mm4 // c * 3/5 + d * 2/5
|
||
|
paddw mm2, mm6 // c * 3/5 + d * 2/5
|
||
|
|
||
|
paddw mm0, round_values // + 128
|
||
|
paddw mm2, round_values // + 128
|
||
|
|
||
|
psrlw mm0, 8
|
||
|
psrlw mm2, 8
|
||
|
|
||
|
packuswb mm0, mm2 // des[3]
|
||
|
movq QWORD ptr [rdi], mm0 // write des[3]
|
||
|
|
||
|
// mm1, mm3 --- Src[3]
|
||
|
// mm7 -- cleared for unpacking
|
||
|
|
||
|
movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group
|
||
|
|
||
|
movq mm5, four_fifths // mm5 = 4/5
|
||
|
pmullw mm1, mm5 // d * 4/5
|
||
|
|
||
|
movq mm6, one_fifth // mm6 = 1/5
|
||
|
movq mm2, mm0 // make a copy
|
||
|
|
||
|
pmullw mm3, mm5 // d * 4/5
|
||
|
punpcklbw mm0, mm7 // unpack low
|
||
|
|
||
|
pmullw mm0, mm6 // an * 1/5
|
||
|
punpckhbw mm2, mm7 // unpack high
|
||
|
|
||
|
paddw mm1, mm0 // d * 4/5 + an * 1/5
|
||
|
pmullw mm2, mm6 // an * 1/5
|
||
|
|
||
|
paddw mm3, mm2 // d * 4/5 + an * 1/5
|
||
|
paddw mm1, round_values // + 128
|
||
|
|
||
|
paddw mm3, round_values // + 128
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
psrlw mm3, 8
|
||
|
packuswb mm1, mm3 // des[4]
|
||
|
|
||
|
movq QWORD ptr [rdi+rcx], mm1 // write des[4]
|
||
|
|
||
|
add rdi, 8
|
||
|
add rsi, 8
|
||
|
|
||
|
sub rdx, 8
|
||
|
jg vs_4_5_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : last_vertical_band_4_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : None
|
||
|
*
|
||
|
* FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has an "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void last_vertical_band_4_5_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
lea rdi, [rsi+rcx*2] // tow lines below
|
||
|
add rdi, rcx // three lines below
|
||
|
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
last_vs_4_5_loop:
|
||
|
|
||
|
movq mm0, QWORD ptr [rsi] // src[0];
|
||
|
movq mm1, QWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movq mm2, mm0 // Make a copy
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
|
||
|
movq mm5, one_fifth
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm0, mm5 // a * 1/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
punpcklbw mm1, mm7 // unpack low to word
|
||
|
|
||
|
pmullw mm2, mm5 // a * 1/5
|
||
|
movq mm6, four_fifths // constan
|
||
|
|
||
|
movq mm4, mm1 // copy of low b
|
||
|
pmullw mm4, mm6 // b * 4/5
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high to word
|
||
|
movq mm5, mm3 // copy of high b
|
||
|
|
||
|
pmullw mm5, mm6 // b * 4/5
|
||
|
paddw mm0, mm4 // a * 1/5 + b * 4/5
|
||
|
|
||
|
paddw mm2, mm5 // a * 1/5 + b * 4/5
|
||
|
paddw mm0, round_values // + 128
|
||
|
|
||
|
paddw mm2, round_values // + 128
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm0, mm2 // des [1]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
|
||
|
movq mm0, [rsi+rcx*2] // mm0 = src[2]
|
||
|
|
||
|
// mm1, mm3 --- Src[1]
|
||
|
// mm0 --- Src[2]
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
movq mm5, two_fifths
|
||
|
movq mm2, mm0 // make a copy
|
||
|
|
||
|
pmullw mm1, mm5 // b * 2/5
|
||
|
movq mm6, three_fifths
|
||
|
|
||
|
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
pmullw mm3, mm5 // b * 2/5
|
||
|
|
||
|
movq mm4, mm0 // make copy of c
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm4, mm6 // c * 3/5
|
||
|
movq mm5, mm2
|
||
|
|
||
|
pmullw mm5, mm6 // c * 3/5
|
||
|
paddw mm1, mm4 // b * 2/5 + c * 3/5
|
||
|
|
||
|
paddw mm3, mm5 // b * 2/5 + c * 3/5
|
||
|
paddw mm1, round_values // + 128
|
||
|
|
||
|
paddw mm3, round_values // + 128
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
psrlw mm3, 8
|
||
|
packuswb mm1, mm3 // des[2]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
|
||
|
movq mm1, [rdi] // mm1=Src[3];
|
||
|
|
||
|
movq QWORD ptr [rdi+rcx], mm1 // write des[4];
|
||
|
|
||
|
// mm0, mm2 --- Src[2]
|
||
|
// mm1 --- Src[3]
|
||
|
// mm6 --- 3/5
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
pmullw mm0, mm6 // c * 3/5
|
||
|
movq mm5, two_fifths // mm5 = 2/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
pmullw mm2, mm6 // c * 3/5
|
||
|
|
||
|
punpcklbw mm1, mm7 // unpack low
|
||
|
movq mm4, mm1 // make a copy
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high
|
||
|
pmullw mm4, mm5 // d * 2/5
|
||
|
|
||
|
movq mm6, mm3 // make a copy
|
||
|
pmullw mm6, mm5 // d * 2/5
|
||
|
|
||
|
paddw mm0, mm4 // c * 3/5 + d * 2/5
|
||
|
paddw mm2, mm6 // c * 3/5 + d * 2/5
|
||
|
|
||
|
paddw mm0, round_values // + 128
|
||
|
paddw mm2, round_values // + 128
|
||
|
|
||
|
psrlw mm0, 8
|
||
|
psrlw mm2, 8
|
||
|
|
||
|
packuswb mm0, mm2 // des[3]
|
||
|
movq QWORD ptr [rdi], mm0 // write des[3]
|
||
|
|
||
|
// mm1, mm3 --- Src[3]
|
||
|
// mm7 -- cleared for unpacking
|
||
|
add rdi, 8
|
||
|
add rsi, 8
|
||
|
|
||
|
sub rdx, 8
|
||
|
jg last_vs_4_5_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : vertical_band_3_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has an "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void vertical_band_3_5_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
lea rdi, [rsi+rcx*2] // two lines below
|
||
|
add rdi, rcx // three lines below
|
||
|
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
vs_3_5_loop:
|
||
|
|
||
|
movq mm0, QWORD ptr [rsi] // src[0];
|
||
|
movq mm1, QWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movq mm2, mm0 // Make a copy
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
|
||
|
movq mm5, two_fifths // mm5 = 2/5
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm0, mm5 // a * 2/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
punpcklbw mm1, mm7 // unpack low to word
|
||
|
|
||
|
pmullw mm2, mm5 // a * 2/5
|
||
|
movq mm6, three_fifths // mm6 = 3/5
|
||
|
|
||
|
movq mm4, mm1 // copy of low b
|
||
|
pmullw mm4, mm6 // b * 3/5
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high to word
|
||
|
movq mm5, mm3 // copy of high b
|
||
|
|
||
|
pmullw mm5, mm6 // b * 3/5
|
||
|
paddw mm0, mm4 // a * 2/5 + b * 3/5
|
||
|
|
||
|
paddw mm2, mm5 // a * 2/5 + b * 3/5
|
||
|
paddw mm0, round_values // + 128
|
||
|
|
||
|
paddw mm2, round_values // + 128
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm0, mm2 // des [1]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
|
||
|
movq mm0, [rsi+rcx*2] // mm0 = src[2]
|
||
|
|
||
|
// mm1, mm3 --- Src[1]
|
||
|
// mm0 --- Src[2]
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
movq mm4, mm1 // b low
|
||
|
pmullw mm1, four_fifths // b * 4/5 low
|
||
|
|
||
|
movq mm5, mm3 // b high
|
||
|
pmullw mm3, four_fifths // b * 4/5 high
|
||
|
|
||
|
movq mm2, mm0 // c
|
||
|
pmullw mm4, one_fifth // b * 1/5
|
||
|
|
||
|
punpcklbw mm0, mm7 // c low
|
||
|
pmullw mm5, one_fifth // b * 1/5
|
||
|
|
||
|
movq mm6, mm0 // make copy of c low
|
||
|
punpckhbw mm2, mm7 // c high
|
||
|
|
||
|
pmullw mm6, one_fifth // c * 1/5 low
|
||
|
movq mm7, mm2 // make copy of c high
|
||
|
|
||
|
pmullw mm7, one_fifth // c * 1/5 high
|
||
|
paddw mm1, mm6 // b * 4/5 + c * 1/5 low
|
||
|
|
||
|
paddw mm3, mm7 // b * 4/5 + c * 1/5 high
|
||
|
movq mm6, mm0 // make copy of c low
|
||
|
|
||
|
pmullw mm6, four_fifths // c * 4/5 low
|
||
|
movq mm7, mm2 // make copy of c high
|
||
|
|
||
|
pmullw mm7, four_fifths // c * 4/5 high
|
||
|
|
||
|
paddw mm4, mm6 // b * 1/5 + c * 4/5 low
|
||
|
paddw mm5, mm7 // b * 1/5 + c * 4/5 high
|
||
|
|
||
|
paddw mm1, round_values // + 128
|
||
|
paddw mm3, round_values // + 128
|
||
|
|
||
|
psrlw mm1, 8
|
||
|
psrlw mm3, 8
|
||
|
|
||
|
packuswb mm1, mm3 // des[2]
|
||
|
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
|
||
|
|
||
|
paddw mm4, round_values // + 128
|
||
|
paddw mm5, round_values // + 128
|
||
|
|
||
|
psrlw mm4, 8
|
||
|
psrlw mm5, 8
|
||
|
|
||
|
packuswb mm4, mm5 // des[3]
|
||
|
movq QWORD ptr [rdi], mm4 // write des[3]
|
||
|
|
||
|
// mm0, mm2 --- Src[3]
|
||
|
|
||
|
pxor mm7, mm7 // clear mm7 for unpacking
|
||
|
movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group
|
||
|
|
||
|
movq mm5, three_fifths // mm5 = 3/5
|
||
|
pmullw mm0, mm5 // d * 3/5
|
||
|
|
||
|
movq mm6, two_fifths // mm6 = 2/5
|
||
|
movq mm3, mm1 // make a copy
|
||
|
|
||
|
pmullw mm2, mm5 // d * 3/5
|
||
|
punpcklbw mm1, mm7 // unpack low
|
||
|
|
||
|
pmullw mm1, mm6 // an * 2/5
|
||
|
punpckhbw mm3, mm7 // unpack high
|
||
|
|
||
|
paddw mm0, mm1 // d * 3/5 + an * 2/5
|
||
|
pmullw mm3, mm6 // an * 2/5
|
||
|
|
||
|
paddw mm2, mm3 // d * 3/5 + an * 2/5
|
||
|
paddw mm0, round_values // + 128
|
||
|
|
||
|
paddw mm2, round_values // + 128
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm0, mm2 // des[4]
|
||
|
|
||
|
movq QWORD ptr [rdi+rcx], mm0 // write des[4]
|
||
|
|
||
|
add rdi, 8
|
||
|
add rsi, 8
|
||
|
|
||
|
sub rdx, 8
|
||
|
jg vs_3_5_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : last_vertical_band_3_5_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has an "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void last_vertical_band_3_5_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
lea rdi, [rsi+rcx*2] // tow lines below
|
||
|
add rdi, rcx // three lines below
|
||
|
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
|
||
|
last_vs_3_5_loop:
|
||
|
|
||
|
movq mm0, QWORD ptr [rsi] // src[0];
|
||
|
movq mm1, QWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movq mm2, mm0 // Make a copy
|
||
|
punpcklbw mm0, mm7 // unpack low to word
|
||
|
|
||
|
movq mm5, two_fifths // mm5 = 2/5
|
||
|
punpckhbw mm2, mm7 // unpack high to word
|
||
|
|
||
|
pmullw mm0, mm5 // a * 2/5
|
||
|
|
||
|
movq mm3, mm1 // make a copy
|
||
|
punpcklbw mm1, mm7 // unpack low to word
|
||
|
|
||
|
pmullw mm2, mm5 // a * 2/5
|
||
|
movq mm6, three_fifths // mm6 = 3/5
|
||
|
|
||
|
movq mm4, mm1 // copy of low b
|
||
|
pmullw mm4, mm6 // b * 3/5
|
||
|
|
||
|
punpckhbw mm3, mm7 // unpack high to word
|
||
|
movq mm5, mm3 // copy of high b
|
||
|
|
||
|
pmullw mm5, mm6 // b * 3/5
|
||
|
paddw mm0, mm4 // a * 2/5 + b * 3/5
|
||
|
|
||
|
paddw mm2, mm5 // a * 2/5 + b * 3/5
|
||
|
paddw mm0, round_values // + 128
|
||
|
|
||
|
paddw mm2, round_values // + 128
|
||
|
psrlw mm0, 8
|
||
|
|
||
|
psrlw mm2, 8
|
||
|
packuswb mm0, mm2 // des [1]
|
||
|
|
||
|
movq QWORD ptr [rsi+rcx], mm0 // write des[1]
|
||
|
movq mm0, [rsi+rcx*2] // mm0 = src[2]
|
||
|
|
||
|
|
||
|
|
||
|
// mm1, mm3 --- Src[1]
|
||
|
// mm0 --- Src[2]
|
||
|
// mm7 for unpacking
|
||
|
|
||
|
movq mm4, mm1 // b low
|
||
|
pmullw mm1, four_fifths // b * 4/5 low
|
||
|
|
||
|
movq QWORD ptr [rdi+rcx], mm0 // write des[4]
|
||
|
|
||
|
movq mm5, mm3 // b high
|
||
|
pmullw mm3, four_fifths // b * 4/5 high
|
||
|
|
||
|
movq mm2, mm0 // c
|
||
|
pmullw mm4, one_fifth // b * 1/5
|
||
|
|
||
|
punpcklbw mm0, mm7 // c low
|
||
|
pmullw mm5, one_fifth // b * 1/5
|
||
|
|
||
|
movq mm6, mm0 // make copy of c low
|
||
|
punpckhbw mm2, mm7 // c high
|
||
|
|
||
|
pmullw mm6, one_fifth // c * 1/5 low
|
||
|
movq mm7, mm2 // make copy of c high
|
||
|
|
||
|
pmullw mm7, one_fifth // c * 1/5 high
|
||
|
paddw mm1, mm6 // b * 4/5 + c * 1/5 low
|
||
|
|
||
|
paddw mm3, mm7 // b * 4/5 + c * 1/5 high
|
||
|
movq mm6, mm0 // make copy of c low
|
||
|
|
||
|
pmullw mm6, four_fifths // c * 4/5 low
|
||
|
movq mm7, mm2 // make copy of c high
|
||
|
|
||
|
pmullw mm7, four_fifths // c * 4/5 high
|
||
|
|
||
|
paddw mm4, mm6 // b * 1/5 + c * 4/5 low
|
||
|
paddw mm5, mm7 // b * 1/5 + c * 4/5 high
|
||
|
|
||
|
paddw mm1, round_values // + 128
|
||
|
paddw mm3, round_values // + 128
|
||
|
|
||
|
psrlw mm1, 8
|
||
|
psrlw mm3, 8
|
||
|
|
||
|
packuswb mm1, mm3 // des[2]
|
||
|
movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
|
||
|
|
||
|
paddw mm4, round_values // + 128
|
||
|
paddw mm5, round_values // + 128
|
||
|
|
||
|
psrlw mm4, 8
|
||
|
psrlw mm5, 8
|
||
|
|
||
|
packuswb mm4, mm5 // des[3]
|
||
|
movq QWORD ptr [rdi], mm4 // write des[3]
|
||
|
|
||
|
// mm0, mm2 --- Src[3]
|
||
|
|
||
|
add rdi, 8
|
||
|
add rsi, 8
|
||
|
|
||
|
sub rdx, 8
|
||
|
jg last_vs_3_5_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : vertical_band_1_2_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 1 to 2 up-scaling of a band of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has an "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void vertical_band_1_2_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
vs_1_2_loop:
|
||
|
|
||
|
movq mm0, [rsi] // get Src[0]
|
||
|
movq mm1, [rsi + rcx * 2] // get Src[1]
|
||
|
|
||
|
movq mm2, mm0 // make copy before unpack
|
||
|
movq mm3, mm1 // make copy before unpack
|
||
|
|
||
|
punpcklbw mm0, mm7 // low Src[0]
|
||
|
movq mm6, four_ones // mm6= 1, 1, 1, 1
|
||
|
|
||
|
punpcklbw mm1, mm7 // low Src[1]
|
||
|
paddw mm0, mm1 // low (a + b)
|
||
|
|
||
|
punpckhbw mm2, mm7 // high Src[0]
|
||
|
paddw mm0, mm6 // low (a + b + 1)
|
||
|
|
||
|
punpckhbw mm3, mm7
|
||
|
paddw mm2, mm3 // high (a + b )
|
||
|
|
||
|
psraw mm0, 1 // low (a + b +1 )/2
|
||
|
paddw mm2, mm6 // high (a + b + 1)
|
||
|
|
||
|
psraw mm2, 1 // high (a + b + 1)/2
|
||
|
packuswb mm0, mm2 // pack results
|
||
|
|
||
|
movq [rsi+rcx], mm0 // write out eight bytes
|
||
|
add rsi, 8
|
||
|
|
||
|
sub rdx, 8
|
||
|
jg vs_1_2_loop
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : last_vertical_band_1_2_scale_mmx
|
||
|
*
|
||
|
* INPUTS : unsigned char *dest :
|
||
|
* unsigned int dest_pitch :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 1 to 2 up-scaling of band of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : The routine uses the first line of the band below
|
||
|
* the current band. The function also has an "C" only
|
||
|
* version.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void last_vertical_band_1_2_scale_mmx
|
||
|
(
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, dest // Get the source and destination pointer
|
||
|
mov ecx, dest_pitch // Get the pitch size
|
||
|
|
||
|
mov edx, dest_width // Loop counter
|
||
|
|
||
|
last_vs_1_2_loop:
|
||
|
|
||
|
movq mm0, [rsi] // get Src[0]
|
||
|
movq [rsi+rcx], mm0 // write out eight bytes
|
||
|
|
||
|
add rsi, 8
|
||
|
sub rdx, 8
|
||
|
|
||
|
jg last_vs_1_2_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : horizontal_line_1_2_scale
|
||
|
*
|
||
|
* INPUTS : const unsigned char *source :
|
||
|
* unsigned int source_width :
|
||
|
* unsigned char *dest :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : None.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void horizontal_line_1_2_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
(void) dest_width;
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, source
|
||
|
mov rdi, dest
|
||
|
|
||
|
pxor mm7, mm7
|
||
|
movq mm6, four_ones
|
||
|
|
||
|
mov ecx, source_width
|
||
|
|
||
|
hs_1_2_loop:
|
||
|
|
||
|
movq mm0, [rsi]
|
||
|
movq mm1, [rsi+1]
|
||
|
|
||
|
movq mm2, mm0
|
||
|
movq mm3, mm1
|
||
|
|
||
|
movq mm4, mm0
|
||
|
punpcklbw mm0, mm7
|
||
|
|
||
|
punpcklbw mm1, mm7
|
||
|
paddw mm0, mm1
|
||
|
|
||
|
paddw mm0, mm6
|
||
|
punpckhbw mm2, mm7
|
||
|
|
||
|
punpckhbw mm3, mm7
|
||
|
paddw mm2, mm3
|
||
|
|
||
|
paddw mm2, mm6
|
||
|
psraw mm0, 1
|
||
|
|
||
|
psraw mm2, 1
|
||
|
packuswb mm0, mm2
|
||
|
|
||
|
movq mm2, mm4
|
||
|
punpcklbw mm2, mm0
|
||
|
|
||
|
movq [rdi], mm2
|
||
|
punpckhbw mm4, mm0
|
||
|
|
||
|
movq [rdi+8], mm4
|
||
|
add rsi, 8
|
||
|
|
||
|
add rdi, 16
|
||
|
sub rcx, 8
|
||
|
|
||
|
cmp rcx, 8
|
||
|
jg hs_1_2_loop
|
||
|
|
||
|
// last eight pixel
|
||
|
|
||
|
movq mm0, [rsi]
|
||
|
movq mm1, mm0
|
||
|
|
||
|
movq mm2, mm0
|
||
|
movq mm3, mm1
|
||
|
|
||
|
psrlq mm1, 8
|
||
|
psrlq mm3, 56
|
||
|
|
||
|
psllq mm3, 56
|
||
|
por mm1, mm3
|
||
|
|
||
|
movq mm3, mm1
|
||
|
movq mm4, mm0
|
||
|
|
||
|
punpcklbw mm0, mm7
|
||
|
punpcklbw mm1, mm7
|
||
|
|
||
|
paddw mm0, mm1
|
||
|
paddw mm0, mm6
|
||
|
|
||
|
punpckhbw mm2, mm7
|
||
|
punpckhbw mm3, mm7
|
||
|
|
||
|
paddw mm2, mm3
|
||
|
paddw mm2, mm6
|
||
|
|
||
|
psraw mm0, 1
|
||
|
psraw mm2, 1
|
||
|
|
||
|
packuswb mm0, mm2
|
||
|
movq mm2, mm4
|
||
|
|
||
|
punpcklbw mm2, mm0
|
||
|
movq [rdi], mm2
|
||
|
|
||
|
punpckhbw mm4, mm0
|
||
|
movq [rdi+8], mm4
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
|
||
|
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
|
||
|
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : horizontal_line_5_4_scale_mmx
|
||
|
*
|
||
|
* INPUTS : const unsigned char *source : Pointer to source data.
|
||
|
* unsigned int source_width : Stride of source.
|
||
|
* unsigned char *dest : Pointer to destination data.
|
||
|
* unsigned int dest_width : Stride of destination (NOT USED).
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : Copies horizontal line of pixels from source to
|
||
|
* destination scaling up by 4 to 5.
|
||
|
*
|
||
|
* SPECIAL NOTES : None.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void horizontal_line_5_4_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
/*
|
||
|
unsigned i;
|
||
|
unsigned int a, b, c, d, e;
|
||
|
unsigned char *des = dest;
|
||
|
const unsigned char *src = source;
|
||
|
|
||
|
(void) dest_width;
|
||
|
|
||
|
for ( i=0; i<source_width; i+=5 )
|
||
|
{
|
||
|
a = src[0];
|
||
|
b = src[1];
|
||
|
c = src[2];
|
||
|
d = src[3];
|
||
|
e = src[4];
|
||
|
|
||
|
des[0] = a;
|
||
|
des[1] = ((b*192 + c* 64 + 128)>>8);
|
||
|
des[2] = ((c*128 + d*128 + 128)>>8);
|
||
|
des[3] = ((d* 64 + e*192 + 128)>>8);
|
||
|
|
||
|
src += 5;
|
||
|
des += 4;
|
||
|
}
|
||
|
*/
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, source ;
|
||
|
mov rdi, dest ;
|
||
|
|
||
|
mov ecx, source_width ;
|
||
|
movq mm5, const54_1 ;
|
||
|
|
||
|
pxor mm7, mm7 ;
|
||
|
movq mm6, const54_2 ;
|
||
|
|
||
|
movq mm4, round_values ;
|
||
|
lea rdx, [rsi+rcx] ;
|
||
|
horizontal_line_5_4_loop:
|
||
|
|
||
|
movq mm0, QWORD PTR [rsi] ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
movq mm1, mm0 ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
|
||
|
psrlq mm0, 8 ;
|
||
|
01 02 03 04 05 06 07 xx
|
||
|
punpcklbw mm1, mm7 ;
|
||
|
xx 00 xx 01 xx 02 xx 03
|
||
|
|
||
|
punpcklbw mm0, mm7 ;
|
||
|
xx 01 xx 02 xx 03 xx 04
|
||
|
pmullw mm1, mm5
|
||
|
|
||
|
pmullw mm0, mm6
|
||
|
add rsi, 5
|
||
|
|
||
|
add rdi, 4
|
||
|
paddw mm1, mm0
|
||
|
|
||
|
paddw mm1, mm4
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
cmp rsi, rdx
|
||
|
packuswb mm1, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi-4], mm1
|
||
|
|
||
|
jl horizontal_line_5_4_loop
|
||
|
|
||
|
}
|
||
|
|
||
|
}
|
||
|
__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
|
||
|
__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
|
||
|
__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
|
||
|
|
||
|
static
|
||
|
void vertical_band_5_4_scale_mmx
|
||
|
(
|
||
|
unsigned char *source,
|
||
|
unsigned int src_pitch,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, source // Get the source and destination pointer
|
||
|
mov ecx, src_pitch // Get the pitch size
|
||
|
|
||
|
mov rdi, dest // tow lines below
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
|
||
|
mov edx, dest_pitch // Loop counter
|
||
|
mov ebx, dest_width
|
||
|
|
||
|
vs_5_4_loop:
|
||
|
|
||
|
movd mm0, DWORD ptr [rsi] // src[0];
|
||
|
movd mm1, DWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movd mm2, DWORD ptr [rsi+rcx*2]
|
||
|
lea rax, [rsi+rcx*2] //
|
||
|
|
||
|
punpcklbw mm1, mm7
|
||
|
punpcklbw mm2, mm7
|
||
|
|
||
|
movq mm3, mm2
|
||
|
pmullw mm1, three_fourths
|
||
|
|
||
|
pmullw mm2, one_fourths
|
||
|
movd mm4, [rax+rcx]
|
||
|
|
||
|
pmullw mm3, two_fourths
|
||
|
punpcklbw mm4, mm7
|
||
|
|
||
|
movq mm5, mm4
|
||
|
pmullw mm4, two_fourths
|
||
|
|
||
|
paddw mm1, mm2
|
||
|
movd mm6, [rax+rcx*2]
|
||
|
|
||
|
pmullw mm5, one_fourths
|
||
|
paddw mm1, round_values;
|
||
|
|
||
|
paddw mm3, mm4
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
punpcklbw mm6, mm7
|
||
|
paddw mm3, round_values
|
||
|
|
||
|
pmullw mm6, three_fourths
|
||
|
psrlw mm3, 8
|
||
|
|
||
|
packuswb mm1, mm7
|
||
|
packuswb mm3, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi], mm0
|
||
|
movd DWORD PTR [rdi+rdx], mm1
|
||
|
|
||
|
|
||
|
paddw mm5, mm6
|
||
|
movd DWORD PTR [rdi+rdx*2], mm3
|
||
|
|
||
|
lea rax, [rdi+rdx*2]
|
||
|
paddw mm5, round_values
|
||
|
|
||
|
psrlw mm5, 8
|
||
|
add rdi, 4
|
||
|
|
||
|
packuswb mm5, mm7
|
||
|
movd DWORD PTR [rax+rdx], mm5
|
||
|
|
||
|
add rsi, 4
|
||
|
sub rbx, 4
|
||
|
|
||
|
jg vs_5_4_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
|
||
|
__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
|
||
|
|
||
|
|
||
|
static
|
||
|
void horizontal_line_5_3_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, source ;
|
||
|
mov rdi, dest ;
|
||
|
|
||
|
mov ecx, source_width ;
|
||
|
movq mm5, const53_1 ;
|
||
|
|
||
|
pxor mm7, mm7 ;
|
||
|
movq mm6, const53_2 ;
|
||
|
|
||
|
movq mm4, round_values ;
|
||
|
lea rdx, [rsi+rcx-5] ;
|
||
|
horizontal_line_5_3_loop:
|
||
|
|
||
|
movq mm0, QWORD PTR [rsi] ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
movq mm1, mm0 ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
|
||
|
psllw mm0, 8 ;
|
||
|
xx 00 xx 02 xx 04 xx 06
|
||
|
psrlw mm1, 8 ;
|
||
|
01 xx 03 xx 05 xx 07 xx
|
||
|
|
||
|
psrlw mm0, 8 ;
|
||
|
00 xx 02 xx 04 xx 06 xx
|
||
|
psllq mm1, 16 ;
|
||
|
xx xx 01 xx 03 xx 05 xx
|
||
|
|
||
|
pmullw mm0, mm6
|
||
|
|
||
|
pmullw mm1, mm5
|
||
|
add rsi, 5
|
||
|
|
||
|
add rdi, 3
|
||
|
paddw mm1, mm0
|
||
|
|
||
|
paddw mm1, mm4
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
cmp rsi, rdx
|
||
|
packuswb mm1, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi-3], mm1
|
||
|
jl horizontal_line_5_3_loop
|
||
|
|
||
|
//exit condition
|
||
|
movq mm0, QWORD PTR [rsi] ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
movq mm1, mm0 ;
|
||
|
00 01 02 03 04 05 06 07
|
||
|
|
||
|
psllw mm0, 8 ;
|
||
|
xx 00 xx 02 xx 04 xx 06
|
||
|
psrlw mm1, 8 ;
|
||
|
01 xx 03 xx 05 xx 07 xx
|
||
|
|
||
|
psrlw mm0, 8 ;
|
||
|
00 xx 02 xx 04 xx 06 xx
|
||
|
psllq mm1, 16 ;
|
||
|
xx xx 01 xx 03 xx 05 xx
|
||
|
|
||
|
pmullw mm0, mm6
|
||
|
|
||
|
pmullw mm1, mm5
|
||
|
paddw mm1, mm0
|
||
|
|
||
|
paddw mm1, mm4
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
packuswb mm1, mm7
|
||
|
movd rax, mm1
|
||
|
|
||
|
mov rdx, rax
|
||
|
shr rdx, 16
|
||
|
|
||
|
mov WORD PTR[rdi], ax
|
||
|
mov BYTE PTR[rdi+2], dl
|
||
|
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
|
||
|
__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
|
||
|
|
||
|
static
|
||
|
void vertical_band_5_3_scale_mmx
|
||
|
(
|
||
|
unsigned char *source,
|
||
|
unsigned int src_pitch,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
|
||
|
mov rsi, source // Get the source and destination pointer
|
||
|
mov ecx, src_pitch // Get the pitch size
|
||
|
|
||
|
mov rdi, dest // tow lines below
|
||
|
pxor mm7, mm7 // clear out mm7
|
||
|
|
||
|
mov edx, dest_pitch // Loop counter
|
||
|
movq mm5, one_thirds
|
||
|
|
||
|
movq mm6, two_thirds
|
||
|
mov ebx, dest_width;
|
||
|
|
||
|
vs_5_3_loop:
|
||
|
|
||
|
movd mm0, DWORD ptr [rsi] // src[0];
|
||
|
movd mm1, DWORD ptr [rsi+rcx] // src[1];
|
||
|
|
||
|
movd mm2, DWORD ptr [rsi+rcx*2]
|
||
|
lea rax, [rsi+rcx*2] //
|
||
|
|
||
|
punpcklbw mm1, mm7
|
||
|
punpcklbw mm2, mm7
|
||
|
|
||
|
pmullw mm1, mm5
|
||
|
pmullw mm2, mm6
|
||
|
|
||
|
movd mm3, DWORD ptr [rax+rcx]
|
||
|
movd mm4, DWORD ptr [rax+rcx*2]
|
||
|
|
||
|
punpcklbw mm3, mm7
|
||
|
punpcklbw mm4, mm7
|
||
|
|
||
|
pmullw mm3, mm6
|
||
|
pmullw mm4, mm5
|
||
|
|
||
|
|
||
|
movd DWORD PTR [rdi], mm0
|
||
|
paddw mm1, mm2
|
||
|
|
||
|
paddw mm1, round_values
|
||
|
psrlw mm1, 8
|
||
|
|
||
|
packuswb mm1, mm7
|
||
|
paddw mm3, mm4
|
||
|
|
||
|
paddw mm3, round_values
|
||
|
movd DWORD PTR [rdi+rdx], mm1
|
||
|
|
||
|
psrlw mm3, 8
|
||
|
packuswb mm3, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi+rdx*2], mm3
|
||
|
|
||
|
|
||
|
add rdi, 4
|
||
|
add rsi, 4
|
||
|
|
||
|
sub rbx, 4
|
||
|
jg vs_5_3_loop
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/****************************************************************************
|
||
|
*
|
||
|
* ROUTINE : horizontal_line_2_1_scale
|
||
|
*
|
||
|
* INPUTS : const unsigned char *source :
|
||
|
* unsigned int source_width :
|
||
|
* unsigned char *dest :
|
||
|
* unsigned int dest_width :
|
||
|
*
|
||
|
* OUTPUTS : None.
|
||
|
*
|
||
|
* RETURNS : void
|
||
|
*
|
||
|
* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
|
||
|
*
|
||
|
* SPECIAL NOTES : None.
|
||
|
*
|
||
|
****************************************************************************/
|
||
|
static
|
||
|
void horizontal_line_2_1_scale_mmx
|
||
|
(
|
||
|
const unsigned char *source,
|
||
|
unsigned int source_width,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
(void) dest_width;
|
||
|
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, source
|
||
|
mov rdi, dest
|
||
|
|
||
|
pxor mm7, mm7
|
||
|
mov ecx, dest_width
|
||
|
|
||
|
xor rdx, rdx
|
||
|
hs_2_1_loop:
|
||
|
|
||
|
movq mm0, [rsi+rdx*2]
|
||
|
psllw mm0, 8
|
||
|
|
||
|
psrlw mm0, 8
|
||
|
packuswb mm0, mm7
|
||
|
|
||
|
movd DWORD Ptr [rdi+rdx], mm0;
|
||
|
add rdx, 4
|
||
|
|
||
|
cmp rdx, rcx
|
||
|
jl hs_2_1_loop
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static
|
||
|
void vertical_band_2_1_scale_mmx
|
||
|
(
|
||
|
unsigned char *source,
|
||
|
unsigned int src_pitch,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width)
|
||
|
{
|
||
|
vpx_memcpy(dest, source, dest_width);
|
||
|
}
|
||
|
|
||
|
|
||
|
__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
|
||
|
__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
|
||
|
|
||
|
static
|
||
|
void vertical_band_2_1_scale_i_mmx
|
||
|
(
|
||
|
unsigned char *source,
|
||
|
unsigned int src_pitch,
|
||
|
unsigned char *dest,
|
||
|
unsigned int dest_pitch,
|
||
|
unsigned int dest_width
|
||
|
)
|
||
|
{
|
||
|
__asm
|
||
|
{
|
||
|
mov rsi, source
|
||
|
mov rdi, dest
|
||
|
|
||
|
mov eax, src_pitch
|
||
|
mov edx, dest_width
|
||
|
|
||
|
pxor mm7, mm7
|
||
|
sub rsi, rax //back one line
|
||
|
|
||
|
|
||
|
lea rcx, [rsi+rdx];
|
||
|
movq mm6, round_values;
|
||
|
|
||
|
movq mm5, three_sixteenths;
|
||
|
movq mm4, ten_sixteenths;
|
||
|
|
||
|
vs_2_1_i_loop:
|
||
|
movd mm0, [rsi] //
|
||
|
movd mm1, [rsi+rax] //
|
||
|
|
||
|
movd mm2, [rsi+rax*2] //
|
||
|
punpcklbw mm0, mm7
|
||
|
|
||
|
pmullw mm0, mm5
|
||
|
punpcklbw mm1, mm7
|
||
|
|
||
|
pmullw mm1, mm4
|
||
|
punpcklbw mm2, mm7
|
||
|
|
||
|
pmullw mm2, mm5
|
||
|
paddw mm0, round_values
|
||
|
|
||
|
paddw mm1, mm2
|
||
|
paddw mm0, mm1
|
||
|
|
||
|
psrlw mm0, 8
|
||
|
packuswb mm0, mm7
|
||
|
|
||
|
movd DWORD PTR [rdi], mm0
|
||
|
add rsi, 4
|
||
|
|
||
|
add rdi, 4;
|
||
|
cmp rsi, rcx
|
||
|
jl vs_2_1_i_loop
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
void
|
||
|
register_mmxscalers(void)
|
||
|
{
|
||
|
vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
|
||
|
vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
|
||
|
vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
|
||
|
vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
|
||
|
vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
|
||
|
vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
|
||
|
vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
|
||
|
vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
|
||
|
vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
|
||
|
|
||
|
vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
|
||
|
vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
|
||
|
vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
|
||
|
vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
|
||
|
vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
|
||
|
vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
|
||
|
vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
|
||
|
}
|