/* vpx/vp8/common/x86/postproc_mmx.c */

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <math.h>
#include <stdlib.h>
#include "vpx_scale/yv12config.h"
#include "pragmas.h"
/* Sum of the five non-zero Blur taps (16 + 16 + 64 + 16 + 16).  Not referenced
 * by the code visible in this file. */
#define VP8_FILTER_WEIGHT 128
/* Right shift applied to the tap-weighted sum in the MMX down/across filter. */
#define VP8_FILTER_SHIFT 7
/* static constants */
/* Tap table for vp8_post_proc_down_and_across_mmx.  Each row of eight shorts is
 * 16 bytes; the routine loads four words from byte offsets 0, 16, 32, 48 and 64,
 * i.e. the taps for positions -2, -1, 0, +1 and +2.  The last (all-zero) row is
 * not read by the code visible in this file. */
__declspec(align(16))
const static short Blur[48] =
{
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
64, 64, 64, 64, 64, 64, 64, 64,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
0, 0, 0, 0, 0, 0, 0, 0,
};
/* rd: 64 in each of four words, the rounding term added before the
 * ">> VP8_FILTER_SHIFT" in the MMX filter. */
#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
/* rd42: 4 in each of eight words, the rounding term added before the ">> 3"
 * in the XMM filter.  It is used as a 16-byte memory operand. */
#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
/* Without RELOCATEABLE the two constants are file-scope statics; with it, the
 * filter functions expand RD / R4D2 themselves to get function-local copies.
 * Both macros already end in ';', so the extra ';' below is an empty
 * declaration. */
#ifndef RELOCATEABLE
const static RD;
const static R4D2;
#endif
/* external references */
/* vp8_gaussian is only called from the disabled (#if 0) noise routines below. */
extern double vp8_gaussian(double sigma, double mu, double x);
/* Table of shorts added to (sum + pixel) before the ">> 4" in the mbpost "down"
 * filters; indexed with (row & 127). */
extern short vp8_rv[];
/* Maps the caller-supplied flimit to the threshold used by the mbpost filters. */
extern int vp8_q2mbl(int x) ;
void vp8_post_proc_down_and_across_mmx
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
)
{
#ifdef RELOCATEABLE
RD
R4D2
#endif
__asm
{
push ebx
lea ebx, Blur
movd mm2, flimit
punpcklwd mm2, mm2
punpckldq mm2, mm2
mov esi, src_ptr
mov edi, dst_ptr
mov ecx, DWORD PTR rows
mov eax, src_pixels_per_line ;
destination pitch?
pxor mm0, mm0 ;
mm0 = 00000000
nextrow:
xor edx, edx ;
clear out edx for use as loop counter
nextcol:
pxor mm7, mm7 ;
mm7 = 00000000
movq mm6, [ebx + 32 ] ;
mm6 = kernel 2 taps
movq mm3, [esi] ;
mm4 = r0 p0..p7
punpcklbw mm3, mm0 ;
mm3 = p0..p3
movq mm1, mm3 ;
mm1 = p0..p3
pmullw mm3, mm6 ;
mm3 *= kernel 2 modifiers
movq mm6, [ebx + 48] ;
mm6 = kernel 3 taps
movq mm5, [esi + eax] ;
mm4 = r1 p0..p7
punpcklbw mm5, mm0 ;
mm5 = r1 p0..p3
pmullw mm6, mm5 ;
mm6 *= p0..p3 * kernel 3 modifiers
paddusw mm3, mm6 ;
mm3 += mm6
;
thresholding
movq mm7, mm1 ;
mm7 = r0 p0..p3
psubusw mm7, mm5 ;
mm7 = r0 p0..p3 - r1 p0..p3
psubusw mm5, mm1 ;
mm5 = r1 p0..p3 - r0 p0..p3
paddusw mm7, mm5 ;
mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw mm7, mm2
movq mm6, [ebx + 64 ] ;
mm6 = kernel 4 modifiers
movq mm5, [esi + 2*eax] ;
mm4 = r2 p0..p7
punpcklbw mm5, mm0 ;
mm5 = r2 p0..p3
pmullw mm6, mm5 ;
mm5 *= kernel 4 modifiers
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = r0 p0..p3
psubusw mm6, mm5 ;
mm6 = r0 p0..p3 - r2 p0..p3
psubusw mm5, mm1 ;
mm5 = r2 p0..p3 - r2 p0..p3
paddusw mm6, mm5 ;
mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
neg eax
movq mm6, [ebx ] ;
kernel 0 taps
movq mm5, [esi+2*eax] ;
mm4 = r-2 p0..p7
punpcklbw mm5, mm0 ;
mm5 = r-2 p0..p3
pmullw mm6, mm5 ;
mm5 *= kernel 0 modifiers
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = r0 p0..p3
psubusw mm6, mm5 ;
mm6 = p0..p3 - r-2 p0..p3
psubusw mm5, mm1 ;
mm5 = r-2 p0..p3 - p0..p3
paddusw mm6, mm5 ;
mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
movq mm6, [ebx + 16] ;
kernel 1 taps
movq mm4, [esi+eax] ;
mm4 = r-1 p0..p7
punpcklbw mm4, mm0 ;
mm4 = r-1 p0..p3
pmullw mm6, mm4 ;
mm4 *= kernel 1 modifiers.
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = r0 p0..p3
psubusw mm6, mm4 ;
mm6 = p0..p3 - r-2 p0..p3
psubusw mm4, mm1 ;
mm5 = r-1 p0..p3 - p0..p3
paddusw mm6, mm4 ;
mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
paddusw mm3, rd ;
mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ;
mm3 /= 128
pand mm1, mm7 ;
mm1 select vals > thresh from source
pandn mm7, mm3 ;
mm7 select vals < thresh from blurred result
paddusw mm1, mm7 ;
combination
packuswb mm1, mm0 ;
pack to bytes
movd [edi], mm1 ;
neg eax ;
pitch is positive
add esi, 4
add edi, 4
add edx, 4
cmp edx, cols
jl nextcol
// done with the all cols, start the across filtering in place
sub esi, edx
sub edi, edx
push eax
xor edx, edx
mov eax, [edi-4];
acrossnextcol:
pxor mm7, mm7 ;
mm7 = 00000000
movq mm6, [ebx + 32 ] ;
movq mm4, [edi+edx] ;
mm4 = p0..p7
movq mm3, mm4 ;
mm3 = p0..p7
punpcklbw mm3, mm0 ;
mm3 = p0..p3
movq mm1, mm3 ;
mm1 = p0..p3
pmullw mm3, mm6 ;
mm3 *= kernel 2 modifiers
movq mm6, [ebx + 48]
psrlq mm4, 8 ;
mm4 = p1..p7
movq mm5, mm4 ;
mm5 = p1..p7
punpcklbw mm5, mm0 ;
mm5 = p1..p4
pmullw mm6, mm5 ;
mm6 *= p1..p4 * kernel 3 modifiers
paddusw mm3, mm6 ;
mm3 += mm6
;
thresholding
movq mm7, mm1 ;
mm7 = p0..p3
psubusw mm7, mm5 ;
mm7 = p0..p3 - p1..p4
psubusw mm5, mm1 ;
mm5 = p1..p4 - p0..p3
paddusw mm7, mm5 ;
mm7 = abs(p0..p3 - p1..p4)
pcmpgtw mm7, mm2
movq mm6, [ebx + 64 ]
psrlq mm4, 8 ;
mm4 = p2..p7
movq mm5, mm4 ;
mm5 = p2..p7
punpcklbw mm5, mm0 ;
mm5 = p2..p5
pmullw mm6, mm5 ;
mm5 *= kernel 4 modifiers
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = p0..p3
psubusw mm6, mm5 ;
mm6 = p0..p3 - p1..p4
psubusw mm5, mm1 ;
mm5 = p1..p4 - p0..p3
paddusw mm6, mm5 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
movq mm6, [ebx ]
movq mm4, [edi+edx-2] ;
mm4 = p-2..p5
movq mm5, mm4 ;
mm5 = p-2..p5
punpcklbw mm5, mm0 ;
mm5 = p-2..p1
pmullw mm6, mm5 ;
mm5 *= kernel 0 modifiers
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = p0..p3
psubusw mm6, mm5 ;
mm6 = p0..p3 - p1..p4
psubusw mm5, mm1 ;
mm5 = p1..p4 - p0..p3
paddusw mm6, mm5 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
movq mm6, [ebx + 16]
psrlq mm4, 8 ;
mm4 = p-1..p5
punpcklbw mm4, mm0 ;
mm4 = p-1..p2
pmullw mm6, mm4 ;
mm4 *= kernel 1 modifiers.
paddusw mm3, mm6 ;
mm3 += mm5
;
thresholding
movq mm6, mm1 ;
mm6 = p0..p3
psubusw mm6, mm4 ;
mm6 = p0..p3 - p1..p4
psubusw mm4, mm1 ;
mm5 = p1..p4 - p0..p3
paddusw mm6, mm4 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ;
accumulate thresholds
paddusw mm3, rd ;
mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ;
mm3 /= 128
pand mm1, mm7 ;
mm1 select vals > thresh from source
pandn mm7, mm3 ;
mm7 select vals < thresh from blurred result
paddusw mm1, mm7 ;
combination
packuswb mm1, mm0 ;
pack to bytes
mov DWORD PTR [edi+edx-4], eax ;
store previous four bytes
movd eax, mm1
add edx, 4
cmp edx, cols
jl acrossnextcol;
mov DWORD PTR [edi+edx-4], eax
pop eax
// done with this rwo
add esi, eax ;
next line
mov eax, dst_pixels_per_line ;
destination pitch?
add edi, eax ;
next destination
mov eax, src_pixels_per_line ;
destination pitch?
dec ecx ;
decrement count
jnz nextrow ;
next row
pop ebx
}
}
void vp8_post_proc_down_and_across_xmm
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
)
{
#ifdef RELOCATEABLE
R4D2
#endif
__asm
{
movd xmm2, flimit
punpcklwd xmm2, xmm2
punpckldq xmm2, xmm2
punpcklqdq xmm2, xmm2
mov esi, src_ptr
mov edi, dst_ptr
mov ecx, DWORD PTR rows
mov eax, src_pixels_per_line ;
destination pitch?
pxor xmm0, xmm0 ;
mm0 = 00000000
nextrow:
xor edx, edx ;
clear out edx for use as loop counter
nextcol:
movq xmm3, QWORD PTR [esi] ;
mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ;
mm3 = p0..p3
movdqa xmm1, xmm3 ;
mm1 = p0..p3
psllw xmm3, 2 ;
movq xmm5, QWORD PTR [esi + eax] ;
mm4 = r1 p0..p7
punpcklbw xmm5, xmm0 ;
mm5 = r1 p0..p3
paddusw xmm3, xmm5 ;
mm3 += mm6
;
thresholding
movdqa xmm7, xmm1 ;
mm7 = r0 p0..p3
psubusw xmm7, xmm5 ;
mm7 = r0 p0..p3 - r1 p0..p3
psubusw xmm5, xmm1 ;
mm5 = r1 p0..p3 - r0 p0..p3
paddusw xmm7, xmm5 ;
mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw xmm7, xmm2
movq xmm5, QWORD PTR [esi + 2*eax] ;
mm4 = r2 p0..p7
punpcklbw xmm5, xmm0 ;
mm5 = r2 p0..p3
paddusw xmm3, xmm5 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = r0 p0..p3
psubusw xmm6, xmm5 ;
mm6 = r0 p0..p3 - r2 p0..p3
psubusw xmm5, xmm1 ;
mm5 = r2 p0..p3 - r2 p0..p3
paddusw xmm6, xmm5 ;
mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
neg eax
movq xmm5, QWORD PTR [esi+2*eax] ;
mm4 = r-2 p0..p7
punpcklbw xmm5, xmm0 ;
mm5 = r-2 p0..p3
paddusw xmm3, xmm5 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = r0 p0..p3
psubusw xmm6, xmm5 ;
mm6 = p0..p3 - r-2 p0..p3
psubusw xmm5, xmm1 ;
mm5 = r-2 p0..p3 - p0..p3
paddusw xmm6, xmm5 ;
mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
movq xmm4, QWORD PTR [esi+eax] ;
mm4 = r-1 p0..p7
punpcklbw xmm4, xmm0 ;
mm4 = r-1 p0..p3
paddusw xmm3, xmm4 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = r0 p0..p3
psubusw xmm6, xmm4 ;
mm6 = p0..p3 - r-2 p0..p3
psubusw xmm4, xmm1 ;
mm5 = r-1 p0..p3 - p0..p3
paddusw xmm6, xmm4 ;
mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
paddusw xmm3, rd42 ;
mm3 += round value
psraw xmm3, 3 ;
mm3 /= 8
pand xmm1, xmm7 ;
mm1 select vals > thresh from source
pandn xmm7, xmm3 ;
mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ;
combination
packuswb xmm1, xmm0 ;
pack to bytes
movq QWORD PTR [edi], xmm1 ;
neg eax ;
pitch is positive
add esi, 8
add edi, 8
add edx, 8
cmp edx, cols
jl nextcol
// done with the all cols, start the across filtering in place
sub esi, edx
sub edi, edx
xor edx, edx
movq mm0, QWORD PTR [edi-8];
acrossnextcol:
movq xmm7, QWORD PTR [edi +edx -2]
movd xmm4, DWORD PTR [edi +edx +6]
pslldq xmm4, 8
por xmm4, xmm7
movdqa xmm3, xmm4
psrldq xmm3, 2
punpcklbw xmm3, xmm0 ;
mm3 = p0..p3
movdqa xmm1, xmm3 ;
mm1 = p0..p3
psllw xmm3, 2
movdqa xmm5, xmm4
psrldq xmm5, 3
punpcklbw xmm5, xmm0 ;
mm5 = p1..p4
paddusw xmm3, xmm5 ;
mm3 += mm6
;
thresholding
movdqa xmm7, xmm1 ;
mm7 = p0..p3
psubusw xmm7, xmm5 ;
mm7 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ;
mm5 = p1..p4 - p0..p3
paddusw xmm7, xmm5 ;
mm7 = abs(p0..p3 - p1..p4)
pcmpgtw xmm7, xmm2
movdqa xmm5, xmm4
psrldq xmm5, 4
punpcklbw xmm5, xmm0 ;
mm5 = p2..p5
paddusw xmm3, xmm5 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = p0..p3
psubusw xmm6, xmm5 ;
mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ;
mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
movdqa xmm5, xmm4 ;
mm5 = p-2..p5
punpcklbw xmm5, xmm0 ;
mm5 = p-2..p1
paddusw xmm3, xmm5 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = p0..p3
psubusw xmm6, xmm5 ;
mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ;
mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
psrldq xmm4, 1 ;
mm4 = p-1..p5
punpcklbw xmm4, xmm0 ;
mm4 = p-1..p2
paddusw xmm3, xmm4 ;
mm3 += mm5
;
thresholding
movdqa xmm6, xmm1 ;
mm6 = p0..p3
psubusw xmm6, xmm4 ;
mm6 = p0..p3 - p1..p4
psubusw xmm4, xmm1 ;
mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm4 ;
mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ;
accumulate thresholds
paddusw xmm3, rd42 ;
mm3 += round value
psraw xmm3, 3 ;
mm3 /= 8
pand xmm1, xmm7 ;
mm1 select vals > thresh from source
pandn xmm7, xmm3 ;
mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ;
combination
packuswb xmm1, xmm0 ;
pack to bytes
movq QWORD PTR [edi+edx-8], mm0 ;
store previous four bytes
movdq2q mm0, xmm1
add edx, 8
cmp edx, cols
jl acrossnextcol;
// last 8 pixels
movq QWORD PTR [edi+edx-8], mm0
// done with this rwo
add esi, eax ;
next line
mov eax, dst_pixels_per_line ;
destination pitch?
add edi, eax ;
next destination
mov eax, src_pixels_per_line ;
destination pitch?
dec ecx ;
decrement count
jnz nextrow ;
next row
}
}
void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
int c, i;
__declspec(align(16))
int flimit2[2];
__declspec(align(16))
unsigned char d[16][8];
flimit = vp8_q2mbl(flimit);
for (i = 0; i < 2; i++)
flimit2[i] = flimit;
rows += 8;
for (c = 0; c < cols; c += 4)
{
unsigned char *s = &dst[c];
__asm
{
mov esi, s ;
pxor mm0, mm0 ;
mov eax, pitch ;
neg eax // eax = -pitch
lea esi, [esi + eax*8]; // edi = s[-pitch*8]
neg eax
pxor mm5, mm5
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov edi, esi
mov ecx, 15 ;
loop_initvar:
movd mm1, DWORD PTR [edi];
punpcklbw mm1, mm0 ;
paddw mm5, mm1 ;
pmullw mm1, mm1 ;
movq mm2, mm1 ;
punpcklwd mm1, mm0 ;
punpckhwd mm2, mm0 ;
paddd mm6, mm1 ;
paddd mm7, mm2 ;
lea edi, [edi+eax] ;
dec ecx
jne loop_initvar
//save the var and sum
xor edx, edx
loop_row:
movd mm1, DWORD PTR [esi] // [s-pitch*8]
movd mm2, DWORD PTR [edi] // [s+pitch*7]
punpcklbw mm1, mm0
punpcklbw mm2, mm0
paddw mm5, mm2
psubw mm5, mm1
pmullw mm2, mm2
movq mm4, mm2
punpcklwd mm2, mm0
punpckhwd mm4, mm0
paddd mm6, mm2
paddd mm7, mm4
pmullw mm1, mm1
movq mm2, mm1
punpcklwd mm1, mm0
psubd mm6, mm1
punpckhwd mm2, mm0
psubd mm7, mm2
movq mm3, mm6
pslld mm3, 4
psubd mm3, mm6
movq mm1, mm5
movq mm4, mm5
pmullw mm1, mm1
pmulhw mm4, mm4
movq mm2, mm1
punpcklwd mm1, mm4
punpckhwd mm2, mm4
movq mm4, mm7
pslld mm4, 4
psubd mm4, mm7
psubd mm3, mm1
psubd mm4, mm2
psubd mm3, flimit2
psubd mm4, flimit2
psrad mm3, 31
psrad mm4, 31
packssdw mm3, mm4
packsswb mm3, mm0
movd mm1, DWORD PTR [esi+eax*8]
movq mm2, mm1
punpcklbw mm1, mm0
paddw mm1, mm5
mov ecx, edx
and ecx, 127
movq mm4, vp8_rv[ecx*2]
paddw mm1, mm4
//paddw xmm1, eight8s
psraw mm1, 4
packuswb mm1, mm0
pand mm1, mm3
pandn mm3, mm2
por mm1, mm3
and ecx, 15
movd DWORD PTR d[ecx*4], mm1
mov ecx, edx
sub ecx, 8
and ecx, 15
movd mm1, DWORD PTR d[ecx*4]
movd [esi], mm1
lea esi, [esi+eax]
lea edi, [edi+eax]
add edx, 1
cmp edx, rows
jl loop_row
}
}
}
void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
int c, i;
__declspec(align(16))
int flimit4[4];
__declspec(align(16))
unsigned char d[16][8];
flimit = vp8_q2mbl(flimit);
for (i = 0; i < 4; i++)
flimit4[i] = flimit;
rows += 8;
for (c = 0; c < cols; c += 8)
{
unsigned char *s = &dst[c];
__asm
{
mov esi, s ;
pxor xmm0, xmm0 ;
mov eax, pitch ;
neg eax // eax = -pitch
lea esi, [esi + eax*8]; // edi = s[-pitch*8]
neg eax
pxor xmm5, xmm5
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
mov edi, esi
mov ecx, 15 ;
loop_initvar:
movq xmm1, QWORD PTR [edi];
punpcklbw xmm1, xmm0 ;
paddw xmm5, xmm1 ;
pmullw xmm1, xmm1 ;
movdqa xmm2, xmm1 ;
punpcklwd xmm1, xmm0 ;
punpckhwd xmm2, xmm0 ;
paddd xmm6, xmm1 ;
paddd xmm7, xmm2 ;
lea edi, [edi+eax] ;
dec ecx
jne loop_initvar
//save the var and sum
xor edx, edx
loop_row:
movq xmm1, QWORD PTR [esi] // [s-pitch*8]
movq xmm2, QWORD PTR [edi] // [s+pitch*7]
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
paddw xmm5, xmm2
psubw xmm5, xmm1
pmullw xmm2, xmm2
movdqa xmm4, xmm2
punpcklwd xmm2, xmm0
punpckhwd xmm4, xmm0
paddd xmm6, xmm2
paddd xmm7, xmm4
pmullw xmm1, xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
psubd xmm6, xmm1
punpckhwd xmm2, xmm0
psubd xmm7, xmm2
movdqa xmm3, xmm6
pslld xmm3, 4
psubd xmm3, xmm6
movdqa xmm1, xmm5
movdqa xmm4, xmm5
pmullw xmm1, xmm1
pmulhw xmm4, xmm4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm4
punpckhwd xmm2, xmm4
movdqa xmm4, xmm7
pslld xmm4, 4
psubd xmm4, xmm7
psubd xmm3, xmm1
psubd xmm4, xmm2
psubd xmm3, flimit4
psubd xmm4, flimit4
psrad xmm3, 31
psrad xmm4, 31
packssdw xmm3, xmm4
packsswb xmm3, xmm0
movq xmm1, QWORD PTR [esi+eax*8]
movq xmm2, xmm1
punpcklbw xmm1, xmm0
paddw xmm1, xmm5
mov ecx, edx
and ecx, 127
movdqu xmm4, vp8_rv[ecx*2]
paddw xmm1, xmm4
//paddw xmm1, eight8s
psraw xmm1, 4
packuswb xmm1, xmm0
pand xmm1, xmm3
pandn xmm3, xmm2
por xmm1, xmm3
and ecx, 15
movq QWORD PTR d[ecx*8], xmm1
mov ecx, edx
sub ecx, 8
and ecx, 15
movq mm0, d[ecx*8]
movq [esi], mm0
lea esi, [esi+eax]
lea edi, [edi+eax]
add edx, 1
cmp edx, rows
jl loop_row
}
}
}
#if 0
/****************************************************************************
 *
 * ROUTINE : plane_add_noise_wmt
 *
 * NOTE : this whole definition is compiled out by the enclosing "#if 0".
 *
 * INPUTS : unsigned char *Start  starting address of buffer to add gaussian
 *                                noise to
 *          unsigned int Width    width of plane
 *          unsigned int Height   height of plane
 *          int Pitch             distance between subsequent lines of frame
 *          int q                 quantizer used to determine amount of noise
 *                                to add
 *          int a                 added to sigma (see the sigma expression)
 *
 * OUTPUTS : None.
 *
 * RETURNS : void.
 *
 * FUNCTION : adds gaussian noise to a plane of pixels, 16 bytes per step
 *            (SSE2).  The noise table is rebuilt on every call.
 *
 * SPECIAL NOTES : Each row reads noise starting at a random offset 0..255
 *                 into the 2048-byte Rand[] table, 16 bytes at a time, so
 *                 the code as written reads past Rand[] for wide planes.
 *
 ****************************************************************************/
void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
unsigned int i;
__declspec(align(16)) unsigned char blackclamp[16];
__declspec(align(16)) unsigned char whiteclamp[16];
__declspec(align(16)) unsigned char bothclamp[16];
char char_dist[300];
char Rand[2048];
double sigma;
// return;
// clear any pending MMX state before the floating-point code below
__asm emms
sigma = a + .5 + .6 * (63 - q) / 63.0;
// set up a lookup table of 256 entries that matches
// a gaussian distribution with sigma determined by q.
//
{
// note: this block-local double i and the int a below shadow the outer i and a
double i;
int next, j;
next = 0;
for (i = -32; i < 32; i++)
{
double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
int a = (int)(g + .5);
if (a)
{
// value i occupies a consecutive entries of the table
for (j = 0; j < a; j++)
{
char_dist[next+j] = (char) i;
}
next = next + j;
}
}
// pad whatever is left of the first 256 entries with zero noise
for (next = next; next < 256; next++)
char_dist[next] = 0;
}
// draw 2048 noise samples from the distribution table
for (i = 0; i < 2048; i++)
{
Rand[i] = char_dist[rand() & 0xff];
}
// clamp amounts are derived from the first table entry, char_dist[0]
for (i = 0; i < 16; i++)
{
blackclamp[i] = -char_dist[0];
whiteclamp[i] = -char_dist[0];
bothclamp[i] = -2 * char_dist[0];
}
for (i = 0; i < Height; i++)
{
unsigned char *Pos = Start + i * Pitch;
// each row starts at a random offset 0..255 into the noise table
char *Ref = Rand + (rand() & 0xff);
__asm
{
mov ecx, [Width]
mov esi, Pos
mov edi, Ref
xor eax, eax
nextset:
movdqu xmm1, [esi+eax] // get the source
psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
paddusb xmm1, bothclamp
psubusb xmm1, whiteclamp
movdqu xmm2, [edi+eax] // get the noise for this line
paddb xmm1, xmm2 // add it in
movdqu [esi+eax], xmm1 // store the result
add eax, 16 // advance to the next 16 bytes of this line
cmp eax, ecx
jl nextset // signed compare; the body always runs at least once
}
}
}
#endif
/* Rounding term (8 per dword) added before the ">> 4" in
 * vp8_mbpost_proc_across_ip_xmm.  It is used as a 16-byte memory operand of
 * paddd, which is why it is declared 16-byte aligned. */
__declspec(align(16))
static const int four8s[4] = { 8, 8, 8, 8};
/*
 * vp8_mbpost_proc_across_ip_xmm
 *
 * Horizontal variance-gated smoothing, in place, four pixels per inner
 * iteration (SSE2 plus two MMX registers).  For each row a running sum and
 * sum of squares is kept over a 15-pixel window (columns -7..+7 around the
 * current pixel).  Where 15 * sumsq - sum * sum < threshold the pixel is
 * replaced by (pixel + sum + 8) >> 4, otherwise it is kept.
 *
 * Results pass through a two-stage delay line (mm1 -> mm0) and are stored
 * eight pixels behind the read position, so the window always reads
 * unfiltered pixels.  The column loop therefore runs to cols + 8.
 *
 * src     first pixel of the first row; s[-8 .. cols+17] is read per row
 * pitch   distance in bytes between rows
 * rows    number of rows to filter
 * cols    number of columns; handled in groups of 4
 * flimit  mapped through vp8_q2mbl() to the variance threshold
 *
 * NOTE(review): mm0 and mm1 start at zero, so the first two stores of every
 * row write zeros to s[-8..-1] (left of the row) - confirm callers treat
 * that area as scratch border.
 * mm0/mm1 are used but EMMS is not executed.
 * NOTE(review): presumably the caller clears the MMX state - confirm.
 */
void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
{
int r, i;
__declspec(align(16))
int flimit4[4];
unsigned char *s = src;
int sumsq;
int sum;
flimit = vp8_q2mbl(flimit);
/* the same threshold in all four dword lanes */
flimit4[0] =
flimit4[1] =
flimit4[2] =
flimit4[3] = flimit;
for (r = 0; r < rows; r++)
{
/* prime the window with s[-8..6]; the first asm step slides it to s[-7..7] */
sumsq = 0;
sum = 0;
for (i = -8; i <= 6; i++)
{
sumsq += s[i] * s[i];
sum += s[i];
}
__asm
{
mov eax, sumsq
movd xmm7, eax // xmm7 lane 0 = running sum of squares
mov eax, sum
movd xmm6, eax // xmm6 lane 0 = running sum
mov esi, s
xor ecx, ecx // ecx = column offset
mov edx, cols
add edx, 8 // run 8 columns past the end to drain the delay line
pxor mm0, mm0 // delay line stage 2 (stored next)
pxor mm1, mm1 // delay line stage 1
pxor xmm0, xmm0 // zero for unpack and pack
nextcol4:
movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
punpcklbw xmm1, xmm0 // expanding
punpcklbw xmm2, xmm0 // expanding
punpcklwd xmm1, xmm0 // expanding to dwords
punpcklwd xmm2, xmm0 // expanding to dwords
psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
pmaddwd xmm1, xmm2 // (entering + leaving) * (entering - leaving) = difference of squares
paddd xmm6, xmm2 // lane 0: sum + first delta
paddd xmm7, xmm1 // lane 0: sumsq + first delta
pshufd xmm6, xmm6, 0 // broadcast lane 0 to all four lanes
pshufd xmm7, xmm7, 0 // broadcast lane 0 to all four lanes
psrldq xmm1, 4 // 8--7 9--6 10--5 0000
psrldq xmm2, 4 // 8--7 9--6 10--5 0000
pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7
paddd xmm6, xmm4
paddd xmm7, xmm3
pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6
paddd xmm7, xmm3
paddd xmm6, xmm4
pshufd xmm3, xmm1, 10111111b // 0000 0000 0000 10--5 squared
pshufd xmm4, xmm2, 10111111b // 0000 0000 0000 10--5
paddd xmm7, xmm3 // lane k now holds the running sumsq for pixel k of this group
paddd xmm6, xmm4 // lane k now holds the running sum for pixel k of this group
movdqa xmm3, xmm6
pmaddwd xmm3, xmm3 // sum * sum
movdqa xmm5, xmm7
pslld xmm5, 4
psubd xmm5, xmm7 // 15 * sumsq
psubd xmm5, xmm3 // 15 * sumsq - sum * sum
psubd xmm5, flimit4
psrad xmm5, 31 // all ones where the value was below the threshold
packssdw xmm5, xmm0
packsswb xmm5, xmm0 // byte mask in the low 4 bytes
movd xmm1, DWORD PTR [esi+ecx] // current four pixels
movq xmm2, xmm1 // keep the unfiltered pixels
punpcklbw xmm1, xmm0
punpcklwd xmm1, xmm0
paddd xmm1, xmm6 // pixel + sum
paddd xmm1, four8s // + 8 for rounding
psrad xmm1, 4
packssdw xmm1, xmm0
packuswb xmm1, xmm0 // filtered candidate as bytes
pand xmm1, xmm5 // filtered where below threshold
pandn xmm5, xmm2 // original elsewhere
por xmm5, xmm1
movd [esi+ecx-8], mm0 // store the result computed two iterations ago
movq mm0, mm1 // advance the delay line
movdq2q mm1, xmm5 // newest result enters the delay line
psrldq xmm7, 12 // carry lane 3 totals into lane 0 for the next group
psrldq xmm6, 12
add ecx, 4
cmp ecx, edx
jl nextcol4
}
s += pitch;
}
}
#if 0
/****************************************************************************
 *
 * ROUTINE : plane_add_noise_mmx
 *
 * NOTE : this definition is in the "#if 0" branch and is compiled out.
 *
 * INPUTS : unsigned char *Start  starting address of buffer to add gaussian
 *                                noise to
 *          unsigned int Width    width of plane
 *          unsigned int Height   height of plane
 *          int Pitch             distance between subsequent lines of frame
 *          int q                 quantizer used to determine amount of noise
 *                                to add
 *          int a                 added to sigma (see the sigma expression)
 *
 * OUTPUTS : None.
 *
 * RETURNS : void.
 *
 * FUNCTION : adds gaussian noise to a plane of pixels, 8 bytes per step
 *            (MMX).  The noise table is rebuilt on every call.
 *
 * SPECIAL NOTES : Each row reads noise starting at a random offset 0..255
 *                 into the 2048-byte Rand[] table, 8 bytes at a time, so
 *                 the code as written reads past Rand[] for wide planes.
 *
 ****************************************************************************/
void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
unsigned int i;
// Pitch4, noise_amount and noise_adder are not used anywhere in this function
int Pitch4 = Pitch * 4;
const int noise_amount = 2;
const int noise_adder = 2 * noise_amount + 1;
__declspec(align(16)) unsigned char blackclamp[16];
__declspec(align(16)) unsigned char whiteclamp[16];
__declspec(align(16)) unsigned char bothclamp[16];
char char_dist[300];
char Rand[2048];
double sigma;
// clear any pending MMX state before the floating-point code below
__asm emms
sigma = a + .5 + .6 * (63 - q) / 63.0;
// set up a lookup table of 256 entries that matches
// a gaussian distribution with sigma determined by q.
//
{
// note: this block-local double i and the int a below shadow the outer i and a;
// sum is never used
double i, sum = 0;
int next, j;
next = 0;
for (i = -32; i < 32; i++)
{
int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
if (a)
{
// value i occupies a consecutive entries of the table
for (j = 0; j < a; j++)
{
char_dist[next+j] = (char) i;
}
next = next + j;
}
}
// pad whatever is left of the first 256 entries with zero noise
for (next = next; next < 256; next++)
char_dist[next] = 0;
}
// draw 2048 noise samples from the distribution table
for (i = 0; i < 2048; i++)
{
Rand[i] = char_dist[rand() & 0xff];
}
// clamp amounts are derived from the first table entry, char_dist[0]
for (i = 0; i < 16; i++)
{
blackclamp[i] = -char_dist[0];
whiteclamp[i] = -char_dist[0];
bothclamp[i] = -2 * char_dist[0];
}
for (i = 0; i < Height; i++)
{
unsigned char *Pos = Start + i * Pitch;
// each row starts at a random offset 0..255 into the noise table
char *Ref = Rand + (rand() & 0xff);
__asm
{
mov ecx, [Width]
mov esi, Pos
mov edi, Ref
xor eax, eax
nextset:
movq mm1, [esi+eax] // get the source
psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
paddusb mm1, bothclamp
psubusb mm1, whiteclamp
movq mm2, [edi+eax] // get the noise for this line
paddb mm1, mm2 // add it in
movq [esi+eax], mm1 // store the result
add eax, 8 // advance to the next 8 bytes of this line
cmp eax, ecx
jl nextset // signed compare; the body always runs at least once
}
}
}
#else
/* Noise tables defined elsewhere, indexed [a][q] by the noise routines below:
 * an[a][q] is a 3072-byte noise buffer, cd[a][q] the value the clamp
 * amounts are derived from. */
extern char an[8][64][3072];
extern int cd[8][64];
void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
unsigned int i;
__declspec(align(16)) unsigned char blackclamp[16];
__declspec(align(16)) unsigned char whiteclamp[16];
__declspec(align(16)) unsigned char bothclamp[16];
__asm emms
for (i = 0; i < 16; i++)
{
blackclamp[i] = -cd[a][q];
whiteclamp[i] = -cd[a][q];
bothclamp[i] = -2 * cd[a][q];
}
for (i = 0; i < Height; i++)
{
unsigned char *Pos = Start + i * Pitch;
char *Ref = an[a][q] + (rand() & 0xff);
__asm
{
mov ecx, [Width]
mov esi, Pos
mov edi, Ref
xor eax, eax
nextset:
movq mm1, [esi+eax] // get the source
psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
paddusb mm1, bothclamp
psubusb mm1, whiteclamp
movq mm2, [edi+eax] // get the noise for this line
paddb mm1, mm2 // add it in
movq [esi+eax], mm1 // store the result
add eax, 8 // move to the next line
cmp eax, ecx
jl nextset
}
}
}
void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
unsigned int i;
__declspec(align(16)) unsigned char blackclamp[16];
__declspec(align(16)) unsigned char whiteclamp[16];
__declspec(align(16)) unsigned char bothclamp[16];
__asm emms
for (i = 0; i < 16; i++)
{
blackclamp[i] = -cd[a][q];
whiteclamp[i] = -cd[a][q];
bothclamp[i] = -2 * cd[a][q];
}
for (i = 0; i < Height; i++)
{
unsigned char *Pos = Start + i * Pitch;
char *Ref = an[a][q] + (rand() & 0xff);
__asm
{
mov ecx, [Width]
mov esi, Pos
mov edi, Ref
xor eax, eax
nextset:
movdqu xmm1, [esi+eax] // get the source
psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
paddusb xmm1, bothclamp
psubusb xmm1, whiteclamp
movdqu xmm2, [edi+eax] // get the noise for this line
paddb xmm1, xmm2 // add it in
movdqu [esi+eax], xmm1 // store the result
add eax, 16 // move to the next line
cmp eax, ecx
jl nextset
}
}
}
#endif