Merge pull request #162 from Vproject/yasm
Allow yasm to be used instead of nasm.
This commit is contained in:
commit
a50d54f8e8
@ -392,7 +392,7 @@ UpdateMbMv_sse2:
|
||||
;mov eax, [esp+4] ; mv_buffer
|
||||
;movd xmm0, [esp+8] ; _mv
|
||||
movd xmm0, r1d ; _mv
|
||||
pshufd xmm1, xmm0, $0
|
||||
pshufd xmm1, xmm0, $00
|
||||
movdqa [r0 ], xmm1
|
||||
movdqa [r0+0x10], xmm1
|
||||
movdqa [r0+0x20], xmm1
|
||||
|
@ -101,7 +101,7 @@
|
||||
punpcklwd %1, %2
|
||||
punpckhwd %3, %4
|
||||
punpcklwd %1, %3
|
||||
psraw %1, $4
|
||||
psraw %1, $04
|
||||
%endmacro
|
||||
|
||||
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
|
||||
@ -129,7 +129,7 @@
|
||||
paddw %2, %4 ; block 2, 3
|
||||
phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
|
||||
phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
|
||||
psraw %1, $4
|
||||
psraw %1, $04
|
||||
%endmacro
|
||||
|
||||
|
||||
@ -178,12 +178,12 @@ AnalysisVaaInfoIntra_sse2:
|
||||
|
||||
|
||||
mov r2,r1
|
||||
sal r2,$1 ;r2 = 2*iLineSize
|
||||
sal r2,$01 ;r2 = 2*iLineSize
|
||||
mov r3,r2
|
||||
add r3,r1 ;r3 = 3*iLineSize
|
||||
|
||||
mov r4,r2
|
||||
sal r4,$1 ;r4 = 4*iLineSize
|
||||
sal r4,$01 ;r4 = 4*iLineSize
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
@ -231,7 +231,7 @@ AnalysisVaaInfoIntra_sse2:
|
||||
and r2, 0ffffh ; keep only the low word (truncate to 16 bits)
|
||||
mov r3, r2
|
||||
imul r2, r3
|
||||
sar r2, $4
|
||||
sar r2, $04
|
||||
movd retrd, xmm1
|
||||
sub retrd, r2d
|
||||
|
||||
@ -273,12 +273,12 @@ AnalysisVaaInfoIntra_ssse3:
|
||||
|
||||
|
||||
mov r2,r1
|
||||
sal r2,$1 ;r2 = 2*iLineSize
|
||||
sal r2,$01 ;r2 = 2*iLineSize
|
||||
mov r3,r2
|
||||
add r3,r1 ;r3 = 3*iLineSize
|
||||
|
||||
mov r4,r2
|
||||
sal r4,$1 ;r4 = 4*iLineSize
|
||||
sal r4,$01 ;r4 = 4*iLineSize
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
@ -327,7 +327,7 @@ AnalysisVaaInfoIntra_ssse3:
|
||||
and r2, 0ffffh ; keep only the low word (truncate to 16 bits)
|
||||
mov r3, r2
|
||||
imul r2, r3
|
||||
sar r2, $4
|
||||
sar r2, $04
|
||||
movd retrd, xmm1
|
||||
sub retrd, r2d
|
||||
|
||||
|
@ -86,17 +86,17 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
|
||||
|
||||
%macro MMX_SumSubMul2 3
|
||||
movq %3, %1
|
||||
psllw %1, $1
|
||||
psllw %1, $01
|
||||
paddw %1, %2
|
||||
psllw %2, $1
|
||||
psllw %2, $01
|
||||
psubw %3, %2
|
||||
%endmacro
|
||||
|
||||
%macro MMX_SumSubDiv2 3
|
||||
movq %3, %2
|
||||
psraw %3, $1
|
||||
psraw %3, $01
|
||||
paddw %3, %1
|
||||
psraw %1, $1
|
||||
psraw %1, $01
|
||||
psubw %1, %2
|
||||
%endmacro
|
||||
|
||||
@ -124,7 +124,7 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
|
||||
movd %2, %6
|
||||
punpcklbw %2, %4
|
||||
paddw %1, %3
|
||||
psraw %1, $6
|
||||
psraw %1, $06
|
||||
paddsw %1, %2
|
||||
packuswb %1, %2
|
||||
movd %5, %1
|
||||
@ -255,15 +255,15 @@ WelsIDctT4Rec_mmx:
|
||||
%macro SSE2_SumSubDiv2 4
|
||||
movdqa %4, %1
|
||||
movdqa %3, %2
|
||||
psraw %2, $1
|
||||
psraw %4, $1
|
||||
psraw %2, $01
|
||||
psraw %4, $01
|
||||
paddw %1, %2
|
||||
psubw %4, %3
|
||||
%endmacro
|
||||
|
||||
%macro SSE2_StoreDiff8p 6
|
||||
paddw %1, %3
|
||||
psraw %1, $6
|
||||
psraw %1, $06
|
||||
movq %2, %6
|
||||
punpcklbw %2, %4
|
||||
paddsw %2, %1
|
||||
@ -282,7 +282,7 @@ WelsIDctT4Rec_mmx:
|
||||
%macro SSE2_Load8DC 6
|
||||
movdqa %1, %6 ; %1 = dc0 dc1
|
||||
paddw %1, %5
|
||||
psraw %1, $6 ; (dc + 32) >> 6
|
||||
psraw %1, $06 ; (dc + 32) >> 6
|
||||
|
||||
movdqa %2, %1
|
||||
psrldq %2, 4
|
||||
|
@ -94,13 +94,13 @@ DyadicBilinearDownsamplerWidthx32_sse:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 32 bytes
|
||||
.xloops:
|
||||
@ -247,13 +247,13 @@ DyadicBilinearDownsamplerWidthx16_sse:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 16 bytes
|
||||
.xloops:
|
||||
@ -351,13 +351,13 @@ DyadicBilinearDownsamplerWidthx8_sse:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
|
||||
sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 8 bytes
|
||||
.xloops:
|
||||
@ -442,16 +442,16 @@ DyadicBilinearDownsamplerWidthx32_ssse3:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 32 bytes
|
||||
.xloops:
|
||||
@ -553,15 +553,15 @@ DyadicBilinearDownsamplerWidthx16_ssse3:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 16 bytes
|
||||
.xloops:
|
||||
@ -643,16 +643,16 @@ DyadicBilinearDownsamplerWidthx32_sse4:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 32 bytes
|
||||
.xloops:
|
||||
@ -753,15 +753,15 @@ DyadicBilinearDownsamplerWidthx16_sse4:
|
||||
mov ecx, [esp+36] ; iSrcStride
|
||||
mov ebp, [esp+44] ; iSrcHeight
|
||||
|
||||
sar ebp, $1 ; iSrcHeight >> 1
|
||||
sar ebp, $01 ; iSrcHeight >> 1
|
||||
movdqa xmm7, [shufb_mask_low] ; mask low
|
||||
movdqa xmm6, [shufb_mask_high] ; mask high
|
||||
|
||||
.yloops:
|
||||
mov eax, [esp+40] ; iSrcWidth
|
||||
sar eax, $1 ; iSrcWidth >> 1
|
||||
sar eax, $01 ; iSrcWidth >> 1
|
||||
mov ebx, eax ; iDstWidth stored in ebx
|
||||
sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
|
||||
neg ebx ; - (iSrcWidth >> 1)
|
||||
; each loop = source bandwidth: 16 bytes
|
||||
.xloops:
|
||||
|
@ -121,7 +121,7 @@
|
||||
punpcklwd %1, %2
|
||||
punpckhwd %3, %4
|
||||
punpcklwd %1, %3
|
||||
psraw %1, $4
|
||||
psraw %1, $04
|
||||
%endmacro
|
||||
|
||||
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
|
||||
@ -149,7 +149,7 @@
|
||||
paddw %2, %4 ; block 2, 3
|
||||
phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
|
||||
phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
|
||||
psraw %1, $4
|
||||
psraw %1, $04
|
||||
%endmacro
|
||||
|
||||
%macro WELS_SAD_16x2_SSE2 0
|
||||
@ -353,7 +353,7 @@ rc_sad_frame_sse2:
|
||||
pxor xmm0, xmm0
|
||||
.hloop:
|
||||
mov eax, ebx
|
||||
mov ebp, $0
|
||||
mov ebp, $00
|
||||
.wloop:
|
||||
movdqa xmm1, [esi+ebp]
|
||||
movdqa xmm2, [edi+ebp]
|
||||
|
Loading…
Reference in New Issue
Block a user