Sindre Aamås 3f31aff4dc [Encoder] Add an SSE4.2 implementation of CavlcParamCal
Use a combination of table lookups and pshufb to convert coefficients
to zero run/level format. Two 16-entry lookup tables are used for a
total of 192 bytes worth of tables. (The existing SSE2 version uses a
table of size 2048 bytes.)

Speedup is ~1.5x-3x as compared with the SSE2 version on Haswell (the
speedup is greater for input with many trailing zeros).

The use of popcnt makes it require SSE4.2. This can be replaced with
a small LUT and accumulation which would reduce the requirement to
SSSE3.
2016-04-20 18:37:08 +02:00

669 lines
19 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2010-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* memzero.asm
;*
;* Abstract
;* cavlc
;*
;* History
;* 09/08/2010 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .rodata align=16
align 16
; pshufb control vector that reverses the byte order of an XMM register:
; output byte i is taken from input byte 15 - i.
; CavlcParamCal_sse42 applies it right before pmovmskb, so that bit 0 of the
; resulting mask corresponds to byte 15 of the packed coefficients.
wels_shufb_rev:
db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; 4-bit table giving number of preceding zeros for each set bit as well as the
; eventual next bit. For the case where all 4 bits are set, this requires 5
; zeros. The 5th zero can either be read from beyond the final table entry or
; implied via zero-initializing the location being read into.
;
; Layout: 16 entries of 4 bytes, indexed by a 4-bit mask (entry offset is
; 4 * mask). Walking the mask from bit 0 upwards, byte k of an entry is the
; number of clear bits directly below the k-th set bit; the byte that follows
; the last set bit is the number of clear bits above it, i.e. the start of the
; run that continues into the next 4-bit group. Remaining bytes are 0.
; CavlcParamCal_sse42 loads each entry as one dword and relies on the second
; option above: it zero-initializes the run buffer instead of reading a 5th
; byte.
wels_cavlc_param_cal_run_lut:
db 4, 0, 0, 0 ; mask 0000b
db 0, 3, 0, 0 ; mask 0001b
db 1, 2, 0, 0 ; mask 0010b
db 0, 0, 2, 0 ; mask 0011b
db 2, 1, 0, 0 ; mask 0100b
db 0, 1, 1, 0 ; mask 0101b
db 1, 0, 1, 0 ; mask 0110b
db 0, 0, 0, 1 ; mask 0111b
db 3, 0, 0, 0 ; mask 1000b
db 0, 2, 0, 0 ; mask 1001b
db 1, 1, 0, 0 ; mask 1010b
db 0, 0, 1, 0 ; mask 1011b
db 2, 0, 0, 0 ; mask 1100b
db 0, 1, 0, 0 ; mask 1101b
db 1, 0, 0, 0 ; mask 1110b
db 0, 0, 0, 0 ; mask 1111b (the implied 5th byte would also be 0)
; db 0
; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
; the words that match zero bits and concatenating in reverse order.
;
; Layout: 16 entries of 8 bytes, indexed by a 4-bit mask (entry offset is
; 8 * mask). Mask bit 0 selects word 3 (bytes 6, 7) and mask bit 3 selects
; word 0 (bytes 0, 1); the selected words are emitted starting with the one
; for the lowest set bit. Unused trailing slots hold index 0, which makes
; pshufb copy source byte 0 there rather than zero the slot, so only the first
; popcount(mask) output words are meaningful.
wels_cavlc_param_cal_shufb_lut:
db 0, 0, 0, 0, 0, 0, 0, 0 ; mask 0000b: no words
db 6, 7, 0, 0, 0, 0, 0, 0 ; mask 0001b: word 3
db 4, 5, 0, 0, 0, 0, 0, 0 ; mask 0010b: word 2
db 6, 7, 4, 5, 0, 0, 0, 0 ; mask 0011b: words 3, 2
db 2, 3, 0, 0, 0, 0, 0, 0 ; mask 0100b: word 1
db 6, 7, 2, 3, 0, 0, 0, 0 ; mask 0101b: words 3, 1
db 4, 5, 2, 3, 0, 0, 0, 0 ; mask 0110b: words 2, 1
db 6, 7, 4, 5, 2, 3, 0, 0 ; mask 0111b: words 3, 2, 1
db 0, 1, 0, 0, 0, 0, 0, 0 ; mask 1000b: word 0
db 6, 7, 0, 1, 0, 0, 0, 0 ; mask 1001b: words 3, 0
db 4, 5, 0, 1, 0, 0, 0, 0 ; mask 1010b: words 2, 0
db 6, 7, 4, 5, 0, 1, 0, 0 ; mask 1011b: words 3, 2, 0
db 2, 3, 0, 1, 0, 0, 0, 0 ; mask 1100b: words 1, 0
db 6, 7, 2, 3, 0, 1, 0, 0 ; mask 1101b: words 3, 1, 0
db 4, 5, 2, 3, 0, 1, 0, 0 ; mask 1110b: words 2, 1, 0
db 6, 7, 4, 5, 2, 3, 0, 1 ; mask 1111b: words 3, 2, 1, 0
%ifdef X86_32
; Eight bytes of value 8. CavlcParamCal_sse2 adds this to the bit positions
; found in the upper half of the 16-bit non-zero mask so that they become
; coefficient indices 8..15.
align 16
sse2_b8:
    times 8 db 8

; Byte mask that clears byte 7 of the low quadword and keeps everything else.
; Byte 7 of a byte_1pos_table entry holds the bit count, which
; CavlcParamCal_sse2 strips with this mask before using the entry as a list of
; positions.
align 16
sse2_b_1:
    times 7 db -1
    db 0
    times 8 db -1
; byte_1pos_table: 256 entries of 8 bytes, indexed by an 8-bit mask
; (entry offset is 8 * mask).
;   bytes 0..6 : positions of the set bits of the mask, highest position
;                first, padded with zeros
;   byte  7    : number of set bits in the mask
; Only seven position slots exist. For mask 0xFF the eighth position (bit 0)
; is displaced by the count byte, which is why CavlcParamCal_sse2 special-cases
; a count of 8.
; Examples: mask 0x05 -> 2,0,0,0,0,0,0,2    mask 0xFF -> 7,6,5,4,3,2,1,8
;
; The table is generated by the preprocessor instead of being spelled out;
; the emitted bytes are the same as the hand-written 256-row table.
align 16
byte_1pos_table:
%assign b1p_mask 0
%rep 256
    %assign b1p_count 0
    %assign b1p_bit 7
    %rep 8
        %if (b1p_mask >> b1p_bit) & 1
            %if b1p_count < 7
    db b1p_bit                          ; next set-bit position, descending
            %endif
            %assign b1p_count b1p_count + 1
        %endif
        %assign b1p_bit b1p_bit - 1
    %endrep
    %if b1p_count < 7
    times (7 - b1p_count) db 0          ; pad the unused position slots
    %endif
    db b1p_count                        ; byte 7: population count of the mask
    %assign b1p_mask b1p_mask + 1
%endrep
%undef b1p_mask
%undef b1p_count
%undef b1p_bit
%endif ; X86_32
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
%ifdef X86_32
;***********************************************************************
; int32_t CavlcParamCal_sse2(int16_t* coffLevel, uint8_t* run, int16_t* Level,
;                            int32_t* total_coeffs, int32_t endIdx);
;
; 32-bit x86 only; arguments are read from the stack (cdecl-style offsets).
; WELS_EXTERN comes from asm_inc.asm.
;
; In:   coffLevel    coefficients. 8 bytes are read when endIdx == 3, otherwise
;                    32 bytes with aligned loads (must be 16-byte aligned).
;       run          receives 16 bytes via an aligned store (16-byte aligned).
;       Level        receives the non-zero coefficients, highest index first.
;       total_coeffs receives the number of non-zero coefficients.
;       endIdx       only compared against 3 to select the load width.
; Out:  eax = (index of the highest non-zero coefficient + 1) - total_coeffs,
;       or 0 when every coefficient is zero.
; Notes:
;   * A coefficient counts as non-zero if its signed-saturated byte is
;     non-zero (packsswb never turns a non-zero word into zero).
;   * run[i] is computed byte-wise as pos[i] - pos[i+1] - 1, where pos[] are
;     the non-zero positions in descending order followed by zero bytes. The
;     entry of the last level and all entries after it are by-products of
;     that formula, not real run lengths.
;   * When all coefficients are zero, run and Level are left untouched.
; Clobbers: ecx, edx, xmm0, xmm1, xmm3-xmm7, flags.
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
    push        ebx
    push        edi
    push        esi
    mov         eax, [esp+16]               ; coffLevel
    mov         edi, [esp+24]               ; Level
    mov         ebx, [esp+32]               ; endIdx
    cmp         ebx, 3
    jne         .Level16
    pxor        xmm1, xmm1                  ; only 4 coefficients: upper 12 are zero
    movq        xmm0, [eax]
    jmp         .Cal_begin
.Level16:
    movdqa      xmm0, [eax]
    movdqa      xmm1, [eax+16]
.Cal_begin:
    ; edx = 16-bit mask, bit i set when coefficient i is non-zero.
    packsswb    xmm0, xmm1
    movdqa      xmm4, xmm0
    pxor        xmm3, xmm3
    pcmpgtb     xmm0, xmm3                  ; bytes > 0
    pcmpgtb     xmm3, xmm4                  ; bytes < 0
    por         xmm0, xmm3
    pmovmskb    edx, xmm0
    test        edx, edx
    jz near     .all_zero
    movdqa      xmm6, [sse2_b_1]            ; mask that strips the count byte
    pcmpeqw     xmm7, xmm7                  ; all bytes = -1

    ; ---- Coefficients 8..15 (high byte of the mask) ----
    movzx       ebx, dh
    lea         ebx, [byte_1pos_table+8*ebx]
    movq        xmm0, [ebx]
    pextrw      ecx, xmm0, 3
    shr         ecx, 8                      ; ecx = number of set bits in the high byte
    mov         dh, cl                      ; keep the count; the high mask is no longer needed
    ; With all eight bits set, the table only lists seven positions (the count
    ; byte takes the eighth slot). Copy seven here and coefficient 8 below.
    cmp         ecx, 8
    jne         .loopHighFind0
    dec         ecx
.loopHighFind0:
    test        ecx, ecx
    jz          .loopHighFind0End
    movzx       esi, byte [ebx]             ; bit position within the high byte
    movzx       esi, word [eax+2*esi+16]    ; coefficient 8 + position
    mov         [edi], si
    add         edi, 2
    inc         ebx
    dec         ecx
    jmp         .loopHighFind0
.loopHighFind0End:
    mov         cl, dh                      ; ecx = high count (ecx was 0 after the loop)
    cmp         cl, 8
    pand        xmm0, xmm6                  ; positions only; pand leaves the flags of cmp intact
    jne         .LowByteFind0
    movzx       esi, word [eax+16]          ; eighth high coefficient is index 8
    mov         [edi], si
    add         edi, 2

    ; ---- Coefficients 0..7 (low byte of the mask) ----
.LowByteFind0:
    and         edx, 0xff
    lea         ebx, [byte_1pos_table+8*edx]
    movq        xmm1, [ebx]
    pextrw      esi, xmm1, 3
    or          esi, 0xff
    or          ecx, 0xff00
    and         ecx, esi                    ; cl = high count, ch = low count
    shr         esi, 8                      ; esi = low count
    pand        xmm1, xmm6                  ; positions only
.loopLowFind0:
    test        esi, esi
    jz          .loopLowFind0End
    movzx       edx, byte [ebx]             ; bit position = coefficient index
    movzx       edx, word [eax+2*edx]
    mov         [edi], dx
    add         edi, 2
    inc         ebx
    dec         esi
    jmp         .loopLowFind0
.loopLowFind0End:
    ; With all eight low bits set, the eighth iteration used the count byte (8)
    ; as a position and stored coefficient 8; replace it with coefficient 0.
    cmp         ch, 8
    jne         .getLevelEnd
    sub         edi, 2
    movzx       edx, word [eax]
    mov         [edi], dx
.getLevelEnd:
    mov         edx, [esp+28]               ; total_coeffs
    movzx       ebx, cl                     ; ebx = high count
    add         cl, ch                      ; cl = total number of non-zero coefficients
    movzx       ecx, cl
    mov         [edx], ecx                  ; store the full int32_t, not just its low byte

    ; ---- Runs ----
    ; Build one 16-byte list of positions in descending order: the high
    ; positions (+8) followed directly by the low positions.
    movq        xmm5, [sse2_b8]
    paddb       xmm0, xmm5                  ; high positions -> indices 8..15 (padding becomes 8)
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         eax, 8
    sub         eax, ebx
    shl         eax, 3                      ; eax = 8 * (8 - high count)
    shl         ebx, 3                      ; ebx = 8 * high count
    pinsrw      xmm2, ebx, 0
    pinsrw      xmm3, eax, 0
    psllq       xmm0, xmm3                  ; shift left then right to clear the
    psrlq       xmm0, xmm3                  ; padding bytes above the high positions
    movdqa      xmm4, xmm1
    psllq       xmm1, xmm2                  ; low positions moved up past the high ones
    psrlq       xmm4, xmm3                  ; part of the low positions that spills over
    punpcklqdq  xmm1, xmm4                  ; ... goes into the upper quadword
    por         xmm0, xmm1                  ; xmm0 = all positions, descending

    pextrw      eax, xmm0, 0
    and         eax, 0xff                   ; highest non-zero position
    inc         eax
    sub         al, cl                      ; return value: (highest position + 1) - total_coeffs

    movdqa      xmm1, xmm0
    paddb       xmm1, xmm7                  ; pos[i] - 1
    psrldq      xmm0, 1                     ; pos[i+1]
    psubb       xmm1, xmm0                  ; pos[i] - 1 - pos[i+1]

    mov         ecx, [esp+20]               ; run
    movdqa      [ecx], xmm1
.return:
    pop         esi
    pop         edi
    pop         ebx
    ret

.all_zero:
    ; No non-zero coefficient: report zero coefficients and zero total zeros
    ; instead of returning whatever eax happened to hold.
    mov         edx, [esp+28]               ; total_coeffs
    xor         eax, eax
    mov         [edx], eax
    jmp         .return
%endif
;***********************************************************************
; int32_t CavlcParamCal_sse42(int16_t* coffLevel, uint8_t* run, int16_t* Level,
;                             int32_t* total_coeffs, int32_t endIdx);
;
; Converts a block of coefficients to level / zero-run form using a 16-bit
; non-zero mask, popcnt/bsf and two 16-entry lookup tables.
;
; r0..r6, retrq, LOAD_5_PARA, PUSH_XMM and their *_POP counterparts come from
; asm_inc.asm (not shown here); the register each rN maps to depends on the
; target ABI, which is why the shift-count register is resolved below.
;
; In:   coffLevel    coefficients. 8 bytes are read when endIdx <= 3, otherwise
;                    32 bytes (16-byte aligned). Input beyond endIdx is assumed
;                    to be zero.
;       run          zero-initialized with a 16-byte aligned store, then filled
;                    with zero-run counts.
;       Level        receives the non-zero coefficients, highest index first.
;                    Stores are always 8 bytes wide, so up to three words past
;                    the last level may be written.
;       total_coeffs receives the number of non-zero coefficients.
; Out:  16 - total_coeffs - (number of zero coefficients above the highest
;       non-zero one), or 0 when every coefficient is zero.
; NOTE(review): processing stops at the quadruple holding the lowest-index
; non-zero coefficient, so the run entry of the final level only counts zeros
; inside that quadruple. Callers presumably derive that run from the returned
; total - confirm against the caller.
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse42
%define p_coeff_level r0
%define p_run r1
%define p_level r2
%define p_total_coeffs r3
%define i_endidxd r4d
%ifdef X86_32
    push            r5
    push            r6
    %assign push_num 2
    %define r_mask r5
    %define r_maskd r5d
    ; 32-bit: tables are addressed by symbol, no base register needed.
    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
    %define p_run_lut wels_cavlc_param_cal_run_lut
%elifdef WIN64
    push            rbx
    %assign push_num 1
    %define r_mask rbx
    %define r_maskd ebx
    ; 64-bit: one base register serves both tables; the run table is reached
    ; through its constant distance from the shuffle table.
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%else
    %assign push_num 0
    %define r_mask rax
    %define r_maskd eax
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%endif
    LOAD_5_PARA
    PUSH_XMM 2
; Free up rcx/ecx because only cl is accepted as shift amount operand.
; Whichever argument lives in rcx/ecx is moved to r6 and the freed register
; becomes the scratch register r_tmp.
%ifidni r0b, cl
    mov             r6, r0
    %undef p_coeff_level
    %define p_coeff_level r6
    %define r_tmp r0
    %define r_tmpd r0d
    %define r_tmpb r0b
%elifidni r1b, cl
    mov             r6, r1
    %undef p_run
    %define p_run r6
    %define r_tmp r1
    %define r_tmpd r1d
    %define r_tmpb r1b
%elifidni r3b, cl
    mov             r6, r3
    %undef p_total_coeffs
    %define p_total_coeffs r6
    %define r_tmp r3
    %define r_tmpd r3d
    %define r_tmpb r3b
%else
    %error "Unknown cl register."
%endif
; Acquire a bitmask indicating which words are non-zero.
; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
; Assumptions are taken from previous implementations.
    pxor            xmm1, xmm1
    cmp             i_endidxd, 3
    jg              .load16
    movq            xmm0, [p_coeff_level]
    packsswb        xmm0, xmm1
    jmp             .load_done
.load16:
    movdqa          xmm0, [p_coeff_level]
    packsswb        xmm0, [p_coeff_level + 16]
.load_done:
    movdqa          [p_run], xmm1                   ; Zero-initialize because we may read back implied zeros.
    pcmpeqb         xmm0, xmm1                      ; 0FFh where the coefficient is zero.
    pshufb          xmm0, [wels_shufb_rev]          ; Reverse so that mask bit 0 is coefficient 15.
    pmovmskb        r_maskd, xmm0
    xor             r_maskd, 0FFFFh                 ; Invert: set bits now mark non-zero coefficients.
; endIdx is not needed past this point; its register becomes a second scratch.
%undef i_endidxd
%define r_tmp2 r4
%define r_tmp2d r4d
    popcnt          r_tmp2d, r_maskd                ; Number of non-zero coefficients; ZF set if none.
    mov             [p_total_coeffs], r_tmp2d
    ; Recycle the p_total_coeffs register as the return-value accumulator.
%xdefine i_total_zeros p_total_coeffs
%undef p_total_coeffs
    mov             i_total_zeros, r_tmp2           ; 0 on the early exit; mov keeps popcnt's ZF.
    jz              .done
    mov             i_total_zeros, 16
    sub             i_total_zeros, r_tmp2
    bsf             r_tmpd, r_maskd                 ; Find first set bit.
    sub             i_total_zeros, r_tmp            ; 16 - non-zero count - skipped high-index zeros.
    ; Skip trailing zeros.
    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
    and             r_tmpd, -4
    shr             r_maskd, r_tmpb
    add             r_tmpd, r_tmpd                  ; Skipped coefficients -> skipped bytes.
    sub             p_coeff_level, r_tmp
    ; Handle first quadruple containing a non-zero value.
    mov             r_tmp, r_mask
    and             r_tmpd, 0Fh
    movq            xmm0, [p_coeff_level + 24]
    movq            xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb          xmm0, xmm1
    mov             r_tmp2d, [p_run_lut + 4 * r_tmp]
    shr             r_tmp2d, 8                      ; Skip initial zero run.
    movlps          [p_level], xmm0                 ; Store levels for the first quadruple.
    mov             [p_run], r_tmp2d                ; Store accompanying zero runs thus far.
    shr             r_maskd, 4
    jz              .done
.loop:
    ; Increment pointers.
    popcnt          r_tmpd, r_tmpd                  ; Number of non-zero values handled.
    lea             p_level, [p_level + 2 * r_tmp]
    add             p_run, r_tmp
    ; Handle next quadruple.
    mov             r_tmp, r_mask
    and             r_tmpd, 0Fh
    movq            xmm0, [p_coeff_level + 16]
    sub             p_coeff_level, 8
    movq            xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb          xmm0, xmm1
    movzx           r_tmp2d, byte [p_run - 1]
    add             r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
    movlps          [p_level], xmm0                 ; Store levels (potentially none).
    mov             [p_run - 1], r_tmp2d            ; Update previous run and store eventual new runs.
    shr             r_maskd, 4
    jnz             .loop
.done:
    mov             retrq, i_total_zeros
    POP_XMM
    LOAD_5_PARA_POP
%ifdef X86_32
    pop             r6
    pop             r5
%elifdef WIN64
    pop             rbx
%endif
    ret
%undef p_coeff_level
%undef p_run
%undef p_level
%undef i_total_zeros
%undef r_mask
%undef r_maskd
%undef r_tmp
%undef r_tmpd
%undef r_tmpb
%undef r_tmp2
%undef r_tmp2d
%undef p_shufb_lut
%undef p_run_lut