Merge "minor spelling cleanup in comments"
This commit is contained in:
commit
ecf392a155
@@ -53,7 +53,7 @@ loop
    orr r6, r6, r7 ; differences of all 4 pixels
    ; calculate total sum
    adds r8, r8, r4 ; add positive differences to sum
-   subs r8, r8, r5 ; substract negative differences from sum
+   subs r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
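Note: the hunks in these variance routines all annotate the same computation. As a scalar C model (an illustrative sketch, not the shipped code), the loop keeps a signed sum of pixel differences and a sum of squared differences, from which the caller derives variance as sse - sum*sum/N:

    #include <stdint.h>

    /* Scalar model of the sum/sse accumulation annotated above. */
    static void variance_model(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, int *sum, unsigned int *sse) {
      int r, c;
      *sum = 0;
      *sse = 0;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int diff = src[c] - ref[c];  /* signed difference */
          *sum += diff;        /* "add positive differences to sum" and
                                  "subtract negative differences from sum" */
          *sse += diff * diff; /* "calculate sse" */
        }
        src += src_stride;
        ref += ref_stride;
      }
    }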
@@ -77,7 +77,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -101,7 +101,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -127,7 +127,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -51,7 +51,7 @@ loop
    orr r8, r8, r10 ; differences of all 4 pixels
    ; calculate total sum
    add r4, r4, r6 ; add positive differences to sum
-   sub r4, r4, r7 ; substract negative differences from sum
+   sub r4, r4, r7 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r7, r8 ; byte (two pixels) to halfwords
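Note: the "byte (two pixels) to halfwords" comments refer to the ARMv6 UXTB16 instruction, which zero-extends bytes 0 and 2 of a packed word into two 16-bit lanes (and, with ROR #8, bytes 1 and 3), so two differences at a time can then be squared as halfwords. A C model of its effect (function names are mine):

    #include <stdint.h>

    static uint32_t uxtb16_model(uint32_t x) {
      return x & 0x00FF00FFu;         /* bytes 0 and 2 -> halfwords 0 and 1 */
    }

    static uint32_t uxtb16_ror8_model(uint32_t x) {
      return (x >> 8) & 0x00FF00FFu;  /* bytes 1 and 3 -> halfwords 0 and 1 */
    }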
@@ -77,7 +77,7 @@ loop

    ; calculate total sum
    add r4, r4, r6 ; add positive differences to sum
-   sub r4, r4, r7 ; substract negative differences from sum
+   sub r4, r4, r7 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r7, r8 ; byte (two pixels) to halfwords
@@ -58,7 +58,7 @@ loop
    orr r6, r6, r7 ; differences of all 4 pixels
    ; calculate total sum
    adds r8, r8, r4 ; add positive differences to sum
-   subs r8, r8, r5 ; substract negative differences from sum
+   subs r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -89,7 +89,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -120,7 +120,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -69,7 +69,7 @@ loop
    orr r6, r6, r7 ; differences of all 4 pixels
    ; calculate total sum
    adds r8, r8, r4 ; add positive differences to sum
-   subs r8, r8, r5 ; substract negative differences from sum
+   subs r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -111,7 +111,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -195,7 +195,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -59,7 +59,7 @@ loop
    orr r6, r6, r7 ; differences of all 4 pixels
    ; calculate total sum
    adds r8, r8, r4 ; add positive differences to sum
-   subs r8, r8, r5 ; substract negative differences from sum
+   subs r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -90,7 +90,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -121,7 +121,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -154,7 +154,7 @@ loop

    ; calculate total sum
    add r8, r8, r4 ; add positive differences to sum
-   sub r8, r8, r5 ; substract negative differences from sum
+   sub r8, r8, r5 ; subtract negative differences from sum

    ; calculate sse
    uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
    pxor mm7, [GLOBAL(t80)] ; unoffset
    ; mm7 = q1

-   ; tranpose and write back
+   ; transpose and write back
    ; mm1 = 72 62 52 42 32 22 12 02
    ; mm6 = 73 63 53 43 33 23 13 03
    ; mm3 = 74 64 54 44 34 24 14 04
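Note: the "transpose and write back" step exists because a vertical-edge filter works on pixel columns, so after filtering, the row-per-register layout (e.g. mm1 = 72 62 52 42 32 22 12 02, column 2 of rows 7..0) must be transposed before the bytes can be stored back along rows. In plain C the step amounts to (a sketch):

    #include <stdint.h>

    static void transpose8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c)
          out[c][r] = in[r][c];  /* row r, column c -> row c, column r */
    }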
@@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
    pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
    pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06

-   ; tranpose and write back
+   ; transpose and write back
    movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
    movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
@@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
    ; start work on filters
    B_FILTER 2

-   ; tranpose and write back - only work on q1, q0, p0, p1
+   ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE
    ; store 16-line result
@@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
    ; start work on filters
    B_FILTER 2

-   ; tranpose and write back - only work on q1, q0, p0, p1
+   ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE

    lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
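Note: "only work on q1, q0, p0, p1" reflects that this filter modifies at most two pixels on each side of the edge, so only four of the loaded columns change and need storing; the lea computing "row +1" matches the dst + stride term in a scalar model like the following (names illustrative):

    #include <stdint.h>

    /* Store the four changed columns (p1 p0 q0 q1) across 16 rows. */
    static void store_4cols(uint8_t *dst, int stride, const uint8_t p1[16],
                            const uint8_t p0[16], const uint8_t q0[16],
                            const uint8_t q1[16]) {
      int r;
      for (r = 0; r < 16; ++r) {
        dst[r * stride + 0] = p1[r];
        dst[r * stride + 1] = p0[r];
        dst[r * stride + 2] = q0[r];
        dst[r * stride + 3] = q1[r];
      }
    }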
@@ -72,7 +72,7 @@ cospi_31_64 EQU 804
    ; reg1 = output[first_offset]
    ; reg2 = output[second_offset]
    ; for proper address calculation, the last offset used when manipulating
-   ; output, wethere reading or storing) must be passed in. use 0 for first
+   ; output, whether reading or storing) must be passed in. use 0 for first
    ; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -88,7 +88,7 @@ cospi_31_64 EQU 804
    ; output[first_offset] = reg1
    ; output[second_offset] = reg2
    ; for proper address calculation, the last offset used when manipulating
-   ; output, wethere reading or storing) must be passed in. use 0 for first
+   ; output, whether reading or storing) must be passed in. use 0 for first
    ; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
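Note: a C model of the addressing convention both macros document (a sketch with illustrative names): the output cursor is left parked at the last offset touched, so every access is a displacement relative to $prev_offset rather than the buffer base, which is why 0 must be passed on first use:

    #include <stdint.h>

    static int16_t *load_from_output_model(int16_t *cursor, int prev_offset,
                                           int first_offset, int second_offset,
                                           int16_t *reg1, int16_t *reg2) {
      cursor += first_offset - prev_offset;    /* relative, not absolute */
      *reg1 = *cursor;
      cursor += second_offset - first_offset;
      *reg2 = *cursor;
      return cursor;  /* parked at second_offset; becomes prev_offset
                         for the next LOAD_FROM_OUTPUT / STORE_IN_OUTPUT */
    }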
@@ -242,7 +242,7 @@ cospi_31_64 EQU 804
    ; TODO(cd): have special case to re-use constants when they are similar for
    ; consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
-   ; additions/substractions before the multiplies.
+   ; additions/subtractions before the multiplies.
-   ; generate the constants
+   ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
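Note: the butterfly these TODOs discuss, in scalar form (a sketch): each stage computes a rotation by two cosine constants, and when both constants are equal the four multiplies collapse to two once the addition/subtraction is done first, which is exactly what the second TODO proposes:

    #include <stdint.h>

    static void butterfly(int32_t a, int32_t b, int32_t c1, int32_t c2,
                          int32_t *out0, int32_t *out1) {
      *out0 = a * c1 + b * c2;
      *out1 = a * c2 - b * c1;
    }

    /* Same-constant special case: two multiplies instead of four. */
    static void butterfly_same_const(int32_t a, int32_t b, int32_t c,
                                     int32_t *out0, int32_t *out1) {
      *out0 = (a + b) * c;
      *out1 = (a - b) * c;
    }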
@@ -260,7 +260,7 @@ cospi_31_64 EQU 804
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
-   ; do some addition/substractions (to get back two register)
+   ; do some addition/subtractions (to get back two register)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
@@ -268,7 +268,7 @@ cospi_31_64 EQU 804
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
-   ; do more addition/substractions
+   ; do more addition/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
@@ -527,7 +527,7 @@ sym(vp9_lpf_vertical_4_mmx):
    pxor mm7, [GLOBAL(t80)] ; unoffset
    ; mm7 = q1

-   ; tranpose and write back
+   ; transpose and write back
    ; mm1 = 72 62 52 42 32 22 12 02
    ; mm6 = 73 63 53 43 33 23 13 03
    ; mm3 = 74 64 54 44 34 24 14 04
@@ -47,7 +47,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
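Note: the two-pass structure this comment describes can be sketched in plain C; transform_1d below is an illustrative stand-in for the real 4-point DCT. Pass 0 transforms columns and stores the result transposed; pass 1 then sees the original rows as columns, and its transposed store restores normal/row order:

    #include <stdint.h>

    static void transform_1d(const int in[4], int out[4]) {
      /* stand-in for the 4-point DCT: any length-4 linear transform */
      out[0] = in[0] + in[1] + in[2] + in[3];
      out[1] = in[0] - in[1] + in[2] - in[3];
      out[2] = in[0] + in[1] - in[2] - in[3];
      out[3] = in[0] - in[1] - in[2] + in[3];
    }

    static void fdct4x4_shape(const int16_t *input, int *output, int stride) {
      int buf[4][4], tmp[4][4];
      int pass, r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          buf[r][c] = input[r * stride + c];
      for (pass = 0; pass < 2; ++pass) {
        for (c = 0; c < 4; ++c) {
          int in[4], out[4];
          for (r = 0; r < 4; ++r) in[r] = buf[r][c];   /* read a column   */
          transform_1d(in, out);
          for (r = 0; r < 4; ++r) tmp[c][r] = out[r];  /* store transposed */
        }
        for (r = 0; r < 4; ++r)
          for (c = 0; c < 4; ++c) buf[r][c] = tmp[r][c];
      }
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c) output[r * 4 + c] = buf[r][c];
    }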
@@ -315,7 +315,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
@@ -16,7 +16,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
@@ -46,7 +46,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
  in3 = _mm_slli_epi16(in3, 4);
  // if (i == 0 && input[0]) input[0] += 1;
  {
-   // The mask will only contain wether the first value is zero, all
+   // The mask will only contain whether the first value is zero, all
    // other comparison will fail as something shifted by 4 (above << 4)
    // can never be equal to one. To increment in the non-zero case, we
    // add the mask and one for the first element:
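Note: the branchless trick this comment describes can be sketched with SSE2 intrinsics (constant names are illustrative): lanes 1..7 compare against 1, which a value already shifted left by 4 can never equal, so only lane 0's "== 0" test can succeed; adding the resulting mask (-1 or 0) plus {1, 0, ...} increments lane 0 exactly when the first value is nonzero:

    #include <emmintrin.h>

    static __m128i bias_first_nonzero(__m128i in0) {  /* in0 already << 4 */
      const __m128i k_cmp  = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
      const __m128i k_one0 = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
      const __m128i mask = _mm_cmpeq_epi16(in0, k_cmp);  /* lane 0: == 0? */
      in0 = _mm_add_epi16(in0, mask);    /* subtract 1 when lane 0 is zero */
      return _mm_add_epi16(in0, k_one0); /* net effect: +1 iff nonzero */
    }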
@@ -59,7 +59,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
-   // Transform 1/2: Add/substract
+   // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
@@ -317,7 +317,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-   // Add/substract
+   // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
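Note: this "Add/subtract" stage is the opening butterfly of the 8-point DCT; in scalar form, grounded in the q0..q7 lines visible in these hunks, the input is split into sums (even half) and differences (odd half) of mirrored pairs:

    #include <stdint.h>

    static void fdct8_stage1(const int16_t in[8], int16_t q[8]) {
      q[0] = in[0] + in[7];
      q[1] = in[1] + in[6];
      q[2] = in[2] + in[5];
      q[3] = in[3] + in[4];
      q[4] = in[3] - in[4];
      q[5] = in[2] - in[5];
      q[6] = in[1] - in[6];
      q[7] = in[0] - in[7];
    }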
@@ -328,7 +328,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
-     // Add/substract
+     // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -390,7 +390,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
-     // Add/substract
+     // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
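Note: "Combine" followed by "Add/subtract" corresponds, in scalar terms, to rounding the 32-bit constant products back to 16 bits before feeding them into the next butterfly; a sketch, assuming libvpx's 14-bit cospi precision (hence the 1 << 13 rounding term):

    #include <stdint.h>

    static int16_t round_narrow(int32_t v) {
      /* analog of shift-then-_mm_packs_epi32, minus the saturation */
      return (int16_t)((v + (1 << 13)) >> 14);
    }

    static void combine_stage(int32_t s0, int32_t s1, int16_t q4, int16_t q7,
                              int16_t *x0, int16_t *x1,
                              int16_t *x2, int16_t *x3) {
      const int16_t r0 = round_narrow(s0);
      const int16_t r1 = round_narrow(s1);
      *x0 = q4 + r0;
      *x1 = q4 - r0;
      *x2 = q7 - r1;
      *x3 = q7 + r1;
    }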
@@ -1071,7 +1071,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
@@ -1228,7 +1228,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
  }
  // Work on the first eight values; fdct8(input, even_results);
  {
-   // Add/substract
+   // Add/subtract
    const __m128i q0 = _mm_add_epi16(input0, input7);
    const __m128i q1 = _mm_add_epi16(input1, input6);
    const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1239,7 +1239,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
    const __m128i q7 = _mm_sub_epi16(input0, input7);
    // Work on first four results
    {
-     // Add/substract
+     // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1303,7 +1303,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
-     // Add/substract
+     // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -16,7 +16,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
@@ -47,7 +47,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  in1 = _mm_slli_epi16(in1, 4);
  // if (i == 0 && input[0]) input[0] += 1;
  {
-   // The mask will only contain wether the first value is zero, all
+   // The mask will only contain whether the first value is zero, all
    // other comparison will fail as something shifted by 4 (above << 4)
    // can never be equal to one. To increment in the non-zero case, we
    // add the mask and one for the first element:
@@ -60,7 +60,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
-   // Transform 1/2: Add/substract
+   // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in1);
    const __m128i r1 = _mm_sub_epi16(in0, in1);
    const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
@@ -315,7 +315,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-   // Add/substract
+   // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -326,7 +326,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
-     // Add/substract
+     // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -388,7 +388,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
-     // Add/substract
+     // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1069,7 +1069,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we tranpose the columns (that
+ // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
@@ -1226,7 +1226,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  }
  // Work on the first eight values; fdct8(input, even_results);
  {
-   // Add/substract
+   // Add/subtract
    const __m128i q0 = _mm_add_epi16(input0, input7);
    const __m128i q1 = _mm_add_epi16(input1, input6);
    const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1237,7 +1237,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
    const __m128i q7 = _mm_sub_epi16(input0, input7);
    // Work on first four results
    {
-     // Add/substract
+     // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1301,7 +1301,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
-     // Add/substract
+     // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);