Implement BundleColorMap in SSE2.
Change-Id: I44cd23647bd0a49330b6b2b3ed08050a5500e58e
This commit is contained in:
parent
3674d49e63
commit
875fafc191
@ -211,8 +211,11 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
|
|||||||
// Returns the first index where array1 and array2 are different.
|
// Returns the first index where array1 and array2 are different.
|
||||||
extern VP8LVectorMismatchFunc VP8LVectorMismatch;
|
extern VP8LVectorMismatchFunc VP8LVectorMismatch;
|
||||||
|
|
||||||
void VP8LBundleColorMap(const uint8_t* const row, int width,
|
typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
|
||||||
int xbits, uint32_t* const dst);
|
int xbits, uint32_t* dst);
|
||||||
|
extern VP8LBundleColorMapFunc VP8LBundleColorMap;
|
||||||
|
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
|
||||||
|
uint32_t* dst);
|
||||||
|
|
||||||
// Must be called before calling any of the above methods.
|
// Must be called before calling any of the above methods.
|
||||||
void VP8LEncDspInit(void);
|
void VP8LEncDspInit(void);
|
||||||
|
@ -588,8 +588,8 @@ static int VectorMismatch(const uint32_t* const array1,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
|
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
|
||||||
void VP8LBundleColorMap(const uint8_t* const row, int width,
|
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
|
||||||
int xbits, uint32_t* const dst) {
|
uint32_t* dst) {
|
||||||
int x;
|
int x;
|
||||||
if (xbits > 0) {
|
if (xbits > 0) {
|
||||||
const int bit_depth = 1 << (3 - xbits);
|
const int bit_depth = 1 << (3 - xbits);
|
||||||
@ -849,6 +849,7 @@ VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
|
|||||||
VP8LHistogramAddFunc VP8LHistogramAdd;
|
VP8LHistogramAddFunc VP8LHistogramAdd;
|
||||||
|
|
||||||
VP8LVectorMismatchFunc VP8LVectorMismatch;
|
VP8LVectorMismatchFunc VP8LVectorMismatch;
|
||||||
|
VP8LBundleColorMapFunc VP8LBundleColorMap;
|
||||||
|
|
||||||
VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
|
VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
|
||||||
VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
|
VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
|
||||||
@ -888,6 +889,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
|
|||||||
VP8LHistogramAdd = HistogramAdd;
|
VP8LHistogramAdd = HistogramAdd;
|
||||||
|
|
||||||
VP8LVectorMismatch = VectorMismatch;
|
VP8LVectorMismatch = VectorMismatch;
|
||||||
|
VP8LBundleColorMap = VP8LBundleColorMap_C;
|
||||||
|
|
||||||
VP8LPredictorsSub[0] = PredictorSub0_C;
|
VP8LPredictorsSub[0] = PredictorSub0_C;
|
||||||
VP8LPredictorsSub[1] = PredictorSub1_C;
|
VP8LPredictorsSub[1] = PredictorSub1_C;
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#include "./lossless.h"
|
#include "./lossless.h"
|
||||||
|
#include "./common_sse2.h"
|
||||||
#include "./lossless_common.h"
|
#include "./lossless_common.h"
|
||||||
|
|
||||||
// For sign-extended multiplying constants, pre-shifted by 5:
|
// For sign-extended multiplying constants, pre-shifted by 5:
|
||||||
@ -377,6 +378,82 @@ static int VectorMismatch(const uint32_t* const array1,
|
|||||||
return match_len;
|
return match_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
//
// SSE2 version. 'row' holds one index per byte; each batch of 16 input bytes
// produces (16 >> xbits) packed 32-bit values in 'dst', with the packed bits
// placed in bits 8..15 and 0xff in bits 24..31. Whatever is left once fewer
// than 16 input bytes remain is delegated to VP8LBundleColorMap_C().
//   row:   input indices, 'width' bytes.
//   width: number of input bytes to process.
//   xbits: log2 of the number of input bytes packed per output; must be in
//          [0, 3] (asserted).
//   dst:   output pixels.
static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
                                uint32_t* dst) {
  int x;
  assert(xbits >= 0);
  assert(xbits <= 3);
  switch (xbits) {
    case 0: {
      // The explicit (short) cast avoids an implicit constant-narrowing
      // warning; the resulting bit pattern is unchanged.
      const __m128i ff = _mm_set1_epi16((short)0xff00);
      const __m128i zero = _mm_setzero_si128();
      // Store 0xff000000 | (row[x] << 8).
      for (x = 0; x + 16 <= width; x += 16, dst += 16) {
        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
        // Interleaving with zero as the low byte yields row[x] << 8 in each
        // 16-bit lane; interleaving with 'ff' then supplies the upper half.
        const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
        const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
        const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
        const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
        const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
        const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
        _mm_storeu_si128((__m128i*)&dst[0], dst0);
        _mm_storeu_si128((__m128i*)&dst[4], dst1);
        _mm_storeu_si128((__m128i*)&dst[8], dst2);
        _mm_storeu_si128((__m128i*)&dst[12], dst3);
      }
      break;
    }
    case 1: {
      const __m128i ff = _mm_set1_epi16((short)0xff00);
      const __m128i mul = _mm_set1_epi16(0x110);
      for (x = 0; x + 16 <= width; x += 16, dst += 8) {
        // 0a0b | (where a/b are 4 bits).
        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
        // Multiplying by 0x110 adds (in << 8) and (in << 4) per 16-bit lane,
        // which brings both nibbles together in the upper byte.
        const __m128i tmp = _mm_mullo_epi16(in, mul);  // aba0
        const __m128i pack = _mm_and_si128(tmp, ff);   // ab00
        const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
        const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
        _mm_storeu_si128((__m128i*)&dst[0], dst0);
        _mm_storeu_si128((__m128i*)&dst[4], dst1);
      }
      break;
    }
    case 2: {
      // The explicit (int) cast avoids an implicit unsigned-to-signed
      // constant conversion; the resulting bit pattern is unchanged.
      const __m128i mask_or = _mm_set1_epi32((int)0xff000000);
      const __m128i mul_cst = _mm_set1_epi16(0x0104);
      const __m128i mask_mul = _mm_set1_epi16(0x0f00);
      for (x = 0; x + 16 <= width; x += 16, dst += 4) {
        // 000a000b000c000d | (where a/b/c/d are 2 bits).
        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
        const __m128i mul = _mm_mullo_epi16(in, mul_cst);  // 00ab00b000cd00d0
        // Named 'tmp' rather than 'and': the latter is an alternative
        // operator spelling (<iso646.h> macro in C, reserved token in C++).
        const __m128i tmp = _mm_and_si128(mul, mask_mul);  // 00ab000000cd0000
        const __m128i shift = _mm_srli_epi32(tmp, 12);     // 00000000ab000000
        const __m128i pack = _mm_or_si128(shift, tmp);     // 00000000abcd0000
        // Convert to 0xff00**00.
        const __m128i res = _mm_or_si128(pack, mask_or);
        _mm_storeu_si128((__m128i*)dst, res);
      }
      break;
    }
    default: {
      assert(xbits == 3);
      for (x = 0; x + 16 <= width; x += 16, dst += 2) {
        // 0000000a00000000b... | (where a/b are 1 bit).
        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
        // Move bit 0 of every byte into bit 7 so that movemask can gather
        // the 16 bits into a single integer.
        const __m128i shift = _mm_slli_epi64(in, 7);
        const uint32_t move = (uint32_t)_mm_movemask_epi8(shift);
        dst[0] = 0xff000000 | ((move & 0xff) << 8);
        dst[1] = 0xff000000 | (move & 0xff00);
      }
      break;
    }
  }
  // 'x' is the number of inputs consumed by the 16-wide loops above; hand
  // any remainder to the plain-C implementation.
  if (x != width) {
    VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
  }
}
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Batch version of Predictor Transform subtraction
|
// Batch version of Predictor Transform subtraction
|
||||||
|
|
||||||
@ -587,6 +664,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
|
|||||||
VP8LHistogramAdd = HistogramAdd;
|
VP8LHistogramAdd = HistogramAdd;
|
||||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
|
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
|
||||||
VP8LVectorMismatch = VectorMismatch;
|
VP8LVectorMismatch = VectorMismatch;
|
||||||
|
VP8LBundleColorMap = BundleColorMap_SSE2;
|
||||||
|
|
||||||
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
|
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
|
||||||
VP8LPredictorsSub[1] = PredictorSub1_SSE2;
|
VP8LPredictorsSub[1] = PredictorSub1_SSE2;
|
||||||
|
Loading…
Reference in New Issue
Block a user