dsp: avoid defining _C functions w/NEON builds

when targeting NEON C functions with NEON equivalents won't be used, but will contribute to binary size. the same goes for sse2, etc., but this change is primarily concerned with binary sizes for android arm targets. note '-noasm' or otherwise modifying VP8GetCPUInfo will have no effect on the use of NEON functions. this decision can be overridden by defining WEBP_DSP_OMIT_C_CODE to 0. Change-Id: I47bd453c84a3d341ca39bc986a39eb9c785aface
2017-10-26 20:31:27 -07:00 · 2017-10-26 20:31:27 -07:00 · b7971d0e22
commit b7971d0e22
parent 6ba98764e8
10 changed files with 384 additions and 71 deletions
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -220,6 +220,7 @@ void WebPMultRows(uint8_t* ptr, int stride,
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif

+#if !WEBP_NEON_OMIT_C_CODE
 static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
                                 int w, int h, int stride) {
  while (h-- > 0) {
@ -238,6 +239,7 @@ static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
    rgba += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 #undef MULTIPLIER
 #undef PREMULTIPLY

@ -287,6 +289,7 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
 #endif
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                           int width, int height,
                           uint8_t* dst, int dst_stride) {
@ -341,6 +344,7 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
  int i;
  for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Simple channel manipulations.
@ -383,15 +387,16 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {

  WebPMultARGBRow = WebPMultARGBRow_C;
  WebPMultRow = WebPMultRow_C;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;

+  WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
  WebPDispatchAlpha = DispatchAlpha_C;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
  WebPExtractAlpha = ExtractAlpha_C;
  WebPExtractGreen = ExtractGreen_C;
-
-  WebPPackRGB = PackRGB_C;
+#endif

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -405,16 +410,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitAlphaProcessingNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitAlphaProcessingNEON();
+  }
+#endif
+
+  assert(WebPMultARGBRow != NULL);
+  assert(WebPMultRow != NULL);
+  assert(WebPApplyAlphaMultiply != NULL);
+  assert(WebPApplyAlphaMultiply4444 != NULL);
+  assert(WebPDispatchAlpha != NULL);
+  assert(WebPDispatchAlphaToGreen != NULL);
+  assert(WebPExtractAlpha != NULL);
+  assert(WebPExtractGreen != NULL);
+  assert(WebPPackRGB != NULL);
+
  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -11,6 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <assert.h>
+
 #include "src/dsp/dsp.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/utils/utils.h"
@ -38,6 +40,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)

+#if !WEBP_NEON_OMIT_C_CODE
 static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
@ -99,12 +102,14 @@ static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
    TransformOne(in + 16, dst + 4);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void TransformUV(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
@ -114,6 +119,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
    }
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void TransformDCUV(const int16_t* in, uint8_t* dst) {
  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
@ -127,6 +133,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // Paragraph 14.3

+#if !WEBP_NEON_OMIT_C_CODE
 static void TransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
    out += 64;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);

 #define DST(x, y) dst[(x) + (y) * BPS]

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = VP8kclip1 - top[-1];
@ -233,6 +242,7 @@ static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
 static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];

@ -242,6 +252,7 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

+#if !WEBP_NEON_OMIT_C_CODE
 static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  const uint8_t vals[4] = {
@ -255,6 +266,7 @@ static void VE4(uint8_t* dst) {    // vertical
    memcpy(dst + i * BPS, vals, sizeof(vals));
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void HE4(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
@ -268,6 +280,7 @@ static void HE4(uint8_t* dst) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static void DC4(uint8_t* dst) {   // DC
  uint32_t dc = 4;
  int i;
@ -312,6 +325,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
                                      DST(3, 3) = AVG3(G, H, H);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void VR4(uint8_t* dst) {   // Vertical-Right
  const int I = dst[-1 + 0 * BPS];
@ -404,6 +418,7 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma

+#if !WEBP_NEON_OMIT_C_CODE
 static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  for (j = 0; j < 8; ++j) {
@ -457,12 +472,14 @@ static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
 static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  Put8x8uv(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
@ -507,12 +524,16 @@ static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static WEBP_INLINE int needs_filter2(const uint8_t* p,
                                     int step, int t, int it) {
  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
@ -523,10 +544,12 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

+#if !WEBP_NEON_OMIT_C_CODE
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
@ -562,10 +585,12 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
    SimpleHFilter16(p, stride, thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
@ -597,7 +622,9 @@ static WEBP_INLINE void FilterLoop24(uint8_t* p,
    p += vstride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
 static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
@ -618,7 +645,9 @@ static void VFilter16i(uint8_t* p, int stride,
    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
@ -627,31 +656,40 @@ static void HFilter16i(uint8_t* p, int stride,
    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

+#if !WEBP_NEON_OMIT_C_CODE
 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------

@ -709,37 +747,48 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {

  VP8InitClipTables();

+#if !WEBP_NEON_OMIT_C_CODE
  VP8TransformWHT = TransformWHT;
  VP8Transform = TransformTwo;
-  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
-  VP8TransformDCUV = TransformDCUV;
  VP8TransformAC3 = TransformAC3;
+#endif
+  VP8TransformUV = TransformUV;
+  VP8TransformDCUV = TransformDCUV;

+#if !WEBP_NEON_OMIT_C_CODE
  VP8VFilter16 = VFilter16;
+  VP8VFilter16i = VFilter16i;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
+#endif

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8HFilter16i = HFilter16i;
+  VP8HFilter8 = HFilter8;
+  VP8HFilter8i = HFilter8i;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE
  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[3] = HE4;
  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
  VP8PredLuma4[6] = LD4;
+#endif
+
+  VP8PredLuma4[3] = HE4;
+  VP8PredLuma4[5] = VR4;
  VP8PredLuma4[7] = VL4;
  VP8PredLuma4[8] = HD4;
  VP8PredLuma4[9] = HU4;

+#if !WEBP_NEON_OMIT_C_CODE
  VP8PredLuma16[0] = DC16;
  VP8PredLuma16[1] = TM16;
  VP8PredLuma16[2] = VE16;
@ -755,6 +804,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;
  VP8PredChroma8[6] = DC8uvNoTopLeft;
+#endif

  VP8DitherCombine8x8 = DitherCombine8x8;

@ -770,11 +820,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8DspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
@ -791,5 +836,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8DspInitNEON();
+  }
+#endif
+
+  assert(VP8TransformWHT != NULL);
+  assert(VP8Transform != NULL);
+  assert(VP8TransformDC != NULL);
+  assert(VP8TransformAC3 != NULL);
+  assert(VP8TransformUV != NULL);
+  assert(VP8TransformDCUV != NULL);
+  assert(VP8VFilter16 != NULL);
+  assert(VP8HFilter16 != NULL);
+  assert(VP8VFilter8 != NULL);
+  assert(VP8HFilter8 != NULL);
+  assert(VP8VFilter16i != NULL);
+  assert(VP8HFilter16i != NULL);
+  assert(VP8VFilter8i != NULL);
+  assert(VP8HFilter8i != NULL);
+  assert(VP8SimpleVFilter16 != NULL);
+  assert(VP8SimpleHFilter16 != NULL);
+  assert(VP8SimpleVFilter16i != NULL);
+  assert(VP8SimpleHFilter16i != NULL);
+  assert(VP8PredLuma4[0] != NULL);
+  assert(VP8PredLuma4[1] != NULL);
+  assert(VP8PredLuma4[2] != NULL);
+  assert(VP8PredLuma4[3] != NULL);
+  assert(VP8PredLuma4[4] != NULL);
+  assert(VP8PredLuma4[5] != NULL);
+  assert(VP8PredLuma4[6] != NULL);
+  assert(VP8PredLuma4[7] != NULL);
+  assert(VP8PredLuma4[8] != NULL);
+  assert(VP8PredLuma4[9] != NULL);
+  assert(VP8PredLuma16[0] != NULL);
+  assert(VP8PredLuma16[1] != NULL);
+  assert(VP8PredLuma16[2] != NULL);
+  assert(VP8PredLuma16[3] != NULL);
+  assert(VP8PredLuma16[4] != NULL);
+  assert(VP8PredLuma16[5] != NULL);
+  assert(VP8PredLuma16[6] != NULL);
+  assert(VP8PredChroma8[0] != NULL);
+  assert(VP8PredChroma8[1] != NULL);
+  assert(VP8PredChroma8[2] != NULL);
+  assert(VP8PredChroma8[3] != NULL);
+  assert(VP8PredChroma8[4] != NULL);
+  assert(VP8PredChroma8[5] != NULL);
+  assert(VP8PredChroma8[6] != NULL);
+  assert(VP8DitherCombine8x8 != NULL);
+
  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -116,6 +116,22 @@ extern "C" {

 #endif  /* EMSCRIPTEN */

+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -21,9 +21,11 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
  return (v > max) ? max : v;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@ -56,6 +58,7 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
  histo->last_non_zero = last_non_zero;
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
                               int start_block, int end_block,
                               VP8Histogram* const histo) {
@ -76,6 +79,7 @@ static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
  }
  VP8SetHistogramData(distribution, histo);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

+#if !WEBP_NEON_OMIT_C_CODE
+
 #define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

@ -176,6 +182,7 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
                          int16_t* out) {
@ -183,6 +190,7 @@ static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
  VP8FTransform(src + 4, ref + 4, out + 16);
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
@ -212,6 +220,7 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) {
    out[12 + i] = b3 >> 1;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef MUL
 #undef STORE
@ -524,6 +533,7 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                              int w, int h) {
  int count = 0;
@ -551,6 +561,7 @@ static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
 static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
@ -572,6 +583,7 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

+#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@ -627,6 +639,7 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
  }
  return D;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Quantization
@ -663,6 +676,7 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* const mtx) {
  int nz;
@ -670,6 +684,7 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

 //------------------------------------------------------------------------------
 // Block copy
@ -735,23 +750,29 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  InitTables();

  // default C implementations
-  VP8CollectHistogram = CollectHistogram_C;
+#if !WEBP_NEON_OMIT_C_CODE
  VP8ITransform = ITransform_C;
  VP8FTransform = FTransform_C;
-  VP8FTransform2 = FTransform2_C;
  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
  VP8EncPredLuma4 = Intra4Preds_C;
  VP8EncPredLuma16 = Intra16Preds_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
-  VP8SSE16x16 = SSE16x16_C;
-  VP8SSE8x8 = SSE8x8_C;
-  VP8SSE16x8 = SSE16x8_C;
-  VP8SSE4x4 = SSE4x4_C;
-  VP8TDisto4x4 = Disto4x4_C;
-  VP8TDisto16x16 = Disto16x16_C;
  VP8Mean16x4 = Mean16x4_C;
-  VP8EncQuantizeBlock = QuantizeBlock_C;
-  VP8EncQuantize2Blocks = Quantize2Blocks_C;
  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;
@ -773,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
      VP8EncDspInitAVX2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
@ -794,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+
  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/filters.c
+++ b/src/dsp/filters.c
@ -28,6 +28,7 @@
  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
  (void)height;  // Silence unused warning.

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
                                      uint8_t* dst, int length, int inverse) {
  int i;
@ -112,6 +113,7 @@ static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
    out += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------
 // Gradient filter.
@ -121,6 +123,7 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
                                           int width, int height, int stride,
                                           int row, int num_rows,
@ -160,11 +163,13 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
    out += stride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef SANITY_CHECK

 //------------------------------------------------------------------------------

+#if !WEBP_NEON_OMIT_C_CODE
 static void HorizontalFilter_C(const uint8_t* data, int width, int height,
                               int stride, uint8_t* filtered_data) {
  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
@ -180,7 +185,7 @@ static void GradientFilter_C(const uint8_t* data, int width, int height,
                             int stride, uint8_t* filtered_data) {
  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
-
+#endif  // !WEBP_NEON_OMIT_C_CODE

 //------------------------------------------------------------------------------

@ -194,6 +199,7 @@ static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
  }
 }

+#if !WEBP_NEON_OMIT_C_CODE
 static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
                               uint8_t* out, int width) {
  if (prev == NULL) {
@ -203,6 +209,7 @@ static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
    for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
                               uint8_t* out, int width) {
@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;

  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
+#if !WEBP_NEON_OMIT_C_CODE
  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
+#endif
  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;

  WebPFilters[WEBP_FILTER_NONE] = NULL;
+#if !WEBP_NEON_OMIT_C_CODE
  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
+#endif

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
      VP8FiltersInitSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8FiltersInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8FiltersInitMIPSdspR2();
@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8FiltersInitNEON();
+  }
+#endif
+
+  assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
+  assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
+
  filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -15,6 +15,7 @@

 #include "src/dsp/dsp.h"

+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include "src/dec/vp8li_dec.h"
@ -606,15 +607,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)

+#if !WEBP_NEON_OMIT_C_CODE
  VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;

  VP8LTransformColorInverse = VP8LTransformColorInverse_C;

-  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
  VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+#endif
+
  VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
-  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;

  VP8LMapColor32b = MapARGB_C;
  VP8LMapColor8b = MapAlpha_C;
@ -626,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
      VP8LDspInitSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8LDspInitMIPSdspR2();
@ -642,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LDspInitNEON();
+  }
+#endif
+
+  assert(VP8LAddGreenToBlueAndRed != NULL);
+  assert(VP8LTransformColorInverse != NULL);
+  assert(VP8LConvertBGRAToRGBA != NULL);
+  assert(VP8LConvertBGRAToRGB != NULL);
+  assert(VP8LConvertBGRAToBGR != NULL);
+  assert(VP8LConvertBGRAToRGBA4444 != NULL);
+  assert(VP8LConvertBGRAToRGB565 != NULL);
+  assert(VP8LMapColor32b != NULL);
+  assert(VP8LMapColor8b != NULL);
+
  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #undef COPY_PREDICTOR_ARRAY
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@ -15,6 +15,7 @@

 #include "src/dsp/dsp.h"

+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include "src/dec/vp8li_dec.h"
@ -870,9 +871,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {

  VP8LDspInit();

+#if !WEBP_NEON_OMIT_C_CODE
  VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;

  VP8LTransformColor = VP8LTransformColor_C;
+#endif

  VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
  VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
@ -938,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 #endif
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LEncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8LEncDspInitMIPS32();
@ -959,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LEncDspInitNEON();
+  }
+#endif
+
+  assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+  assert(VP8LTransformColor != NULL);
+  assert(VP8LCollectColorBlueTransforms != NULL);
+  assert(VP8LCollectColorRedTransforms != NULL);
+  assert(VP8LFastLog2Slow != NULL);
+  assert(VP8LFastSLog2Slow != NULL);
+  assert(VP8LExtraCost != NULL);
+  assert(VP8LExtraCostCombined != NULL);
+  assert(VP8LCombinedShannonEntropy != NULL);
+  assert(VP8LGetEntropyUnrefined != NULL);
+  assert(VP8LGetCombinedEntropyUnrefined != NULL);
+  assert(VP8LHistogramAdd != NULL);
+  assert(VP8LVectorMismatch != NULL);
+  assert(VP8LBundleColorMap != NULL);
+  assert(VP8LPredictorsSub[0] != NULL);
+  assert(VP8LPredictorsSub[1] != NULL);
+  assert(VP8LPredictorsSub[2] != NULL);
+  assert(VP8LPredictorsSub[3] != NULL);
+  assert(VP8LPredictorsSub[4] != NULL);
+  assert(VP8LPredictorsSub[5] != NULL);
+  assert(VP8LPredictorsSub[6] != NULL);
+  assert(VP8LPredictorsSub[7] != NULL);
+  assert(VP8LPredictorsSub[8] != NULL);
+  assert(VP8LPredictorsSub[9] != NULL);
+  assert(VP8LPredictorsSub[10] != NULL);
+  assert(VP8LPredictorsSub[11] != NULL);
+  assert(VP8LPredictorsSub[12] != NULL);
+  assert(VP8LPredictorsSub[13] != NULL);
+  assert(VP8LPredictorsSub[14] != NULL);
+  assert(VP8LPredictorsSub[15] != NULL);
+  assert(VP8LPredictorsSub_C[0] != NULL);
+  assert(VP8LPredictorsSub_C[1] != NULL);
+  assert(VP8LPredictorsSub_C[2] != NULL);
+  assert(VP8LPredictorsSub_C[3] != NULL);
+  assert(VP8LPredictorsSub_C[4] != NULL);
+  assert(VP8LPredictorsSub_C[5] != NULL);
+  assert(VP8LPredictorsSub_C[6] != NULL);
+  assert(VP8LPredictorsSub_C[7] != NULL);
+  assert(VP8LPredictorsSub_C[8] != NULL);
+  assert(VP8LPredictorsSub_C[9] != NULL);
+  assert(VP8LPredictorsSub_C[10] != NULL);
+  assert(VP8LPredictorsSub_C[11] != NULL);
+  assert(VP8LPredictorsSub_C[12] != NULL);
+  assert(VP8LPredictorsSub_C[13] != NULL);
+  assert(VP8LPredictorsSub_C[14] != NULL);
+  assert(VP8LPredictorsSub_C[15] != NULL);
+
  lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
 }

--- a/src/dsp/rescaler.c
+++ b/src/dsp/rescaler.c
@ -210,10 +210,13 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
  if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;

-  WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
-  WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
+#if !WEBP_NEON_OMIT_C_CODE
  WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
  WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
+#endif
+
+  WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
+  WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@ -221,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
      WebPRescalerDspInitSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPRescalerDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      WebPRescalerDspInitMIPS32();
@ -242,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPRescalerDspInitNEON();
+  }
+#endif
+
+  assert(WebPRescalerExportRowExpand != NULL);
+  assert(WebPRescalerExportRowShrink != NULL);
+  assert(WebPRescalerImportRowExpand != NULL);
+  assert(WebPRescalerImportRowShrink != NULL);
+
  rescaler_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -93,6 +93,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }

 // All variants implemented.
+#if !WEBP_NEON_OMIT_C_CODE
 UPSAMPLE_FUNC(UpsampleRgbLinePair_C,  VP8YuvToRgb,  3)
 UPSAMPLE_FUNC(UpsampleBgrLinePair_C,  VP8YuvToBgr,  3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
@ -100,6 +101,7 @@ UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
 UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair_C,  VP8YuvToRgb565,  2)
+#endif

 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@ -223,6 +225,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;

 #ifdef FANCY_UPSAMPLING
+#if !WEBP_NEON_OMIT_C_CODE
  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair_C;
  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair_C;
@ -234,6 +237,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair_C;
  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair_C;
  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
+#endif

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@ -242,11 +246,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
      WebPInitUpsamplersSSE2();
    }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitUpsamplersNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitUpsamplersMIPSdspR2();
@ -258,6 +257,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
    }
 #endif
  }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitUpsamplersNEON();
+  }
+#endif
+
+  assert(WebPUpsamplers[MODE_RGB] != NULL);
+  assert(WebPUpsamplers[MODE_RGBA] != NULL);
+  assert(WebPUpsamplers[MODE_BGR] != NULL);
+  assert(WebPUpsamplers[MODE_BGRA] != NULL);
+  assert(WebPUpsamplers[MODE_ARGB] != NULL);
+  assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
+  assert(WebPUpsamplers[MODE_RGB_565] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA] != NULL);
+  assert(WebPUpsamplers[MODE_bgrA] != NULL);
+  assert(WebPUpsamplers[MODE_Argb] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+
 #endif  // FANCY_UPSAMPLING
  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -13,6 +13,7 @@

 #include "src/dsp/yuv.h"

+#include <assert.h>
 #include <stdlib.h>

 //-----------------------------------------------------------------------------
@ -193,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,

 //-----------------------------------------------------------------------------

+#if !WEBP_NEON_OMIT_C_CODE
 #define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
 static uint16_t clip_y(int v) {
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@ -230,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE

 #undef MAX_Y

@ -270,9 +273,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {

  WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;

+#if !WEBP_NEON_OMIT_C_CODE
  WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
  WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+#endif

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@ -281,13 +286,24 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
      WebPInitSharpYUVSSE2();
    }
 #endif  // WEBP_USE_SSE2
+  }
+
 #if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitConvertARGBToYUVNEON();
-      WebPInitSharpYUVNEON();
-    }
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitConvertARGBToYUVNEON();
+    WebPInitSharpYUVNEON();
+  }
 #endif  // WEBP_USE_NEON

-  }
+  assert(WebPConvertARGBToY != NULL);
+  assert(WebPConvertARGBToUV != NULL);
+  assert(WebPConvertRGB24ToY != NULL);
+  assert(WebPConvertBGR24ToY != NULL);
+  assert(WebPConvertRGBA32ToUV != NULL);
+  assert(WebPSharpYUVUpdateY != NULL);
+  assert(WebPSharpYUVUpdateRGB != NULL);
+  assert(WebPSharpYUVFilterRow != NULL);
+
  rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }