Port commits related to clpf and qm experiments

Manually cherry-picked the following commits from the AOMedia git repository: bb2727c Sort includes for "clpf.h" c297fd0 Add quantisation matrix range parameters. 0527894 Add encoder option and signaling for quant matrix control. 4106232 Turn off trellis coding for quantization matrices. 4017fca Modify tests to allow quantization matrices. 1c122c2 Add quant and dequant functions for new quant matrices. 95a8999 Enable CLPF f72782b Fix a build issue 73bae50 Add quantisation matrices and selection functions 33208d2 Added support for constrained low pass filter (CLPF) Change-Id: I60fc1ee1ac40e6b9d1d00affd97547ee5d5dd6be
2016-08-11 09:39:47 -07:00
parent ac917ec262
commit 0818a7c828
28 changed files with 12166 additions and 84 deletions
--- a/3
+++ b/3
@@ -252,6 +252,7 @@ HAVE_LIST="
 EXPERIMENT_LIST="
    fp_mb_stats
    emulate_hardware
+    clpf
    var_tx
    rect_tx
    ref_mv
@@ -328,6 +329,7 @@ CONFIG_LIST="
    better_hw_compatibility
    experimental
    size_limit
+    aom_qm
    ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
@@ -386,6 +388,7 @@ CMDLINE_SELECT="
    better_hw_compatibility
    vp9_highbitdepth
    experimental
+    aom_qm
 "

 process_cmdline() {
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -26,6 +26,7 @@
 #include "vpx_mem/vpx_mem.h"

 namespace {
+#if !CONFIG_AOM_QM

 const int kNumBlocks = 25;
 const int kNumBlockEntries = 16;
@@ -199,4 +200,5 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
        make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MSA
+#endif  // CONFIG_AOM_QM
 }  // namespace
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -295,11 +295,17 @@ typedef struct macroblockd_plane {
  // log2 of n4_w, n4_h
  uint8_t n4_wl, n4_hl;

+#if CONFIG_AOM_QM
+  const qm_val_t *seg_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+#endif
  // encoder
  const int16_t *dequant;
 #if CONFIG_NEW_QUANT
  const dequant_val_type_nuq *dequant_val_nuq[QUANT_PROFILES];
 #endif  // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+  const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+#endif
 } MACROBLOCKD_PLANE;

 #define BLOCK_OFFSET(x, i) ((x) + (i)*16)
--- a/vp10/common/clpf.c
+++ b/vp10/common/clpf.c
@@ -0,0 +1,99 @@
+/*
+Copyright (c) 2016 Cisco Systems
+(Replace with proper AOM header)
+*/
+
+#include "vp10/common/clpf.h"
+
+// Constrained low-pass filter for one width x height block of 8-bit pixels.
+// Each pixel is compared with its top, left, right and bottom neighbours and
+// moved by one towards them when more than two of the four lie on the same
+// side of it. A neighbour whose has_* flag is zero is replaced by the pixel
+// itself, so it never contributes to the vote. src and dst are addressed
+// with their own strides (sstride / dstride).
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int has_top, int has_left, int has_bottom,
+                       int has_right, int width, int height) {
+  int row, col;
+
+  for (row = 0; row < height; row++) {
+    const int src_row = row * sstride;
+    const int dst_row = row * dstride;
+
+    for (col = 0; col < width; col++) {
+      const int centre = src[src_row + col];
+      const int above = has_top ? src[src_row - sstride + col] : centre;
+      const int left = has_left ? src[src_row + col - 1] : centre;
+      const int right = has_right ? src[src_row + col + 1] : centre;
+      const int below = has_bottom ? src[src_row + sstride + col] : centre;
+      const int brighter = (above > centre) + (left > centre) +
+                           (right > centre) + (below > centre);
+      const int darker = (above < centre) + (left < centre) +
+                         (right < centre) + (below < centre);
+      int out = centre;
+
+      if (brighter > 2) out++;
+      if (darker > 2) out--;
+      dst[dst_row + col] = out;
+    }
+  }
+}
+
+#define BS (MI_SIZE * MAX_MIB_SIZE)
+
+// Iterate over blocks within a superblock.
+//
+// The superblock's top-left corner is at position (xpos, ypos) on the
+// cm->mi_rows x cm->mi_cols grid. Every block inside the superblock (and
+// inside the frame) that is not flagged as skip is run through clpf_block().
+// Only plane 0 is processed unless CLPF_FILTER_ALL_PLANES is set.
+//
+// With CLPF_ALLOW_PIXEL_PARALLELISM the filtered pixels are first collected
+// in a 16-byte aligned scratch buffer and copied back to the frame in a
+// second pass; otherwise the frame buffer is filtered in place.
+static void vp10_clpf_sb(const YV12_BUFFER_CONFIG *frame_buffer,
+                         const VP10_COMMON *cm, MACROBLOCKD *xd,
+                         MODE_INFO *const *mi_8x8, int xpos, int ypos) {
+  // Temporary buffer (to allow SIMD parallelism). Over-allocated by 15 bytes
+  // so that a 16-byte aligned start can always be found inside it.
+  uint8_t buf_unaligned[BS * BS + 15];
+  // Round the address up using unsigned arithmetic of pointer width.
+  uint8_t *buf =
+      (uint8_t *)(((uintptr_t)buf_unaligned + 15) & ~(uintptr_t)15);
+  int x, y, p;
+
+  for (p = 0; p < (CLPF_FILTER_ALL_PLANES ? MAX_MB_PLANE : 1); p++) {
+    for (y = 0; y < MAX_MIB_SIZE && ypos + y < cm->mi_rows; y++) {
+      for (x = 0; x < MAX_MIB_SIZE && xpos + x < cm->mi_cols; x++) {
+        const MB_MODE_INFO *mbmi =
+            &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi;
+
+        // Do not filter if there is no residual
+        if (!mbmi->skip) {
+          // Do not filter frame edges
+          int has_top = ypos + y > 0;
+          int has_left = xpos + x > 0;
+          int has_bottom = ypos + y < cm->mi_rows - 1;
+          int has_right = xpos + x < cm->mi_cols - 1;
+#if CLPF_ALLOW_BLOCK_PARALLELISM
+          // Do not filter superblock edges
+          has_top &= !!y;
+          has_left &= !!x;
+          has_bottom &= y != MAX_MIB_SIZE - 1;
+          has_right &= x != MAX_MIB_SIZE - 1;
+#endif
+          vp10_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x);
+          // Source is always the frame; the destination is either the scratch
+          // buffer (row stride BS) or the frame itself.
+          clpf_block(
+              xd->plane[p].dst.buf, CLPF_ALLOW_PIXEL_PARALLELISM
+                                        ? buf + y * MI_SIZE * BS + x * MI_SIZE
+                                        : xd->plane[p].dst.buf,
+              xd->plane[p].dst.stride,
+              CLPF_ALLOW_PIXEL_PARALLELISM ? BS : xd->plane[p].dst.stride,
+              has_top, has_left, has_bottom, has_right,
+              MI_SIZE >> xd->plane[p].subsampling_x,
+              MI_SIZE >> xd->plane[p].subsampling_y);
+        }
+      }
+    }
+#if CLPF_ALLOW_PIXEL_PARALLELISM
+    // Second pass: copy the filtered blocks from the scratch buffer back
+    // into the frame, one row at a time.
+    for (y = 0; y < MAX_MIB_SIZE && ypos + y < cm->mi_rows; y++) {
+      for (x = 0; x < MAX_MIB_SIZE && xpos + x < cm->mi_cols; x++) {
+        const MB_MODE_INFO *mbmi =
+            &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi;
+        vp10_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x);
+        if (!mbmi->skip) {
+          int i;
+          for (i = 0; i < (MI_SIZE >> xd->plane[p].subsampling_y); i++) {
+            memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
+                   buf + (y * MI_SIZE + i) * BS + x * MI_SIZE,
+                   MI_SIZE >> xd->plane[p].subsampling_x);
+          }
+        }
+      }
+    }
+#endif
+  }
+}
+
+// Run CLPF over an entire frame, one superblock at a time: the cm->mi_rows x
+// cm->mi_cols grid is walked in raster order in steps of MAX_MIB_SIZE and
+// each superblock origin is handed to vp10_clpf_sb() together with
+// cm->mi_grid_visible.
+void vp10_clpf_frame(const YV12_BUFFER_CONFIG *frame, const VP10_COMMON *cm,
+                     MACROBLOCKD *xd) {
+  int mi_row, mi_col;
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      vp10_clpf_sb(frame, cm, xd, cm->mi_grid_visible, mi_col, mi_row);
+    }
+  }
+}
--- a/vp10/common/clpf.h
+++ b/vp10/common/clpf.h
@@ -0,0 +1,22 @@
+/*
+Copyright (c) 2016, Cisco Systems
+(Replace with proper AOM header)
+*/
+
+#ifndef VP10_COMMON_CLPF_H_
+#define VP10_COMMON_CLPF_H_
+
+#include "vp10/common/reconinter.h"
+
+// Compile-time configuration of the constrained low pass filter (CLPF)
+#define CLPF_ALLOW_PIXEL_PARALLELISM \
+  1  // 1 = SIMD friendly (adds a buffer requirement)
+#define CLPF_ALLOW_BLOCK_PARALLELISM \
+  0  // 1 = MT friendly (degrades quality slightly)
+#define CLPF_FILTER_ALL_PLANES \
+  0  // 1 = filter both luma and chroma, 0 = filter only luma
+
+void vp10_clpf_frame(const YV12_BUFFER_CONFIG *frame, const VP10_COMMON *cm,
+                     MACROBLOCKD *xd);
+
+#endif
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -144,6 +144,9 @@ typedef struct VP10Common {
  // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
  int use_highbitdepth;
 #endif
+#if CONFIG_CLPF
+  int clpf;
+#endif

  YV12_BUFFER_CONFIG *frame_to_show;
  RefCntBuffer *prev_frame;
@@ -214,6 +217,23 @@ typedef struct VP10Common {
  int uv_ac_delta_q;
  int16_t y_dequant[MAX_SEGMENTS][2];
  int16_t uv_dequant[MAX_SEGMENTS][2];
+
+#if CONFIG_AOM_QM
+  // Global quant matrix tables
+  qm_val_t *giqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES];
+  qm_val_t *gqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES];
+
+  // Local quant matrix tables for each frame
+  qm_val_t *y_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+  qm_val_t *uv_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+  // Encoder
+  qm_val_t *y_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+  qm_val_t *uv_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+
+  int using_qmatrix;
+  int min_qmlevel;
+  int max_qmlevel;
+#endif
 #if CONFIG_NEW_QUANT
  dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
  dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
@@ -430,12 +450,20 @@ static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
    xd->above_context[i] = cm->above_context[i];
    if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
      memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
+#if CONFIG_AOM_QM
+      memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
+#endif
+
 #if CONFIG_NEW_QUANT
      memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq,
             sizeof(cm->y_dequant_nuq));
 #endif
    } else {
      memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
+#if CONFIG_AOM_QM
+      memcpy(xd->plane[i].seg_iqmatrix, cm->uv_iqmatrix,
+             sizeof(cm->uv_iqmatrix));
+#endif
 #if CONFIG_NEW_QUANT
      memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq,
             sizeof(cm->uv_dequant_nuq));
--- a/vp10/common/quant_common.c
+++ b/vp10/common/quant_common.c
--- a/vp10/common/quant_common.h
+++ b/vp10/common/quant_common.h
@@ -13,6 +13,7 @@

 #include "vpx/vpx_codec.h"
 #include "vp10/common/seg_common.h"
+#include "vp10/common/enums.h"

 #ifdef __cplusplus
 extern "C" {
@@ -22,12 +23,38 @@ extern "C" {
 #define MAXQ 255
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8
+#if CONFIG_AOM_QM
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* Default first and last QM level. With these defaults the number of
+   levels used is (NUM_QM_LEVELS-DEFAULT_QM_FIRST).
+   Lower value of DEFAULT_QM_FIRST implies more heavily weighted matrices.*/
+#define DEFAULT_QM_FIRST (NUM_QM_LEVELS / 2)
+#define DEFAULT_QM_LAST (NUM_QM_LEVELS - 1)
+#endif
+
+struct VP10Common;

 int16_t vp10_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
 int16_t vp10_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);

 int vp10_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
+#if CONFIG_AOM_QM
+// Reduce the large number of quantizers to a smaller number of levels for which
+// different matrices may be defined.
+// qindex is scaled (with rounding) from QINDEX_RANGE steps down to
+// (last + 1 - first) steps, offset by first and capped at NUM_QM_LEVELS - 1.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  int qmlevel = (qindex * (last + 1 - first) + QINDEX_RANGE / 2) / QINDEX_RANGE;
+  qmlevel = VPXMIN(qmlevel + first, NUM_QM_LEVELS - 1);
+  return qmlevel;
+}
+void aom_qm_init(struct VP10Common *cm);
+qm_val_t *aom_iqmatrix(struct VP10Common *cm, int qindex, int comp,
+                       int log2sizem2, int is_intra);
+qm_val_t *aom_qmatrix(struct VP10Common *cm, int qindex, int comp,
+                      int log2sizem2, int is_intra);
+#endif

 #if CONFIG_NEW_QUANT

--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -462,35 +462,64 @@ if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {

 # ENCODEMB INVOKE

-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-# the transform coefficients are held in 32-bit
-# values, so the assembler code for  vp10_block_error can no longer be used.
-  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp10_block_error/;
+if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+  # High bitdepth is selected by CONFIG_VP9_HIGHBITDEPTH in this tree.
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    # the transform coefficients are held in 32-bit
+    # values, so the assembler code for  vp10_block_error can no longer be used.
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error/;

-  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp/;
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";

-  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp_32x32/;
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";

-  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_fdct8x8_quant/;
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/vp10_fdct8x8_quant/;
+  } else {
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+
+    add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+    specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
+
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+  }
 } else {
-  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp10_block_error sse2 avx2 msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    # the transform coefficients are held in 32-bit
+    # values, so the assembler code for  vp10_block_error can no longer be used.
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error/;

-  add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-  specialize qw/vp10_block_error_fp neon sse2/;
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp/;

-  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64";
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp_32x32/;

-  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64";
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_fdct8x8_quant/;
+  } else {
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error sse2 avx2 msa/;
+
+    add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+    specialize qw/vp10_block_error_fp neon sse2/;
+
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64";
+
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64";
+
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
+  }

-  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
 }

 # fdct functions
@@ -817,11 +846,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
  specialize qw/vp10_highbd_block_error sse2/;

-  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  specialize qw/vp10_highbd_quantize_fp sse4_1/;
+  if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+    add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-  add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  specialize qw/vp10_highbd_quantize_b/;
+    add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+  } else {
+    add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+    specialize qw/vp10_highbd_quantize_fp sse4_1/;
+
+    add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+    specialize qw/vp10_highbd_quantize_b/;
+  }

  # fdct functions
  add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -25,6 +25,9 @@
 #include "vpx_util/vpx_thread.h"

 #include "vp10/common/alloccommon.h"
+#if CONFIG_CLPF
+#include "vp10/common/clpf.h"
+#endif
 #include "vp10/common/common.h"
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
@@ -1942,6 +1945,12 @@ static void setup_loopfilter(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
  }
 }

+#if CONFIG_CLPF
+// Read the frame-level CLPF flag (a one-bit literal) from rb into cm->clpf.
+static void setup_clpf(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  const int clpf_flag = vpx_rb_read_literal(rb, 1);
+
+  cm->clpf = clpf_flag;
+}
+#endif
+
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
  return vpx_rb_read_bit(rb) ? vpx_rb_read_inv_signed_literal(rb, 6) : 0;
 }
@@ -1953,16 +1962,34 @@ static void setup_quantization(VP10_COMMON *const cm,
  cm->uv_dc_delta_q = read_delta_q(rb);
  cm->uv_ac_delta_q = read_delta_q(rb);
  cm->dequant_bit_depth = cm->bit_depth;
+#if CONFIG_AOM_QM
+  cm->using_qmatrix = vpx_rb_read_bit(rb);
+  if (cm->using_qmatrix) {
+    cm->min_qmlevel = vpx_rb_read_literal(rb, QM_LEVEL_BITS);
+    cm->max_qmlevel = vpx_rb_read_literal(rb, QM_LEVEL_BITS);
+  } else {
+    cm->min_qmlevel = 0;
+    cm->max_qmlevel = 0;
+  }
+#endif
 }

 static void setup_segmentation_dequant(VP10_COMMON *const cm) {
-// Build y/uv dequant values based on segmentation.
+  // Build y/uv dequant values based on segmentation.
+  int i = 0;
+#if CONFIG_AOM_QM
+  int lossless;
+  int j = 0;
+  int qmlevel;
+  int using_qm = cm->using_qmatrix;
+  int minqm = cm->min_qmlevel;
+  int maxqm = cm->max_qmlevel;
+#endif
 #if CONFIG_NEW_QUANT
  int b;
  int dq;
 #endif  //  CONFIG_NEW_QUANT
  if (cm->seg.enabled) {
-    int i;
    for (i = 0; i < MAX_SEGMENTS; ++i) {
      const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
      cm->y_dequant[i][0] =
@@ -1972,6 +1999,21 @@ static void setup_segmentation_dequant(VP10_COMMON *const cm) {
          vp10_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
      cm->uv_dequant[i][1] =
          vp10_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+      lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
+                 cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+      // NB: depends on base index so there is only 1 set per frame
+      // No quant weighting when lossless or signalled not using QM
+      qmlevel = (lossless || using_qm == 0)
+                    ? NUM_QM_LEVELS - 1
+                    : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+      for (j = 0; j < TX_SIZES; ++j) {
+        cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1);
+        cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0);
+        cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
+        cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
+      }
+#endif  // CONFIG_AOM_QM
 #if CONFIG_NEW_QUANT
      for (dq = 0; dq < QUANT_PROFILES; dq++) {
        for (b = 0; b < COEF_BANDS; ++b) {
@@ -1994,6 +2036,20 @@ static void setup_segmentation_dequant(VP10_COMMON *const cm) {
        vp10_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
    cm->uv_dequant[0][1] =
        vp10_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+    lossless = qindex == 0 && cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 &&
+               cm->uv_ac_delta_q == 0;
+    // No quant weighting when lossless or signalled not using QM
+    qmlevel = (lossless || using_qm == 0)
+                  ? NUM_QM_LEVELS - 1
+                  : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+    for (j = 0; j < TX_SIZES; ++j) {
+      cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1);
+      cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0);
+      cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
+      cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
+    }
+#endif
 #if CONFIG_NEW_QUANT
    for (dq = 0; dq < QUANT_PROFILES; dq++) {
      for (b = 0; b < COEF_BANDS; ++b) {
@@ -2646,6 +2702,10 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi, const uint8_t *data,
    winterface->execute(&pbi->lf_worker);
  }
 #endif  // CONFIG_VAR_TX
+#if CONFIG_CLPF
+  if (cm->clpf && !cm->skip_loop_filter)
+    vp10_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+#endif

  if (cm->frame_parallel_decode)
    vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX);
@@ -3179,6 +3239,9 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
 #endif  // CONFIG_EXT_PARTITION

  setup_loopfilter(cm, rb);
+#if CONFIG_CLPF
+  setup_clpf(cm, rb);
+#endif
 #if CONFIG_LOOP_RESTORATION
  setup_restoration(cm, rb);
 #endif  // CONFIG_LOOP_RESTORATION
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@@ -112,6 +112,10 @@ VP10Decoder *vp10_decoder_create(BufferPool *const pool) {
  cm->setup_mi = vp10_dec_setup_mi;

  vp10_loop_filter_init(cm);
+
+#if CONFIG_AOM_QM
+  aom_qm_init(cm);
+#endif
 #if CONFIG_LOOP_RESTORATION
  vp10_loop_restoration_precal();
 #endif  // CONFIG_LOOP_RESTORATION
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -43,6 +43,13 @@ static INLINE int read_coeff(const vpx_prob *probs, int n, vp10_reader *r) {
  return val;
 }

+#if CONFIG_AOM_QM
+static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
+                        const int16_t *dq, int ctx, const int16_t *scan,
+                        const int16_t *nb, vp10_reader *r,
+                        const qm_val_t *iqm[2][TX_SIZES])
+#else
 static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
                        tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
                        const int16_t *dq,
@@ -50,11 +57,16 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
                        dequant_val_type_nuq *dq_val,
 #endif  // CONFIG_NEW_QUANT
                        int ctx, const int16_t *scan, const int16_t *nb,
-                        vp10_reader *r) {
+                        vp10_reader *r)
+#endif
+{
  FRAME_COUNTS *counts = xd->counts;
  const int max_eob = get_tx2d_size(tx_size);
  const FRAME_CONTEXT *const fc = xd->fc;
  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+#if CONFIG_AOM_QM
+  const qm_val_t *iqmatrix = iqm[!ref][tx_size];
+#endif
  int band, c = 0;
  const int tx_size_ctx = txsize_sqr_map[tx_size];
  const vpx_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
@@ -197,9 +209,14 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
      }
    }
 #if CONFIG_NEW_QUANT
+
    v = vp10_dequant_abscoeff_nuq(val, dqv, dqv_val);
    v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
 #else
+#if CONFIG_AOM_QM
+    dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
    v = (val * dqv) >> dq_shift;
 #endif  // CONFIG_NEW_QUANT

@@ -489,12 +506,18 @@ int vp10_decode_block_tokens(MACROBLOCKD *const xd, int plane,
 #endif  //  CONFIG_NEW_QUANT

 #if !CONFIG_ANS
+#if CONFIG_AOM_QM
+  const int eob =
+      decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
+                   ctx, sc->scan, sc->neighbors, r, pd->seg_iqmatrix[seg_id]);
+#else
  const int eob =
      decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
 #if CONFIG_NEW_QUANT
                   pd->seg_dequant_nuq[seg_id][dq],
 #endif  // CONFIG_NEW_QUANT
                   ctx, sc->scan, sc->neighbors, r);
+#endif  // CONFIG_AOM_QM
 #else
  const int eob = decode_coefs_ans(xd, pd->plane_type, pd->dqcoeff, tx_size,
                                   tx_type, dequant,
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -20,6 +20,9 @@
 #include "vpx_ports/system_state.h"
 #include "vpx_util/debug_util.h"

+#if CONFIG_CLPF
+#include "vp10/common/clpf.h"
+#endif
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/entropymv.h"
@@ -2437,6 +2440,13 @@ static void encode_loopfilter(VP10_COMMON *cm,
  }
 }

+#if CONFIG_CLPF
+// Write the frame-level CLPF flag cm->clpf to wb as a one-bit literal.
+static void encode_clpf(const VP10_COMMON *cm,
+                        struct vpx_write_bit_buffer *wb) {
+  const int clpf_flag = cm->clpf;
+
+  vpx_wb_write_literal(wb, clpf_flag, 1);
+}
+#endif
+
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
  if (delta_q != 0) {
    vpx_wb_write_bit(wb, 1);
@@ -2452,6 +2462,13 @@ static void encode_quantization(const VP10_COMMON *const cm,
  write_delta_q(wb, cm->y_dc_delta_q);
  write_delta_q(wb, cm->uv_dc_delta_q);
  write_delta_q(wb, cm->uv_ac_delta_q);
+#if CONFIG_AOM_QM
+  vpx_wb_write_bit(wb, cm->using_qmatrix);
+  if (cm->using_qmatrix) {
+    vpx_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS);
+    vpx_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS);
+  }
+#endif
 }

 static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
@@ -3083,6 +3100,9 @@ static void write_uncompressed_header(VP10_COMP *cpi,
 #endif  // CONFIG_EXT_PARTITION

  encode_loopfilter(cm, wb);
+#if CONFIG_CLPF
+  encode_clpf(cm, wb);
+#endif
 #if CONFIG_LOOP_RESTORATION
  encode_restoration(cm, wb);
 #endif  // CONFIG_LOOP_RESTORATION
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1561,7 +1561,12 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan, const int16_t *iscan) {
+                          const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                          ,
+                          const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                          ) {
  int eob = -1;

  int i, j;
@@ -1647,16 +1652,29 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      int tmp32;
+#if CONFIG_AOM_QM
+      tmp32 = (tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS);
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+      tmp32 = (tmp * quant_ptr[rc != 0]) >> 16;
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif

-      if (tmp) eob = i;
+      if (tmp32) eob = i;
    }
  }
  *eob_ptr = eob + 1;
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -88,6 +88,11 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
  const int16_t *const scan = so->scan;
  const int16_t *const nb = so->neighbors;
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
  const int shift = get_tx_scale(xd, tx_type, tx_size);
 #if CONFIG_NEW_QUANT
  int dq = get_dq_profile_from_ctx(ctx);
@@ -142,6 +147,9 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
    int base_bits, dx;
    int64_t d2;
    const int rc = scan[i];
+#if CONFIG_AOM_QM
+    int iwt = iqmatrix[rc];
+#endif
    int x = qcoeff[rc];
    next_shortcut = shortcut;

@@ -200,10 +208,18 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                    (vp10_dequant_abscoeff_nuq(abs(x) - 1, dequant_ptr[rc != 0],
                                               dequant_val[band_translate[i]]) <
                     (abs(coeff[rc]) << shift)));
-#else   // CONFIG_NEW_QUANT
+#else  // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+        if ((abs(x) * dequant_ptr[rc != 0] * iwt >
+             ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) &&
+            (abs(x) * dequant_ptr[rc != 0] * iwt <
+             (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])
+              << AOM_QM_BITS)))
+#else
        if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
            (abs(x) * dequant_ptr[rc != 0] <
             (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
+#endif  // CONFIG_AOM_QM
          shortcut = 1;
        else
          shortcut = 0;
@@ -366,6 +382,11 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
  for (i = next; i < eob; i = next) {
    const int x = tokens[i][best].qc;
    const int rc = scan[i];
+#if CONFIG_AOM_QM
+    const int iwt = iqmatrix[rc];
+    const int dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#endif

    if (x) final_eob = i;
    qcoeff[rc] = x;
@@ -430,6 +451,12 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
  const int16_t *src_diff;
  const int tx2d_size = get_tx2d_size(tx_size);

@@ -452,7 +479,12 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
    if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {
      if (LIKELY(!x->skip_block)) {
        quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
-            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam
+#if CONFIG_AOM_QM
+            ,
+            qmatrix, iqmatrix
+#endif  // CONFIG_AOM_QM
+            );
      } else {
        vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
      }
@@ -465,7 +497,12 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
  if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {
    if (LIKELY(!x->skip_block)) {
      quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
-          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam
+#if CONFIG_AOM_QM
+          ,
+          qmatrix, iqmatrix
+#endif  // CONFIG_AOM_QM
+          );
    } else {
      vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
    }
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -15,6 +15,9 @@
 #include "./vpx_config.h"

 #include "vp10/common/alloccommon.h"
+#if CONFIG_CLPF
+#include "vp10/common/clpf.h"
+#endif
 #include "vp10/common/filter.h"
 #include "vp10/common/idct.h"
 #include "vp10/common/reconinter.h"
@@ -2435,6 +2438,9 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
   * vp10_init_quantizer() for every frame.
   */
  vp10_init_quantizer(cpi);
+#if CONFIG_AOM_QM
+  aom_qm_init(cm);
+#endif

  vp10_loop_filter_init(cm);
 #if CONFIG_LOOP_RESTORATION
@@ -3337,6 +3343,65 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
      vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif
  }
+
+#if CONFIG_CLPF
+  cm->clpf = 0;
+  if (!is_lossless_requested(&cpi->oxcf)) {
+    // Test CLPF
+    int i, hq = 1;
+    uint64_t before, after;
+    // TODO(yaowu): investigate per-segment CLPF decision and
+    // an optimal threshold, use 80 for now.
+    for (i = 0; i < MAX_SEGMENTS; i++)
+      hq &= vp10_get_qindex(&cm->seg, i, cm->base_qindex) < 80;
+
+    if (!hq) {  // Don't try filter if the entire image is nearly losslessly
+                // encoded
+#if CLPF_FILTER_ALL_PLANES
+      vpx_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
+      before =
+          get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
+                  cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+                  cpi->Source->y_crop_width, cpi->Source->y_crop_height) +
+          get_sse(cpi->Source->u_buffer, cpi->Source->uv_stride,
+                  cm->frame_to_show->u_buffer, cm->frame_to_show->uv_stride,
+                  cpi->Source->uv_crop_width, cpi->Source->uv_crop_height) +
+          get_sse(cpi->Source->v_buffer, cpi->Source->uv_stride,
+                  cm->frame_to_show->v_buffer, cm->frame_to_show->uv_stride,
+                  cpi->Source->uv_crop_width, cpi->Source->uv_crop_height);
+      vp10_clpf_frame(cm->frame_to_show, cm, xd);
+      after = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
+                      cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+                      cpi->Source->y_crop_width, cpi->Source->y_crop_height) +
+              get_sse(cpi->Source->u_buffer, cpi->Source->uv_stride,
+                      cm->frame_to_show->u_buffer, cm->frame_to_show->uv_stride,
+                      cpi->Source->uv_crop_width, cpi->Source->uv_crop_height) +
+              get_sse(cpi->Source->v_buffer, cpi->Source->uv_stride,
+                      cm->frame_to_show->v_buffer, cm->frame_to_show->uv_stride,
+                      cpi->Source->uv_crop_width, cpi->Source->uv_crop_height);
+#else
+      vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+      before = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
+                       cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+                       cpi->Source->y_crop_width, cpi->Source->y_crop_height);
+      vp10_clpf_frame(cm->frame_to_show, cm, xd);
+      after = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
+                      cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+                      cpi->Source->y_crop_width, cpi->Source->y_crop_height);
+#endif
+      if (before < after) {
+// No improvement, restore original
+#if CLPF_FILTER_ALL_PLANES
+        vpx_yv12_copy_frame(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+        vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+      } else {
+        cm->clpf = 1;
+      }
+    }
+  }
+#endif
 #if CONFIG_LOOP_RESTORATION
  if (cm->rst_info.restoration_type != RESTORE_NONE) {
    vp10_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
@@ -5259,6 +5324,12 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
      cpi->scaled_ref_idx[i] = INVALID_IDX;
  }

+#if CONFIG_AOM_QM
+  cm->using_qmatrix = cpi->oxcf.using_qm;
+  cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+  cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+#endif
+
  if (oxcf->pass == 1) {
    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
    vp10_first_pass(cpi, source);
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -193,6 +193,11 @@ typedef struct VP10EncoderConfig {
  int best_allowed_q;
  int cq_level;
  AQ_MODE aq_mode;  // Adaptive Quantization mode
+#if CONFIG_AOM_QM
+  int using_qm;
+  int qm_minlevel;
+  int qm_maxlevel;
+#endif

  // Internal frame size scaling.
  RESIZE_TYPE resize_mode;
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -756,7 +756,12 @@ void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
-                        const int16_t *iscan) {
+                        const int16_t *iscan
+#if CONFIG_AOM_QM
+                        ,
+                        const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                        ) {
  int i, eob = -1;
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
@@ -773,28 +778,47 @@ void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      int tmp32;
+#if CONFIG_AOM_QM
+      tmp32 = (tmp * wt * quant_ptr[rc != 0]) >> (16 + AOM_QM_BITS);
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+      tmp32 = (tmp * quant_ptr[rc != 0]) >> 16;
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif

-      if (tmp) eob = i;
+      if (tmp32) eob = i;
    }
  }
  *eob_ptr = eob + 1;
 }

 #if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_c(
-    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, int log_scale) {
+void vp10_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+                               int skip_block, const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+                               const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+                               int log_scale) {
  int i;
  int eob = -1;
  const int scale = 1 << log_scale;
@@ -814,13 +838,27 @@ void vp10_highbd_quantize_fp_c(
    for (i = 0; i < count; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+#if CONFIG_AOM_QM
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
+#else
      const uint32_t abs_qcoeff =
          (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+#endif
      if (abs_qcoeff) eob = i;
    }
  }
@@ -838,7 +876,12 @@ void vp10_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                              ,
+                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                              ) {
  int i, eob = -1;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
@@ -851,31 +894,56 @@ void vp10_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      int64_t tmp = 0;
+#endif
      const int coeff_sign = (coeff >> 31);
-      int tmp = 0;
+      int tmp32 = 0;
      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - 2))) {
+#else
      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+#endif
        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
-        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#if CONFIG_AOM_QM  // QM path: shift the 64-bit product, then narrow to int
+        tmp = abs_coeff * wt;
+        tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> (AOM_QM_BITS + 15));
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
+#else
+        tmp32 = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant_ptr[rc != 0]) / 2;
+#endif
      }

-      if (tmp) eob = i;
+      if (tmp32) eob = i;
    }
  }
  *eob_ptr = eob + 1;
 }

 #if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_b_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, int log_scale) {
+void vp10_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+                              int log_scale) {
  int i, non_zero_count = (int)n_coeffs, eob = -1;
  int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
  int round[2] = { round_ptr[0], round_ptr[1] };
@@ -904,6 +972,14 @@ void vp10_highbd_quantize_b_c(
    for (i = (int)n_coeffs - 1; i >= 0; i--) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      uint32_t abs_qcoeff = 0;
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif

      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
        non_zero_count--;
@@ -918,14 +994,25 @@ void vp10_highbd_quantize_b_c(
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+#else

      if (abs_coeff >= zbins[rc != 0]) {
+#endif
        const int64_t tmp1 = abs_coeff + round[rc != 0];
        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+#if CONFIG_AOM_QM
+        const uint32_t abs_qcoeff = (uint32_t)(
+            (tmp2 * wt * quant_shift_ptr[rc != 0]) >> (AOM_QM_BITS + shift));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / scale;
+#else
        const uint32_t abs_qcoeff =
            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+#endif  // CONFIG_AOM_QM
        if (abs_qcoeff) eob = i;
      }
    }
@@ -1064,6 +1151,14 @@ void vp10_init_plane_quantizers(const VP10_COMP *cpi, MACROBLOCK *x,
  const int qindex = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
  const int rdmult = vp10_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
  int i;
+#if CONFIG_AOM_QM
+  int minqm = cm->min_qmlevel;
+  int maxqm = cm->max_qmlevel;
+  // Quant matrix only depends on the base QP so there is only one set per frame
+  int qmlevel = (lossless || cm->using_qmatrix == 0)
+                    ? NUM_QM_LEVELS - 1
+                    : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+#endif
 #if CONFIG_NEW_QUANT
  int dq;
 #endif
@@ -1075,6 +1170,12 @@ void vp10_init_plane_quantizers(const VP10_COMP *cpi, MACROBLOCK *x,
  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
  x->plane[0].zbin = quants->y_zbin[qindex];
  x->plane[0].round = quants->y_round[qindex];
+#if CONFIG_AOM_QM
+  memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+         sizeof(cm->gqmatrix[qmlevel][0]));
+  memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+         sizeof(cm->giqmatrix[qmlevel][0]));
+#endif
  xd->plane[0].dequant = cpi->y_dequant[qindex];
 #if CONFIG_NEW_QUANT
  for (dq = 0; dq < QUANT_PROFILES; dq++) {
@@ -1094,6 +1195,12 @@ void vp10_init_plane_quantizers(const VP10_COMP *cpi, MACROBLOCK *x,
    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
    x->plane[i].zbin = quants->uv_zbin[qindex];
    x->plane[i].round = quants->uv_round[qindex];
+#if CONFIG_AOM_QM
+    memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+           sizeof(cm->gqmatrix[qmlevel][1]));
+    memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+           sizeof(cm->giqmatrix[qmlevel][1]));
+#endif
    xd->plane[i].dequant = cpi->uv_dequant[qindex];
 #if CONFIG_NEW_QUANT
    for (dq = 0; dq < QUANT_PROFILES; dq++) {
--- a/vp10/encoder/quantize.h
+++ b/vp10/encoder/quantize.h
@@ -12,6 +12,7 @@
 #define VP10_ENCODER_QUANTIZE_H_

 #include "./vpx_config.h"
+#include "vp10/common/quant_common.h"
 #include "vp10/common/scan.h"
 #include "vp10/encoder/block.h"

--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -619,7 +619,12 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
        vp10_find_best_sub_pixel_tree_pruned_evenmore;
  }

+#if !CONFIG_AOM_QM
  x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+#else
+  // FIXME: trellis not very efficient for quantisation matrices
+  x->optimize = 0;
+#endif

  x->min_partition_size = sf->default_min_partition_size;
  x->max_partition_size = sf->default_max_partition_size;
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -89,6 +89,8 @@ ifeq (yes,$(filter yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)))
 VP10_COMMON_SRCS-yes += common/warped_motion.h
 VP10_COMMON_SRCS-yes += common/warped_motion.c
 endif
+VP10_COMMON_SRCS-yes += common/clpf.c
+VP10_COMMON_SRCS-yes += common/clpf.h

 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@@ -43,6 +43,11 @@ struct vp10_extracfg {
  unsigned int rc_max_inter_bitrate_pct;
  unsigned int gf_cbr_boost_pct;
  unsigned int lossless;
+#if CONFIG_AOM_QM
+  unsigned int enable_qm;
+  unsigned int qm_min;
+  unsigned int qm_max;
+#endif
  unsigned int frame_parallel_decoding_mode;
  AQ_MODE aq_mode;
  unsigned int frame_periodic_boost;
@@ -70,17 +75,22 @@ static struct vp10_extracfg default_extra_cfg = {
 #else
  0,  // tile_columns
  0,  // tile_rows
-#endif                         // CONFIG_EXT_TILE
-  7,                           // arnr_max_frames
-  5,                           // arnr_strength
-  0,                           // min_gf_interval; 0 -> default decision
-  0,                           // max_gf_interval; 0 -> default decision
-  VPX_TUNE_PSNR,               // tuning
-  10,                          // cq_level
-  0,                           // rc_max_intra_bitrate_pct
-  0,                           // rc_max_inter_bitrate_pct
-  0,                           // gf_cbr_boost_pct
-  0,                           // lossless
+#endif            // CONFIG_EXT_TILE
+  7,              // arnr_max_frames
+  5,              // arnr_strength
+  0,              // min_gf_interval; 0 -> default decision
+  0,              // max_gf_interval; 0 -> default decision
+  VPX_TUNE_PSNR,  // tuning
+  10,             // cq_level
+  0,              // rc_max_intra_bitrate_pct
+  0,              // rc_max_inter_bitrate_pct
+  0,              // gf_cbr_boost_pct
+  0,              // lossless
+#if CONFIG_AOM_QM
+  0,                 // enable_qm
+  DEFAULT_QM_FIRST,  // qm_min
+  DEFAULT_QM_LAST,   // qm_max
+#endif
  1,                           // frame_parallel_decoding_mode
  NO_AQ,                       // aq_mode
  0,                           // frame_periodic_delta_q
@@ -378,6 +388,12 @@ static vpx_codec_err_t set_encoder_config(
  oxcf->cq_level = vp10_quantizer_to_qindex(extra_cfg->cq_level);
  oxcf->fixed_q = -1;

+#if CONFIG_AOM_QM
+  oxcf->using_qm = extra_cfg->enable_qm;
+  oxcf->qm_minlevel = extra_cfg->qm_min;
+  oxcf->qm_maxlevel = extra_cfg->qm_max;
+#endif
+
  oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
  oxcf->over_shoot_pct = cfg->rc_overshoot_pct;

@@ -682,6 +698,29 @@ static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
  return update_extra_cfg(ctx, &extra_cfg);
 }

+#if CONFIG_AOM_QM
+static vpx_codec_err_t ctrl_set_enable_qm(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;  // by-value working copy
+  extra_cfg.enable_qm = CAST(VP9E_SET_ENABLE_QM, args);  // value from |args|
+  return update_extra_cfg(ctx, &extra_cfg);  // result is returned unchanged
+}
+
+static vpx_codec_err_t ctrl_set_qm_min(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;  // by-value working copy
+  extra_cfg.qm_min = CAST(VP9E_SET_QM_MIN, args);  // no range check here
+  return update_extra_cfg(ctx, &extra_cfg);  // result is returned unchanged
+}
+
+static vpx_codec_err_t ctrl_set_qm_max(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;  // by-value working copy
+  extra_cfg.qm_max = CAST(VP9E_SET_QM_MAX, args);  // no range check here
+  return update_extra_cfg(ctx, &extra_cfg);  // result is returned unchanged
+}
+#endif
+
 static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode(
    vpx_codec_alg_priv_t *ctx, va_list args) {
  struct vp10_extracfg extra_cfg = ctx->extra_cfg;
@@ -1280,6 +1319,11 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
  { VP9E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
  { VP9E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
  { VP9E_SET_LOSSLESS, ctrl_set_lossless },
+#if CONFIG_AOM_QM
+  { VP9E_SET_ENABLE_QM, ctrl_set_enable_qm },
+  { VP9E_SET_QM_MIN, ctrl_set_qm_min },
+  { VP9E_SET_QM_MAX, ctrl_set_qm_max },
+#endif
  { VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
  { VP9E_SET_AQ_MODE, ctrl_set_aq_mode },
  { VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -314,6 +314,48 @@ enum vp8e_enc_control_id {
   * Supported in codecs: VP9
   */
  VP9E_SET_LOSSLESS,
+#if CONFIG_AOM_QM
+  /*!\brief Codec control function to encode with quantisation matrices.
+   *
+   * AOM can operate with default quantisation matrices dependent on
+   * quantisation level and block type.
+   *                          0 = do not use quantisation matrices
+   *                          1 = use quantisation matrices
+   *
+   *  By default, the encoder operates without quantisation matrices.
+   *
+   * Supported in codecs: AOM
+   */
+
+  VP9E_SET_ENABLE_QM,
+
+  /*!\brief Codec control function to set the min quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the minimum level of flatness from which the matrices
+   * are determined.
+   *
+   *  By default, the encoder sets this minimum at half the available
+   *  range.
+   *
+   * Supported in codecs: AOM
+   */
+  VP9E_SET_QM_MIN,
+
+  /*!\brief Codec control function to set the max quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the maximum level of flatness possible.
+   *
+   * By default, the encoder sets this maximum at the top of the
+   * available range.
+   *
+   * Supported in codecs: AOM
+   */
+  VP9E_SET_QM_MAX,
+#endif

  /*!\brief Codec control function to set number of tile columns.
   *
@@ -651,6 +693,17 @@ VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 #define VPX_CTRL_VP9E_SET_LOSSLESS

+#if CONFIG_AOM_QM
+VPX_CTRL_USE_TYPE(VP9E_SET_ENABLE_QM, unsigned int)
+#define VPX_CTRL_VP9E_SET_ENABLE_QM
+
+VPX_CTRL_USE_TYPE(VP9E_SET_QM_MIN, unsigned int)
+#define VPX_CTRL_VP9E_SET_QM_MIN
+
+VPX_CTRL_USE_TYPE(VP9E_SET_QM_MAX, unsigned int)
+#define VPX_CTRL_VP9E_SET_QM_MAX
+#endif
+
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING

--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,371 @@
 #include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"

+#if CONFIG_AOM_QM
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr,
+                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int rc = 0;  // only position 0 (the DC coefficient) is quantized
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1 (arithmetic >>)
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+  int64_t tmp, eob = -1;  // eob stays -1 unless the result is non-zero
+  int32_t tmp32;
+  int dequant =  // dequant step scaled by the iqm weight, rounded
+      (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // outputs stay all-zero when skip_block is set
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 0 = nothing coded, 1 = position 0 is non-zero
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t quant, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                            const qm_val_t *iqm_ptr) {
+  int eob = -1;  // stays -1 unless position 0 quantizes to non-zero
+  int dequant =  // dequant step scaled by the iqm weight, rounded
+      (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // outputs stay all-zero when skip_block is set
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1 (arithmetic >>)
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+    const int64_t tmp = abs_coeff + round_ptr[0];  // 64-bit for the products
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 0 = nothing coded, 1 = position 0 is non-zero
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 coefficients per block
+  const int rc = 0;  // only position 0 (the DC coefficient) is quantized
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1 (arithmetic >>)
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+  int64_t tmp, eob = -1;  // eob stays -1 unless the result is non-zero
+  int32_t tmp32;
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // outputs stay all-zero when skip_block is set
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign
+    dequant =  // dequant step scaled by the iqm weight, rounded
+        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // 32x32 result is halved
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 0 = nothing coded, 1 = position 0 is non-zero
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
+                                  const qm_val_t *qm_ptr,
+                                  const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 coefficients per block
+  int eob = -1;  // stays -1 unless position 0 quantizes to non-zero
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // outputs stay all-zero when skip_block is set
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1 (arithmetic >>)
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =  // dequant step scaled by the iqm weight, rounded
+        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;  // 32x32 result is halved
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 0 = nothing coded, 1 = position 0 is non-zero
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan, const qm_val_t *qm_ptr,
+                      const qm_val_t *iqm_ptr) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };  // [0]: position 0, [1]: rest
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: drop trailing weighted coefficients inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))  // no << on negatives
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: coefficients at scan index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      int dequant;
+
+      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+        int32_t tmp32;
+        int64_t tmp =
+            clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = tmp * wt;
+        tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                 quant_shift_ptr[rc != 0]) >>
+                (16 + AOM_QM_BITS);  // quantization
+        dequant =  // dequant step scaled by the iqm weight, rounded
+            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+            AOM_QM_BITS;
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+
+        if (tmp32) eob = i;  // remember the last non-zero scan index
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan index, 0 if none
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };  // [0]: position 0, [1]: rest
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: drop trailing weighted coefficients inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))  // no << on negatives
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: coefficients at scan index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+
+      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmpw = tmp1 * wt;
+        const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dequant =  // dequant step scaled by the iqm weight, rounded
+            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+            AOM_QM_BITS;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+        if (abs_qcoeff) eob = i;  // remember the last non-zero scan index
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan index, 0 if none
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan,
+                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;  // number of scan indices collected in idx_arr
+  int idx_arr[1024];  // NOTE(review): assumes n_coeffs <= 1024 -- confirm
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: collect the scan indices that need quantization.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))  // no << on negatives
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1
+      const qm_val_t wt = qm_ptr[rc];
+      int64_t tmp;
+      int tmp32;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = tmp * wt;
+      tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >>
+              (15 + AOM_QM_BITS);
+
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign
+      dequant =  // dequant step scaled by the iqm weight, rounded
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // 32x32 result is halved
+
+      if (tmp32) eob = idx_arr[i];  // remember the last non-zero scan index
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan index, 0 if none
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_32x32_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;  // number of scan indices collected in idx_arr
+  int idx_arr[1024];  // NOTE(review): assumes n_coeffs <= 1024 -- confirm
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: collect the scan indices that need quantization.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))  // no << on negatives
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask: 0 or -1
+      const qm_val_t wt = qm_ptr[rc];
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmpw = tmp1 * wt;
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dequant =  // dequant step scaled by the iqm weight, rounded
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // 32x32 result is halved
+      if (abs_qcoeff) eob = idx_arr[i];  // remember the last non-zero scan index
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan index, 0 if none
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -318,3 +683,4 @@ void vpx_highbd_quantize_b_32x32_c(
  *eob_ptr = eob + 1;
 }
 #endif
+#endif
--- a/vpx_dsp/quantize.h
+++ b/vpx_dsp/quantize.h
@@ -18,6 +18,47 @@
 extern "C" {
 #endif

+#if CONFIG_AOM_QM  // QM variants: qm_ptr / iqm_ptr weight quant / dequant
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr,
+                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan, const qm_val_t *qm_ptr,
+                      const qm_val_t *iqm_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                            const qm_val_t *iqm_ptr);
+void vpx_highbd_quantize_dc_32x32(
+    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
+    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else  // !CONFIG_AOM_QM
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant_ptr,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -40,6 +81,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
 #endif
+#endif

 #ifdef __cplusplus
 }  // extern "C"
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -53,6 +53,10 @@ extern "C" {
    a = c;                   \
  } while (0)

+#if CONFIG_AOM_QM
+typedef uint16_t qm_val_t;  // one quantisation-matrix weight
+#define AOM_QM_BITS 6  // users shift weighted products right by this many bits
+#endif  // CONFIG_AOM_QM
 #if CONFIG_VP9_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -920,22 +920,35 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # Quantization
 #
-if ((vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+  if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+    add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b sse2/;
+    if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-  }  # CONFIG_VP9_HIGHBITDEPTH
-}  # CONFIG_VP10_ENCODER
+      add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    }  # CONFIG_VP9_HIGHBITDEPTH
+  }  # CONFIG_VP10_ENCODER
+} else {
+  if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";

+    add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+
+    if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/vpx_highbd_quantize_b sse2/;
+
+      add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+    }  # CONFIG_VP9_HIGHBITDEPTH
+  }  # CONFIG_VP10_ENCODER
+} # CONFIG_AOM_QM
 if (vpx_config("CONFIG_VP10") eq "yes") {
  #
  # Alpha blending with mask
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -366,6 +366,15 @@ static const arg_def_t tile_rows =
            "Number of tile rows to use, log2 (set to 0 while threads > 1)");
 static const arg_def_t lossless =
    ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
+#if CONFIG_AOM_QM
+static const arg_def_t enable_qm =
+    ARG_DEF(NULL, "enable_qm", 1,
+            "Enable quantisation matrices (0: false (default), 1: true)");
+static const arg_def_t qm_min = ARG_DEF(
+    NULL, "qm_min", 1, "Min quant matrix flatness (0..15), default is 8");
+static const arg_def_t qm_max = ARG_DEF(  // NOTE(review): confirm default == 15
+    NULL, "qm_max", 1, "Max quant matrix flatness (0..15), default is 15");
+#endif  // CONFIG_AOM_QM
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
 static const arg_def_t aq_mode = ARG_DEF(