Using stride (# of elements) instead of pitch (bytes) in fdct4x4.

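This makes fdct4x4 consistent with the iht/idct/fht functions, which all take a stride (# of elements) as their input argument. The same change is applied to vp9_short_walsh4x4_c and to the fwd_txm4x4 call sites. Change-Id: I0ba3c52513a5fdd194f1e7e2901092671398985b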
2013-10-21 15:27:35 -07:00 · 2013-10-21 15:27:35 -07:00 · 190c2b4591
commit 190c2b4591
parent f6d870f7ae
7 changed files with 19 additions and 22 deletions
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@ -31,15 +31,15 @@ void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
 }
 void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                 int stride, int /*tx_type*/) {
-  vp9_idct4x4_16_add_c(out, dst, stride >> 1);
+  vp9_idct4x4_16_add_c(out, dst, stride);
 }
 void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
            int stride, int tx_type) {
-  vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
+  vp9_short_fht4x4_c(in, out, stride, tx_type);
 }
 void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                int stride, int tx_type) {
-  vp9_iht4x4_16_add_c(out, dst, stride >> 1, tx_type);
+  vp9_iht4x4_16_add_c(out, dst, stride, tx_type);
 }

 class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
@ -78,7 +78,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
-  const int pitch = 8;
+  const int pitch = 4;
  int count_sign_block[16][2];
  const int count_test_block = 1000000;

@ -152,7 +152,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
    for (int j = 0; j < 16; ++j)
      test_input_block[j] = src[j] - dst[j];

-    const int pitch = 8;
+    const int pitch = 4;
    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -698,7 +698,7 @@ specialize vp9_short_fht16x16 sse2
 prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct8x8 sse2

-prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct4x4 sse2

 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride"
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@ -36,14 +36,13 @@ static void fdct4(const int16_t *input, int16_t *output) {
  output[3] = dct_const_round_shift(temp2);
 }

-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
-  const int stride = pitch >> 1;
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[4 * 4];
@ -587,18 +586,17 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,

 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   pixel. */
-void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int stride) {
  int i;
  int a1, b1, c1, d1, e1;
  int16_t *ip = input;
  int16_t *op = output;
-  int pitch_short = pitch >> 1;

  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short];
-    b1 = ip[1 * pitch_short];
-    c1 = ip[2 * pitch_short];
-    d1 = ip[3 * pitch_short];
+    a1 = ip[0 * stride];
+    b1 = ip[1 * stride];
+    c1 = ip[2 * stride];
+    d1 = ip[3 * stride];

    a1 += b1;
    d1 = d1 - c1;
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -402,7 +402,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
      xoff = 4 * (block & twmask);
      yoff = 4 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      x->fwd_txm4x4(src_diff, coeff, bw * 8);
+      x->fwd_txm4x4(src_diff, coeff, bw * 4);
      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
@ -612,7 +612,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
      if (tx_type != DCT_DCT)
        vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
      else
-        x->fwd_txm4x4(src_diff, coeff, bw * 8);
+        x->fwd_txm4x4(src_diff, coeff, bw * 4);
      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                     p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@ -959,9 +959,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
    sf->optimize_coefficients = 0;
  }

-  cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
+  cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4;
+    cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
  }

  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -1089,7 +1089,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
          x->quantize_b_4x4(x, block, tx_type, 16);
        } else {
-          x->fwd_txm4x4(src_diff, coeff, 16);
+          x->fwd_txm4x4(src_diff, coeff, 8);
          x->quantize_b_4x4(x, block, tx_type, 16);
        }

@ -1566,7 +1566,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
      k += (idy * 2 + idx);
      coeff = BLOCK_OFFSET(p->coeff, k);
      x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
-                    coeff, 16);
+                    coeff, 8);
      x->quantize_b_4x4(x, k, DCT_DCT, 16);
      thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                        16, &ssz);
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@ -12,14 +12,13 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"

-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
-  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others