diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 18c12a857..796a2e96a 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -27,7 +27,7 @@ using libvpx_test::ACMRandom;
 namespace {
 void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
              int stride, int /*tx_type*/) {
-  vp9_short_fdct4x4_c(in, out, stride);
+  vp9_fdct4x4_c(in, out, stride);
 }
 void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                  int stride, int /*tx_type*/) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 72b2126da..60636eee0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -695,11 +695,17 @@ specialize vp9_short_fht8x8 sse2
 prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
 specialize vp9_short_fht16x16 sse2
 
+prototype void vp9_fwht4x4 "int16_t *input, int16_t *output, int stride"
+specialize vp9_fwht4x4
+
+prototype void vp9_fdct4x4 "int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct4x4 sse2
+
 prototype void vp9_fdct8x8 "int16_t *input, int16_t *output, int stride"
 specialize vp9_fdct8x8 sse2
 
-prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct4x4 sse2
+prototype void vp9_fdct16x16 "int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct16x16 sse2
 
 prototype void vp9_fdct32x32 "int16_t *input, int16_t *output, int stride"
 specialize vp9_fdct32x32 sse2
@@ -707,12 +713,6 @@ specialize vp9_fdct32x32 sse2
 prototype void vp9_fdct32x32_rd "int16_t *input, int16_t *output, int stride"
 specialize vp9_fdct32x32_rd sse2
 
-prototype void vp9_fdct16x16 "int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
-
-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4
-
 #
 # Motion search
 #
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 0aae53502..94fcf9101 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -36,7 +36,7 @@ static void fdct4(const int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(temp2);
 }
 
-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -585,7 +585,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
 
 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    pixel. */
-void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(int16_t *input, int16_t *output, int stride) {
   int i;
   int a1, b1, c1, d1, e1;
   int16_t *ip = input;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index edb20ba2f..57a0b3487 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1865,7 +1865,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
     // printf("Switching to lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
     cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
     cpi->mb.optimize = 0;
     cpi->common.lf.filter_level = 0;
@@ -1873,7 +1873,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
     cpi->common.tx_mode = ONLY_4X4;
   } else {
     // printf("Not lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+    cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
     cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
   }
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 05b166219..afd6fc5fa 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -959,9 +959,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     sf->optimize_coefficients = 0;
   }
 
-  cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+  cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
   if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
   }
 
   if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index ae298c9e7..25b9e7e46 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,7 +12,7 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,