SSSE3 Optimization for Atom processors using new instruction selection and ordering

The function vp9_filter_block1d16_h8_ssse3 uses the PSHUFB instruction, which has a 3-cycle latency and slows execution when issued in blocks of 5 or more on Atom processors.
By replacing the PSHUFB instructions with more efficient single-cycle instructions (PUNPCKLBW + PUNPCKHBW + PALIGNR), performance can be improved.
In the original code, PSHUFB takes every source byte and copies it into consecutive positions of the result.
The same pattern is produced more efficiently by PUNPCKLBW and PUNPCKHBW, which duplicate each byte, with PALIGNR concatenating the two intermediate results and shifting right to extract the next 16 consecutive bytes for the final result.

For example:
filter = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8
Reg = 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
REG1 = PUNPCKLBW Reg, Reg = 0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7
REG2 = PUNPCKHBW Reg, Reg = 8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15
PALIGNR REG2, REG1, 1 = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8
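
To illustrate the equivalence, here is a minimal C sketch using SSSE3 intrinsics (not part of the commit; the variable names and printf scaffolding are illustrative only). It reproduces the example above and checks that the PUNPCKLBW + PUNPCKHBW + PALIGNR sequence yields the same 16 bytes as a PSHUFB with the 0,1,1,2,...,7,8 control mask (the shuf_t0t1 table in the assembly). Build with e.g. gcc -mssse3:

/* Sanity check: unpack + palignr reproduces the pshufb byte pattern. */
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>  /* SSSE3 intrinsics */

int main(void) {
  unsigned char src[16], a[16], b[16];
  int i;
  for (i = 0; i < 16; i++) src[i] = (unsigned char)i;   /* Reg = 0..15 */
  __m128i reg = _mm_loadu_si128((const __m128i *)src);

  /* Original approach: PSHUFB with the 0,1,1,2,...,7,8 control mask. */
  __m128i filter = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4,
                                 4, 5, 5, 6, 6, 7, 7, 8);
  __m128i via_pshufb = _mm_shuffle_epi8(reg, filter);

  /* Replacement: duplicate every byte with the unpack pair, then use
     PALIGNR to drop the leading copy and pull in byte 8. */
  __m128i reg1 = _mm_unpacklo_epi8(reg, reg);           /* 0,0,1,1,...,7,7   */
  __m128i reg2 = _mm_unpackhi_epi8(reg, reg);           /* 8,8,9,9,...,15,15 */
  __m128i via_unpack = _mm_alignr_epi8(reg2, reg1, 1);  /* 0,1,1,2,...,7,8   */

  _mm_storeu_si128((__m128i *)a, via_pshufb);
  _mm_storeu_si128((__m128i *)b, via_unpack);
  printf("%s\n", memcmp(a, b, 16) == 0 ? "patterns match" : "MISMATCH");
  return 0;
}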

This optimization improved the function's performance by 23% and produced a 3% user-level gain on 1080p content on Atom processors.
As expected, no performance impact was observed on Core processors.

Change-Id: I3cec701158993d95ed23ff04516942b5a4a461c0
Author: levytamar82
Date:   2014-12-05 11:14:33 -07:00
Parent: 7cb7588b1e
Commit: 8f9d94ec17

@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movq xmm0, [rsi - 3] ;load src data
movq xmm4, [rsi + 5]
movq xmm7, [rsi + 13]
movq xmm6, [rsi + 13]
punpcklqdq xmm0, xmm4
punpcklqdq xmm4, xmm7
punpcklqdq xmm4, xmm6
movdqa xmm7, xmm0
punpcklbw xmm7, xmm7
punpckhbw xmm0, xmm0
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
palignr xmm0, xmm7, 1
palignr xmm1, xmm7, 5
pmaddubsw xmm0, k0k1
palignr xmm2, xmm7, 9
pmaddubsw xmm1, k2k3
palignr xmm3, xmm7, 13
pmaddubsw xmm2, k4k5
pmaddubsw xmm3, k6k7
paddsw xmm0, xmm3
movdqa xmm3, xmm4
punpcklbw xmm3, xmm3
punpckhbw xmm4, xmm4
movdqa xmm5, xmm4
movdqa xmm6, xmm4
movdqa xmm7, xmm4
pshufb xmm0, [GLOBAL(shuf_t0t1)]
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pshufb xmm3, [GLOBAL(shuf_t6t7)]
pshufb xmm4, [GLOBAL(shuf_t0t1)]
pshufb xmm5, [GLOBAL(shuf_t2t3)]
pshufb xmm6, [GLOBAL(shuf_t4t5)]
pshufb xmm7, [GLOBAL(shuf_t6t7)]
palignr xmm4, xmm3, 1
palignr xmm5, xmm3, 5
palignr xmm6, xmm3, 9
palignr xmm7, xmm3, 13
pmaddubsw xmm0, k0k1
pmaddubsw xmm1, k2k3
pmaddubsw xmm2, k4k5
pmaddubsw xmm3, k6k7
pmaddubsw xmm4, k0k1
pmaddubsw xmm5, k2k3
pmaddubsw xmm6, k4k5
pmaddubsw xmm7, k6k7
paddsw xmm0, xmm3
movdqa xmm3, xmm1
pmaddubsw xmm4, k0k1
pmaxsw xmm1, xmm2
pmaddubsw xmm5, k2k3
pminsw xmm2, xmm3
pmaddubsw xmm6, k4k5
paddsw xmm0, xmm2
pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
paddsw xmm4, xmm7