Merge remote branch 'internal/upstream' into HEAD

This commit is contained in:
John Koleszar 2010-09-21 00:05:03 -04:00
commit 99c611fea6
9 changed files with 796 additions and 383 deletions

View File

@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm" %include "vpx_ports/x86_abi_support.asm"
; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
%macro LFH_FILTER_MASK 1 %macro LFH_FILTER_MASK 1
%if %1 %if %1
@ -33,8 +35,6 @@
psubusb xmm2, xmm6 ; q3-=q2 psubusb xmm2, xmm6 ; q3-=q2
por xmm1, xmm2 ; abs(q3-q2) por xmm1, xmm2 ; abs(q3-q2)
psubusb xmm1, xmm7
%if %1 %if %1
movdqa xmm4, [rsi+rax] ; q1 movdqa xmm4, [rsi+rax] ; q1
%else %else
@ -49,9 +49,7 @@
psubusb xmm4, xmm6 ; q1-=q2 psubusb xmm4, xmm6 ; q1-=q2
psubusb xmm6, xmm3 ; q2-=q1 psubusb xmm6, xmm3 ; q2-=q1
por xmm4, xmm6 ; abs(q2-q1) por xmm4, xmm6 ; abs(q2-q1)
psubusb xmm4, xmm7 pmaxub xmm1, xmm4
por xmm1, xmm4
%if %1 %if %1
movdqa xmm4, [rsi] ; q0 movdqa xmm4, [rsi] ; q0
@ -67,9 +65,7 @@
psubusb xmm3, xmm0 ; q1-=q0 psubusb xmm3, xmm0 ; q1-=q0
por xmm4, xmm3 ; abs(q0-q1) por xmm4, xmm3 ; abs(q0-q1)
movdqa t0, xmm4 ; save to t0 movdqa t0, xmm4 ; save to t0
pmaxub xmm1, xmm4
psubusb xmm4, xmm7
por xmm1, xmm4
%if %1 %if %1
neg rax ; negate pitch to deal with above border neg rax ; negate pitch to deal with above border
@ -95,9 +91,7 @@
psubusb xmm4, xmm2 ; p2-=p3 psubusb xmm4, xmm2 ; p2-=p3
psubusb xmm2, xmm5 ; p3-=p2 psubusb xmm2, xmm5 ; p3-=p2
por xmm4, xmm2 ; abs(p3 - p2) por xmm4, xmm2 ; abs(p3 - p2)
pmaxub xmm1, xmm4
psubusb xmm4, xmm7
por xmm1, xmm4
%if %1 %if %1
movdqa xmm4, [rsi+2*rax] ; p1 movdqa xmm4, [rsi+2*rax] ; p1
@ -113,9 +107,8 @@
psubusb xmm4, xmm5 ; p1-=p2 psubusb xmm4, xmm5 ; p1-=p2
psubusb xmm5, xmm3 ; p2-=p1 psubusb xmm5, xmm3 ; p2-=p1
por xmm4, xmm5 ; abs(p2 - p1) por xmm4, xmm5 ; abs(p2 - p1)
psubusb xmm4, xmm7 pmaxub xmm1, xmm4
por xmm1, xmm4
movdqa xmm2, xmm3 ; p1 movdqa xmm2, xmm3 ; p1
%if %1 %if %1
@ -133,8 +126,8 @@
por xmm4, xmm3 ; abs(p1 - p0) por xmm4, xmm3 ; abs(p1 - p0)
movdqa t1, xmm4 ; save to t1 movdqa t1, xmm4 ; save to t1
psubusb xmm4, xmm7 pmaxub xmm1, xmm4
por xmm1, xmm4 psubusb xmm1, xmm7
%if %1 %if %1
movdqa xmm3, [rdi] ; q1 movdqa xmm3, [rdi] ; q1
@ -872,19 +865,18 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
psubusb xmm0, xmm7 ; q2-q3 psubusb xmm0, xmm7 ; q2-q3
psubusb xmm7, xmm6 ; q3-q2 psubusb xmm7, xmm6 ; q3-q2
por xmm7, xmm0 ; abs (q3-q2)
movdqa xmm4, xmm5 ; q1 movdqa xmm4, xmm5 ; q1
por xmm7, xmm0 ; abs (q3-q2)
psubusb xmm4, xmm6 ; q1-q2 psubusb xmm4, xmm6 ; q1-q2
psubusb xmm6, xmm5 ; q2-q1
por xmm6, xmm4 ; abs (q2-q1)
movdqa xmm0, xmm1 movdqa xmm0, xmm1
psubusb xmm6, xmm5 ; q2-q1
por xmm6, xmm4 ; abs (q2-q1)
psubusb xmm0, xmm2 ; p2 - p3; psubusb xmm0, xmm2 ; p2 - p3;
psubusb xmm2, xmm1 ; p3 - p2;
psubusb xmm2, xmm1 ; p3 - p2;
por xmm0, xmm2 ; abs(p2-p3) por xmm0, xmm2 ; abs(p2-p3)
%if %1 %if %1
movdqa xmm2, [rdx] ; p1 movdqa xmm2, [rdx] ; p1
@ -892,39 +884,28 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm2, [rdx+32] ; p1 movdqa xmm2, [rdx+32] ; p1
%endif %endif
movdqa xmm5, xmm2 ; p1 movdqa xmm5, xmm2 ; p1
pmaxub xmm0, xmm7
psubusb xmm5, xmm1 ; p1-p2 psubusb xmm5, xmm1 ; p1-p2
psubusb xmm1, xmm2 ; p2-p1 psubusb xmm1, xmm2 ; p2-p1
por xmm1, xmm5 ; abs(p2-p1)
mov rdx, arg(3) ; limit
movdqa xmm4, [rdx] ; limit
psubusb xmm7, xmm4
psubusb xmm0, xmm4 ; abs(p3-p2) > limit
psubusb xmm1, xmm4 ; abs(p2-p1) > limit
psubusb xmm6, xmm4 ; abs(q2-q1) > limit
por xmm7, xmm6 ; or
por xmm0, xmm1
por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
movdqa xmm1, xmm2 ; p1
movdqa xmm7, xmm3 ; p0 movdqa xmm7, xmm3 ; p0
psubusb xmm7, xmm2 ; p0-p1 psubusb xmm7, xmm2 ; p0-p1
por xmm1, xmm5 ; abs(p2-p1)
pmaxub xmm0, xmm6
pmaxub xmm0, xmm1
movdqa xmm1, xmm2 ; p1
psubusb xmm2, xmm3 ; p1-p0 psubusb xmm2, xmm3 ; p1-p0
por xmm2, xmm7 ; abs(p1-p0) por xmm2, xmm7 ; abs(p1-p0)
movdqa t0, xmm2 ; save abs(p1-p0) movdqa t0, xmm2 ; save abs(p1-p0)
lea rdx, srct lea rdx, srct
psubusb xmm2, xmm4 ; abs(p1-p0)>limit pmaxub xmm0, xmm2
por xmm0, xmm2 ; mask
%if %1 %if %1
movdqa xmm5, [rdx+32] ; q0 movdqa xmm5, [rdx+32] ; q0
movdqa xmm7, [rdx+48] ; q1 movdqa xmm7, [rdx+48] ; q1
@ -940,9 +921,12 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
por xmm7, xmm5 ; abs(q1-q0) por xmm7, xmm5 ; abs(q1-q0)
movdqa t1, xmm7 ; save abs(q1-q0) movdqa t1, xmm7 ; save abs(q1-q0)
psubusb xmm7, xmm4 ; abs(q1-q0)> limit
por xmm0, xmm7 ; mask mov rdx, arg(3) ; limit
movdqa xmm4, [rdx] ; limit
pmaxub xmm0, xmm7
psubusb xmm0, xmm4
movdqa xmm5, xmm2 ; q1 movdqa xmm5, xmm2 ; q1
psubusb xmm5, xmm1 ; q1-=p1 psubusb xmm5, xmm1 ; q1-=p1

View File

@ -70,27 +70,35 @@ sym(vp8_filter_block1d8_h6_ssse3):
sub rdi, rdx sub rdi, rdx
;xmm3 free ;xmm3 free
filter_block1d8_h6_rowloop_ssse3: filter_block1d8_h6_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2] movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movdqa xmm1, xmm0 movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
pshufb xmm0, [shuf1b GLOBAL]
movdqa xmm2, xmm1 punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
pshufb xmm1, [shuf2b GLOBAL]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm5
pshufb xmm2, [shuf3b GLOBAL] movdqa xmm1, xmm0
add rdi, rdx pmaddubsw xmm0, xmm4
pmaddubsw xmm2, xmm6
movdqa xmm2, xmm1
pshufb xmm1, [shuf2bfrom1 GLOBAL]
pshufb xmm2, [shuf3bfrom1 GLOBAL]
pmaddubsw xmm1, xmm5
lea rdi, [rdi + rdx]
pmaddubsw xmm2, xmm6
lea rsi, [rsi + rax] lea rsi, [rsi + rax]
dec rcx dec rcx
paddsw xmm0, xmm1
paddsw xmm0, xmm7 paddsw xmm0, xmm1
paddsw xmm0, xmm2 paddsw xmm2, xmm7
psraw xmm0, 7
packuswb xmm0, xmm0 paddsw xmm0, xmm2
psraw xmm0, 7
packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0 movq MMWORD Ptr [rdi], xmm0
jnz filter_block1d8_h6_rowloop_ssse3 jnz filter_block1d8_h6_rowloop_ssse3
@ -107,8 +115,8 @@ vp8_filter_block1d8_h4_ssse3:
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
movdqa xmm3, XMMWORD PTR [shuf2b GLOBAL] movdqa xmm3, XMMWORD PTR [shuf2bfrom1 GLOBAL]
movdqa xmm4, XMMWORD PTR [shuf3b GLOBAL] movdqa xmm4, XMMWORD PTR [shuf3bfrom1 GLOBAL]
mov rsi, arg(0) ;src_ptr mov rsi, arg(0) ;src_ptr
@ -118,24 +126,33 @@ vp8_filter_block1d8_h4_ssse3:
movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rdx, dword ptr arg(3) ;output_pitch
sub rdi, rdx sub rdi, rdx
;xmm3 free
filter_block1d8_h4_rowloop_ssse3: filter_block1d8_h4_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2] movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movdqa xmm2, xmm0 movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
pshufb xmm0, xmm3 ;[shuf2b GLOBAL]
pshufb xmm2, xmm4 ;[shuf3b GLOBAL]
pmaddubsw xmm0, xmm5 punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
add rdi, rdx
pmaddubsw xmm2, xmm6 movdqa xmm2, xmm0
pshufb xmm0, xmm3
pshufb xmm2, xmm4
pmaddubsw xmm0, xmm5
lea rdi, [rdi + rdx]
pmaddubsw xmm2, xmm6
lea rsi, [rsi + rax] lea rsi, [rsi + rax]
dec rcx dec rcx
paddsw xmm0, xmm7
paddsw xmm0, xmm2 paddsw xmm0, xmm7
psraw xmm0, 7
packuswb xmm0, xmm0 paddsw xmm0, xmm2
psraw xmm0, 7
packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0 movq MMWORD Ptr [rdi], xmm0
@ -168,74 +185,88 @@ sym(vp8_filter_block1d16_h6_ssse3):
push rdi push rdi
; end prolog ; end prolog
movsxd rdx, DWORD PTR arg(5) ;table index movsxd rdx, DWORD PTR arg(5) ;table index
xor rsi, rsi xor rsi, rsi
shl rdx, 4 ; shl rdx, 4 ;
lea rax, [k0_k5 GLOBAL] lea rax, [k0_k5 GLOBAL]
add rax, rdx add rax, rdx
mov rdi, arg(2) ;output_ptr mov rdi, arg(2) ;output_ptr
movdqa xmm7, [rd GLOBAL]
;; ;;
;; cmp esi, DWORD PTR [rax] ;; cmp esi, DWORD PTR [rax]
;; je vp8_filter_block1d16_h4_ssse3 ;; je vp8_filter_block1d16_h4_ssse3
mov rsi, arg(0) ;src_ptr mov rsi, arg(0) ;src_ptr
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rcx, dword ptr arg(4) ;output_height movsxd rcx, dword ptr arg(4) ;output_height
movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rdx, dword ptr arg(3) ;output_pitch
filter_block1d16_h6_rowloop_ssse3: filter_block1d16_h6_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2] movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movdqa xmm1, xmm0 movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
pshufb xmm0, [shuf1b GLOBAL]
movdqa xmm2, xmm1
pmaddubsw xmm0, xmm4
pshufb xmm1, [shuf2b GLOBAL]
pshufb xmm2, [shuf3b GLOBAL]
pmaddubsw xmm1, xmm5
movdqu xmm3, XMMWORD PTR [rsi + 6] punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
pmaddubsw xmm2, xmm6 movdqa xmm1, xmm0
paddsw xmm0, xmm1 pmaddubsw xmm0, xmm4
movdqa xmm1, xmm3
pshufb xmm3, [shuf1b GLOBAL] movdqa xmm2, xmm1
paddsw xmm0, xmm7 pshufb xmm1, [shuf2bfrom1 GLOBAL]
pmaddubsw xmm3, xmm4
paddsw xmm0, xmm2 pshufb xmm2, [shuf3bfrom1 GLOBAL]
movdqa xmm2, xmm1 movq xmm3, MMWORD PTR [rsi + 6]
pshufb xmm1, [shuf2b GLOBAL]
pshufb xmm2, [shuf3b GLOBAL] pmaddubsw xmm1, xmm5
pmaddubsw xmm1, xmm5 movq xmm7, MMWORD PTR [rsi + 11]
pmaddubsw xmm2, xmm6
pmaddubsw xmm2, xmm6
punpcklbw xmm3, xmm7
paddsw xmm0, xmm1
movdqa xmm1, xmm3
pmaddubsw xmm3, xmm4
paddsw xmm0, xmm2
movdqa xmm2, xmm1
paddsw xmm0, [rd GLOBAL]
pshufb xmm1, [shuf2bfrom1 GLOBAL]
pshufb xmm2, [shuf3bfrom1 GLOBAL]
psraw xmm0, 7
pmaddubsw xmm1, xmm5
pmaddubsw xmm2, xmm6
packuswb xmm0, xmm0
psraw xmm0, 7
packuswb xmm0, xmm0
lea rsi, [rsi + rax] lea rsi, [rsi + rax]
paddsw xmm3, xmm1 paddsw xmm3, xmm1
paddsw xmm3, xmm7
paddsw xmm3, xmm2
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3 paddsw xmm3, xmm2
paddsw xmm3, [rd GLOBAL]
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3
movdqa XMMWORD Ptr [rdi], xmm0 movdqa XMMWORD Ptr [rdi], xmm0
add rdi, rdx lea rdi, [rdi + rdx]
dec rcx dec rcx
jnz filter_block1d16_h6_rowloop_ssse3 jnz filter_block1d16_h6_rowloop_ssse3
; begin epilog ; begin epilog
pop rdi pop rdi
pop rsi pop rsi
@ -268,7 +299,7 @@ filter_block1d16_h4_rowloop_ssse3:
pshufb xmm3, [shuf3b GLOBAL] pshufb xmm3, [shuf3b GLOBAL]
pshufb xmm0, [shuf2b GLOBAL] pshufb xmm0, [shuf2b GLOBAL]
paddsw xmm1, xmm7 paddsw xmm1, [rd GLOBAL]
paddsw xmm1, xmm2 paddsw xmm1, xmm2
pmaddubsw xmm0, xmm5 pmaddubsw xmm0, xmm5
@ -278,7 +309,7 @@ filter_block1d16_h4_rowloop_ssse3:
packuswb xmm1, xmm1 packuswb xmm1, xmm1
lea rsi, [rsi + rax] lea rsi, [rsi + rax]
paddsw xmm3, xmm0 paddsw xmm3, xmm0
paddsw xmm3, xmm7 paddsw xmm3, [rd GLOBAL]
psraw xmm3, 7 psraw xmm3, 7
packuswb xmm3, xmm3 packuswb xmm3, xmm3
@ -939,17 +970,19 @@ sym(vp8_bilinear_predict16x16_ssse3):
%if ABI_IS_32BIT=0 %if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(5) ; dst_pitch movsxd r8, dword ptr arg(5) ; dst_pitch
%endif %endif
movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
movdqa xmm4, xmm3
movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
punpckhbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
paddw xmm3, [rd GLOBAL] ; xmm3 += round value paddw xmm3, [rd GLOBAL] ; xmm3 += round value
@ -962,17 +995,18 @@ sym(vp8_bilinear_predict16x16_ssse3):
packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
.next_row: .next_row:
movdqu xmm6, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
movdqa xmm4, xmm6
movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
punpcklbw xmm6, xmm5 punpcklbw xmm6, xmm5
movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
pmaddubsw xmm6, xmm1 pmaddubsw xmm6, xmm1
punpckhbw xmm4, xmm5 punpcklbw xmm4, xmm5
pmaddubsw xmm4, xmm1 pmaddubsw xmm4, xmm1
paddw xmm6, [rd GLOBAL] ; xmm6 += round value paddw xmm6, [rd GLOBAL] ; xmm6 += round value
@ -1027,49 +1061,51 @@ b16x16_sp_only:
movsxd rax, dword ptr arg(1) ; src_pixels_per_line movsxd rax, dword ptr arg(1) ; src_pixels_per_line
; get the first horizontal line done ; get the first horizontal line done
movdqu xmm2, [rsi] ; load row 0 movq xmm4, [rsi] ; load row 0
movq xmm2, [rsi + 8] ; load row 0
lea rsi, [rsi + rax] ; next line lea rsi, [rsi + rax] ; next line
.next_row: .next_row:
movdqu xmm3, [rsi] ; load row + 1 movq xmm3, [rsi] ; load row + 1
movq xmm5, [rsi + 8] ; load row + 1
movdqu xmm4, xmm2
punpcklbw xmm4, xmm3 punpcklbw xmm4, xmm3
punpcklbw xmm2, xmm5
pmaddubsw xmm4, xmm1 pmaddubsw xmm4, xmm1
movdqu xmm7, [rsi + rax] ; load row + 2 movq xmm7, [rsi + rax] ; load row + 2
punpckhbw xmm2, xmm3
movdqu xmm6, xmm3
pmaddubsw xmm2, xmm1 pmaddubsw xmm2, xmm1
punpcklbw xmm6, xmm7 movq xmm6, [rsi + rax + 8] ; load row + 2
punpcklbw xmm3, xmm7
punpcklbw xmm5, xmm6
pmaddubsw xmm3, xmm1
paddw xmm4, [rd GLOBAL] paddw xmm4, [rd GLOBAL]
pmaddubsw xmm6, xmm1
pmaddubsw xmm5, xmm1
paddw xmm2, [rd GLOBAL]
psraw xmm4, VP8_FILTER_SHIFT psraw xmm4, VP8_FILTER_SHIFT
punpckhbw xmm3, xmm7
paddw xmm2, [rd GLOBAL]
pmaddubsw xmm3, xmm1
psraw xmm2, VP8_FILTER_SHIFT psraw xmm2, VP8_FILTER_SHIFT
paddw xmm6, [rd GLOBAL]
packuswb xmm4, xmm2 packuswb xmm4, xmm2
psraw xmm6, VP8_FILTER_SHIFT
movdqa [rdi], xmm4 ; store row 0
paddw xmm3, [rd GLOBAL] paddw xmm3, [rd GLOBAL]
movdqa [rdi], xmm4 ; store row 0
paddw xmm5, [rd GLOBAL]
psraw xmm3, VP8_FILTER_SHIFT psraw xmm3, VP8_FILTER_SHIFT
psraw xmm5, VP8_FILTER_SHIFT
packuswb xmm3, xmm5
movdqa xmm4, xmm7
movdqa [rdi + rdx],xmm3 ; store row 1
lea rsi, [rsi + 2*rax] lea rsi, [rsi + 2*rax]
packuswb xmm6, xmm3 movdqa xmm2, xmm6
movdqa xmm2, xmm7
movdqa [rdi + rdx],xmm6 ; store row 1
lea rdi, [rdi + 2*rdx] lea rdi, [rdi + 2*rdx]
cmp rdi, rcx cmp rdi, rcx
@ -1083,32 +1119,35 @@ b16x16_fp_only:
movsxd rax, dword ptr arg(1) ; src_pixels_per_line movsxd rax, dword ptr arg(1) ; src_pixels_per_line
.next_row: .next_row:
movdqu xmm2, [rsi] ; row 0 movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
movdqa xmm3, xmm2 movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
movdqu xmm4, [rsi + 1] ; row 0 + 1
lea rsi, [rsi + rax] ; next line
punpcklbw xmm2, xmm4 punpcklbw xmm2, xmm4
movdqu xmm5, [rsi] ; row 1 movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
pmaddubsw xmm2, xmm1 pmaddubsw xmm2, xmm1
movdqa xmm6, xmm5 movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
punpckhbw xmm3, xmm4 lea rsi, [rsi + rax] ; next line
movdqu xmm7, [rsi + 1] ; row 1 + 1 punpcklbw xmm3, xmm4
pmaddubsw xmm3, xmm1 pmaddubsw xmm3, xmm1
paddw xmm2, [rd GLOBAL] movq xmm5, [rsi]
paddw xmm2, [rd GLOBAL]
movq xmm7, [rsi+1]
movq xmm6, [rsi+8]
psraw xmm2, VP8_FILTER_SHIFT psraw xmm2, VP8_FILTER_SHIFT
punpcklbw xmm5, xmm7 punpcklbw xmm5, xmm7
movq xmm7, [rsi+9]
paddw xmm3, [rd GLOBAL] paddw xmm3, [rd GLOBAL]
pmaddubsw xmm5, xmm1 pmaddubsw xmm5, xmm1
psraw xmm3, VP8_FILTER_SHIFT psraw xmm3, VP8_FILTER_SHIFT
punpckhbw xmm6, xmm7 punpcklbw xmm6, xmm7
packuswb xmm2, xmm3 packuswb xmm2, xmm3
pmaddubsw xmm6, xmm1 pmaddubsw xmm6, xmm1
@ -1463,6 +1502,13 @@ shuf2b:
shuf3b: shuf3b:
db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
align 16
shuf2bfrom1:
db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:
db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
align 16 align 16
rd: rd:
times 8 dw 0x40 times 8 dw 0x40

View File

@ -1,136 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride,
; int Dc);
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *pred
; r3 unsigned char *dest
; sp int pitch
; sp+4 int stride
; sp+8 int Dc
|vp8_dequant_dc_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
ldr r1, [sp, #8] ;load Dc from stack
ldr r12, _CONSTANTS_
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
vmov.16 d2[0], r1
ldr r1, [sp] ; pitch
vld1.32 {d14[0]}, [r2], r1
vld1.32 {d14[1]}, [r2], r1
vld1.32 {d15[0]}, [r2], r1
vld1.32 {d15[1]}, [r2]
ldr r1, [sp, #4] ; stride
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; memset(input, 0, 32) -- 32bytes
vmov.i16 q14, #0
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vmov q15, q14
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vst1.16 {q14, q15}, [r0]
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r3], r1
vst1.32 {d0[1]}, [r3], r1
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r3]
bx lr
ENDP ; |vp8_dequant_dc_idct_add_neon|
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -12,6 +12,21 @@
#include "idct.h" #include "idct.h"
#include "dequantize.h" #include "dequantize.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
void idct_dequant_dc_full_2x_neon
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
int stride, short *dc);
void idct_dequant_dc_0_2x_neon
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
void idct_dequant_full_2x_neon
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
int pitch, int stride);
void idct_dequant_0_2x_neon
(short *q, short dq, unsigned char *pre, int pitch,
unsigned char *dst, int stride);
void vp8_dequant_dc_idct_add_y_block_neon void vp8_dequant_dc_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre, (short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc) unsigned char *dst, int stride, char *eobs, short *dc)
@ -20,25 +35,15 @@ void vp8_dequant_dc_idct_add_y_block_neon
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
{ {
if (eobs[0] > 1) if (((short *)eobs)[0] & 0xfefe)
vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]); idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
else else
vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride); idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
if (eobs[1] > 1) if (((short *)eobs)[1] & 0xfefe)
vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
else else
vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride); idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
if (eobs[2] > 1)
vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
else
vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
if (eobs[3] > 1)
vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
else
vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
q += 64; q += 64;
dc += 4; dc += 4;
@ -56,37 +61,15 @@ void vp8_dequant_idct_add_y_block_neon
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
{ {
if (eobs[0] > 1) if (((short *)eobs)[0] & 0xfefe)
vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride); idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
else else
{ idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1) if (((short *)eobs)[1] & 0xfefe)
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride); idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
else else
{ idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
((int *)(q+48))[0] = 0;
}
q += 64; q += 64;
pre += 64; pre += 64;
@ -99,53 +82,34 @@ void vp8_dequant_idct_add_uv_block_neon
(short *q, short *dq, unsigned char *pre, (short *q, short *dq, unsigned char *pre,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{ {
int i; if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
for (i = 0; i < 2; i++) q += 32;
{ pre += 32;
if (eobs[0] > 1) dstu += 4*stride;
vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
else
{
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1) if (((short *)eobs)[1] & 0xfefe)
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride); idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else else
{ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
((int *)(q+16))[0] = 0;
}
q += 32; q += 32;
pre += 32; pre += 32;
dstu += 4*stride;
eobs += 2;
}
for (i = 0; i < 2; i++) if (((short *)eobs)[2] & 0xfefe)
{ idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
if (eobs[0] > 1) else
vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride); idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
else
{
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1) q += 32;
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride); pre += 32;
else dstv += 4*stride;
{
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
((int *)(q+16))[0] = 0;
}
q += 32; if (((short *)eobs)[3] & 0xfefe)
pre += 32; idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
dstv += 4*stride; else
eobs += 2; idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
}
} }

View File

@ -0,0 +1,79 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
; int pitch, unsigned char *dst, int stride);
; r0 *q
; r1 dq
; r2 *pre
; r3 pitch
; sp *dst
; sp+4 stride
|idct_dequant_0_2x_neon| PROC
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d4[1]}, [r2]
vld1.32 {d8[0]}, [r12], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d10[1]}, [r12]
ldrh r12, [r0] ; lo q
ldrh r2, [r0, #32] ; hi q
mov r3, #0
strh r3, [r0]
strh r3, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
sxth r2, r2 ; hi
mul r0, r2, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
ldr r2, [sp] ; dst
ldr r3, [sp, #4] ; stride
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d10[1]}, [r0]
bx lr
ENDP ; |idct_dequant_0_2x_neon|
END

View File

@ -0,0 +1,69 @@
;
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_dc_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
; unsigned char *dst, int stride);
; r0 *dc
; r1 *pre
; r2 *dst
; r3 stride
|idct_dequant_dc_0_2x_neon| PROC
ldr r0, [r0] ; *dc
mov r12, #16
vld1.32 {d2[0]}, [r1], r12 ; lo
vld1.32 {d2[1]}, [r1], r12
vld1.32 {d4[0]}, [r1], r12
vld1.32 {d4[1]}, [r1]
sub r1, r1, #44
vld1.32 {d8[0]}, [r1], r12 ; hi
vld1.32 {d8[1]}, [r1], r12
vld1.32 {d10[0]}, [r1], r12
vld1.32 {d10[1]}, [r1]
sxth r1, r0 ; lo *dc
add r1, r1, #4
asr r1, r1, #3
vdup.16 q0, r1
sxth r0, r0, ror #16 ; hi *dc
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d10[1]}, [r0]
bx lr
ENDP ;|idct_dequant_dc_0_2x_neon|
END

View File

@ -0,0 +1,206 @@
;
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_dc_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
; unsigned char *dst, int stride, short *dc);
; r0 *q,
; r1 *dq,
; r2 *pre
; r3 *dst
; sp stride
; sp+4 *dc
|idct_dequant_dc_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
mov r1, #16 ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r1 ; l pre
vld1.32 {d28[1]}, [r12], r1 ; r pre
vld1.32 {d29[0]}, [r2], r1
vld1.32 {d29[1]}, [r12], r1
vld1.32 {d30[0]}, [r2], r1
vld1.32 {d30[1]}, [r12], r1
vld1.32 {d31[0]}, [r2]
ldr r1, [sp, #4]
vld1.32 {d31[1]}, [r12]
ldr r2, _CONSTANTS_
ldrh r12, [r1], #2 ; lo *dc
ldrh r1, [r1] ; hi *dc
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
vmul.i16 q3, q3, q1
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
; move dc up to neon and overwrite first element
vmov.16 d4[0], r12
vmov.16 d8[0], r1
vld1.16 {d0}, [r2]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
vswp d5, d8
vswp d7, d10
; _CONSTANTS_ * 4,12 >> 16
; q6: 4 * sinpi : c1/temp1
; q7: 12 * sinpi : d1/temp2
; q8: 4 * cospi
; q9: 12 * cospi
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q7, q5, d0[2]
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q9, q5, d0[0]
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
; vqdmulh only accepts signed values. this was a problem because
; our constant had the high bit set, and was treated as a negative value.
; vqdmulh also doubles the value before it shifts by 16. we need to
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
; so we can shift the constant without losing precision. this avoids
; shift again afterward, but also avoids the sign issue. win win!
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
; pre-shift it
vshr.s16 q8, q8, #1
vshr.s16 q9, q9, #1
; q4: 4 + 4 * cospi : d1/temp1
; q5: 12 + 12 * cospi : c1/temp2
vqadd.s16 q4, q4, q8
vqadd.s16 q5, q5, q9
; c1 = temp1 - temp2
; d1 = temp1 + temp2
vqsub.s16 q2, q6, q5
vqadd.s16 q3, q4, q7
; [0]: a1+d1
; [1]: b1+c1
; [2]: b1-c1
; [3]: a1-d1
vqadd.s16 q4, q10, q3
vqadd.s16 q5, q11, q2
vqsub.s16 q6, q11, q2
vqsub.s16 q7, q10, q3
; rotate
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; idct loop 2
; q4: l 0, 4, 8,12 r 0, 4, 8,12
; q5: l 1, 5, 9,13 r 1, 5, 9,13
; q6: l 2, 6,10,14 r 2, 6,10,14
; q7: l 3, 7,11,15 r 3, 7,11,15
; q8: 1 * sinpi : c1/temp1
; q9: 3 * sinpi : d1/temp2
; q10: 1 * cospi
; q11: 3 * cospi
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q9, q7, d0[2]
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q11, q7, d0[0]
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
; see note on shifting above
vshr.s16 q10, q10, #1
vshr.s16 q11, q11, #1
; q10: 1 + 1 * cospi : d1/temp1
; q11: 3 + 3 * cospi : c1/temp2
vqadd.s16 q10, q5, q10
vqadd.s16 q11, q7, q11
; q8: c1 = temp1 - temp2
; q9: d1 = temp1 + temp2
vqsub.s16 q8, q8, q11
vqadd.s16 q9, q10, q9
; a1+d1
; b1+c1
; b1-c1
; a1-d1
vqadd.s16 q4, q2, q9
vqadd.s16 q5, q3, q8
vqsub.s16 q6, q3, q8
vqsub.s16 q7, q2, q9
; +4 >> 3 (rounding)
vrshr.s16 q4, q4, #3 ; lo
vrshr.s16 q5, q5, #3
vrshr.s16 q6, q6, #3 ; hi
vrshr.s16 q7, q7, #3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; adding pre
; input is still packed. pre was read interleaved
vaddw.u8 q4, q4, d28
vaddw.u8 q5, q5, d29
vaddw.u8 q6, q6, d30
vaddw.u8 q7, q7, d31
vmov.i16 q14, #0
vmov q15, q14
vst1.16 {q14, q15}, [r0] ; write over high input
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
ldr r1, [sp] ; stride
add r2, r3, #4 ; hi
vst1.32 {d0[0]}, [r3], r1 ; lo
vst1.32 {d0[1]}, [r2], r1 ; hi
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r2], r1
vst1.32 {d2[0]}, [r3], r1
vst1.32 {d2[1]}, [r2], r1
vst1.32 {d3[0]}, [r3]
vst1.32 {d3[1]}, [r2]
bx lr
ENDP ; |idct_dequant_dc_full_2x_neon|
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2 DCD 0x4546
END

View File

@ -0,0 +1,198 @@
;
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |idct_dequant_full_2x_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
;                               unsigned char *dst, int pitch, int stride);
; r0    *q      coefficients: two adjacent 4x4 blocks, 16 shorts each
;               (left block at q[0..15], right at q[16..31]); the whole
;               buffer is zeroed before returning
; r1    *dq     dequantization factors, 16 shorts, shared by both blocks
; r2    *pre    predictor: two 4x4 blocks side by side, rows 'pitch' apart
; r3    *dst    destination: two 4x4 blocks side by side, rows 'stride' apart
; sp    pitch
; sp+4  stride
;
; Dequantizes and inverse-transforms two horizontally adjacent 4x4 blocks
; in a single pass, then adds the predictor and stores the saturated
; result. Throughout, the left block occupies the low d register of each
; q register and the right block the high d register, so every vector
; instruction processes both blocks at once.
|idct_dequant_full_2x_neon| PROC
    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
    vld1.16         {q2, q3}, [r0]          ; l q
    ldr             r1, [sp]                ; pitch
    add             r0, r0, #32             ; r0 -> right block's coefficients
    vld1.16         {q4, q5}, [r0]          ; r q
    add             r12, r2, #4             ; r12 -> right block of predictor

    ; interleave the predictors: one 4-byte row of the left block into the
    ; low half of each d lane, the matching right-block row into the high half
    vld1.32         {d28[0]}, [r2], r1      ; l pre
    vld1.32         {d28[1]}, [r12], r1     ; r pre
    vld1.32         {d29[0]}, [r2], r1
    vld1.32         {d29[1]}, [r12], r1
    vld1.32         {d30[0]}, [r2], r1
    vld1.32         {d30[1]}, [r12], r1
    vld1.32         {d31[0]}, [r2]
    vld1.32         {d31[1]}, [r12]

    ldr             r2, _CONSTANTS_         ; pc-relative load of the address
                                            ; of the constant table

    ; dequant: q[i] = q[i] * dq[i]
    vmul.i16        q2, q2, q0
    vmul.i16        q3, q3, q1
    vmul.i16        q4, q4, q0
    vmul.i16        q5, q5, q1

    vld1.16         {d0}, [r2]              ; d0[0] = cospi8sqrt2minus1
                                            ; d0[2] = sinpi8sqrt2 (pre-shifted)

    ; swap the middle d registers so each q register holds the same row of
    ; both blocks:
    ; q2: l0r0   q3: l8r8
    ; q4: l4r4   q5: l12r12
    vswp            d5, d8
    vswp            d7, d10

    ; idct loop 1 (columns)
    ; _CONSTANTS_ * 4,12 >> 16
    ; q6:  4 * sinpi : c1/temp1
    ; q7: 12 * sinpi : d1/temp2
    ; q8:  4 * cospi
    ; q9: 12 * cospi
    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
    vqdmulh.s16     q7, q5, d0[2]
    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
    vqdmulh.s16     q9, q5, d0[0]

    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

    ; vqdmulh only accepts signed values. this was a problem because
    ; our constant had the high bit set, and was treated as a negative value.
    ; vqdmulh also doubles the value before it shifts by 16. we need to
    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
    ; so we can shift the constant without losing precision. this avoids
    ; shift again afterward, but also avoids the sign issue. win win!
    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
    ; pre-shift it
    vshr.s16        q8, q8, #1
    vshr.s16        q9, q9, #1

    ; q4:  4 +  4 * cospi : d1/temp1
    ; q5: 12 + 12 * cospi : c1/temp2
    vqadd.s16       q4, q4, q8
    vqadd.s16       q5, q5, q9

    ; c1 = temp1 - temp2
    ; d1 = temp1 + temp2
    vqsub.s16       q2, q6, q5
    vqadd.s16       q3, q4, q7

    ; butterfly outputs, in row order:
    ; [0]: a1+d1
    ; [1]: b1+c1
    ; [2]: b1-c1
    ; [3]: a1-d1
    vqadd.s16       q4, q10, q3
    vqadd.s16       q5, q11, q2
    vqsub.s16       q6, q11, q2
    vqsub.s16       q7, q10, q3

    ; rotate: vtrn.32 pairs stay within each d half, so together with
    ; vtrn.16 this transposes each block's 4x4 independently
    vtrn.32         q4, q6
    vtrn.32         q5, q7
    vtrn.16         q4, q5
    vtrn.16         q6, q7

    ; idct loop 2 (same butterfly, applied to the transposed data)
    ; q4: l 0, 4, 8,12  r 0, 4, 8,12
    ; q5: l 1, 5, 9,13  r 1, 5, 9,13
    ; q6: l 2, 6,10,14  r 2, 6,10,14
    ; q7: l 3, 7,11,15  r 3, 7,11,15
    ; q8:  1 * sinpi : c1/temp1
    ; q9:  3 * sinpi : d1/temp2
    ; q10: 1 * cospi
    ; q11: 3 * cospi
    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
    vqdmulh.s16     q9, q7, d0[2]
    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
    vqdmulh.s16     q11, q7, d0[0]

    vqadd.s16       q2, q4, q6              ; a1 = 0 + 2
    vqsub.s16       q3, q4, q6              ; b1 = 0 - 2

    ; see note on shifting above
    vshr.s16        q10, q10, #1
    vshr.s16        q11, q11, #1

    ; q10: 1 + 1 * cospi : d1/temp1
    ; q11: 3 + 3 * cospi : c1/temp2
    vqadd.s16       q10, q5, q10
    vqadd.s16       q11, q7, q11

    ; q8: c1 = temp1 - temp2
    ; q9: d1 = temp1 + temp2
    vqsub.s16       q8, q8, q11
    vqadd.s16       q9, q10, q9

    ; final butterfly:
    ; a1+d1
    ; b1+c1
    ; b1-c1
    ; a1-d1
    vqadd.s16       q4, q2, q9
    vqadd.s16       q5, q3, q8
    vqsub.s16       q6, q3, q8
    vqsub.s16       q7, q2, q9

    ; +4 >> 3 (rounding)
    vrshr.s16       q4, q4, #3              ; lo
    vrshr.s16       q5, q5, #3
    vrshr.s16       q6, q6, #3              ; hi
    vrshr.s16       q7, q7, #3

    ; transpose back so the rows are in store order
    vtrn.32         q4, q6
    vtrn.32         q5, q7
    vtrn.16         q4, q5
    vtrn.16         q6, q7

    ; adding pre
    ; input is still packed. pre was read interleaved
    vaddw.u8        q4, q4, d28             ; widen pre to 16 bits and add
    vaddw.u8        q5, q5, d29
    vaddw.u8        q6, q6, d30
    vaddw.u8        q7, q7, d31

    ; zero the coefficient buffer (r0 still points at the right block)
    vmov.i16        q14, #0
    vmov            q15, q14
    vst1.16         {q14, q15}, [r0]        ; write over high input
    sub             r0, r0, #32
    vst1.16         {q14, q15}, [r0]        ; write over low input

    ; saturate and narrow to unsigned 8-bit pixels
    vqmovun.s16     d0, q4                  ; lo
    vqmovun.s16     d1, q5
    vqmovun.s16     d2, q6                  ; hi
    vqmovun.s16     d3, q7

    ldr             r1, [sp, #4]            ; stride
    add             r2, r3, #4              ; r2 -> right (hi) block of dst

    ; store one 4-byte row per block per instruction; r3 walks the left
    ; block, r2 the right, both advancing by stride
    vst1.32         {d0[0]}, [r3], r1       ; lo
    vst1.32         {d0[1]}, [r2], r1       ; hi
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r2], r1
    vst1.32         {d2[0]}, [r3], r1
    vst1.32         {d2[1]}, [r2], r1
    vst1.32         {d3[0]}, [r3]
    vst1.32         {d3[1]}, [r2]

    bx              lr

    ENDP            ; |idct_dequant_full_2x_neon|

; Constant Pool
; _CONSTANTS_ stores the address of the table so the pc-relative
; ldr r2, _CONSTANTS_ above can fetch a pointer to the constants.
_CONSTANTS_       DCD cospi8sqrt2minus1
; stored unshifted: its lowest bit is 1 (see shifting note above)
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
; (0x4546 = 0x8a8c >> 1) without losing precision
sinpi8sqrt2       DCD 0x4546

    END

View File

@ -25,7 +25,10 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
#File list for neon #File list for neon
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c