Merge remote branch 'internal/upstream' into HEAD

2010-09-21 00:05:03 -04:00 · 2010-09-21 00:05:03 -04:00 · 99c611fea6
commit 99c611fea6
parent 14b322e466 b7dc9398f2
9 changed files with 796 additions and 383 deletions
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@ -11,6 +11,8 @@

 %include "vpx_ports/x86_abi_support.asm"

+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8

 %macro LFH_FILTER_MASK 1
 %if %1
@ -33,8 +35,6 @@
        psubusb     xmm2,                   xmm6              ; q3-=q2
        por         xmm1,                   xmm2              ; abs(q3-q2)

-        psubusb     xmm1,                   xmm7
-
 %if %1
        movdqa      xmm4,                   [rsi+rax]         ; q1
 %else
@ -49,9 +49,7 @@
        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1
        por         xmm4,                   xmm6              ; abs(q2-q1)
-        psubusb     xmm4,                   xmm7
-
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4

 %if %1
        movdqa      xmm4,                   [rsi]             ; q0
@ -67,9 +65,7 @@
        psubusb     xmm3,                   xmm0              ; q1-=q0
        por         xmm4,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm4              ; save to t0
-
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4

 %if %1
        neg         rax                     ; negate pitch to deal with above border
@ -95,9 +91,7 @@
        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2
        por         xmm4,                   xmm2              ; abs(p3 - p2)
-
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4

 %if %1
        movdqa      xmm4,                   [rsi+2*rax]       ; p1
@ -113,9 +107,8 @@
        psubusb     xmm4,                   xmm5              ; p1-=p2
        psubusb     xmm5,                   xmm3              ; p2-=p1
        por         xmm4,                   xmm5              ; abs(p2 - p1)
-        psubusb     xmm4,                   xmm7
+        pmaxub      xmm1,                   xmm4

-        por         xmm1,                   xmm4
        movdqa      xmm2,                   xmm3              ; p1

 %if %1
@ -133,8 +126,8 @@
        por         xmm4,                   xmm3              ; abs(p1 - p0)
        movdqa        t1,                   xmm4              ; save to t1

-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4
+        psubusb     xmm1,                   xmm7

 %if %1
        movdqa      xmm3,                   [rdi]             ; q1
@ -872,19 +865,18 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
-        por         xmm7,               xmm0            ; abs (q3-q2)
-
        movdqa      xmm4,               xmm5            ; q1
+
+        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

-        psubusb     xmm6,               xmm5            ; q2-q1
-        por         xmm6,               xmm4            ; abs (q2-q1)
-
        movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1

+        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;
-        psubusb     xmm2,               xmm1            ; p3 - p2;

+        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)
 %if %1
        movdqa      xmm2,               [rdx]           ; p1
@ -892,39 +884,28 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        movdqa      xmm2,               [rdx+32]        ; p1
 %endif
        movdqa      xmm5,               xmm2            ; p1
+        pmaxub      xmm0,               xmm7

        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

-        por         xmm1,               xmm5            ; abs(p2-p1)
-
-        mov         rdx,                arg(3)          ; limit
-        movdqa      xmm4,               [rdx]           ; limit
-
-        psubusb     xmm7,               xmm4
-
-        psubusb     xmm0,               xmm4            ; abs(p3-p2) > limit
-        psubusb     xmm1,               xmm4            ; abs(p2-p1) > limit
-
-        psubusb     xmm6,               xmm4            ; abs(q2-q1) > limit
-        por         xmm7,               xmm6            ; or
-
-        por         xmm0,               xmm1
-        por         xmm0,               xmm7            ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movdqa      xmm1,               xmm2            ; p1
-
        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

+        por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
+
+        pmaxub      xmm0,               xmm1
+        movdqa      xmm1,               xmm2            ; p1
+
        psubusb     xmm2,               xmm3            ; p1-p0
        por         xmm2,               xmm7            ; abs(p1-p0)

        movdqa      t0,                 xmm2            ; save abs(p1-p0)
        lea         rdx,                srct

-        psubusb     xmm2,               xmm4            ; abs(p1-p0)>limit
-        por         xmm0,               xmm2            ; mask
+        pmaxub      xmm0,               xmm2
+
 %if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
@ -940,9 +921,12 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)
-        psubusb     xmm7,               xmm4            ; abs(q1-q0)> limit

-        por         xmm0,               xmm7            ; mask
+        mov         rdx,                arg(3)          ; limit
+        movdqa      xmm4,               [rdx]           ; limit
+
+        pmaxub      xmm0,               xmm7
+        psubusb     xmm0,               xmm4

        movdqa      xmm5,               xmm2            ; q1
        psubusb     xmm5,               xmm1            ; q1-=p1
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@ -70,27 +70,35 @@ sym(vp8_filter_block1d8_h6_ssse3):
    sub         rdi, rdx
 ;xmm3 free
 filter_block1d8_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [shuf1b GLOBAL]
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pmaddubsw   xmm0, xmm4
-    pmaddubsw   xmm1, xmm5
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    pshufb      xmm2, [shuf3b GLOBAL]
-    add         rdi, rdx
-    pmaddubsw   xmm2, xmm6
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3
@ -107,8 +115,8 @@ vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm3, XMMWORD PTR [shuf2b GLOBAL]
-    movdqa      xmm4, XMMWORD PTR [shuf3b GLOBAL]
+    movdqa      xmm3, XMMWORD PTR [shuf2bfrom1 GLOBAL]
+    movdqa      xmm4, XMMWORD PTR [shuf3bfrom1 GLOBAL]

    mov         rsi, arg(0)             ;src_ptr

@ -118,24 +126,33 @@ vp8_filter_block1d8_h4_ssse3:
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx
-;xmm3 free
+
 filter_block1d8_h4_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movdqa      xmm2, xmm0
-    pshufb      xmm0, xmm3 ;[shuf2b GLOBAL]
-    pshufb      xmm2, xmm4 ;[shuf3b GLOBAL]
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    pmaddubsw   xmm0, xmm5
-    add         rdi, rdx
-    pmaddubsw   xmm2, xmm6
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
+
+    paddsw      xmm0,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

@ -168,74 +185,88 @@ sym(vp8_filter_block1d16_h6_ssse3):
    push        rdi
    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
    xor         rsi, rsi
    shl         rdx, 4      ;

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx

-    mov         rdi, arg(2)             ;output_ptr
-    movdqa      xmm7, [rd GLOBAL]
+    mov         rdi, arg(2)                     ;output_ptr

 ;;
 ;;    cmp         esi, DWORD PTR [rax]
 ;;    je          vp8_filter_block1d16_h4_ssse3

-    mov         rsi, arg(0)             ;src_ptr
+    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch

 filter_block1d16_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [shuf1b GLOBAL]
-    movdqa      xmm2, xmm1
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pshufb      xmm2, [shuf3b GLOBAL]
-    pmaddubsw   xmm1, xmm5
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    pmaddubsw   xmm2, xmm6
-    paddsw      xmm0, xmm1
-    movdqa      xmm1, xmm3
-    pshufb      xmm3, [shuf1b GLOBAL]
-    paddsw      xmm0, xmm7
-    pmaddubsw   xmm3, xmm4
-    paddsw      xmm0, xmm2
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pshufb      xmm2, [shuf3b GLOBAL]
-    pmaddubsw   xmm1, xmm5
-    pmaddubsw   xmm2, xmm6
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+    movq        xmm3,   MMWORD PTR [rsi +  6]
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [rd GLOBAL]
+
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0

-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
    lea         rsi,    [rsi + rax]
-    paddsw      xmm3, xmm1
-    paddsw      xmm3, xmm7
-    paddsw      xmm3, xmm2
-    psraw       xmm3, 7
-    packuswb    xmm3, xmm3
+    paddsw      xmm3,   xmm1

-    punpcklqdq  xmm0, xmm3
+    paddsw      xmm3,   xmm2
+
+    paddsw      xmm3,   [rd GLOBAL]
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3

    movdqa      XMMWORD Ptr [rdi], xmm0

-    add         rdi, rdx
+    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3

-
    ; begin epilog
    pop rdi
    pop rsi
@ -268,7 +299,7 @@ filter_block1d16_h4_rowloop_ssse3:
    pshufb      xmm3, [shuf3b GLOBAL]
    pshufb      xmm0, [shuf2b GLOBAL]

-    paddsw      xmm1, xmm7
+    paddsw      xmm1, [rd GLOBAL]
    paddsw      xmm1, xmm2

    pmaddubsw   xmm0, xmm5
@ -278,7 +309,7 @@ filter_block1d16_h4_rowloop_ssse3:
    packuswb    xmm1, xmm1
    lea         rsi,    [rsi + rax]
    paddsw      xmm3, xmm0
-    paddsw      xmm3, xmm7
+    paddsw      xmm3, [rd GLOBAL]
    psraw       xmm3, 7
    packuswb    xmm3, xmm3

@ -939,17 +970,19 @@ sym(vp8_bilinear_predict16x16_ssse3):
 %if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
 %endif
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        movdqa      xmm4,       xmm3
-
-        movdqu      xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rdx]         ; next line
+
        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14

-        punpckhbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15

        paddw       xmm3,       [rd GLOBAL]         ; xmm3 += round value
@ -962,17 +995,18 @@ sym(vp8_bilinear_predict16x16_ssse3):
        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

 .next_row:
-        movdqu      xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        movdqa      xmm4,       xmm6
-
-        movdqu      xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+        lea         rsi,        [rsi + rdx]         ; next line
+
        pmaddubsw   xmm6,       xmm1

-        punpckhbw   xmm4,       xmm5
+        punpcklbw   xmm4,       xmm5
        pmaddubsw   xmm4,       xmm1

        paddw       xmm6,       [rd GLOBAL]         ; xmm6 += round value
@ -1027,49 +1061,51 @@ b16x16_sp_only:
        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

        ; get the first horizontal line done
-        movdqu      xmm2,       [rsi]               ; load row 0
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0

        lea         rsi,        [rsi + rax]         ; next line
 .next_row:
-        movdqu      xmm3,       [rsi]               ; load row + 1
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1

-        movdqu      xmm4,       xmm2
        punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5

        pmaddubsw   xmm4,       xmm1
-        movdqu      xmm7,       [rsi + rax]         ; load row + 2
-
-        punpckhbw   xmm2,       xmm3
-        movdqu      xmm6,       xmm3
+        movq        xmm7,       [rsi + rax]         ; load row + 2

        pmaddubsw   xmm2,       xmm1
-        punpcklbw   xmm6,       xmm7
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2

+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
        paddw       xmm4,       [rd GLOBAL]
-        pmaddubsw   xmm6,       xmm1
+
+        pmaddubsw   xmm5,       xmm1
+        paddw       xmm2,       [rd GLOBAL]

        psraw       xmm4,       VP8_FILTER_SHIFT
-        punpckhbw   xmm3,       xmm7
-
-        paddw       xmm2,       [rd GLOBAL]
-        pmaddubsw   xmm3,       xmm1
-
        psraw       xmm2,       VP8_FILTER_SHIFT
-        paddw       xmm6,       [rd GLOBAL]

        packuswb    xmm4,       xmm2
-        psraw       xmm6,       VP8_FILTER_SHIFT
-
-        movdqa      [rdi],      xmm4                ; store row 0
        paddw       xmm3,       [rd GLOBAL]

+        movdqa      [rdi],      xmm4                ; store row 0
+        paddw       xmm5,       [rd GLOBAL]
+
        psraw       xmm3,       VP8_FILTER_SHIFT
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
+
+        movdqa      [rdi + rdx],xmm3                ; store row 1
        lea         rsi,        [rsi + 2*rax]

-        packuswb    xmm6,       xmm3
-        movdqa      xmm2,       xmm7
-
-        movdqa      [rdi + rdx],xmm6                ; store row 1
+        movdqa      xmm2,       xmm6
        lea         rdi,        [rdi + 2*rdx]

        cmp         rdi,        rcx
@ -1083,32 +1119,35 @@ b16x16_fp_only:
        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

 .next_row:
-        movdqu      xmm2,       [rsi]               ; row 0
-        movdqa      xmm3,       xmm2
-
-        movdqu      xmm4,       [rsi + 1]           ; row 0 + 1
-        lea         rsi,        [rsi + rax]         ; next line
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

        punpcklbw   xmm2,       xmm4
-        movdqu      xmm5,       [rsi]               ; row 1
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

        pmaddubsw   xmm2,       xmm1
-        movdqa      xmm6,       xmm5
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        punpckhbw   xmm3,       xmm4
-        movdqu      xmm7,       [rsi + 1]           ; row 1 + 1
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4

        pmaddubsw   xmm3,       xmm1
-        paddw       xmm2,       [rd GLOBAL]
+        movq        xmm5,       [rsi]

+        paddw       xmm2,       [rd GLOBAL]
+        movq        xmm7,       [rsi+1]
+
+        movq        xmm6,       [rsi+8]
        psraw       xmm2,       VP8_FILTER_SHIFT
+
        punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]

        paddw       xmm3,       [rd GLOBAL]
        pmaddubsw   xmm5,       xmm1

        psraw       xmm3,       VP8_FILTER_SHIFT
-        punpckhbw   xmm6,       xmm7
+        punpcklbw   xmm6,       xmm7

        packuswb    xmm2,       xmm3
        pmaddubsw   xmm6,       xmm1
@ -1463,6 +1502,13 @@ shuf2b:
 shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
 align 16
 rd:
    times 8 dw 0x40
--- a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
@ -1,136 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_dc_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
-;                                  unsigned char *dest, int pitch, int stride,
-;                                  int Dc);
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *pred
-; r3    unsigned char *dest
-; sp    int pitch
-; sp+4  int stride
-; sp+8  int Dc
-|vp8_dequant_dc_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-
-    ldr             r1, [sp, #8]            ;load Dc from stack
-
-    ldr             r12, _CONSTANTS_
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-    vmov.16         d2[0], r1
-
-    ldr             r1, [sp]                ; pitch
-    vld1.32         {d14[0]}, [r2], r1
-    vld1.32         {d14[1]}, [r2], r1
-    vld1.32         {d15[0]}, [r2], r1
-    vld1.32         {d15[1]}, [r2]
-
-    ldr             r1, [sp, #4]            ; stride
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r3], r1
-    vst1.32         {d0[1]}, [r3], r1
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r3]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_dc_idct_add_neon|
-
-; Constant Pool
-_CONSTANTS_       DCD cospi8sqrt2minus1
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@ -12,6 +12,21 @@
 #include "idct.h"
 #include "dequantize.h"

+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+            (short *input, short *dq, unsigned char *pre, unsigned char *dst,
+             int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+            (short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+            (short *q, short *dq, unsigned char *pre, unsigned char *dst,
+             int pitch, int stride);
+void idct_dequant_0_2x_neon
+            (short *q, short dq, unsigned char *pre, int pitch,
+             unsigned char *dst, int stride);
+
 void vp8_dequant_dc_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
@ -20,25 +35,15 @@ void vp8_dequant_dc_idct_add_y_block_neon

    for (i = 0; i < 4; i++)
    {
-        if (eobs[0] > 1)
-            vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]);
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
        else
-            vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride);
+            idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);

-        if (eobs[1] > 1)
-            vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
        else
-            vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride);
-
-        if (eobs[2] > 1)
-            vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
-        else
-            vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
-
-        if (eobs[3] > 1)
-            vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
-        else
-            vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
+            idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);

        q    += 64;
        dc   += 4;
@ -56,37 +61,15 @@ void vp8_dequant_idct_add_y_block_neon

    for (i = 0; i < 4; i++)
    {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride);
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
        else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
-            ((int *)q)[0] = 0;
-        }
+            idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);

-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride);
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
        else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
-            ((int *)(q+16))[0] = 0;
-        }
-
-        if (eobs[2] > 1)
-            vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
-            ((int *)(q+32))[0] = 0;
-        }
-
-        if (eobs[3] > 1)
-            vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
-            ((int *)(q+48))[0] = 0;
-        }
+            idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);

        q    += 64;
        pre  += 64;
@ -99,53 +82,34 @@ void vp8_dequant_idct_add_uv_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
-    int i;
+    if (((short *)eobs)[0] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);

-    for (i = 0; i < 2; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
-            ((int *)q)[0] = 0;
-        }
+    q    += 32;
+    pre  += 32;
+    dstu += 4*stride;

-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
-            ((int *)(q+16))[0] = 0;
-        }
+    if (((short *)eobs)[1] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);

-        q    += 32;
-        pre  += 32;
-        dstu += 4*stride;
-        eobs += 2;
-    }
+    q += 32;
+    pre += 32;

-    for (i = 0; i < 2; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
-            ((int *)q)[0] = 0;
-        }
+    if (((short *)eobs)[2] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);

-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
-            ((int *)(q+16))[0] = 0;
-        }
+    q    += 32;
+    pre  += 32;
+    dstv += 4*stride;

-        q    += 32;
-        pre  += 32;
-        dstv += 4*stride;
-        eobs += 2;
-    }
+    if (((short *)eobs)[3] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
 }
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@ -0,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+;                            int pitch, unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *pre
+; r3   pitch
+; sp   *dst
+; sp+4 stride
+|idct_dequant_0_2x_neon| PROC
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d4[1]}, [r2]
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d10[1]}, [r12]
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r2, [r0, #32]           ; hi q
+    mov             r3, #0
+    strh            r3, [r0]
+    strh            r3, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r2, r2                  ; hi
+    mul             r0, r2, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    ldr             r2, [sp]                ; dst
+    ldr             r3, [sp, #4]            ; stride
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_0_2x_neon|
+    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+;                               unsigned char *dst, int stride);
+; r0  *dc
+; r1  *pre
+; r2  *dst
+; r3  stride
+|idct_dequant_dc_0_2x_neon| PROC
+    ldr             r0, [r0]                ; *dc
+    mov             r12, #16
+
+    vld1.32         {d2[0]}, [r1], r12      ; lo
+    vld1.32         {d2[1]}, [r1], r12
+    vld1.32         {d4[0]}, [r1], r12
+    vld1.32         {d4[1]}, [r1]
+    sub             r1, r1, #44
+    vld1.32         {d8[0]}, [r1], r12      ; hi
+    vld1.32         {d8[1]}, [r1], r12
+    vld1.32         {d10[0]}, [r1], r12
+    vld1.32         {d10[1]}, [r1]
+
+    sxth            r1, r0                  ; lo *dc
+    add             r1, r1, #4
+    asr             r1, r1, #3
+    vdup.16         q0, r1
+    sxth            r0, r0, ror #16         ; hi *dc
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ;|idct_dequant_dc_0_2x_neon|
+    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@ -0,0 +1,206 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                                  unsigned char *dst, int stride, short *dc);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    stride
+; sp+4  *dc
+|idct_dequant_dc_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    mov             r1, #16                 ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    ldr             r1, [sp, #4]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ldrh            r12, [r1], #2           ; lo *dc
+    ldrh            r1, [r1]                ; hi *dc
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    ; move dc up to neon and overwrite first element
+    vmov.16         d4[0], r12
+    vmov.16         d8[0], r1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp]                ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                               unsigned char *dst, int pitch, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    pitch
+; sp+4  stride
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    ldr             r1, [sp]                ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp, #4]            ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@ -25,7 +25,10 @@ VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c

 #File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c