rv40dsp: implement prescaled versions for biweight.

Quite often, the original weights are multiples of 512. By prescaling them by 1/512 when they are computed (once per frame), no intermediate shifting is needed, and no prescaling is needed on each call either. The x86 code already used that trick. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2012-03-19 22:46:28 +01:00 · 2012-03-19 22:46:28 +01:00 · 272b252c01
commit 272b252c01
parent d3c59d5003
7 changed files with 113 additions and 80 deletions
--- a/libavcodec/arm/rv40dsp_init_neon.c
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@ -128,8 +128,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;

-    c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
-    c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
+    c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
+    c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;

    c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
    c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@ -521,7 +521,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int
 */
 static int calc_add_mv(RV34DecContext *r, int dir, int val)
 {
-    int mul = dir ? -r->weight2 : r->weight1;
+    int mul = dir ? -r->mv_weight2 : r->mv_weight1;

    return (val * mul + 0x2000) >> 14;
 }
@ -776,24 +776,24 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type,

 static void rv4_weight(RV34DecContext *r)
 {
-    r->rdsp.rv40_weight_pixels_tab[0](r->s.dest[0],
-                                      r->tmp_b_block_y[0],
-                                      r->tmp_b_block_y[1],
-                                      r->weight1,
-                                      r->weight2,
-                                      r->s.linesize);
-    r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[1],
-                                      r->tmp_b_block_uv[0],
-                                      r->tmp_b_block_uv[2],
-                                      r->weight1,
-                                      r->weight2,
-                                      r->s.uvlinesize);
-    r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[2],
-                                      r->tmp_b_block_uv[1],
-                                      r->tmp_b_block_uv[3],
-                                      r->weight1,
-                                      r->weight2,
-                                      r->s.uvlinesize);
+    r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][0](r->s.dest[0],
+                                                        r->tmp_b_block_y[0],
+                                                        r->tmp_b_block_y[1],
+                                                        r->weight1,
+                                                        r->weight2,
+                                                        r->s.linesize);
+    r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[1],
+                                                        r->tmp_b_block_uv[0],
+                                                        r->tmp_b_block_uv[2],
+                                                        r->weight1,
+                                                        r->weight2,
+                                                        r->s.uvlinesize);
+    r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[2],
+                                                        r->tmp_b_block_uv[1],
+                                                        r->tmp_b_block_uv[3],
+                                                        r->weight1,
+                                                        r->weight2,
+                                                        r->s.uvlinesize);
 }

 static void rv34_mc_2mv(RV34DecContext *r, const int block_type)
@ -1703,11 +1703,21 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
            int dist0   = GET_PTS_DIFF(r->cur_pts,  r->last_pts);
            int dist1   = GET_PTS_DIFF(r->next_pts, r->cur_pts);

-            if (!refdist) {
-                r->weight1 = r->weight2 = 8192;
-            } else {
-                r->weight1 = (dist0 << 14) / refdist;
-                r->weight2 = (dist1 << 14) / refdist;
+            if(!refdist){
+                r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192;
+                r->scaled_weight = 0;
+            }else{
+                r->mv_weight1 = (dist0 << 14) / refdist;
+                r->mv_weight2 = (dist1 << 14) / refdist;
+                if((r->mv_weight1|r->mv_weight2) & 511){
+                    r->weight1 = r->mv_weight1;
+                    r->weight2 = r->mv_weight2;
+                    r->scaled_weight = 0;
+                }else{
+                    r->weight1 = r->mv_weight1 >> 9;
+                    r->weight2 = r->mv_weight2 >> 9;
+                    r->scaled_weight = 1;
+                }
            }
        }
        s->mb_x = s->mb_y = 0;
--- a/libavcodec/rv34.h
+++ b/libavcodec/rv34.h
@ -106,7 +106,9 @@ typedef struct RV34DecContext{
    int rpr;                 ///< one field size in RV30 slice header

    int cur_pts, last_pts, next_pts;
+    int scaled_weight;
    int weight1, weight2;    ///< B frame distance fractions (0.14) used in motion compensation
+    int mv_weight1, mv_weight2;

    uint16_t *cbp_luma;      ///< CBP values for luma subblocks
    uint8_t  *cbp_chroma;    ///< CBP values for chroma subblocks
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@ -58,7 +58,12 @@ typedef struct RV34DSPContext {
    qpel_mc_func avg_pixels_tab[4][16];
    h264_chroma_mc_func put_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_chroma_pixels_tab[3];
-    rv40_weight_func rv40_weight_pixels_tab[2];
+    /**
+     * Biweight functions, first dimension is whether the weight is
+     * prescaled by 1/512 to skip the intermediate shifting, second
+     * is transform size (16/8).
+     */
+    rv40_weight_func rv40_weight_pixels_tab[2][2];
    rv34_inv_transform_func rv34_inv_transform;
    rv34_inv_transform_func rv34_inv_transform_dc;
    rv34_idct_add_func rv34_idct_add;
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@ -278,7 +278,7 @@ RV40_CHROMA_MC(put_, op_put)
 RV40_CHROMA_MC(avg_, op_avg)

 #define RV40_WEIGHT_FUNC(size) \
-static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
+static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
 {\
    int i, j;\
 \
@ -289,6 +289,18 @@ static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src
        src2 += stride;\
        dst  += stride;\
    }\
+}\
+static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
+{\
+    int i, j;\
+\
+    for (j = 0; j < size; j++) {\
+        for (i = 0; i < size; i++)\
+            dst[i] = (w2 * src1[i] + w1 * src2[i] + 0x10) >> 5;\
+        src1 += stride;\
+        src2 += stride;\
+        dst  += stride;\
+    }\
 }

 RV40_WEIGHT_FUNC(16)
@ -578,8 +590,10 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
    c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c;
    c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c;

-    c->rv40_weight_pixels_tab[0] = rv40_weight_func_16;
-    c->rv40_weight_pixels_tab[1] = rv40_weight_func_8;
+    c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16;
+    c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8;
+    c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16;
+    c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8;

    c->rv40_weak_loop_filter[0]     = rv40_h_weak_loop_filter;
    c->rv40_weak_loop_filter[1]     = rv40_v_weak_loop_filter;
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@ -139,69 +139,61 @@ SECTION .text

 ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 ; %1=size  %2=num of xmm regs
-%macro RV40_WEIGHT  2
-cglobal rv40_weight_func_%1, 6, 7, %2
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (e.g. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Therefore, the caller checks whether they are multiples of 2^9
+; and, if so, prescales them by 1/512 and uses the nornd variant.
+%macro RV40_WEIGHT  3
+cglobal rv40_weight_func_%1_%2, 6, 7, %3
 %if cpuflag(ssse3)
    mova       m1, [shift_round]
 %else
    mova       m1, [pw_16]
 %endif
    pxor       m0, m0
-    mov        r6, r3
-    or         r6, r4
-    ; The weights are FP0.14 notation of fractions depending on pts.
-    ; For timebases without rounding error (i.e. PAL), the fractions
-    ; can be simplified, and several operations can be avoided.
-    ; Therefore, we check here whether they are multiples of 2^9 for
-    ; those simplifications to occur.
-    and        r6, 0x1FF
    ; Set loop counter and increments
 %if mmsize == 8
-    mov        r6, %1
+    mov        r6, %2
 %else
-    mov        r6, (%1 * %1) / mmsize
+    mov        r6, (%2 * %2) / mmsize
 %endif

-    ; Use result of test now
-    jz .loop_512
    movd       m2, r3
    movd       m3, r4
+%ifidn %1,rnd
+%define  RND   0
    SPLATW     m2, m2
+%else
+%define  RND   1
+%if cpuflag(ssse3)
+    punpcklbw  m3, m2
+%else
+    SPLATW     m2, m2
+%endif
+%endif
    SPLATW     m3, m3

 .loop:
-    MAIN_LOOP  %1, 0
+    MAIN_LOOP  %2, RND
    jnz        .loop
    REP_RET
-
-    ; Weights are multiple of 512, which allows some shortcuts
-.loop_512:
-    sar        r3, 9
-    sar        r4, 9
-    movd       m2, r3
-    movd       m3, r4
-%if cpuflag(ssse3)
-    punpcklbw  m3, m2
-    SPLATW     m3, m3
-%else
-    SPLATW     m2, m2
-    SPLATW     m3, m3
-%endif
-.loop2:
-    MAIN_LOOP  %1, 1
-    jnz        .loop2
-    REP_RET
-
 %endmacro

 INIT_MMX mmx
-RV40_WEIGHT    8, 0
-RV40_WEIGHT   16, 0
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4

 INIT_XMM sse2
-RV40_WEIGHT    8, 8
-RV40_WEIGHT   16, 8
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4

 INIT_XMM ssse3
-RV40_WEIGHT    8, 8
-RV40_WEIGHT   16, 8
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

 #define DECLARE_WEIGHT(opt) \
-void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
-                                  int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
-                                  int w1, int w2, ptrdiff_t stride);
+void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                      int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                      int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                        int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                        int w1, int w2, ptrdiff_t stride);
 DECLARE_WEIGHT(mmx)
 DECLARE_WEIGHT(sse2)
 DECLARE_WEIGHT(ssse3)
@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
    }
    if (mm_flags & AV_CPU_FLAG_MMX2) {
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
    }
    if (mm_flags & AV_CPU_FLAG_SSE2) {
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
    }
    if (mm_flags & AV_CPU_FLAG_SSSE3) {
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
    }
 #endif
 }