Split up yuv2yuvX functions

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2011-10-04 12:22:03 +01:00 · 2011-10-04 12:22:03 +01:00 · ff7913aef1
commit ff7913aef1
parent 34e8d147b3
3 changed files with 103 additions and 201 deletions
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@ -405,7 +405,7 @@ void ff_sws_init_swScale_altivec(SwsContext *c)
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 &&
        !c->alpPixBuf) {
-        c->yuv2yuvX     = yuv2yuvX_altivec_real;
+//        c->yuv2yuvX     = yuv2yuvX_altivec_real;
    }

    /* The following list of supported dstFormat values should
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@ -196,141 +196,67 @@ DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 {  64, 64, 64, 64, 64, 64, 64, 64 };

 static av_always_inline void
-yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
-                      int lumFilterSize, const int16_t *chrFilter,
-                      const int32_t **chrUSrc, const int32_t **chrVSrc,
-                      int chrFilterSize, const int32_t **alpSrc,
-                      uint16_t *dest[4], int dstW, int chrDstW,
+yuv2yuvX16_c_template(const int16_t *filter, int filterSize,
+                      const int32_t **src, uint16_t *dest, int dstW,
                      int big_endian, int output_bits)
 {
-    //FIXME Optimize (just quickly written not optimized..)
-    int i;
-    uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-    int shift = 15 + 16 - output_bits - 1;
-
 #define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uint16(val >> shift)); \
    } else { \
        AV_WL16(pos, av_clip_uint16(val >> shift)); \
    }
+
+    int i;
+    int shift = 15 + 16 - output_bits - 1;
+
    for (i = 0; i < dstW; i++) {
        int val = 1 << (30-output_bits - 1);
        int j;

-        for (j = 0; j < lumFilterSize; j++)
-            val += (lumSrc[j][i] * lumFilter[j]) >> 1;
+        for (j = 0; j < filterSize; j++)
+            val += (src[j][i] * filter[j]) >> 1;

-        output_pixel(&yDest[i], val);
-    }
-
-    if (uDest) {
-        for (i = 0; i < chrDstW; i++) {
-            int u = 1 << (30-output_bits - 1);
-            int v = 1 << (30-output_bits - 1);
-            int j;
-
-            for (j = 0; j < chrFilterSize; j++) {
-                u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
-                v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
-            }
-
-            output_pixel(&uDest[i], u);
-            output_pixel(&vDest[i], v);
-        }
-    }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest) {
-        for (i = 0; i < dstW; i++) {
-            int val = 1 << (30-output_bits - 1);
-            int j;
-
-            for (j = 0; j < lumFilterSize; j++)
-                val += (alpSrc[j][i] * lumFilter[j]) >> 1;
-
-            output_pixel(&aDest[i], val);
-        }
+        output_pixel(&dest[i], val);
    }
 #undef output_pixel
 }

-static av_always_inline void
-yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
-                      int lumFilterSize, const int16_t *chrFilter,
-                      const int16_t **chrUSrc, const int16_t **chrVSrc,
-                      int chrFilterSize, const int16_t **alpSrc,
-                      uint16_t *dest[4], int dstW, int chrDstW,
-                      int big_endian, int output_bits)
-{
-    //FIXME Optimize (just quickly written not optimized..)
-    int i;
-    uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-    int shift = 11 + 16 - output_bits;
-
 #define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    } else { \
        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    }
+
+static av_always_inline void
+yuv2yuvX10_c_template(const int16_t *filter, int filterSize,
+                      const int16_t **src, uint16_t *dest, int dstW,
+                      int big_endian, int output_bits)
+{
+    int i;
+    int shift = 11 + 16 - output_bits;
+
    for (i = 0; i < dstW; i++) {
        int val = 1 << (26-output_bits);
        int j;

-        for (j = 0; j < lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
+        for (j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];

-        output_pixel(&yDest[i], val);
+        output_pixel(&dest[i], val);
    }
-
-    if (uDest) {
-        for (i = 0; i < chrDstW; i++) {
-            int u = 1 << (26-output_bits);
-            int v = 1 << (26-output_bits);
-            int j;
-
-            for (j = 0; j < chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            output_pixel(&uDest[i], u);
-            output_pixel(&vDest[i], v);
-        }
-    }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest) {
-        for (i = 0; i < dstW; i++) {
-            int val = 1 << (26-output_bits);
-            int j;
-
-            for (j = 0; j < lumFilterSize; j++)
-                val += alpSrc[j][i] * lumFilter[j];
-
-            output_pixel(&aDest[i], val);
-        }
-    }
-#undef output_pixel
 }

+#undef output_pixel
+
 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
-static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
-                              const int16_t **_lumSrc, int lumFilterSize, \
-                              const int16_t *chrFilter, const int16_t **_chrUSrc, \
-                              const int16_t **_chrVSrc, \
-                              int chrFilterSize, const int16_t **_alpSrc, \
-                              uint8_t *_dest[4], int dstW, int chrDstW) \
+static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
+                              const int16_t **src, uint16_t *dest, int dstW, \
+                              const uint8_t *dither, int offset)\
 { \
-    const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
-                  **chrUSrc = (const typeX_t **) _chrUSrc, \
-                  **chrVSrc = (const typeX_t **) _chrVSrc, \
-                  **alpSrc  = (const typeX_t **) _alpSrc; \
-    yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
-                         chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                         alpSrc, (uint16_t **) _dest, \
-                         dstW, chrDstW, is_be, bits); \
+    yuv2yuvX_template_fn(filter, filterSize, (const typeX_t **) src, \
+                         dest, dstW, is_be, bits); \
 }
 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
@ -339,51 +265,19 @@ yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);

-static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
-                       const int16_t **lumSrc, int lumFilterSize,
-                       const int16_t *chrFilter, const int16_t **chrUSrc,
-                       const int16_t **chrVSrc,
-                       int chrFilterSize, const int16_t **alpSrc,
-                       uint8_t *dest[4], int dstW, int chrDstW)
+static void yuv2yuvX_c(const int16_t *filter, int filterSize,
+                       const int16_t **src, uint8_t *dest, int dstW,
+                       const uint8_t *dither, int offset)
 {
-    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
    int i;
-    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
-
-    //FIXME Optimize (just quickly written not optimized..)
    for (i=0; i<dstW; i++) {
-        int val = lumDither[i & 7] << 12;
+        int val = dither[(i + offset) & 7] << 12;
        int j;
-        for (j=0; j<lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
+        for (j=0; j<filterSize; j++)
+            val += src[j][i] * filter[j];

-        yDest[i]= av_clip_uint8(val>>19);
+        dest[i]= av_clip_uint8(val>>19);
    }
-
-    if (uDest)
-        for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i & 7] << 12;
-            int v = chrDither[(i + 3) & 7] << 12;
-            int j;
-            for (j=0; j<chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            uDest[i]= av_clip_uint8(u>>19);
-            vDest[i]= av_clip_uint8(v>>19);
-        }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest)
-        for (i=0; i<dstW; i++) {
-            int val = lumDither[i & 7] << 12;
-            int j;
-            for (j=0; j<lumFilterSize; j++)
-                val += alpSrc[j][i] * lumFilter[j];
-
-            aDest[i]= av_clip_uint8(val>>19);
-        }
 }

 static void yuv2yuv1_c(const int16_t *src, uint8_t *dest, int dstW,
@ -396,30 +290,13 @@ static void yuv2yuv1_c(const int16_t *src, uint8_t *dest, int dstW,
    }
 }

-static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
-                        const int16_t **lumSrc, int lumFilterSize,
-                        const int16_t *chrFilter, const int16_t **chrUSrc,
-                        const int16_t **chrVSrc, int chrFilterSize,
-                        const int16_t **alpSrc, uint8_t *dest[4],
-                        int dstW, int chrDstW)
+static void yuv2nv12X_chroma_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
+                        const int16_t **chrUSrc, const int16_t **chrVSrc,
+                        uint8_t *dest, int chrDstW)
 {
-    uint8_t *yDest = dest[0], *uDest = dest[1];
    enum PixelFormat dstFormat = c->dstFormat;
-    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
-
-    //FIXME Optimize (just quickly written not optimized..)
+    const uint8_t *chrDither = c->chrDither8;
    int i;
-    for (i=0; i<dstW; i++) {
-        int val = lumDither[i & 7] << 12;
-        int j;
-        for (j=0; j<lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
-
-        yDest[i]= av_clip_uint8(val>>19);
-    }
-
-    if (!uDest)
-        return;

    if (dstFormat == PIX_FMT_NV12)
        for (i=0; i<chrDstW; i++) {
@ -431,8 +308,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
                v += chrVSrc[j][i] * chrFilter[j];
            }

-            uDest[2*i]= av_clip_uint8(u>>19);
-            uDest[2*i+1]= av_clip_uint8(v>>19);
+            dest[2*i]= av_clip_uint8(u>>19);
+            dest[2*i+1]= av_clip_uint8(v>>19);
        }
    else
        for (i=0; i<chrDstW; i++) {
@ -444,8 +321,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
                v += chrVSrc[j][i] * chrFilter[j];
            }

-            uDest[2*i]= av_clip_uint8(v>>19);
-            uDest[2*i+1]= av_clip_uint8(u>>19);
+            dest[2*i]= av_clip_uint8(v>>19);
+            dest[2*i+1]= av_clip_uint8(u>>19);
        }
 }

@ -2117,26 +1994,29 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2

 static av_always_inline void
 find_c_packed_planar_out_funcs(SwsContext *c,
-                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
+                               yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn *yuv2planeX_luma,
+                               yuv2planarX_fn *yuv2planeX_chroma, yuv2interleavedX_fn *yuv2nv12X_chroma,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
 {
    enum PixelFormat dstFormat = c->dstFormat;

    if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-        *yuv2yuvX     = yuv2nv12X_c;
+        *yuv2planeX_luma = yuv2yuvX_c;
+        *yuv2nv12X_chroma     = yuv2nv12X_chroma_c;
    } else if (is16BPS(dstFormat)) {
-        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
+        *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
-            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
+            *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
        } else {
-            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
+            *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
        }
    } else {
        *yuv2yuv1     = yuv2yuv1_c;
-        *yuv2yuvX     = yuv2yuvX_c;
+        *yuv2planeX_luma = *yuv2planeX_chroma = yuv2yuvX_c;
    }
+
    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (dstFormat) {
            case PIX_FMT_RGBA:
@ -2396,7 +2276,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
    int lastDstY;
    uint32_t *pal=c->pal_yuv;
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
-    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
+    yuv2planarX_fn yuv2planeX_luma = c->yuv2planeX_luma;
+    yuv2planarX_fn yuv2planeX_chroma = c->yuv2planeX_chroma;
+    yuv2interleavedX_fn yuv2nv12X_chroma = c->yuv2nv12X_chroma;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;
@ -2548,8 +2430,8 @@ static int swScale(SwsContext *c, const uint8_t* src[],
        }
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
-            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
-                                           &yuv2packed1, &yuv2packed2,
+            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2planeX_luma, &yuv2planeX_chroma,
+                                           &yuv2nv12X_chroma, &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

@ -2564,7 +2446,13 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                    dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
                const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;

-                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
+                if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
+                    yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
+
+                    if (dest[1]){
+                        yuv2nv12X_chroma(c, vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
+                    }
+                } else if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    yuv2yuv1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);

                    if (dest[1]){
@ -2575,11 +2463,15 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                    if (alpBuf && dest[3])
                        yuv2yuv1(alpBuf, dest[3], dstW, c->lumDither8, 0);
                } else { //General YV12
-                    yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
-                             lumSrcPtr, vLumFilterSize,
-                             vChrFilter + chrDstY * vChrFilterSize,
-                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                             alpSrcPtr, dest, dstW, chrDstW);
+                    yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
+
+                    if (dest[1]){
+                        yuv2planeX_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
+                        yuv2planeX_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3);
+                    }
+
+                    if (alpBuf && dest[3])
+                        yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
                }
            } else {
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
@ -2633,8 +2525,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 {
    enum PixelFormat srcFormat = c->srcFormat;

-    find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
-                                   &c->yuv2packed1, &c->yuv2packed2,
+    find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2planeX_luma, &c->yuv2planeX_chroma,
+                                   &c->yuv2nv12X_chroma, &c->yuv2packed1, &c->yuv2packed2,
                                   &c->yuv2packedX);

    c->chrToYV12 = NULL;
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@ -76,33 +76,41 @@ typedef void (*yuv2planar1_fn) (const int16_t *src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset);

 /**
- * Write one line of horizontally scaled Y/U/V/A to planar output
+ * Write one line of horizontally scaled data to planar output
+ * with multi-point vertical scaling between input pixels.
+ *
+ * @param filter        vertical luma/alpha scaling coefficients, 12bit [0,4096]
+ * @param src           scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param filterSize    number of vertical input lines to scale
+ * @param dest          pointer to output plane. For >8bit
+ *                      output, this is in uint16_t
+ * @param dstW          width of destination pixels
+ * @param offset        Dither offset
+ */
+typedef void (*yuv2planarX_fn) (const int16_t *filter, int filterSize,
+                                const int16_t **src, uint8_t *dest, int dstW,
+                                const uint8_t *dither, int offset);
+
+/**
+ * Write one line of horizontally scaled chroma to interleaved output
 * with multi-point vertical scaling between input pixels.
 *
 * @param c             SWS scaling context
- * @param lumFilter     vertical luma/alpha scaling coefficients, 12bit [0,4096]
- * @param lumSrc        scaled luma (Y) source data, 15bit for 8-10bit output,
- *                      19-bit for 16bit output (in int32_t)
- * @param lumFilterSize number of vertical luma/alpha input lines to scale
 * @param chrFilter     vertical chroma scaling coefficients, 12bit [0,4096]
 * @param chrUSrc       scaled chroma (U) source data, 15bit for 8-10bit output,
 *                      19-bit for 16bit output (in int32_t)
 * @param chrVSrc       scaled chroma (V) source data, 15bit for 8-10bit output,
 *                      19-bit for 16bit output (in int32_t)
 * @param chrFilterSize number of vertical chroma input lines to scale
- * @param alpSrc        scaled alpha (A) source data, 15bit for 8-10bit output,
- *                      19-bit for 16bit output (in int32_t)
- * @param dest          pointer to the 4 output planes (Y/U/V/A). For >8bit
+ * @param dest          pointer to the output plane. For >8bit
 *                      output, this is in uint16_t
- * @param dstW          width of dest[0], dest[3], lumSrc and alpSrc in pixels
- * @param chrDstW       width of dest[1], dest[2], chrUSrc and chrVSrc
+ * @param dstW          width of chroma planes
 */
-typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter,
-                                const int16_t **lumSrc, int lumFilterSize,
-                                const int16_t *chrFilter, const int16_t **chrUSrc,
-                                const int16_t **chrVSrc,  int chrFilterSize,
-                                const int16_t **alpSrc, uint8_t *dest[4],
-                                int dstW, int chrDstW);
+typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
+                                     const int16_t **chrUSrc, const int16_t **chrVSrc,
+                                     uint8_t *dest, int dstW);
+
 /**
 * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
 * output without any additional vertical scaling (or point-scaling). Note
@ -405,7 +413,9 @@ typedef struct SwsContext {

    /* function pointers for swScale() */
    yuv2planar1_fn yuv2yuv1;
-    yuv2planarX_fn yuv2yuvX;
+    yuv2planarX_fn yuv2planeX_luma;
+    yuv2planarX_fn yuv2planeX_chroma;
+    yuv2interleavedX_fn yuv2nv12X_chroma;
    yuv2packed1_fn yuv2packed1;
    yuv2packed2_fn yuv2packed2;
    yuv2packedX_fn yuv2packedX;