ffmpeg/libavcodec/ppc/int_altivec.c

/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 ** @file
 ** integer misc ops.
 **/

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavcodec/dsputil.h"

#include "dsputil_altivec.h"

#include "types_altivec.h"

/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * Complete groups of 16 elements are processed with AltiVec; the remaining
 * (size % 16) elements are processed by a scalar loop that accumulates into
 * the same 32-bit score.
 *
 * @param pix1 int8_t input; loaded with an unaligned-load sequence
 * @param pix2 int16_t input; loaded with an unaligned-load sequence
 * @param size number of elements to compare; a value <= 0 returns 0
 * @return     the accumulated sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    /* The union gives scalar access to the lanes of the vector accumulator. */
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);

    /* XXX lazy way, fix it later.
     * Unaligned 16-byte load: fetch the aligned quadwords holding b and
     * b + 15, then merge them with the permute mask from vec_lvsl().
     * The macro is an expression (no trailing semicolon) and is undefined
     * again once the vector loop is done. */
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, (b)), vec_ld(15, (b)), vec_lvsl(0, (b)))

    /* "> 0" rather than "!= 0": a negative size must not start the loop. */
    for (size16 = size >> 4; size16 > 0; size16--) {
        /* score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]) for 16 elements */

        /* load 16 bytes of pix1 and the first 8 shorts of pix2 */
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        /* sign-extend the high half of pix1 to 16 bit and subtract */
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        /* load the next 8 shorts of pix2 */
        vpix2 = vec_unaligned_load(pix2);
        /* multiply-sum of vdiff with itself accumulates the squares */
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
    }
#undef vec_unaligned_load

    /* Fold the four partial sums; the total is read back from score[3]. */
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    /* Scalar tail: pix1/pix2 already point past the vectorised part. */
    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}

/**
 * Shifted scalar product of two int16_t vectors.
 *
 * Eight elements are consumed per iteration. vec_msum() yields four 32-bit
 * lanes, each the sum of two adjacent products; every lane is shifted right
 * by @p shift before being added to the running total.
 *
 * @param v1    first input; read with an unaligned-load sequence
 * @param v2    second input; read with a plain vec_ld(), so it is expected
 *              to be 16-byte aligned
 * @param order number of elements; the loop advances in steps of 8
 * @param shift right shift applied to the partial sums; only bits 0-4 are
 *              used
 * @return      the accumulated sum, taken from element 3 of the vector total
 */
static int32_t scalarproduct_int16_altivec(int16_t *v1, const int16_t *v2,
                                           int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    /* Build a splat of the run-time shift count bit by bit, since
     * vec_splat_u32() only accepts a small literal; 0x10 is made as 8 << 1. */
    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        /* Unaligned load through explicit vec_ld() calls instead of
         * dereferencing a vector pointer: the compiler may turn a vector
         * pointer dereference into a different load instruction whose
         * behaviour on unaligned addresses does not match lvx. Using offset
         * 15 (not 16) also avoids touching the next quadword when v1 is
         * already aligned. */
        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        /* Arithmetic shift: the partial sums are signed, a logical shift
         * would turn negative sums into large positive values. */
        t = vec_sra(t, shifts);
        /* Adds the four lanes of t and element 3 of res into element 3. */
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    /* Broadcast the total so vec_ste() stores it regardless of where ires
     * sits within its quadword. */
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}

/**
 * Scalar product of v1 and v2 combined with an in-place multiply-add
 * v1[i] += mul * v3[i].
 *
 * The product uses the values of v1 from before the update. Sixteen elements
 * are processed per iteration.
 *
 * @param v1    read and written through a vec_s16 pointer, so it is expected
 *              to be 16-byte aligned
 * @param v2    second product operand; read with vec_ld() + vec_perm()
 * @param v3    multiply-add operand; read with vec_ld() + vec_perm()
 * @param order number of elements; order / 16 iterations are executed, and
 *              nothing is done when order < 16
 * @param mul   multiplier applied to v3
 * @return      the accumulated product sum, taken from element 3 of the
 *              vec_sums() result
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1, i4;
    /* First aligned quadword of each stream; later iterations reuse the
     * last quadword loaded by the previous one. */
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    /* One permute mask per stream, so v2 and v3 may be aligned differently. */
    register vec_u8 align2 = vec_lvsl(0, v2);
    register vec_u8 align3 = vec_lvsl(0, v3);
    int32_t ires;

    /* Guarded loop: with order < 16 the count is 0 and the body is skipped
     * instead of wrapping around. */
    for (order >>= 4; order > 0; order--) {
        /* 16 elements of v2 assembled from three aligned quadwords */
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align2);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align2);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        /* 16 elements of v3, same scheme */
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align3);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        /* Advance by the 16 elements (32 bytes) consumed above, so the
         * carried i2/i3 are the quadwords at offset 0 of the new position. */
        v2  += 16;
        v3  += 16;
    }
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}

/**
 * Point the integer helper entries of a DSPContext at the AltiVec
 * implementations defined in this file.
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx not referenced by this function
 */
void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* The three assignments are independent of each other. */
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;
    c->ssd_int8_vs_int16            = ssd_int8_vs_int16_altivec;
}
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`/*`
			`* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>`
			`*`
Replace FFmpeg with Libav in licence headers Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-03-18 18:35:10 +01:00			`* This file is part of Libav.`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`*`
Replace FFmpeg with Libav in licence headers Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-03-18 18:35:10 +01:00			`* Libav is free software; you can redistribute it and/or`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
Replace FFmpeg with Libav in licence headers Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-03-18 18:35:10 +01:00			`* Libav is distributed in the hope that it will be useful,`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Replace FFmpeg with Libav in licence headers Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-03-18 18:35:10 +01:00			`* License along with Libav; if not, write to the Free Software`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`/**`
Remove explicit filename from Doxygen @file commands. Passing an explicit filename to this command is only necessary if the documentation in the @file block refers to a file different from the one the block resides in. Originally committed as revision 22921 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-20 16:45:34 +02:00			`** @file`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`** integer misc ops.`
			`**/`

Remove unnecessary gcc_fixes.h #include. Originally committed as revision 18384 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-04-09 13:54:13 +02:00			`#include "config.h"`
			`#if HAVE_ALTIVEC_H`
			`#include <altivec.h>`
			`#endif`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00
Remove unnecessary gcc_fixes.h #include. Originally committed as revision 18384 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-04-09 13:54:13 +02:00			`#include "libavcodec/dsputil.h"`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00
			`#include "dsputil_altivec.h"`

Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 09:33:09 +02:00			`#include "types_altivec.h"`

make arguments to ssd_int8_vs_int16() const Originally committed as revision 9548 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-07-09 01:15:00 +02:00			`static int ssd_int8_vs_int16_altivec(const int8_t pix1, const int16_t pix2,`
			`int size) {`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`int i, size16;`
			`vector signed char vpix1;`
			`vector signed short vpix2, vdiff, vpix1l,vpix1h;`
			`union { vector signed int vscore;`
			`int32_t score[4];`
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 20:58:30 +02:00			`} u;`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`u.vscore = vec_splat_s32(0);`
			`//`
			`//XXX lazy way, fix it later`

			`#define vec_unaligned_load(b) \`
			`vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));`

			`size16 = size >> 4;`
			`while(size16) {`
			`// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);`
			`//load pix1 and the first batch of pix2`

			`vpix1 = vec_unaligned_load(pix1);`
			`vpix2 = vec_unaligned_load(pix2);`
			`pix2 += 8;`
			`//unpack`
			`vpix1h = vec_unpackh(vpix1);`
			`vdiff = vec_sub(vpix1h, vpix2);`
			`vpix1l = vec_unpackl(vpix1);`
			`// load another batch from pix2`
			`vpix2 = vec_unaligned_load(pix2);`
			`u.vscore = vec_msum(vdiff, vdiff, u.vscore);`
			`vdiff = vec_sub(vpix1l, vpix2);`
			`u.vscore = vec_msum(vdiff, vdiff, u.vscore);`
			`pix1 += 16;`
			`pix2 += 8;`
			`size16--;`
			`}`
			`u.vscore = vec_sums(u.vscore, vec_splat_s32(0));`

			`size %= 16;`
			`for (i = 0; i < size; i++) {`
			`u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);`
			`}`
			`return u.score[3];`
			`}`

ppc: Add/remove a number of const qualifiers to fix related warnings. 2012-04-02 19:03:30 +02:00			`static int32_t scalarproduct_int16_altivec(int16_t v1, const int16_t v2,`
			`int order, const int shift)`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 09:33:09 +02:00			`{`
			`int i;`
			`LOAD_ZERO;`
Cleanup _t types in libavcodec/ppc Originally committed as revision 16357 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-12-27 12:21:28 +01:00			`register vec_s16 vec1, *pv;`
			`register vec_s32 res = vec_splat_s32(0), t;`
			`register vec_u32 shifts;`
PPC: remove unnecessary alignment on local variables Storing a single element from a vector where all elements have the same value does not require an aligned destination. Which element is stored depends on the alignment of the destination address, but since they all have the same value, the result is the same regardless of the alignment. Originally committed as revision 19696 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-08-24 23:42:22 +02:00			`int32_t ires;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 09:33:09 +02:00
			`shifts = zero_u32v;`
			`if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));`
			`if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));`
			`if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));`
			`if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));`
			`if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));`

			`for(i = 0; i < order; i += 8){`
Cleanup _t types in libavcodec/ppc Originally committed as revision 16357 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-12-27 12:21:28 +01:00			`pv = (vec_s16*)v1;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 09:33:09 +02:00			`vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));`
			`t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);`
			`t = vec_sr(t, shifts);`
			`res = vec_sums(t, res);`
			`v1 += 8;`
			`v2 += 8;`
			`}`
			`res = vec_splat(res, 3);`
			`vec_ste(res, 0, &ires);`
			`return ires;`
			`}`

Add const to some pointer parameters. Patch by Eli Friedman, eli D friedman A gmail Originally committed as revision 23826 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-06-27 17:11:38 +02:00			`static int32_t scalarproduct_and_madd_int16_altivec(int16_t v1, const int16_t v2, const int16_t *v3, int order, int mul)`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`{`
			`LOAD_ZERO;`
			`vec_s16 pv1 = (vec_s16)v1;`
			`register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 03:46:14 +02:00			`register vec_s16 t0, t1, i0, i1, i4;`
			`register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`register vec_s32 res = zero_s32v;`
			`register vec_u8 align = vec_lvsl(0, v2);`
			`int32_t ires;`
			`order >>= 4;`
			`do {`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 03:46:14 +02:00			`i1 = vec_ld(16, v2);`
			`t0 = vec_perm(i2, i1, align);`
			`i2 = vec_ld(32, v2);`
			`t1 = vec_perm(i1, i2, align);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`i0 = pv1[0];`
			`i1 = pv1[1];`
			`res = vec_msum(t0, i0, res);`
			`res = vec_msum(t1, i1, res);`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 03:46:14 +02:00			`i4 = vec_ld(16, v3);`
			`t0 = vec_perm(i3, i4, align);`
			`i3 = vec_ld(32, v3);`
			`t1 = vec_perm(i4, i3, align);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`pv1[0] = vec_mladd(t0, muls, i0);`
			`pv1[1] = vec_mladd(t1, muls, i1);`
			`pv1 += 2;`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 03:46:14 +02:00			`v2 += 8;`
			`v3 += 8;`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`} while(--order);`
			`res = vec_splat(vec_sums(res, zero_s32v), 3);`
			`vec_ste(res, 0, &ires);`
			`return ires;`
			`}`

ppc: Add ff_ prefix to nonstatic symbols Signed-off-by: Martin Storsjö <martin@martin.st> 2012-02-15 13:42:56 +01:00			`void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx)`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`{`
			`c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 09:33:09 +02:00			`c->scalarproduct_int16 = scalarproduct_int16_altivec;`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 16:09:10 +01:00			`c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 11:47:37 +02:00			`}`