From 688417399c69aadd4c287bdb0dec82ef8799011c Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 24 Jul 2015 18:56:54 +0200 Subject: [PATCH] hevcdsp: split the pred functions by width This should allow for more efficient SIMD. --- libavcodec/hevc.c | 118 +++++++++++++++++----------------- libavcodec/hevcdsp.c | 33 ++++++++-- libavcodec/hevcdsp.h | 36 +++++++---- libavcodec/hevcdsp_template.c | 81 +++++++++++++++++------ 4 files changed, 174 insertions(+), 94 deletions(-) diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index 63d3bc7256..699e680609 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -1725,32 +1725,32 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom, - s->sh.luma_weight_l0[current_mv.ref_idx[0]], - s->sh.luma_offset_l0[current_mv.ref_idx[0]], - dst0, s->frame->linesize[0], tmp, - tmpstride, nPbW, nPbH); + s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom, + s->sh.luma_weight_l0[current_mv.ref_idx[0]], + s->sh.luma_offset_l0[current_mv.ref_idx[0]], + dst0, s->frame->linesize[0], tmp, + tmpstride, nPbH); } else { - s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH); + s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH); } chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame, &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], - s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0], - dst1, s->frame->linesize[1], tmp, tmpstride, - nPbW / 2, nPbH / 2); - 
s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], - s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1], - dst2, s->frame->linesize[2], tmp2, tmpstride, - nPbW / 2, nPbH / 2); + s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], + s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0], + dst1, s->frame->linesize[1], tmp, tmpstride, + nPbH / 2); + s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], + s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1], + dst2, s->frame->linesize[2], tmp2, tmpstride, + nPbH / 2); } else { - s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2); - s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2); + s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp, tmpstride, nPbH / 2); + s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2); } } else if (!current_mv.pred_flag[0] && current_mv.pred_flag[1]) { DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); @@ -1761,13 +1761,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom, - s->sh.luma_weight_l1[current_mv.ref_idx[1]], - s->sh.luma_offset_l1[current_mv.ref_idx[1]], - dst0, s->frame->linesize[0], tmp, tmpstride, - nPbW, nPbH); + s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom, + s->sh.luma_weight_l1[current_mv.ref_idx[1]], + s->sh.luma_offset_l1[current_mv.ref_idx[1]], + dst0, s->frame->linesize[0], tmp, tmpstride, + nPbH); } else { - s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, 
nPbW, nPbH); + s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH); } chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame, @@ -1775,17 +1775,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], - s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0], - dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2); - s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], - s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1], - dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2); + s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], + s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0], + dst1, s->frame->linesize[1], tmp, tmpstride, nPbH/2); + s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], + s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1], + dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH/2); } else { - s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2); - s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2); + s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp, tmpstride, nPbH / 2); + s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2); } } else if (current_mv.pred_flag[0] && current_mv.pred_flag[1]) { DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); @@ -1800,16 +1800,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, if ((s->sh.slice_type == P_SLICE && 
s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred_avg(s->sh.luma_log2_weight_denom, - s->sh.luma_weight_l0[current_mv.ref_idx[0]], - s->sh.luma_weight_l1[current_mv.ref_idx[1]], - s->sh.luma_offset_l0[current_mv.ref_idx[0]], - s->sh.luma_offset_l1[current_mv.ref_idx[1]], - dst0, s->frame->linesize[0], - tmp, tmp2, tmpstride, nPbW, nPbH); + s->hevcdsp.weighted_pred_avg[pred_idx](s->sh.luma_log2_weight_denom, + s->sh.luma_weight_l0[current_mv.ref_idx[0]], + s->sh.luma_weight_l1[current_mv.ref_idx[1]], + s->sh.luma_offset_l0[current_mv.ref_idx[0]], + s->sh.luma_offset_l1[current_mv.ref_idx[1]], + dst0, s->frame->linesize[0], + tmp, tmp2, tmpstride, nPbH); } else { - s->hevcdsp.put_unweighted_pred_avg(dst0, s->frame->linesize[0], - tmp, tmp2, tmpstride, nPbW, nPbH); + s->hevcdsp.put_unweighted_pred_avg[pred_idx](dst0, s->frame->linesize[0], + tmp, tmp2, tmpstride, nPbH); } chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame, @@ -1819,23 +1819,23 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { - s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], - s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0], - s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0], - dst1, s->frame->linesize[1], tmp, tmp3, - tmpstride, nPbW / 2, nPbH / 2); - s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], - s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1], - s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1], - dst2, s->frame->linesize[2], tmp2, tmp4, - tmpstride, nPbW / 2, nPbH / 2); + 
s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], + s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], + s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0], + s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0], + dst1, s->frame->linesize[1], tmp, tmp3, + tmpstride, nPbH / 2); + s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], + s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], + s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1], + s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1], + dst2, s->frame->linesize[2], tmp2, tmp4, + tmpstride, nPbH / 2); } else { - s->hevcdsp.put_unweighted_pred_avg(dst1, s->frame->linesize[1], tmp, tmp3, tmpstride, nPbW/2, nPbH/2); - s->hevcdsp.put_unweighted_pred_avg(dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbW/2, nPbH/2); + s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst1, s->frame->linesize[1], tmp, tmp3, tmpstride, nPbH/2); + s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbH/2); } } } diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c index 7f42399dfc..67c2705318 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -128,6 +128,18 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) hevcdsp->put_hevc_epel[1][0][i] = FUNC(put_hevc_epel_v_ ## width, depth); \ hevcdsp->put_hevc_epel[1][1][i] = FUNC(put_hevc_epel_hv_ ## width, depth); \ +#define PRED_FUNC(i, width, depth) \ + hevcdsp->put_unweighted_pred[i] = FUNC(put_unweighted_pred_ ## width, depth); \ + hevcdsp->put_unweighted_pred_avg[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \ + hevcdsp->weighted_pred[i] = FUNC(put_weighted_pred_ ## width, depth); \ + hevcdsp->weighted_pred_avg[i] = FUNC(put_weighted_pred_avg_ ## width, depth); \ + +#define PRED_FUNC_CHROMA(i, width, depth) \ + hevcdsp->put_unweighted_pred_chroma[i] 
= FUNC(put_unweighted_pred_ ## width, depth); \ + hevcdsp->put_unweighted_pred_avg_chroma[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \ + hevcdsp->weighted_pred_chroma[i] = FUNC(put_weighted_pred_ ## width, depth); \ + hevcdsp->weighted_pred_avg_chroma[i] = FUNC(put_weighted_pred_avg_ ## width, depth); \ + #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \ @@ -169,11 +181,22 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_FUNC(6, 24, depth); \ EPEL_FUNC(7, 32, depth); \ \ - hevcdsp->put_unweighted_pred = FUNC(put_unweighted_pred, depth); \ - hevcdsp->put_unweighted_pred_avg = FUNC(put_unweighted_pred_avg, depth); \ - \ - hevcdsp->weighted_pred = FUNC(weighted_pred, depth); \ - hevcdsp->weighted_pred_avg = FUNC(weighted_pred_avg, depth); \ + PRED_FUNC(0, 4, depth); \ + PRED_FUNC(1, 8, depth); \ + PRED_FUNC(2, 12, depth); \ + PRED_FUNC(3, 16, depth); \ + PRED_FUNC(4, 24, depth); \ + PRED_FUNC(5, 32, depth); \ + PRED_FUNC(6, 48, depth); \ + PRED_FUNC(7, 64, depth); \ + PRED_FUNC_CHROMA(0, 2, depth); \ + PRED_FUNC_CHROMA(1, 4, depth); \ + PRED_FUNC_CHROMA(2, 6, depth); \ + PRED_FUNC_CHROMA(3, 8, depth); \ + PRED_FUNC_CHROMA(4, 12, depth); \ + PRED_FUNC_CHROMA(5, 16, depth); \ + PRED_FUNC_CHROMA(6, 24, depth); \ + PRED_FUNC_CHROMA(7, 32, depth); \ \ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index e906c5e4aa..59dd9b25ec 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -65,18 +65,30 @@ typedef struct HEVCDSPContext { ptrdiff_t srcstride, int height, int mx, int my, int16_t *mcbuffer); - void (*put_unweighted_pred)(uint8_t *dst, ptrdiff_t dststride, int16_t *src, - ptrdiff_t srcstride, int width, int height); - void (*put_unweighted_pred_avg)(uint8_t *dst, ptrdiff_t dststride, - 
int16_t *src1, int16_t *src2, - ptrdiff_t srcstride, int width, int height); - void (*weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, - uint8_t *dst, ptrdiff_t dststride, int16_t *src, - ptrdiff_t srcstride, int width, int height); - void (*weighted_pred_avg)(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag, - int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst, - ptrdiff_t dststride, int16_t *src1, int16_t *src2, - ptrdiff_t srcstride, int width, int height); + void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src, + ptrdiff_t srcstride, int height); + void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src, + ptrdiff_t srcstride, int height); + void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride, + int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, int height); + void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride, + int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, int height); + void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, + uint8_t *dst, ptrdiff_t dststride, int16_t *src, + ptrdiff_t srcstride, int height); + void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, + uint8_t *dst, ptrdiff_t dststride, int16_t *src, + ptrdiff_t srcstride, int height); + void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag, + int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst, + ptrdiff_t dststride, int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, int height); + void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag, + int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst, + ptrdiff_t dststride, int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, int height); void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index d832904dcb..723f4d4520 100644 --- 
a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -1130,9 +1130,10 @@ EPEL(6) EPEL(4) EPEL(2) -static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride, - int16_t *src, ptrdiff_t srcstride, - int width, int height) +static av_always_inline void +FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride, + int16_t *src, ptrdiff_t srcstride, + int width, int height) { int x, y; pixel *dst = (pixel *)_dst; @@ -1152,10 +1153,11 @@ static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride, } } -static void FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride, - int16_t *src1, int16_t *src2, - ptrdiff_t srcstride, - int width, int height) +static av_always_inline void +FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride, + int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, + int width, int height) { int x, y; pixel *dst = (pixel *)_dst; @@ -1177,10 +1179,11 @@ static void FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride, } } -static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, - uint8_t *_dst, ptrdiff_t _dststride, - int16_t *src, ptrdiff_t srcstride, - int width, int height) +static av_always_inline void +FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, + uint8_t *_dst, ptrdiff_t _dststride, + int16_t *src, ptrdiff_t srcstride, + int width, int height) { int shift, log2Wd, wx, ox, x, y, offset; pixel *dst = (pixel *)_dst; @@ -1205,13 +1208,14 @@ static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, } } -static void FUNC(weighted_pred_avg)(uint8_t denom, - int16_t wl0Flag, int16_t wl1Flag, - int16_t ol0Flag, int16_t ol1Flag, - uint8_t *_dst, ptrdiff_t _dststride, - int16_t *src1, int16_t *src2, - ptrdiff_t srcstride, - int width, int height) +static av_always_inline void +FUNC(weighted_pred_avg)(uint8_t denom, + int16_t wl0Flag, int16_t wl1Flag, + int16_t ol0Flag, int16_t ol1Flag, + uint8_t 
*_dst, ptrdiff_t _dststride, + int16_t *src1, int16_t *src2, + ptrdiff_t srcstride, + int width, int height) { int shift, log2Wd, w0, w1, o0, o1, x, y; pixel *dst = (pixel *)_dst; @@ -1234,6 +1238,47 @@ static void FUNC(weighted_pred_avg)(uint8_t denom, } } +#define PUT_PRED(w) \ +static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, \ + int height) \ +{ \ + FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height); \ +} \ +static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src1, int16_t *src2, \ + ptrdiff_t srcstride, int height) \ +{ \ + FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height); \ +} \ +static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset, \ + uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, int height) \ +{ \ + FUNC(weighted_pred)(denom, weight, offset, \ + dst, dststride, src, srcstride, w, height); \ +} \ +static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \ + int16_t offset0, int16_t offset1, \ + uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src1, int16_t *src2, \ + ptrdiff_t srcstride, int height) \ +{ \ + FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1, \ + dst, dststride, src1, src2, srcstride, w, height); \ +} + +PUT_PRED(64) +PUT_PRED(48) +PUT_PRED(32) +PUT_PRED(24) +PUT_PRED(16) +PUT_PRED(12) +PUT_PRED(8) +PUT_PRED(6) +PUT_PRED(4) +PUT_PRED(2) + // line zero #define P3 pix[-4 * xstride] #define P2 pix[-3 * xstride]