2012-11-08 17:09:30 -08:00
# Prints the C preamble below (integer-types include, struct/union forward
# declarations and the DEC_MVCOSTS macro) to stdout.
vp9_common_forward_decls() {
cat <<EOF
/*
 * VP9
 */

#include "vpx/vpx_integer.h"

struct loop_filter_info;
struct blockd;
struct macroblockd;

/* Encoder forward decls */
struct block;
struct macroblock;
struct variance_vtable;

#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
union int_mv;
struct yv12_buffer_config;
EOF
}
2012-11-08 17:09:30 -08:00
forward_decls vp9_common_forward_decls
2012-08-14 18:19:09 -07:00
2012-10-30 12:09:49 -07:00
#
# Dequant
#
2012-10-30 14:51:31 -07:00
prototype void vp9_dequantize_b "struct blockd *x"
2012-12-19 11:53:43 -08:00
specialize vp9_dequantize_b
2012-10-30 12:09:49 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"
2012-10-30 14:51:31 -07:00
specialize vp9_dequant_idct_add_y_block_8x8
2012-10-30 12:09:49 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"
2012-10-30 14:51:31 -07:00
specialize vp9_dequant_idct_add_uv_block_8x8
2012-10-30 12:09:49 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
2012-10-30 14:51:31 -07:00
specialize vp9_dequant_idct_add_16x16
2012-10-30 12:09:49 -07:00
2013-02-15 10:15:42 -08:00
prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
2012-11-02 11:22:57 -07:00
specialize vp9_dequant_idct_add_8x8
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"
2012-10-30 14:51:31 -07:00
specialize vp9_dequant_idct_add
2012-10-30 12:09:49 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
2012-12-19 11:53:43 -08:00
specialize vp9_dequant_idct_add_y_block
2012-10-30 12:09:49 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
2012-12-19 11:53:43 -08:00
specialize vp9_dequant_idct_add_uv_block
2012-10-09 17:09:08 -07:00
2013-01-08 10:29:22 -08:00
prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
specialize vp9_dequant_idct_add_32x32
prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
specialize vp9_dequant_idct_add_uv_block_16x16
2012-10-09 17:09:08 -07:00
#
# RECON
#
Convert subpixel filters to use convolve framework
Update the code to call the new convolution functions to do subpixel
prediction rather than the existing functions. Remove the old C and
assembly code, since it is unused. This causes a 50% performance
reduction on the decoder, but that will be resolved when the asm for
the new functions is available.
There is no consensus for whether 6-tap or 2-tap predictors will be
supported in the final codec, so these filters are implemented in
terms of the 8-tap code, so that quality testing of these modes
can continue. Implementing the lower complexity algorithms is a
simple exercise, should it be necessary.
This code produces slightly better results in the EIGHTTAP_SMOOTH
case, since the filter is now applied in only one direction when
the subpel motion is only in one direction. Like the previous code,
the filtering is skipped entirely on full-pel MVs. This combination
seems to give the best quality gains, but this may be indicative of a
bug in the encoder's filter selection, since the encoder could
achieve the result of skipping the filtering on full-pel by selecting
one of the other filters. This should be revisited.
Quality gains on derf positive on almost all clips. The only clip
that seemed to be hurt at all datarates was football
(-0.115% PSNR average, -0.587% min). Overall averages 0.375% PSNR,
0.347% SSIM.
Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff
2013-01-28 16:59:03 -08:00
prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
2012-12-02 14:14:00 -08:00
specialize vp9_copy_mem16x16 mmx sse2 dspr2
2012-10-30 16:25:53 -07:00
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
2012-10-13 18:49:44 -07:00
Convert subpixel filters to use convolve framework
Update the code to call the new convolution functions to do subpixel
prediction rather than the existing functions. Remove the old C and
assembly code, since it is unused. This causes a 50% performance
reduction on the decoder, but that will be resolved when the asm for
the new functions is available.
There is no consensus for whether 6-tap or 2-tap predictors will be
supported in the final codec, so these filters are implemented in
terms of the 8-tap code, so that quality testing of these modes
can continue. Implementing the lower complexity algorithms is a
simple exercise, should it be necessary.
This code produces slightly better results in the EIGHTTAP_SMOOTH
case, since the filter is now applied in only one direction when
the subpel motion is only in one direction. Like the previous code,
the filtering is skipped entirely on full-pel MVs. This combination
seems to give the best quality gains, but this may be indicative of a
bug in the encoder's filter selection, since the encoder could
achieve the result of skipping the filtering on full-pel by selecting
one of the other filters. This should be revisited.
Quality gains on derf positive on almost all clips. The only clip
that seemed to be hurt at all datarates was football
(-0.115% PSNR average, -0.587% min). Overall averages 0.375% PSNR,
0.347% SSIM.
Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff
2013-01-28 16:59:03 -08:00
prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
2012-12-02 14:14:00 -08:00
specialize vp9_copy_mem8x8 mmx dspr2
2012-10-30 16:25:53 -07:00
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
2012-10-13 18:49:44 -07:00
Convert subpixel filters to use convolve framework
Update the code to call the new convolution functions to do subpixel
prediction rather than the existing functions. Remove the old C and
assembly code, since it is unused. This causes a 50% performance
reduction on the decoder, but that will be resolved when the asm for
the new functions is available.
There is no consensus for whether 6-tap or 2-tap predictors will be
supported in the final codec, so these filters are implemented in
terms of the 8-tap code, so that quality testing of these modes
can continue. Implementing the lower complexity algorithms is a
simple exercise, should it be necessary.
This code produces slightly better results in the EIGHTTAP_SMOOTH
case, since the filter is now applied in only one direction when
the subpel motion is only in one direction. Like the previous code,
the filtering is skipped entirely on full-pel MVs. This combination
seems to give the best quality gains, but this may be indicative of a
bug in the encoder's filter selection, since the encoder could
achieve the result of skipping the filtering on full-pel by selecting
one of the other filters. This should be revisited.
Quality gains on derf positive on almost all clips. The only clip
that seemed to be hurt at all datarates was football
(-0.115% PSNR average, -0.587% min). Overall averages 0.375% PSNR,
0.347% SSIM.
Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff
2013-01-28 16:59:03 -08:00
prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
2012-10-30 16:25:53 -07:00
specialize vp9_copy_mem8x4 mmx
2012-10-13 18:49:44 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
2012-10-30 16:25:53 -07:00
specialize vp9_recon_b
2012-10-13 18:49:44 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
2012-10-30 16:25:53 -07:00
specialize vp9_recon_uv_b
2012-10-13 18:49:44 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
2012-10-30 16:25:53 -07:00
specialize vp9_recon2b sse2
2012-10-13 18:49:44 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
2012-10-30 16:25:53 -07:00
specialize vp9_recon4b sse2
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_recon_mb "struct macroblockd *x"
specialize vp9_recon_mb
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_recon_mby "struct macroblockd *x"
specialize vp9_recon_mby
2012-10-13 18:49:44 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_recon_mby_s "struct macroblockd *x, uint8_t *dst"
2012-11-30 07:29:43 -08:00
specialize vp9_recon_mby_s
2012-12-18 15:31:19 -08:00
prototype void vp9_recon_mbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
2012-11-30 07:29:43 -08:00
specialize vp9_recon_mbuv_s
2013-01-08 10:29:22 -08:00
prototype void vp9_recon_sby_s "struct macroblockd *x, uint8_t *dst"
specialize vp9_recon_sby_s
prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
specialize vp9_recon_sbuv_s
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby_s
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sby_s;
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sbuv_s;
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby;
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby_s;
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
specialize vp9_build_intra_predictors_mbuv;
2012-10-13 18:49:44 -07:00
2012-10-30 16:25:53 -07:00
prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mbuv_s;
2012-10-13 18:49:44 -07:00
2013-01-08 10:29:22 -08:00
prototype void vp9_build_intra_predictors_sb64y_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sb64y_s;
prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sb64uv_s;
[WIP] Add column-based tiling.
This patch adds column-based tiling. The idea is to make each tile
independently decodable (after reading the common frame header) and
also independently encodable (minus within-frame cost adjustments in
the RD loop) to speed-up hardware & software en/decoders if they used
multi-threading. Column-based tiling has the added advantage (over
other tiling methods) that it minimizes realtime use-case latency,
since all threads can start encoding data as soon as the first SB-row
worth of data is available to the encoder.
There is some test code that does random tile ordering in the decoder,
to confirm that each tile is indeed independently decodable from other
tiles in the same frame. At tile edges, all contexts assume default
values (i.e. 0, 0 motion vector, no coefficients, DC intra4x4 mode),
and motion vector search and ordering do not cross tiles in the same
frame.
t log
Tile independence is not maintained between frames ATM, i.e. tile 0 of
frame 1 is free to use motion vectors that point into any tile of frame
0. We support 1 (i.e. no tiling), 2 or 4 column-tiles.
The loopfilter crosses tile boundaries. I discussed this briefly with Aki
and he says that's OK. An in-loop loopfilter would need to do some sync
between tile threads, but that shouldn't be a big issue.
Results: with tiling disabled, we go up slightly because of improved edge
use in the intra4x4 prediction. With 2 tiles, we lose about ~1% on derf,
~0.35% on HD and ~0.55% on STD/HD. With 4 tiles, we lose another ~1.5%
on derf ~0.77% on HD and ~0.85% on STD/HD. Most of this loss is
concentrated in the low-bitrate end of clips, and most of it is because
of the loss of edges at tile boundaries and the resulting loss of intra
predictors.
TODO:
- more tiles (perhaps allow row-based tiling also, and max. 8 tiles)?
- maybe optionally (for EC purposes), motion vectors themselves
should not cross tile edges, or we should emulate such borders as
if they were off-frame, to limit error propagation to within one
tile only. This doesn't have to be the default behaviour but could
be an optional bitstream flag.
Change-Id: I5951c3a0742a767b20bc9fb5af685d9892c2c96f
2013-02-01 09:35:28 -08:00
prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
2012-10-30 16:25:53 -07:00
specialize vp9_intra4x4_predict;
2012-10-13 18:49:44 -07:00
[WIP] Add column-based tiling.
This patch adds column-based tiling. The idea is to make each tile
independently decodable (after reading the common frame header) and
also independently encodable (minus within-frame cost adjustments in
the RD loop) to speed-up hardware & software en/decoders if they used
multi-threading. Column-based tiling has the added advantage (over
other tiling methods) that it minimizes realtime use-case latency,
since all threads can start encoding data as soon as the first SB-row
worth of data is available to the encoder.
There is some test code that does random tile ordering in the decoder,
to confirm that each tile is indeed independently decodable from other
tiles in the same frame. At tile edges, all contexts assume default
values (i.e. 0, 0 motion vector, no coefficients, DC intra4x4 mode),
and motion vector search and ordering do not cross tiles in the same
frame.
t log
Tile independence is not maintained between frames ATM, i.e. tile 0 of
frame 1 is free to use motion vectors that point into any tile of frame
0. We support 1 (i.e. no tiling), 2 or 4 column-tiles.
The loopfilter crosses tile boundaries. I discussed this briefly with Aki
and he says that's OK. An in-loop loopfilter would need to do some sync
between tile threads, but that shouldn't be a big issue.
Results: with tiling disabled, we go up slightly because of improved edge
use in the intra4x4 prediction. With 2 tiles, we lose about ~1% on derf,
~0.35% on HD and ~0.55% on STD/HD. With 4 tiles, we lose another ~1.5%
on derf ~0.77% on HD and ~0.85% on STD/HD. Most of this loss is
concentrated in the low-bitrate end of clips, and most of it is because
of the loss of edges at tile boundaries and the resulting loss of intra
predictors.
TODO:
- more tiles (perhaps allow row-based tiling also, and max. 8 tiles)?
- maybe optionally (for EC purposes), motion vectors themselves
should not cross tile edges, or we should emulate such borders as
if they were off-frame, to limit error propagation to within one
tile only. This doesn't have to be the default behaviour but could
be an optional bitstream flag.
Change-Id: I5951c3a0742a767b20bc9fb5af685d9892c2c96f
2013-02-01 09:35:28 -08:00
prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
2012-10-30 16:25:53 -07:00
specialize vp9_intra8x8_predict;
2012-10-13 18:49:44 -07:00
[WIP] Add column-based tiling.
This patch adds column-based tiling. The idea is to make each tile
independently decodable (after reading the common frame header) and
also independently encodable (minus within-frame cost adjustments in
the RD loop) to speed-up hardware & software en/decoders if they used
multi-threading. Column-based tiling has the added advantage (over
other tiling methods) that it minimizes realtime use-case latency,
since all threads can start encoding data as soon as the first SB-row
worth of data is available to the encoder.
There is some test code that does random tile ordering in the decoder,
to confirm that each tile is indeed independently decodable from other
tiles in the same frame. At tile edges, all contexts assume default
values (i.e. 0, 0 motion vector, no coefficients, DC intra4x4 mode),
and motion vector search and ordering do not cross tiles in the same
frame.
t log
Tile independence is not maintained between frames ATM, i.e. tile 0 of
frame 1 is free to use motion vectors that point into any tile of frame
0. We support 1 (i.e. no tiling), 2 or 4 column-tiles.
The loopfilter crosses tile boundaries. I discussed this briefly with Aki
and he says that's OK. An in-loop loopfilter would need to do some sync
between tile threads, but that shouldn't be a big issue.
Results: with tiling disabled, we go up slightly because of improved edge
use in the intra4x4 prediction. With 2 tiles, we lose about ~1% on derf,
~0.35% on HD and ~0.55% on STD/HD. With 4 tiles, we lose another ~1.5%
on derf ~0.77% on HD and ~0.85% on STD/HD. Most of this loss is
concentrated in the low-bitrate end of clips, and most of it is because
of the loss of edges at tile boundaries and the resulting loss of intra
predictors.
TODO:
- more tiles (perhaps allow row-based tiling also, and max. 8 tiles)?
- maybe optionally (for EC purposes), motion vectors themselves
should not cross tile edges, or we should emulate such borders as
if they were off-frame, to limit error propagation to within one
tile only. This doesn't have to be the default behaviour but could
be an optional bitstream flag.
Change-Id: I5951c3a0742a767b20bc9fb5af685d9892c2c96f
2013-02-01 09:35:28 -08:00
prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
2012-10-30 16:25:53 -07:00
specialize vp9_intra_uv4x4_predict;
2012-10-13 18:49:44 -07:00
2012-10-16 16:19:35 -07:00
#
# Loopfilter
#
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_mbv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_mbv sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_bv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_bv sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_bv8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_bv8x8 sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_mbh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_mbh sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_bh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_bh sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
2012-10-30 16:25:53 -07:00
specialize vp9_loop_filter_bh8x8 sse2
2012-10-16 16:19:35 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
2012-12-02 14:14:00 -08:00
specialize vp9_loop_filter_simple_mbv mmx sse2
2012-10-31 14:40:53 -07:00
vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
2012-12-02 14:14:00 -08:00
specialize vp9_loop_filter_simple_mbh mmx sse2
2012-10-31 14:40:53 -07:00
vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
2012-12-02 14:14:00 -08:00
specialize vp9_loop_filter_simple_bv mmx sse2
2012-10-31 14:40:53 -07:00
vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
2012-12-18 15:31:19 -08:00
prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
2012-12-02 14:14:00 -08:00
specialize vp9_loop_filter_simple_bh mmx sse2
2012-10-31 14:40:53 -07:00
vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
2012-10-16 16:19:35 -07:00
2013-01-11 09:45:45 -08:00
prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
2013-01-11 14:54:14 -08:00
specialize vp9_lpf_mbh_w sse2
2013-01-11 09:45:45 -08:00
prototype void vp9_lpf_mbv_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
2013-01-11 14:54:14 -08:00
specialize vp9_lpf_mbv_w sse2
2012-11-28 10:00:25 -08:00
#
# post proc
#
2012-12-23 07:20:10 -08:00
if [ "$CONFIG_POSTPROC" = "yes" ]; then
2012-12-18 15:31:19 -08:00
prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
2012-11-28 10:00:25 -08:00
specialize vp9_mbpost_proc_down mmx sse2
vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
2012-12-18 15:31:19 -08:00
prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit"
2012-11-28 10:00:25 -08:00
specialize vp9_mbpost_proc_across_ip sse2
vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
2012-12-18 15:31:19 -08:00
prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
2012-11-28 10:00:25 -08:00
specialize vp9_post_proc_down_and_across mmx sse2
vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
2012-12-18 15:31:19 -08:00
prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
2012-11-28 10:00:25 -08:00
specialize vp9_plane_add_noise mmx sse2
vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
2012-12-23 07:20:10 -08:00
fi
2012-11-28 10:00:25 -08:00
2012-12-18 15:31:19 -08:00
prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
2012-11-28 10:00:25 -08:00
specialize vp9_blend_mb_inner
2012-12-18 15:31:19 -08:00
prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
2012-11-28 10:00:25 -08:00
specialize vp9_blend_mb_outer
2012-12-18 15:31:19 -08:00
prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
2012-11-28 10:00:25 -08:00
specialize vp9_blend_b
2012-10-22 13:45:42 -07:00
#
# sad 16x3, 3x16
#
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad16x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
2012-10-31 14:40:53 -07:00
specialize vp9_sad16x3 sse2
2012-10-22 13:45:42 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
2012-10-31 14:40:53 -07:00
specialize vp9_sad3x16 sse2
2012-10-22 13:45:42 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, const int source_stride, const int xoffset, const int yoffset, const uint8_t *ref_ptr, const int ref_stride, unsigned int *sse"
2013-01-08 10:44:19 -08:00
specialize vp9_sub_pixel_variance16x2 sse2
2012-11-24 19:33:58 -08:00
#
# Sub Pixel Filters
#
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-07 17:00:37 -08:00
specialize vp9_convolve8 ssse3
2012-11-21 09:16:30 -08:00
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-07 17:00:37 -08:00
specialize vp9_convolve8_horiz ssse3
2012-11-21 09:16:30 -08:00
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-07 17:00:37 -08:00
specialize vp9_convolve8_vert ssse3
2012-11-21 09:16:30 -08:00
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-13 09:15:38 -08:00
specialize vp9_convolve8_avg ssse3
2012-11-21 09:16:30 -08:00
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-13 09:15:38 -08:00
specialize vp9_convolve8_avg_horiz ssse3
2012-11-21 09:16:30 -08:00
2013-01-25 09:47:09 -08:00
prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
2013-02-13 09:15:38 -08:00
specialize vp9_convolve8_avg_vert ssse3
2012-11-21 09:16:30 -08:00
2012-11-24 19:33:58 -08:00
#
# dct
#
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
2012-12-19 11:53:43 -08:00
specialize vp9_short_idct4x4llm_1
2012-11-24 19:33:58 -08:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
2012-12-19 11:53:43 -08:00
specialize vp9_short_idct4x4llm
2012-11-24 19:33:58 -08:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
2012-11-24 19:33:58 -08:00
specialize vp9_short_idct8x8
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
2012-11-24 19:33:58 -08:00
specialize vp9_short_idct10_8x8
2013-02-01 15:34:28 -08:00
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
2012-11-24 19:33:58 -08:00
specialize vp9_short_idct16x16
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
2012-11-24 19:33:58 -08:00
specialize vp9_short_idct10_16x16
2013-01-31 16:16:28 -08:00
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
2012-12-18 15:31:19 -08:00
prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
32x32 transform for superblocks.
This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds
code all over the place to wrap that in the bitstream/encoder/decoder/RD.
Some implementation notes (these probably need careful review):
- token range is extended by 1 bit, since the value range out of this
transform is [-16384,16383].
- the coefficients coming out of the FDCT are manually scaled back by
1 bit, or else they won't fit in int16_t (they are 17 bits). Because
of this, the RD error scoring does not right-shift the MSE score by
two (unlike for 4x4/8x8/16x16).
- to compensate for this loss in precision, the quantizer is halved
also. This is currently a little hacky.
- FDCT and IDCT is double-only right now. Needs a fixed-point impl.
- There are no default probabilities for the 32x32 transform yet; I'm
simply using the 16x16 luma ones. A future commit will add newly
generated probabilities for all transforms.
- No ADST version. I don't think we'll add one for this level; if an
ADST is desired, transform-size selection can scale back to 16x16
or lower, and use an ADST at that level.
Additional notes specific to Debargha's DWT/DCT hybrid:
- coefficient scale is different for the top/left 16x16 (DCT-over-DWT)
block than for the rest (DWT pixel differences) of the block. Therefore,
RD error scoring isn't easily scalable between coefficient and pixel
domain. Thus, unfortunately, we need to compute the RD distortion in
the pixel domain until we figure out how to scale these appropriately.
Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 14:45:05 -08:00
specialize vp9_short_idct32x32
2013-02-04 16:49:17 -08:00
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
2013-02-05 12:37:13 -08:00
# NOTE(review): '#if' / '#endif' are ordinary shell comments in this script, so
# the prototype/specialize calls below run regardless of CONFIG_INTHT. Confirm
# whether a shell guard such as `if [ "$CONFIG_INTHT" = "yes" ]` was intended.
#if CONFIG_INTHT
2013-02-08 16:19:42 -08:00
prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
2013-02-05 12:37:13 -08:00
specialize vp9_short_iht8x8
#endif
2013-02-08 16:19:42 -08:00
# NOTE(review): '#if' / '#endif' are ordinary shell comments in this script, so
# the prototype/specialize calls below run regardless of CONFIG_INTHT4X4. Confirm
# whether a shell guard such as `if [ "$CONFIG_INTHT4X4" = "yes" ]` was intended.
#if CONFIG_INTHT4X4
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
#endif
2013-02-16 14:08:36 -08:00
# NOTE(review): '#if' / '#endif' are ordinary shell comments in this script, so
# the prototype/specialize calls below run regardless of CONFIG_INTHT16X16. Confirm
# whether a shell guard such as `if [ "$CONFIG_INTHT16X16" = "yes" ]` was intended.
#if CONFIG_INTHT16X16
prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16
#endif
2013-01-08 10:11:26 -08:00
prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
2012-11-29 07:19:38 -08:00
specialize vp9_ihtllm
2012-11-24 19:33:58 -08:00
# dct and add
2012-12-18 15:31:19 -08:00
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
2012-11-24 19:33:58 -08:00
specialize vp9_dc_only_idct_add
# Inverse Walsh transform entry points, registered only when CONFIG_LOSSLESS is "yes".
# The variable is quoted without padding so the comparison can actually match.
if [ "$CONFIG_LOSSLESS" = "yes" ]; then
2012-12-18 15:31:19 -08:00
prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
2013-02-11 15:58:22 -08:00
specialize vp9_short_inv_walsh4x4_1_x8
2012-12-18 15:31:19 -08:00
prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
2013-02-11 15:58:22 -08:00
specialize vp9_short_inv_walsh4x4_x8
2012-12-18 15:31:19 -08:00
prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
2013-02-11 15:58:22 -08:00
specialize vp9_dc_only_inv_walsh_add
2012-11-24 19:33:58 -08:00
fi
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
2012-11-12 16:18:35 -08:00
specialize vp9_sad32x3
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
2012-11-12 16:18:35 -08:00
specialize vp9_sad3x32
2012-10-21 20:47:57 -07:00
#
# Encoder functions below this point.
#
2012-11-01 11:09:58 -07:00
# Everything up to the matching "fi" is registered only for encoder builds.
# The variable is quoted without padding so the comparison can actually match.
if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
2012-10-21 20:47:57 -07:00
# variance
# On x86_64, define the mmx_x86_64 / sse2_x86_64 shorthands. Shell assignments
# must not have spaces around '=', and "$arch" is quoted so an empty value
# does not break the test.
[ "$arch" = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance32x32
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance64x64
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance16x16 mmx sse2
# Per-variant symbol names (plain shell assignments, so no spaces around '=').
# The sse2 variant is mapped to the *_wmt symbol; the mmx mapping is an identity.
vp9_variance16x16_sse2=vp9_variance16x16_wmt
vp9_variance16x16_mmx=vp9_variance16x16_mmx
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance16x8 mmx sse2
# Per-variant symbol names (plain shell assignments, so no spaces around '=').
# The sse2 variant is mapped to the *_wmt symbol; the mmx mapping is an identity.
vp9_variance16x8_sse2=vp9_variance16x8_wmt
vp9_variance16x8_mmx=vp9_variance16x8_mmx
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance8x16 mmx sse2
# Per-variant symbol names (plain shell assignments, so no spaces around '=').
# The sse2 variant is mapped to the *_wmt symbol; the mmx mapping is an identity.
vp9_variance8x16_sse2=vp9_variance8x16_wmt
vp9_variance8x16_mmx=vp9_variance8x16_mmx
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance8x8 mmx sse2
# Per-variant symbol names (plain shell assignments, so no spaces around '=').
# The sse2 variant is mapped to the *_wmt symbol; the mmx mapping is an identity.
vp9_variance8x8_sse2=vp9_variance8x8_wmt
vp9_variance8x8_mmx=vp9_variance8x8_mmx
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance4x4 mmx sse2
# Per-variant symbol names (plain shell assignments, so no spaces around '=').
# The sse2 variant is mapped to the *_wmt symbol; the mmx mapping is an identity.
vp9_variance4x4_sse2=vp9_variance4x4_wmt
vp9_variance4x4_mmx=vp9_variance4x4_mmx
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2013-02-06 11:20:59 -08:00
specialize vp9_sub_pixel_variance64x64 sse2
2013-01-05 18:20:25 -08:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2013-02-06 11:20:59 -08:00
specialize vp9_sub_pixel_variance32x32 sse2
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_variance8x16 sse2 mmx
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
# Single mapping for the sse2 variant. A preceding assignment of the same
# variable to *_ssse3 was dead (overwritten immediately), so only the
# effective *_wmt value is kept.
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_variance8x8 sse2 mmx
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_variance4x4 sse2 mmx
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2013-02-07 10:24:46 -08:00
specialize vp9_sad64x64 sse2
2013-01-05 18:20:25 -08:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2013-02-07 10:24:46 -08:00
specialize vp9_sad32x32 sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2013-02-07 10:24:46 -08:00
specialize vp9_sad16x16 mmx sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2012-10-30 12:58:42 -07:00
specialize vp9_sad16x8 mmx sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x16 mmx sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x8 mmx sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
2013-02-07 10:24:46 -08:00
specialize vp9_sad4x4 mmx sse
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar16x16_h mmx sse2
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar16x16_v mmx sse2
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_h
prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_v
prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_hv
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar32x32_h
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar32x32_v
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_variance_halfpixvar32x32_hv
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad64x64x3
2012-12-18 15:31:19 -08:00
prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad32x32x3
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad16x16x3 sse3 ssse3
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad16x8x3 sse3 ssse3
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x16x3 sse3
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x8x3 sse3
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad4x4x3 sse3
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad64x64x8
2012-12-18 15:31:19 -08:00
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad32x32x8
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad16x16x8 sse4
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad16x8x8 sse4
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x16x8 sse4
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad8x8x8 sse4
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
2012-10-30 12:58:42 -07:00
specialize vp9_sad4x4x8 sse4
2012-10-21 20:47:57 -07:00
2013-01-05 18:20:25 -08:00
prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad64x64x4d sse2
2013-01-05 18:20:25 -08:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad32x32x4d sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad16x16x4d sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad16x8x4d sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad8x16x4d sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad8x8x4d sse2
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
2013-02-07 17:28:56 -08:00
specialize vp9_sad4x4x4d sse
2012-10-21 20:47:57 -07:00
#
# Block copy
#
# vp9_copy32xn is registered only for architectures whose name starts with "x86".
case $arch in
x86*)
2012-12-18 15:31:19 -08:00
prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
2012-10-30 12:58:42 -07:00
specialize vp9_copy32xn sse2 sse3
2012-10-21 20:47:57 -07:00
# A case arm is terminated by ';;' written as one token.
;;
esac
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_mse16x16 sse2 mmx
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_mse16x16 mmx sse2
# Map the sse2 variant to the *_wmt symbol (shell assignment: no spaces around '=').
vp9_mse16x16_sse2=vp9_mse16x16_wmt
2012-10-21 20:47:57 -07:00
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2013-01-05 18:20:25 -08:00
specialize vp9_sub_pixel_mse64x64
2013-02-01 16:45:54 -08:00
prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
2012-10-30 12:58:42 -07:00
specialize vp9_sub_pixel_mse32x32
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype unsigned int vp9_get_mb_ss "const int16_t *"
2012-10-30 12:58:42 -07:00
specialize vp9_get_mb_ss mmx sse2
2012-10-28 10:38:23 -07:00
# ENCODEMB INVOKE
2013-02-15 10:15:42 -08:00
prototype int vp9_mbblock_error "struct macroblock *mb"
2012-10-30 12:58:42 -07:00
specialize vp9_mbblock_error mmx sse2
# Map the sse2 variant to the *_xmm symbol (shell assignment: no spaces around '=').
vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
2012-10-28 10:38:23 -07:00
2012-12-18 15:31:19 -08:00
prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
2012-10-30 12:58:42 -07:00
specialize vp9_block_error mmx sse2
# Map the sse2 variant to the *_xmm symbol (shell assignment: no spaces around '=').
vp9_block_error_sse2=vp9_block_error_xmm
2012-10-28 10:38:23 -07:00
2012-10-30 12:58:42 -07:00
prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
specialize vp9_subtract_b mmx sse2
2012-10-28 10:38:23 -07:00
2012-10-30 12:58:42 -07:00
prototype int vp9_mbuverror "struct macroblock *mb"
specialize vp9_mbuverror mmx sse2
# Map the sse2 variant to the *_xmm symbol (shell assignment: no spaces around '=').
vp9_mbuverror_sse2=vp9_mbuverror_xmm
2012-10-28 10:38:23 -07:00
2012-10-30 12:58:42 -07:00
# NOTE(review): this repeats, word for word, the vp9_subtract_b
# prototype/specialize pair that appears a few lines earlier in this section;
# it looks redundant — confirm and drop one copy.
prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
specialize vp9_subtract_b mmx sse2
2012-10-28 10:38:23 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_subtract_mby "int16_t *diff, uint8_t *src, uint8_t *pred, int stride"
2012-10-30 12:58:42 -07:00
specialize vp9_subtract_mby mmx sse2
2012-10-28 10:38:23 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_subtract_mbuv "int16_t *diff, uint8_t *usrc, uint8_t *vsrc, uint8_t *pred, int stride"
2012-10-30 12:58:42 -07:00
specialize vp9_subtract_mbuv mmx sse2
2012-10-21 20:47:57 -07:00
#
# Structured Similarity (SSIM)
#
# SSIM helpers, registered only when CONFIG_INTERNAL_STATS is "yes".
if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
# sse2_on_x86_64 is set only on x86_64; elsewhere the unquoted expansion below
# contributes no extra argument to specialize (assuming it is otherwise unset).
[ "$arch" = "x86_64" ] && sse2_on_x86_64=sse2
2012-12-18 15:31:19 -08:00
prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
2012-10-30 12:58:42 -07:00
specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
2012-10-21 20:47:57 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
2012-10-30 12:58:42 -07:00
specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
2012-10-21 20:47:57 -07:00
fi
2012-10-29 11:25:56 -07:00
# fdct functions
2012-12-18 15:31:19 -08:00
prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim"
2012-10-31 09:38:51 -07:00
specialize vp9_fht
2012-12-18 15:31:19 -08:00
prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_fdct8x8
2012-10-29 11:25:56 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_fdct4x4
2012-10-29 11:25:56 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_fdct8x4
2012-10-29 11:25:56 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
32x32 transform for superblocks.
This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds
code all over the place to wrap that in the bitstream/encoder/decoder/RD.
Some implementation notes (these probably need careful review):
- token range is extended by 1 bit, since the value range out of this
transform is [-16384,16383].
- the coefficients coming out of the FDCT are manually scaled back by
1 bit, or else they won't fit in int16_t (they are 17 bits). Because
of this, the RD error scoring does not right-shift the MSE score by
two (unlike for 4x4/8x8/16x16).
- to compensate for this loss in precision, the quantizer is halved
also. This is currently a little hacky.
- FDCT and IDCT are double-only right now; a fixed-point implementation is needed.
- There are no default probabilities for the 32x32 transform yet; I'm
simply using the 16x16 luma ones. A future commit will add newly
generated probabilities for all transforms.
- No ADST version. I don't think we'll add one for this level; if an
ADST is desired, transform-size selection can scale back to 16x16
or lower, and use an ADST at that level.
Additional notes specific to Debargha's DWT/DCT hybrid:
- coefficient scale is different for the top/left 16x16 (DCT-over-DWT)
block than for the rest (DWT pixel differences) of the block. Therefore,
RD error scoring isn't easily scalable between coefficient and pixel
domain. Thus, unfortunately, we need to compute the RD distortion in
the pixel domain until we figure out how to scale these appropriately.
Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 14:45:05 -08:00
specialize vp9_short_fdct32x32
2012-12-18 15:31:19 -08:00
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_fdct16x16
2012-10-29 11:25:56 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_walsh4x4_x8
2012-10-29 11:25:56 -07:00
2012-12-18 15:31:19 -08:00
prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
2012-10-30 12:58:42 -07:00
specialize vp9_short_walsh8x4_x8
2012-10-29 11:25:56 -07:00
2012-11-05 16:58:03 -08:00
#
# Motion search
#
# Motion-search entry points. Each specialized variant is mapped to a
# differently named symbol (*_sadx3 / *_sadx4 / *_sadx8) by a plain shell
# assignment, which must have no spaces around '='.
prototype int vp9_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
specialize vp9_full_search_sad sse3 sse4_1
vp9_full_search_sad_sse3=vp9_full_search_sadx3
vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
prototype int vp9_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
specialize vp9_refining_search_sad sse3
vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
2012-11-17 06:11:01 -08:00
specialize vp9_diamond_search_sad sse3
2012-11-05 16:58:03 -08:00
vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
2012-12-18 15:31:19 -08:00
prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
2012-11-17 06:11:01 -08:00
specialize vp9_temporal_filter_apply sse2
2012-11-18 12:33:18 -08:00
prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
2012-12-02 14:14:00 -08:00
specialize vp9_yv12_copy_partial_frame
2012-11-18 12:33:18 -08:00
2012-11-17 06:11:01 -08:00
2012-10-21 20:47:57 -07:00
fi
# end encoder functions