Added sse2 acceleration for highbitdepth variance

This is a combination of:
  4a19fa6 Added sse2 acceleration for highbitdepth variance
  c6f5d3b Fix high bit depth assembly function bugs

Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f
Authored by Peter de Rivaz on 2014-10-16 14:00:54 +01:00;
committed by Yaowu Xu.
parent 41973e0e3e
commit 2189a51891
5 changed files with 2092 additions and 108 deletions

View File

@@ -1557,34 +1557,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/;
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/;
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/;
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/;
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/;
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/;
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/;
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/;
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/;
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/;
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1596,40 +1596,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/;
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/;
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/;
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/;
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/;
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/;
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/;
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/;
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/;
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/;
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/;
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/;
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1641,40 +1641,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/;
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/;
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/;
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/;
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/;
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/;
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/;
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/;
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/;
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/;
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/;
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/;
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1686,76 +1686,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/;
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/;
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1770,70 +1770,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1848,70 +1848,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -2091,7 +2091,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -2100,10 +2100,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/;
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/;
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -2112,10 +2112,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/;
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/;
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -2124,7 +2124,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/;
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
if (vpx_config("CONFIG_WEDGE_PARTITION") eq "yes") {
add_proto qw/unsigned int vp9_highbd_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,313 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
; Over a 16x16 block of 16-bit (high bit depth) samples, computes
; *SSE = sum of squared differences and *Sum = sum of signed differences.
; The pointers are really uint16_t* despite the prototype above; strides
; are given in samples and doubled below to get byte strides.
; arg()/SHADOW_ARGS_TO_STACK come from x86_abi_support.asm and hide the
; SysV/Win64 calling-convention differences.
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx ; rbx (callee-saved) is used as a scratch base for prefetching
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
; warm the first four rows (32 bytes each) of source and reference
prefetcht0 [rsi]
prefetcht0 [rsi+16]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax+16]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
prefetcht0 [rdi]
prefetcht0 [rdi+16]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16 ; 16 rows; each loop iteration consumes two rows
.var16loop:
; load row 0, pixels 0-7, of source and reference
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
; prefetch two rows ahead of the pair being processed
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm5, xmm5 ; xmm5 = eight 16-bit diff sums for this row pair
psubw xmm1, xmm2 ; diff, row 0 pixels 0-7
movdqu xmm3, XMMWORD PTR [rsi+16] ; row 0 pixels 8-15
paddw xmm5, xmm1
pmaddwd xmm1, xmm1 ; square diffs, pair-add to four dwords
movdqu xmm2, XMMWORD PTR [rdi+16]
paddd xmm6, xmm1 ; accumulate sse
psubw xmm3, xmm2 ; diff, row 0 pixels 8-15
movdqu xmm1, XMMWORD PTR [rsi+rax] ; row 1 pixels 0-7
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm3
psubw xmm1, xmm2 ; diff, row 1 pixels 0-7
movdqu xmm3, XMMWORD PTR [rsi+rax+16] ; row 1 pixels 8-15
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
paddd xmm6, xmm1
psubw xmm3, xmm2 ; diff, row 1 pixels 8-15
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
; sign-extend the eight 16-bit sums in xmm5 to 32 bits and add into xmm7:
; build xmm1 = 0xFFFF where xmm5 < 0, i.e. NOT ((xmm5 > 0) OR (xmm5 == 0))
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0 ; invert the >= 0 mask to get the sign mask
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1 ; low four words -> sign-extended dwords
punpckhwd xmm2, xmm1 ; high four words -> sign-extended dwords
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax] ; advance source by two rows
lea rdi, [rdi + 2*rdx] ; advance reference by two rows
sub rcx, 2
jnz .var16loop
; horizontal reduction: fold the four dwords of xmm6 (SSE) and xmm7 (Sum)
; down to a single dword each
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
; store the low-dword results to the output pointers
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
; Over an 8x8 block of 16-bit (high bit depth) samples, computes
; *SSE = sum of squared differences and *Sum = sum of signed differences.
; One 8-pixel row fits a single xmm register, so no +16 column loads are
; needed here, unlike the 16x16 kernel above.
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx ; rbx (callee-saved) is used as a scratch base for prefetching
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
; warm the first four rows of source and reference
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 8 ; 8 rows; each loop iteration consumes four rows
.var8loop:
; rows 0-1
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
; prefetch four rows ahead of the group being processed
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rbx+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
lea rbx, [rbx+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm5, xmm5 ; xmm5 = eight 16-bit diff sums for this row group
psubw xmm1, xmm2 ; diff, row 0
movdqu xmm3, XMMWORD PTR [rsi+rax] ; row 1
paddw xmm5, xmm1
pmaddwd xmm1, xmm1 ; square diffs, pair-add to four dwords
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1 ; accumulate sse
lea rsi, [rsi + 2*rax] ; advance to rows 2-3 mid-iteration
lea rdi, [rdi + 2*rdx]
psubw xmm3, xmm2 ; diff, row 1
movdqu xmm1, XMMWORD PTR [rsi] ; row 2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi]
paddd xmm6, xmm3
psubw xmm1, xmm2 ; diff, row 2
movdqu xmm3, XMMWORD PTR [rsi+rax] ; row 3
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
psubw xmm3, xmm2 ; diff, row 3
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
; sign-extend the eight 16-bit sums in xmm5 to 32 bits and add into xmm7:
; build xmm1 = 0xFFFF where xmm5 < 0, i.e. NOT ((xmm5 > 0) OR (xmm5 == 0))
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0 ; invert the >= 0 mask to get the sign mask
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1 ; low four words -> sign-extended dwords
punpckhwd xmm2, xmm1 ; high four words -> sign-extended dwords
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax] ; advance past rows 2-3 (four rows per iteration)
lea rdi, [rdi + 2*rdx]
sub rcx, 4
jnz .var8loop
; horizontal reduction: fold the four dwords of xmm6 (SSE) and xmm7 (Sum)
; down to a single dword each
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
; store the low-dword results to the output pointers
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View File

@@ -0,0 +1,613 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
/* Signature shared by the assembly block-variance kernels declared below:
 * each writes the block's sum of squared differences to *sse and its sum of
 * (signed) differences to *sum, for 16-bit (high bit depth) pixel buffers. */
typedef unsigned int (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
/* Implemented in encoder/x86/vp9_highbd_variance_impl_sse2.asm. */
unsigned int vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
/* Accumulate variance statistics over a w x h region by tiling it with
 * block_size x block_size sub-blocks and calling var_fn on each tile.
 * Per-tile SSE and diff-sum are totalled into *sse and *sum (raw, no
 * bit-depth rescaling). */
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse, int *sum,
                                 high_variance_fn_t var_fn, int block_size) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int row, col;
  for (row = 0; row < h; row += block_size) {
    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;
      var_fn(src + row * src_stride + col, src_stride,
             ref + row * ref_stride + col, ref_stride,
             &tile_sse, &tile_sum);
      total_sse += tile_sse;
      total_sum += tile_sum;
    }
  }
  *sse = total_sse;
  *sum = total_sum;
}
/* Tiled variance accumulation for 10-bit input.  Per-tile results are
 * summed in 64-bit temporaries to avoid overflow, then rounded back to
 * the 8-bit-equivalent range: sum >> 2, sse >> 4 (with rounding). */
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  uint64_t total_sse = 0;
  int64_t total_sum = 0;
  int row, col;
  for (row = 0; row < h; row += block_size) {
    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;
      var_fn(src + row * src_stride + col, src_stride,
             ref + row * ref_stride + col, ref_stride,
             &tile_sse, &tile_sum);
      total_sse += tile_sse;
      total_sum += tile_sum;
    }
  }
  *sum = ROUND_POWER_OF_TWO(total_sum, 2);
  *sse = ROUND_POWER_OF_TWO(total_sse, 4);
}
/* Tiled variance accumulation for 12-bit input.  Per-tile results are
 * summed in 64-bit temporaries to avoid overflow, then rounded back to
 * the 8-bit-equivalent range: sum >> 4, sse >> 8 (with rounding). */
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  uint64_t total_sse = 0;
  int64_t total_sum = 0;
  int row, col;
  for (row = 0; row < h; row += block_size) {
    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;
      var_fn(src + row * src_stride + col, src_stride,
             ref + row * ref_stride + col, ref_stride,
             &tile_sse, &tile_sum);
      total_sse += tile_sse;
      total_sum += tile_sum;
    }
  }
  *sum = ROUND_POWER_OF_TWO(total_sum, 4);
  *sse = ROUND_POWER_OF_TWO(total_sse, 8);
}
/* HIGH_GET_VAR(S): emit the three get-var entry points (8-, 10-, 12-bit)
 * for an SxS block.  Each converts the uint8_t* RTCD interface back to
 * uint16_t* via CONVERT_TO_SHORTPTR and calls the assembly kernel; the
 * 10-/12-bit variants rescale the raw results to the 8-bit-equivalent
 * range with rounding (sum >> 2 / sse >> 4, and sum >> 4 / sse >> 8). */
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
/* Trailing semicolons on these invocations are redundant at file scope
 * but harmless. */
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
/* VAR_FN(w, h, block_size, shift): emit the 8-, 10- and 12-bit variance
 * functions for a w x h partition, tiled by block_size (8 or 16) blocks.
 * shift == log2(w * h); variance = SSE - sum^2 / (w*h).  Bit-depth
 * rescaling is done inside the highbd_{10,12}_variance_sse2 helpers. */
#define VAR_FN(w, h, block_size, shift) \
unsigned int vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
/* Instantiations: (w, h, tile block size, log2(w*h)). */
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
/* Raw SSE ("MSE") over a 16x16 high-bit-depth block; the diff sum
 * produced by the helper is discarded. */
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse,
                       &unused_sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}
/* Raw SSE over a 16x16 block of 10-bit samples, rescaled to the
 * 8-bit-equivalent range by the helper; the diff sum is discarded. */
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse,
                          &unused_sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}
/* Raw SSE over a 16x16 block of 12-bit samples, rescaled to the
 * 8-bit-equivalent range by the helper; the diff sum is discarded. */
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse,
                          &unused_sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}
/* Raw SSE ("MSE") over an 8x8 high-bit-depth block; the diff sum
 * produced by the helper is discarded. */
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                    const uint8_t *ref8, int ref_stride,
                                    unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse,
                       &unused_sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}
/* Raw SSE over an 8x8 block of 10-bit samples, rescaled to the
 * 8-bit-equivalent range by the helper; the diff sum is discarded. */
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse,
                          &unused_sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}
/* Raw SSE over an 8x8 block of 12-bit samples, rescaled to the
 * 8-bit-equivalent range by the helper; the diff sum is discarded. */
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int unused_sum;
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse,
                          &unused_sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}
/* Prototypes for the assembly sub-pixel variance kernels (8- and 16-wide
 * columns, variable height); pointers are uint16_t* (high bit depth).
 * NOTE(review): the opt2 parameter of DECLS is currently unused. */
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
/* FN(w, h, wf, wlog2, hlog2, opt, cast): emit the 8-, 10- and 12-bit
 * sub-pixel variance wrappers for a w x h partition.  The assembly kernel
 * handles a wf-wide column strip; wider partitions are covered by calling
 * it again at column offsets 16/32/48.  wlog2 + hlog2 == log2(w*h) is
 * used only as the normalisation shift.  The 12-bit variant additionally
 * processes the block in strips of at most 16 rows, accumulating SSE in a
 * uint64_t so the 32-bit per-call accumulator cannot overflow. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int \
vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
/* Instantiate the sub-pixel variance wrappers for every partition size:
 * (w, h, kernel strip width wf, log2(w), log2(h), opt, widening cast).
 * NOTE(review): the opt2 parameter is currently unused. */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
/* Prototypes for the assembly averaging (second-prediction) sub-pixel
 * variance kernels; pointers are uint16_t* (high bit depth). */
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
/* FN(w, h, wf, wlog2, hlog2, opt, cast): averaging variant of the
 * sub-pixel variance wrappers (prediction averaged with sec8 before the
 * difference).  Structure mirrors the non-avg FN above: wf-wide column
 * strips at offsets 0/16/32/48; the second predictor is passed with
 * stride w.  The 12-bit variant works in strips of at most 16 rows with
 * a uint64_t SSE accumulator to avoid 32-bit overflow. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
/* Instantiate the averaging sub-pixel variance wrappers for every
 * partition size: (w, h, kernel strip width wf, log2(w), log2(h), opt,
 * widening cast).  Fix: the 8x16 entry previously passed the log2
 * arguments swapped (4, 3 instead of 3, 4).  That was harmless because
 * FN only uses their sum (wlog2 + hlog2 == 7 either way), but it was
 * inconsistent with the non-avg table above and a latent bug should the
 * macro ever use the two values individually. */
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN

View File

@@ -116,6 +116,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
endif
ifeq ($(CONFIG_USE_X86INC),yes)
@@ -127,6 +128,8 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif