Clean up and speed up CLPF clipping
* Move clipping tests from inside to outside the loops
* Let the sizex and sizey passed to clpf_block() be the clipped block
  size rather than both just bs
* Make the fallback tests to C more accurate

Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
committed by Yaowu Xu
parent 6116141c23
commit e66fc87c46
@@ -153,8 +153,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+            int sizex, sizey;
             xpos = xoff + n * bs;
             ypos = yoff + m * bs;
+            sizex = AOMMIN(width - xpos, bs);
+            sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
                      ->mbmi.skip) {  // Not skip block
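The effect of this hunk can be sketched in isolation: the frame-boundary
clipping is now computed once per filter block instead of being re-tested
inside the copy and filter loops. A minimal standalone sketch, not the
library code (width, height and bs are hypothetical dimensions; AOMMIN is
assumed to be aom's usual two-argument minimum macro):

#include <stdio.h>

#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  const int width = 21, height = 13, bs = 8; /* hypothetical frame/block */
  int xpos, ypos;
  for (ypos = 0; ypos < height; ypos += bs) {
    for (xpos = 0; xpos < width; xpos += bs) {
      /* Clip once per block; the inner pixel loops can then run over
         sizex * sizey without any per-iteration boundary tests. */
      const int sizex = AOMMIN(width - xpos, bs);
      const int sizey = AOMMIN(height - ypos, bs);
      printf("block (%d,%d): %dx%d\n", xpos, ypos, sizex, sizey);
    }
  }
  return 0;
}

With a 21x13 frame this prints 8x8 blocks in the interior and 5x8, 8x5 and
5x5 blocks along the right and bottom edges.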
@@ -164,30 +167,49 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
 #if CONFIG_AOM_HIGHBITDEPTH
                 if (cm->use_highbitdepth) {
                   uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
-                  for (c = 0; c < bs; c++) {
-                    *(uint64_t *)(d + c * sstride) =
-                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
-                    if (bs == 8)
+                  if (sizex == 8) {
+                    for (c = 0; c < sizey; c++) {
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
                       *(uint64_t *)(d + c * sstride + 4) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                    }
+                  } else if (sizex == 4) {
+                    for (c = 0; c < sizey; c++)
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  } else {
+                    for (c = 0; c < sizey; c++)
+                      memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                             sizex);
                   }
                 } else {
-                  for (c = 0; c < bs; c++)
-                    if (bs == 8)
+                  if (sizex == 8)
+                    for (c = 0; c < sizey; c++)
                       *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                    else
+                  else if (sizex == 4)
+                    for (c = 0; c < sizey; c++)
                       *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                  else
+                    for (c = 0; c < sizey; c++)
+                      memcpy(cache_dst[cache_idx] + c * sstride,
+                             cache_ptr[cache_idx] + c * bs, sizex);
                 }
 #else
-                for (c = 0; c < bs; c++)
-                  if (bs == 8)
+                if (sizex == 8)
+                  for (c = 0; c < sizey; c++)
                     *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                  else
+                else if (sizex == 4)
+                  for (c = 0; c < sizey; c++)
                     *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                else
+                  for (c = 0; c < sizey; c++)
+                    memcpy(cache_dst[cache_idx] + c * sstride,
+                           cache_ptr[cache_idx] + c * bs, sizex);
 #endif
               }
 #if CONFIG_AOM_HIGHBITDEPTH
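Read on its own, the new copy code dispatches on the clipped width once and
then runs tight row loops. The sketch below restates the pattern as a
standalone helper (copy_rows is a hypothetical name, not an aom function;
it uses constant-size memcpy, which compilers reduce to single word stores,
where the diff uses direct uint64_t/uint32_t stores):

#include <stdint.h>
#include <string.h>

static void copy_rows(uint8_t *dst, int dstride, const uint8_t *src,
                      int sstride, int sizex, int sizey) {
  int c;
  if (sizex == 8) {
    for (c = 0; c < sizey; c++) /* full 8-pixel rows: one 64-bit store */
      memcpy(dst + c * dstride, src + c * sstride, 8);
  } else if (sizex == 4) {
    for (c = 0; c < sizey; c++) /* full 4-pixel rows: one 32-bit store */
      memcpy(dst + c * dstride, src + c * sstride, 4);
  } else {
    for (c = 0; c < sizey; c++) /* clipped partial rows near the frame edge */
      memcpy(dst + c * dstride, src + c * sstride, sizex);
  }
}

The old code made the equivalent decision with an if (bs == 8) inside every
loop iteration; hoisting it removes the per-row test entirely.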
@@ -211,15 +233,15 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
               if (cm->use_highbitdepth) {
                 aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                    CONVERT_TO_SHORTPTR(dst_buffer), sstride,
-                                   dstride, xpos, ypos, bs, bs, width, height,
-                                   strength);
+                                   dstride, xpos, ypos, sizex, sizey, width,
+                                   height, strength);
               } else {
                 aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                               ypos, bs, bs, width, height, strength);
+                               ypos, sizex, sizey, width, height, strength);
               }
 #else
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                             ypos, bs, bs, width, height, strength);
+                             ypos, sizex, sizey, width, height, strength);
 #endif
             }
           }

@@ -76,24 +76,27 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
    }
+
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
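The restructuring here is the same in all four kernels: b/c/d/e (the
two-left, one-left, one-right and two-right neighbors) were previously
always loaded and then, for edge blocks, repaired with a shuffle; now
interior blocks take plain unaligned loads and only edge blocks pay for
the shuffles, which source from the already loaded center vector o. In
scalar terms the clipping just replicates the border pixel. A hypothetical
scalar equivalent, for illustration only (not aom code):

/* Horizontal neighbor of column x at signed offset `off` in a row of
 * `width` pixels; out-of-frame reads clamp to the border pixel, which
 * is what the b/c/d/e shuffles implement for edge blocks. */
static unsigned char h_neighbor(const unsigned char *row, int width,
                                int x, int off) {
  int pos = x + off;
  if (pos < 0) pos = 0;              /* left clipping */
  if (pos >= width) pos = width - 1; /* right clipping */
  return row[pos];
}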
@@ -134,31 +137,34 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
     const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
     v128 o = v128_from_32(l1, l2, l3, l4);
     const v128 a = v128_from_32(l0, l1, l2, l3);
-    v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
-                          u32_load_unaligned(src + sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
-    v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
-                          u32_load_unaligned(src + sstride - !!x0),
-                          u32_load_unaligned(src + 2 * sstride - !!x0),
-                          u32_load_unaligned(src + 3 * sstride - !!x0));
-    v128 d = v128_from_32(u32_load_unaligned(src + !!right),
-                          u32_load_unaligned(src + sstride + !!right),
-                          u32_load_unaligned(src + 2 * sstride + !!right),
-                          u32_load_unaligned(src + 3 * sstride + !!right));
-    v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
-                          u32_load_unaligned(src + sstride + 2 * !!right),
-                          u32_load_unaligned(src + 2 * sstride + 2 * !!right),
-                          u32_load_unaligned(src + 3 * sstride + 2 * !!right));
     const v128 f = v128_from_32(l2, l3, l4, l5);
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
+
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
-      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block widths not 4 or 8
+    // * block heights not a multiple of 4 if the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
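The fallback test no longer rejects blocks that merely touch the right or
bottom frame edge (the old y0 + 4 > height and x0 + 4 > width terms);
since the caller now passes clipped sizes, only genuinely awkward shapes
go to C. As a sketch, the condition is equivalent to the following
hypothetical predicate (illustrative name, not an aom function):

/* SIMD path: widths of 4 or 8 only, and 4-wide blocks need a height
 * that is a multiple of 4 because clpf_block4 packs four rows into
 * each 128-bit vector. */
static int use_simd_lbd(int sizex, int sizey) {
  return (sizex == 4 || sizex == 8) && !(sizex == 4 && (sizey & 3));
}

/* e.g. use_simd_lbd(8, 4) == 1, use_simd_lbd(4, 6) == 0,
 *      use_simd_lbd(5, 8) == 0 */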
@@ -255,24 +262,27 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
     src += sstride * 2;
@@ -309,18 +319,21 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
     const v128 o = v128_load_aligned(src);
     const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
     const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
-    v128 b = v128_load_unaligned(src - 2 * !!x0);
-    v128 c = v128_load_unaligned(src - !!x0);
-    v128 d = v128_load_unaligned(src + !!right);
-    v128 e = v128_load_unaligned(src + 2 * !!right);
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    v128 b, c, d, e;
+
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
     src += sstride;
@@ -332,8 +345,10 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, int width, int height,
                                    unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block width not 4 or 8
+    // * block heights not a multiple of 2 if the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                          width, height, strength);
   } else {
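The high-bitdepth fallback differs only in the height constraint:
clpf_block_hbd4 above packs two rows per 128-bit vector (it advances src
by sstride * 2), so a 4-wide block only needs an even height. Mirroring
the condition with another hypothetical predicate:

static int use_simd_hbd(int sizex, int sizey) {
  return (sizex == 4 || sizex == 8) && !(sizex == 4 && (sizey & 1));
}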
 