Fix overflow issue in 32x32 idct NEON intrinsics
Similar issue as Change bc1c18e.
The PartialIDctTest.ResultsMatch test on vpx_idct32x32_135_add_neon()
in high bit-depth mode exposes a 16-bit overflow in the final stage of
pass 2 when the test count is increased from 1,000 to 1,000,000.
Change to use saturating add/sub for vpx_idct32x32_34_add_neon(),
vpx_idct32x32_135_add_neon(), and vpx_idct32x32_1024_add_neon() in high
bit-depth mode.
Change-Id: Iaec0e9aeab41a3fdb4e170d7e9b3ad1fda922f6f
			
			
This commit is contained in:
		@@ -612,50 +612,50 @@ static void idct32_16_neon(const int16_t *const input, uint8_t *const output,
 | 
			
		||||
  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
 | 
			
		||||
 | 
			
		||||
  // final stage
 | 
			
		||||
  out[0] = vaddq_s16(s7[0], s6[31]);
 | 
			
		||||
  out[1] = vaddq_s16(s7[1], s6[30]);
 | 
			
		||||
  out[2] = vaddq_s16(s7[2], s6[29]);
 | 
			
		||||
  out[3] = vaddq_s16(s7[3], s6[28]);
 | 
			
		||||
  out[4] = vaddq_s16(s7[4], s7[27]);
 | 
			
		||||
  out[5] = vaddq_s16(s7[5], s7[26]);
 | 
			
		||||
  out[6] = vaddq_s16(s7[6], s7[25]);
 | 
			
		||||
  out[7] = vaddq_s16(s7[7], s7[24]);
 | 
			
		||||
  out[0] = final_add(s7[0], s6[31]);
 | 
			
		||||
  out[1] = final_add(s7[1], s6[30]);
 | 
			
		||||
  out[2] = final_add(s7[2], s6[29]);
 | 
			
		||||
  out[3] = final_add(s7[3], s6[28]);
 | 
			
		||||
  out[4] = final_add(s7[4], s7[27]);
 | 
			
		||||
  out[5] = final_add(s7[5], s7[26]);
 | 
			
		||||
  out[6] = final_add(s7[6], s7[25]);
 | 
			
		||||
  out[7] = final_add(s7[7], s7[24]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output, stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vaddq_s16(s7[8], s7[23]);
 | 
			
		||||
  out[1] = vaddq_s16(s7[9], s7[22]);
 | 
			
		||||
  out[2] = vaddq_s16(s7[10], s7[21]);
 | 
			
		||||
  out[3] = vaddq_s16(s7[11], s7[20]);
 | 
			
		||||
  out[4] = vaddq_s16(s7[12], s6[19]);
 | 
			
		||||
  out[5] = vaddq_s16(s7[13], s6[18]);
 | 
			
		||||
  out[6] = vaddq_s16(s7[14], s6[17]);
 | 
			
		||||
  out[7] = vaddq_s16(s7[15], s6[16]);
 | 
			
		||||
  out[0] = final_add(s7[8], s7[23]);
 | 
			
		||||
  out[1] = final_add(s7[9], s7[22]);
 | 
			
		||||
  out[2] = final_add(s7[10], s7[21]);
 | 
			
		||||
  out[3] = final_add(s7[11], s7[20]);
 | 
			
		||||
  out[4] = final_add(s7[12], s6[19]);
 | 
			
		||||
  out[5] = final_add(s7[13], s6[18]);
 | 
			
		||||
  out[6] = final_add(s7[14], s6[17]);
 | 
			
		||||
  out[7] = final_add(s7[15], s6[16]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (8 * stride), stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vsubq_s16(s7[15], s6[16]);
 | 
			
		||||
  out[1] = vsubq_s16(s7[14], s6[17]);
 | 
			
		||||
  out[2] = vsubq_s16(s7[13], s6[18]);
 | 
			
		||||
  out[3] = vsubq_s16(s7[12], s6[19]);
 | 
			
		||||
  out[4] = vsubq_s16(s7[11], s7[20]);
 | 
			
		||||
  out[5] = vsubq_s16(s7[10], s7[21]);
 | 
			
		||||
  out[6] = vsubq_s16(s7[9], s7[22]);
 | 
			
		||||
  out[7] = vsubq_s16(s7[8], s7[23]);
 | 
			
		||||
  out[0] = final_sub(s7[15], s6[16]);
 | 
			
		||||
  out[1] = final_sub(s7[14], s6[17]);
 | 
			
		||||
  out[2] = final_sub(s7[13], s6[18]);
 | 
			
		||||
  out[3] = final_sub(s7[12], s6[19]);
 | 
			
		||||
  out[4] = final_sub(s7[11], s7[20]);
 | 
			
		||||
  out[5] = final_sub(s7[10], s7[21]);
 | 
			
		||||
  out[6] = final_sub(s7[9], s7[22]);
 | 
			
		||||
  out[7] = final_sub(s7[8], s7[23]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (16 * stride), stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vsubq_s16(s7[7], s7[24]);
 | 
			
		||||
  out[1] = vsubq_s16(s7[6], s7[25]);
 | 
			
		||||
  out[2] = vsubq_s16(s7[5], s7[26]);
 | 
			
		||||
  out[3] = vsubq_s16(s7[4], s7[27]);
 | 
			
		||||
  out[4] = vsubq_s16(s7[3], s6[28]);
 | 
			
		||||
  out[5] = vsubq_s16(s7[2], s6[29]);
 | 
			
		||||
  out[6] = vsubq_s16(s7[1], s6[30]);
 | 
			
		||||
  out[7] = vsubq_s16(s7[0], s6[31]);
 | 
			
		||||
  out[0] = final_sub(s7[7], s7[24]);
 | 
			
		||||
  out[1] = final_sub(s7[6], s7[25]);
 | 
			
		||||
  out[2] = final_sub(s7[5], s7[26]);
 | 
			
		||||
  out[3] = final_sub(s7[4], s7[27]);
 | 
			
		||||
  out[4] = final_sub(s7[3], s6[28]);
 | 
			
		||||
  out[5] = final_sub(s7[2], s6[29]);
 | 
			
		||||
  out[6] = final_sub(s7[1], s6[30]);
 | 
			
		||||
  out[7] = final_sub(s7[0], s6[31]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (24 * stride), stride);
 | 
			
		||||
 
 | 
			
		||||
@@ -451,50 +451,50 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
 | 
			
		||||
  s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);
 | 
			
		||||
 | 
			
		||||
  // final stage
 | 
			
		||||
  out[0] = vaddq_s16(s1[0], s2[31]);
 | 
			
		||||
  out[1] = vaddq_s16(s1[1], s2[30]);
 | 
			
		||||
  out[2] = vaddq_s16(s1[2], s2[29]);
 | 
			
		||||
  out[3] = vaddq_s16(s1[3], s2[28]);
 | 
			
		||||
  out[4] = vaddq_s16(s1[4], s1[27]);
 | 
			
		||||
  out[5] = vaddq_s16(s1[5], s1[26]);
 | 
			
		||||
  out[6] = vaddq_s16(s1[6], s1[25]);
 | 
			
		||||
  out[7] = vaddq_s16(s1[7], s1[24]);
 | 
			
		||||
  out[0] = final_add(s1[0], s2[31]);
 | 
			
		||||
  out[1] = final_add(s1[1], s2[30]);
 | 
			
		||||
  out[2] = final_add(s1[2], s2[29]);
 | 
			
		||||
  out[3] = final_add(s1[3], s2[28]);
 | 
			
		||||
  out[4] = final_add(s1[4], s1[27]);
 | 
			
		||||
  out[5] = final_add(s1[5], s1[26]);
 | 
			
		||||
  out[6] = final_add(s1[6], s1[25]);
 | 
			
		||||
  out[7] = final_add(s1[7], s1[24]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output, stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vaddq_s16(s1[8], s2[23]);
 | 
			
		||||
  out[1] = vaddq_s16(s1[9], s2[22]);
 | 
			
		||||
  out[2] = vaddq_s16(s1[10], s1[21]);
 | 
			
		||||
  out[3] = vaddq_s16(s1[11], s1[20]);
 | 
			
		||||
  out[4] = vaddq_s16(s1[12], s2[19]);
 | 
			
		||||
  out[5] = vaddq_s16(s1[13], s2[18]);
 | 
			
		||||
  out[6] = vaddq_s16(s1[14], s1[17]);
 | 
			
		||||
  out[7] = vaddq_s16(s1[15], s1[16]);
 | 
			
		||||
  out[0] = final_add(s1[8], s2[23]);
 | 
			
		||||
  out[1] = final_add(s1[9], s2[22]);
 | 
			
		||||
  out[2] = final_add(s1[10], s1[21]);
 | 
			
		||||
  out[3] = final_add(s1[11], s1[20]);
 | 
			
		||||
  out[4] = final_add(s1[12], s2[19]);
 | 
			
		||||
  out[5] = final_add(s1[13], s2[18]);
 | 
			
		||||
  out[6] = final_add(s1[14], s1[17]);
 | 
			
		||||
  out[7] = final_add(s1[15], s1[16]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (8 * stride), stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vsubq_s16(s1[15], s1[16]);
 | 
			
		||||
  out[1] = vsubq_s16(s1[14], s1[17]);
 | 
			
		||||
  out[2] = vsubq_s16(s1[13], s2[18]);
 | 
			
		||||
  out[3] = vsubq_s16(s1[12], s2[19]);
 | 
			
		||||
  out[4] = vsubq_s16(s1[11], s1[20]);
 | 
			
		||||
  out[5] = vsubq_s16(s1[10], s1[21]);
 | 
			
		||||
  out[6] = vsubq_s16(s1[9], s2[22]);
 | 
			
		||||
  out[7] = vsubq_s16(s1[8], s2[23]);
 | 
			
		||||
  out[0] = final_sub(s1[15], s1[16]);
 | 
			
		||||
  out[1] = final_sub(s1[14], s1[17]);
 | 
			
		||||
  out[2] = final_sub(s1[13], s2[18]);
 | 
			
		||||
  out[3] = final_sub(s1[12], s2[19]);
 | 
			
		||||
  out[4] = final_sub(s1[11], s1[20]);
 | 
			
		||||
  out[5] = final_sub(s1[10], s1[21]);
 | 
			
		||||
  out[6] = final_sub(s1[9], s2[22]);
 | 
			
		||||
  out[7] = final_sub(s1[8], s2[23]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (16 * stride), stride);
 | 
			
		||||
 | 
			
		||||
  out[0] = vsubq_s16(s1[7], s1[24]);
 | 
			
		||||
  out[1] = vsubq_s16(s1[6], s1[25]);
 | 
			
		||||
  out[2] = vsubq_s16(s1[5], s1[26]);
 | 
			
		||||
  out[3] = vsubq_s16(s1[4], s1[27]);
 | 
			
		||||
  out[4] = vsubq_s16(s1[3], s2[28]);
 | 
			
		||||
  out[5] = vsubq_s16(s1[2], s2[29]);
 | 
			
		||||
  out[6] = vsubq_s16(s1[1], s2[30]);
 | 
			
		||||
  out[7] = vsubq_s16(s1[0], s2[31]);
 | 
			
		||||
  out[0] = final_sub(s1[7], s1[24]);
 | 
			
		||||
  out[1] = final_sub(s1[6], s1[25]);
 | 
			
		||||
  out[2] = final_sub(s1[5], s1[26]);
 | 
			
		||||
  out[3] = final_sub(s1[4], s1[27]);
 | 
			
		||||
  out[4] = final_sub(s1[3], s2[28]);
 | 
			
		||||
  out[5] = final_sub(s1[2], s2[29]);
 | 
			
		||||
  out[6] = final_sub(s1[1], s2[30]);
 | 
			
		||||
  out[7] = final_sub(s1[0], s2[31]);
 | 
			
		||||
 | 
			
		||||
  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
 | 
			
		||||
                       out[7], output + (24 * stride), stride);
 | 
			
		||||
 
 | 
			
		||||
@@ -339,10 +339,10 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
 | 
			
		||||
  dest3 -= str2;
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 30, 31, &q[0], &q[1]);
 | 
			
		||||
  q[4] = vaddq_s16(q[2], q[1]);
 | 
			
		||||
  q[5] = vaddq_s16(q[3], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[3], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[2], q[1]);
 | 
			
		||||
  q[4] = final_add(q[2], q[1]);
 | 
			
		||||
  q[5] = final_add(q[3], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[3], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[2], q[1]);
 | 
			
		||||
  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 | 
			
		||||
  dest0 += str2;
 | 
			
		||||
  dest1 -= str2;
 | 
			
		||||
@@ -354,19 +354,19 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
 | 
			
		||||
  q[5] = vsubq_s16(q[10], q[1]);
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 18, 19, &q[0], &q[1]);
 | 
			
		||||
  q[8] = vaddq_s16(q[4], q[1]);
 | 
			
		||||
  q[9] = vaddq_s16(q[5], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[5], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[4], q[1]);
 | 
			
		||||
  q[8] = final_add(q[4], q[1]);
 | 
			
		||||
  q[9] = final_add(q[5], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[5], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[4], q[1]);
 | 
			
		||||
  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
 | 
			
		||||
  dest2 += str2;
 | 
			
		||||
  dest3 -= str2;
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 28, 29, &q[0], &q[1]);
 | 
			
		||||
  q[4] = vaddq_s16(q[2], q[1]);
 | 
			
		||||
  q[5] = vaddq_s16(q[3], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[3], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[2], q[1]);
 | 
			
		||||
  q[4] = final_add(q[2], q[1]);
 | 
			
		||||
  q[5] = final_add(q[3], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[3], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[2], q[1]);
 | 
			
		||||
  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 | 
			
		||||
  dest0 += str2;
 | 
			
		||||
  dest1 -= str2;
 | 
			
		||||
@@ -378,19 +378,19 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
 | 
			
		||||
  q[5] = vsubq_s16(q[12], q[1]);
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 20, 21, &q[0], &q[1]);
 | 
			
		||||
  q[8] = vaddq_s16(q[4], q[1]);
 | 
			
		||||
  q[9] = vaddq_s16(q[5], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[5], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[4], q[1]);
 | 
			
		||||
  q[8] = final_add(q[4], q[1]);
 | 
			
		||||
  q[9] = final_add(q[5], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[5], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[4], q[1]);
 | 
			
		||||
  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
 | 
			
		||||
  dest2 += str2;
 | 
			
		||||
  dest3 -= str2;
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 26, 27, &q[0], &q[1]);
 | 
			
		||||
  q[4] = vaddq_s16(q[2], q[1]);
 | 
			
		||||
  q[5] = vaddq_s16(q[3], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[3], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[2], q[1]);
 | 
			
		||||
  q[4] = final_add(q[2], q[1]);
 | 
			
		||||
  q[5] = final_add(q[3], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[3], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[2], q[1]);
 | 
			
		||||
  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 | 
			
		||||
  dest0 += str2;
 | 
			
		||||
  dest1 -= str2;
 | 
			
		||||
@@ -402,17 +402,17 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
 | 
			
		||||
  q[5] = vsubq_s16(q[14], q[1]);
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 22, 23, &q[0], &q[1]);
 | 
			
		||||
  q[8] = vaddq_s16(q[4], q[1]);
 | 
			
		||||
  q[9] = vaddq_s16(q[5], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[5], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[4], q[1]);
 | 
			
		||||
  q[8] = final_add(q[4], q[1]);
 | 
			
		||||
  q[9] = final_add(q[5], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[5], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[4], q[1]);
 | 
			
		||||
  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
 | 
			
		||||
 | 
			
		||||
  load_from_output(out, 24, 25, &q[0], &q[1]);
 | 
			
		||||
  q[4] = vaddq_s16(q[2], q[1]);
 | 
			
		||||
  q[5] = vaddq_s16(q[3], q[0]);
 | 
			
		||||
  q[6] = vsubq_s16(q[3], q[0]);
 | 
			
		||||
  q[7] = vsubq_s16(q[2], q[1]);
 | 
			
		||||
  q[4] = final_add(q[2], q[1]);
 | 
			
		||||
  q[5] = final_add(q[3], q[0]);
 | 
			
		||||
  q[6] = final_sub(q[3], q[0]);
 | 
			
		||||
  q[7] = final_sub(q[2], q[1]);
 | 
			
		||||
  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -656,10 +656,10 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
 | 
			
		||||
      q[4] = vsubq_s16(q[9], q[0]);
 | 
			
		||||
      q[5] = vsubq_s16(q[8], q[1]);
 | 
			
		||||
      load_from_output(out, 16, 17, &q[0], &q[1]);
 | 
			
		||||
      q[8] = vaddq_s16(q[4], q[1]);
 | 
			
		||||
      q[9] = vaddq_s16(q[5], q[0]);
 | 
			
		||||
      q[6] = vsubq_s16(q[5], q[0]);
 | 
			
		||||
      q[7] = vsubq_s16(q[4], q[1]);
 | 
			
		||||
      q[8] = final_add(q[4], q[1]);
 | 
			
		||||
      q[9] = final_add(q[5], q[0]);
 | 
			
		||||
      q[6] = final_sub(q[5], q[0]);
 | 
			
		||||
      q[7] = final_sub(q[4], q[1]);
 | 
			
		||||
 | 
			
		||||
      if (idct32_pass_loop == 0) {
 | 
			
		||||
        idct32_bands_end_1st_pass(out, q);
 | 
			
		||||
 
 | 
			
		||||
@@ -92,6 +92,24 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//------------------------------------------------------------------------------
 | 
			
		||||
// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
 | 
			
		||||
// Adds the two int16x8_t vectors `a` and `b` for the final stage of the
// transform. With CONFIG_VP9_HIGHBITDEPTH enabled the saturating intrinsic
// (vqaddq_s16) is selected; otherwise the plain add (vaddq_s16) is used.
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
#if !CONFIG_VP9_HIGHBITDEPTH
  const int16x8_t sum = vaddq_s16(a, b);
#else
  const int16x8_t sum = vqaddq_s16(a, b);
#endif
  return sum;
}
 | 
			
		||||
 | 
			
		||||
// Computes `a - b` on two int16x8_t vectors for the final stage of the
// transform. With CONFIG_VP9_HIGHBITDEPTH enabled the saturating intrinsic
// (vqsubq_s16) is selected; otherwise the plain subtract (vsubq_s16) is used.
static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
#if !CONFIG_VP9_HIGHBITDEPTH
  const int16x8_t diff = vsubq_s16(a, b);
#else
  const int16x8_t diff = vqsubq_s16(a, b);
#endif
  return diff;
}
 | 
			
		||||
 | 
			
		||||
//------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user