64f728caef
This patch follows the "Rewrite filter_selectively_horiz for parallel loopfiltering" commit and adds an x86 SSE2 optimization that filters 16 pixels in parallel. It also corrects the declaration of aligned arrays. For the 8-pixels-in-parallel case, it improves the calculation of the masks and filters. The threshold loading is updated, since the thresholds were already duplicated. The NEON C functions are updated to call the NEON loopfilters twice. Tests using the tulip clip showed a ~1.5% decoder speed gain. Change-Id: Id02638626ac27a4b0e0b09d71792a24c0499bd35 |
||
---|---|---|
.. | ||
vp9_avg_neon.asm | ||
vp9_convolve8_avg_neon.asm | ||
vp9_convolve8_neon.asm | ||
vp9_convolve_neon.c | ||
vp9_copy_neon.asm | ||
vp9_dc_only_idct_add_neon.asm | ||
vp9_idct16x16_neon.c | ||
vp9_loopfilter_16_neon.c | ||
vp9_loopfilter_neon.asm | ||
vp9_mb_lpf_neon.asm | ||
vp9_save_reg_neon.asm | ||
vp9_short_idct4x4_1_add_neon.asm | ||
vp9_short_idct4x4_add_neon.asm | ||
vp9_short_idct8x8_1_add_neon.asm | ||
vp9_short_idct8x8_add_neon.asm | ||
vp9_short_idct16x16_1_add_neon.asm | ||
vp9_short_idct16x16_add_neon.asm | ||
vp9_short_idct32x32_1_add_neon.asm | ||
vp9_short_idct32x32_add_neon.asm | ||
vp9_short_iht4x4_add_neon.asm | ||
vp9_short_iht8x8_add_neon.asm |