/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/****************************************************************************
 * Notes:
 *
 * This implementation makes use of 16-bit fixed-point versions of two
 * multiply constants:
 * 1. sqrt(2) * cos(pi/8)
 * 2. sqrt(2) * sin(pi/8)
 * Because the first constant is larger than 1, to maintain the same 16-bit
 * fixed-point precision as the second one, we use the identity
 * x * a = x + x*(a-1)
 * so
 * x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2)*cos(pi/8) - 1).
 **************************************************************************/

#include <assert.h>
#include <math.h>

#include "./vpx_config.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"

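/* Added note: the first two constants below are Q16 fixed-point encodings
 * used by the 4x4 IDCT:
 *   cospi8sqrt2minus1 = round((sqrt(2) * cos(pi/8) - 1) * 65536) = 20091
 *   sinpi8sqrt2       = round( sqrt(2) * sin(pi/8)      * 65536) = 35468
 */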
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
static const int rounding = 0;

// Constants and Macros used by 16 and 32 point idct functions
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

// Constants are 16384 * cos(kPi/64) where k = 1 to 31.
// Note: sin(kPi/64) = cos((32-k)Pi/64)
static const int cospi_1_64 = 16364;
static const int cospi_2_64 = 16305;
static const int cospi_3_64 = 16207;
static const int cospi_4_64 = 16069;
static const int cospi_5_64 = 15893;
static const int cospi_6_64 = 15679;
static const int cospi_7_64 = 15426;
static const int cospi_8_64 = 15137;
static const int cospi_9_64 = 14811;
static const int cospi_10_64 = 14449;
static const int cospi_11_64 = 14053;
static const int cospi_12_64 = 13623;
static const int cospi_13_64 = 13160;
static const int cospi_14_64 = 12665;
static const int cospi_15_64 = 12140;
static const int cospi_16_64 = 11585;
static const int cospi_17_64 = 11003;
static const int cospi_18_64 = 10394;
static const int cospi_19_64 = 9760;
static const int cospi_20_64 = 9102;
static const int cospi_21_64 = 8423;
static const int cospi_22_64 = 7723;
static const int cospi_23_64 = 7005;
static const int cospi_24_64 = 6270;
static const int cospi_25_64 = 5520;
static const int cospi_26_64 = 4756;
static const int cospi_27_64 = 3981;
static const int cospi_28_64 = 3196;
static const int cospi_29_64 = 2404;
static const int cospi_30_64 = 1606;
static const int cospi_31_64 = 804;

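// Example (added note): for k = 16, 16384 * cos(16*pi/64) = 16384 * cos(pi/4),
// which rounds to 11585 and matches cospi_16_64 above.

// Added note: dct_const_round_shift() below rounds a Q14 (DCT_CONST_BITS)
// fixed-point value back to an integer, i.e. it computes round(input / 2^14)
// and asserts that the result fits in an int16_t.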
static int16_t dct_const_round_shift(int input) {
  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
  assert((rv <= INT16_MAX) && (rv >= INT16_MIN));
  return (int16_t)rv;
}

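/* Added note: the idct_iN / iadst_iN tables below hold the inverse transform
 * basis vectors in fixed point. For example, in idct_i4 the entries
 * 10703 ~= 8192 * sqrt(2) * cos(pi/8) and 4433 ~= 8192 * sqrt(2) * cos(3*pi/8),
 * while 8192 itself is used for the DC basis. */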
static const int16_t idct_i4[16] = {
  8192, 10703, 8192, 4433,
  8192, 4433, -8192, -10703,
  8192, -4433, -8192, 10703,
  8192, -10703, 8192, -4433
};

static const int16_t iadst_i4[16] = {
  3736, 9459, 10757, 7021,
  7021, 9459, -3736, -10757,
  9459, 0, -9459, 9459,
  10757, -9459, 7021, -3736
};

static const int16_t idct_i8[64] = {
  5793, 8035, 7568, 6811,
  5793, 4551, 3135, 1598,
  5793, 6811, 3135, -1598,
  -5793, -8035, -7568, -4551,
  5793, 4551, -3135, -8035,
  -5793, 1598, 7568, 6811,
  5793, 1598, -7568, -4551,
  5793, 6811, -3135, -8035,
  5793, -1598, -7568, 4551,
  5793, -6811, -3135, 8035,
  5793, -4551, -3135, 8035,
  -5793, -1598, 7568, -6811,
  5793, -6811, 3135, 1598,
  -5793, 8035, -7568, 4551,
  5793, -8035, 7568, -6811,
  5793, -4551, 3135, -1598
};

static const int16_t iadst_i8[64] = {
  1460, 4184, 6342, 7644,
  7914, 7114, 5354, 2871,
  2871, 7114, 7644, 4184,
  -1460, -6342, -7914, -5354,
  4184, 7914, 2871, -5354,
  -7644, -1460, 6342, 7114,
  5354, 6342, -4184, -7114,
  2871, 7644, -1460, -7914,
  6342, 2871, -7914, 1460,
  7114, -5354, -4184, 7644,
  7114, -1460, -5354, 7914,
  -4184, -2871, 7644, -6342,
  7644, -5354, 1460, 2871,
  -6342, 7914, -7114, 4184,
  7914, -7644, 7114, -6342,
  5354, -4184, 2871, -1460
};

static const int16_t idct_i16[256] = {
  4096, 5765, 5681, 5543, 5352, 5109, 4816, 4478,
  4096, 3675, 3218, 2731, 2217, 1682, 1130, 568,
  4096, 5543, 4816, 3675, 2217, 568, -1130, -2731,
  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
  4096, 5109, 3218, 568, -2217, -4478, -5681, -5543,
  -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731,
  4096, 4478, 1130, -2731, -5352, -5543, -3218, 568,
  4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675,
  4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765,
  4096, -568, -4816, -5543, -2217, 2731, 5681, 4478,
  4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682,
  -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109,
  4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109,
  -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543,
  4096, 568, -5681, -1682, 5352, 2731, -4816, -3675,
  4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765,
  4096, -568, -5681, 1682, 5352, -2731, -4816, 3675,
  4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765,
  4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109,
  -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543,
  4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682,
  -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109,
  4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765,
  4096, 568, -4816, 5543, -2217, -2731, 5681, -4478,
  4096, -4478, 1130, 2731, -5352, 5543, -3218, -568,
  4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675,
  4096, -5109, 3218, -568, -2217, 4478, -5681, 5543,
  -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731,
  4096, -5543, 4816, -3675, 2217, -568, -1130, 2731,
  -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682,
  4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478,
  4096, -3675, 3218, -2731, 2217, -1682, 1130, -568
};

static const int16_t iadst_i16[256] = {
  542, 1607, 2614, 3526, 4311, 4940, 5390, 5646,
  5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080,
  1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607,
  -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
  1607, 4311, 5646, 5189, 3084, 0, -3084, -5189,
  -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084,
  2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084,
  1080, 4646, 5646, 3526, -542, -4311, -5698, -3936,
  2614, 5646, 3936, -1080, -5189, -4940, -542, 4311,
  5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646,
  3084, 5646, 1607, -4311, -5189, 0, 5189, 4311,
  -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189,
  3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084,
  -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543,
  3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189,
  2120, 5390, -1607, -5543, 1080, 5646, -542, -5698,
  4311, 3084, -5189, -1607, 5646, 0, -5646, 1607,
  5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646,
  4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646,
  -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390,
  4940, 0, -4940, 4940, 0, -4940, 4940, 0,
  -4940, 4940, 0, -4940, 4940, 0, -4940, 4940,
  5189, -1607, -3084, 5646, -4311, 0, 4311, -5646,
  3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311,
  5390, -3084, -542, 3936, -5646, 4940, -2120, -1607,
  4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526,
  5543, -4311, 2120, 542, -3084, 4940, -5698, 5189,
  -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614,
  5646, -5189, 4311, -3084, 1607, 0, -1607, 3084,
  -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607,
  5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311,
  3936, -3526, 3084, -2614, 2120, -1607, 1080, -542
};

/* Converted the transforms to integer form. */
#define HORIZONTAL_SHIFT 14  // 16
#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
#define VERTICAL_SHIFT 17  // 15
#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)

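/* Added note: vp9_ihtllm_c() applies the inverse hybrid transform to a
 * tx_dim x tx_dim block (4, 8 or 16), selecting a DCT or ADST basis per
 * direction from tx_type. The eobs-based shortcut below shrinks the inner
 * loop bound (nz_dim) when the non-zero coefficients are confined to the
 * top-left corner of the block. */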
void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
                  TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
  int i, j, k;
  int nz_dim;
  int16_t imbuf[256];

  const int16_t *ip = input;
  int16_t *op = output;
  int16_t *im = &imbuf[0];

  /* pointers to vertical and horizontal transforms. */
  const int16_t *ptv = NULL, *pth = NULL;
  int shortpitch = pitch >> 1;

  switch (tx_type) {
    case ADST_ADST:
      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
                                : ((tx_dim == 8) ? &iadst_i8[0]
                                                 : &iadst_i16[0]);
      break;
    case ADST_DCT:
      ptv = (tx_dim == 4) ? &iadst_i4[0]
                          : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
      pth = (tx_dim == 4) ? &idct_i4[0]
                          : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
      break;
    case DCT_ADST:
      ptv = (tx_dim == 4) ? &idct_i4[0]
                          : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
      pth = (tx_dim == 4) ? &iadst_i4[0]
                          : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
      break;
    case DCT_DCT:
      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
                                : ((tx_dim == 8) ? &idct_i8[0]
                                                 : &idct_i16[0]);
      break;
    default:
      assert(0);
      break;
  }

  nz_dim = tx_dim;
  if (tx_dim > 4) {
    if (eobs < 36) {
      vpx_memset(im, 0, 512);
      nz_dim = 8;
      if (eobs < 3) {
        nz_dim = 2;
      } else if (eobs < 10) {
        nz_dim = 4;
      }
    }
  }

  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
   * from right to left:
   * 1. horizontal transform: Y = Z*Transposed_M2
   * 2. vertical transform: X = M1*Y
   * In SIMD, doing it this way eliminates the transpose that would be needed
   * if it were calculated from left to right.
   */
  /* Horizontal transformation */
  for (j = 0; j < tx_dim; j++) {
    for (i = 0; i < nz_dim; i++) {
      int temp = 0;

      for (k = 0; k < nz_dim; k++) {
        temp += ip[k] * pth[k];
      }

      /* Calculate im and store it in its transposed position. */
      im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
      ip += tx_dim;
    }
    im += tx_dim;
    pth += tx_dim;
    ip = input;
  }

  /* Vertical transformation */
  im = &imbuf[0];

  for (i = 0; i < tx_dim; i++) {
    for (j = 0; j < tx_dim; j++) {
      int temp = 0;

      for (k = 0; k < nz_dim; k++) {
        temp += ptv[k] * im[k];
      }

      op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
      im += tx_dim;
    }
    im = &imbuf[0];
    ptv += tx_dim;
    op += shortpitch;
  }
}

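/* Added note: the 4x4 IDCT below uses the fixed-point identity from the file
 * header: x * sqrt(2) * cos(pi/8) is computed as
 * x + ((x * cospi8sqrt2minus1) >> 16), and x * sqrt(2) * sin(pi/8) as
 * (x * sinpi8sqrt2) >> 16. The final (value + 16) >> 5 is the descaling step
 * with rounding. */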
void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
  int i;
  int a1, b1, c1, d1;

  int16_t *ip = input;
  int16_t *op = output;
  int temp1, temp2;
  int shortpitch = pitch >> 1;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[8];
    b1 = ip[0] - ip[8];

    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
    d1 = temp1 + temp2;

    op[shortpitch * 0] = a1 + d1;
    op[shortpitch * 3] = a1 - d1;

    op[shortpitch * 1] = b1 + c1;
    op[shortpitch * 2] = b1 - c1;

    ip++;
    op++;
  }

  ip = output;
  op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[2];
    b1 = ip[0] - ip[2];

    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
    d1 = temp1 + temp2;

    op[0] = (a1 + d1 + 16) >> 5;
    op[3] = (a1 - d1 + 16) >> 5;

    op[1] = (b1 + c1 + 16) >> 5;
    op[2] = (b1 - c1 + 16) >> 5;

    ip += shortpitch;
    op += shortpitch;
  }
}

void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
  int i;
  int a1;
  int16_t *op = output;
  int shortpitch = pitch >> 1;
  a1 = ((input[0] + 16) >> 5);
  for (i = 0; i < 4; i++) {
    op[0] = a1;
    op[1] = a1;
    op[2] = a1;
    op[3] = a1;
    op += shortpitch;
  }
}

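/* Added note: when only the DC coefficient is non-zero, every sample of the
 * 4x4 IDCT output equals (input_dc + 16) >> 5, so the function below simply
 * adds that single value to the predictor and clips to [0, 255]. */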
void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
                            uint8_t *dst_ptr, int pitch, int stride) {
  int a1 = ((input_dc + 16) >> 5);
  int r, c;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
    }

    dst_ptr += stride;
    pred_ptr += pitch;
  }
}

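/* Added note: inverse 4x4 Walsh-Hadamard transform (used for the second-order
 * DC block). It runs a row pass followed by a column pass; the +1 rounding is
 * applied only to the first output of each pass. */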
void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {
  int i;
  int a1, b1, c1, d1;
  int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[3];
    b1 = ip[1] + ip[2];
    c1 = ip[1] - ip[2];
    d1 = ip[0] - ip[3];

    op[0] = (a1 + b1 + 1) >> 1;
    op[1] = (c1 + d1) >> 1;
    op[2] = (a1 - b1) >> 1;
    op[3] = (d1 - c1) >> 1;

    ip += 4;
    op += 4;
  }

  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];
    op[0] = (a1 + b1 + 1) >> 1;
    op[4] = (c1 + d1) >> 1;
    op[8] = (a1 - b1) >> 1;
    op[12] = (d1 - c1) >> 1;
    ip++;
    op++;
  }
}

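/* Added note: the _1 variants (here and below) read only the first input
 * coefficient, i.e. they implement the DC-only case of the corresponding
 * inverse transform. */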
void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) {
  int i;
  int16_t tmp[4];
  int16_t *ip = in;
  int16_t *op = tmp;

  op[0] = (ip[0] + 1) >> 1;
  op[1] = op[2] = op[3] = (ip[0] >> 1);

  ip = tmp;
  op = out;
  for (i = 0; i < 4; i++) {
    op[0] = (ip[0] + 1) >> 1;
    op[4] = op[8] = op[12] = (ip[0] >> 1);
    ip++;
    op++;
  }
}

#if CONFIG_LOSSLESS
void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {
  int i;
  int a1, b1, c1, d1;
  int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = (ip[0] + ip[3]) >> Y2_WHT_UPSCALE_FACTOR;
    b1 = (ip[1] + ip[2]) >> Y2_WHT_UPSCALE_FACTOR;
    c1 = (ip[1] - ip[2]) >> Y2_WHT_UPSCALE_FACTOR;
    d1 = (ip[0] - ip[3]) >> Y2_WHT_UPSCALE_FACTOR;

    op[0] = (a1 + b1 + 1) >> 1;
    op[1] = (c1 + d1) >> 1;
    op[2] = (a1 - b1) >> 1;
    op[3] = (d1 - c1) >> 1;

    ip += 4;
    op += 4;
  }

  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];

    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

    ip++;
    op++;
  }
}

void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) {
  int i;
  int16_t tmp[4];
  int16_t *ip = in;
  int16_t *op = tmp;

  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);

  ip = tmp;
  op = out;
  for (i = 0; i < 4; i++) {
    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
    op[4] = op[8] = op[12] = (ip[0] >> 1) << Y2_WHT_UPSCALE_FACTOR;
    ip++;
    op++;
  }
}

void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
  int i;
  int a1, b1, c1, d1;
  int16_t *ip = input;
  int16_t *op = output;
  int shortpitch = pitch >> 1;

  for (i = 0; i < 4; i++) {
    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;

    op[0] = (a1 + b1 + 1) >> 1;
    op[1] = (c1 + d1) >> 1;
    op[2] = (a1 - b1) >> 1;
    op[3] = (d1 - c1) >> 1;

    ip += 4;
    op += shortpitch;
  }

  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];

    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
    op[shortpitch * 1] = (c1 + d1) >> 1;
    op[shortpitch * 2] = (a1 - b1) >> 1;
    op[shortpitch * 3] = (d1 - c1) >> 1;

    ip++;
    op++;
  }
}

void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
  int i;
  int16_t tmp[4];
  int16_t *ip = in;
  int16_t *op = tmp;
  int shortpitch = pitch >> 1;

  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);

  ip = tmp;
  op = out;
  for (i = 0; i < 4; i++) {
    op[shortpitch * 0] = (ip[0] + 1) >> 1;
    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
    ip++;
    op++;
  }
}

void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
                                 uint8_t *dst_ptr,
                                 int pitch, int stride) {
  int r, c;
  short tmp[16];
  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
    }

    dst_ptr += stride;
    pred_ptr += pitch;
  }
}
#endif

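/* Added note: W1..W7 are the 8-point IDCT multipliers, e.g.
 * W7 = round(2048 * sqrt(2) * cos(7*pi/16)) = 565. The missing W4
 * (2048 * sqrt(2) * cos(4*pi/16) = 2048) shows up in the row IDCT below as
 * the << 11 applied to blk[0] and blk[4]. */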
#define W1 2841  /* 2048*sqrt(2)*cos(1*pi/16) */
#define W2 2676  /* 2048*sqrt(2)*cos(2*pi/16) */
#define W3 2408  /* 2048*sqrt(2)*cos(3*pi/16) */
#define W5 1609  /* 2048*sqrt(2)*cos(5*pi/16) */
#define W6 1108  /* 2048*sqrt(2)*cos(6*pi/16) */
#define W7 565   /* 2048*sqrt(2)*cos(7*pi/16) */

/* row (horizontal) IDCT
 *
 *           7                       pi         1
 * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
 *          l=0                      8          2
 *
 * where: c[0]    = 128
 *        c[1..7] = 128*sqrt(2)
 */
static void idctrow(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
  /* shortcut */
  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
    return;
  }

  x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
  /* first stage */
  x8 = W7 * (x4 + x5);
  x4 = x8 + (W1 - W7) * x4;
  x5 = x8 - (W1 + W7) * x5;
  x8 = W3 * (x6 + x7);
  x6 = x8 - (W3 - W5) * x6;
  x7 = x8 - (W3 + W5) * x7;

  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6 * (x3 + x2);
  x2 = x1 - (W2 + W6) * x2;
  x3 = x1 + (W2 - W6) * x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[0] = (x7 + x1) >> 8;
  blk[1] = (x3 + x2) >> 8;
  blk[2] = (x0 + x4) >> 8;
  blk[3] = (x8 + x6) >> 8;
  blk[4] = (x8 - x6) >> 8;
  blk[5] = (x0 - x4) >> 8;
  blk[6] = (x3 - x2) >> 8;
  blk[7] = (x7 - x1) >> 8;
}

/* column (vertical) IDCT
 *
 *             7                         pi         1
 * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
 *            l=0                        8          2
 *
 * where: c[0]    = 1/1024
 *        c[1..7] = (1/1024)*sqrt(2)
 */
static void idctcol(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /* shortcut */
  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
        (x7 = blk[8 * 3]))) {
    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
               = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
               = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
    return;
  }

  x0 = (blk[8 * 0] << 8) + 16384;

  /* first stage */
  x8 = W7 * (x4 + x5) + 4;
  x4 = (x8 + (W1 - W7) * x4) >> 3;
  x5 = (x8 - (W1 + W7) * x5) >> 3;
  x8 = W3 * (x6 + x7) + 4;
  x6 = (x8 - (W3 - W5) * x6) >> 3;
  x7 = (x8 - (W3 + W5) * x7) >> 3;

  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6 * (x3 + x2) + 4;
  x2 = (x1 - (W2 + W6) * x2) >> 3;
  x3 = (x1 + (W2 - W6) * x3) >> 3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[8 * 0] = (x7 + x1) >> 14;
  blk[8 * 1] = (x3 + x2) >> 14;
  blk[8 * 2] = (x0 + x4) >> 14;
  blk[8 * 3] = (x8 + x6) >> 14;
  blk[8 * 4] = (x8 - x6) >> 14;
  blk[8 * 5] = (x0 - x4) >> 14;
  blk[8 * 6] = (x3 - x2) >> 14;
  blk[8 * 7] = (x7 - x1) >> 14;
}

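/* Added note: vp9_short_idct8x8_c() pre-scales the coefficients down by 4
 * (with a small bias for negative values), runs the 8-point row and column
 * IDCTs above, and then halves the result. */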
#define TX_DIM 8
void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) {
  int X[TX_DIM * TX_DIM];
  int i, j;
  int shortpitch = pitch >> 1;

  for (i = 0; i < TX_DIM; i++) {
    for (j = 0; j < TX_DIM; j++) {
      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
    }
  }
  for (i = 0; i < 8; i++)
    idctrow(X + 8 * i);

  for (i = 0; i < 8; i++)
    idctcol(X + i);

  for (i = 0; i < TX_DIM; i++) {
    for (j = 0; j < TX_DIM; j++) {
      block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
    }
  }
}

/* Row IDCT when only first 4 coefficients are non-zero. */
static void idctrow10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /* shortcut */
  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
    return;
  }

  x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
  /* first stage */
  x5 = W7 * x4;
  x4 = W1 * x4;
  x6 = W3 * x7;
  x7 = -W5 * x7;

  /* second stage */
  x2 = W6 * x3;
  x3 = W2 * x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[0] = (x7 + x1) >> 8;
  blk[1] = (x3 + x2) >> 8;
  blk[2] = (x0 + x4) >> 8;
  blk[3] = (x8 + x6) >> 8;
  blk[4] = (x8 - x6) >> 8;
  blk[5] = (x0 - x4) >> 8;
  blk[6] = (x3 - x2) >> 8;
  blk[7] = (x7 - x1) >> 8;
}

/* Column (vertical) IDCT when only first 4 coefficients are non-zero. */
static void idctcol10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /* shortcut */
  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
        (x7 = blk[8 * 3]))) {
    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
               = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
               = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
    return;
  }

  x0 = (blk[8 * 0] << 8) + 16384;

  /* first stage */
  x5 = (W7 * x4 + 4) >> 3;
  x4 = (W1 * x4 + 4) >> 3;
  x6 = (W3 * x7 + 4) >> 3;
  x7 = (-W5 * x7 + 4) >> 3;

  /* second stage */
  x2 = (W6 * x3 + 4) >> 3;
  x3 = (W2 * x3 + 4) >> 3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[8 * 0] = (x7 + x1) >> 14;
  blk[8 * 1] = (x3 + x2) >> 14;
  blk[8 * 2] = (x0 + x4) >> 14;
  blk[8 * 3] = (x8 + x6) >> 14;
  blk[8 * 4] = (x8 - x6) >> 14;
  blk[8 * 5] = (x0 - x4) >> 14;
  blk[8 * 6] = (x3 - x2) >> 14;
  blk[8 * 7] = (x7 - x1) >> 14;
}

void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) {
  int X[TX_DIM * TX_DIM];
  int i, j;
  int shortpitch = pitch >> 1;

  for (i = 0; i < TX_DIM; i++) {
    for (j = 0; j < TX_DIM; j++) {
      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
    }
  }

  /* Do first 4 row idct only since non-zero dct coefficients are all in
   * upper-left 4x4 area. */
  for (i = 0; i < 4; i++)
    idctrow10(X + 8 * i);

  for (i = 0; i < 8; i++)
    idctcol10(X + i);

  for (i = 0; i < TX_DIM; i++) {
    for (j = 0; j < TX_DIM; j++) {
      block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
    }
  }
}

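/* Added note: 2x2 inverse Haar transform. It operates on the four
 * coefficients at offsets 0, 1, 4 and 8 (see the comment below) and zeroes
 * the remaining positions of the 16-entry output block. */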
void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) {
  int i;
  int16_t *ip = input;  // 0, 1, 4, 8
  int16_t *op = output;
  for (i = 0; i < 16; i++) {
    op[i] = 0;
  }

  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
}

#if 0
// Keep a really bad float version as reference for now.
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double x;
    const int short_pitch = pitch >> 1;
    int i, j, k, l;
    for (l = 0; l < 16; ++l) {
      for (k = 0; k < 16; ++k) {
        double s = 0;
        for (i = 0; i < 16; ++i) {
          for (j = 0; j < 16; ++j) {
            x = cos(PI * j * (l + 0.5) / 16.0) *
                cos(PI * i * (k + 0.5) / 16.0) * input[i * 16 + j] / 32;
            if (i != 0)
              x *= sqrt(2.0);
            if (j != 0)
              x *= sqrt(2.0);
            s += x;
          }
        }
        output[k * short_pitch + l] = (short)round(s);
      }
    }
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
#endif

#define TEST_INT_16x16_IDCT 1
|
|
|
|
#if !TEST_INT_16x16_IDCT
|
2012-08-07 22:55:49 +02:00
|
|
|
|
2012-08-03 02:03:14 +02:00
|
|
|
static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
|
2012-10-05 12:16:46 +02:00
|
|
|
|
2012-12-12 02:06:35 +01:00
|
|
|
static const double C1 = 0.995184726672197;
|
|
|
|
static const double C2 = 0.98078528040323;
|
|
|
|
static const double C3 = 0.956940335732209;
|
|
|
|
static const double C4 = 0.923879532511287;
|
|
|
|
static const double C5 = 0.881921264348355;
|
|
|
|
static const double C6 = 0.831469612302545;
|
|
|
|
static const double C7 = 0.773010453362737;
|
|
|
|
static const double C8 = 0.707106781186548;
|
|
|
|
static const double C9 = 0.634393284163646;
|
|
|
|
static const double C10 = 0.555570233019602;
|
|
|
|
static const double C11 = 0.471396736825998;
|
|
|
|
static const double C12 = 0.38268343236509;
|
|
|
|
static const double C13 = 0.290284677254462;
|
|
|
|
static const double C14 = 0.195090322016128;
|
|
|
|
static const double C15 = 0.098017140329561;
|
|
|
|
|
2012-10-31 22:40:53 +01:00
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
2012-10-05 12:16:46 +02:00
|
|
|
{
|
|
|
|
double step[16];
|
|
|
|
double intermediate[16];
|
|
|
|
double temp1, temp2;
|
|
|

    // step 1 and 2
    step[ 0] = input[0] + input[8];
    step[ 1] = input[0] - input[8];

    temp1 = input[4] * C12;
    temp2 = input[12] * C4;
    temp1 -= temp2;
    temp1 *= C8;
    step[ 2] = 2 * temp1;

    temp1 = input[4] * C4;
    temp2 = input[12] * C12;
    temp1 += temp2;
    temp1 *= C8;
    step[ 3] = 2 * temp1;

    temp1 = input[2] * C8;
    temp1 = 2 * temp1;
    temp2 = input[6] + input[10];

    step[ 4] = temp1 + temp2;
    step[ 5] = temp1 - temp2;

    temp1 = input[14] * C8;
    temp1 = 2 * temp1;
    temp2 = input[6] - input[10];

    step[ 6] = temp2 - temp1;
    step[ 7] = temp2 + temp1;

    // for odd input
    temp1 = input[3] * C12;
    temp2 = input[13] * C4;
    temp1 += temp2;
    temp1 *= C8;
    intermediate[ 8] = 2 * temp1;

    temp1 = input[3] * C4;
    temp2 = input[13] * C12;
    temp2 -= temp1;
    temp2 *= C8;
    intermediate[ 9] = 2 * temp2;

    intermediate[10] = 2 * (input[9] * C8);
    intermediate[11] = input[15] - input[1];
    intermediate[12] = input[15] + input[1];
    intermediate[13] = 2 * (input[7] * C8);

    temp1 = input[11] * C12;
    temp2 = input[5] * C4;
    temp2 -= temp1;
    temp2 *= C8;
    intermediate[14] = 2 * temp2;

    temp1 = input[11] * C4;
    temp2 = input[5] * C12;
    temp1 += temp2;
    temp1 *= C8;
    intermediate[15] = 2 * temp1;

    step[ 8] = intermediate[ 8] + intermediate[14];
    step[ 9] = intermediate[ 9] + intermediate[15];
    step[10] = intermediate[10] + intermediate[11];
    step[11] = intermediate[10] - intermediate[11];
    step[12] = intermediate[12] + intermediate[13];
    step[13] = intermediate[12] - intermediate[13];
    step[14] = intermediate[ 8] - intermediate[14];
    step[15] = intermediate[ 9] - intermediate[15];

    // step 3
    output[0] = step[ 0] + step[ 3];
    output[1] = step[ 1] + step[ 2];
    output[2] = step[ 1] - step[ 2];
    output[3] = step[ 0] - step[ 3];

    temp1 = step[ 4] * C14;
    temp2 = step[ 7] * C2;
    temp1 -= temp2;
    output[4] = temp1;

    temp1 = step[ 4] * C2;
    temp2 = step[ 7] * C14;
    temp1 += temp2;
    output[7] = temp1;

    temp1 = step[ 5] * C10;
    temp2 = step[ 6] * C6;
    temp1 -= temp2;
    output[5] = temp1;

    temp1 = step[ 5] * C6;
    temp2 = step[ 6] * C10;
    temp1 += temp2;
    output[6] = temp1;

    output[8] = step[ 8] + step[11];
    output[9] = step[ 9] + step[10];
    output[10] = step[ 9] - step[10];
    output[11] = step[ 8] - step[11];
    output[12] = step[12] + step[15];
    output[13] = step[13] + step[14];
    output[14] = step[13] - step[14];
    output[15] = step[12] - step[15];

    // output 4
    step[ 0] = output[0] + output[7];
    step[ 1] = output[1] + output[6];
    step[ 2] = output[2] + output[5];
    step[ 3] = output[3] + output[4];
    step[ 4] = output[3] - output[4];
    step[ 5] = output[2] - output[5];
    step[ 6] = output[1] - output[6];
    step[ 7] = output[0] - output[7];

    temp1 = output[8] * C7;
    temp2 = output[15] * C9;
    temp1 -= temp2;
    step[ 8] = temp1;

    temp1 = output[9] * C11;
    temp2 = output[14] * C5;
    temp1 += temp2;
    step[ 9] = temp1;

    temp1 = output[10] * C3;
    temp2 = output[13] * C13;
    temp1 -= temp2;
    step[10] = temp1;

    temp1 = output[11] * C15;
    temp2 = output[12] * C1;
    temp1 += temp2;
    step[11] = temp1;

    temp1 = output[11] * C1;
    temp2 = output[12] * C15;
    temp2 -= temp1;
    step[12] = temp2;

    temp1 = output[10] * C13;
    temp2 = output[13] * C3;
    temp1 += temp2;
    step[13] = temp1;

    temp1 = output[9] * C5;
    temp2 = output[14] * C11;
    temp2 -= temp1;
    step[14] = temp2;

    temp1 = output[8] * C9;
    temp2 = output[15] * C7;
    temp1 += temp2;
    step[15] = temp1;

    // step 5
    output[0] = step[0] + step[15];
    output[1] = step[1] + step[14];
    output[2] = step[2] + step[13];
    output[3] = step[3] + step[12];
    output[4] = step[4] + step[11];
    output[5] = step[5] + step[10];
    output[6] = step[6] + step[ 9];
    output[7] = step[7] + step[ 8];

    output[15] = step[0] - step[15];
    output[14] = step[1] - step[14];
    output[13] = step[2] - step[13];
    output[12] = step[3] - step[12];
    output[11] = step[4] - step[11];
    output[10] = step[5] - step[10];
    output[9] = step[6] - step[ 9];
    output[8] = step[7] - step[ 8];
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}

// Remove once an int version of iDCT is written
#if 0
void reference_16x16_idct_1d(double input[16], double output[16]) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    const double kPi = 3.141592653589793238462643383279502884;
    const double kSqrt2 = 1.414213562373095048801688724209698;
    for (int k = 0; k < 16; k++) {
      output[k] = 0.0;
      for (int n = 0; n < 16; n++) {
        output[k] += input[n] * cos(kPi * (2 * k + 1) * n / 32.0);
        if (n == 0)
          output[k] = output[k] / kSqrt2;
      }
    }
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
#endif
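
// For reference: the disabled routine above evaluates
//   output[k] = sum_{n=0..15} input[n] * cos(pi * (2k + 1) * n / 32),
// with the n == 0 term weighted by 1/sqrt(2), i.e. a plain 16-point inverse
// DCT-II up to a constant scale factor. It is presumably kept only as a
// check while the integer implementation below is brought up.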

void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double out[16 * 16], out2[16 * 16];
    const int short_pitch = pitch >> 1;
    int i, j;

    // First transform rows
    for (i = 0; i < 16; ++i) {
      double temp_in[16], temp_out[16];
      for (j = 0; j < 16; ++j)
        temp_in[j] = input[j + i * short_pitch];
      butterfly_16x16_idct_1d(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        out[j + i * 16] = temp_out[j];
    }

    // Then transform columns
    for (i = 0; i < 16; ++i) {
      double temp_in[16], temp_out[16];
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      butterfly_16x16_idct_1d(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        out2[j * 16 + i] = temp_out[j];
    }

    for (i = 0; i < 16 * 16; ++i)
      output[i] = round(out2[i] / 128);
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
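
// Note: `pitch` is a byte stride, so `pitch >> 1` is the row stride in
// int16_t units. The final division by 128 presumably undoes the gain that
// the matching double-precision forward 16x16 transform accumulates over its
// row and column passes.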

#else

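// Fixed-point 16-point IDCT. Throughout idct16_1d() (and idct32_1d() further
// down) a group of statements of the form
//   temp1 = a * cospi_M_64 - b * cospi_N_64;
//   temp2 = a * cospi_N_64 + b * cospi_M_64;
//   out_a = dct_const_round_shift(temp1);
//   out_b = dct_const_round_shift(temp2);
// with M + N == 32 performs one butterfly rotation: cospi_M_64 and
// cospi_N_64 act as cos() and sin() of the same angle in Q14, and
// dct_const_round_shift(), assumed to be defined earlier in this file,
// rounds the products back down by DCT_CONST_BITS.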
void idct16_1d(int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}

void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  int16_t out[16 * 16];
  int16_t *outptr = &out[0];
  const int short_pitch = pitch >> 1;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_1d(input, outptr);
    input += short_pitch;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      output[j * 16 + i] = (temp_out[j] + 32) >> 6;
  }
}
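
// Illustrative call (not part of the library): apply the inverse transform to
// a 16x16 block of dequantized coefficients stored with a row stride of 16
// int16_t values, i.e. 32 bytes, since `pitch` is a byte stride. The names
// `coeffs` and `residual` are placeholders.
//
//   int16_t coeffs[16 * 16];    // dequantized transform coefficients
//   int16_t residual[16 * 16];  // reconstructed residual
//   vp9_short_idct16x16_c(coeffs, residual, 32);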

void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
  int16_t out[16 * 16];
  int16_t *outptr = &out[0];
  const int short_pitch = pitch >> 1;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  /* First transform rows. Since all non-zero dct coefficients are in
   * upper-left 4x4 area, we only need to calculate first 4 rows here.
   */
  vpx_memset(out, 0, sizeof(out));
  for (i = 0; i < 4; ++i) {
    idct16_1d(input, outptr);
    input += short_pitch;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      output[j * 16 + i] = (temp_out[j] + 32) >> 6;
  }
}
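
// Fast path for sparse blocks: with the non-zero coefficients confined to the
// top-left 4x4 corner (at most ten of them, presumably the origin of the
// "idct10" name), only the first four row transforms can produce non-zero
// output, so the remaining rows keep the zero value set by vpx_memset().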

void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
  int tmp;
  int16_t out;
  tmp = input[0] * cospi_16_64;
  out = dct_const_round_shift(tmp);
  tmp = out * cospi_16_64;
  out = dct_const_round_shift(tmp);
  *output = (out + 32) >> 6;
}
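
// DC-only inverse: multiplying by cospi_16_64 (cos(pi/4) in Q14) and
// round-shifting once per dimension reproduces the DC gain of the row and
// column passes of idct16_1d(), and the final (out + 32) >> 6 matches the
// output scaling used by vp9_short_idct16x16_c() above.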

#endif

#if !CONFIG_DWTDCTHYBRID
void idct32_1d(int16_t *input, int16_t *output) {
  int16_t step1[32], step2[32];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = dct_const_round_shift(temp1);
  step1[31] = dct_const_round_shift(temp2);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  step2[16] = step1[16] + step1[17];
  step2[17] = step1[16] - step1[17];
  step2[18] = -step1[18] + step1[19];
  step2[19] = step1[18] + step1[19];
  step2[20] = step1[20] + step1[21];
  step2[21] = step1[20] - step1[21];
  step2[22] = -step1[22] + step1[23];
  step2[23] = step1[22] + step1[23];
  step2[24] = step1[24] + step1[25];
  step2[25] = step1[24] - step1[25];
  step2[26] = -step1[26] + step1[27];
  step2[27] = step1[26] + step1[27];
  step2[28] = step1[28] + step1[29];
  step2[29] = step1[28] - step1[29];
  step2[30] = -step1[30] + step1[31];
  step2[31] = step1[30] + step1[31];

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = step1[16] + step1[19];
  step2[17] = step1[17] + step1[18];
  step2[18] = step1[17] - step1[18];
  step2[19] = step1[16] - step1[19];
  step2[20] = -step1[20] + step1[23];
  step2[21] = -step1[21] + step1[22];
  step2[22] = step1[21] + step1[22];
  step2[23] = step1[20] + step1[23];

  step2[24] = step1[24] + step1[27];
  step2[25] = step1[25] + step1[26];
  step2[26] = step1[25] - step1[26];
  step2[27] = step1[24] - step1[27];
  step2[28] = -step1[28] + step1[31];
  step2[29] = -step1[29] + step1[30];
  step2[30] = step1[29] + step1[30];
  step2[31] = step1[28] + step1[31];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = step1[16] + step1[23];
  step2[17] = step1[17] + step1[22];
  step2[18] = step1[18] + step1[21];
  step2[19] = step1[19] + step1[20];
  step2[20] = step1[19] - step1[20];
  step2[21] = step1[18] - step1[21];
  step2[22] = step1[17] - step1[22];
  step2[23] = step1[16] - step1[23];

  step2[24] = -step1[24] + step1[31];
  step2[25] = -step1[25] + step1[30];
  step2[26] = -step1[26] + step1[29];
  step2[27] = -step1[27] + step1[28];
  step2[28] = step1[27] + step1[28];
  step2[29] = step1[26] + step1[29];
  step2[30] = step1[25] + step1[30];
  step2[31] = step1[24] + step1[31];

  // stage 7
  step1[0] = step2[0] + step2[15];
  step1[1] = step2[1] + step2[14];
  step1[2] = step2[2] + step2[13];
  step1[3] = step2[3] + step2[12];
  step1[4] = step2[4] + step2[11];
  step1[5] = step2[5] + step2[10];
  step1[6] = step2[6] + step2[9];
  step1[7] = step2[7] + step2[8];
  step1[8] = step2[7] - step2[8];
  step1[9] = step2[6] - step2[9];
  step1[10] = step2[5] - step2[10];
  step1[11] = step2[4] - step2[11];
  step1[12] = step2[3] - step2[12];
  step1[13] = step2[2] - step2[13];
  step1[14] = step2[1] - step2[14];
  step1[15] = step2[0] - step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = step1[0] + step1[31];
  output[1] = step1[1] + step1[30];
  output[2] = step1[2] + step1[29];
  output[3] = step1[3] + step1[28];
  output[4] = step1[4] + step1[27];
  output[5] = step1[5] + step1[26];
  output[6] = step1[6] + step1[25];
  output[7] = step1[7] + step1[24];
  output[8] = step1[8] + step1[23];
  output[9] = step1[9] + step1[22];
  output[10] = step1[10] + step1[21];
  output[11] = step1[11] + step1[20];
  output[12] = step1[12] + step1[19];
  output[13] = step1[13] + step1[18];
  output[14] = step1[14] + step1[17];
  output[15] = step1[15] + step1[16];
  output[16] = step1[15] - step1[16];
  output[17] = step1[14] - step1[17];
  output[18] = step1[13] - step1[18];
  output[19] = step1[12] - step1[19];
  output[20] = step1[11] - step1[20];
  output[21] = step1[10] - step1[21];
  output[22] = step1[9] - step1[22];
  output[23] = step1[8] - step1[23];
  output[24] = step1[7] - step1[24];
  output[25] = step1[6] - step1[25];
  output[26] = step1[5] - step1[26];
  output[27] = step1[4] - step1[27];
  output[28] = step1[3] - step1[28];
  output[29] = step1[2] - step1[29];
  output[30] = step1[1] - step1[30];
  output[31] = step1[0] - step1[31];
}
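
// idct32_1d() follows the same butterfly structure as idct16_1d() above,
// extended to 32 points: the even-indexed inputs feed the 16-point half of
// the network, while the odd-indexed inputs (step1[16]..step1[31]) pass
// through additional rotation/butterfly stages before the final add/subtract
// stage. All rotations use the cospi_*_64 constants together with
// dct_const_round_shift().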

void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
  int16_t out[32 * 32];
  int16_t *outptr = &out[0];
  const int short_pitch = pitch >> 1;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // First transform rows
  for (i = 0; i < 32; ++i) {
    idct32_1d(input, outptr);
    input += short_pitch;
    outptr += 32;
  }
  // Then transform columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      output[j * 32 + i] = (temp_out[j] + 32) >> 6;
  }
}

#else  // !CONFIG_DWTDCTHYBRID

#if DWT_TYPE == 53

// Note: block length must be even for this implementation
static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  int16_t r, *a, *b;
  int n;

  n = length >> 1;
  b = highpass;
  a = lowpass;
  r = *highpass;
  while (n--) {
    *a++ -= (r + (*b) + 1) >> 1;
    r = *b++;
  }

  n = length >> 1;
  b = highpass;
  a = lowpass;
  while (--n) {
    *x++ = ((r = *a++) + 1) >> 1;
    *x++ = *b++ + ((r + (*a) + 2) >> 2);
  }
  *x++ = ((r = *a) + 1) >> 1;
  *x++ = *b + ((r + 1) >> 1);
}

static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  int16_t r, *a, *b;
  int n;

  n = length >> 1;
  b = highpass;
  a = lowpass;
  r = *highpass;
  while (n--) {
    *a++ -= (r + (*b) + 1) >> 1;
    r = *b++;
  }

  n = length >> 1;
  b = highpass;
  a = lowpass;
  while (--n) {
    r = *a++;
    *x++ = r;
    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
  }
  *x++ = *a;
  *x++ = ((*b) << 1) + *a;
}
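
// synthesis_53_row() and synthesis_53_col() invert one level of what is
// presumably a LeGall 5/3 integer lifting transform: the first loop in each
// routine undoes the update step that formed the lowpass band, and the
// second loop interleaves lowpass/highpass samples while undoing the predict
// step. The two variants differ only in how they fold in the fixed-point
// scaling (note the extra >> 1 applied to the lowpass in the row version and
// the << 1 applied to the highpass in the column version).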

static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
                                 int pitch_c, int16_t *x, int pitch_x) {
  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
  int16_t buffer[2 * DWT_MAX_LENGTH];

  th[0] = hh;
  tw[0] = hw;
  for (i = 1; i <= levels; i++) {
    th[i] = (th[i - 1] + 1) >> 1;
    tw[i] = (tw[i - 1] + 1) >> 1;
  }
  for (lv = levels - 1; lv >= 0; lv--) {
    nh = th[lv];
    nw = tw[lv];
    hh = th[lv + 1];
    hw = tw[lv + 1];
    if ((nh < 2) || (nw < 2)) continue;
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i] = c[i * pitch_c + j];
      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
      for (i = 0; i < nh; i++)
        c[i * pitch_c + j] = buffer[i + nh];
    }
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
    }
  }
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
    }
  }
}
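
// The dyadic synthesis walks the decomposition from the coarsest level to
// the finest: th[]/tw[] hold the band heights/widths at each level, each
// level is inverted column-wise and then row-wise through the temporary
// `buffer`, and the final double loop removes the DWT_PRECISION_BITS
// fixed-point scaling, rounding symmetrically for negative values.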

#elif DWT_TYPE == 26

// Note: block length must be even for this implementation
static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  int16_t r, s, *a, *b;
  int i, n = length >> 1;

  if (n >= 4) {
    a = lowpass;
    b = highpass;
    r = *lowpass;
    while (--n) {
      *b++ += (r - a[1] + 4) >> 3;
      r = *a++;
    }
    *b += (r - *a + 4) >> 3;
  }
  a = lowpass;
  b = highpass;
  for (i = length >> 1; i; i--) {
    s = *b++;
    r = *a++;
    *x++ = (r + s + 1) >> 1;
    *x++ = (r - s + 1) >> 1;
  }
}
|
|
|
|
|
2012-12-19 00:31:19 +01:00
|
|
|
static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
|
|
|
|
int16_t *x) {
|
|
|
|
int16_t r, s, *a, *b;
|
2012-12-12 02:06:35 +01:00
|
|
|
int i, n = length >> 1;
|
|
|
|
|
|
|
|
if (n >= 4) {
|
|
|
|
a = lowpass;
|
|
|
|
b = highpass;
|
|
|
|
r = *lowpass;
|
|
|
|
while (--n) {
|
|
|
|
*b++ += (r - a[1] + 4) >> 3;
|
|
|
|
r = *a++;
|
|
|
|
}
|
|
|
|
*b += (r - *a + 4) >> 3;
|
|
|
|
}
|
|
|
|
a = lowpass;
|
|
|
|
b = highpass;
|
|
|
|
for (i = length >> 1; i; i--) {
|
|
|
|
s = *b++;
|
|
|
|
r = *a++;
|
|
|
|
*x++ = r + s;
|
|
|
|
*x++ = r - s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-01-08 21:18:16 +01:00
|
|
|
static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
|
|
|
|
int pitch_c, int16_t *x, int pitch_x) {
|
2012-12-12 02:06:35 +01:00
|
|
|
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
|
2012-12-19 00:31:19 +01:00
|
|
|
int16_t buffer[2 * DWT_MAX_LENGTH];
|
2012-12-12 02:06:35 +01:00
|
|
|
|
|
|
|
th[0] = hh;
|
|
|
|
tw[0] = hw;
|
|
|
|
for (i = 1; i <= levels; i++) {
|
|
|
|
th[i] = (th[i - 1] + 1) >> 1;
|
|
|
|
tw[i] = (tw[i - 1] + 1) >> 1;
|
|
|
|
}
|
|
|
|
for (lv = levels - 1; lv >= 0; lv--) {
|
|
|
|
nh = th[lv];
|
|
|
|
nw = tw[lv];
|
|
|
|
hh = th[lv + 1];
|
|
|
|
hw = tw[lv + 1];
|
|
|
|
if ((nh < 2) || (nw < 2)) continue;
|
|
|
|
for (j = 0; j < nw; j++) {
|
|
|
|
for (i = 0; i < nh; i++)
|
|
|
|
buffer[i] = c[i * pitch_c + j];
|
|
|
|
synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
|
|
|
|
for (i = 0; i < nh; i++)
|
|
|
|
c[i * pitch_c + j] = buffer[i + nh];
|
|
|
|
}
|
|
|
|
for (i = 0; i < nh; i++) {
|
|
|
|
memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
|
|
|
|
synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 0; i < height; i++) {
|
|
|
|
for (j = 0; j < width; j++) {
|
|
|
|
x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
|
|
|
|
((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
|
|
|
|
-((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#elif DWT_TYPE == 97
|
|
|
|
|
|
|
|
static void synthesis_97(int length, double *lowpass, double *highpass,
|
|
|
|
double *x) {
|
|
|
|
static const double a_predict1 = -1.586134342;
|
|
|
|
static const double a_update1 = -0.05298011854;
|
|
|
|
static const double a_predict2 = 0.8829110762;
|
|
|
|
static const double a_update2 = 0.4435068522;
|
|
|
|
static const double s_low = 1.149604398;
|
|
|
|
static const double s_high = 1/1.149604398;
|
|
|
|
const double inv_s_low = 1 / s_low;   /* not static: s_low is not a constant expression in C */
|
|
|
|
const double inv_s_high = 1 / s_high;
|
|
|
|
int i;
|
|
|
|
double y[DWT_MAX_LENGTH];
|
|
|
|
// Undo pack and scale
|
|
|
|
for (i = 0; i < length / 2; i++) {
|
|
|
|
y[i * 2] = lowpass[i] * inv_s_low;
|
|
|
|
y[i * 2 + 1] = highpass[i] * inv_s_high;
|
|
|
|
}
|
|
|
|
memcpy(x, y, sizeof(*y) * length);
|
|
|
|
// Undo update 2
|
|
|
|
for (i = 2; i < length; i += 2) {
|
|
|
|
x[i] -= a_update2 * (x[i-1] + x[i+1]);
|
|
|
|
}
|
|
|
|
x[0] -= 2 * a_update2 * x[1];
|
|
|
|
// Undo predict 2
|
|
|
|
for (i = 1; i < length - 2; i += 2) {
|
|
|
|
x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
|
|
|
|
}
|
|
|
|
x[length - 1] -= 2 * a_predict2 * x[length - 2];
|
|
|
|
// Undo update 1
|
|
|
|
for (i = 2; i < length; i += 2) {
|
|
|
|
x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
|
|
|
|
}
|
|
|
|
x[0] -= 2 * a_update1 * x[1];
|
|
|
|
// Undo predict 1
|
|
|
|
for (i = 1; i < length - 2; i += 2) {
|
|
|
|
x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
|
|
|
|
}
|
|
|
|
x[length - 1] -= 2 * a_predict1 * x[length - 2];
|
|
|
|
}
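As a reading aid for the inverse lifting order used above (undo update 2, undo predict 2, undo update 1, undo predict 1), here is a toy Haar-style lifting round trip. It is not the 9/7 filter itself, and the helper name is illustrative only.

static void lifting_roundtrip_sketch(void) {
  const int even = 10, odd = 13;
  /* forward lifting: predict, then update */
  const int d = odd - even;       /* highpass (predict) */
  const int s = even + (d >> 1);  /* lowpass  (update)  */
  /* inverse lifting: undo the steps in reverse order with the signs flipped */
  const int even2 = s - (d >> 1);
  const int odd2 = d + even2;
  assert(even2 == even && odd2 == odd);
}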
|
|
|
|
|
2013-01-08 21:18:16 +01:00
|
|
|
static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
|
|
|
|
int pitch_c, int16_t *x, int pitch_x) {
|
2012-12-12 02:06:35 +01:00
|
|
|
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
|
|
|
|
double buffer[2 * DWT_MAX_LENGTH];
|
|
|
|
double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
|
|
|
|
|
|
|
|
th[0] = hh;
|
|
|
|
tw[0] = hw;
|
|
|
|
for (i = 1; i <= levels; i++) {
|
|
|
|
th[i] = (th[i - 1] + 1) >> 1;
|
|
|
|
tw[i] = (tw[i - 1] + 1) >> 1;
|
|
|
|
}
|
|
|
|
for (lv = levels - 1; lv >= 0; lv--) {
|
|
|
|
nh = th[lv];
|
|
|
|
nw = tw[lv];
|
|
|
|
hh = th[lv + 1];
|
|
|
|
hw = tw[lv + 1];
|
|
|
|
if ((nh < 2) || (nw < 2)) continue;
|
|
|
|
for (j = 0; j < nw; j++) {
|
|
|
|
for (i = 0; i < nh; i++)
|
|
|
|
buffer[i] = c[i * pitch_c + j];
|
|
|
|
synthesis_97(nh, buffer, buffer + hh, buffer + nh);
|
|
|
|
for (i = 0; i < nh; i++)
|
|
|
|
y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
|
|
|
|
}
|
|
|
|
for (i = 0; i < nh; i++) {
|
|
|
|
memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
|
|
|
|
synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
|
|
|
|
}
|
|
|
|
}
|
2012-12-07 23:45:05 +01:00
|
|
|
for (i = 0; i < height; i++)
|
|
|
|
for (j = 0; j < width; j++)
|
2012-12-12 02:06:35 +01:00
|
|
|
x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
|
|
|
|
(1 << DWT_PRECISION_BITS));
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // DWT_TYPE
|
|
|
|
|
|
|
|
// TODO(debargha): Implement scaling differently so as not to have to use the
|
|
|
|
// floating point 16x16 dct
|
|
|
|
static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
|
|
|
|
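// The constants below are cos(k * pi / 32) for k = 1..15; C8 = cos(pi/4) = 1/sqrt(2).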
static const double C1 = 0.995184726672197;
|
|
|
|
static const double C2 = 0.98078528040323;
|
|
|
|
static const double C3 = 0.956940335732209;
|
|
|
|
static const double C4 = 0.923879532511287;
|
|
|
|
static const double C5 = 0.881921264348355;
|
|
|
|
static const double C6 = 0.831469612302545;
|
|
|
|
static const double C7 = 0.773010453362737;
|
|
|
|
static const double C8 = 0.707106781186548;
|
|
|
|
static const double C9 = 0.634393284163646;
|
|
|
|
static const double C10 = 0.555570233019602;
|
|
|
|
static const double C11 = 0.471396736825998;
|
|
|
|
static const double C12 = 0.38268343236509;
|
|
|
|
static const double C13 = 0.290284677254462;
|
|
|
|
static const double C14 = 0.195090322016128;
|
|
|
|
static const double C15 = 0.098017140329561;
|
|
|
|
|
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
|
|
|
{
|
|
|
|
double step[16];
|
|
|
|
double intermediate[16];
|
|
|
|
double temp1, temp2;
|
|
|
|
|
|
|
|
|
|
|
|
// step 1 and 2
|
|
|
|
step[ 0] = input[0] + input[8];
|
|
|
|
step[ 1] = input[0] - input[8];
|
|
|
|
|
|
|
|
temp1 = input[4]*C12;
|
|
|
|
temp2 = input[12]*C4;
|
|
|
|
|
|
|
|
temp1 -= temp2;
|
|
|
|
temp1 *= C8;
|
|
|
|
|
|
|
|
step[ 2] = 2*(temp1);
|
|
|
|
|
|
|
|
temp1 = input[4]*C4;
|
|
|
|
temp2 = input[12]*C12;
|
|
|
|
temp1 += temp2;
|
|
|
|
temp1 = (temp1);
|
|
|
|
temp1 *= C8;
|
|
|
|
step[ 3] = 2*(temp1);
|
|
|
|
|
|
|
|
temp1 = input[2]*C8;
|
|
|
|
temp1 = 2*(temp1);
|
|
|
|
temp2 = input[6] + input[10];
|
|
|
|
|
|
|
|
step[ 4] = temp1 + temp2;
|
|
|
|
step[ 5] = temp1 - temp2;
|
|
|
|
|
|
|
|
temp1 = input[14]*C8;
|
|
|
|
temp1 = 2*(temp1);
|
|
|
|
temp2 = input[6] - input[10];
|
|
|
|
|
|
|
|
step[ 6] = temp2 - temp1;
|
|
|
|
step[ 7] = temp2 + temp1;
|
|
|
|
|
|
|
|
// for odd input
|
|
|
|
temp1 = input[3]*C12;
|
|
|
|
temp2 = input[13]*C4;
|
|
|
|
temp1 += temp2;
|
|
|
|
temp1 = (temp1);
|
|
|
|
temp1 *= C8;
|
|
|
|
intermediate[ 8] = 2*(temp1);
|
|
|
|
|
|
|
|
temp1 = input[3]*C4;
|
|
|
|
temp2 = input[13]*C12;
|
|
|
|
temp2 -= temp1;
|
|
|
|
temp2 = (temp2);
|
|
|
|
temp2 *= C8;
|
|
|
|
intermediate[ 9] = 2*(temp2);
|
|
|
|
|
|
|
|
intermediate[10] = 2*(input[9]*C8);
|
|
|
|
intermediate[11] = input[15] - input[1];
|
|
|
|
intermediate[12] = input[15] + input[1];
|
|
|
|
intermediate[13] = 2*((input[7]*C8));
|
|
|
|
|
|
|
|
temp1 = input[11]*C12;
|
|
|
|
temp2 = input[5]*C4;
|
|
|
|
temp2 -= temp1;
|
|
|
|
temp2 = (temp2);
|
|
|
|
temp2 *= C8;
|
|
|
|
intermediate[14] = 2*(temp2);
|
|
|
|
|
|
|
|
temp1 = input[11]*C4;
|
|
|
|
temp2 = input[5]*C12;
|
|
|
|
temp1 += temp2;
|
|
|
|
temp1 = (temp1);
|
|
|
|
temp1 *= C8;
|
|
|
|
intermediate[15] = 2*(temp1);
|
|
|
|
|
|
|
|
step[ 8] = intermediate[ 8] + intermediate[14];
|
|
|
|
step[ 9] = intermediate[ 9] + intermediate[15];
|
|
|
|
step[10] = intermediate[10] + intermediate[11];
|
|
|
|
step[11] = intermediate[10] - intermediate[11];
|
|
|
|
step[12] = intermediate[12] + intermediate[13];
|
|
|
|
step[13] = intermediate[12] - intermediate[13];
|
|
|
|
step[14] = intermediate[ 8] - intermediate[14];
|
|
|
|
step[15] = intermediate[ 9] - intermediate[15];
|
|
|
|
|
|
|
|
// step 3
|
|
|
|
output[0] = step[ 0] + step[ 3];
|
|
|
|
output[1] = step[ 1] + step[ 2];
|
|
|
|
output[2] = step[ 1] - step[ 2];
|
|
|
|
output[3] = step[ 0] - step[ 3];
|
|
|
|
|
|
|
|
temp1 = step[ 4]*C14;
|
|
|
|
temp2 = step[ 7]*C2;
|
|
|
|
temp1 -= temp2;
|
|
|
|
output[4] = (temp1);
|
|
|
|
|
|
|
|
temp1 = step[ 4]*C2;
|
|
|
|
temp2 = step[ 7]*C14;
|
|
|
|
temp1 += temp2;
|
|
|
|
output[7] = (temp1);
|
|
|
|
|
|
|
|
temp1 = step[ 5]*C10;
|
|
|
|
temp2 = step[ 6]*C6;
|
|
|
|
temp1 -= temp2;
|
|
|
|
output[5] = (temp1);
|
|
|
|
|
|
|
|
temp1 = step[ 5]*C6;
|
|
|
|
temp2 = step[ 6]*C10;
|
|
|
|
temp1 += temp2;
|
|
|
|
output[6] = (temp1);
|
|
|
|
|
|
|
|
output[8] = step[ 8] + step[11];
|
|
|
|
output[9] = step[ 9] + step[10];
|
|
|
|
output[10] = step[ 9] - step[10];
|
|
|
|
output[11] = step[ 8] - step[11];
|
|
|
|
output[12] = step[12] + step[15];
|
|
|
|
output[13] = step[13] + step[14];
|
|
|
|
output[14] = step[13] - step[14];
|
|
|
|
output[15] = step[12] - step[15];
|
|
|
|
|
|
|
|
// step 4
|
|
|
|
step[ 0] = output[0] + output[7];
|
|
|
|
step[ 1] = output[1] + output[6];
|
|
|
|
step[ 2] = output[2] + output[5];
|
|
|
|
step[ 3] = output[3] + output[4];
|
|
|
|
step[ 4] = output[3] - output[4];
|
|
|
|
step[ 5] = output[2] - output[5];
|
|
|
|
step[ 6] = output[1] - output[6];
|
|
|
|
step[ 7] = output[0] - output[7];
|
|
|
|
|
|
|
|
temp1 = output[8]*C7;
|
|
|
|
temp2 = output[15]*C9;
|
|
|
|
temp1 -= temp2;
|
|
|
|
step[ 8] = (temp1);
|
|
|
|
|
|
|
|
temp1 = output[9]*C11;
|
|
|
|
temp2 = output[14]*C5;
|
|
|
|
temp1 += temp2;
|
|
|
|
step[ 9] = (temp1);
|
|
|
|
|
|
|
|
temp1 = output[10]*C3;
|
|
|
|
temp2 = output[13]*C13;
|
|
|
|
temp1 -= temp2;
|
|
|
|
step[10] = (temp1);
|
|
|
|
|
|
|
|
temp1 = output[11]*C15;
|
|
|
|
temp2 = output[12]*C1;
|
|
|
|
temp1 += temp2;
|
|
|
|
step[11] = (temp1);
|
|
|
|
|
|
|
|
temp1 = output[11]*C1;
|
|
|
|
temp2 = output[12]*C15;
|
|
|
|
temp2 -= temp1;
|
|
|
|
step[12] = (temp2);
|
|
|
|
|
|
|
|
temp1 = output[10]*C13;
|
|
|
|
temp2 = output[13]*C3;
|
|
|
|
temp1 += temp2;
|
|
|
|
step[13] = (temp1);
|
|
|
|
|
|
|
|
temp1 = output[9]*C5;
|
|
|
|
temp2 = output[14]*C11;
|
|
|
|
temp2 -= temp1;
|
|
|
|
step[14] = (temp2);
|
|
|
|
|
|
|
|
temp1 = output[8]*C9;
|
|
|
|
temp2 = output[15]*C7;
|
|
|
|
temp1 += temp2;
|
|
|
|
step[15] = (temp1);
|
|
|
|
|
|
|
|
// step 5
|
|
|
|
output[0] = (step[0] + step[15]);
|
|
|
|
output[1] = (step[1] + step[14]);
|
|
|
|
output[2] = (step[2] + step[13]);
|
|
|
|
output[3] = (step[3] + step[12]);
|
|
|
|
output[4] = (step[4] + step[11]);
|
|
|
|
output[5] = (step[5] + step[10]);
|
|
|
|
output[6] = (step[6] + step[ 9]);
|
|
|
|
output[7] = (step[7] + step[ 8]);
|
|
|
|
|
|
|
|
output[15] = (step[0] - step[15]);
|
|
|
|
output[14] = (step[1] - step[14]);
|
|
|
|
output[13] = (step[2] - step[13]);
|
|
|
|
output[12] = (step[3] - step[12]);
|
|
|
|
output[11] = (step[4] - step[11]);
|
|
|
|
output[10] = (step[5] - step[10]);
|
|
|
|
output[9] = (step[6] - step[ 9]);
|
|
|
|
output[8] = (step[7] - step[ 8]);
|
|
|
|
}
|
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
|
|
|
}
|
|
|
|
|
2013-01-08 21:18:16 +01:00
|
|
|
static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
|
|
|
|
int scale) {
|
2012-12-12 02:06:35 +01:00
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
|
|
|
{
|
|
|
|
double out[16*16], out2[16*16];
|
|
|
|
const int short_pitch = pitch >> 1;
|
|
|
|
int i, j;
|
|
|
|
// First transform rows
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
double temp_in[16], temp_out[16];
|
|
|
|
for (j = 0; j < 16; ++j)
|
|
|
|
temp_in[j] = input[j + i*short_pitch];
|
|
|
|
butterfly_16x16_idct_1d_f(temp_in, temp_out);
|
|
|
|
for (j = 0; j < 16; ++j)
|
|
|
|
out[j + i*16] = temp_out[j];
|
|
|
|
}
|
|
|
|
// Then transform columns
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
double temp_in[16], temp_out[16];
|
|
|
|
for (j = 0; j < 16; ++j)
|
|
|
|
temp_in[j] = out[j*16 + i];
|
|
|
|
butterfly_16x16_idct_1d_f(temp_in, temp_out);
|
|
|
|
for (j = 0; j < 16; ++j)
|
|
|
|
out2[j*16 + i] = temp_out[j];
|
|
|
|
}
|
|
|
|
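// Normalization: each unit of 'scale' halves the divisor below, doubling the output relative to the plain /128 case.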
for (i = 0; i < 16*16; ++i)
|
2013-01-08 21:18:16 +01:00
|
|
|
output[i] = round(out2[i] / (128 >> scale));
|
2012-12-12 02:06:35 +01:00
|
|
|
}
|
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
2012-12-07 23:45:05 +01:00
|
|
|
}
|
|
|
|
|
2013-01-09 15:26:54 +01:00
|
|
|
static void idct8_1d(double *x) {
|
|
|
|
int i, j;
|
|
|
|
double t[8];
|
|
|
|
static const double idctmat[64] = {
|
|
|
|
0.35355339059327, 0.49039264020162, 0.46193976625564, 0.41573480615127,
|
|
|
|
0.35355339059327, 0.2777851165098, 0.19134171618254, 0.097545161008064,
|
|
|
|
0.35355339059327, 0.41573480615127, 0.19134171618254, -0.097545161008064,
|
|
|
|
-0.35355339059327, -0.49039264020161, -0.46193976625564, -0.2777851165098,
|
|
|
|
0.35355339059327, 0.2777851165098, -0.19134171618254, -0.49039264020162,
|
|
|
|
-0.35355339059327, 0.097545161008064, 0.46193976625564, 0.41573480615127,
|
|
|
|
0.35355339059327, 0.097545161008063, -0.46193976625564, -0.2777851165098,
|
|
|
|
0.35355339059327, 0.41573480615127, -0.19134171618254, -0.49039264020162,
|
|
|
|
0.35355339059327, -0.097545161008063, -0.46193976625564, 0.2777851165098,
|
|
|
|
0.35355339059327, -0.41573480615127, -0.19134171618255, 0.49039264020162,
|
|
|
|
0.35355339059327, -0.2777851165098, -0.19134171618254, 0.49039264020161,
|
|
|
|
-0.35355339059327, -0.097545161008064, 0.46193976625564, -0.41573480615127,
|
|
|
|
0.35355339059327, -0.41573480615127, 0.19134171618254, 0.097545161008065,
|
|
|
|
-0.35355339059327, 0.49039264020162, -0.46193976625564, 0.2777851165098,
|
|
|
|
0.35355339059327, -0.49039264020162, 0.46193976625564, -0.41573480615127,
|
|
|
|
0.35355339059327, -0.2777851165098, 0.19134171618255, -0.097545161008064
|
|
|
|
};
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
t[i] = 0;
|
|
|
|
for (j = 0; j < 8; ++j)
|
|
|
|
t[i] += idctmat[i * 8 + j] * x[j];
|
|
|
|
}
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
x[i] = t[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
|
|
|
|
int scale) {
|
|
|
|
double X[8 * 8], Y[8];
|
|
|
|
int i, j;
|
|
|
|
int shortpitch = pitch >> 1;
|
|
|
|
|
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
|
|
|
{
|
|
|
|
for (i = 0; i < 8; i++) {
|
|
|
|
for (j = 0; j < 8; j++) {
|
|
|
|
X[i * 8 + j] = (double)coefs[i * shortpitch + j];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 0; i < 8; i++)
|
|
|
|
idct8_1d(X + 8 * i);
|
|
|
|
for (i = 0; i < 8; i++) {
|
|
|
|
for (j = 0; j < 8; ++j)
|
|
|
|
Y[j] = X[i + 8 * j];
|
|
|
|
idct8_1d(Y);
|
|
|
|
for (j = 0; j < 8; ++j)
|
|
|
|
X[i + 8 * j] = Y[j];
|
|
|
|
}
|
|
|
|
for (i = 0; i < 8; i++) {
|
|
|
|
for (j = 0; j < 8; j++) {
|
|
|
|
block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
vp9_clear_system_state(); // Make it simd safe : __asm emms;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define multiply_bits(d, n) ((n) < 0 ? (d) >> (-(n)) : (d) << (n))
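A quick sanity illustration of the sign convention, assuming the macro above; the function name is mine, not part of the codec.

static void multiply_bits_sketch(void) {
  assert(multiply_bits(3, 2) == 12);   /* n >= 0: shift left, multiply by 2^n */
  assert(multiply_bits(12, -2) == 3);  /* n <  0: shift right, divide by 2^-n */
}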
|
|
|
|
|
|
|
|
#if DWTDCT_TYPE == DWTDCT16X16_LEAN
|
|
|
|
|
2012-12-19 00:31:19 +01:00
|
|
|
void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
|
2013-01-08 21:18:16 +01:00
|
|
|
// assume output is a 32x32 buffer
|
2012-12-12 02:06:35 +01:00
|
|
|
// Temporary buffer to hold a 16x16 block for 16x16 inverse dct
|
2012-12-19 00:31:19 +01:00
|
|
|
int16_t buffer[16 * 16];
|
2012-12-12 02:06:35 +01:00
|
|
|
// Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
|
2012-12-19 00:31:19 +01:00
|
|
|
int16_t buffer2[32 * 32];
|
2012-12-12 02:06:35 +01:00
|
|
|
// Note: pitch is in bytes, short_pitch is in short units
|
2012-12-07 23:45:05 +01:00
|
|
|
const int short_pitch = pitch >> 1;
|
2013-01-09 15:26:54 +01:00
|
|
|
int i, j;
|
|
|
|
|
|
|
|
// TODO(debargha): Implement more efficiently by adding output pitch
|
|
|
|
// argument to the idct16x16 function
|
|
|
|
vp9_short_idct16x16_c_f(input, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
|
|
|
|
}
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
for (j = 16; j < 32; ++j) {
|
|
|
|
buffer2[i * 32 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 16; i < 32; ++i) {
|
|
|
|
for (j = 0; j < 32; ++j) {
|
|
|
|
buffer2[i * 32 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#if DWT_TYPE == 26
|
|
|
|
dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 97
|
|
|
|
dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 53
|
|
|
|
dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
#elif DWTDCT_TYPE == DWTDCT16X16
|
|
|
|
|
|
|
|
void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
|
|
|
|
// assume output is a 32x32 buffer
|
|
|
|
// Temporary buffer to hold a 16x16 block for 16x16 inverse dct
|
|
|
|
int16_t buffer[16 * 16];
|
|
|
|
// Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
|
|
|
|
int16_t buffer2[32 * 32];
|
|
|
|
// Note: pitch is in bytes, short_pitch is in short units
|
|
|
|
const int short_pitch = pitch >> 1;
|
|
|
|
int i, j;
|
2012-12-12 02:06:35 +01:00
|
|
|
|
2012-12-07 23:45:05 +01:00
|
|
|
// TODO(debargha): Implement more efficiently by adding output pitch
|
|
|
|
// argument to the idct16x16 function
|
2013-01-08 21:18:16 +01:00
|
|
|
vp9_short_idct16x16_c_f(input, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
2012-12-12 02:06:35 +01:00
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
|
|
|
|
}
|
2013-01-08 21:18:16 +01:00
|
|
|
vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
2012-12-07 23:45:05 +01:00
|
|
|
for (i = 0; i < 16; ++i) {
|
2012-12-12 02:06:35 +01:00
|
|
|
vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
|
2012-12-07 23:45:05 +01:00
|
|
|
}
|
2013-01-08 21:18:16 +01:00
|
|
|
vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
2012-12-12 02:06:35 +01:00
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
|
|
|
|
sizeof(*buffer2) * 16);
|
2012-12-07 23:45:05 +01:00
|
|
|
}
|
2013-01-08 21:18:16 +01:00
|
|
|
vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
2012-12-12 02:06:35 +01:00
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
|
|
|
|
sizeof(*buffer2) * 16);
|
|
|
|
}
|
|
|
|
#if DWT_TYPE == 26
|
|
|
|
dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 97
|
|
|
|
dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 53
|
|
|
|
dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#endif
|
2012-12-07 23:45:05 +01:00
|
|
|
}
|
2013-01-08 21:18:16 +01:00
|
|
|
|
2013-01-09 15:26:54 +01:00
|
|
|
#elif DWTDCT_TYPE == DWTDCT8X8
|
|
|
|
|
|
|
|
void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
|
|
|
|
// assume output is a 32x32 buffer
|
|
|
|
// Temporary buffer to hold an 8x8 block for 8x8 inverse dct
|
|
|
|
int16_t buffer[8 * 8];
|
|
|
|
// Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
|
|
|
|
int16_t buffer2[32 * 32];
|
|
|
|
// Note: pitch is in bytes, short_pitch is in short units
|
|
|
|
const int short_pitch = pitch >> 1;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
// TODO(debargha): Implement more efficiently by adding output pitch
|
|
|
|
// argument to the idct8x8 function
|
|
|
|
vp9_short_idct8x8_c_f(input, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
|
|
|
|
}
|
|
|
|
vp9_short_idct8x8_c_f(input + 8, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
|
|
|
|
}
|
|
|
|
vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
|
|
|
|
sizeof(*buffer2) * 8);
|
|
|
|
}
|
|
|
|
vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,
|
|
|
|
1 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 8; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,
|
|
|
|
sizeof(*buffer2) * 8);
|
|
|
|
}
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
for (j = 16; j < 32; ++j) {
|
|
|
|
buffer2[i * 32 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 16; i < 32; ++i) {
|
|
|
|
for (j = 0; j < 32; ++j) {
|
|
|
|
buffer2[i * 32 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#if DWT_TYPE == 26
|
|
|
|
dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 97
|
|
|
|
dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#elif DWT_TYPE == 53
|
|
|
|
dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
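A hedged usage sketch for the 32x32 hybrid inverse defined above; the harness name and the zero-filled input are mine, not codec API. Note that pitch is passed in bytes, so a 32-coefficient row stride becomes 32 * sizeof(int16_t), and the output is written as a 32x32 block with a stride of 32 shorts.

static void idct32x32_usage_sketch(void) {
  int16_t coeffs[32 * 32] = { 0 };  /* dequantized transform coefficients, stride 32 */
  int16_t residual[32 * 32];        /* pixel-domain output, written with stride 32   */
  vp9_short_idct32x32_c(coeffs, residual, 32 * (int)sizeof(int16_t));
  (void)residual;
}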
|
|
|
|
|
|
|
|
#if CONFIG_TX64X64
|
2013-01-08 21:18:16 +01:00
|
|
|
void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
|
|
|
|
// assume output is a 64x64 buffer
|
|
|
|
// Temporary buffer to hold a 16x16 block for 16x16 inverse dct
|
|
|
|
int16_t buffer[16 * 16];
|
|
|
|
// Temporary buffer to hold a 64x64 block for inverse 64x64 dwt
|
|
|
|
int16_t buffer2[64 * 64];
|
|
|
|
// Note: pitch is in bytes, short_pitch is in short units
|
|
|
|
const int short_pitch = pitch >> 1;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
// TODO(debargha): Implement more efficiently by adding output pitch
|
|
|
|
// argument to the idct16x16 function
|
|
|
|
vp9_short_idct16x16_c_f(input, buffer, pitch,
|
|
|
|
2 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
|
|
|
|
}
|
2013-01-09 15:26:54 +01:00
|
|
|
#if DWTDCT_TYPE == DWTDCT16X16_LEAN
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
for (j = 16; j < 64; ++j) {
|
|
|
|
buffer2[i * 64 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 16; i < 64; ++i) {
|
|
|
|
for (j = 0; j < 64; ++j) {
|
|
|
|
buffer2[i * 64 + j] =
|
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#elif DWTDCT_TYPE == DWTDCT16X16
|
2013-01-08 21:18:16 +01:00
|
|
|
vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
|
|
|
|
2 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
|
|
|
|
}
|
|
|
|
vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
|
|
|
|
2 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
|
|
|
|
sizeof(*buffer2) * 16);
|
|
|
|
}
|
|
|
|
vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
|
|
|
|
2 + DWT_PRECISION_BITS);
|
|
|
|
for (i = 0; i < 16; ++i) {
|
|
|
|
vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
|
|
|
|
sizeof(*buffer2) * 16);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copy and scale the highest-frequency bands into buffer2
|
|
|
|
for (i = 0; i < 32; ++i) {
|
2013-01-09 15:26:54 +01:00
|
|
|
for (j = 32; j < 64; ++j) {
|
2013-01-08 21:18:16 +01:00
|
|
|
buffer2[i * 64 + j] =
|
2013-01-09 15:26:54 +01:00
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
|
2013-01-08 21:18:16 +01:00
|
|
|
}
|
|
|
|
}
|
2013-01-09 15:26:54 +01:00
|
|
|
for (i = 32; i < 64; ++i) {
|
2013-01-08 21:18:16 +01:00
|
|
|
for (j = 0; j < 64; ++j) {
|
|
|
|
buffer2[i * 64 + j] =
|
2013-01-09 15:26:54 +01:00
|
|
|
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
|
2013-01-08 21:18:16 +01:00
|
|
|
}
|
|
|
|
}
|
2013-01-09 15:26:54 +01:00
|
|
|
#endif // DWTDCT_TYPE
|
2013-01-08 21:18:16 +01:00
|
|
|
|
|
|
|
#if DWT_TYPE == 26
|
|
|
|
dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
|
|
|
|
#elif DWT_TYPE == 97
|
|
|
|
dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
|
|
|
|
#elif DWT_TYPE == 53
|
|
|
|
dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
|
|
|
|
#endif
|
|
|
|
}
|
2013-01-09 15:26:54 +01:00
|
|
|
#endif // CONFIG_TX64X64
|
|
|
|
#endif // !CONFIG_DWTDCTHYBRID
|