drm/amd/display: Optimize gamma calculations
[Why&How] 1. Stack usage is pretty high, as the fixed31_32 struct is 8 bytes and we have functions with >30 vars on the stack. Reduce it by moving temporaries into file-scope scratch variables, passing the translate_from_linear_space arguments through a struct, and allocating the gamma coefficients on the heap. 2. Optimize gamma calculation by reducing the number of calls to dc_fixpt_pow(). Our X points are divided into 32 regions with 16 pts each. Each region is 2x the previous, meaning x[i] = 2*x[i-16] for i>=16. Using (2x)^gamma = 2^gamma * x^gamma, we can recursively compute powers of gamma; we just need the first 16 pts to start it up. dc_fixpt_pow() is expensive: it computes x^y by doing exp(y*log(x)). Exp is done by Taylor series approximation, and log by a Newton-like approximation that also uses exp internally. In short, it's significantly heavier than run-of-the-mill addition/subtraction/multiply. Signed-off-by: Krunoslav Kovac <Krunoslav.Kovac@amd.com> Reviewed-by: Anthony Koo <Anthony.Koo@amd.com> Acked-by: Aric Cyr <Aric.Cyr@amd.com> Acked-by: Leo Li <sunpeng.li@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
c43f89f81c
commit
e752058b86
@ -482,7 +482,6 @@ struct dc_gamma {
|
|||||||
* is_logical_identity indicates the given gamma ramp regardless of type is identity.
|
* is_logical_identity indicates the given gamma ramp regardless of type is identity.
|
||||||
*/
|
*/
|
||||||
bool is_identity;
|
bool is_identity;
|
||||||
bool is_logical_identity;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Used by both ipp and opp functions*/
|
/* Used by both ipp and opp functions*/
|
||||||
|
@ -40,6 +40,33 @@ static struct hw_x_point coordinates_x[MAX_HW_POINTS + 2];
|
|||||||
static struct fixed31_32 pq_table[MAX_HW_POINTS + 2];
|
static struct fixed31_32 pq_table[MAX_HW_POINTS + 2];
|
||||||
static struct fixed31_32 de_pq_table[MAX_HW_POINTS + 2];
|
static struct fixed31_32 de_pq_table[MAX_HW_POINTS + 2];
|
||||||
|
|
||||||
|
// these are helpers for calculations to reduce stack usage
|
||||||
|
// do not depend on these being preserved across calls
|
||||||
|
static struct fixed31_32 scratch_1;
|
||||||
|
static struct fixed31_32 scratch_2;
|
||||||
|
static struct translate_from_linear_space_args scratch_gamma_args;
|
||||||
|
|
||||||
|
/* Helper to optimize gamma calculation, only use in translate_from_linear, in
|
||||||
|
* particular the dc_fixpt_pow function which is very expensive
|
||||||
|
* The idea is that our regions for X points are exponential and currently they all use
|
||||||
|
* the same number of points (NUM_PTS_IN_REGION) and in each region every point
|
||||||
|
* is exactly 2x the one at the same index in the previous region. In other words
|
||||||
|
* X[i] = 2 * X[i-NUM_PTS_IN_REGION] for i>=16
|
||||||
|
* The other fact is that (2x)^gamma = 2^gamma * x^gamma
|
||||||
|
* So we compute and save x^gamma for the first 16 points (the first region), and for every next region
|
||||||
|
* just multiply with 2^gamma which can be computed once, and save the result so we
|
||||||
|
* recursively compute all the values.
|
||||||
|
*/
|
||||||
|
static struct fixed31_32 pow_buffer[NUM_PTS_IN_REGION];
|
||||||
|
static struct fixed31_32 gamma_of_2; // 2^gamma
|
||||||
|
int pow_buffer_ptr = -1;
|
||||||
|
|
||||||
|
static const int32_t gamma_numerator01[] = { 31308, 180000, 0};
|
||||||
|
static const int32_t gamma_numerator02[] = { 12920, 4500, 0};
|
||||||
|
static const int32_t gamma_numerator03[] = { 55, 99, 0};
|
||||||
|
static const int32_t gamma_numerator04[] = { 55, 99, 0};
|
||||||
|
static const int32_t gamma_numerator05[] = { 2400, 2200, 2200};
|
||||||
|
|
||||||
static bool pq_initialized; /* = false; */
|
static bool pq_initialized; /* = false; */
|
||||||
static bool de_pq_initialized; /* = false; */
|
static bool de_pq_initialized; /* = false; */
|
||||||
|
|
||||||
@ -251,11 +278,7 @@ enum gamma_type_index {
|
|||||||
|
|
||||||
static void build_coefficients(struct gamma_coefficients *coefficients, enum gamma_type_index type)
|
static void build_coefficients(struct gamma_coefficients *coefficients, enum gamma_type_index type)
|
||||||
{
|
{
|
||||||
static const int32_t numerator01[] = { 31308, 180000, 0};
|
|
||||||
static const int32_t numerator02[] = { 12920, 4500, 0};
|
|
||||||
static const int32_t numerator03[] = { 55, 99, 0};
|
|
||||||
static const int32_t numerator04[] = { 55, 99, 0};
|
|
||||||
static const int32_t numerator05[] = { 2400, 2200, 2200};
|
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
uint32_t index = 0;
|
uint32_t index = 0;
|
||||||
@ -267,69 +290,74 @@ static void build_coefficients(struct gamma_coefficients *coefficients, enum gam
|
|||||||
|
|
||||||
do {
|
do {
|
||||||
coefficients->a0[i] = dc_fixpt_from_fraction(
|
coefficients->a0[i] = dc_fixpt_from_fraction(
|
||||||
numerator01[index], 10000000);
|
gamma_numerator01[index], 10000000);
|
||||||
coefficients->a1[i] = dc_fixpt_from_fraction(
|
coefficients->a1[i] = dc_fixpt_from_fraction(
|
||||||
numerator02[index], 1000);
|
gamma_numerator02[index], 1000);
|
||||||
coefficients->a2[i] = dc_fixpt_from_fraction(
|
coefficients->a2[i] = dc_fixpt_from_fraction(
|
||||||
numerator03[index], 1000);
|
gamma_numerator03[index], 1000);
|
||||||
coefficients->a3[i] = dc_fixpt_from_fraction(
|
coefficients->a3[i] = dc_fixpt_from_fraction(
|
||||||
numerator04[index], 1000);
|
gamma_numerator04[index], 1000);
|
||||||
coefficients->user_gamma[i] = dc_fixpt_from_fraction(
|
coefficients->user_gamma[i] = dc_fixpt_from_fraction(
|
||||||
numerator05[index], 1000);
|
gamma_numerator05[index], 1000);
|
||||||
|
|
||||||
++i;
|
++i;
|
||||||
} while (i != ARRAY_SIZE(coefficients->a0));
|
} while (i != ARRAY_SIZE(coefficients->a0));
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct fixed31_32 translate_from_linear_space(
|
static struct fixed31_32 translate_from_linear_space(
|
||||||
struct fixed31_32 arg,
|
struct translate_from_linear_space_args *args)
|
||||||
struct fixed31_32 a0,
|
|
||||||
struct fixed31_32 a1,
|
|
||||||
struct fixed31_32 a2,
|
|
||||||
struct fixed31_32 a3,
|
|
||||||
struct fixed31_32 gamma)
|
|
||||||
{
|
{
|
||||||
const struct fixed31_32 one = dc_fixpt_from_int(1);
|
const struct fixed31_32 one = dc_fixpt_from_int(1);
|
||||||
|
|
||||||
if (dc_fixpt_lt(one, arg))
|
if (dc_fixpt_le(one, args->arg))
|
||||||
return one;
|
return one;
|
||||||
|
|
||||||
if (dc_fixpt_le(arg, dc_fixpt_neg(a0)))
|
if (dc_fixpt_le(args->arg, dc_fixpt_neg(args->a0))) {
|
||||||
return dc_fixpt_sub(
|
scratch_1 = dc_fixpt_add(one, args->a3);
|
||||||
a2,
|
scratch_2 = dc_fixpt_pow(
|
||||||
dc_fixpt_mul(
|
dc_fixpt_neg(args->arg),
|
||||||
dc_fixpt_add(
|
dc_fixpt_recip(args->gamma));
|
||||||
one,
|
scratch_1 = dc_fixpt_mul(scratch_1, scratch_2);
|
||||||
a3),
|
scratch_1 = dc_fixpt_sub(args->a2, scratch_1);
|
||||||
dc_fixpt_pow(
|
|
||||||
dc_fixpt_neg(arg),
|
return scratch_1;
|
||||||
dc_fixpt_recip(gamma))));
|
} else if (dc_fixpt_le(args->a0, args->arg)) {
|
||||||
else if (dc_fixpt_le(a0, arg))
|
if (pow_buffer_ptr == 0) {
|
||||||
return dc_fixpt_sub(
|
gamma_of_2 = dc_fixpt_pow(dc_fixpt_from_int(2),
|
||||||
dc_fixpt_mul(
|
dc_fixpt_recip(args->gamma));
|
||||||
dc_fixpt_add(
|
}
|
||||||
one,
|
scratch_1 = dc_fixpt_add(one, args->a3);
|
||||||
a3),
|
if (pow_buffer_ptr < 16)
|
||||||
dc_fixpt_pow(
|
scratch_2 = dc_fixpt_pow(args->arg,
|
||||||
arg,
|
dc_fixpt_recip(args->gamma));
|
||||||
dc_fixpt_recip(gamma))),
|
else
|
||||||
a2);
|
scratch_2 = dc_fixpt_mul(gamma_of_2,
|
||||||
|
pow_buffer[pow_buffer_ptr%16]);
|
||||||
|
|
||||||
|
pow_buffer[pow_buffer_ptr%16] = scratch_2;
|
||||||
|
pow_buffer_ptr++;
|
||||||
|
|
||||||
|
scratch_1 = dc_fixpt_mul(scratch_1, scratch_2);
|
||||||
|
scratch_1 = dc_fixpt_sub(scratch_1, args->a2);
|
||||||
|
|
||||||
|
return scratch_1;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
return dc_fixpt_mul(
|
return dc_fixpt_mul(args->arg, args->a1);
|
||||||
arg,
|
|
||||||
a1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct fixed31_32 calculate_gamma22(struct fixed31_32 arg)
|
static struct fixed31_32 calculate_gamma22(struct fixed31_32 arg)
|
||||||
{
|
{
|
||||||
struct fixed31_32 gamma = dc_fixpt_from_fraction(22, 10);
|
struct fixed31_32 gamma = dc_fixpt_from_fraction(22, 10);
|
||||||
|
|
||||||
return translate_from_linear_space(arg,
|
scratch_gamma_args.arg = arg;
|
||||||
dc_fixpt_zero,
|
scratch_gamma_args.a0 = dc_fixpt_zero;
|
||||||
dc_fixpt_zero,
|
scratch_gamma_args.a1 = dc_fixpt_zero;
|
||||||
dc_fixpt_zero,
|
scratch_gamma_args.a2 = dc_fixpt_zero;
|
||||||
dc_fixpt_zero,
|
scratch_gamma_args.a3 = dc_fixpt_zero;
|
||||||
gamma);
|
scratch_gamma_args.gamma = gamma;
|
||||||
|
|
||||||
|
return translate_from_linear_space(&scratch_gamma_args);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct fixed31_32 translate_to_linear_space(
|
static struct fixed31_32 translate_to_linear_space(
|
||||||
@ -365,18 +393,19 @@ static struct fixed31_32 translate_to_linear_space(
|
|||||||
return linear;
|
return linear;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct fixed31_32 translate_from_linear_space_ex(
|
static struct fixed31_32 translate_from_linear_space_ex(
|
||||||
struct fixed31_32 arg,
|
struct fixed31_32 arg,
|
||||||
struct gamma_coefficients *coeff,
|
struct gamma_coefficients *coeff,
|
||||||
uint32_t color_index)
|
uint32_t color_index)
|
||||||
{
|
{
|
||||||
return translate_from_linear_space(
|
scratch_gamma_args.arg = arg;
|
||||||
arg,
|
scratch_gamma_args.a0 = coeff->a0[color_index];
|
||||||
coeff->a0[color_index],
|
scratch_gamma_args.a1 = coeff->a1[color_index];
|
||||||
coeff->a1[color_index],
|
scratch_gamma_args.a2 = coeff->a2[color_index];
|
||||||
coeff->a2[color_index],
|
scratch_gamma_args.a3 = coeff->a3[color_index];
|
||||||
coeff->a3[color_index],
|
scratch_gamma_args.gamma = coeff->user_gamma[color_index];
|
||||||
coeff->user_gamma[color_index]);
|
|
||||||
|
return translate_from_linear_space(&scratch_gamma_args);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -715,24 +744,32 @@ static void build_regamma(struct pwl_float_data_ex *rgb_regamma,
|
|||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
|
||||||
struct gamma_coefficients coeff;
|
struct gamma_coefficients *coeff;
|
||||||
struct pwl_float_data_ex *rgb = rgb_regamma;
|
struct pwl_float_data_ex *rgb = rgb_regamma;
|
||||||
const struct hw_x_point *coord_x = coordinate_x;
|
const struct hw_x_point *coord_x = coordinate_x;
|
||||||
|
|
||||||
build_coefficients(&coeff, type);
|
coeff = kvzalloc(sizeof(*coeff), GFP_KERNEL);
|
||||||
|
if (!coeff)
|
||||||
|
return;
|
||||||
|
|
||||||
|
build_coefficients(coeff, type);
|
||||||
|
|
||||||
|
memset(pow_buffer, 0, NUM_PTS_IN_REGION * sizeof(struct fixed31_32));
|
||||||
|
pow_buffer_ptr = 0; // see variable definition for more info
|
||||||
i = 0;
|
i = 0;
|
||||||
|
while (i <= hw_points_num) {
|
||||||
while (i != hw_points_num + 1) {
|
|
||||||
/*TODO use y vs r,g,b*/
|
/*TODO use y vs r,g,b*/
|
||||||
rgb->r = translate_from_linear_space_ex(
|
rgb->r = translate_from_linear_space_ex(
|
||||||
coord_x->x, &coeff, 0);
|
coord_x->x, coeff, 0);
|
||||||
rgb->g = rgb->r;
|
rgb->g = rgb->r;
|
||||||
rgb->b = rgb->r;
|
rgb->b = rgb->r;
|
||||||
++coord_x;
|
++coord_x;
|
||||||
++rgb;
|
++rgb;
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
|
pow_buffer_ptr = -1; // reset back to no optimize
|
||||||
|
|
||||||
|
kfree(coeff);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hermite_spline_eetf(struct fixed31_32 input_x,
|
static void hermite_spline_eetf(struct fixed31_32 input_x,
|
||||||
@ -862,6 +899,8 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma,
|
|||||||
else
|
else
|
||||||
max_content = max_display;
|
max_content = max_display;
|
||||||
|
|
||||||
|
if (!use_eetf)
|
||||||
|
pow_buffer_ptr = 0; // see var definition for more info
|
||||||
rgb += 32; // first 32 points have problems with fixed point, too small
|
rgb += 32; // first 32 points have problems with fixed point, too small
|
||||||
coord_x += 32;
|
coord_x += 32;
|
||||||
for (i = 32; i <= hw_points_num; i++) {
|
for (i = 32; i <= hw_points_num; i++) {
|
||||||
@ -900,6 +939,7 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma,
|
|||||||
++coord_x;
|
++coord_x;
|
||||||
++rgb;
|
++rgb;
|
||||||
}
|
}
|
||||||
|
pow_buffer_ptr = -1;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -1572,14 +1612,15 @@ bool mod_color_calculate_regamma_params(struct dc_transfer_func *output_tf,
|
|||||||
output_tf->tf == TRANSFER_FUNCTION_SRGB) {
|
output_tf->tf == TRANSFER_FUNCTION_SRGB) {
|
||||||
if (ramp == NULL)
|
if (ramp == NULL)
|
||||||
return true;
|
return true;
|
||||||
if ((ramp->is_logical_identity) ||
|
if ((ramp->is_identity && ramp->type != GAMMA_CS_TFM_1D) ||
|
||||||
(!mapUserRamp && ramp->type == GAMMA_RGB_256))
|
(!mapUserRamp && ramp->type == GAMMA_RGB_256))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
output_tf->type = TF_TYPE_DISTRIBUTED_POINTS;
|
output_tf->type = TF_TYPE_DISTRIBUTED_POINTS;
|
||||||
|
|
||||||
if (ramp && (mapUserRamp || ramp->type != GAMMA_RGB_256)) {
|
if (ramp && ramp->type != GAMMA_CS_TFM_1D &&
|
||||||
|
(mapUserRamp || ramp->type != GAMMA_RGB_256)) {
|
||||||
rgb_user = kvcalloc(ramp->num_entries + _EXTRA_POINTS,
|
rgb_user = kvcalloc(ramp->num_entries + _EXTRA_POINTS,
|
||||||
sizeof(*rgb_user),
|
sizeof(*rgb_user),
|
||||||
GFP_KERNEL);
|
GFP_KERNEL);
|
||||||
|
@ -82,6 +82,15 @@ struct freesync_hdr_tf_params {
|
|||||||
unsigned int skip_tm; // skip tm
|
unsigned int skip_tm; // skip tm
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct translate_from_linear_space_args {
|
||||||
|
struct fixed31_32 arg;
|
||||||
|
struct fixed31_32 a0;
|
||||||
|
struct fixed31_32 a1;
|
||||||
|
struct fixed31_32 a2;
|
||||||
|
struct fixed31_32 a3;
|
||||||
|
struct fixed31_32 gamma;
|
||||||
|
};
|
||||||
|
|
||||||
void setup_x_points_distribution(void);
|
void setup_x_points_distribution(void);
|
||||||
void precompute_pq(void);
|
void precompute_pq(void);
|
||||||
void precompute_de_pq(void);
|
void precompute_de_pq(void);
|
||||||
|
Loading…
Reference in New Issue
Block a user