From b5accb3ff997c073f9a9418c97ddd2365f81504f Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sun, 17 May 2020 18:08:11 -0400 Subject: freedreno/a3xx: parameterize ubo optimization A3xx apparently has higher alignment requirements than later gens for indirect const uploads. It also has a lower limit on the number of consts. Add compiler parameters for both settings, and set accordingly for a3xx and a4xx+. This fixes all the ubo test failures caused by this optimization. Signed-off-by: Ilia Mirkin Reviewed-by: Rob Clark Part-of: --- src/freedreno/ir3/ir3_compiler.c | 4 ++++ src/freedreno/ir3/ir3_compiler.h | 10 ++++++++++ src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c | 24 +++++++++++++----------- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 3bb71c8942d..366e2008f8b 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -73,6 +73,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id compiler->unminify_coords = false; compiler->txf_ms_with_isaml = false; compiler->array_index_add_half = true; + compiler->max_const = 1024; + compiler->const_upload_unit = 4; } else { /* no special handling for "flat" */ compiler->flat_bypass = false; @@ -80,6 +82,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id compiler->unminify_coords = true; compiler->txf_ms_with_isaml = true; compiler->array_index_add_half = false; + compiler->max_const = 512; + compiler->const_upload_unit = 8; } return compiler; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 9b5307ed936..a5af717471e 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -67,6 +67,16 @@ struct ir3_compiler { /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders: */ bool samgq_workaround; + + /* on a3xx, the limit on const access is lower than later gens (in 
vec4 + * units): + */ + uint32_t max_const; + + /* on a3xx, the unit of indirect const load is higher than later gens (in + * vec4 units): + */ + uint32_t const_upload_unit; }; struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 393b948847a..ba72e7351ef 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -28,15 +28,15 @@ #include "util/u_math.h" static inline struct ir3_ubo_range -get_ubo_load_range(nir_intrinsic_instr *instr) +get_ubo_load_range(nir_intrinsic_instr *instr, uint32_t alignment) { struct ir3_ubo_range r; int offset = nir_src_as_uint(instr->src[1]); const int bytes = nir_intrinsic_dest_components(instr) * 4; - r.start = ROUND_DOWN_TO(offset, 16 * 4); - r.end = ALIGN(offset + bytes, 16 * 4); + r.start = ROUND_DOWN_TO(offset, alignment * 16); + r.end = ALIGN(offset + bytes, alignment * 16); return r; } @@ -85,7 +85,7 @@ get_existing_range(nir_intrinsic_instr *instr, static void gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr, - struct ir3_ubo_analysis_state *state) + struct ir3_ubo_analysis_state *state, uint32_t alignment) { struct ir3_ubo_range *old_r = get_existing_range(instr, state, true); if (!old_r) @@ -97,13 +97,13 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr, * load_uniform. Set the range to cover all of UBO 0. 
*/ old_r->start = 0; - old_r->end = ALIGN(nir->num_uniforms * 16, 16 * 4); + old_r->end = ALIGN(nir->num_uniforms * 16, alignment * 16); } return; } - const struct ir3_ubo_range r = get_ubo_load_range(instr); + const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment); /* if UBO lowering is disabled, we still want to lower block 0 * (which is normal uniforms): @@ -207,7 +207,7 @@ lower_ubo_block_decrement(nir_intrinsic_instr *instr, nir_builder *b, int *num_u static void lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, - struct ir3_ubo_analysis_state *state, int *num_ubos) + struct ir3_ubo_analysis_state *state, int *num_ubos, uint32_t alignment) { b->cursor = nir_before_instr(&instr->instr); @@ -234,7 +234,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, /* After gathering the UBO access ranges, we limit the total * upload. Reject if we're now outside the range. */ - const struct ir3_ubo_range r = get_ubo_load_range(instr); + const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment); if (!(range->start <= r.start && r.end <= range->end)) { lower_ubo_block_decrement(instr, b, num_ubos); return; @@ -325,7 +325,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) nir_foreach_block (block, function->impl) { nir_foreach_instr (instr, block) { if (instr_is_load_ubo(instr)) - gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state); + gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), + state, shader->compiler->const_upload_unit); } } } @@ -339,7 +340,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) * dynamically accessed ranges separately and upload static rangtes * first. 
*/ - const uint32_t max_upload = 16 * 1024; + const uint32_t max_upload = shader->compiler->max_const * 16; uint32_t offset = shader->const_state.num_reserved_user_consts * 16; state->num_enabled = ARRAY_SIZE(state->range); for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { @@ -370,7 +371,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) nir_foreach_instr_safe (instr, block) { if (instr_is_load_ubo(instr)) lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), - &builder, state, &num_ubos); + &builder, state, &num_ubos, + shader->compiler->const_upload_unit); } } -- cgit v1.2.3