diff options
author | Bas Nieuwenhuizen <[email protected]> | 2017-12-30 17:31:44 +0100 |
---|---|---|
committer | Bas Nieuwenhuizen <[email protected]> | 2017-12-31 15:07:07 +0100 |
commit | 6a36bfc64d2096aa338958c4605f5fc6372c07b8 (patch) | |
tree | 3a59c9fd9a30dcf09cb004511979d2010aa0829d | |
parent | b0d17270ada1b7292f09b5d4ab2c77880ee64c35 (diff) |
radv: Implement binning on GFX9.
Overall it does not really help or hurt. The deferred demo gets a 1%
improvement and some games see a 3% decrease, so I don't think this
should be enabled by default.
However, with the code upstream it is easier to experiment with it.
v2: Remove the initialization of the registers from si_emit_config.
Reviewed-by: Dave Airlie <[email protected]>
-rw-r--r-- | src/amd/vulkan/radv_cmd_buffer.c | 16 | ||||
-rw-r--r-- | src/amd/vulkan/radv_pipeline.c | 325 | ||||
-rw-r--r-- | src/amd/vulkan/radv_private.h | 7 | ||||
-rw-r--r-- | src/amd/vulkan/si_cmd_buffer.c | 6 |
4 files changed, 348 insertions, 6 deletions
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c735d201802..261344e939b 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1043,6 +1043,21 @@ radv_emit_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer, } static void +radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + + if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) + return; + + radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0, + pipeline->graphics.bin.pa_sc_binner_cntl_0); + radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL, + pipeline->graphics.bin.db_dfsm_control); +} + +static void radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; @@ -1059,6 +1074,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radv_emit_geometry_shader(cmd_buffer, pipeline); radv_emit_fragment_shader(cmd_buffer, pipeline); radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline); + radv_emit_binning_state(cmd_buffer, pipeline); cmd_buffer->scratch_size_needed = MAX2(cmd_buffer->scratch_size_needed, diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 14ada20d525..9b5728ee9e7 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2002,6 +2002,329 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, } } +struct radv_bin_size_entry { + unsigned bpp; + VkExtent2D extent; +}; + +static VkExtent2D +radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + static const struct radv_bin_size_entry color_size_table[][3][9] = { + { + /* One RB / SE */ + { + /* One shader engine */ + { 0, {128, 128}}, + { 1, { 64, 128}}, + { 2, { 32, 128}}, + { 3, { 16, 128}}, + { 17, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Two shader engines */ + 
{ 0, {128, 128}}, + { 2, { 64, 128}}, + { 3, { 32, 128}}, + { 5, { 16, 128}}, + { 17, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Four shader engines */ + { 0, {128, 128}}, + { 3, { 64, 128}}, + { 5, { 16, 128}}, + { 17, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + }, + { + /* Two RB / SE */ + { + /* One shader engine */ + { 0, {128, 128}}, + { 2, { 64, 128}}, + { 3, { 32, 128}}, + { 5, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Two shader engines */ + { 0, {128, 128}}, + { 3, { 64, 128}}, + { 5, { 32, 128}}, + { 9, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Four shader engines */ + { 0, {256, 256}}, + { 2, {128, 256}}, + { 3, {128, 128}}, + { 5, { 64, 128}}, + { 9, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + }, + { + /* Four RB / SE */ + { + /* One shader engine */ + { 0, {128, 256}}, + { 2, {128, 128}}, + { 3, { 64, 128}}, + { 5, { 32, 128}}, + { 9, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Two shader engines */ + { 0, {256, 256}}, + { 2, {128, 256}}, + { 3, {128, 128}}, + { 5, { 64, 128}}, + { 9, { 32, 128}}, + { 17, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + /* Four shader engines */ + { 0, {256, 512}}, + { 2, {256, 256}}, + { 3, {128, 256}}, + { 5, {128, 128}}, + { 9, { 64, 128}}, + { 17, { 16, 128}}, + { 33, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + }, + }; + static const struct radv_bin_size_entry ds_size_table[][3][9] = { + { + // One RB / SE + { + // One shader engine + { 0, {128, 256}}, + { 2, {128, 128}}, + { 4, { 64, 128}}, + { 7, { 32, 128}}, + { 13, { 16, 128}}, + { 49, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Two shader engines + { 0, {256, 256}}, + { 2, {128, 256}}, + { 4, {128, 128}}, + { 7, { 64, 128}}, + { 13, { 32, 128}}, + { 25, { 16, 128}}, + { 49, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Four shader engines + { 0, {256, 512}}, + { 2, {256, 256}}, + { 4, {128, 256}}, + { 7, {128, 128}}, + { 13, { 64, 128}}, + { 
25, { 16, 128}}, + { 49, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + }, + { + // Two RB / SE + { + // One shader engine + { 0, {256, 256}}, + { 2, {128, 256}}, + { 4, {128, 128}}, + { 7, { 64, 128}}, + { 13, { 32, 128}}, + { 25, { 16, 128}}, + { 97, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Two shader engines + { 0, {256, 512}}, + { 2, {256, 256}}, + { 4, {128, 256}}, + { 7, {128, 128}}, + { 13, { 64, 128}}, + { 25, { 32, 128}}, + { 49, { 16, 128}}, + { 97, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Four shader engines + { 0, {512, 512}}, + { 2, {256, 512}}, + { 4, {256, 256}}, + { 7, {128, 256}}, + { 13, {128, 128}}, + { 25, { 64, 128}}, + { 49, { 16, 128}}, + { 97, { 0, 0}}, + { UINT_MAX, { 0, 0}}, + }, + }, + { + // Four RB / SE + { + // One shader engine + { 0, {256, 512}}, + { 2, {256, 256}}, + { 4, {128, 256}}, + { 7, {128, 128}}, + { 13, { 64, 128}}, + { 25, { 32, 128}}, + { 49, { 16, 128}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Two shader engines + { 0, {512, 512}}, + { 2, {256, 512}}, + { 4, {256, 256}}, + { 7, {128, 256}}, + { 13, {128, 128}}, + { 25, { 64, 128}}, + { 49, { 32, 128}}, + { 97, { 16, 128}}, + { UINT_MAX, { 0, 0}}, + }, + { + // Four shader engines + { 0, {512, 512}}, + { 4, {256, 512}}, + { 7, {256, 256}}, + { 13, {128, 256}}, + { 25, {128, 128}}, + { 49, { 64, 128}}, + { 97, { 16, 128}}, + { UINT_MAX, { 0, 0}}, + }, + }, + }; + + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + VkExtent2D extent = {512, 512}; + + unsigned log_num_rb_per_se = + util_logbase2_ceil(pipeline->device->physical_device->rad_info.num_render_backends / + pipeline->device->physical_device->rad_info.max_se); + unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se); + + unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_mode_cntl_1); + unsigned ps_iter_samples = 1u << 
G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa); + unsigned effective_samples = total_samples; + unsigned cb_target_mask = pipeline->graphics.blend.cb_target_mask; + unsigned color_bytes_per_pixel = 0; + + for (unsigned i = 0; i < subpass->color_count; i++) { + if (!(cb_target_mask & (0xf << (i * 4)))) + continue; + + if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) + continue; + + VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format; + color_bytes_per_pixel += vk_format_get_blocksize(format); + } + + /* MSAA images typically don't use all samples all the time. */ + if (effective_samples >= 2 && ps_iter_samples <= 1) + effective_samples = 2; + color_bytes_per_pixel *= effective_samples; + + const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se]; + while(color_entry->bpp <= color_bytes_per_pixel) + ++color_entry; + + extent = color_entry->extent; + + if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { + struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment.attachment; + + /* Coefficients taken from AMDVLK */ + unsigned depth_coeff = vk_format_is_depth(attachment->format) ? 5 : 0; + unsigned stencil_coeff = vk_format_is_stencil(attachment->format) ? 
1 : 0; + unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples; + + const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se]; + while(ds_entry->bpp <= ds_bytes_per_pixel) + ++ds_entry; + + extent.width = MIN2(extent.width, ds_entry->extent.width); + extent.height = MIN2(extent.height, ds_entry->extent.height); + } + + return extent; +} + +static void +radv_compute_binning_state(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + pipeline->graphics.bin.pa_sc_binner_cntl_0 = + S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | + S_028C44_DISABLE_START_OF_PRIM(1); + pipeline->graphics.bin.db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF); + + if (!pipeline->device->pbb_allowed) + return; + + VkExtent2D bin_size = radv_compute_bin_size(pipeline, pCreateInfo); + if (!bin_size.width || !bin_size.height) + return; + + unsigned context_states_per_bin; /* allowed range: [1, 6] */ + unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ + unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ + + switch (pipeline->device->physical_device->rad_info.family) { + case CHIP_VEGA10: + context_states_per_bin = 1; + persistent_states_per_bin = 1; + fpovs_per_batch = 63; + break; + case CHIP_RAVEN: + context_states_per_bin = 6; + persistent_states_per_bin = 32; + fpovs_per_batch = 63; + break; + default: + unreachable("unhandled family while determining binning state."); + } + + pipeline->graphics.bin.pa_sc_binner_cntl_0 = + S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | + S_028C44_BIN_SIZE_X(bin_size.width == 16) | + S_028C44_BIN_SIZE_Y(bin_size.height == 16) | + S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) | + S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) | + S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | + S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 
1) | + S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | + S_028C44_OPTIMAL_BIN_SELECTION(1); + + /* DFSM is not implemented yet */ + assert(!pipeline->device->dfsm_allowed); +} static VkResult radv_pipeline_init(struct radv_pipeline *pipeline, @@ -2290,6 +2613,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline, radv_dump_pipeline_stats(device, pipeline); } + radv_compute_binning_state(pipeline, pCreateInfo); + result = radv_pipeline_scratch_init(device, pipeline); return result; } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index eb5a64d2536..bae353c0e5f 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -583,6 +583,7 @@ struct radv_device { bool llvm_supports_spill; bool has_distributed_tess; + bool pbb_allowed; bool dfsm_allowed; uint32_t tess_offchip_block_dw_size; uint32_t scratch_waves; @@ -1165,6 +1166,11 @@ struct radv_vs_state { uint32_t vgt_reuse_off; }; +struct radv_binning_state { + uint32_t pa_sc_binner_cntl_0; + uint32_t db_dfsm_control; +}; + #define SI_GS_PER_ES 128 struct radv_pipeline { @@ -1193,6 +1199,7 @@ struct radv_pipeline { struct radv_tessellation_state tess; struct radv_gs_state gs; struct radv_vs_state vs; + struct radv_binning_state bin; uint32_t db_shader_control; uint32_t shader_z_format; unsigned prim; diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index a6981c136e7..68913ec2ad3 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -518,12 +518,6 @@ si_emit_config(struct radv_physical_device *physical_device, assert(0); } - radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF)); - /* TODO: Enable the binner: */ - radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | - S_028C44_DISABLE_START_OF_PRIM(1)); radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1, 
S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) | S_028C48_MAX_PRIM_PER_BATCH(1023)); |