about summary refs log tree commit diff stats
path: root/src/freedreno/vulkan
diff options
context:
space:
mode:
authorConnor Abbott <[email protected]>2020-03-23 17:23:32 +0100
committerMarge Bot <[email protected]>2020-04-09 15:56:55 +0000
commita07b55443b59572d022924b65c2ab67cd91250e4 (patch)
tree34ddc92bce3ce64f81d74230c7c9a4a9dceed524 /src/freedreno/vulkan
parentd37843fee15f953e18fa9e87c3dc4761ef054998 (diff)
tu: Emit CP_LOAD_STATE6 for descriptors
This restores the pre-loading of descriptor state, using the new SS6_BINDLESS method that allows us to pre-load bindless resources. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4358>
Diffstat (limited to 'src/freedreno/vulkan')
-rw-r--r--src/freedreno/vulkan/tu_cmd_buffer.c40
-rw-r--r--src/freedreno/vulkan/tu_descriptor_set.c1
-rw-r--r--src/freedreno/vulkan/tu_descriptor_set.h3
-rw-r--r--src/freedreno/vulkan/tu_pipeline.c260
-rw-r--r--src/freedreno/vulkan/tu_private.h5
5 files changed, 304 insertions, 5 deletions
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 1c9c43f35e9..a15797e1042 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -2527,6 +2527,7 @@ enum tu_draw_state_group_id
TU_DRAW_STATE_FS_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_GMEM,
+ TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_COUNT,
@@ -3089,6 +3090,42 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
.ib = desc_sets_gmem,
};
}
+
+ /* We need to reload the descriptors every time the descriptor sets
+ * change. However, the commands we send only depend on the pipeline
+ * because the whole point is to cache descriptors which are used by the
+ * pipeline. There's a problem here, in that the firmware has an
+ * "optimization" which skips executing groups that are set to the same
+ * value as the last draw. This means that if the descriptor sets change
+ * but not the pipeline, we'd try to re-execute the same buffer which
+ * the firmware would ignore and we wouldn't pre-load the new
+ * descriptors. The blob seems to re-emit the LOAD_STATE group whenever
+ * the descriptor sets change, which we emulate here by copying the
+ * pre-prepared buffer.
+ */
+ const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib;
+ if (load_entry->size > 0) {
+ struct tu_cs load_cs;
+ result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs);
+ if (result != VK_SUCCESS)
+ return result;
+ tu_cs_emit_array(&load_cs,
+ (uint32_t *)((char *)load_entry->bo->map + load_entry->offset),
+ load_entry->size / 4);
+ struct tu_cs_entry load_copy = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs);
+
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_DESC_SETS_LOAD,
+ /* The blob seems to not enable this for binning, even when
+ * resources would actually be used in the binning shader.
+ * Presumably the overhead of prefetching the resources isn't
+ * worth it.
+ */
+ .enable_mask = ENABLE_DRAW,
+ .ib = load_copy,
+ };
+ }
}
struct tu_cs_entry vs_params;
@@ -3520,6 +3557,9 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
if (ib.size)
tu_cs_emit_ib(cs, &ib);
+ if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS)
+ tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
+
cmd->state.dirty &=
~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c
index de1683c6bd7..f6327b85c50 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.c
+++ b/src/freedreno/vulkan/tu_descriptor_set.c
@@ -173,6 +173,7 @@ tu_CreateDescriptorSetLayout(
set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
set_layout->binding[b].input_attachment_offset = input_attachment_count;
set_layout->binding[b].size = descriptor_size(binding->descriptorType);
+ set_layout->binding[b].shader_stages = binding->stageFlags;
if (variable_flags && binding->binding < variable_flags->bindingCount &&
(variable_flags->pBindingFlags[binding->binding] &
diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h
index 3a24822eb67..4c1bd502e30 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.h
+++ b/src/freedreno/vulkan/tu_descriptor_set.h
@@ -60,6 +60,9 @@ struct tu_descriptor_set_binding_layout
/* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0
* if there are no immutable samplers. */
uint32_t immutable_samplers_offset;
+
+ /* Shader stages that use this binding */
+ uint32_t shader_stages;
};
struct tu_descriptor_set_layout
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index dc2a568a59c..38765025641 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -40,6 +40,247 @@
#include "tu_cs.h"
+/* Emit IB that preloads the descriptors that the shader uses */
+
+static inline uint32_t
+tu6_vkstage2opcode(VkShaderStageFlags stage)
+{
+ switch (stage) {
+ case VK_SHADER_STAGE_VERTEX_BIT:
+ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
+ case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
+ case VK_SHADER_STAGE_GEOMETRY_BIT:
+ return CP_LOAD_STATE6_GEOM;
+ case VK_SHADER_STAGE_FRAGMENT_BIT:
+ case VK_SHADER_STAGE_COMPUTE_BIT:
+ return CP_LOAD_STATE6_FRAG;
+ default:
+ unreachable("bad shader type");
+ }
+}
+
+static enum a6xx_state_block
+tu6_tex_stage2sb(VkShaderStageFlags stage)
+{
+ switch (stage) {
+ case VK_SHADER_STAGE_VERTEX_BIT:
+ return SB6_VS_TEX;
+ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
+ return SB6_HS_TEX;
+ case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
+ return SB6_DS_TEX;
+ case VK_SHADER_STAGE_GEOMETRY_BIT:
+ return SB6_GS_TEX;
+ case VK_SHADER_STAGE_FRAGMENT_BIT:
+ return SB6_FS_TEX;
+ case VK_SHADER_STAGE_COMPUTE_BIT:
+ return SB6_CS_TEX;
+ default:
+ unreachable("bad shader stage");
+ }
+}
+
+static enum a6xx_state_block
+tu6_ubo_stage2sb(VkShaderStageFlags stage)
+{
+ switch (stage) {
+ case VK_SHADER_STAGE_VERTEX_BIT:
+ return SB6_VS_SHADER;
+ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
+ return SB6_HS_SHADER;
+ case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
+ return SB6_DS_SHADER;
+ case VK_SHADER_STAGE_GEOMETRY_BIT:
+ return SB6_GS_SHADER;
+ case VK_SHADER_STAGE_FRAGMENT_BIT:
+ return SB6_FS_SHADER;
+ case VK_SHADER_STAGE_COMPUTE_BIT:
+ return SB6_CS_SHADER;
+ default:
+ unreachable("bad shader stage");
+ }
+}
+
+static void
+emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
+ enum a6xx_state_block sb, unsigned base, unsigned offset,
+ unsigned count)
+{
+ /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
+ * clear if emitting more packets will even help anything. Presumably the
+ * descriptor cache is relatively small, and these packets stop doing
+ * anything when there are too many descriptors.
+ */
+ tu_cs_emit_pkt7(cs, opcode, 3);
+ tu_cs_emit(cs,
+ CP_LOAD_STATE6_0_STATE_TYPE(st) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
+ CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
+ tu_cs_emit_qw(cs, offset | (base << 28));
+}
+
+static unsigned
+tu6_load_state_size(struct tu_pipeline_layout *layout, bool compute)
+{
+ const unsigned load_state_size = 4;
+ unsigned size = 0;
+ for (unsigned i = 0; i < layout->num_sets; i++) {
+ struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
+ for (unsigned j = 0; j < set_layout->binding_count; j++) {
+ struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
+ unsigned count = 0;
+ /* Note: some users, like amber for example, pass in
+ * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
+ * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
+ */
+ VkShaderStageFlags stages = compute ?
+ binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
+ binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
+ unsigned stage_count = util_bitcount(stages);
+ switch (binding->type) {
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ /* IBO-backed resources only need one packet for all graphics stages */
+ if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
+ count += 1;
+ if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
+ count += 1;
+ break;
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ /* Textures and UBOs need a packet for each stage */
+ count = stage_count;
+ break;
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ /* Because of how we pack combined images and samplers, we
+ * currently can't use one packet for the whole array.
+ */
+ count = stage_count * binding->array_size * 2;
+ break;
+ default:
+ unreachable("bad descriptor type");
+ }
+ size += count * load_state_size;
+ }
+ }
+ return size;
+}
+
+static void
+tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
+{
+ unsigned size = tu6_load_state_size(pipeline->layout, compute);
+ if (size == 0)
+ return;
+
+ struct tu_cs cs;
+ tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
+
+ struct tu_pipeline_layout *layout = pipeline->layout;
+ for (unsigned i = 0; i < layout->num_sets; i++) {
+ struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
+ for (unsigned j = 0; j < set_layout->binding_count; j++) {
+ struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
+ unsigned base = i;
+ unsigned offset = binding->offset / 4;
+ /* Note: some users, like amber for example, pass in
+ * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
+ * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
+ */
+ VkShaderStageFlags stages = compute ?
+ binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
+ binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
+ unsigned count = binding->array_size;
+ if (count == 0 || stages == 0)
+ continue;
+ switch (binding->type) {
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ base = MAX_SETS;
+ offset = (layout->input_attachment_count +
+ layout->set[i].dynamic_offset_start +
+ binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
+ /* fallthrough */
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ /* IBO-backed resources only need one packet for all graphics stages */
+ if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
+ emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
+ base, offset, count);
+ }
+ if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
+ emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
+ base, offset, count);
+ }
+ break;
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ base = MAX_SETS;
+ offset = (layout->set[i].input_attachment_start +
+ binding->input_attachment_offset) * A6XX_TEX_CONST_DWORDS;
+ /* fallthrough */
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
+ unsigned stage_log2;
+ for_each_bit(stage_log2, stages) {
+ VkShaderStageFlags stage = 1 << stage_log2;
+ emit_load_state(&cs, tu6_vkstage2opcode(stage),
+ binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
+ ST6_SHADER : ST6_CONSTANTS,
+ tu6_tex_stage2sb(stage), base, offset, count);
+ }
+ break;
+ }
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ base = MAX_SETS;
+ offset = (layout->input_attachment_count +
+ layout->set[i].dynamic_offset_start +
+ binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
+ /* fallthrough */
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
+ unsigned stage_log2;
+ for_each_bit(stage_log2, stages) {
+ VkShaderStageFlags stage = 1 << stage_log2;
+ emit_load_state(&cs, tu6_vkstage2opcode(stage), ST6_UBO,
+ tu6_ubo_stage2sb(stage), base, offset, count);
+ }
+ break;
+ }
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
+ unsigned stage_log2;
+ for_each_bit(stage_log2, stages) {
+ VkShaderStageFlags stage = 1 << stage_log2;
+ /* TODO: We could emit less CP_LOAD_STATE6 if we used
+ * struct-of-arrays instead of array-of-structs.
+ */
+ for (unsigned i = 0; i < count; i++) {
+ unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
+ unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
+ emit_load_state(&cs, tu6_vkstage2opcode(stage),
+ ST6_CONSTANTS, tu6_tex_stage2sb(stage),
+ base, tex_offset, 1);
+ emit_load_state(&cs, tu6_vkstage2opcode(stage),
+ ST6_SHADER, tu6_tex_stage2sb(stage),
+ base, sam_offset, 1);
+ }
+ }
+ break;
+ }
+ default:
+ unreachable("bad descriptor type");
+ }
+ }
+ }
+
+ pipeline->load_state.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &cs);
+}
+
struct tu_pipeline_builder
{
struct tu_device *device;
@@ -1774,6 +2015,8 @@ tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4])
static VkResult
tu_pipeline_create(struct tu_device *dev,
+ struct tu_pipeline_layout *layout,
+ bool compute,
const VkAllocationCallbacks *pAllocator,
struct tu_pipeline **out_pipeline)
{
@@ -1785,8 +2028,12 @@ tu_pipeline_create(struct tu_device *dev,
tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, 2048);
- /* reserve the space now such that tu_cs_begin_sub_stream never fails */
- VkResult result = tu_cs_reserve_space(&pipeline->cs, 2048);
+ /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
+ * that LOAD_STATE can potentially take up a large amount of space so we
+ * calculate its size explicitly.
+ */
+ unsigned load_state_size = tu6_load_state_size(layout, compute);
+ VkResult result = tu_cs_reserve_space(&pipeline->cs, 2048 + load_state_size);
if (result != VK_SUCCESS) {
vk_free2(&dev->alloc, pAllocator, pipeline);
return result;
@@ -2182,8 +2429,8 @@ static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
struct tu_pipeline **pipeline)
{
- VkResult result = tu_pipeline_create(builder->device, builder->alloc,
- pipeline);
+ VkResult result = tu_pipeline_create(builder->device, builder->layout,
+ false, builder->alloc, pipeline);
if (result != VK_SUCCESS)
return result;
@@ -2209,6 +2456,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
tu_pipeline_builder_parse_rasterization(builder, *pipeline);
tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
+ tu6_emit_load_state(*pipeline, false);
/* we should have reserved enough space upfront such that the CS never
* grows
@@ -2381,7 +2629,7 @@ tu_compute_pipeline_create(VkDevice device,
*pPipeline = VK_NULL_HANDLE;
- result = tu_pipeline_create(dev, pAllocator, &pipeline);
+ result = tu_pipeline_create(dev, layout, true, pAllocator, &pipeline);
if (result != VK_SUCCESS)
return result;
@@ -2418,6 +2666,8 @@ tu_compute_pipeline_create(VkDevice device,
tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo);
pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
+ tu6_emit_load_state(pipeline, true);
+
*pPipeline = tu_pipeline_to_handle(pipeline);
return VK_SUCCESS;
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 87e4df85ff5..9fa8763179d 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1212,6 +1212,11 @@ struct tu_pipeline
struct
{
+ struct tu_cs_entry state_ib;
+ } load_state;
+
+ struct
+ {
uint8_t bindings[MAX_VERTEX_ATTRIBS];
uint32_t count;