Diffstat (limited to 'src')
-rw-r--r--  src/gallium/drivers/radeonsi/si_descriptors.c | 163
-rw-r--r--  src/gallium/drivers/radeonsi/si_state.h       |   7
2 files changed, 134 insertions(+), 36 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5086a33969a..61eb2f10be2 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -95,10 +95,13 @@ static uint32_t null_image_descriptor[8] = {
* descriptor */
};
-static void si_init_descriptors(struct si_descriptors *desc,
+static void si_init_descriptors(struct si_context *sctx,
+ struct si_descriptors *desc,
unsigned shader_userdata_index,
unsigned element_dw_size,
unsigned num_elements,
+ unsigned first_ce_slot,
+ unsigned num_ce_slots,
unsigned *ce_offset)
{
assert(num_elements <= sizeof(desc->dirty_mask)*8);
@@ -106,14 +109,16 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->list = CALLOC(num_elements, element_dw_size * 4);
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
+ desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
+ desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
desc->dirty_mask = u_bit_consecutive64(0, num_elements);
desc->shader_userdata_offset = shader_userdata_index * 4;
- if (ce_offset) {
+ if (desc->num_ce_slots) {
desc->uses_ce = true;
desc->ce_offset = *ce_offset;
- *ce_offset += element_dw_size * num_elements * 4;
+ *ce_offset += element_dw_size * desc->num_ce_slots * 4;
}
}
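
The hunk above changes CE RAM allocation so that each descriptor array reserves space only for its CE-eligible slots rather than for every element. A minimal standalone model of that bookkeeping (the struct and function here are illustrative, not the driver's types):

#include <stdio.h>

struct desc_model {
	unsigned element_dw_size;  /* dwords per slot */
	unsigned num_ce_slots;     /* slots kept in CE RAM (0 = CE disabled) */
	unsigned ce_offset;        /* byte offset of this array in CE RAM */
};

static void reserve_ce(struct desc_model *d, unsigned *ce_offset)
{
	if (!d->num_ce_slots)
		return; /* this array bypasses CE RAM entirely */
	d->ce_offset = *ce_offset;
	*ce_offset += d->element_dw_size * d->num_ce_slots * 4;
}

int main(void)
{
	unsigned ce_offset = 0;
	struct desc_model buffers  = { .element_dw_size = 4,  .num_ce_slots = 20 };
	struct desc_model samplers = { .element_dw_size = 16, .num_ce_slots = 25 };

	reserve_ce(&buffers, &ce_offset);
	reserve_ce(&samplers, &ce_offset);

	/* 20*4*4 = 320 bytes, then 25*16*4 = 1600 bytes -> 1920 total */
	printf("buffers at %u, samplers at %u, total %u bytes\n",
	       buffers.ce_offset, samplers.ce_offset, ce_offset);
	return 0;
}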
@@ -205,13 +210,16 @@ static bool si_upload_descriptors(struct si_context *sctx,
if (!upload_size)
return true;
- if (sctx->ce_ib && desc->uses_ce) {
- uint32_t const* list = (uint32_t const*)desc->list;
+ if (desc->uses_ce) {
+ const uint32_t *list = desc->list +
+ desc->first_ce_slot * desc->element_dw_size;
+ uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
+ u_bit_consecutive64(0, desc->num_ce_slots);
- while(desc->dirty_mask) {
+
+ while (mask) {
int begin, count;
- u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
- &count);
+ u_bit_scan_consecutive_range64(&mask, &begin, &count);
begin *= desc->element_dw_size;
count *= desc->element_dw_size;
@@ -222,7 +230,9 @@ static bool si_upload_descriptors(struct si_context *sctx,
radeon_emit_array(sctx->ce_ib, list + begin, count);
}
- if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+ if (!si_ce_upload(sctx,
+ desc->ce_offset +
+ (first_slot_offset - desc->first_ce_slot * slot_size),
upload_size, (unsigned*)&desc->buffer_offset,
&desc->buffer))
return false;
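
The two hunks above restrict CE uploads to the dirty slots that fall inside the CE window: the dirty mask is shifted down by first_ce_slot, masked to num_ce_slots, and then walked in consecutive runs. A rough sketch of that windowing with a simplified reimplementation of the consecutive-range scan (the real code uses the u_bit_* helpers; names and the loop here are illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint64_t consecutive_bits(unsigned start, unsigned count)
{
	return (count >= 64 ? ~0ull : (1ull << count) - 1) << start;
}

int main(void)
{
	unsigned first_ce_slot = 4, num_ce_slots = 8, element_dw_size = 4;
	uint64_t dirty_mask = 0xf31ull; /* slots 0, 4, 5, 8-11 dirty */

	/* Only slots inside [first_ce_slot, first_ce_slot + num_ce_slots) go to CE. */
	uint64_t mask = (dirty_mask >> first_ce_slot) &
			consecutive_bits(0, num_ce_slots);

	while (mask) {
		int begin = __builtin_ctzll(mask);
		int count = 0;
		while (begin + count < 64 && ((mask >> (begin + count)) & 1))
			count++;
		mask &= ~consecutive_bits(begin, count);

		/* The real code emits list[begin .. begin+count) into the CE IB here. */
		printf("upload dwords [%d, %d)\n", begin * element_dw_size,
		       (begin + count) * element_dw_size);
	}
	return 0;
}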
@@ -920,9 +930,12 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
/* BUFFER RESOURCES */
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+ struct si_buffer_resources *buffers,
struct si_descriptors *descs,
unsigned num_buffers,
+ unsigned first_ce_slot,
+ unsigned num_ce_slots,
unsigned shader_userdata_index,
enum radeon_bo_usage shader_usage,
enum radeon_bo_usage shader_usage_constbuf,
@@ -936,8 +949,8 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
buffers->priority_constbuf = priority_constbuf;
buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
- si_init_descriptors(descs, shader_userdata_index, 4,
- num_buffers, ce_offset);
+ si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
+ first_ce_slot, num_ce_slots, ce_offset);
}
static void si_release_buffer_resources(struct si_buffer_resources *buffers,
@@ -1994,6 +2007,48 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
/* INIT/DEINIT/UPLOAD */
+/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = {
+ [PIPE_SHADER_VERTEX] = 0,
+ [PIPE_SHADER_TESS_CTRL] = 0,
+ [PIPE_SHADER_TESS_EVAL] = 1,
+ [PIPE_SHADER_GEOMETRY] = 0,
+ [PIPE_SHADER_FRAGMENT] = 24,
+ [PIPE_SHADER_COMPUTE] = 16,
+};
+static unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = {
+ /* these must be even due to slot alignment */
+ [PIPE_SHADER_VERTEX] = 0,
+ [PIPE_SHADER_TESS_CTRL] = 0,
+ [PIPE_SHADER_TESS_EVAL] = 0,
+ [PIPE_SHADER_GEOMETRY] = 0,
+ [PIPE_SHADER_FRAGMENT] = 2,
+ [PIPE_SHADER_COMPUTE] = 8,
+};
+static unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = {
+ [PIPE_SHADER_VERTEX] = 9,
+ [PIPE_SHADER_TESS_CTRL] = 3,
+ [PIPE_SHADER_TESS_EVAL] = 5,
+ [PIPE_SHADER_GEOMETRY] = 0,
+ [PIPE_SHADER_FRAGMENT] = 8,
+ [PIPE_SHADER_COMPUTE] = 6,
+};
+static unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = {
+ [PIPE_SHADER_VERTEX] = 0,
+ [PIPE_SHADER_TESS_CTRL] = 0,
+ [PIPE_SHADER_TESS_EVAL] = 0,
+ [PIPE_SHADER_GEOMETRY] = 0,
+ [PIPE_SHADER_FRAGMENT] = 12,
+ [PIPE_SHADER_COMPUTE] = 13,
+};
+
void si_init_all_descriptors(struct si_context *sctx)
{
int i;
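
A back-of-the-envelope check of the tables added above, assuming 4 dwords per const/shader-buffer slot and 16 dwords per combined sampler/image slot (two images packed per slot), matching the element_dw_size values used elsewhere in this patch. The arithmetic is illustrative only; it just shows that the fragment-shader limits stay well inside the 4 KB of GFX9 CE RAM mentioned in the comment:

#include <stdio.h>

int main(void)
{
	/* Fragment-shader limits from the gfx9_max_ce_* tables above. */
	unsigned ce_samplers = 24, ce_images = 2;
	unsigned ce_const_bufs = 8, ce_shader_bufs = 12;

	unsigned buffer_bytes  = (ce_const_bufs + ce_shader_bufs) * 4 * 4;
	unsigned sampler_bytes = (ce_images / 2 + ce_samplers) * 16 * 4;

	/* 320 + 1600 = 1920 bytes for the PS descriptor arrays. */
	printf("PS CE footprint: %u of 4096 bytes\n",
	       buffer_bytes + sampler_bytes);
	return 0;
}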
@@ -2003,23 +2058,37 @@ void si_init_all_descriptors(struct si_context *sctx)
STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
for (i = 0; i < SI_NUM_SHADERS; i++) {
- bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
- i == PIPE_SHADER_TESS_CTRL;
- bool gfx9_gs = sctx->b.chip_class == GFX9 &&
- i == PIPE_SHADER_GEOMETRY;
- /* GFX9 has only 4KB of CE, while previous chips had 32KB.
- * Rarely used descriptors don't use CE RAM.
- */
- bool big_ce = sctx->b.chip_class <= VI;
- bool const_and_shaderbufs_use_ce = big_ce ||
- i == PIPE_SHADER_VERTEX ||
- i == PIPE_SHADER_FRAGMENT;
- bool samplers_and_images_use_ce = big_ce ||
- i == PIPE_SHADER_FRAGMENT;
-
- si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
+ bool gfx9_tcs = false;
+ bool gfx9_gs = false;
+ unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+ unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+
+ unsigned first_sampler_ce_slot = 0;
+ unsigned num_sampler_ce_slots = num_sampler_slots;
+
+ unsigned first_buffer_ce_slot = 0;
+ unsigned num_buffer_ce_slots = num_buffer_slots;
+
+ /* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+ if (sctx->b.chip_class >= GFX9) {
+ gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+ gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+ first_sampler_ce_slot =
+ si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2;
+ num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 +
+ gfx9_max_ce_samplers[i];
+
+ first_buffer_ce_slot =
+ si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1);
+ num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] +
+ gfx9_max_ce_const_buffers[i];
+ }
+
+ si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
si_const_and_shader_buffer_descriptors(sctx, i),
- SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
+ num_buffer_slots,
+ first_buffer_ce_slot, num_buffer_ce_slots,
gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
SI_SGPR_CONST_AND_SHADER_BUFFERS,
@@ -2027,15 +2096,16 @@ void si_init_all_descriptors(struct si_context *sctx)
RADEON_USAGE_READ,
RADEON_PRIO_SHADER_RW_BUFFER,
RADEON_PRIO_CONST_BUFFER,
- const_and_shaderbufs_use_ce ? &ce_offset : NULL);
+ &ce_offset);
struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
- si_init_descriptors(desc,
+ si_init_descriptors(sctx, desc,
gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
SI_SGPR_SAMPLERS_AND_IMAGES,
- 16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
- samplers_and_images_use_ce ? &ce_offset : NULL);
+ 16, num_sampler_slots,
+ first_sampler_ce_slot, num_sampler_ce_slots,
+ &ce_offset);
int j;
for (j = 0; j < SI_NUM_IMAGES; j++)
@@ -2044,9 +2114,10 @@ void si_init_all_descriptors(struct si_context *sctx)
memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
}
- si_init_buffer_resources(&sctx->rw_buffers,
+ si_init_buffer_resources(sctx, &sctx->rw_buffers,
&sctx->descriptors[SI_DESCS_RW_BUFFERS],
- SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+ SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
+ SI_SGPR_RW_BUFFERS,
/* The second set of usage/priority is used by
* const buffers in RW buffer slots. */
RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
@@ -2054,8 +2125,8 @@ void si_init_all_descriptors(struct si_context *sctx)
&ce_offset);
sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
- si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
- 4, SI_NUM_VERTEX_BUFFERS, NULL);
+ si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
+ 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->total_ce_ram_allocated = ce_offset;
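
The window computation in si_init_all_descriptors above relies on the combined-array layout implied by the si_get_shaderbuf_slot/si_get_image_slot helpers: shader buffers (and images) sit in the low slots in reverse order, followed by const buffers (and samplers), so "the first N shader buffers plus the first M const buffers" form one contiguous slot range. An illustrative sketch under that assumption (the SI_NUM_* values and helper definitions are hypothetical, not copied from the driver):

#include <stdio.h>

#define NUM_SHADER_BUFFERS 16 /* hypothetical */
#define NUM_CONST_BUFFERS  16 /* hypothetical */

/* Assumed mapping: shader buffer N lives at combined slot
 * NUM_SHADER_BUFFERS - 1 - N. */
static unsigned shaderbuf_slot(unsigned slot)
{
	return NUM_SHADER_BUFFERS - 1 - slot;
}

int main(void)
{
	unsigned max_ce_shader_buffers = 12, max_ce_const_buffers = 8;

	/* The lowest combined slot that must stay in CE RAM is the one used
	 * by the highest-numbered CE-eligible shader buffer. */
	unsigned first_ce_slot = shaderbuf_slot(max_ce_shader_buffers - 1);
	unsigned num_ce_slots = max_ce_shader_buffers + max_ce_const_buffers;

	printf("CE window: slots [%u, %u) of %u\n", first_ce_slot,
	       first_ce_slot + num_ce_slots,
	       NUM_SHADER_BUFFERS + NUM_CONST_BUFFERS);
	return 0;
}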
@@ -2197,6 +2268,26 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
first + count > desc->first_active_slot + desc->num_active_slots)
sctx->descriptors_dirty |= 1u << desc_idx;
+ /* Enable or disable CE for this descriptor array. */
+ bool used_ce = desc->uses_ce;
+ desc->uses_ce = desc->first_ce_slot <= first &&
+ desc->first_ce_slot + desc->num_ce_slots >= first + count;
+
+ if (desc->uses_ce != used_ce) {
+ /* Upload or dump descriptors if we're disabling or enabling CE,
+ * respectively. */
+ sctx->descriptors_dirty |= 1u << desc_idx;
+
+ /* If we're enabling CE, re-upload all descriptors to CE RAM.
+ * When CE was disabled, uploads to CE RAM stopped.
+ */
+ if (desc->uses_ce) {
+ desc->dirty_mask |=
+ u_bit_consecutive64(desc->first_ce_slot,
+ desc->num_ce_slots);
+ }
+ }
+
desc->first_active_slot = first;
desc->num_active_slots = count;
}
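
A minimal sketch of the toggle added in si_set_active_descriptors above: CE stays enabled only while the active slot range fits entirely inside the CE window, and a disabled-to-enabled transition re-dirties every CE slot so the (now stale) CE RAM copy gets refreshed. Struct and names here are illustrative, not the driver's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct desc_model {
	unsigned first_ce_slot, num_ce_slots;
	bool uses_ce;
	uint64_t dirty_mask;
};

static uint64_t consecutive_bits(unsigned start, unsigned count)
{
	return (count >= 64 ? ~0ull : (1ull << count) - 1) << start;
}

/* Returns true when the CE mode changed, i.e. the array must be re-emitted. */
static bool set_active_range(struct desc_model *d, unsigned first, unsigned count)
{
	bool used_ce = d->uses_ce;

	d->uses_ce = d->first_ce_slot <= first &&
		     first + count <= d->first_ce_slot + d->num_ce_slots;

	if (d->uses_ce && !used_ce) {
		/* Re-enabling CE: mark the whole CE window dirty again. */
		d->dirty_mask |= consecutive_bits(d->first_ce_slot,
						  d->num_ce_slots);
	}
	return d->uses_ce != used_ce;
}

int main(void)
{
	struct desc_model d = { .first_ce_slot = 4, .num_ce_slots = 8,
				.uses_ce = true };

	printf("changed: %d\n", set_active_range(&d, 2, 10)); /* leaves CE */
	printf("changed: %d\n", set_active_range(&d, 4, 6));  /* back to CE */
	printf("mask: 0x%llx\n", (unsigned long long)d.dirty_mask); /* 0xff0 */
	return 0;
}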
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index dfabaa35566..275f830613c 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,13 @@ struct si_descriptors {
/* Offset in CE RAM */
unsigned ce_offset;
+ /* Slots allocated in CE RAM. If we get active slots outside of this
+ * range, direct uploads to memory will be used instead. This basically
+ * governs switching between onchip (CE) and offchip (upload) modes.
+ */
+ unsigned first_ce_slot;
+ unsigned num_ce_slots;
+
/* Slots that are used by currently-bound shaders.
* With CE: It determines which slots are dumped to L2.
* It doesn't skip uploads to CE RAM.