2 files changed, 134 insertions, 36 deletions
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5086a33969a..61eb2f10be2 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -95,10 +95,13 @@ static uint32_t null_image_descriptor[8] = {
 	 * descriptor */
 };
 
-static void si_init_descriptors(struct si_descriptors *desc,
+static void si_init_descriptors(struct si_context *sctx,
+				struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
 				unsigned num_elements,
+				unsigned first_ce_slot,
+				unsigned num_ce_slots,
 				unsigned *ce_offset)
 {
 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
@@ -106,14 +109,16 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
+	desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
+	desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
 	desc->dirty_mask = u_bit_consecutive64(0, num_elements);
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	if (ce_offset) {
+	if (desc->num_ce_slots) {
 		desc->uses_ce = true;
 		desc->ce_offset = *ce_offset;
 
-		*ce_offset += element_dw_size * num_elements * 4;
+		*ce_offset += element_dw_size * desc->num_ce_slots * 4;
 	}
 }
 
@@ -205,13 +210,16 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!upload_size)
 		return true;
 
-	if (sctx->ce_ib && desc->uses_ce) {
-		uint32_t const* list = (uint32_t const*)desc->list;
+	if (desc->uses_ce) {
+		const uint32_t *list = desc->list +
+				       desc->first_ce_slot * desc->element_dw_size;
+		uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
+				u_bit_consecutive64(0, desc->num_ce_slots);
 
-		while(desc->dirty_mask) {
+
+		while (mask) {
 			int begin, count;
-			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
-						       &count);
+			u_bit_scan_consecutive_range64(&mask, &begin, &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
@@ -222,7 +230,9 @@ static bool si_upload_descriptors(struct si_context *sctx,
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+		if (!si_ce_upload(sctx,
+				  desc->ce_offset +
+				  (first_slot_offset - desc->first_ce_slot * slot_size),
 				  upload_size, (unsigned*)&desc->buffer_offset,
 				  &desc->buffer))
 			return false;
@@ -920,9 +930,12 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
 
 /* BUFFER RESOURCES */
 
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+				     struct si_buffer_resources *buffers,
 				     struct si_descriptors *descs,
 				     unsigned num_buffers,
+				     unsigned first_ce_slot,
+				     unsigned num_ce_slots,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_usage shader_usage_constbuf,
@@ -936,8 +949,8 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->priority_constbuf = priority_constbuf;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
-	si_init_descriptors(descs, shader_userdata_index, 4,
-			    num_buffers, ce_offset);
+	si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
+			    first_ce_slot, num_ce_slots, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
@@ -1994,6 +2007,48 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
 
 /* INIT/DEINIT/UPLOAD */
 
+/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static const unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = { /* GFX9 CE RAM sampler slot limit per shader stage */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 1,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 24,
+	[PIPE_SHADER_COMPUTE] = 16,
+};
+static const unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = {
+	/* GFX9 CE RAM image slot limit per shader stage; these must be even due to slot alignment */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 2,
+	[PIPE_SHADER_COMPUTE] = 8,
+};
+static const unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = { /* GFX9 CE RAM constant buffer slot limit per shader stage */
+	[PIPE_SHADER_VERTEX] = 9,
+	[PIPE_SHADER_TESS_CTRL] = 3,
+	[PIPE_SHADER_TESS_EVAL] = 5,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 8,
+	[PIPE_SHADER_COMPUTE] = 6,
+};
+static const unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = { /* GFX9 CE RAM shader buffer slot limit per shader stage */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 12,
+	[PIPE_SHADER_COMPUTE] = 13,
+};
+
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
@@ -2003,23 +2058,37 @@ void si_init_all_descriptors(struct si_context *sctx)
 	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
-				i == PIPE_SHADER_TESS_CTRL;
-		bool gfx9_gs = sctx->b.chip_class == GFX9 &&
-			       i == PIPE_SHADER_GEOMETRY;
-		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
-		 * Rarely used descriptors don't use CE RAM.
-		 */
-		bool big_ce = sctx->b.chip_class <= VI;
-		bool const_and_shaderbufs_use_ce = big_ce ||
-						   i == PIPE_SHADER_VERTEX ||
-						   i == PIPE_SHADER_FRAGMENT;
-		bool samplers_and_images_use_ce = big_ce ||
-						  i == PIPE_SHADER_FRAGMENT;
-
-		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
+		bool gfx9_tcs = false;
+		bool gfx9_gs = false;
+		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+
+		unsigned first_sampler_ce_slot = 0;
+		unsigned num_sampler_ce_slots = num_sampler_slots;
+
+		unsigned first_buffer_ce_slot = 0;
+		unsigned num_buffer_ce_slots = num_buffer_slots;
+
+		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+		if (sctx->b.chip_class >= GFX9) {
+			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+			first_sampler_ce_slot =
+				si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2;
+			num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 +
+					       gfx9_max_ce_samplers[i];
+
+			first_buffer_ce_slot =
+				si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1);
+			num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] +
+					      gfx9_max_ce_const_buffers[i];
+		}
+
+		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
 					 si_const_and_shader_buffer_descriptors(sctx, i),
-					 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
+					 num_buffer_slots,
+					 first_buffer_ce_slot, num_buffer_ce_slots,
 					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
 					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
 						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
@@ -2027,15 +2096,16 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READ,
 					 RADEON_PRIO_SHADER_RW_BUFFER,
 					 RADEON_PRIO_CONST_BUFFER,
-					 const_and_shaderbufs_use_ce ? &ce_offset : NULL);
+					 &ce_offset);
 
 		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
-		si_init_descriptors(desc,
+		si_init_descriptors(sctx, desc,
 				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
 				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
 					      SI_SGPR_SAMPLERS_AND_IMAGES,
-				    16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
-				    samplers_and_images_use_ce ? &ce_offset : NULL);
+				    16, num_sampler_slots,
+				    first_sampler_ce_slot, num_sampler_ce_slots,
+				    &ce_offset);
 
 		int j;
 		for (j = 0; j < SI_NUM_IMAGES; j++)
@@ -2044,9 +2114,10 @@ void si_init_all_descriptors(struct si_context *sctx)
 			memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
 	}
 
-	si_init_buffer_resources(&sctx->rw_buffers,
+	si_init_buffer_resources(sctx, &sctx->rw_buffers,
 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+				 SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
+				 SI_SGPR_RW_BUFFERS,
 				 /* The second set of usage/priority is used by
 				  * const buffers in RW buffer slots. */
 				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
@@ -2054,8 +2125,8 @@ void si_init_all_descriptors(struct si_context *sctx)
 				 &ce_offset);
 	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
+			    4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 	sctx->total_ce_ram_allocated = ce_offset;
@@ -2197,6 +2268,26 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
 	    first + count > desc->first_active_slot + desc->num_active_slots)
 		sctx->descriptors_dirty |= 1u << desc_idx;
 
+	/* Enable or disable CE for this descriptor array. */
+	bool used_ce = desc->uses_ce;
+	desc->uses_ce = desc->first_ce_slot <= first &&
+			desc->first_ce_slot + desc->num_ce_slots >= first + count;
+
+	if (desc->uses_ce != used_ce) {
+		/* Upload or dump descriptors if we're disabling or enabling CE,
+		 * respectively. */
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+		/* If we're enabling CE, re-upload all descriptors to CE RAM.
+		 * When CE was disabled, uploads to CE RAM stopped.
+		 */
+		if (desc->uses_ce) {
+			desc->dirty_mask |=
+				u_bit_consecutive64(desc->first_ce_slot,
+						    desc->num_ce_slots);
+		}
+	}
+
 	desc->first_active_slot = first;
 	desc->num_active_slots = count;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index dfabaa35566..275f830613c 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,13 @@ struct si_descriptors {
 	/* Offset in CE RAM */
 	unsigned ce_offset;
 
+	/* Slots allocated in CE RAM. If we get active slots outside of this
+	 * range, direct uploads to memory will be used instead. This basically
+	 * governs switching between onchip (CE) and offchip (upload) modes.
+	 */
+	unsigned first_ce_slot;
+	unsigned num_ce_slots;
+
 	/* Slots that are used by currently-bound shaders.
 	 * With CE: It determines which slots are dumped to L2.
 	 *          It doesn't skip uploads to CE RAM.