From f8e16010e51eef19ed7030ac7248438f729ae511 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel.daenzer@amd.com>
Date: Tue, 28 Jan 2014 15:39:30 +0900
Subject: radeonsi: Put GS ring buffer descriptors with streamout buffer
 descriptors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also mark the constant buffers as read-only for the GPU again.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 93 ++++++++++++++++-----------
 src/gallium/drivers/radeonsi/si_pipe.h        |  6 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 22 ++++---
 src/gallium/drivers/radeonsi/si_shader.h      | 72 +++++++++++----------
 src/gallium/drivers/radeonsi/si_state.h       |  6 +-
 5 files changed, 115 insertions(+), 84 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2a54fcb54fc..9078c6c7f3e 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -516,7 +516,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			unsigned element_size, unsigned index_stride)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
+	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
 
 	if (shader >= SI_NUM_SHADERS)
 		return;
@@ -608,9 +608,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 				     unsigned append_bitmask)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_buffer_resources *buffers = &sctx->streamout_buffers;
+	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
 	unsigned old_num_targets = sctx->b.streamout.num_targets;
-	unsigned i;
+	unsigned i, bufidx;
 
 	/* Streamout buffers must be bound in 2 places:
 	 * 1) in VGT by setting the VGT_STRMOUT registers
@@ -622,12 +622,14 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 
 	/* Set the shader resources.*/
 	for (i = 0; i < num_targets; i++) {
+		bufidx = SI_RW_SO + i;
+
 		if (targets[i]) {
 			struct pipe_resource *buffer = targets[i]->buffer;
 			uint64_t va = r600_resource_va(ctx->screen, buffer);
 
 			/* Set the descriptor. */
-			uint32_t *desc = buffers->desc_data[i];
+			uint32_t *desc = buffers->desc_data[bufidx];
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
 			desc[2] = 0xffffffff;
@@ -637,25 +639,29 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
 
 			/* Set the resource. */
-			pipe_resource_reference(&buffers->buffers[i], buffer);
+			pipe_resource_reference(&buffers->buffers[bufidx],
+						buffer);
 			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 					      (struct r600_resource*)buffer,
 					      buffers->shader_usage);
-			buffers->desc.enabled_mask |= 1 << i;
+			buffers->desc.enabled_mask |= 1 << bufidx;
 		} else {
 			/* Clear the descriptor and unset the resource. */
-			memset(buffers->desc_data[i], 0, sizeof(uint32_t) * 4);
-			pipe_resource_reference(&buffers->buffers[i], NULL);
-			buffers->desc.enabled_mask &= ~(1 << i);
+			memset(buffers->desc_data[bufidx], 0,
+			       sizeof(uint32_t) * 4);
+			pipe_resource_reference(&buffers->buffers[bufidx],
+						NULL);
+			buffers->desc.enabled_mask &= ~(1 << bufidx);
 		}
-		buffers->desc.dirty_mask |= 1 << i;
+		buffers->desc.dirty_mask |= 1 << bufidx;
 	}
 	for (; i < old_num_targets; i++) {
+		bufidx = SI_RW_SO + i;
 		/* Clear the descriptor and unset the resource. */
-		memset(buffers->desc_data[i], 0, sizeof(uint32_t) * 4);
-		pipe_resource_reference(&buffers->buffers[i], NULL);
-		buffers->desc.enabled_mask &= ~(1 << i);
-		buffers->desc.dirty_mask |= 1 << i;
+		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
+		buffers->desc.enabled_mask &= ~(1 << bufidx);
+		buffers->desc.dirty_mask |= 1 << bufidx;
 	}
 
 	si_update_descriptors(sctx, &buffers->desc);
@@ -712,25 +718,37 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Vertex buffers. */
 	/* Nothing to do. Vertex buffer bindings are updated before every draw call. */
 
-	/* Streamout buffers. */
-	for (i = 0; i < sctx->streamout_buffers.num_buffers; i++) {
-		if (sctx->streamout_buffers.buffers[i] == buf) {
-			/* Update the descriptor. */
-			si_desc_reset_buffer_offset(ctx, sctx->streamout_buffers.desc_data[i],
-						    old_va, buf);
+	/* Read/Write buffers. */
+	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
+		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
+		bool found = false;
+		uint32_t mask = buffers->desc.enabled_mask;
 
-			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
-					      (struct r600_resource*)buf,
-					      sctx->streamout_buffers.shader_usage);
-			sctx->streamout_buffers.desc.dirty_mask |= 1 << i;
-			si_update_descriptors(sctx, &sctx->streamout_buffers.desc);
-
-			/* Update the streamout state. */
-			if (sctx->b.streamout.begin_emitted) {
-				r600_emit_streamout_end(&sctx->b);
+		while (mask) {
+			i = u_bit_scan(&mask);
+			if (buffers->buffers[i] == buf) {
+				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+							    old_va, buf);
+
+				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+						      rbuffer, buffers->shader_usage);
+
+				buffers->desc.dirty_mask |= 1 << i;
+				found = true;
+
+				if (i >= SI_RW_SO && shader == PIPE_SHADER_VERTEX) {
+					/* Update the streamout state. */
+					if (sctx->b.streamout.begin_emitted) {
+						r600_emit_streamout_end(&sctx->b);
+					}
+					sctx->b.streamout.append_bitmask =
+						sctx->b.streamout.enabled_mask;
+					r600_streamout_buffers_dirty(&sctx->b);
+				}
 			}
-			sctx->b.streamout.append_bitmask = sctx->b.streamout.enabled_mask;
-			r600_streamout_buffers_dirty(&sctx->b);
+		}
+		if (found) {
+			si_update_descriptors(sctx, &buffers->desc);
 		}
 	}
 
@@ -936,17 +954,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
 					 NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
+					 RADEON_USAGE_READ);
+		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
+					 i == PIPE_SHADER_VERTEX ?
+					 SI_RW_SO + 4 : SI_RW_SO,
+					 i, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE);
 
 		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);
 
 		sctx->atoms.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
+		sctx->atoms.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
 		sctx->atoms.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
 	}
 
-	si_init_buffer_resources(sctx, &sctx->streamout_buffers, 4, PIPE_SHADER_VERTEX,
-				 SI_SGPR_SO_BUFFER, RADEON_USAGE_WRITE);
-	sctx->atoms.streamout_buffers = &sctx->streamout_buffers.desc.atom;
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -961,9 +982,9 @@ void si_release_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_release_buffer_resources(&sctx->const_buffers[i]);
+		si_release_buffer_resources(&sctx->rw_buffers[i]);
 		si_release_sampler_views(&sctx->samplers[i].views);
 	}
-	si_release_buffer_resources(&sctx->streamout_buffers);
 }
 
 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@@ -972,7 +993,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
+		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
 	}
-	si_buffer_resources_begin_new_cs(sctx, &sctx->streamout_buffers);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d63a52b0aff..f97feb0464c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -78,6 +78,8 @@ struct si_surface {
 
 #define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
 
+#define SI_RW_SO 2 /* Streamout buffer descriptors after ring buffers */
+
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -93,8 +95,8 @@ struct si_context {
 		struct {
 			/* The order matters. */
 			struct r600_atom *const_buffers[SI_NUM_SHADERS];
+			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
 			struct r600_atom *sampler_views[SI_NUM_SHADERS];
-			struct r600_atom *streamout_buffers;
 			/* Caches must be flushed after resource descriptors are
 			 * updated in memory. */
 			struct r600_atom *cache_flush;
@@ -120,7 +122,7 @@ struct si_context {
 	unsigned			sprite_coord_enable;
 	unsigned			export_16bpc;
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
-	struct si_buffer_resources	streamout_buffers;
+	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
 	struct si_textures_info	samplers[SI_NUM_SHADERS];
 	struct r600_resource		*border_color_table;
 	unsigned			border_color_offset;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5b95c11580f..54270cdb733 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -319,7 +319,8 @@ static LLVMValueRef fetch_input_gs(
 				      4);
 
 	/* Load the ESGS ring resource descriptor */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
 	t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
 				    lp_build_const_int32(gallivm, SI_RING_ESGS));
 
@@ -1202,7 +1203,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	}
 
 	/* Load the ESGS ring resource descriptor */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
 	t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
 				    lp_build_const_int32(gallivm, SI_RING_ESGS));
 
@@ -1910,7 +1912,8 @@ static void si_llvm_emit_vertex(
 	int i;
 
 	/* Load the GSVS ring resource descriptor */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
 	t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
 				    lp_build_const_int32(gallivm, SI_RING_GSVS));
 
@@ -2038,7 +2041,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	struct si_pipe_shader *shader = si_shader_ctx->shader;
-	LLVMTypeRef params[21], f32, i8, i32, v2i32, v3i32;
+	LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32;
 	unsigned i, last_sgpr, num_params;
 
 	i8 = LLVMInt8TypeInContext(gallivm->context);
@@ -2049,6 +2052,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 	params[SI_PARAM_CONST] = LLVMPointerType(
 		LLVMArrayType(LLVMVectorType(i8, 16), NUM_CONST_BUFFERS), CONST_ADDR_SPACE);
+	params[SI_PARAM_RW_BUFFERS] = params[SI_PARAM_CONST];
+
 	/* We assume at most 16 textures per program at the moment.
 	 * This need probably need to be changed to support bindless textures */
 	params[SI_PARAM_SAMPLER] = LLVMPointerType(
@@ -2059,7 +2064,6 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	switch (si_shader_ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
 		params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_CONST];
-		params[SI_PARAM_SO_BUFFER] = params[SI_PARAM_CONST];
 		params[SI_PARAM_START_INSTANCE] = i32;
 		num_params = SI_PARAM_START_INSTANCE+1;
 		if (shader->key.vs.as_es) {
@@ -2257,12 +2261,13 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
 		return;
 
 	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    SI_PARAM_SO_BUFFER);
+					    SI_PARAM_RW_BUFFERS);
 
 	/* Load the resources, we rely on the code sinking to do the rest */
 	for (i = 0; i < 4; ++i) {
 		if (si_shader_ctx->shader->selector->so.stride[i]) {
-			LLVMValueRef offset = lp_build_const_int32(gallivm, i);
+			LLVMValueRef offset = lp_build_const_int32(gallivm,
+								   SI_RW_SO + i);
 
 			si_shader_ctx->so_buffers[i] = build_indexed_load(si_shader_ctx, buf_ptr, offset);
 		}
@@ -2371,7 +2376,8 @@ static int si_generate_gs_copy_shader(struct si_context *sctx,
 	preload_streamout_buffers(si_shader_ctx);
 
 	/* Load the GSVS ring resource descriptor */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
 	t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
 				    lp_build_const_int32(gallivm, SI_RING_GSVS));
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 63c19ecaeef..d667baf402d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -34,23 +34,23 @@
 #define SI_SGPR_CONST		0
 #define SI_SGPR_SAMPLER		2
 #define SI_SGPR_RESOURCE	4
-#define SI_SGPR_VERTEX_BUFFER	6  /* VS only */
-#define SI_SGPR_SO_BUFFER	8  /* VS only, stream-out */
+#define SI_SGPR_RW_BUFFERS	6  /* rings (& stream-out, VS only) */
+#define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
 #define SI_SGPR_START_INSTANCE	10 /* VS only */
-#define SI_SGPR_ALPHA_REF	6  /* PS only */
+#define SI_SGPR_ALPHA_REF	8  /* PS only */
 
 #define SI_VS_NUM_USER_SGPR	11
-#define SI_GS_NUM_USER_SGPR	6
-#define SI_PS_NUM_USER_SGPR	7
+#define SI_GS_NUM_USER_SGPR	8
+#define SI_PS_NUM_USER_SGPR	9
 
 /* LLVM function parameter indices */
 #define SI_PARAM_CONST		0
 #define SI_PARAM_SAMPLER	1
 #define SI_PARAM_RESOURCE	2
+#define SI_PARAM_RW_BUFFERS	3
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFER	3
-#define SI_PARAM_SO_BUFFER	4
+#define SI_PARAM_VERTEX_BUFFER	4
 #define SI_PARAM_START_INSTANCE	5
 /* the other VS parameters are assigned dynamically */
 
@@ -58,36 +58,38 @@
 #define SI_PARAM_ES2GS_OFFSET	6
 
 /* GS only parameters */
-#define SI_PARAM_GS2VS_OFFSET	3
-#define SI_PARAM_GS_WAVE_ID	4
-#define SI_PARAM_VTX0_OFFSET	5
-#define SI_PARAM_VTX1_OFFSET	6
-#define SI_PARAM_PRIMITIVE_ID	7
-#define SI_PARAM_VTX2_OFFSET	8
-#define SI_PARAM_VTX3_OFFSET	9
-#define SI_PARAM_VTX4_OFFSET	10
-#define SI_PARAM_VTX5_OFFSET	11
-#define SI_PARAM_GS_INSTANCE_ID	12
+#define SI_PARAM_GS2VS_OFFSET	4
+#define SI_PARAM_GS_WAVE_ID	5
+#define SI_PARAM_VTX0_OFFSET	6
+#define SI_PARAM_VTX1_OFFSET	7
+#define SI_PARAM_PRIMITIVE_ID	8
+#define SI_PARAM_VTX2_OFFSET	9
+#define SI_PARAM_VTX3_OFFSET	10
+#define SI_PARAM_VTX4_OFFSET	11
+#define SI_PARAM_VTX5_OFFSET	12
+#define SI_PARAM_GS_INSTANCE_ID	13
 
 /* PS only parameters */
-#define SI_PARAM_ALPHA_REF		3
-#define SI_PARAM_PRIM_MASK		4
-#define SI_PARAM_PERSP_SAMPLE		5
-#define SI_PARAM_PERSP_CENTER		6
-#define SI_PARAM_PERSP_CENTROID		7
-#define SI_PARAM_PERSP_PULL_MODEL	8
-#define SI_PARAM_LINEAR_SAMPLE		9
-#define SI_PARAM_LINEAR_CENTER		10
-#define SI_PARAM_LINEAR_CENTROID	11
-#define SI_PARAM_LINE_STIPPLE_TEX	12
-#define SI_PARAM_POS_X_FLOAT		13
-#define SI_PARAM_POS_Y_FLOAT		14
-#define SI_PARAM_POS_Z_FLOAT		15
-#define SI_PARAM_POS_W_FLOAT		16
-#define SI_PARAM_FRONT_FACE		17
-#define SI_PARAM_ANCILLARY		18
-#define SI_PARAM_SAMPLE_COVERAGE	19
-#define SI_PARAM_POS_FIXED_PT		20
+#define SI_PARAM_ALPHA_REF		4
+#define SI_PARAM_PRIM_MASK		5
+#define SI_PARAM_PERSP_SAMPLE		6
+#define SI_PARAM_PERSP_CENTER		7
+#define SI_PARAM_PERSP_CENTROID		8
+#define SI_PARAM_PERSP_PULL_MODEL	9
+#define SI_PARAM_LINEAR_SAMPLE		10
+#define SI_PARAM_LINEAR_CENTER		11
+#define SI_PARAM_LINEAR_CENTROID	12
+#define SI_PARAM_LINE_STIPPLE_TEX	13
+#define SI_PARAM_POS_X_FLOAT		14
+#define SI_PARAM_POS_Y_FLOAT		15
+#define SI_PARAM_POS_Z_FLOAT		16
+#define SI_PARAM_POS_W_FLOAT		17
+#define SI_PARAM_FRONT_FACE		18
+#define SI_PARAM_ANCILLARY		19
+#define SI_PARAM_SAMPLE_COVERAGE	20
+#define SI_PARAM_POS_FIXED_PT		21
+
+#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 1)
 
 struct si_shader_input {
 	unsigned		name;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f7082f5df43..6922c88e07c 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -115,10 +115,10 @@ union si_state {
 #define NUM_SAMPLER_STATES	NUM_TEX_UNITS
 
 #define NUM_PIPE_CONST_BUFFERS	16
-#define SI_RING_ESGS		17
-#define SI_RING_GSVS		18
-#define NUM_CONST_BUFFERS	(SI_RING_GSVS + 1)
+#define NUM_CONST_BUFFERS	(NUM_PIPE_CONST_BUFFERS + 1)
 
+#define SI_RING_ESGS		0
+#define SI_RING_GSVS		1
 
 /* This represents resource descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
-- 
cgit v1.2.3