radeonsi: implement streamout shader support

The shader is responsible for writing to streamout buffers using the TBUFFER_STORE_FORMAT_* instructions. The locations of some input SGPRs and VGPRs are assigned dynamically, because the input SGPRs controlling streamout are not declared if they are not needed, decreasing the indices of all following inputs. Reviewed-by: Michel Dänzer <[email protected]>
author: Marek Olšák <[email protected]> 2013-09-01 23:59:06 +0200
committer: Marek Olšák <[email protected]> 2013-09-13 01:04:44 +0200
commit: 8d03d923b6f6816f3771ae3186cd801bfbb98d4c (patch)
tree: 9f6f551d758498a3d9e5687e7404332d9dfdd481 /src/gallium/drivers
parent: 9d16e70b3f95ea632a0442458c0d5027a8b42d36 (diff)
3 files changed, 276 insertions, 15 deletions
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 335cd79b1a8..80dd773c10c 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -34,6 +34,7 @@
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_flow.h"
 #include "radeon_llvm.h"
 #include "radeon_llvm_emit.h"
 #include "util/u_memory.h"
@@ -59,6 +60,11 @@ struct si_shader_context
 	struct tgsi_token * tokens;
 	struct si_pipe_shader *shader;
 	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
+	int param_streamout_config;
+	int param_streamout_write_index;
+	int param_streamout_offset[4];
+	int param_vertex_id;
+	int param_instance_id;
 	LLVMValueRef const_md;
 	LLVMValueRef const_resource;
 #if HAVE_LLVM >= 0x0304
@@ -67,6 +73,7 @@ struct si_shader_context
 	LLVMValueRef *constants;
 	LLVMValueRef *resources;
 	LLVMValueRef *samplers;
+	LLVMValueRef so_buffers[4];
 };
 
 static struct si_shader_context * si_shader_context(
@@ -119,9 +126,12 @@ static LLVMValueRef get_instance_index(
 	struct radeon_llvm_context * radeon_bld,
 	unsigned divisor)
 {
+	struct si_shader_context *si_shader_ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
 	struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm;
 
-	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_INSTANCE_ID);
+	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
+					   si_shader_ctx->param_instance_id);
 	result = LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
 			radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
 
@@ -168,7 +178,8 @@ static void declare_input_vs(
 	} else {
 		/* Load the buffer index, which is always stored in VGPR0
 		 * for Vertex Shaders */
-		buffer_index = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_ID);
+		buffer_index = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+					    si_shader_ctx->param_vertex_id);
 	}
 
 	vec4_type = LLVMVectorType(base->elem_type, 4);
@@ -397,7 +408,8 @@ static void declare_system_value(
 	unsigned index,
 	const struct tgsi_full_declaration *decl)
 {
-
+	struct si_shader_context *si_shader_ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
 	LLVMValueRef value = 0;
 
 	switch (decl->Semantic.Name) {
@@ -406,7 +418,8 @@ static void declare_system_value(
 		break;
 
 	case TGSI_SEMANTIC_VERTEXID:
-		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_VERTEX_ID);
+		value = LLVMGetParam(radeon_bld->main_fn,
+				     si_shader_ctx->param_vertex_id);
 		break;
 
 	default:
@@ -651,6 +664,206 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
 	}
 }
 
+static void si_dump_streamout(struct pipe_stream_output_info *so)
+{
+	unsigned i;
+
+	if (so->num_outputs)
+		fprintf(stderr, "STREAMOUT\n");
+
+	for (i = 0; i < so->num_outputs; i++) {
+		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
+				so->output[i].start_component;
+		fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
+			i, so->output[i].output_buffer,
+			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
+			so->output[i].register_index,
+			mask & 1 ? "x" : "",
+		        mask & 2 ? "y" : "",
+		        mask & 4 ? "z" : "",
+		        mask & 8 ? "w" : "");
+	}
+}
+
+/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
+ * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
+ * or v4i32 (num_channels=3,4). */
+static void build_tbuffer_store(struct si_shader_context *shader,
+				LLVMValueRef rsrc,
+				LLVMValueRef vdata,
+				unsigned num_channels,
+				LLVMValueRef vaddr,
+				LLVMValueRef soffset,
+				unsigned inst_offset,
+				unsigned dfmt,
+				unsigned nfmt,
+				unsigned offen,
+				unsigned idxen,
+				unsigned glc,
+				unsigned slc,
+				unsigned tfe)
+{
+	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
+	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+	LLVMValueRef args[] = {
+		rsrc,
+		vdata,
+		LLVMConstInt(i32, num_channels, 0),
+		vaddr,
+		soffset,
+		LLVMConstInt(i32, inst_offset, 0),
+		LLVMConstInt(i32, dfmt, 0),
+		LLVMConstInt(i32, nfmt, 0),
+		LLVMConstInt(i32, offen, 0),
+		LLVMConstInt(i32, idxen, 0),
+		LLVMConstInt(i32, glc, 0),
+		LLVMConstInt(i32, slc, 0),
+		LLVMConstInt(i32, tfe, 0)
+	};
+
+	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
+	unsigned func = CLAMP(num_channels, 1, 3) - 1;
+	const char *types[] = {"i32", "v2i32", "v4i32"};
+	char name[256];
+	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+
+	lp_build_intrinsic(gallivm->builder, name,
+			   LLVMVoidTypeInContext(gallivm->context),
+			   args, Elements(args));
+}
+
+static void build_streamout_store(struct si_shader_context *shader,
+				  LLVMValueRef rsrc,
+				  LLVMValueRef vdata,
+				  unsigned num_channels,
+				  LLVMValueRef vaddr,
+				  LLVMValueRef soffset,
+				  unsigned inst_offset)
+{
+	static unsigned dfmt[] = {
+		V_008F0C_BUF_DATA_FORMAT_32,
+		V_008F0C_BUF_DATA_FORMAT_32_32,
+		V_008F0C_BUF_DATA_FORMAT_32_32_32,
+		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
+	};
+	assert(num_channels >= 1 && num_channels <= 4);
+
+	build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset,
+			    inst_offset, dfmt[num_channels-1],
+			    V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
+}
+
+/* On SI, the vertex shader is responsible for writing streamout data
+ * to buffers. */
+static void si_llvm_emit_streamout(struct si_shader_context *shader)
+{
+	struct pipe_stream_output_info *so = &shader->shader->selector->so;
+	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	int i, j;
+	struct lp_build_if_state if_ctx;
+
+	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+
+	LLVMValueRef so_param =
+		LLVMGetParam(shader->radeon_bld.main_fn,
+			     shader->param_streamout_config);
+
+	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
+	LLVMValueRef so_vtx_count =
+		LLVMBuildAnd(builder,
+			     LLVMBuildLShr(builder, so_param,
+					   LLVMConstInt(i32, 16, 0), ""),
+			     LLVMConstInt(i32, 127, 0), "");
+
+	LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32,
+					   NULL, 0, LLVMReadNoneAttribute);
+
+	/* can_emit = tid < so_vtx_count; */
+	LLVMValueRef can_emit =
+		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
+
+	/* Emit the streamout code conditionally. This actually avoids
+	 * out-of-bounds buffer access. The hw tells us via the SGPR
+	 * (so_vtx_count) which threads are allowed to emit streamout data. */
+	lp_build_if(&if_ctx, gallivm, can_emit);
+	{
+		/* The buffer offset is computed as follows:
+		 *   ByteOffset = streamout_offset[buffer_id]*4 +
+		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
+		 *                attrib_offset
+                 */
+
+		LLVMValueRef so_write_index =
+			LLVMGetParam(shader->radeon_bld.main_fn,
+				     shader->param_streamout_write_index);
+
+		/* Compute (streamout_write_index + thread_id). */
+		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
+
+		/* Compute the write offset for each enabled buffer. */
+		LLVMValueRef so_write_offset[4] = {};
+		for (i = 0; i < 4; i++) {
+			if (!so->stride[i])
+				continue;
+
+			LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn,
+							      shader->param_streamout_offset[i]);
+			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), "");
+
+			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
+							  LLVMConstInt(i32, so->stride[i]*4, 0), "");
+			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
+		}
+
+		LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS] = shader->radeon_bld.soa.outputs;
+
+		/* Write streamout data. */
+		for (i = 0; i < so->num_outputs; i++) {
+			unsigned buf_idx = so->output[i].output_buffer;
+			unsigned reg = so->output[i].register_index;
+			unsigned start = so->output[i].start_component;
+			unsigned num_comps = so->output[i].num_components;
+			LLVMValueRef out[4];
+
+			assert(num_comps && num_comps <= 4);
+			if (!num_comps || num_comps > 4)
+				continue;
+
+			/* Load the output as int. */
+			for (j = 0; j < num_comps; j++) {
+				out[j] = LLVMBuildLoad(builder, outputs[reg][start+j], "");
+				out[j] = LLVMBuildBitCast(builder, out[j], i32, "");
+			}
+
+			/* Pack the output. */
+			LLVMValueRef vdata = NULL;
+
+			switch (num_comps) {
+			case 1: /* as i32 */
+				vdata = out[0];
+				break;
+			case 2: /* as v2i32 */
+			case 3: /* as v4i32 (aligned to 4) */
+			case 4: /* as v4i32 */
+				vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps)));
+				for (j = 0; j < num_comps; j++) {
+					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
+								       LLVMConstInt(i32, j, 0), "");
+				}
+				break;
+			}
+
+			build_streamout_store(shader, shader->so_buffers[buf_idx],
+					      vdata, num_comps,
+					      so_write_offset[buf_idx],
+					      LLVMConstInt(i32, 0, 0),
+					      so->output[i].dst_offset*4);
+		}
+	}
+	lp_build_endif(&if_ctx);
+}
+
 /* XXX: This is partially implemented for VS only at this point.  It is not complete */
 static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 {
@@ -669,6 +882,10 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 	int depth_index = -1, stencil_index = -1;
 	int i;
 
+	if (si_shader_ctx->shader->selector->so.num_outputs) {
+		si_llvm_emit_streamout(si_shader_ctx);
+	}
+
 	while (!tgsi_parse_end_of_tokens(parse)) {
 		struct tgsi_full_declaration *d =
 					&parse->FullToken.FullDeclaration;
@@ -1366,12 +1583,30 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 		params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_CONST];
 		params[SI_PARAM_SO_BUFFER] = params[SI_PARAM_CONST];
 		params[SI_PARAM_START_INSTANCE] = i32;
-		last_sgpr = SI_PARAM_START_INSTANCE;
-		params[SI_PARAM_VERTEX_ID] = i32;
-		params[SI_PARAM_DUMMY_0] = i32;
-		params[SI_PARAM_DUMMY_1] = i32;
-		params[SI_PARAM_INSTANCE_ID] = i32;
-		num_params = SI_PARAM_INSTANCE_ID+1;
+		num_params = SI_PARAM_START_INSTANCE+1;
+
+		/* The locations of the other parameters are assigned dynamically. */
+
+		/* Streamout SGPRs. */
+		if (si_shader_ctx->shader->selector->so.num_outputs) {
+			params[si_shader_ctx->param_streamout_config = num_params++] = i32;
+			params[si_shader_ctx->param_streamout_write_index = num_params++] = i32;
+		}
+		/* A streamout buffer offset is loaded if the stride is non-zero. */
+		for (i = 0; i < 4; i++) {
+			if (!si_shader_ctx->shader->selector->so.stride[i])
+				continue;
+
+			params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32;
+		}
+
+		last_sgpr = num_params-1;
+
+		/* VGPRs */
+		params[si_shader_ctx->param_vertex_id = num_params++] = i32;
+		params[num_params++] = i32; /* unused*/
+		params[num_params++] = i32; /* unused */
+		params[si_shader_ctx->param_instance_id = num_params++] = i32;
 		break;
 
 	case TGSI_PROCESSOR_FRAGMENT:
@@ -1492,6 +1727,28 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
+static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
+{
+	struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	unsigned i;
+
+	if (!si_shader_ctx->shader->selector->so.num_outputs)
+		return;
+
+	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+					    SI_PARAM_SO_BUFFER);
+
+	/* Load the resources, we rely on the code sinking to do the rest */
+	for (i = 0; i < 4; ++i) {
+		if (si_shader_ctx->shader->selector->so.stride[i]) {
+			LLVMValueRef offset = lp_build_const_int32(gallivm, i);
+
+			si_shader_ctx->so_buffers[i] = build_indexed_load(si_shader_ctx, buf_ptr, offset);
+		}
+	}
+}
+
 int si_compile_llvm(struct r600_context *rctx, struct si_pipe_shader *shader,
 							LLVMModuleRef mod)
 {
@@ -1625,6 +1882,7 @@ int si_pipe_shader_create(
 	create_function(&si_shader_ctx);
 	preload_constants(&si_shader_ctx);
 	preload_samplers(&si_shader_ctx);
+	preload_streamout_buffers(&si_shader_ctx);
 
 	shader->shader.nr_cbufs = rctx->framebuffer.nr_cbufs;
 
@@ -1632,6 +1890,7 @@ int si_pipe_shader_create(
 	 * conversion fails. */
 	if (dump) {
 		tgsi_dump(sel->tokens, 0);
+		si_dump_streamout(&sel->so);
 	}
 
 	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h b/src/gallium/drivers/radeonsi/radeonsi_shader.h
index 64766c97327..1db8bb8d8ee 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -50,10 +50,7 @@
 #define SI_PARAM_VERTEX_BUFFER	3
 #define SI_PARAM_SO_BUFFER	4
 #define SI_PARAM_START_INSTANCE	5
-#define SI_PARAM_VERTEX_ID	6
-#define SI_PARAM_DUMMY_0	7
-#define SI_PARAM_DUMMY_1	8
-#define SI_PARAM_INSTANCE_ID	9
+/* the other VS parameters are assigned dynamically */
 
 /* PS only parameters */
 #define SI_PARAM_PRIM_MASK		3
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 687410c169b..cb5055a57d0 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -104,7 +104,12 @@ static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *s
 		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt));
 	si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
-		       S_00B12C_USER_SGPR(num_user_sgprs));
+		       S_00B12C_USER_SGPR(num_user_sgprs) |
+		       S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
+		       S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
+		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
+		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
+		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
 
 	if (rctx->b.chip_class >= CIK) {
 		si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
author	Marek Olšák <[email protected]>	2013-09-01 23:59:06 +0200
committer	Marek Olšák <[email protected]>	2013-09-13 01:04:44 +0200
commit	8d03d923b6f6816f3771ae3186cd801bfbb98d4c (patch)
tree	9f6f551d758498a3d9e5687e7404332d9dfdd481 /src/gallium/drivers
parent	9d16e70b3f95ea632a0442458c0d5027a8b42d36 (diff)