radeonsi: pack GS output components for each vertex stream contiguously

Note that the memory layout of one vertex stream inside one "item" (= memory
written by one GS wave) on the GSVS ring is:

  t0v0c0 ... t15v0c0 t0v1c0 ... t15v1c0 ... t0vLc0 ... t15vLc0
  t0v0c1 ... t15v0c1 t0v1c1 ... t15v1c1 ... t0vLc1 ... t15vLc1
  ...
  t0v0cL ... t15v0cL t0v1cL ... t15v1cL ... t0vLcL ... t15vLcL
  t16v0c0 ... t31v0c0 t16v1c0 ... t31v1c0 ... t16vLc0 ... t31vLc0
  t16v0c1 ... t31v0c1 t16v1c1 ... t31v1c1 ... t16vLc1 ... t31vLc1
  ...
  t16v0cL ... t31v0cL t16v1cL ... t31v1cL ... t16vLcL ... t31vLcL
  ...
  t48v0c0 ... t63v0c0 t48v1c0 ... t63v1c0 ... t48vLc0 ... t63vLc0
  t48v0c1 ... t63v0c1 t48v1c1 ... t63v1c1 ... t48vLc1 ... t63vLc1
  ...
  t48v0cL ... t63v0cL t48v1cL ... t63v1cL ... t48vLcL ... t63vLcL

where tNN indicates the thread number, vNN the vertex number (in the order of
EMIT_VERTEX), and cNN the output component (vL and cL are the last vertex and
component, respectively).

The vertex streams are laid out sequentially.

The swizzling by 16 threads is hard-coded in the way the VGT generates the
offset passed into the GS copy shader, and the jump every 16 threads is
calculated from VGT_GSVS_RING_OFFSET_n and VGT_GSVS_RING_ITEMSIZE in a way
that makes it difficult to deviate from this layout (at least that's what I've
experimentally confirmed on VI after first trying to go the simpler route of
just interleaving the vertex streams).

Reviewed-by: Marek Olšák <[email protected]>
author: Nicolai Hähnle <[email protected]> 2016-11-30 11:33:25 +0100
committer: Nicolai Hähnle <[email protected]> 2016-12-12 09:05:00 +0100
commit: 18616e7551fcecb9445597d78446df6e1df98fbb (patch)
tree: 25a4d39b967b380fd1e789b3f2ef2a2d13fde1d6 /src/gallium/drivers
parent: edf034ac142f2ae10befdf331b170373ff456495 (diff)
1 files changed, 8 insertions, 3 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 7324eb38a1c..48ccd83b396 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5277,7 +5277,7 @@ static void si_llvm_emit_vertex(
 	LLVMValueRef gs_next_vertex;
 	LLVMValueRef can_emit, kill;
 	LLVMValueRef args[2];
-	unsigned chan;
+	unsigned chan, offset;
 	int i;
 	unsigned stream;
 
@@ -5312,6 +5312,7 @@ static void si_llvm_emit_vertex(
 		lp_build_if(&if_state, gallivm, can_emit);
 	}
 
+	offset = 0;
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
 			ctx->soa.outputs[i];
@@ -5323,8 +5324,9 @@ static void si_llvm_emit_vertex(
 
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
 			LLVMValueRef voffset =
-				lp_build_const_int32(gallivm, (i * 4 + chan) *
+				lp_build_const_int32(gallivm, offset *
 						     shader->selector->gs_max_out_vertices);
+			offset++;
 
 			voffset = lp_build_add(uint, voffset, gs_next_vertex);
 			voffset = lp_build_mul_imm(uint, voffset, 4);
@@ -6419,6 +6421,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	for (int stream = 0; stream < 4; stream++) {
 		LLVMBasicBlockRef bb;
+		unsigned offset;
 
 		if (!gsinfo->num_stream_output_components[stream])
 			continue;
@@ -6431,6 +6434,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 		LLVMPositionBuilderAtEnd(builder, bb);
 
 		/* Fetch vertex data from GSVS ring */
+		offset = 0;
 		for (i = 0; i < gsinfo->num_outputs; ++i) {
 			for (unsigned chan = 0; chan < 4; chan++) {
 				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
@@ -6441,7 +6445,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 				args[2] = lp_build_const_int32(
 					gallivm,
-					(i * 4 + chan) * gs_selector->gs_max_out_vertices * 16 * 4);
+					offset * gs_selector->gs_max_out_vertices * 16 * 4);
+				offset++;
 
 				outputs[i].values[chan] =
 					LLVMBuildBitCast(gallivm->builder,
author	Nicolai Hähnle <[email protected]>	2016-11-30 11:33:25 +0100
committer	Nicolai Hähnle <[email protected]>	2016-12-12 09:05:00 +0100
commit	18616e7551fcecb9445597d78446df6e1df98fbb (patch)
tree	25a4d39b967b380fd1e789b3f2ef2a2d13fde1d6 /src/gallium/drivers
parent	edf034ac142f2ae10befdf331b170373ff456495 (diff)