author     Nicolai Hähnle <[email protected]>   2019-04-01 15:44:39 +0200
committer  Nicolai Hähnle <[email protected]>   2019-05-13 17:07:23 +0200
commit     d814c21b1bea0396c735d65b363a8f2c6324c7d8 (patch)
tree       9fc16e31d42e34c8b039c348ffa436821f47f403 /src/gallium/drivers/radeonsi/si_state.c
parent     8a951c3d2f89f7d070ffb47e5db4351fd4c43401 (diff)
radeonsi: overhaul the vertex fetch fixup mechanism
The overall goal is to support unaligned loads from vertex buffers
natively on SI.

In the unaligned case, we fall back to the general case implementation
in ac_build_opencoded_load_format. Since this function is fully general,
we will also use it going forward for cases requiring fully manual
format conversions of dwords anyway.

This requires a different encoding of the fix_fetch array, which will
now contain the entire format information if a fixup is required.

Having to check the alignment of vertex buffers is awkward. To keep the
impact on the fast path minimal, the si_context will keep track of which
vertex buffers are (not) at least dword-aligned, while the
si_vertex_elements will note which vertex buffers have some (at most
dword) alignment requirement. Vertex buffers should be dword-aligned
most of the time, which allows a fast early-out in almost all cases.

Add the radeonsi_vs_fetch_always_opencode configuration variable for
testing purposes. Note that it can only be used reliably on LLVM >= 9,
because support for byte and short loads is required.

v2:
- add a missing check to si_bind_vertex_elements

Reviewed-by: Marek Olšák <[email protected]>
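For orientation, a minimal sketch of the new fix_fetch encoding follows,
reconstructed from the fields this patch uses (fix_fetch.bits, u.format,
u.log_size, u.num_channels_m1, u.reverse). The authoritative definition
lives in the radeonsi headers; the exact field order and widths below
are assumptions.

#include <stdint.h>

/* Sketch only: field order and widths are assumptions. One byte per
 * vertex element, stored in v->fix_fetch[i] when a fixup is required. */
union si_vs_fix_fetch {
	struct {
		uint8_t log_size : 2;        /* log2 of the channel size in bytes;
		                              * 3 ("64-bit") doubles as a special
		                              * encoding for packed formats */
		uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
		uint8_t format : 3;          /* AC_FETCH_FORMAT_* */
		uint8_t reverse : 1;         /* reverse the channel order */
	} u;
	uint8_t bits;
};

Note how the diff below pairs log_size = 3 with AC_FETCH_FORMAT_FIXED as
the "special encoding" for R11G11B10_FLOAT: presumably, since no 64-bit
fixed-point vertex format exists, that combination is otherwise unused
and free to repurpose.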
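The dword-alignment fast path described above comes down to a single
mask test. A hypothetical helper (illustration only, not the patch's
code; the mask names mirror the fields the patch introduces):

#include <stdbool.h>
#include <stdint.h>

/* vb_alignment_check_mask: buffers for which the bound vertex elements
 * have some (at most dword) alignment requirement.
 * vertex_buffer_unaligned: buffers whose offset or stride is not
 * dword-aligned. Only their intersection needs the slow path. */
static inline bool si_vb_alignment_needs_attention(uint32_t vb_alignment_check_mask,
                                                   uint32_t vertex_buffer_unaligned)
{
	return (vb_alignment_check_mask & vertex_buffer_unaligned) != 0;
}

Since well-behaved applications keep vertex buffers dword-aligned,
vertex_buffer_unaligned is normally 0 and the test falls through
immediately.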
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_state.c')
-rw-r--r--  src/gallium/drivers/radeonsi/si_state.c  237
1 file changed, 149 insertions(+), 88 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 876a993b158..55965bc86a1 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4459,10 +4459,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
for (i = 0; i < count; ++i) {
const struct util_format_description *desc;
const struct util_format_channel_description *channel;
- unsigned data_format, num_format;
int first_non_void;
unsigned vbo_index = elements[i].vertex_buffer_index;
- unsigned char swizzle[4];
if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
FREE(v);
@@ -4489,105 +4487,137 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
desc = util_format_description(elements[i].src_format);
first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
- data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
- num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
- memcpy(swizzle, desc->swizzle, sizeof(swizzle));
v->format_size[i] = desc->block.bits / 8;
v->src_offset[i] = elements[i].src_offset;
v->vertex_buffer_index[i] = vbo_index;
- /* The hardware always treats the 2-bit alpha channel as
- * unsigned, so a shader workaround is needed. The affected
- * chips are VI and older except Stoney (GFX8.1).
- */
- if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
- sscreen->info.chip_class <= VI &&
- sscreen->info.family != CHIP_STONEY) {
- if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
- } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED;
- } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) {
- /* This isn't actually used in OpenGL. */
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT;
- }
- } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED;
- } else if (channel && channel->size == 32 && !channel->pure_integer) {
- if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
- if (channel->normalized) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM;
- } else {
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED;
- }
- } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
- if (channel->normalized) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM;
- } else {
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED;
- }
- }
- } else if (channel && channel->size == 64 &&
- channel->type == UTIL_FORMAT_TYPE_FLOAT) {
- switch (desc->nr_channels) {
- case 1:
- case 2:
- v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X;
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
- swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
- break;
- case 3:
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = PIPE_SWIZZLE_0;
- swizzle[3] = PIPE_SWIZZLE_0;
- break;
- case 4:
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = PIPE_SWIZZLE_Z;
- swizzle[3] = PIPE_SWIZZLE_W;
- break;
- default:
- assert(0);
- }
- } else if (channel && desc->nr_channels == 3) {
- assert(desc->swizzle[0] == PIPE_SWIZZLE_X);
+ bool always_fix = false;
+ union si_vs_fix_fetch fix_fetch;
+ unsigned log_hw_load_size; /* the load element size as seen by the hardware */
- if (channel->size == 8) {
+ fix_fetch.bits = 0;
+ log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
+
+ if (channel) {
+ switch (channel->type) {
+ case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+ case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
+ case UTIL_FORMAT_TYPE_SIGNED: {
if (channel->pure_integer)
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT;
+ fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
else
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_8;
- } else if (channel->size == 16) {
+ fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
+ break;
+ }
+ case UTIL_FORMAT_TYPE_UNSIGNED: {
if (channel->pure_integer)
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT;
+ fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
else
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_16;
+ fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
+ break;
+ }
+ default: unreachable("bad format type");
+ }
+ } else {
+ switch (elements[i].src_format) {
+ case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+ default: unreachable("bad other format");
}
}
- v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
- S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
- S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
- S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
- S_008F0C_NUM_FORMAT(num_format) |
- S_008F0C_DATA_FORMAT(data_format);
+ if (desc->channel[0].size == 10) {
+ fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
+ log_hw_load_size = 2;
+
+ /* The hardware always treats the 2-bit alpha channel as
+ * unsigned, so a shader workaround is needed. The affected
+ * chips are VI and older except Stoney (GFX8.1).
+ */
+ always_fix = sscreen->info.chip_class <= VI &&
+ sscreen->info.family != CHIP_STONEY &&
+ channel->type == UTIL_FORMAT_TYPE_SIGNED;
+ } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ fix_fetch.u.log_size = 3; /* special encoding */
+ fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+ log_hw_load_size = 2;
+ } else {
+ fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+ fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+ /* Always fix up:
+ * - doubles (multiple loads + truncate to float)
+ * - 32-bit requiring a conversion
+ */
+ always_fix =
+ (fix_fetch.u.log_size == 3) ||
+ (fix_fetch.u.log_size == 2 &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+ /* Also fixup 8_8_8 and 16_16_16. */
+ if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
+ always_fix = true;
+ log_hw_load_size = fix_fetch.u.log_size;
+ }
+ }
+
+ if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+ assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+ (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
+ fix_fetch.u.reverse = 1;
+ }
+
+ /* Force the workaround for unaligned access here already if the
+ * offset relative to the vertex buffer base is unaligned.
+ *
+ * There is a theoretical case in which this is too conservative:
+ * if the vertex buffer's offset is also unaligned in just the
+ * right way, we end up with an aligned address after all.
+ * However, this case should be extremely rare in practice (it
+ * won't happen in well-behaved applications), and taking it
+ * into account would complicate the fast path (where everything
+ * is nicely aligned).
+ */
+ bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI;
+ bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+ if (check_alignment &&
+ (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
+ opencode = true;
+
+ if (always_fix || check_alignment || opencode)
+ v->fix_fetch[i] = fix_fetch.bits;
+
+ if (opencode)
+ v->fix_fetch_opencode |= 1 << i;
+ if (opencode || always_fix)
+ v->fix_fetch_always |= 1 << i;
+
+ if (check_alignment && !opencode) {
+ assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+ v->fix_fetch_unaligned |= 1 << i;
+ v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+ v->vb_alignment_check_mask |= 1 << vbo_index;
+ }
+
+ v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+ S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+ S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+ S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+ unsigned data_format, num_format;
+ data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
+ num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
+ v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
+ S_008F0C_DATA_FORMAT(data_format);
}
if (v->instance_divisor_is_fetched) {
@@ -4621,7 +4651,17 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
(!old ||
old->count != v->count ||
old->uses_instance_divisors != v->uses_instance_divisors ||
- v->uses_instance_divisors || /* we don't check which divisors changed */
+ /* we don't check which divisors changed */
+ v->uses_instance_divisors ||
+ (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
+ ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
+ memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
+ sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+ /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+ * functions of fix_fetch and the src_offset alignment.
+ * If they change and fix_fetch doesn't, it must be due to different
+ * src_offset alignment, which is reflected in fix_fetch_opencode. */
+ old->fix_fetch_opencode != v->fix_fetch_opencode ||
memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
sctx->do_update_shaders = true;
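To see why comparing fix_fetch_opencode (together with the masked XOR
above) is enough, consider a concrete case; this is a standalone demo,
not driver code:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t old_check = 0x1;  /* old elements checked buffer 0 */
	uint32_t new_check = 0x2;  /* new elements check buffer 1 instead */
	uint32_t unaligned = 0x2;  /* buffer 1 is not dword-aligned */

	/* Buffer 1 is unaligned and only the new state checks it, so the
	 * changed bit survives the mask and shaders must be updated. */
	assert(((old_check ^ new_check) & unaligned) == 0x2);

	/* With all buffers dword-aligned (unaligned == 0), the same element
	 * change is masked out and no shader update is triggered. */
	assert(((old_check ^ new_check) & 0u) == 0);
	return 0;
}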
@@ -4653,6 +4693,8 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
{
struct si_context *sctx = (struct si_context *)ctx;
struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+ uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+ uint32_t unaligned = orig_unaligned;
int i;
assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
@@ -4666,6 +4708,11 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
pipe_resource_reference(&dsti->buffer.resource, buf);
dsti->buffer_offset = src->buffer_offset;
dsti->stride = src->stride;
+ if (dsti->buffer_offset & 3 || dsti->stride & 3)
+ unaligned |= 1 << (start_slot + i);
+ else
+ unaligned &= ~(1 << (start_slot + i));
+
si_context_add_resource_size(sctx, buf);
if (buf)
si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
@@ -4674,8 +4721,22 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
for (i = 0; i < count; i++) {
pipe_resource_reference(&dst[i].buffer.resource, NULL);
}
+ unaligned &= ~u_bit_consecutive(start_slot, count);
}
sctx->vertex_buffers_dirty = true;
+ sctx->vertex_buffer_unaligned = unaligned;
+
+ /* Check whether alignment may have changed in a way that requires
+ * shader changes. This check is conservative: a vertex buffer can only
+ * trigger a shader change if the misalignment amount changes (e.g.
+ * from byte-aligned to short-aligned), but we only keep track of
+ * whether buffers are at least dword-aligned, since that should always
+ * be the case in well-behaved applications anyway.
+ */
+ if (sctx->vertex_elements &&
+ (sctx->vertex_elements->vb_alignment_check_mask &
+ (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count)))
+ sctx->do_update_shaders = true;
}
/*
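The per-slot bookkeeping in si_set_vertex_buffers reduces to a simple
classification; a hypothetical restatement (not driver code):

#include <stdbool.h>

/* A slot counts as unaligned if either its buffer offset or its stride
 * is not a multiple of 4 bytes (a dword). */
static inline bool si_vb_slot_is_unaligned(unsigned buffer_offset, unsigned stride)
{
	return (buffer_offset & 3) != 0 || (stride & 3) != 0;
}

When buffers are unbound, u_bit_consecutive(start_slot, count), Mesa's
util helper that yields count consecutive set bits starting at bit
start_slot, clears the affected slots from the mask in one operation.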