radeonsi: add support for displayable DCC for multi-RB chips

A compute shader is used to reorder DCC data from aligned to unaligned.
author: Marek Olšák <[email protected]> 2019-01-04 19:39:01 -0500
committer: Marek Olšák <[email protected]> 2019-04-04 09:53:24 -0400
commit: 1f21396431a03dc4e5a542628d7d8370973c967f (patch)
tree: 70cdc799793f64a8ee03987e9c93c1b099f30e8c /src/gallium/drivers/radeonsi/si_compute_blit.c
parent: 2c09eb41221eb704e9e7a21654828173158d1a7d (diff)
1 files changed, 78 insertions, 0 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 61bef999357..c8513557b1e 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -416,6 +416,84 @@ void si_compute_copy_image(struct si_context *sctx,
 	si_compute_internal_end(sctx);
 }
 
+/**
+ * Reorder DCC data from the aligned layout to the unaligned (displayable)
+ * layout using a compute shader.
+ *
+ * The retile map, the source DCC and the displayable DCC are bound as three
+ * buffer images that all point into tex->buffer at their respective offsets.
+ * The compute shader and compute image slots 0..2 that were bound on entry
+ * are rebound before returning.
+ *
+ * \param sctx  context used to emit the flush and the compute dispatch
+ * \param tex   texture whose DCC is retiled; dcc_retile_map_offset,
+ *              dcc_offset and display_dcc_offset must be non-zero and
+ *              fit in 32 bits
+ */
+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	/* Request PS/CS partial flushes plus the cache flushes needed for
+	 * CB metadata and shader coherency, and emit them immediately.
+	 */
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_image_view saved_img[3] = {};
+
+	for (unsigned i = 0; i < 3; i++) {
+		util_copy_image_view(&saved_img[i],
+				     &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+	}
+
+	/* Set images. */
+	bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+	unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+	/* Zero-initialize so that no indeterminate bytes reach the driver. */
+	struct pipe_image_view img[3] = {};
+
+	/* The offsets are stored into 32-bit buffer-view fields below. */
+	assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX);
+	assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX);
+	assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX);
+
+	/* img[0] (retile map) and img[1] (DCC) are read; img[2] (display DCC)
+	 * is the only image that is written.
+	 */
+	for (unsigned i = 0; i < 3; i++) {
+		img[i].resource = &tex->buffer.b.b;
+		img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
+		img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
+	}
+
+	/* Retile map: 16-bit or 32-bit entries, viewed as 4-channel texels. */
+	img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
+				     PIPE_FORMAT_R32G32B32A32_UINT;
+	img[0].u.buf.offset = tex->dcc_retile_map_offset;
+	img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
+
+	/* Source DCC, addressed per byte. */
+	img[1].format = PIPE_FORMAT_R8_UINT;
+	img[1].u.buf.offset = tex->dcc_offset;
+	img[1].u.buf.size = tex->surface.dcc_size;
+
+	/* Destination (displayable) DCC, addressed per byte. */
+	img[2].format = PIPE_FORMAT_R8_UINT;
+	img[2].u.buf.offset = tex->display_dcc_offset;
+	img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
+
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
+
+	/* Bind the compute shader (created lazily on first use). */
+	if (!sctx->cs_dcc_retile)
+		sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
+	ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
+
+	/* Dispatch compute. */
+	/* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
+	unsigned num_threads = num_elements / 4;
+
+	struct pipe_grid_info info = {};
+	info.block[0] = 64;
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+	/* Threads in the partial last block (0 if num_threads is a multiple of 64). */
+	info.last_block[0] = num_threads % 64;
+
+	ctx->launch_grid(ctx, &info);
+
+	/* Don't flush caches or wait. The driver will wait at the end of this IB,
+	 * and L2 will be flushed by the kernel fence.
+	 */
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
+
+	/* util_copy_image_view() took a reference on every saved resource and
+	 * set_shader_images() holds its own, so drop ours to avoid leaking them.
+	 */
+	for (unsigned i = 0; i < 3; i++) {
+		pipe_resource_reference(&saved_img[i].resource, NULL);
+	}
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_pipe_clear_buffer;
author	Marek Olšák <[email protected]>	2019-01-04 19:39:01 -0500
committer	Marek Olšák <[email protected]>	2019-04-04 09:53:24 -0400
commit	1f21396431a03dc4e5a542628d7d8370973c967f (patch)
tree	70cdc799793f64a8ee03987e9c93c1b099f30e8c /src/gallium/drivers/radeonsi/si_compute_blit.c
parent	2c09eb41221eb704e9e7a21654828173158d1a7d (diff)