diff options
author | Marek Olšák <[email protected]> | 2020-04-26 08:38:54 -0400 |
---|---|---|
committer | Marge Bot <[email protected]> | 2020-04-30 22:27:31 +0000 |
commit | d6acdbd9352bd4175191069139fd5f54cf2cc95f (patch) | |
tree | f4a2da73c164b8ea3c1c253fa49fb8372205378d | |
parent | d3da73954a639f8e43e6d22ac3f16a786d5e37cb (diff) |
radeonsi: implement and use compute-based DCC decompression on gfx9-10
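DCC_DECOMPRESS doesn't work on gfx9-10. Instead of trying to figure out why,
use a compute blit of the texture onto itself where the load is compressed
and the store is uncompressed, then clear the DCC metadata to the
uncompressed value. GFX8 keeps using the existing DCC_DECOMPRESS path.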
Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4761>
-rw-r--r-- | src/amd/common/ac_surface.c | 3 |
-rw-r--r-- | src/amd/common/ac_surface.h | 4 |
-rw-r--r-- | src/gallium/drivers/radeonsi/si_blit.c | 29 |
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute_blit.c | 60 |
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 2 |
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 5 |
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 39 |
7 files changed, 128 insertions, 14 deletions
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 50828e47e16..ef266922564 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -1269,6 +1269,9 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib, surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned; surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned; + surf->u.gfx9.dcc_block_width = dout.compressBlkWidth; + surf->u.gfx9.dcc_block_height = dout.compressBlkHeight; + surf->u.gfx9.dcc_block_depth = dout.compressBlkDepth; surf->dcc_size = dout.dccRamSize; surf->dcc_alignment = dout.dccRamBaseAlign; surf->num_dcc_levels = in->numMipLevels; diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h index 8bdafa295ef..bd90d958ea2 100644 --- a/src/amd/common/ac_surface.h +++ b/src/amd/common/ac_surface.h @@ -167,6 +167,10 @@ struct gfx9_surf_layout { uint64_t stencil_offset; /* separate stencil */ + uint8_t dcc_block_width; + uint8_t dcc_block_height; + uint8_t dcc_block_depth; + /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0. * The 3D engine doesn't support that layout except for chips with 1 RB. * All other chips must set rb_aligned=1. 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index ab69c7e4ddd..057cdc6ce31 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -419,6 +419,7 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture first_level, last_level, level_mask); if (need_dcc_decompress) { + assert(sctx->chip_class == GFX8); custom_blend = sctx->custom_blend_dcc_decompress; assert(tex->surface.dcc_offset); @@ -834,7 +835,8 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst !sdst->surface.dcc_offset && !(dst->target != src->target && (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { - si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); + si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, + src_box, false); return; } @@ -1226,8 +1228,29 @@ void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex) if (!tex->surface.dcc_offset || !sctx->has_graphics) return; - si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, - util_max_layer(&tex->buffer.b.b, 0), true, false); + if (sctx->chip_class == GFX8) { + si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, + util_max_layer(&tex->buffer.b.b, 0), true, false); + } else { + struct pipe_resource *ptex = &tex->buffer.b.b; + + /* DCC decompression using a compute shader. */ + for (unsigned level = 0; level < tex->surface.num_dcc_levels; level++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, u_minify(ptex->width0, level), + u_minify(ptex->height0, level), + util_num_layers(ptex, level), &box); + si_compute_copy_image(sctx, ptex, level, ptex, level, 0, 0, 0, &box, + true); + } + + /* Now clear DCC metadata to uncompressed. 
*/ + uint32_t clear_value = DCC_UNCOMPRESSED; + si_clear_buffer(sctx, ptex, tex->surface.dcc_offset, + tex->surface.dcc_size, &clear_value, 4, + SI_COHERENCY_CB_META, false); + } } void si_init_blit_functions(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index a4784fd477f..a754dc8bb48 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -376,7 +376,8 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned dstx, - unsigned dsty, unsigned dstz, const struct pipe_box *src_box) + unsigned dsty, unsigned dstz, const struct pipe_box *src_box, + bool is_dcc_decompress) { struct pipe_context *ctx = &sctx->b; unsigned width = src_box->width; @@ -396,7 +397,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u * we must keep the original values to get the correct results. 
*/ } - unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0}; if (width == 0 || height == 0) return; @@ -413,7 +413,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned); struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; struct pipe_image_view saved_image[2] = {0}; @@ -422,10 +421,16 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u void *saved_cs = sctx->cs_shader_state.program; - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + if (!is_dcc_decompress) { + unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0}; + + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + } struct pipe_image_view image[2] = {0}; image[0].resource = src; @@ -454,11 +459,44 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format); } + if (is_dcc_decompress) + image[1].access |= SI_IMAGE_ACCESS_DCC_OFF; + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image); struct pipe_grid_info info = {0}; - if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { + if (is_dcc_decompress) { + /* The DCC decompression is a normal blit where the load is compressed + * and the store is uncompressed. The workgroup size is either equal to + * the DCC block size or a multiple thereof. The shader uses a barrier + * between loads and stores to safely overwrite each DCC block of pixels. 
+ */ + struct si_texture *tex = (struct si_texture*)src; + unsigned dim[3] = {src_box->width, src_box->height, src_box->depth}; + + assert(src == dst); + assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY); + + if (!sctx->cs_dcc_decompress) + sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx); + ctx->bind_compute_state(ctx, sctx->cs_dcc_decompress); + + info.block[0] = tex->surface.u.gfx9.dcc_block_width; + info.block[1] = tex->surface.u.gfx9.dcc_block_height; + info.block[2] = tex->surface.u.gfx9.dcc_block_depth; + + /* Make sure the block size is at least the same as wave size. */ + while (info.block[0] * info.block[1] * info.block[2] < + sctx->screen->compute_wave_size) { + info.block[0] *= 2; + } + + for (unsigned i = 0; i < 3; i++) { + info.last_block[i] = dim[i] % info.block[i]; + info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]); + } + } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { if (!sctx->cs_copy_image_1d_array) sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx); ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array); @@ -487,10 +525,12 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); for (int i = 0; i < 2; i++) pipe_resource_reference(&saved_image[i].resource, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + if (!is_dcc_decompress) { + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + pipe_resource_reference(&saved_cb.buffer, NULL); + } } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index cd38131229a..86511428376 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ 
b/src/gallium/drivers/radeonsi/si_pipe.c @@ -235,6 +235,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); if (sctx->cs_clear_12bytes_buffer) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); + if (sctx->cs_dcc_decompress) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress); if (sctx->cs_dcc_retile) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9f777f40b5c..e4104cf8d78 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -927,6 +927,7 @@ struct si_context { void *cs_clear_render_target; void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; + void *cs_dcc_decompress; void *cs_dcc_retile; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; @@ -1316,7 +1317,8 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p uint64_t dst_offset, uint64_t src_offset, unsigned size); void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned dstx, - unsigned dsty, unsigned dstz, const struct pipe_box *src_box); + unsigned dsty, unsigned dstz, const struct pipe_box *src_box, + bool is_dcc_decompress); void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, const union pipe_color_union *color, unsigned dstx, unsigned dsty, unsigned width, unsigned height, @@ -1455,6 +1457,7 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords bool dst_stream_cache_policy, bool is_copy); void *si_create_copy_image_compute_shader(struct pipe_context *ctx); void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); +void *si_create_dcc_decompress_cs(struct 
pipe_context *ctx); void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index e5fd089b59f..d1a97c210b0 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -573,6 +573,45 @@ void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx) return ctx->create_compute_state(ctx, &state); } +/* Create a compute shader implementing DCC decompression via a blit. + * This is a trivial copy_image shader except that it has a variable block + * size and a barrier. + */ +void *si_create_dcc_decompress_cs(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL SV[2], BLOCK_SIZE\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL TEMP[0..1]\n" + + "UMAD TEMP[0].xyz, SV[1].xyzz, SV[2].xyzz, SV[0].xyzz\n" + "LOAD TEMP[1], IMAGE[0], TEMP[0].xyzz, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + /* Wait for the whole threadgroup (= DCC block) to load texels before + * overwriting them, because overwriting any pixel within a DCC block + * can break compression for the whole block. + */ + "BARRIER\n" + "STORE IMAGE[1], TEMP[0].xyzz, TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + void *si_clear_render_target_shader(struct pipe_context *ctx) { static const char text[] = |