diff options
Diffstat (limited to 'src/gallium')
-rw-r--r-- | src/gallium/drivers/radeon/r600_pipe_common.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/radeon/r600_pipe_common.h | 45 | ||||
-rw-r--r-- | src/gallium/drivers/radeon/r600_texture.c | 247 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_blit.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 5 |
6 files changed, 338 insertions, 4 deletions
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 06388d51735..1a892ab91e3 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -472,6 +472,21 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { + unsigned i,j; + + /* Release DCC stats. */ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) { + assert(!rctx->dcc_stats[i].query_active); + + for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++) + if (rctx->dcc_stats[i].ps_stats[j]) + rctx->b.destroy_query(&rctx->b, + rctx->dcc_stats[i].ps_stats[j]); + + pipe_resource_reference((struct pipe_resource**) + &rctx->dcc_stats[i].tex, NULL); + } + if (rctx->gfx.cs) rctx->ws->cs_destroy(rctx->gfx.cs); if (rctx->dma.cs) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index b411b23ba87..ca8ec545c07 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -268,10 +268,30 @@ struct r600_texture { /* Whether the texture is a displayable back buffer and needs DCC * decompression, which is expensive. Therefore, it's enabled only * if statistics suggest that it will pay off and it's allocated - * separately. Limited to target == 2D and last_level == 0. If enabled, - * dcc_offset contains the absolute GPUVM address, not the relative one. + * separately. It can't be bound as a sampler by apps. Limited to + * target == 2D and last_level == 0. If enabled, dcc_offset contains + * the absolute GPUVM address, not the relative one. */ struct r600_resource *dcc_separate_buffer; + /* When DCC is temporarily disabled, the separate buffer is here. */ + struct r600_resource *last_dcc_separate_buffer; + /* We need to track DCC dirtiness, because st/dri usually calls + * flush_resource twice per frame (not a bug) and we don't wanna + * decompress DCC twice. Also, the dirty tracking must be done even + * if DCC isn't used, because it's required by the DCC usage analysis + * for a possible future enablement. + */ + bool separate_dcc_dirty; + /* Statistics gathering for the DCC enablement heuristic. */ + bool dcc_gather_statistics; + /* Estimate of how much this color buffer is written to in units of + * full-screen draws: ps_invocations / (width * height) + * Shader kills, late Z, and blending with trivial discards make it + * inaccurate (we need to count CB updates, not PS invocations). + */ + unsigned ps_draw_ratio; + /* The number of clears since the last DCC usage analysis. */ + unsigned num_slow_clears; /* Counter that should be non-zero if the texture is bound to a * framebuffer. Implemented in radeonsi only. @@ -544,6 +564,21 @@ struct r600_common_context { float sample_locations_8x[8][2]; float sample_locations_16x[16][2]; + /* Statistics gathering for the DCC enablement heuristic. It can't be + * in r600_texture because r600_texture can be shared by multiple + * contexts. This is for back buffers only. We shouldn't get too many + * of those. + */ + struct { + struct r600_texture *tex; + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ + struct pipe_query *ps_stats[3]; + /* If all slots are used and another slot is needed, + * the least recently used slot is evicted based on this. */ + int64_t last_use_timestamp; + bool query_active; + } dcc_stats[2]; + /* The list of all texture buffer objects in this context. * This list is walked when a buffer is invalidated/reallocated and * the GPU addresses are updated. */ @@ -703,6 +738,12 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, const struct pipe_surface *templ, unsigned width, unsigned height); unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap); +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex); void vi_dcc_clear_level(struct r600_common_context *rctx, struct r600_texture *rtex, unsigned level, unsigned clear_value); diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index c45c5f29fb8..5ed94c27264 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -26,9 +26,11 @@ */ #include "r600_pipe_common.h" #include "r600_cs.h" +#include "r600_query.h" #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_pack_color.h" +#include "os/os_time.h" #include <errno.h> #include <inttypes.h> @@ -567,6 +569,7 @@ static void r600_texture_destroy(struct pipe_screen *screen, } pb_reference(&resource->buf, NULL); r600_resource_reference(&rtex->dcc_separate_buffer, NULL); + r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL); FREE(rtex); } @@ -1018,6 +1021,13 @@ r600_texture_create_object(struct pipe_screen *screen, /* Applies to GCN. */ rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode; + /* We want to optimistically enable separate DCC on the first clear + * for apps that don't call SwapBuffers or call it too late, so set + * a very high number at the beginning. If the PS/draw ratio is too + * low, the first DCC decompression will disable DCC. + */ + rtex->ps_draw_ratio = 100; + if (rtex->is_depth) { if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER | R600_RESOURCE_FLAG_FLUSHED_DEPTH)) && @@ -1705,6 +1715,226 @@ unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap) return ~0U; } +/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ + +/** + * Return the per-context slot where DCC statistics queries for the texture live. + */ +static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + int i, empty_slot = -1; + + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) { + /* Return if found. */ + if (rctx->dcc_stats[i].tex == tex) { + rctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !rctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++) + if (rctx->dcc_stats[oldest_slot].last_use_timestamp > + rctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + if (rctx->dcc_stats[oldest_slot].query_active) + vi_separate_dcc_stop_query(&rctx->b, + rctx->dcc_stats[oldest_slot].tex); + + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[oldest_slot].ps_stats); i++) + if (rctx->dcc_stats[oldest_slot].ps_stats[i]) { + rctx->b.destroy_query(&rctx->b, + rctx->dcc_stats[oldest_slot].ps_stats[i]); + rctx->dcc_stats[oldest_slot].ps_stats[i] = NULL; + } + + pipe_resource_reference((struct pipe_resource**) + &rctx->dcc_stats[oldest_slot].tex, NULL); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + pipe_resource_reference((struct pipe_resource**)&rctx->dcc_stats[empty_slot].tex, + &tex->resource.b.b); + rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; +} + +static struct pipe_query * +vi_create_resuming_pipestats_query(struct pipe_context *ctx) +{ + struct r600_query_hw *query = (struct r600_query_hw*) + ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0); + + query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query*)query; +} + +/** + * Called when binding a color buffer. + */ +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(!rctx->dcc_stats[i].query_active); + + if (!rctx->dcc_stats[i].ps_stats[0]) + rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx); + + /* begin or resume the query */ + ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = true; +} + +/** + * Called when unbinding a color buffer. + */ +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(rctx->dcc_stats[i].query_active); + assert(rctx->dcc_stats[i].ps_stats[0]); + + /* pause or end the query */ + ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = false; +} + +static bool vi_should_enable_separate_dcc(struct r600_texture *tex) +{ + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; +} + +/* Called by fast clear. */ +static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->resource.is_shared || + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->resource.b.b.target != PIPE_TEXTURE_2D || + tex->surface.last_level > 0 || + !tex->surface.dcc_size) + return; + + if (tex->dcc_offset) + return; /* already enabled */ + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.level[0].dcc_enabled); + assert(!tex->dcc_separate_buffer); + + r600_texture_discard_cmask(rctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = (struct r600_resource*) + r600_aligned_buffer_create(rctx->b.screen, 0, + PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, + tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + + /* Enabling for the first time, so start the query. */ + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(&rctx->b, tex); + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ +} + +/** + * Called by pipe_context::flush_resource, the place where DCC decompression + * takes place. + */ +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + bool query_active = rctx->dcc_stats[i].query_active; + bool disable = false; + + if (rctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2], + true, &result); + ctx->destroy_query(ctx, rctx->dcc_stats[i].ps_stats[2]); + rctx->dcc_stats[i].ps_stats[2] = NULL; + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = + result.pipeline_statistics.ps_invocations / + (tex->resource.b.b.width0 * tex->resource.b.b.height0); + + disable = tex->dcc_separate_buffer && + !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(ctx, tex); + + /* Move the queries in the queue by one. */ + rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1]; + rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0]; + rctx->dcc_stats[i].ps_stats[0] = NULL; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(ctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } +} + +/* FAST COLOR CLEAR */ + static void evergreen_set_clear_color(struct r600_texture *rtex, enum pipe_format surface_format, const union pipe_color_union *color) @@ -1966,6 +2196,22 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } + /* Fast clear is the most appropriate place to enable DCC for + * displayable surfaces. + */ + if (rctx->chip_class >= VI) { + vi_separate_dcc_try_enable(rctx, tex); + + /* Stoney can't do a CMASK-based clear, so all clears are + * considered to be hypothetically slow clears, which + * is weighed when determining to enable separate DCC. + */ + if (tex->dcc_gather_statistics && + rctx->family == CHIP_STONEY) + tex->num_slow_clears++; + } + + /* Try to clear DCC first, otherwise try CMASK. */ if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) { uint32_t reset_value; bool clear_words_needed; @@ -1982,6 +2228,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, if (clear_words_needed) tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + tex->separate_dcc_dirty = true; } else { /* Stoney/RB+ doesn't work with CMASK fast clear. */ if (rctx->family == CHIP_STONEY) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 46daeac4ac0..a48c5bc19e1 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -1055,10 +1055,23 @@ static void si_flush_resource(struct pipe_context *ctx, struct r600_texture *rtex = (struct r600_texture*)res; assert(res->target != PIPE_BUFFER); + assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics); + + /* st/dri calls flush twice per frame (not a bug), this prevents double + * decompression. */ + if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty) + return; if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) { si_blit_decompress_color(ctx, rtex, 0, res->last_level, - 0, util_max_layer(res, 0), false); + 0, util_max_layer(res, 0), + rtex->dcc_separate_buffer != NULL); + } + + /* Always do the analysis even if DCC is disabled at the moment. */ + if (rtex->dcc_gather_statistics && rtex->separate_dcc_dirty) { + rtex->separate_dcc_dirty = false; + vi_separate_dcc_process_and_reset_stats(ctx, rtex); } } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 5aed352f55c..26831faa107 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2236,6 +2236,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, unsigned old_nr_samples = sctx->framebuffer.nr_samples; int i; + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture; + if (rtex->dcc_gather_statistics) + vi_separate_dcc_stop_query(ctx, rtex); + } + /* Only flush TC when changing the framebuffer state, because * the only client not using TC that can change textures is * the framebuffer. @@ -2307,6 +2316,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, r600_context_add_resource_size(ctx, surf->base.texture); p_atomic_inc(&rtex->framebuffers_bound); + + if (rtex->dcc_gather_statistics) { + /* Dirty tracking must be enabled for DCC usage analysis. */ + sctx->framebuffer.compressed_cb_mask |= 1 << i; + vi_separate_dcc_start_query(ctx, rtex); + } } /* Set the second SPI format for possible dual-src blending. */ if (i == 1 && surf) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ce8def46659..a596bd81381 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1042,7 +1042,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) surf = sctx->framebuffer.state.cbufs[i]; rtex = (struct r600_texture*)surf->texture; - rtex->dirty_level_mask |= 1 << surf->u.tex.level; + if (rtex->fmask.size) + rtex->dirty_level_mask |= 1 << surf->u.tex.level; + if (rtex->dcc_gather_statistics) + rtex->separate_dcc_dirty = true; } while (mask); } |