summaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.c15
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.h45
-rw-r--r--src/gallium/drivers/radeon/r600_texture.c247
-rw-r--r--src/gallium/drivers/radeonsi/si_blit.c15
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c15
-rw-r--r--src/gallium/drivers/radeonsi/si_state_draw.c5
6 files changed, 338 insertions, 4 deletions
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 06388d51735..1a892ab91e3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -472,6 +472,21 @@ bool r600_common_context_init(struct r600_common_context *rctx,
void r600_common_context_cleanup(struct r600_common_context *rctx)
{
+ unsigned i,j;
+
+ /* Release DCC stats. */
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+ assert(!rctx->dcc_stats[i].query_active);
+
+ for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
+ if (rctx->dcc_stats[i].ps_stats[j])
+ rctx->b.destroy_query(&rctx->b,
+ rctx->dcc_stats[i].ps_stats[j]);
+
+ pipe_resource_reference((struct pipe_resource**)
+ &rctx->dcc_stats[i].tex, NULL);
+ }
+
if (rctx->gfx.cs)
rctx->ws->cs_destroy(rctx->gfx.cs);
if (rctx->dma.cs)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index b411b23ba87..ca8ec545c07 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -268,10 +268,30 @@ struct r600_texture {
/* Whether the texture is a displayable back buffer and needs DCC
* decompression, which is expensive. Therefore, it's enabled only
* if statistics suggest that it will pay off and it's allocated
- * separately. Limited to target == 2D and last_level == 0. If enabled,
- * dcc_offset contains the absolute GPUVM address, not the relative one.
+ * separately. It can't be bound as a sampler by apps. Limited to
+ * target == 2D and last_level == 0. If enabled, dcc_offset contains
+ * the absolute GPUVM address, not the relative one.
*/
struct r600_resource *dcc_separate_buffer;
+ /* When DCC is temporarily disabled, the separate buffer is here. */
+ struct r600_resource *last_dcc_separate_buffer;
+ /* We need to track DCC dirtiness, because st/dri usually calls
+ * flush_resource twice per frame (not a bug) and we don't wanna
+ * decompress DCC twice. Also, the dirty tracking must be done even
+ * if DCC isn't used, because it's required by the DCC usage analysis
+ * for a possible future enablement.
+ */
+ bool separate_dcc_dirty;
+ /* Statistics gathering for the DCC enablement heuristic. */
+ bool dcc_gather_statistics;
+ /* Estimate of how much this color buffer is written to in units of
+ * full-screen draws: ps_invocations / (width * height)
+ * Shader kills, late Z, and blending with trivial discards make it
+ * inaccurate (we need to count CB updates, not PS invocations).
+ */
+ unsigned ps_draw_ratio;
+ /* The number of clears since the last DCC usage analysis. */
+ unsigned num_slow_clears;
/* Counter that should be non-zero if the texture is bound to a
* framebuffer. Implemented in radeonsi only.
@@ -544,6 +564,21 @@ struct r600_common_context {
float sample_locations_8x[8][2];
float sample_locations_16x[16][2];
+ /* Statistics gathering for the DCC enablement heuristic. It can't be
+ * in r600_texture because r600_texture can be shared by multiple
+ * contexts. This is for back buffers only. We shouldn't get too many
+ * of those.
+ */
+ struct {
+ struct r600_texture *tex;
+ /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
+ struct pipe_query *ps_stats[3];
+ /* If all slots are used and another slot is needed,
+ * the least recently used slot is evicted based on this. */
+ int64_t last_use_timestamp;
+ bool query_active;
+ } dcc_stats[2];
+
/* The list of all texture buffer objects in this context.
* This list is walked when a buffer is invalidated/reallocated and
* the GPU addresses are updated. */
@@ -703,6 +738,12 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
const struct pipe_surface *templ,
unsigned width, unsigned height);
unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+ struct r600_texture *tex);
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+ struct r600_texture *tex);
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+ struct r600_texture *tex);
void vi_dcc_clear_level(struct r600_common_context *rctx,
struct r600_texture *rtex,
unsigned level, unsigned clear_value);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index c45c5f29fb8..5ed94c27264 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -26,9 +26,11 @@
*/
#include "r600_pipe_common.h"
#include "r600_cs.h"
+#include "r600_query.h"
#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_pack_color.h"
+#include "os/os_time.h"
#include <errno.h>
#include <inttypes.h>
@@ -567,6 +569,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
}
pb_reference(&resource->buf, NULL);
r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
+ r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
FREE(rtex);
}
@@ -1018,6 +1021,13 @@ r600_texture_create_object(struct pipe_screen *screen,
/* Applies to GCN. */
rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
+ /* We want to optimistically enable separate DCC on the first clear
+ * for apps that don't call SwapBuffers or call it too late, so set
+ * a very high number at the beginning. If the PS/draw ratio is too
+ * low, the first DCC decompression will disable DCC.
+ */
+ rtex->ps_draw_ratio = 100;
+
if (rtex->is_depth) {
if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER |
R600_RESOURCE_FLAG_FLUSHED_DEPTH)) &&
@@ -1705,6 +1715,226 @@ unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap)
return ~0U;
}
+/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
+
+/**
+ * Return the per-context slot where DCC statistics queries for the texture live.
+ */
+static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx,
+ struct r600_texture *tex)
+{
+ int i, empty_slot = -1;
+
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+ /* Return if found. */
+ if (rctx->dcc_stats[i].tex == tex) {
+ rctx->dcc_stats[i].last_use_timestamp = os_time_get();
+ return i;
+ }
+
+ /* Record the first seen empty slot. */
+ if (empty_slot == -1 && !rctx->dcc_stats[i].tex)
+ empty_slot = i;
+ }
+
+ /* Not found. Remove the oldest member to make space in the array. */
+ if (empty_slot == -1) {
+ int oldest_slot = 0;
+
+ /* Find the oldest slot. */
+ for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++)
+ if (rctx->dcc_stats[oldest_slot].last_use_timestamp >
+ rctx->dcc_stats[i].last_use_timestamp)
+ oldest_slot = i;
+
+ /* Clean up the oldest slot. */
+ if (rctx->dcc_stats[oldest_slot].query_active)
+ vi_separate_dcc_stop_query(&rctx->b,
+ rctx->dcc_stats[oldest_slot].tex);
+
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[oldest_slot].ps_stats); i++)
+ if (rctx->dcc_stats[oldest_slot].ps_stats[i]) {
+ rctx->b.destroy_query(&rctx->b,
+ rctx->dcc_stats[oldest_slot].ps_stats[i]);
+ rctx->dcc_stats[oldest_slot].ps_stats[i] = NULL;
+ }
+
+ pipe_resource_reference((struct pipe_resource**)
+ &rctx->dcc_stats[oldest_slot].tex, NULL);
+ empty_slot = oldest_slot;
+ }
+
+ /* Add the texture to the new slot. */
+ pipe_resource_reference((struct pipe_resource**)&rctx->dcc_stats[empty_slot].tex,
+ &tex->resource.b.b);
+ rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+ return empty_slot;
+}
+
+static struct pipe_query *
+vi_create_resuming_pipestats_query(struct pipe_context *ctx)
+{
+ struct r600_query_hw *query = (struct r600_query_hw*)
+ ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+
+ query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES;
+ return (struct pipe_query*)query;
+}
+
+/**
+ * Called when binding a color buffer.
+ */
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+ assert(!rctx->dcc_stats[i].query_active);
+
+ if (!rctx->dcc_stats[i].ps_stats[0])
+ rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx);
+
+ /* begin or resume the query */
+ ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+ rctx->dcc_stats[i].query_active = true;
+}
+
+/**
+ * Called when unbinding a color buffer.
+ */
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+ assert(rctx->dcc_stats[i].query_active);
+ assert(rctx->dcc_stats[i].ps_stats[0]);
+
+ /* pause or end the query */
+ ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+ rctx->dcc_stats[i].query_active = false;
+}
+
+static bool vi_should_enable_separate_dcc(struct r600_texture *tex)
+{
+ /* The minimum number of fullscreen draws per frame that is required
+ * to enable DCC. */
+ return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+}
+
+/* Called by fast clear. */
+static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
+ struct r600_texture *tex)
+{
+ /* The intent is to use this with shared displayable back buffers,
+ * but it's not strictly limited only to them.
+ */
+ if (!tex->resource.is_shared ||
+ !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+ tex->resource.b.b.target != PIPE_TEXTURE_2D ||
+ tex->surface.last_level > 0 ||
+ !tex->surface.dcc_size)
+ return;
+
+ if (tex->dcc_offset)
+ return; /* already enabled */
+
+ if (!vi_should_enable_separate_dcc(tex))
+ return; /* stats show that DCC decompression is too expensive */
+
+ assert(tex->surface.level[0].dcc_enabled);
+ assert(!tex->dcc_separate_buffer);
+
+ r600_texture_discard_cmask(rctx->screen, tex);
+
+ /* Get a DCC buffer. */
+ if (tex->last_dcc_separate_buffer) {
+ assert(tex->dcc_gather_statistics);
+ assert(!tex->dcc_separate_buffer);
+ tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+ tex->last_dcc_separate_buffer = NULL;
+ } else {
+ tex->dcc_separate_buffer = (struct r600_resource*)
+ r600_aligned_buffer_create(rctx->b.screen, 0,
+ PIPE_USAGE_DEFAULT,
+ tex->surface.dcc_size,
+ tex->surface.dcc_alignment);
+ if (!tex->dcc_separate_buffer)
+ return;
+
+ /* Enabling for the first time, so start the query. */
+ tex->dcc_gather_statistics = true;
+ vi_separate_dcc_start_query(&rctx->b, tex);
+ }
+
+ /* dcc_offset is the absolute GPUVM address. */
+ tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+ /* no need to flag anything since this is called by fast clear that
+ * flags framebuffer state
+ */
+}
+
+/**
+ * Called by pipe_context::flush_resource, the place where DCC decompression
+ * takes place.
+ */
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+ bool query_active = rctx->dcc_stats[i].query_active;
+ bool disable = false;
+
+ if (rctx->dcc_stats[i].ps_stats[2]) {
+ union pipe_query_result result;
+
+ /* Read the results. */
+ ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2],
+ true, &result);
+ ctx->destroy_query(ctx, rctx->dcc_stats[i].ps_stats[2]);
+ rctx->dcc_stats[i].ps_stats[2] = NULL;
+
+ /* Compute the approximate number of fullscreen draws. */
+ tex->ps_draw_ratio =
+ result.pipeline_statistics.ps_invocations /
+ (tex->resource.b.b.width0 * tex->resource.b.b.height0);
+
+ disable = tex->dcc_separate_buffer &&
+ !vi_should_enable_separate_dcc(tex);
+ }
+
+ tex->num_slow_clears = 0;
+
+ /* stop the statistics query for ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_stop_query(ctx, tex);
+
+ /* Move the queries in the queue by one. */
+ rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1];
+ rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0];
+ rctx->dcc_stats[i].ps_stats[0] = NULL;
+
+ /* create and start a new query as ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_start_query(ctx, tex);
+
+ if (disable) {
+ assert(!tex->last_dcc_separate_buffer);
+ tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+ tex->dcc_separate_buffer = NULL;
+ tex->dcc_offset = 0;
+ /* no need to flag anything since this is called after
+ * decompression that re-sets framebuffer state
+ */
+ }
+}
+
+/* FAST COLOR CLEAR */
+
static void evergreen_set_clear_color(struct r600_texture *rtex,
enum pipe_format surface_format,
const union pipe_color_union *color)
@@ -1966,6 +2196,22 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
continue;
}
+ /* Fast clear is the most appropriate place to enable DCC for
+ * displayable surfaces.
+ */
+ if (rctx->chip_class >= VI) {
+ vi_separate_dcc_try_enable(rctx, tex);
+
+ /* Stoney can't do a CMASK-based clear, so all clears are
+ * considered to be hypothetically slow clears, which
+ * is weighed when determining to enable separate DCC.
+ */
+ if (tex->dcc_gather_statistics &&
+ rctx->family == CHIP_STONEY)
+ tex->num_slow_clears++;
+ }
+
+ /* Try to clear DCC first, otherwise try CMASK. */
if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
uint32_t reset_value;
bool clear_words_needed;
@@ -1982,6 +2228,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
if (clear_words_needed)
tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+ tex->separate_dcc_dirty = true;
} else {
/* Stoney/RB+ doesn't work with CMASK fast clear. */
if (rctx->family == CHIP_STONEY)
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 46daeac4ac0..a48c5bc19e1 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1055,10 +1055,23 @@ static void si_flush_resource(struct pipe_context *ctx,
struct r600_texture *rtex = (struct r600_texture*)res;
assert(res->target != PIPE_BUFFER);
+ assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics);
+
+ /* st/dri calls flush twice per frame (not a bug), this prevents double
+ * decompression. */
+ if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty)
+ return;
if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) {
si_blit_decompress_color(ctx, rtex, 0, res->last_level,
- 0, util_max_layer(res, 0), false);
+ 0, util_max_layer(res, 0),
+ rtex->dcc_separate_buffer != NULL);
+ }
+
+ /* Always do the analysis even if DCC is disabled at the moment. */
+ if (rtex->dcc_gather_statistics && rtex->separate_dcc_dirty) {
+ rtex->separate_dcc_dirty = false;
+ vi_separate_dcc_process_and_reset_stats(ctx, rtex);
}
}
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 5aed352f55c..26831faa107 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2236,6 +2236,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
unsigned old_nr_samples = sctx->framebuffer.nr_samples;
int i;
+ for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (!sctx->framebuffer.state.cbufs[i])
+ continue;
+
+ rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+ if (rtex->dcc_gather_statistics)
+ vi_separate_dcc_stop_query(ctx, rtex);
+ }
+
/* Only flush TC when changing the framebuffer state, because
* the only client not using TC that can change textures is
* the framebuffer.
@@ -2307,6 +2316,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
r600_context_add_resource_size(ctx, surf->base.texture);
p_atomic_inc(&rtex->framebuffers_bound);
+
+ if (rtex->dcc_gather_statistics) {
+ /* Dirty tracking must be enabled for DCC usage analysis. */
+ sctx->framebuffer.compressed_cb_mask |= 1 << i;
+ vi_separate_dcc_start_query(ctx, rtex);
+ }
}
/* Set the second SPI format for possible dual-src blending. */
if (i == 1 && surf) {
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ce8def46659..a596bd81381 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1042,7 +1042,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
surf = sctx->framebuffer.state.cbufs[i];
rtex = (struct r600_texture*)surf->texture;
- rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+ if (rtex->fmask.size)
+ rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+ if (rtex->dcc_gather_statistics)
+ rtex->separate_dcc_dirty = true;
} while (mask);
}