/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};
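
/* For reference, the resulting layout of one entry (derived from the struct
 * above, 256 bytes total):
 *
 *   bytes   0..127 : stream[0..3], 32 bytes each (two dummy "start" qwords,
 *                    then generated_primitives and emitted_primitives)
 *   bytes 128..131 : fence
 *   bytes 132..255 : padding, so that consecutive entries stay
 *                    cacheline-aligned
 */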

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
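   /* Each entry is 32 qwords when viewed through the uint64_t mapping below:
    * qwords 0..15 hold the four counters of the four streams and get bit 63
    * set, and qword 16 overlaps the fence dword (plus the first pad dword)
    * and is cleared so the fence starts out unsignaled.
    */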
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
        i < e; ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers,
                                  struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers,
                                 struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                        EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT,
                        query->last->buf, fence_va, 0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer.
       */
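      /* (gfx10_alloc_query_buffer early-outs while this atom is still marked
       * dirty, so clearing it here is what makes the next begin go through
       * the full setup path again.)
       */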
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   /* Strip the high bit that was pre-set for SET_PREDICATION compatibility. */
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
                                      bool wait, union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
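   /* These feed the result compute shader (gfx10_create_sh_query_result_cs).
    * Roughly: the low bits of config select the mode (sum counters, check
    * availability only, or evaluate the overflow predicates) and bit 3
    * requests a 64-bit result; offset selects the counter/stream to read;
    * chain marks whether a previous partial result exists and whether a
    * later buffer follows; result_count is the number of entries to process.
    */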
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
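         /* The fence dword is written as 0xffffffff (see the release_mem in
          * gfx10_sh_query_end), so testing a single bit is enough here.
          */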
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}