/* * Copyright 2013-2017 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include "util/os_time.h" #include "util/u_memory.h" #include "util/u_queue.h" #include "util/u_upload_mgr.h" #include "si_pipe.h" #include "radeon/r600_cs.h" struct si_fine_fence { struct r600_resource *buf; unsigned offset; }; struct si_multi_fence { struct pipe_reference reference; struct pipe_fence_handle *gfx; struct pipe_fence_handle *sdma; struct tc_unflushed_batch_token *tc_token; struct util_queue_fence ready; /* If the context wasn't flushed at fence creation, this is non-NULL. */ struct { struct r600_common_context *ctx; unsigned ib_index; } gfx_unflushed; struct si_fine_fence fine; }; static void si_add_fence_dependency(struct r600_common_context *rctx, struct pipe_fence_handle *fence) { struct radeon_winsys *ws = rctx->ws; if (rctx->dma.cs) ws->cs_add_fence_dependency(rctx->dma.cs, fence); ws->cs_add_fence_dependency(rctx->gfx.cs, fence); } static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **dst, struct pipe_fence_handle *src) { struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; struct si_multi_fence **rdst = (struct si_multi_fence **)dst; struct si_multi_fence *rsrc = (struct si_multi_fence *)src; if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { ws->fence_reference(&(*rdst)->gfx, NULL); ws->fence_reference(&(*rdst)->sdma, NULL); tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL); r600_resource_reference(&(*rdst)->fine.buf, NULL); FREE(*rdst); } *rdst = rsrc; } static struct si_multi_fence *si_create_multi_fence() { struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); if (!fence) return NULL; pipe_reference_init(&fence->reference, 1); util_queue_fence_init(&fence->ready); return fence; } struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, struct tc_unflushed_batch_token *tc_token) { struct si_multi_fence *fence = si_create_multi_fence(); if (!fence) return NULL; util_queue_fence_reset(&fence->ready); tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); return (struct pipe_fence_handle *)fence; } static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct si_multi_fence *rfence = (struct si_multi_fence *)fence; util_queue_fence_wait(&rfence->ready); /* Unflushed fences from the same context are no-ops. */ if (rfence->gfx_unflushed.ctx && rfence->gfx_unflushed.ctx == rctx) return; /* All unflushed commands will not start execution before * this fence dependency is signalled. * * Should we flush the context to allow more GPU parallelism? */ if (rfence->sdma) si_add_fence_dependency(rctx, rfence->sdma); if (rfence->gfx) si_add_fence_dependency(rctx, rfence->gfx); } static bool si_fine_fence_signaled(struct radeon_winsys *rws, const struct si_fine_fence *fine) { char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED); if (!map) return false; uint32_t *fence = (uint32_t*)(map + fine->offset); return *fence != 0; } static void si_fine_fence_set(struct si_context *ctx, struct si_fine_fence *fine, unsigned flags) { uint32_t *fence_ptr; assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); /* Use uncached system memory for the fence. */ u_upload_alloc(ctx->b.b.stream_uploader, 0, 4, 4, &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr); if (!fine->buf) return; *fence_ptr = 0; uint64_t fence_va = fine->buf->gpu_address + fine->offset; radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, fine->buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); if (flags & PIPE_FLUSH_TOP_OF_PIPE) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); radeon_emit(cs, fence_va); radeon_emit(cs, fence_va >> 32); radeon_emit(cs, 0x80000000); } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { si_gfx_write_event_eop(&ctx->b, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DATA_SEL_VALUE_32BIT, NULL, fence_va, 0x80000000, PIPE_QUERY_GPU_FINISHED); } else { assert(false); } } static boolean si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx, struct pipe_fence_handle *fence, uint64_t timeout) { struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; struct si_multi_fence *rfence = (struct si_multi_fence *)fence; struct r600_common_context *rctx; int64_t abs_timeout = os_time_get_absolute_timeout(timeout); ctx = threaded_context_unwrap_sync(ctx); rctx = ctx ? (struct r600_common_context*)ctx : NULL; if (!util_queue_fence_is_signalled(&rfence->ready)) { if (!timeout) return false; if (rfence->tc_token) { /* Ensure that si_flush_from_st will be called for * this fence, but only if we're in the API thread * where the context is current. * * Note that the batch containing the flush may already * be in flight in the driver thread, so the fence * may not be ready yet when this call returns. */ threaded_context_flush(ctx, rfence->tc_token); } if (timeout == PIPE_TIMEOUT_INFINITE) { util_queue_fence_wait(&rfence->ready); } else { if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout)) return false; } } if (rfence->sdma) { if (!rws->fence_wait(rws, rfence->sdma, timeout)) return false; /* Recompute the timeout after waiting. */ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { int64_t time = os_time_get_nano(); timeout = abs_timeout > time ? abs_timeout - time : 0; } } if (!rfence->gfx) return true; if (rfence->fine.buf && si_fine_fence_signaled(rws, &rfence->fine)) { rws->fence_reference(&rfence->gfx, NULL); r600_resource_reference(&rfence->fine.buf, NULL); return true; } /* Flush the gfx IB if it hasn't been flushed yet. */ if (rctx && rfence->gfx_unflushed.ctx == rctx && rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) { /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) * spec says: * * "If the sync object being blocked upon will not be * signaled in finite time (for example, by an associated * fence command issued previously, but not yet flushed to * the graphics pipeline), then ClientWaitSync may hang * forever. To help prevent this behavior, if * ClientWaitSync is called and all of the following are * true: * * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags, * * sync is unsignaled when ClientWaitSync is called, * * and the calls to ClientWaitSync and FenceSync were * issued from the same context, * * then the GL will behave as if the equivalent of Flush * were inserted immediately after the creation of sync." * * This means we need to flush for such fences even when we're * not going to wait. */ rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL); rfence->gfx_unflushed.ctx = NULL; if (!timeout) return false; /* Recompute the timeout after all that. */ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { int64_t time = os_time_get_nano(); timeout = abs_timeout > time ? abs_timeout - time : 0; } } if (rws->fence_wait(rws, rfence->gfx, timeout)) return true; /* Re-check in case the GPU is slow or hangs, but the commands before * the fine-grained fence have completed. */ if (rfence->fine.buf && si_fine_fence_signaled(rws, &rfence->fine)) return true; return false; } static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handle **pfence, int fd) { struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; struct radeon_winsys *ws = rscreen->ws; struct si_multi_fence *rfence; *pfence = NULL; if (!rscreen->info.has_sync_file) return; rfence = si_create_multi_fence(); if (!rfence) return; rfence->gfx = ws->fence_import_sync_file(ws, fd); if (!rfence->gfx) { FREE(rfence); return; } *pfence = (struct pipe_fence_handle*)rfence; } static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *fence) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct radeon_winsys *ws = rscreen->ws; struct si_multi_fence *rfence = (struct si_multi_fence *)fence; int gfx_fd = -1, sdma_fd = -1; if (!rscreen->info.has_sync_file) return -1; util_queue_fence_wait(&rfence->ready); /* Deferred fences aren't supported. */ assert(!rfence->gfx_unflushed.ctx); if (rfence->gfx_unflushed.ctx) return -1; if (rfence->sdma) { sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma); if (sdma_fd == -1) return -1; } if (rfence->gfx) { gfx_fd = ws->fence_export_sync_file(ws, rfence->gfx); if (gfx_fd == -1) { if (sdma_fd != -1) close(sdma_fd); return -1; } } /* If we don't have FDs at this point, it means we don't have fences * either. */ if (sdma_fd == -1) return gfx_fd; if (gfx_fd == -1) return sdma_fd; /* Get a fence that will be a combination of both fences. */ sync_accumulate("radeonsi", &gfx_fd, sdma_fd); close(sdma_fd); return gfx_fd; } static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, unsigned flags) { struct pipe_screen *screen = ctx->screen; struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct radeon_winsys *ws = rctx->ws; struct pipe_fence_handle *gfx_fence = NULL; struct pipe_fence_handle *sdma_fence = NULL; bool deferred_fence = false; struct si_fine_fence fine = {}; unsigned rflags = RADEON_FLUSH_ASYNC; if (flags & PIPE_FLUSH_END_OF_FRAME) rflags |= RADEON_FLUSH_END_OF_FRAME; if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { assert(flags & PIPE_FLUSH_DEFERRED); assert(fence); si_fine_fence_set((struct si_context *)rctx, &fine, flags); } /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ if (rctx->dma.cs) rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) { if (fence) ws->fence_reference(&gfx_fence, rctx->last_gfx_fence); if (!(flags & PIPE_FLUSH_DEFERRED)) ws->cs_sync_flush(rctx->gfx.cs); } else { /* Instead of flushing, create a deferred fence. Constraints: * - The state tracker must allow a deferred flush. * - The state tracker must request a fence. * - fence_get_fd is not allowed. * Thread safety in fence_finish must be ensured by the state tracker. */ if (flags & PIPE_FLUSH_DEFERRED && !(flags & PIPE_FLUSH_FENCE_FD) && fence) { gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs); deferred_fence = true; } else { rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL); } } /* Both engines can signal out of order, so we need to keep both fences. */ if (fence) { struct si_multi_fence *multi_fence; if (flags & TC_FLUSH_ASYNC) { multi_fence = (struct si_multi_fence *)*fence; assert(multi_fence); } else { multi_fence = si_create_multi_fence(); if (!multi_fence) { ws->fence_reference(&sdma_fence, NULL); ws->fence_reference(&gfx_fence, NULL); goto finish; } screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle*)multi_fence; } /* If both fences are NULL, fence_finish will always return true. */ multi_fence->gfx = gfx_fence; multi_fence->sdma = sdma_fence; if (deferred_fence) { multi_fence->gfx_unflushed.ctx = rctx; multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes; } multi_fence->fine = fine; if (flags & TC_FLUSH_ASYNC) { util_queue_fence_signal(&multi_fence->ready); tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); } } finish: if (!(flags & PIPE_FLUSH_DEFERRED)) { if (rctx->dma.cs) ws->cs_sync_flush(rctx->dma.cs); ws->cs_sync_flush(rctx->gfx.cs); } } void si_init_fence_functions(struct si_context *ctx) { ctx->b.b.flush = si_flush_from_st; ctx->b.b.create_fence_fd = si_create_fence_fd; ctx->b.b.fence_server_sync = si_fence_server_sync; } void si_init_screen_fence_functions(struct si_screen *screen) { screen->b.b.fence_finish = si_fence_finish; screen->b.b.fence_reference = si_fence_reference; screen->b.b.fence_get_fd = si_fence_get_fd; }