author | Marek Olšák <[email protected]> | 2017-03-02 01:15:55 +0100 |
committer | Marek Olšák <[email protected]> | 2017-05-15 13:01:33 +0200 |
commit | b8e552424eed58d95671da3191c7199cf171b3f0 (patch) | |
tree | 04b3030072ef8285580f2b31162d0e45a0c92dd1 /src/gallium/auxiliary/util/u_threaded_context.h | |
parent | dca19b1d427f0ecbc0bbd530d1fc3f6c0ce2b5c1 (diff) |
gallium/util: add threaded_context as a pipe_context wrapper
v2: - rename num_calls -> num_call_slots (for tc_call)
- rename num_calls -> num_total_call_slots (for tc_batch)
- rename num_offloaded/direct_calls -> num_offloaded/direct_slots
- declare slot[0] instead of slot[1]
- remove no-op leftover code from tc_draw_vbo
- use tc_set_resource_reference to fill threaded_transfer
- fix map flags for sparse buffers
- cosmetic changes
Reviewed-by: Nicolai Hähnle <[email protected]>
Tested-by: Dieter Nützel <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/util/u_threaded_context.h')
-rw-r--r-- | src/gallium/auxiliary/util/u_threaded_context.h | 349 |
1 file changed, 349 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
new file mode 100644
index 00000000000..ea58d4ca0cf
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -0,0 +1,349 @@
+/**************************************************************************
+ *
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* This is a wrapper for pipe_context that executes all pipe_context calls
+ * in another thread.
+ *
+ *
+ * Guidelines for adopters and deviations from Gallium
+ * ---------------------------------------------------
+ *
+ * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
+ *    driver functions that take a context (fence_finish, texture_get_handle)
+ *    should manually unwrap pipe_context by doing:
+ *      pipe = threaded_context_unwrap_sync(pipe);
+ *
+ *    pipe_context::priv is used to unwrap the context, so drivers and state
+ *    trackers shouldn't use it.
+ *
+ *    No other objects are wrapped.
+ *
+ * 2) Drivers must subclass and initialize these structures:
+ *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
+ *    - threaded_query for pipe_query (zero memory)
+ *    - threaded_transfer for pipe_transfer (zero memory)
+ *
+ * 3) The threaded context must not be enabled for contexts that can use video
+ *    codecs.
+ *
+ * 4) Changes in driver behavior:
+ *    - begin_query and end_query always return true; return values from
+ *      the driver are ignored.
+ *    - generate_mipmap uses is_format_supported to determine success;
+ *      the return value from the driver is ignored.
+ *    - resource_commit always returns true; failures are ignored.
+ *    - If a non-async debug callback is set, the threaded context keeps using
+ *      asynchronous execution. This is OK for shader-db, but the driver
+ *      shouldn't use the debug callback in any other way.
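As a concrete illustration of guidelines 1) and 2), here is a minimal editor's sketch (not part of this patch; the xyz_* names are hypothetical, and the fence_finish prototype is assumed to match the pipe_screen interface of this Mesa version):

    /* Guideline 1: screen functions that receive a pipe_context unwrap it. */
    static boolean xyz_fence_finish(struct pipe_screen *screen,
                                    struct pipe_context *ctx,
                                    struct pipe_fence_handle *fence,
                                    uint64_t timeout)
    {
       if (ctx)
          ctx = threaded_context_unwrap_sync(ctx);

       /* ... the rest of the driver's fence wait, using the real context ... */
       return xyz_wait_for_fence(screen, ctx, fence, timeout);
    }

    /* Guideline 2: the driver's resource subclasses threaded_resource (which
     * itself embeds the pipe_resource) and is set up with
     * threaded_resource_init(). */
    struct xyz_resource {
       struct threaded_resource b;
       /* ... driver-private fields ... */
    };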
+ *
+ *
+ * Thread-safety requirements on context functions
+ * -----------------------------------------------
+ *
+ * These pipe_context functions are executed directly, so they shouldn't use
+ * pipe_context in an unsafe way. They are de facto screen functions now:
+ * - create_query
+ * - create_batch_query
+ * - create_*_state (all CSOs and shaders)
+ *    - Make sure the shader compiler doesn't use any per-context stuff.
+ *      (e.g. LLVM target machine)
+ *    - Only pipe_context's debug callback for shader dumps is guaranteed to
+ *      be up to date, because set_debug_callback synchronizes execution.
+ * - create_surface
+ * - surface_destroy
+ * - create_sampler_view
+ * - sampler_view_destroy
+ * - stream_output_target_destroy
+ * - transfer_map (only unsynchronized buffer mappings)
+ * - get_query_result (when threaded_query::flushed == true)
+ *
+ * Create calls causing a sync that can't be async due to driver limitations:
+ * - create_stream_output_target
+ *
+ *
+ * Transfer_map rules for buffer mappings
+ * --------------------------------------
+ *
+ * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
+ *    in the non-driver thread without flushing the queue. The driver will
+ *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
+ *    PIPE_TRANSFER_UNSYNCHRONIZED to indicate this.
+ *    Note that transfer_unmap is always enqueued and called from the driver
+ *    thread.
+ *
+ * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
+ *    the valid buffer range. The threaded context always sends
+ *    TC_TRANSFER_MAP_IGNORE_VALID_RANGE to indicate this. Ignoring the flag
+ *    will lead to failures.
+ *    The threaded context does its own detection of unsynchronized mappings.
+ *
+ * 3) The driver isn't allowed to do buffer invalidations by itself under any
+ *    circumstances. This is necessary for unsynchronized maps to map the
+ *    latest version of the buffer (because invalidations can be queued, while
+ *    unsynchronized maps are not queued and should return the latest storage
+ *    after invalidation). The threaded context always sends
+ *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
+ *    indicate this. Ignoring the flag will lead to failures.
+ *    The threaded context uses its own buffer invalidation mechanism.
+ *
+ *
+ * Additional requirements
+ * -----------------------
+ *
+ * get_query_result:
+ *    If threaded_query::flushed == true, get_query_result should assume that
+ *    it's called from a non-driver thread, in which case the driver shouldn't
+ *    use the context in an unsafe way.
+ *
+ * replace_buffer_storage:
+ *    The driver has to implement this callback, which will be called when
+ *    the threaded context wants to replace a resource's backing storage with
+ *    another resource's backing storage. The threaded context uses it to
+ *    implement buffer invalidation. This call is always queued.
+ *
+ *
+ * Performance gotchas
+ * -------------------
+ *
+ * Buffer invalidations are done unconditionally - they don't check whether
+ * the buffer is busy. This can cause drivers to have more live allocations
+ * and CPU mappings than necessary.
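A hedged sketch of how a driver's buffer transfer_map might honor the rules above (editor's illustration only; the xyz_* names are invented and the real mapping logic is driver-specific). The TC_TRANSFER_MAP_* flags are defined further down in this header:

    static void *
    xyz_buffer_transfer_map(struct pipe_context *ctx,
                            struct pipe_resource *resource,
                            unsigned level, unsigned usage,
                            const struct pipe_box *box,
                            struct pipe_transfer **transfer)
    {
       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) {
          /* Rule 1: called directly from the non-driver thread together with
           * PIPE_TRANSFER_UNSYNCHRONIZED; only do thread-safe work here. */
       }

       if (usage & TC_TRANSFER_MAP_IGNORE_VALID_RANGE) {
          /* Rule 2: don't promote this to an unsynchronized mapping based on
           * the driver's own valid-range tracking. */
       }

       if (usage & TC_TRANSFER_MAP_NO_INVALIDATE) {
          /* Rule 3: don't reallocate/invalidate the buffer here; invalidation
           * goes through the replace_buffer_storage callback instead. */
       }

       return NULL; /* the actual mapping code is omitted in this sketch */
    }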
+ *
+ *
+ * How it works (queue architecture)
+ * ---------------------------------
+ *
+ * There is a multithreaded queue consisting of batches, each batch consisting
+ * of call slots. Each call slot consists of an 8-byte header (call ID +
+ * call size + constant 32-bit marker for integrity checking) and an 8-byte
+ * body for per-call data. That is 16 bytes per call slot.
+ *
+ * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
+ * calls occupy multiple call slots depending on the size needed by call
+ * parameters. That means that calls can have a variable size in the batch.
+ * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
+ * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
+ * Even though the first call slot can use only 8 bytes for data, additional
+ * call slots used by the same call can use all 16 bytes for data.
+ * For example, a call using 2 call slots has 24 bytes of space for data.
+ *
+ * Once a batch is full and there is no space for the next call, it's flushed,
+ * meaning that it's added to the queue for execution in the other thread.
+ * The batches are ordered in a ring and reused once they are idle again.
+ * The batching is necessary for low queue/mutex overhead.
+ *
+ */
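To make the slot arithmetic concrete (an editor's worked example, not code from the patch): the first slot offers 8 bytes of data and every further slot offers 16, so a call whose parameters need P bytes takes 1 slot when P <= 8 and 1 + ceil((P - 8) / 16) slots otherwise; e.g. 24 bytes of parameters fit exactly in 2 slots.

    /* Illustrative helper restating the arithmetic described above; the actual
     * implementation determines the slot count per call type. */
    static inline unsigned
    example_num_call_slots(unsigned payload_bytes)
    {
       if (payload_bytes <= 8)
          return 1;
       return 1 + (payload_bytes - 8 + 15) / 16;
    }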
+
+#ifndef U_THREADED_CONTEXT_H
+#define U_THREADED_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_queue.h"
+#include "util/u_range.h"
+#include "util/slab.h"
+
+/* These are transfer flags sent to drivers. */
+/* Never infer whether it's safe to use unsynchronized mappings: */
+#define TC_TRANSFER_MAP_IGNORE_VALID_RANGE   (1u << 29)
+/* Don't invalidate buffers: */
+#define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
+/* transfer_map is called from a non-driver thread: */
+#define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
+
+/* Size of the queue = number of batch slots in memory.
+ * - 1 batch is always idle and records new commands
+ * - 1 batch is being executed
+ * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
+ *
+ * Use a size as small as possible for low CPU L2 cache usage but large enough
+ * so that the queue isn't stalled too often for not having enough idle batch
+ * slots.
+ */
+#define TC_MAX_BATCHES        10
+
+/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
+ * can occupy multiple call slots.
+ *
+ * The idea is to have batches as small as possible but large enough so that
+ * the queuing and mutex overhead is negligible.
+ */
+#define TC_CALLS_PER_BATCH    192
+
+/* Threshold for when to use the queue or sync. */
+#define TC_MAX_STRING_MARKER_BYTES  512
+
+/* Threshold for when to enqueue buffer/texture_subdata as-is.
+ * If the upload size is greater than this, it will do instead:
+ * - for buffers: DISCARD_RANGE is done by the threaded context
+ * - for textures: sync and call the driver directly
+ */
+#define TC_MAX_SUBDATA_BYTES        320
+
+typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
+                                               struct pipe_resource *dst,
+                                               struct pipe_resource *src);
+
+struct threaded_resource {
+   struct pipe_resource b;
+   const struct u_resource_vtbl *vtbl;
+
+   /* Since buffer invalidations are queued, we can't use the base resource
+    * for unsynchronized mappings. This points to the latest version of
+    * the buffer after the latest invalidation. It's only used for
+    * unsynchronized mappings in the non-driver thread. Initially it's set
+    * to &b.
+    */
+   struct pipe_resource *latest;
+
+   /* The buffer range which is initialized (with a write transfer, streamout,
+    * or writable shader resources). The remainder of the buffer is considered
+    * invalid and can be mapped unsynchronized.
+    *
+    * This allows unsynchronized mapping of a buffer range which hasn't been
+    * used yet. It's for applications which forget to use the unsynchronized
+    * map flag and expect the driver to figure it out.
+    *
+    * Drivers should set this to the full range for buffers backed by user
+    * memory.
+    */
+   struct util_range valid_buffer_range;
+
+   /* If "this" is not the base instance of the buffer, but it's one of its
+    * reallocations (set in "latest" of the base instance), this points to
+    * the valid range of the base instance. It's used for transfers after
+    * a buffer invalidation, because such transfers operate on "latest", not
+    * the base instance. Initially it's set to &valid_buffer_range.
+    */
+   struct util_range *base_valid_buffer_range;
+
+   /* Drivers are required to update this for shared resources and user
+    * pointers. */
+   bool is_shared;
+   bool is_user_ptr;
+};
+
+struct threaded_transfer {
+   struct pipe_transfer b;
+
+   /* Staging buffer for DISCARD_RANGE transfers. */
+   struct pipe_resource *staging;
+
+   /* Offset into the staging buffer, because the backing buffer is
+    * sub-allocated. */
+   unsigned offset;
+};
+
+struct threaded_query {
+   /* The query is added to the list in end_query and removed in flush. */
+   struct list_head head_unflushed;
+
+   /* Whether pipe->flush has been called after end_query. */
+   bool flushed;
+};
+
+/* This is the second half of tc_call containing call data.
+ * Most calls will typecast this to the type they need, typically larger
+ * than 8 bytes.
+ */
+union tc_payload {
+   struct pipe_query *query;
+   struct pipe_resource *resource;
+   struct pipe_transfer *transfer;
+   uint64_t __use_8_bytes;
+};
+
+struct tc_call {
+   unsigned sentinel;
+   ushort num_call_slots;
+   ushort call_id;
+   union tc_payload payload;
+};
+
+struct tc_batch {
+   struct pipe_context *pipe;
+   unsigned sentinel;
+   unsigned num_total_call_slots;
+   struct util_queue_fence fence;
+   struct tc_call call[TC_CALLS_PER_BATCH];
+   unsigned sentinel2;
+};
+
+struct threaded_context {
+   struct pipe_context base;
+   struct pipe_context *pipe;
+   struct slab_child_pool pool_transfers;
+   tc_replace_buffer_storage_func replace_buffer_storage;
+   unsigned map_buffer_alignment;
+
+   struct list_head unflushed_queries;
+
+   /* Counters for the HUD. */
+   unsigned num_offloaded_slots;
+   unsigned num_direct_slots;
+   unsigned num_syncs;
+
+   struct util_queue queue;
+   struct util_queue_fence *fence;
+
+   unsigned last, next;
+   struct tc_batch batch_slots[TC_MAX_BATCHES];
+};
+
+void threaded_resource_init(struct pipe_resource *res);
+void threaded_resource_deinit(struct pipe_resource *res);
+struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
+
+struct pipe_context *
+threaded_context_create(struct pipe_context *pipe,
+                        struct slab_parent_pool *parent_transfer_pool,
+                        tc_replace_buffer_storage_func replace_buffer,
+                        struct threaded_context **out);
+
+static inline struct threaded_context *
+threaded_context(struct pipe_context *pipe)
+{
+   return (struct threaded_context*)pipe;
+}
+
+static inline struct threaded_resource *
+threaded_resource(struct pipe_resource *res)
+{
+   return (struct threaded_resource*)res;
+}
+
+static inline struct threaded_query *
+threaded_query(struct pipe_query *q)
+{
+   return (struct threaded_query*)q;
+}
+
+static inline struct threaded_transfer *
+threaded_transfer(struct pipe_transfer *transfer)
+{
+   return (struct threaded_transfer*)transfer;
+}
+
+#endif
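Finally, a rough sketch of how a driver might wire everything together (editor's illustration; the xyz_* names, the slab pool location, and the raw-context helper are assumptions; only threaded_context_create, threaded_resource_init/deinit, and the tc_replace_buffer_storage_func type come from this header):

    /* Mandatory callback: make "dst" use "src"'s backing storage from now on.
     * The threaded context calls this to implement buffer invalidation. */
    static void xyz_replace_buffer_storage(struct pipe_context *ctx,
                                           struct pipe_resource *dst,
                                           struct pipe_resource *src)
    {
       /* e.g. make dst reference src's GPU buffer and drop dst's old one */
    }

    static struct pipe_context *
    xyz_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
    {
       struct pipe_context *pipe = xyz_create_context_raw(screen, priv, flags);
       struct threaded_context *tc;

       if (!pipe)
          return NULL;

       /* transfer_pool is assumed to be a slab_parent_pool kept on the screen. */
       return threaded_context_create(pipe,
                                      &xyz_screen(screen)->transfer_pool,
                                      xyz_replace_buffer_storage,
                                      &tc);
    }

The driver's resource_create/resource_destroy would additionally call threaded_resource_init()/threaded_resource_deinit() on the embedded threaded_resource (see the xyz_resource sketch after the guidelines above) and set is_shared/is_user_ptr where applicable.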