diff options
author | Marek Olšák <[email protected]> | 2017-03-02 01:15:55 +0100 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2017-05-15 13:01:33 +0200 |
commit | b8e552424eed58d95671da3191c7199cf171b3f0 (patch) | |
tree | 04b3030072ef8285580f2b31162d0e45a0c92dd1 /src/gallium/auxiliary/util | |
parent | dca19b1d427f0ecbc0bbd530d1fc3f6c0ce2b5c1 (diff) |
gallium/util: add threaded_context as a pipe_context wrapper
v2: - rename num_calls -> num_call_slots (for tc_call)
- rename num_calls -> num_total_call_slots (for tc_batch)
- rename num_offloaded/direct_calls -> num_offloaded/direct_slots
- declare slot[0] instead of slot[1]
- remove no-op leftover code from tc_draw_vbo
- use tc_set_resource_reference to fill threaded_transfer
- fix map flags for sparse buffers
- cosmetic changes
Reviewed-by: Nicolai Hähnle <[email protected]>
Tested-by: Dieter Nützel <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/util')
-rw-r--r-- | src/gallium/auxiliary/util/u_threaded_context.c | 2305 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_threaded_context.h | 349 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_threaded_context_calls.h | 66 |
3 files changed, 2720 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c new file mode 100644 index 00000000000..b44430fd096 --- /dev/null +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -0,0 +1,2305 @@ +/************************************************************************** + * + * Copyright 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_threaded_context.h" +#include "util/u_cpu_detect.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" + +/* 0 = disabled, 1 = assertions, 2 = printfs */ +#define TC_DEBUG 0 + +#if TC_DEBUG >= 1 +#define tc_assert assert +#else +#define tc_assert(x) +#endif + +#if TC_DEBUG >= 2 +#define tc_printf printf +#define tc_asprintf asprintf +#define tc_strcmp strcmp +#else +#define tc_printf(...) +#define tc_asprintf(...) 0 +#define tc_strcmp(...) 0 +#endif + +#define TC_SENTINEL 0x5ca1ab1e + +enum tc_call_id { +#define CALL(name) TC_CALL_##name, +#include "u_threaded_context_calls.h" +#undef CALL + TC_NUM_CALLS, +}; + +typedef void (*tc_execute)(struct pipe_context *pipe, union tc_payload *payload); + +static const tc_execute execute_func[TC_NUM_CALLS]; + +static void +tc_batch_check(struct tc_batch *batch) +{ + tc_assert(batch->sentinel == TC_SENTINEL); + tc_assert(batch->sentinel2 == TC_SENTINEL); + tc_assert(batch->num_total_call_slots <= TC_CALLS_PER_BATCH); +} + +static void +tc_debug_check(struct threaded_context *tc) +{ + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) { + tc_batch_check(&tc->batch_slots[i]); + tc_assert(tc->batch_slots[i].pipe == tc->pipe); + } +} + +static void +tc_batch_execute(void *job, int thread_index) +{ + struct tc_batch *batch = job; + struct pipe_context *pipe = batch->pipe; + struct tc_call *last = &batch->call[batch->num_total_call_slots]; + + tc_batch_check(batch); + + for (struct tc_call *iter = batch->call; iter != last; + iter += iter->num_call_slots) { + tc_assert(iter->sentinel == TC_SENTINEL); + execute_func[iter->call_id](pipe, &iter->payload); + } + + tc_batch_check(batch); + batch->num_total_call_slots = 0; +} + +static void +tc_batch_flush(struct threaded_context *tc) +{ + struct tc_batch *next = &tc->batch_slots[tc->next]; + + 
tc_assert(next->num_total_call_slots != 0); + tc_batch_check(next); + tc_debug_check(tc); + p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots); + + util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute, + NULL); + tc->last = tc->next; + tc->next = (tc->next + 1) % TC_MAX_BATCHES; +} + +/* This is the function that adds variable-sized calls into the current + * batch. It also flushes the batch if there is not enough space there. + * All other higher-level "add" functions use it. + */ +static union tc_payload * +tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id, + unsigned payload_size) +{ + struct tc_batch *next = &tc->batch_slots[tc->next]; + unsigned total_size = offsetof(struct tc_call, payload) + payload_size; + unsigned num_call_slots = DIV_ROUND_UP(total_size, sizeof(struct tc_call)); + + tc_debug_check(tc); + + if (unlikely(next->num_total_call_slots + num_call_slots > TC_CALLS_PER_BATCH)) { + tc_batch_flush(tc); + next = &tc->batch_slots[tc->next]; + tc_assert(next->num_total_call_slots == 0); + } + + tc_assert(util_queue_fence_is_signalled(&next->fence)); + + struct tc_call *call = &next->call[next->num_total_call_slots]; + next->num_total_call_slots += num_call_slots; + + call->sentinel = TC_SENTINEL; + call->call_id = id; + call->num_call_slots = num_call_slots; + + tc_debug_check(tc); + return &call->payload; +} + +#define tc_add_struct_typed_call(tc, execute, type) \ + ((struct type*)tc_add_sized_call(tc, execute, sizeof(struct type))) + +#define tc_add_slot_based_call(tc, execute, type, num_slots) \ + ((struct type*)tc_add_sized_call(tc, execute, \ + sizeof(struct type) + \ + sizeof(((struct type*)NULL)->slot[0]) * \ + num_slots)) + +static union tc_payload * +tc_add_small_call(struct threaded_context *tc, enum tc_call_id id) +{ + return tc_add_sized_call(tc, id, 0); +} + +static void +_tc_sync(struct threaded_context *tc, const char *info, const char *func) +{ + struct tc_batch *last = 
&tc->batch_slots[tc->last]; + struct tc_batch *next = &tc->batch_slots[tc->next]; + bool synced = false; + + tc_debug_check(tc); + + /* Only wait for queued calls... */ + if (!util_queue_fence_is_signalled(&last->fence)) { + util_queue_fence_wait(&last->fence); + synced = true; + } + + tc_debug_check(tc); + + /* .. and execute unflushed calls directly. */ + if (next->num_total_call_slots) { + p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots); + tc_batch_execute(next, 0); + synced = true; + } + + if (synced) { + p_atomic_inc(&tc->num_syncs); + + if (tc_strcmp(func, "tc_destroy") != 0) + tc_printf("sync %s %s\n", func, info); + } + + tc_debug_check(tc); +} + +#define tc_sync(tc) _tc_sync(tc, "", __func__) +#define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__) + +static void +tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src) +{ + *dst = NULL; + pipe_resource_reference(dst, src); +} + +void +threaded_resource_init(struct pipe_resource *res) +{ + struct threaded_resource *tres = threaded_resource(res); + + tres->latest = &tres->b; + util_range_init(&tres->valid_buffer_range); + tres->base_valid_buffer_range = &tres->valid_buffer_range; + tres->is_shared = false; + tres->is_user_ptr = false; +} + +void +threaded_resource_deinit(struct pipe_resource *res) +{ + struct threaded_resource *tres = threaded_resource(res); + + if (tres->latest != &tres->b) + pipe_resource_reference(&tres->latest, NULL); + util_range_destroy(&tres->valid_buffer_range); +} + +struct pipe_context * +threaded_context_unwrap_sync(struct pipe_context *pipe) +{ + if (!pipe || !pipe->priv) + return pipe; + + tc_sync(threaded_context(pipe)); + return (struct pipe_context*)pipe->priv; +} + + +/******************************************************************** + * simple functions + */ + +#define TC_FUNC1(func, m_payload, qualifier, type, deref, deref2) \ + static void \ + tc_call_##func(struct pipe_context *pipe, union tc_payload *payload) \ + { \ + 
pipe->func(pipe, deref2((type*)payload)); \ + } \ + \ + static void \ + tc_##func(struct pipe_context *_pipe, qualifier type deref param) \ + { \ + struct threaded_context *tc = threaded_context(_pipe); \ + type *p = (type*)tc_add_sized_call(tc, TC_CALL_##func, sizeof(type)); \ + *p = deref(param); \ + } + +TC_FUNC1(set_active_query_state, flags, , boolean, , *) + +TC_FUNC1(set_blend_color, blend_color, const, struct pipe_blend_color, *, ) +TC_FUNC1(set_stencil_ref, stencil_ref, const, struct pipe_stencil_ref, *, ) +TC_FUNC1(set_clip_state, clip_state, const, struct pipe_clip_state, *, ) +TC_FUNC1(set_sample_mask, sample_mask, , unsigned, , *) +TC_FUNC1(set_min_samples, min_samples, , unsigned, , *) +TC_FUNC1(set_polygon_stipple, polygon_stipple, const, struct pipe_poly_stipple, *, ) + +TC_FUNC1(texture_barrier, flags, , unsigned, , *) +TC_FUNC1(memory_barrier, flags, , unsigned, , *) + + +/******************************************************************** + * queries + */ + +static struct pipe_query * +tc_create_query(struct pipe_context *_pipe, unsigned query_type, + unsigned index) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + return pipe->create_query(pipe, query_type, index); +} + +static struct pipe_query * +tc_create_batch_query(struct pipe_context *_pipe, unsigned num_queries, + unsigned *query_types) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + return pipe->create_batch_query(pipe, num_queries, query_types); +} + +static void +tc_call_destroy_query(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->destroy_query(pipe, payload->query); +} + +static void +tc_destroy_query(struct pipe_context *_pipe, struct pipe_query *query) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_query *tq = threaded_query(query); + + if (tq->head_unflushed.next) + LIST_DEL(&tq->head_unflushed); + + tc_add_small_call(tc, 
TC_CALL_destroy_query)->query = query; +} + +static void +tc_call_begin_query(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->begin_query(pipe, payload->query); +} + +static boolean +tc_begin_query(struct pipe_context *_pipe, struct pipe_query *query) +{ + struct threaded_context *tc = threaded_context(_pipe); + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_begin_query); + + payload->query = query; + return true; /* we don't care about the return value for this call */ +} + +static void +tc_call_end_query(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->end_query(pipe, payload->query); +} + +static bool +tc_end_query(struct pipe_context *_pipe, struct pipe_query *query) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_query *tq = threaded_query(query); + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_end_query); + + payload->query = query; + + tq->flushed = false; + if (!tq->head_unflushed.next) + LIST_ADD(&tq->head_unflushed, &tc->unflushed_queries); + + return true; /* we don't care about the return value for this call */ +} + +static boolean +tc_get_query_result(struct pipe_context *_pipe, + struct pipe_query *query, boolean wait, + union pipe_query_result *result) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_query *tq = threaded_query(query); + struct pipe_context *pipe = tc->pipe; + + if (!tq->flushed) + tc_sync_msg(tc, wait ? 
"wait" : "nowait"); + + bool success = pipe->get_query_result(pipe, query, wait, result); + + if (success) { + tq->flushed = true; + if (tq->head_unflushed.next) + LIST_DEL(&tq->head_unflushed); + } + return success; +} + +struct tc_query_result_resource { + struct pipe_query *query; + boolean wait; + enum pipe_query_value_type result_type; + int index; + struct pipe_resource *resource; + unsigned offset; +}; + +static void +tc_call_get_query_result_resource(struct pipe_context *pipe, + union tc_payload *payload) +{ + struct tc_query_result_resource *p = (struct tc_query_result_resource *)payload; + + pipe->get_query_result_resource(pipe, p->query, p->wait, p->result_type, + p->index, p->resource, p->offset); + pipe_resource_reference(&p->resource, NULL); +} + +static void +tc_get_query_result_resource(struct pipe_context *_pipe, + struct pipe_query *query, boolean wait, + enum pipe_query_value_type result_type, int index, + struct pipe_resource *resource, unsigned offset) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_query_result_resource *p = + tc_add_struct_typed_call(tc, TC_CALL_get_query_result_resource, + tc_query_result_resource); + + p->query = query; + p->wait = wait; + p->result_type = result_type; + p->index = index; + tc_set_resource_reference(&p->resource, resource); + p->offset = offset; +} + +struct tc_render_condition { + struct pipe_query *query; + bool condition; + unsigned mode; +}; + +static void +tc_call_render_condition(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_render_condition *p = (struct tc_render_condition *)payload; + pipe->render_condition(pipe, p->query, p->condition, p->mode); +} + +static void +tc_render_condition(struct pipe_context *_pipe, + struct pipe_query *query, boolean condition, + uint mode) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_render_condition *p = + tc_add_struct_typed_call(tc, TC_CALL_render_condition, tc_render_condition); + + p->query 
= query; + p->condition = condition; + p->mode = mode; +} + + +/******************************************************************** + * constant (immutable) states + */ + +#define TC_CSO_CREATE(name, sname) \ + static void * \ + tc_create_##name##_state(struct pipe_context *_pipe, \ + const struct pipe_##sname##_state *state) \ + { \ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; \ + return pipe->create_##name##_state(pipe, state); \ + } + +#define TC_CSO_BIND(name) TC_FUNC1(bind_##name##_state, cso, , void *, , *) +#define TC_CSO_DELETE(name) TC_FUNC1(delete_##name##_state, cso, , void *, , *) + +#define TC_CSO_WHOLE2(name, sname) \ + TC_CSO_CREATE(name, sname) \ + TC_CSO_BIND(name) \ + TC_CSO_DELETE(name) + +#define TC_CSO_WHOLE(name) TC_CSO_WHOLE2(name, name) + +TC_CSO_WHOLE(blend) +TC_CSO_WHOLE(rasterizer) +TC_CSO_WHOLE(depth_stencil_alpha) +TC_CSO_WHOLE(compute) +TC_CSO_WHOLE2(fs, shader) +TC_CSO_WHOLE2(vs, shader) +TC_CSO_WHOLE2(gs, shader) +TC_CSO_WHOLE2(tcs, shader) +TC_CSO_WHOLE2(tes, shader) +TC_CSO_CREATE(sampler, sampler) +TC_CSO_DELETE(sampler) +TC_CSO_BIND(vertex_elements) +TC_CSO_DELETE(vertex_elements) + +static void * +tc_create_vertex_elements_state(struct pipe_context *_pipe, unsigned count, + const struct pipe_vertex_element *elems) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + + return pipe->create_vertex_elements_state(pipe, count, elems); +} + +struct tc_sampler_states { + ubyte shader, start, count; + void *slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_bind_sampler_states(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_sampler_states *p = (struct tc_sampler_states *)payload; + pipe->bind_sampler_states(pipe, p->shader, p->start, p->count, p->slot); +} + +static void +tc_bind_sampler_states(struct pipe_context *_pipe, + enum pipe_shader_type shader, + unsigned start, unsigned count, void **states) +{ + if (!count) + return; + + struct threaded_context *tc = 
threaded_context(_pipe); + struct tc_sampler_states *p = + tc_add_slot_based_call(tc, TC_CALL_bind_sampler_states, tc_sampler_states, count); + + p->shader = shader; + p->start = start; + p->count = count; + memcpy(p->slot, states, count * sizeof(states[0])); +} + + +/******************************************************************** + * immediate states + */ + +static void +tc_call_set_framebuffer_state(struct pipe_context *pipe, union tc_payload *payload) +{ + struct pipe_framebuffer_state *p = (struct pipe_framebuffer_state *)payload; + + pipe->set_framebuffer_state(pipe, p); + + unsigned nr_cbufs = p->nr_cbufs; + for (unsigned i = 0; i < nr_cbufs; i++) + pipe_surface_reference(&p->cbufs[i], NULL); + pipe_surface_reference(&p->zsbuf, NULL); +} + +static void +tc_set_framebuffer_state(struct pipe_context *_pipe, + const struct pipe_framebuffer_state *fb) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_framebuffer_state *p = + tc_add_struct_typed_call(tc, TC_CALL_set_framebuffer_state, + pipe_framebuffer_state); + unsigned nr_cbufs = fb->nr_cbufs; + + p->width = fb->width; + p->height = fb->height; + p->samples = fb->samples; + p->layers = fb->layers; + p->nr_cbufs = nr_cbufs; + + for (unsigned i = 0; i < nr_cbufs; i++) { + p->cbufs[i] = NULL; + pipe_surface_reference(&p->cbufs[i], fb->cbufs[i]); + } + p->zsbuf = NULL; + pipe_surface_reference(&p->zsbuf, fb->zsbuf); +} + +static void +tc_call_set_tess_state(struct pipe_context *pipe, union tc_payload *payload) +{ + float *p = (float*)payload; + pipe->set_tess_state(pipe, p, p + 4); +} + +static void +tc_set_tess_state(struct pipe_context *_pipe, + const float default_outer_level[4], + const float default_inner_level[2]) +{ + struct threaded_context *tc = threaded_context(_pipe); + float *p = (float*)tc_add_sized_call(tc, TC_CALL_set_tess_state, + sizeof(float) * 6); + + memcpy(p, default_outer_level, 4 * sizeof(float)); + memcpy(p + 4, default_inner_level, 2 * sizeof(float)); +} + 
+struct tc_constant_buffer { + ubyte shader, index; + struct pipe_constant_buffer cb; +}; + +static void +tc_call_set_constant_buffer(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_constant_buffer *p = (struct tc_constant_buffer *)payload; + + pipe->set_constant_buffer(pipe, + p->shader, + p->index, + &p->cb); + pipe_resource_reference(&p->cb.buffer, NULL); +} + +static void +tc_set_constant_buffer(struct pipe_context *_pipe, + uint shader, uint index, + const struct pipe_constant_buffer *cb) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_resource *buffer = NULL; + unsigned offset; + + /* This must be done before adding set_constant_buffer, because it could + * generate e.g. transfer_unmap and flush partially-uninitialized + * set_constant_buffer to the driver if it was done afterwards. + */ + if (cb && cb->user_buffer) { + u_upload_data(tc->base.const_uploader, 0, cb->buffer_size, 64, + cb->user_buffer, &offset, &buffer); + } + + struct tc_constant_buffer *p = + tc_add_struct_typed_call(tc, TC_CALL_set_constant_buffer, + tc_constant_buffer); + p->shader = shader; + p->index = index; + + if (cb) { + if (cb->user_buffer) { + p->cb.buffer_size = cb->buffer_size; + p->cb.user_buffer = NULL; + p->cb.buffer_offset = offset; + p->cb.buffer = buffer; + } else { + tc_set_resource_reference(&p->cb.buffer, + cb->buffer); + memcpy(&p->cb, cb, sizeof(*cb)); + } + } else { + memset(&p->cb, 0, sizeof(*cb)); + } +} + +struct tc_scissors { + ubyte start, count; + struct pipe_scissor_state slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_scissor_states(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_scissors *p = (struct tc_scissors *)payload; + pipe->set_scissor_states(pipe, p->start, p->count, p->slot); +} + +static void +tc_set_scissor_states(struct pipe_context *_pipe, + unsigned start, unsigned count, + const struct pipe_scissor_state *states) +{ + struct threaded_context *tc = 
threaded_context(_pipe); + struct tc_scissors *p = + tc_add_slot_based_call(tc, TC_CALL_set_scissor_states, tc_scissors, count); + + p->start = start; + p->count = count; + memcpy(&p->slot, states, count * sizeof(states[0])); +} + +struct tc_viewports { + ubyte start, count; + struct pipe_viewport_state slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_viewport_states(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_viewports *p = (struct tc_viewports *)payload; + pipe->set_viewport_states(pipe, p->start, p->count, p->slot); +} + +static void +tc_set_viewport_states(struct pipe_context *_pipe, + unsigned start, unsigned count, + const struct pipe_viewport_state *states) +{ + if (!count) + return; + + struct threaded_context *tc = threaded_context(_pipe); + struct tc_viewports *p = + tc_add_slot_based_call(tc, TC_CALL_set_viewport_states, tc_viewports, count); + + p->start = start; + p->count = count; + memcpy(&p->slot, states, count * sizeof(states[0])); +} + +struct tc_window_rects { + bool include; + ubyte count; + struct pipe_scissor_state slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_window_rectangles(struct pipe_context *pipe, + union tc_payload *payload) +{ + struct tc_window_rects *p = (struct tc_window_rects *)payload; + pipe->set_window_rectangles(pipe, p->include, p->count, p->slot); +} + +static void +tc_set_window_rectangles(struct pipe_context *_pipe, boolean include, + unsigned count, + const struct pipe_scissor_state *rects) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_window_rects *p = + tc_add_slot_based_call(tc, TC_CALL_set_window_rectangles, tc_window_rects, count); + + p->include = include; + p->count = count; + memcpy(p->slot, rects, count * sizeof(rects[0])); +} + +struct tc_sampler_views { + ubyte shader, start, count; + struct pipe_sampler_view *slot[0]; /* more will be allocated if needed */ +}; + +static void 
+tc_call_set_sampler_views(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_sampler_views *p = (struct tc_sampler_views *)payload; + unsigned count = p->count; + + pipe->set_sampler_views(pipe, p->shader, p->start, p->count, p->slot); + for (unsigned i = 0; i < count; i++) + pipe_sampler_view_reference(&p->slot[i], NULL); +} + +static void +tc_set_sampler_views(struct pipe_context *_pipe, + enum pipe_shader_type shader, + unsigned start, unsigned count, + struct pipe_sampler_view **views) +{ + if (!count) + return; + + struct threaded_context *tc = threaded_context(_pipe); + struct tc_sampler_views *p = + tc_add_slot_based_call(tc, TC_CALL_set_sampler_views, tc_sampler_views, count); + + p->shader = shader; + p->start = start; + p->count = count; + + if (views) { + for (unsigned i = 0; i < count; i++) { + p->slot[i] = NULL; + pipe_sampler_view_reference(&p->slot[i], views[i]); + } + } else { + memset(p->slot, 0, count * sizeof(views[0])); + } +} + +struct tc_shader_images { + ubyte shader, start, count; + struct pipe_image_view slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_shader_images(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_shader_images *p = (struct tc_shader_images *)payload; + unsigned count = p->count; + + pipe->set_shader_images(pipe, p->shader, p->start, p->count, p->slot); + + for (unsigned i = 0; i < count; i++) + pipe_resource_reference(&p->slot[i].resource, NULL); +} + +static void +tc_set_shader_images(struct pipe_context *_pipe, + enum pipe_shader_type shader, + unsigned start, unsigned count, + const struct pipe_image_view *images) +{ + if (!count) + return; + + struct threaded_context *tc = threaded_context(_pipe); + struct tc_shader_images *p = + tc_add_slot_based_call(tc, TC_CALL_set_shader_images, tc_shader_images, count); + + p->shader = shader; + p->start = start; + p->count = count; + + if (images) { + for (unsigned i = 0; i < count; i++) { + 
tc_set_resource_reference(&p->slot[i].resource, images[i].resource); + + if (images[i].access & PIPE_IMAGE_ACCESS_WRITE && + images[i].resource && + images[i].resource->target == PIPE_BUFFER) { + struct threaded_resource *tres = + threaded_resource(images[i].resource); + + util_range_add(&tres->valid_buffer_range, images[i].u.buf.offset, + images[i].u.buf.offset + images[i].u.buf.size); + } + } + memcpy(p->slot, images, count * sizeof(images[0])); + } else { + memset(p->slot, 0, count * sizeof(images[0])); + } +} + +struct tc_shader_buffers { + ubyte shader, start, count; + struct pipe_shader_buffer slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_shader_buffers(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_shader_buffers *p = (struct tc_shader_buffers *)payload; + unsigned count = p->count; + + pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, p->slot); + + for (unsigned i = 0; i < count; i++) + pipe_resource_reference(&p->slot[i].buffer, NULL); +} + +static void +tc_set_shader_buffers(struct pipe_context *_pipe, unsigned shader, + unsigned start, unsigned count, + const struct pipe_shader_buffer *buffers) +{ + if (!count) + return; + + struct threaded_context *tc = threaded_context(_pipe); + struct tc_shader_buffers *p = + tc_add_slot_based_call(tc, TC_CALL_set_shader_buffers, tc_shader_buffers, count); + + p->shader = shader; + p->start = start; + p->count = count; + + if (buffers) { + for (unsigned i = 0; i < count; i++) { + struct pipe_shader_buffer *dst = &p->slot[i]; + const struct pipe_shader_buffer *src = buffers + i; + + tc_set_resource_reference(&dst->buffer, src->buffer); + dst->buffer_offset = src->buffer_offset; + dst->buffer_size = src->buffer_size; + + if (src->buffer) { + struct threaded_resource *tres = threaded_resource(src->buffer); + + util_range_add(&tres->valid_buffer_range, src->buffer_offset, + src->buffer_offset + src->buffer_size); + } + } + } else { + memset(p->slot, 0, 
count * sizeof(buffers[0])); + } +} + +struct tc_vertex_buffers { + ubyte start, count; + bool unbind; + struct pipe_vertex_buffer slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_set_vertex_buffers(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_vertex_buffers *p = (struct tc_vertex_buffers *)payload; + unsigned count = p->count; + + if (p->unbind) { + pipe->set_vertex_buffers(pipe, p->start, count, NULL); + return; + } + + for (unsigned i = 0; i < count; i++) + tc_assert(!p->slot[i].is_user_buffer); + + pipe->set_vertex_buffers(pipe, p->start, count, p->slot); + for (unsigned i = 0; i < count; i++) + pipe_resource_reference(&p->slot[i].buffer.resource, NULL); +} + +static void +tc_set_vertex_buffers(struct pipe_context *_pipe, + unsigned start, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct threaded_context *tc = threaded_context(_pipe); + + if (!count) + return; + + if (buffers) { + struct tc_vertex_buffers *p = + tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count); + p->start = start; + p->count = count; + p->unbind = false; + + for (unsigned i = 0; i < count; i++) { + struct pipe_vertex_buffer *dst = &p->slot[i]; + const struct pipe_vertex_buffer *src = buffers + i; + + tc_assert(!src->is_user_buffer); + dst->stride = src->stride; + dst->is_user_buffer = false; + tc_set_resource_reference(&dst->buffer.resource, + src->buffer.resource); + dst->buffer_offset = src->buffer_offset; + } + } else { + struct tc_vertex_buffers *p = + tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, 0); + p->start = start; + p->count = count; + p->unbind = true; + } +} + +struct tc_stream_outputs { + unsigned count; + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned offsets[PIPE_MAX_SO_BUFFERS]; +}; + +static void +tc_call_set_stream_output_targets(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_stream_outputs *p = 
(struct tc_stream_outputs *)payload; + unsigned count = p->count; + + pipe->set_stream_output_targets(pipe, count, p->targets, p->offsets); + for (unsigned i = 0; i < count; i++) + pipe_so_target_reference(&p->targets[i], NULL); +} + +static void +tc_set_stream_output_targets(struct pipe_context *_pipe, + unsigned count, + struct pipe_stream_output_target **tgs, + const unsigned *offsets) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_stream_outputs *p = + tc_add_struct_typed_call(tc, TC_CALL_set_stream_output_targets, + tc_stream_outputs); + + for (unsigned i = 0; i < count; i++) { + p->targets[i] = NULL; + pipe_so_target_reference(&p->targets[i], tgs[i]); + } + p->count = count; + memcpy(p->offsets, offsets, count * sizeof(unsigned)); +} + +static void +tc_set_compute_resources(struct pipe_context *_pipe, unsigned start, + unsigned count, struct pipe_surface **resources) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->set_compute_resources(pipe, start, count, resources); +} + +static void +tc_set_global_binding(struct pipe_context *_pipe, unsigned first, + unsigned count, struct pipe_resource **resources, + uint32_t **handles) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->set_global_binding(pipe, first, count, resources, handles); +} + + +/******************************************************************** + * views + */ + +static struct pipe_surface * +tc_create_surface(struct pipe_context *_pipe, + struct pipe_resource *resource, + const struct pipe_surface *surf_tmpl) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + struct pipe_surface *view = + pipe->create_surface(pipe, resource, surf_tmpl); + + if (view) + view->context = _pipe; + return view; +} + +static void +tc_surface_destroy(struct pipe_context *_pipe, + struct pipe_surface *surf) +{ + struct pipe_context 
*pipe = threaded_context(_pipe)->pipe; + + pipe->surface_destroy(pipe, surf); +} + +static struct pipe_sampler_view * +tc_create_sampler_view(struct pipe_context *_pipe, + struct pipe_resource *resource, + const struct pipe_sampler_view *templ) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + struct pipe_sampler_view *view = + pipe->create_sampler_view(pipe, resource, templ); + + if (view) + view->context = _pipe; + return view; +} + +static void +tc_sampler_view_destroy(struct pipe_context *_pipe, + struct pipe_sampler_view *view) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + + pipe->sampler_view_destroy(pipe, view); +} + +static struct pipe_stream_output_target * +tc_create_stream_output_target(struct pipe_context *_pipe, + struct pipe_resource *res, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + struct threaded_resource *tres = threaded_resource(res); + struct pipe_stream_output_target *view; + + tc_sync(threaded_context(_pipe)); + util_range_add(&tres->valid_buffer_range, buffer_offset, + buffer_offset + buffer_size); + + view = pipe->create_stream_output_target(pipe, res, buffer_offset, + buffer_size); + if (view) + view->context = _pipe; + return view; +} + +static void +tc_stream_output_target_destroy(struct pipe_context *_pipe, + struct pipe_stream_output_target *target) +{ + struct pipe_context *pipe = threaded_context(_pipe)->pipe; + + pipe->stream_output_target_destroy(pipe, target); +} + + +/******************************************************************** + * transfer + */ + +struct tc_replace_buffer_storage { + struct pipe_resource *dst; + struct pipe_resource *src; + tc_replace_buffer_storage_func func; +}; + +static void +tc_call_replace_buffer_storage(struct pipe_context *pipe, + union tc_payload *payload) +{ + struct tc_replace_buffer_storage *p = + (struct tc_replace_buffer_storage *)payload; + + p->func(pipe, p->dst, p->src); + 
pipe_resource_reference(&p->dst, NULL); + pipe_resource_reference(&p->src, NULL); +} + +static bool +tc_invalidate_buffer(struct threaded_context *tc, + struct threaded_resource *tbuf) +{ + /* We can't check if the buffer is idle, so we invalidate it + * unconditionally. */ + struct pipe_screen *screen = tc->base.screen; + struct pipe_resource *new_buf; + + /* Shared, pinned, and sparse buffers can't be reallocated. */ + if (tbuf->is_shared || + tbuf->is_user_ptr || + tbuf->b.flags & PIPE_RESOURCE_FLAG_SPARSE) + return false; + + /* Allocate a new one. */ + new_buf = screen->resource_create(screen, &tbuf->b); + if (!new_buf) + return false; + + /* Replace the "latest" pointer. */ + if (tbuf->latest != &tbuf->b) + pipe_resource_reference(&tbuf->latest, NULL); + + tbuf->latest = new_buf; + util_range_set_empty(&tbuf->valid_buffer_range); + + /* The valid range should point to the original buffer. */ + threaded_resource(new_buf)->base_valid_buffer_range = + &tbuf->valid_buffer_range; + + /* Enqueue storage replacement of the original buffer. */ + struct tc_replace_buffer_storage *p = + tc_add_struct_typed_call(tc, TC_CALL_replace_buffer_storage, + tc_replace_buffer_storage); + + p->func = tc->replace_buffer_storage; + tc_set_resource_reference(&p->dst, &tbuf->b); + tc_set_resource_reference(&p->src, new_buf); + return true; +} + +static unsigned +tc_improve_map_buffer_flags(struct threaded_context *tc, + struct threaded_resource *tres, unsigned usage, + unsigned offset, unsigned size) +{ + /* Sparse buffers can't be mapped directly and can't be reallocated + * (fully invalidated). That may just be a radeonsi limitation, but + * the threaded context must obey it with radeonsi. + */ + if (tres->b.flags & PIPE_RESOURCE_FLAG_SPARSE) { + /* We can use DISCARD_RANGE instead of full discard. This is the only + * fast path for sparse buffers that doesn't need thread synchronization. 
+ */ + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) + usage |= PIPE_TRANSFER_DISCARD_RANGE; + + /* Allow DISCARD_WHOLE_RESOURCE and inferring UNSYNCHRONIZED in drivers. + * The threaded context doesn't do unsynchronized mappings and invalida- + * tions of sparse buffers, therefore a correct driver behavior won't + * result in an incorrect behavior with the threaded context. + */ + return usage; + } + + /* Handle CPU reads trivially. */ + if (usage & PIPE_TRANSFER_READ) { + /* Drivers aren't allowed to do buffer invalidations. */ + return (usage & ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) | + TC_TRANSFER_MAP_NO_INVALIDATE | + TC_TRANSFER_MAP_IGNORE_VALID_RANGE; + } + + /* See if the buffer range being mapped has never been initialized, + * in which case it can be mapped unsynchronized. */ + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && + !tres->is_shared && + !util_ranges_intersect(&tres->valid_buffer_range, offset, offset + size)) + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { + /* If discarding the entire range, discard the whole resource instead. */ + if (usage & PIPE_TRANSFER_DISCARD_RANGE && + offset == 0 && size == tres->b.width0) + usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; + + /* Discard the whole resource if needed. */ + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { + if (tc_invalidate_buffer(tc, tres)) + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + else + usage |= PIPE_TRANSFER_DISCARD_RANGE; /* fallback */ + } + } + + /* We won't need this flag anymore. */ + /* TODO: We might not need TC_TRANSFER_MAP_NO_INVALIDATE with this. */ + usage &= ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; + + /* GL_AMD_pinned_memory and persistent mappings can't use staging + * buffers. */ + if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT) || + tres->is_user_ptr) + usage &= ~PIPE_TRANSFER_DISCARD_RANGE; + + /* Unsychronized buffer mappings don't have to synchronize the thread.
*/ + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) + usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */ + + /* Never invalidate inside the driver and never infer "unsynchronized". */ + return usage | + TC_TRANSFER_MAP_NO_INVALIDATE | + TC_TRANSFER_MAP_IGNORE_VALID_RANGE; +} + +static void * +tc_transfer_map(struct pipe_context *_pipe, + struct pipe_resource *resource, unsigned level, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **transfer) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_resource *tres = threaded_resource(resource); + struct pipe_context *pipe = tc->pipe; + + if (resource->target == PIPE_BUFFER) { + usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width); + + /* Do a staging transfer within the threaded context. The driver should + * only get resource_copy_region. + */ + if (usage & PIPE_TRANSFER_DISCARD_RANGE) { + struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers); + uint8_t *map; + + /* Out of memory: fail the map like a failed staging allocation. */ + if (!ttrans) + return NULL; + + ttrans->staging = NULL; + + /* Align the staging allocation to the driver's minimum map alignment + * (not a fixed 64), so that the pointer returned below has the same + * misalignment as box->x with respect to map_buffer_alignment. + */ + u_upload_alloc(tc->base.stream_uploader, 0, + box->width + (box->x % tc->map_buffer_alignment), + tc->map_buffer_alignment, &ttrans->offset, + &ttrans->staging, (void**)&map); + if (!map) { + slab_free(&tc->pool_transfers, ttrans); + return NULL; + } + + tc_set_resource_reference(&ttrans->b.resource, resource); + ttrans->b.level = 0; + ttrans->b.usage = usage; + ttrans->b.box = *box; + ttrans->b.stride = 0; + ttrans->b.layer_stride = 0; + *transfer = &ttrans->b; + return map + (box->x % tc->map_buffer_alignment); + } + } + + /* Unsynchronized buffer mappings don't have to synchronize the thread. */ + if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) + tc_sync_msg(tc, resource->target != PIPE_BUFFER ? " texture" : + usage & PIPE_TRANSFER_DISCARD_RANGE ? " discard_range" : + usage & PIPE_TRANSFER_READ ? " read" : " ??"); + + return pipe->transfer_map(pipe, tres->latest ?
tres->latest : resource, + level, usage, box, transfer); +} + +struct tc_transfer_flush_region { + struct pipe_transfer *transfer; + struct pipe_box box; +}; + +static void +tc_call_transfer_flush_region(struct pipe_context *pipe, + union tc_payload *payload) +{ + struct tc_transfer_flush_region *p = + (struct tc_transfer_flush_region *)payload; + + pipe->transfer_flush_region(pipe, p->transfer, &p->box); +} + +struct tc_resource_copy_region { + struct pipe_resource *dst; + unsigned dst_level; + unsigned dstx, dsty, dstz; + struct pipe_resource *src; + unsigned src_level; + struct pipe_box src_box; +}; + +static void +tc_resource_copy_region(struct pipe_context *_pipe, + struct pipe_resource *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box); + +static void +tc_buffer_do_flush_region(struct threaded_context *tc, + struct threaded_transfer *ttrans, + const struct pipe_box *box) +{ + struct threaded_resource *tres = threaded_resource(ttrans->b.resource); + + if (ttrans->staging) { + struct pipe_box src_box; + /* "box" is in buffer coordinates, while the staging data begins at + * the start of the mapped range (ttrans->b.box.x). Compute the source + * offset relative to that start, so that an explicit flush of a + * sub-range copies the right staging bytes. + */ + unsigned map_ofs = ttrans->b.box.x % tc->map_buffer_alignment; + unsigned rel_x = box->x - ttrans->b.box.x; + + u_box_1d(ttrans->offset + map_ofs + rel_x, box->width, &src_box); + + /* Copy the staging buffer into the original one.
*/ + tc_resource_copy_region(&tc->base, ttrans->b.resource, 0, box->x, 0, 0, + ttrans->staging, 0, &src_box); + } + + util_range_add(tres->base_valid_buffer_range, box->x, box->x + box->width); +} + +static void +tc_transfer_flush_region(struct pipe_context *_pipe, + struct pipe_transfer *transfer, + const struct pipe_box *rel_box) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_transfer *ttrans = threaded_transfer(transfer); + struct threaded_resource *tres = threaded_resource(transfer->resource); + unsigned required_usage = PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_FLUSH_EXPLICIT; + + if (tres->b.target == PIPE_BUFFER) { + if ((transfer->usage & required_usage) == required_usage) { + struct pipe_box box; + + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); + tc_buffer_do_flush_region(tc, ttrans, &box); + } + + /* Staging transfers don't send the call to the driver. */ + if (ttrans->staging) + return; + } + + struct tc_transfer_flush_region *p = + tc_add_struct_typed_call(tc, TC_CALL_transfer_flush_region, + tc_transfer_flush_region); + p->transfer = transfer; + p->box = *rel_box; +} + +static void +tc_call_transfer_unmap(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->transfer_unmap(pipe, payload->transfer); +} + +static void +tc_transfer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_transfer *ttrans = threaded_transfer(transfer); + struct threaded_resource *tres = threaded_resource(transfer->resource); + + if (tres->b.target == PIPE_BUFFER) { + if (transfer->usage & PIPE_TRANSFER_WRITE && + !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + tc_buffer_do_flush_region(tc, ttrans, &transfer->box); + + /* Staging transfers don't send the call to the driver. 
*/ + if (ttrans->staging) { + pipe_resource_reference(&ttrans->staging, NULL); + pipe_resource_reference(&ttrans->b.resource, NULL); + slab_free(&tc->pool_transfers, ttrans); + return; + } + } + + tc_add_small_call(tc, TC_CALL_transfer_unmap)->transfer = transfer; +} + +struct tc_buffer_subdata { + struct pipe_resource *resource; + unsigned usage, offset, size; + char slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_buffer_subdata(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_buffer_subdata *p = (struct tc_buffer_subdata *)payload; + + pipe->buffer_subdata(pipe, p->resource, p->usage, p->offset, p->size, + p->slot); + pipe_resource_reference(&p->resource, NULL); +} + +static void +tc_buffer_subdata(struct pipe_context *_pipe, + struct pipe_resource *resource, + unsigned usage, unsigned offset, + unsigned size, const void *data) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_resource *tres = threaded_resource(resource); + + if (!size) + return; + + usage |= PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_DISCARD_RANGE; + + usage = tc_improve_map_buffer_flags(tc, tres, usage, offset, size); + + /* Unsynchronized and big transfers should use transfer_map. Also handle + * full invalidations, because drivers aren't allowed to do them. + */ + if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) || + size > TC_MAX_SUBDATA_BYTES) { + struct pipe_transfer *transfer; + struct pipe_box box; + uint8_t *map = NULL; + + u_box_1d(offset, size, &box); + + map = tc_transfer_map(_pipe, resource, 0, usage, &box, &transfer); + if (map) { + memcpy(map, data, size); + tc_transfer_unmap(_pipe, transfer); + } + return; + } + + util_range_add(&tres->valid_buffer_range, offset, offset + size); + + /* The upload is small. Enqueue it.
*/ + struct tc_buffer_subdata *p = + tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, size); + + tc_set_resource_reference(&p->resource, resource); + p->usage = usage; + p->offset = offset; + p->size = size; + memcpy(p->slot, data, size); +} + +struct tc_texture_subdata { + struct pipe_resource *resource; + unsigned level, usage, stride, layer_stride; + struct pipe_box box; + char slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_texture_subdata(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_texture_subdata *p = (struct tc_texture_subdata *)payload; + + pipe->texture_subdata(pipe, p->resource, p->level, p->usage, &p->box, + p->slot, p->stride, p->layer_stride); + pipe_resource_reference(&p->resource, NULL); +} + +static void +tc_texture_subdata(struct pipe_context *_pipe, + struct pipe_resource *resource, + unsigned level, unsigned usage, + const struct pipe_box *box, + const void *data, unsigned stride, + unsigned layer_stride) +{ + struct threaded_context *tc = threaded_context(_pipe); + unsigned size; + + assert(box->height >= 1); + assert(box->depth >= 1); + + size = (box->depth - 1) * layer_stride + + (box->height - 1) * stride + + box->width * util_format_get_blocksize(resource->format); + if (!size) + return; + + /* Small uploads can be enqueued, big uploads must sync. 
*/ + if (size <= TC_MAX_SUBDATA_BYTES) { + struct tc_texture_subdata *p = + tc_add_slot_based_call(tc, TC_CALL_texture_subdata, tc_texture_subdata, size); + + tc_set_resource_reference(&p->resource, resource); + p->level = level; + p->usage = usage; + p->box = *box; + p->stride = stride; + p->layer_stride = layer_stride; + memcpy(p->slot, data, size); + } else { + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->texture_subdata(pipe, resource, level, usage, box, data, + stride, layer_stride); + } +} + + +/******************************************************************** + * miscellaneous + */ + +#define TC_FUNC_SYNC_RET0(ret_type, func) \ + static ret_type \ + tc_##func(struct pipe_context *_pipe) \ + { \ + struct threaded_context *tc = threaded_context(_pipe); \ + struct pipe_context *pipe = tc->pipe; \ + tc_sync(tc); \ + return pipe->func(pipe); \ + } + +TC_FUNC_SYNC_RET0(enum pipe_reset_status, get_device_reset_status) +TC_FUNC_SYNC_RET0(uint64_t, get_timestamp) + +static void +tc_get_sample_position(struct pipe_context *_pipe, + unsigned sample_count, unsigned sample_index, + float *out_value) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->get_sample_position(pipe, sample_count, sample_index, + out_value); +} + +static void +tc_set_device_reset_callback(struct pipe_context *_pipe, + const struct pipe_device_reset_callback *cb) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->set_device_reset_callback(pipe, cb); +} + +struct tc_string_marker { + int len; + char slot[0]; /* more will be allocated if needed */ +}; + +static void +tc_call_emit_string_marker(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_string_marker *p = (struct tc_string_marker *)payload; + pipe->emit_string_marker(pipe, p->slot, p->len); +} + +static void +tc_emit_string_marker(struct pipe_context *_pipe, 
+ const char *string, int len) +{ + struct threaded_context *tc = threaded_context(_pipe); + + if (len <= TC_MAX_STRING_MARKER_BYTES) { + struct tc_string_marker *p = + tc_add_slot_based_call(tc, TC_CALL_emit_string_marker, tc_string_marker, len); + + memcpy(p->slot, string, len); + p->len = len; + } else { + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->emit_string_marker(pipe, string, len); + } +} + +static void +tc_dump_debug_state(struct pipe_context *_pipe, FILE *stream, + unsigned flags) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->dump_debug_state(pipe, stream, flags); +} + +static void +tc_set_debug_callback(struct pipe_context *_pipe, + const struct pipe_debug_callback *cb) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->set_debug_callback(pipe, cb); +} + +static void +tc_create_fence_fd(struct pipe_context *_pipe, + struct pipe_fence_handle **fence, int fd) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->create_fence_fd(pipe, fence, fd); +} + +static void +tc_fence_server_sync(struct pipe_context *_pipe, + struct pipe_fence_handle *fence) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->fence_server_sync(pipe, fence); +} + +static struct pipe_video_codec * +tc_create_video_codec(struct pipe_context *_pipe, + const struct pipe_video_codec *templ) +{ + unreachable("Threaded context should not be enabled for video APIs"); + return NULL; +} + +static struct pipe_video_buffer * +tc_create_video_buffer(struct pipe_context *_pipe, + const struct pipe_video_buffer *templ) +{ + unreachable("Threaded context should not be enabled for video APIs"); + return NULL; +} + + +/******************************************************************** + * 
draw, launch, clear, blit, copy, flush + */ + +static void +tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, + unsigned flags) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + struct threaded_query *tq, *tmp; + + LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) { + tq->flushed = true; + LIST_DEL(&tq->head_unflushed); + } + + /* TODO: deferred flushes? */ + tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" : + flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal"); + pipe->flush(pipe, fence, flags); +} + +/* This is actually variable-sized, because indirect isn't allocated if it's + * not needed. */ +struct tc_full_draw_info { + struct pipe_draw_info draw; + struct pipe_draw_indirect_info indirect; +}; + +static void +tc_call_draw_vbo(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_full_draw_info *info = (struct tc_full_draw_info*)payload; + + pipe->draw_vbo(pipe, &info->draw); + pipe_so_target_reference(&info->draw.count_from_stream_output, NULL); + if (info->draw.index_size) + pipe_resource_reference(&info->draw.index.resource, NULL); + if (info->draw.indirect) { + pipe_resource_reference(&info->indirect.buffer, NULL); + pipe_resource_reference(&info->indirect.indirect_draw_count, NULL); + } +} + +static struct tc_full_draw_info * +tc_add_draw_vbo(struct pipe_context *_pipe, bool indirect) +{ + return (struct tc_full_draw_info*) + tc_add_sized_call(threaded_context(_pipe), TC_CALL_draw_vbo, + indirect ? 
sizeof(struct tc_full_draw_info) : + sizeof(struct pipe_draw_info)); +} + +static void +tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_draw_indirect_info *indirect = info->indirect; + unsigned index_size = info->index_size; + bool has_user_indices = info->has_user_indices; + + if (index_size && has_user_indices) { + unsigned size = info->count * index_size; + struct pipe_resource *buffer = NULL; + unsigned offset; + + tc_assert(!indirect); + + /* This must be done before adding draw_vbo, because it could generate + * e.g. transfer_unmap and flush partially-uninitialized draw_vbo + * to the driver if it was done afterwards. + */ + u_upload_data(tc->base.stream_uploader, 0, size, 4, info->index.user, + &offset, &buffer); + if (unlikely(!buffer)) + return; + + struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, false); + p->draw.count_from_stream_output = NULL; + pipe_so_target_reference(&p->draw.count_from_stream_output, + info->count_from_stream_output); + memcpy(&p->draw, info, sizeof(*info)); + p->draw.has_user_indices = false; + p->draw.index.resource = buffer; + p->draw.start = offset / index_size; + } else { + /* Non-indexed call or indexed with a real index buffer. 
*/ + struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, indirect != NULL); + p->draw.count_from_stream_output = NULL; + pipe_so_target_reference(&p->draw.count_from_stream_output, + info->count_from_stream_output); + if (index_size) { + tc_set_resource_reference(&p->draw.index.resource, + info->index.resource); + } + memcpy(&p->draw, info, sizeof(*info)); + + if (indirect) { + /* Take the references in the queued copy. After the memcpy above, + * p->draw.indirect still points at the caller's struct, so it must + * not be used as the destination here. + */ + tc_set_resource_reference(&p->indirect.buffer, indirect->buffer); + tc_set_resource_reference(&p->indirect.indirect_draw_count, + indirect->indirect_draw_count); + memcpy(&p->indirect, indirect, sizeof(*indirect)); + p->draw.indirect = &p->indirect; + } + } +} + +static void +tc_call_launch_grid(struct pipe_context *pipe, union tc_payload *payload) +{ + struct pipe_grid_info *p = (struct pipe_grid_info *)payload; + + pipe->launch_grid(pipe, p); + pipe_resource_reference(&p->indirect, NULL); +} + +static void +tc_launch_grid(struct pipe_context *_pipe, + const struct pipe_grid_info *info) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_grid_info *p = tc_add_struct_typed_call(tc, TC_CALL_launch_grid, + pipe_grid_info); + assert(info->input == NULL); + + tc_set_resource_reference(&p->indirect, info->indirect); + memcpy(p, info, sizeof(*info)); +} + +static void +tc_call_resource_copy_region(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_resource_copy_region *p = (struct tc_resource_copy_region *)payload; + + pipe->resource_copy_region(pipe, p->dst, p->dst_level, p->dstx, p->dsty, + p->dstz, p->src, p->src_level, &p->src_box); + pipe_resource_reference(&p->dst, NULL); + pipe_resource_reference(&p->src, NULL); +} + +static void +tc_resource_copy_region(struct pipe_context *_pipe, + struct pipe_resource *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_resource
*tdst = threaded_resource(dst); + struct tc_resource_copy_region *p = + tc_add_struct_typed_call(tc, TC_CALL_resource_copy_region, + tc_resource_copy_region); + + tc_set_resource_reference(&p->dst, dst); + p->dst_level = dst_level; + p->dstx = dstx; + p->dsty = dsty; + p->dstz = dstz; + tc_set_resource_reference(&p->src, src); + p->src_level = src_level; + p->src_box = *src_box; + + if (dst->target == PIPE_BUFFER) + util_range_add(&tdst->valid_buffer_range, dstx, dstx + src_box->width); +} + +static void +tc_call_blit(struct pipe_context *pipe, union tc_payload *payload) +{ + struct pipe_blit_info *blit = (struct pipe_blit_info*)payload; + + pipe->blit(pipe, blit); + pipe_resource_reference(&blit->dst.resource, NULL); + pipe_resource_reference(&blit->src.resource, NULL); +} + +static void +tc_blit(struct pipe_context *_pipe, const struct pipe_blit_info *info) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_blit_info *blit = + tc_add_struct_typed_call(tc, TC_CALL_blit, pipe_blit_info); + + tc_set_resource_reference(&blit->dst.resource, info->dst.resource); + tc_set_resource_reference(&blit->src.resource, info->src.resource); + memcpy(blit, info, sizeof(*info)); +} + +struct tc_generate_mipmap { + struct pipe_resource *res; + enum pipe_format format; + unsigned base_level; + unsigned last_level; + unsigned first_layer; + unsigned last_layer; +}; + +static void +tc_call_generate_mipmap(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_generate_mipmap *p = (struct tc_generate_mipmap *)payload; + bool result = pipe->generate_mipmap(pipe, p->res, p->format, p->base_level, + p->last_level, p->first_layer, + p->last_layer); + assert(result); + pipe_resource_reference(&p->res, NULL); +} + +static boolean +tc_generate_mipmap(struct pipe_context *_pipe, + struct pipe_resource *res, + enum pipe_format format, + unsigned base_level, + unsigned last_level, + unsigned first_layer, + unsigned last_layer) +{ + struct threaded_context *tc 
= threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + struct pipe_screen *screen = pipe->screen; + unsigned bind = PIPE_BIND_SAMPLER_VIEW; + + /* Keep the SAMPLER_VIEW bit set above instead of overwriting it, so the + * format is checked for sampling support in addition to being a valid + * depth-stencil or color render target. + */ + if (util_format_is_depth_or_stencil(format)) + bind |= PIPE_BIND_DEPTH_STENCIL; + else + bind |= PIPE_BIND_RENDER_TARGET; + + if (!screen->is_format_supported(screen, format, res->target, + res->nr_samples, bind)) + return false; + + struct tc_generate_mipmap *p = + tc_add_struct_typed_call(tc, TC_CALL_generate_mipmap, tc_generate_mipmap); + + tc_set_resource_reference(&p->res, res); + p->format = format; + p->base_level = base_level; + p->last_level = last_level; + p->first_layer = first_layer; + p->last_layer = last_layer; + return true; +} + +static void +tc_call_flush_resource(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->flush_resource(pipe, payload->resource); + pipe_resource_reference(&payload->resource, NULL); +} + +static void +tc_flush_resource(struct pipe_context *_pipe, + struct pipe_resource *resource) +{ + struct threaded_context *tc = threaded_context(_pipe); + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_flush_resource); + + tc_set_resource_reference(&payload->resource, resource); +} + +static void +tc_call_invalidate_resource(struct pipe_context *pipe, union tc_payload *payload) +{ + pipe->invalidate_resource(pipe, payload->resource); + pipe_resource_reference(&payload->resource, NULL); +} + +static void +tc_invalidate_resource(struct pipe_context *_pipe, + struct pipe_resource *resource) +{ + struct threaded_context *tc = threaded_context(_pipe); + + if (resource->target == PIPE_BUFFER) { + tc_invalidate_buffer(tc, threaded_resource(resource)); + return; + } + + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_invalidate_resource); + tc_set_resource_reference(&payload->resource, resource); +} + +struct tc_clear { + unsigned buffers; + union pipe_color_union color; + double depth; + unsigned stencil; +}; + +static void +tc_call_clear(struct pipe_context
*pipe, union tc_payload *payload) +{ + struct tc_clear *p = (struct tc_clear *)payload; + pipe->clear(pipe, p->buffers, &p->color, p->depth, p->stencil); +} + +static void +tc_clear(struct pipe_context *_pipe, unsigned buffers, + const union pipe_color_union *color, double depth, + unsigned stencil) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_clear *p = tc_add_struct_typed_call(tc, TC_CALL_clear, tc_clear); + + p->buffers = buffers; + p->color = *color; + p->depth = depth; + p->stencil = stencil; +} + +static void +tc_clear_render_target(struct pipe_context *_pipe, + struct pipe_surface *dst, + const union pipe_color_union *color, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, + bool render_condition_enabled) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->clear_render_target(pipe, dst, color, dstx, dsty, width, height, + render_condition_enabled); +} + +static void +tc_clear_depth_stencil(struct pipe_context *_pipe, + struct pipe_surface *dst, unsigned clear_flags, + double depth, unsigned stencil, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + pipe->clear_depth_stencil(pipe, dst, clear_flags, depth, stencil, + dstx, dsty, width, height, + render_condition_enabled); +} + +struct tc_clear_buffer { + struct pipe_resource *res; + unsigned offset; + unsigned size; + char clear_value[16]; + int clear_value_size; +}; + +static void +tc_call_clear_buffer(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_clear_buffer *p = (struct tc_clear_buffer *)payload; + + pipe->clear_buffer(pipe, p->res, p->offset, p->size, p->clear_value, + p->clear_value_size); + pipe_resource_reference(&p->res, NULL); +} + +static void +tc_clear_buffer(struct pipe_context *_pipe, struct 
pipe_resource *res, + unsigned offset, unsigned size, + const void *clear_value, int clear_value_size) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct threaded_resource *tres = threaded_resource(res); + struct tc_clear_buffer *p = + tc_add_struct_typed_call(tc, TC_CALL_clear_buffer, tc_clear_buffer); + + tc_set_resource_reference(&p->res, res); + p->offset = offset; + p->size = size; + memcpy(p->clear_value, clear_value, clear_value_size); + p->clear_value_size = clear_value_size; + + util_range_add(&tres->valid_buffer_range, offset, offset + size); +} + +struct tc_clear_texture { + struct pipe_resource *res; + unsigned level; + struct pipe_box box; + char data[16]; +}; + +static void +tc_call_clear_texture(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_clear_texture *p = (struct tc_clear_texture *)payload; + + pipe->clear_texture(pipe, p->res, p->level, &p->box, p->data); + pipe_resource_reference(&p->res, NULL); +} + +static void +tc_clear_texture(struct pipe_context *_pipe, struct pipe_resource *res, + unsigned level, const struct pipe_box *box, const void *data) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct tc_clear_texture *p = + tc_add_struct_typed_call(tc, TC_CALL_clear_texture, tc_clear_texture); + + tc_set_resource_reference(&p->res, res); + p->level = level; + p->box = *box; + memcpy(p->data, data, + util_format_get_blocksize(res->format)); +} + +struct tc_resource_commit { + struct pipe_resource *res; + unsigned level; + struct pipe_box box; + bool commit; +}; + +static void +tc_call_resource_commit(struct pipe_context *pipe, union tc_payload *payload) +{ + struct tc_resource_commit *p = (struct tc_resource_commit *)payload; + + pipe->resource_commit(pipe, p->res, p->level, &p->box, p->commit); + pipe_resource_reference(&p->res, NULL); +} + +static bool +tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res, + unsigned level, struct pipe_box *box, bool commit) +{ + 
struct threaded_context *tc = threaded_context(_pipe); + struct tc_resource_commit *p = + tc_add_struct_typed_call(tc, TC_CALL_resource_commit, tc_resource_commit); + + tc_set_resource_reference(&p->res, res); + p->level = level; + p->box = *box; + p->commit = commit; + return true; /* we don't care about the return value for this call */ +} + + +/******************************************************************** + * create & destroy + */ + +static void +tc_destroy(struct pipe_context *_pipe) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + tc_sync(tc); + + if (util_queue_is_initialized(&tc->queue)) { + util_queue_destroy(&tc->queue); + + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) + util_queue_fence_destroy(&tc->batch_slots[i].fence); + } + + if (tc->base.const_uploader && + tc->base.stream_uploader != tc->base.const_uploader) + u_upload_destroy(tc->base.const_uploader); + + if (tc->base.stream_uploader) + u_upload_destroy(tc->base.stream_uploader); + + slab_destroy_child(&tc->pool_transfers); + pipe->destroy(pipe); + FREE(tc); +} + +static const tc_execute execute_func[TC_NUM_CALLS] = { +#define CALL(name) tc_call_##name, +#include "u_threaded_context_calls.h" +#undef CALL +}; + +/** + * Wrap an existing pipe_context into a threaded_context. + * + * \param pipe pipe_context to wrap + * \param parent_transfer_pool parent slab pool set up for creating pipe_- + * transfer objects; the driver should have one + * in pipe_screen. + * \param replace_buffer callback for replacing a pipe_resource's storage + * with another pipe_resource's storage. 
+ * \param out if successful, the threaded_context will be returned here in + * addition to the return value if "out" != NULL + */ +struct pipe_context * +threaded_context_create(struct pipe_context *pipe, + struct slab_parent_pool *parent_transfer_pool, + tc_replace_buffer_storage_func replace_buffer, + struct threaded_context **out) +{ + struct threaded_context *tc; + + STATIC_ASSERT(sizeof(union tc_payload) <= 8); + STATIC_ASSERT(sizeof(struct tc_call) <= 16); + + if (!pipe) + return NULL; + + util_cpu_detect(); + + if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1)) + return pipe; + + tc = CALLOC_STRUCT(threaded_context); + if (!tc) { + pipe->destroy(pipe); + return NULL; + } + + /* The driver context isn't wrapped, so set its "priv" to NULL. */ + pipe->priv = NULL; + + tc->pipe = pipe; + tc->replace_buffer_storage = replace_buffer; + tc->map_buffer_alignment = + pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT); + tc->base.priv = pipe; /* priv points to the wrapped driver context */ + tc->base.screen = pipe->screen; + tc->base.destroy = tc_destroy; + + tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader); + if (pipe->stream_uploader == pipe->const_uploader) + tc->base.const_uploader = tc->base.stream_uploader; + else + tc->base.const_uploader = u_upload_clone(&tc->base, pipe->const_uploader); + + if (!tc->base.stream_uploader || !tc->base.const_uploader) + goto fail; + + /* The queue size is the number of batches "waiting". Batches are removed + * from the queue before being executed, so keep one tc_batch slot for that + * execution. Also, keep one unused slot for an unflushed batch. 
+ */ + if (!util_queue_init(&tc->queue, "gallium_drv", TC_MAX_BATCHES - 2, 1)) + goto fail; + + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) { + tc->batch_slots[i].sentinel = TC_SENTINEL; + tc->batch_slots[i].sentinel2 = TC_SENTINEL; + tc->batch_slots[i].pipe = pipe; + util_queue_fence_init(&tc->batch_slots[i].fence); + } + + LIST_INITHEAD(&tc->unflushed_queries); + + slab_create_child(&tc->pool_transfers, parent_transfer_pool); + +#define CTX_INIT(_member) \ + tc->base._member = tc->pipe->_member ? tc_##_member : NULL + + CTX_INIT(flush); + CTX_INIT(draw_vbo); + CTX_INIT(launch_grid); + CTX_INIT(resource_copy_region); + CTX_INIT(blit); + CTX_INIT(clear); + CTX_INIT(clear_render_target); + CTX_INIT(clear_depth_stencil); + CTX_INIT(clear_buffer); + CTX_INIT(clear_texture); + CTX_INIT(flush_resource); + CTX_INIT(generate_mipmap); + CTX_INIT(render_condition); + CTX_INIT(create_query); + CTX_INIT(create_batch_query); + CTX_INIT(destroy_query); + CTX_INIT(begin_query); + CTX_INIT(end_query); + CTX_INIT(get_query_result); + CTX_INIT(get_query_result_resource); + CTX_INIT(set_active_query_state); + CTX_INIT(create_blend_state); + CTX_INIT(bind_blend_state); + CTX_INIT(delete_blend_state); + CTX_INIT(create_sampler_state); + CTX_INIT(bind_sampler_states); + CTX_INIT(delete_sampler_state); + CTX_INIT(create_rasterizer_state); + CTX_INIT(bind_rasterizer_state); + CTX_INIT(delete_rasterizer_state); + CTX_INIT(create_depth_stencil_alpha_state); + CTX_INIT(bind_depth_stencil_alpha_state); + CTX_INIT(delete_depth_stencil_alpha_state); + CTX_INIT(create_fs_state); + CTX_INIT(bind_fs_state); + CTX_INIT(delete_fs_state); + CTX_INIT(create_vs_state); + CTX_INIT(bind_vs_state); + CTX_INIT(delete_vs_state); + CTX_INIT(create_gs_state); + CTX_INIT(bind_gs_state); + CTX_INIT(delete_gs_state); + CTX_INIT(create_tcs_state); + CTX_INIT(bind_tcs_state); + CTX_INIT(delete_tcs_state); + CTX_INIT(create_tes_state); + CTX_INIT(bind_tes_state); + CTX_INIT(delete_tes_state); + 
CTX_INIT(create_compute_state); + CTX_INIT(bind_compute_state); + CTX_INIT(delete_compute_state); + CTX_INIT(create_vertex_elements_state); + CTX_INIT(bind_vertex_elements_state); + CTX_INIT(delete_vertex_elements_state); + CTX_INIT(set_blend_color); + CTX_INIT(set_stencil_ref); + CTX_INIT(set_sample_mask); + CTX_INIT(set_min_samples); + CTX_INIT(set_clip_state); + CTX_INIT(set_constant_buffer); + CTX_INIT(set_framebuffer_state); + CTX_INIT(set_polygon_stipple); + CTX_INIT(set_scissor_states); + CTX_INIT(set_viewport_states); + CTX_INIT(set_window_rectangles); + CTX_INIT(set_sampler_views); + CTX_INIT(set_tess_state); + CTX_INIT(set_shader_buffers); + CTX_INIT(set_shader_images); + CTX_INIT(set_vertex_buffers); + CTX_INIT(create_stream_output_target); + CTX_INIT(stream_output_target_destroy); + CTX_INIT(set_stream_output_targets); + CTX_INIT(create_sampler_view); + CTX_INIT(sampler_view_destroy); + CTX_INIT(create_surface); + CTX_INIT(surface_destroy); + CTX_INIT(transfer_map); + CTX_INIT(transfer_flush_region); + CTX_INIT(transfer_unmap); + CTX_INIT(buffer_subdata); + CTX_INIT(texture_subdata); + CTX_INIT(texture_barrier); + CTX_INIT(memory_barrier); + CTX_INIT(resource_commit); + CTX_INIT(create_video_codec); + CTX_INIT(create_video_buffer); + CTX_INIT(set_compute_resources); + CTX_INIT(set_global_binding); + CTX_INIT(get_sample_position); + CTX_INIT(invalidate_resource); + CTX_INIT(get_device_reset_status); + CTX_INIT(set_device_reset_callback); + CTX_INIT(dump_debug_state); + CTX_INIT(emit_string_marker); + CTX_INIT(set_debug_callback); + CTX_INIT(create_fence_fd); + CTX_INIT(fence_server_sync); + CTX_INIT(get_timestamp); +#undef CTX_INIT + + if (out) + *out = tc; + + return &tc->base; + +fail: + tc_destroy(&tc->base); + return NULL; +} diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h new file mode 100644 index 00000000000..ea58d4ca0cf --- /dev/null +++ 
b/src/gallium/auxiliary/util/u_threaded_context.h @@ -0,0 +1,349 @@ +/************************************************************************** + * + * Copyright 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* This is a wrapper for pipe_context that executes all pipe_context calls + * in another thread. + * + * + * Guidelines for adopters and deviations from Gallium + * --------------------------------------------------- + * + * 1) pipe_context is wrapped. pipe_screen isn't wrapped. 
All pipe_screen + * driver functions that take a context (fence_finish, texture_get_handle) + * should manually unwrap pipe_context by doing: + * pipe = threaded_context_unwrap_sync(pipe); + * + * pipe_context::priv is used to unwrap the context, so drivers and state + * trackers shouldn't use it. + * + * No other objects are wrapped. + * + * 2) Drivers must subclass and initialize these structures: + * - threaded_resource for pipe_resource (use threaded_resource_init/deinit) + * - threaded_query for pipe_query (zero memory) + * - threaded_transfer for pipe_transfer (zero memory) + * + * 3) The threaded context must not be enabled for contexts that can use video + * codecs. + * + * 4) Changes in driver behavior: + * - begin_query and end_query always return true; return values from + * the driver are ignored. + * - generate_mipmap uses is_format_supported to determine success; + * the return value from the driver is ignored. + * - resource_commit always returns true; failures are ignored. + * - If a non-async debug callback is set, the threaded context keeps using + * asynchronous execution. This is OK for shader-db, but the driver + * shouldn't use the debug callback in any other way. + * + * + * Thread-safety requirements on context functions + * ----------------------------------------------- + * + * These pipe_context functions are executed directly, so they shouldn't use + * pipe_context in an unsafe way. They are de-facto screen functions now: + * - create_query + * - create_batch_query + * - create_*_state (all CSOs and shaders) + * - Make sure the shader compiler doesn't use any per-context stuff. + * (e.g. LLVM target machine) + * - Only pipe_context's debug callback for shader dumps is guaranteed to + * be up to date, because set_debug_callback synchronizes execution. 
+ * - create_surface + * - surface_destroy + * - create_sampler_view + * - sampler_view_destroy + * - stream_output_target_destroy + * - transfer_map (only unsynchronized buffer mappings) + * - get_query_result (when threaded_query::flushed == true) + * + * Create calls causing a sync that can't be async due to driver limitations: + * - create_stream_output_target + * + * + * Transfer_map rules for buffer mappings + * -------------------------------------- + * + * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made + * in the non-driver thread without flushing the queue. The driver will + * receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_TRANSFER_- + * UNSYNCHRONIZED to indicate this. + * Note that transfer_unmap is always enqueued and called from the driver + * thread. + * + * 2) The driver isn't allowed to infer unsynchronized mappings by tracking + * the valid buffer range. The threaded context always sends TC_TRANSFER_- + * MAP_IGNORE_VALID_RANGE to indicate this. Ignoring the flag will lead + * to failures. + * The threaded context does its own detection of unsynchronized mappings. + * + * 3) The driver isn't allowed to do buffer invalidations by itself under any + * circumstances. This is necessary for unsynchronized maps to map the latest + * version of the buffer. (because invalidations can be queued, while + * unsynchronized maps are not queued and they should return the latest + * storage after invalidation). The threaded context always sends + * TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to + * indicate this. Ignoring the flag will lead to failures. + * The threaded context uses its own buffer invalidation mechanism. + * + * + * Additional requirements + * ----------------------- + * + * get_query_result: + * If threaded_query::flushed == true, get_query_result should assume that + * it's called from a non-driver thread, in which case the driver shouldn't + * use the context in an unsafe way. 
+ * + * replace_buffer_storage: + * The driver has to implement this callback, which will be called when + * the threaded context wants to replace a resource's backing storage with + * another resource's backing storage. The threaded context uses it to + * implement buffer invalidation. This call is always queued. + * + * + * Performance gotchas + * ------------------- + * + * Buffer invalidations are done unconditionally - they don't check whether + * the buffer is busy. This can cause drivers to have more live allocations + * and CPU mappings than necessary. + * + * + * How it works (queue architecture) + * --------------------------------- + * + * There is a multithreaded queue consisting of batches, each batch consisting + * of call slots. Each call slot consists of an 8-byte header (call ID + + * call size + constant 32-bit marker for integrity checking) and an 8-byte + * body for per-call data. That is 16 bytes per call slot. + * + * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger + * calls occupy multiple call slots depending on the size needed by call + * parameters. That means that calls can have a variable size in the batch. + * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only + * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots. + * Even though the first call slot can use only 8 bytes for data, additional + * call slots used by the same call can use all 16 bytes for data. + * For example, a call using 2 call slots has 24 bytes of space for data. + * + * Once a batch is full and there is no space for the next call, it's flushed, + * meaning that it's added to the queue for execution in the other thread. + * The batches are ordered in a ring and reused once they are idle again. + * The batching is necessary for low queue/mutex overhead. 
+ * + */ + +#ifndef U_THREADED_CONTEXT_H +#define U_THREADED_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_queue.h" +#include "util/u_range.h" +#include "util/slab.h" + +/* These are transfer flags sent to drivers. */ +/* Never infer whether it's safe to use unsychronized mappings: */ +#define TC_TRANSFER_MAP_IGNORE_VALID_RANGE (1u << 29) +/* Don't invalidate buffers: */ +#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30) +/* transfer_map is called from a non-driver thread: */ +#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31) + +/* Size of the queue = number of batch slots in memory. + * - 1 batch is always idle and records new commands + * - 1 batch is being executed + * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches. + * + * Use a size as small as possible for low CPU L2 cache usage but large enough + * so that the queue isn't stalled too often for not having enough idle batch + * slots. + */ +#define TC_MAX_BATCHES 10 + +/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer) + * can occupy multiple call slots. + * + * The idea is to have batches as small as possible but large enough so that + * the queuing and mutex overhead is negligible. + */ +#define TC_CALLS_PER_BATCH 192 + +/* Threshold for when to use the queue or sync. */ +#define TC_MAX_STRING_MARKER_BYTES 512 + +/* Threshold for when to enqueue buffer/texture_subdata as-is. 
+ * If the upload size is greater than this, it will do instead: + * - for buffers: DISCARD_RANGE is done by the threaded context + * - for textures: sync and call the driver directly + */ +#define TC_MAX_SUBDATA_BYTES 320 + +typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src); + +struct threaded_resource { + struct pipe_resource b; + const struct u_resource_vtbl *vtbl; + + /* Since buffer invalidations are queued, we can't use the base resource + * for unsychronized mappings. This points to the latest version of + * the buffer after the latest invalidation. It's only used for unsychro- + * nized mappings in the non-driver thread. Initially it's set to &b. + */ + struct pipe_resource *latest; + + /* The buffer range which is initialized (with a write transfer, streamout, + * or writable shader resources). The remainder of the buffer is considered + * invalid and can be mapped unsynchronized. + * + * This allows unsychronized mapping of a buffer range which hasn't been + * used yet. It's for applications which forget to use the unsynchronized + * map flag and expect the driver to figure it out. + * + * Drivers should set this to the full range for buffers backed by user + * memory. + */ + struct util_range valid_buffer_range; + + /* If "this" is not the base instance of the buffer, but it's one of its + * reallocations (set in "latest" of the base instance), this points to + * the valid range of the base instance. It's used for transfers after + * a buffer invalidation, because such transfers operate on "latest", not + * the base instance. Initially it's set to &valid_buffer_range. + */ + struct util_range *base_valid_buffer_range; + + /* Drivers are required to update this for shared resources and user + * pointers. */ + bool is_shared; + bool is_user_ptr; +}; + +struct threaded_transfer { + struct pipe_transfer b; + + /* Staging buffer for DISCARD_RANGE transfers. 
*/ + struct pipe_resource *staging; + + /* Offset into the staging buffer, because the backing buffer is + * sub-allocated. */ + unsigned offset; +}; + +struct threaded_query { + /* The query is added to the list in end_query and removed in flush. */ + struct list_head head_unflushed; + + /* Whether pipe->flush has been called after end_query. */ + bool flushed; +}; + +/* This is the second half of tc_call containing call data. + * Most calls will typecast this to the type they need, typically larger + * than 8 bytes. + */ +union tc_payload { + struct pipe_query *query; + struct pipe_resource *resource; + struct pipe_transfer *transfer; + uint64_t __use_8_bytes; +}; + +struct tc_call { + unsigned sentinel; + ushort num_call_slots; + ushort call_id; + union tc_payload payload; +}; + +struct tc_batch { + struct pipe_context *pipe; + unsigned sentinel; + unsigned num_total_call_slots; + struct util_queue_fence fence; + struct tc_call call[TC_CALLS_PER_BATCH]; + unsigned sentinel2; +}; + +struct threaded_context { + struct pipe_context base; + struct pipe_context *pipe; + struct slab_child_pool pool_transfers; + tc_replace_buffer_storage_func replace_buffer_storage; + unsigned map_buffer_alignment; + + struct list_head unflushed_queries; + + /* Counters for the HUD. 
*/ + unsigned num_offloaded_slots; + unsigned num_direct_slots; + unsigned num_syncs; + + struct util_queue queue; + struct util_queue_fence *fence; + + unsigned last, next; + struct tc_batch batch_slots[TC_MAX_BATCHES]; +}; + +void threaded_resource_init(struct pipe_resource *res); +void threaded_resource_deinit(struct pipe_resource *res); +struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe); + +struct pipe_context * +threaded_context_create(struct pipe_context *pipe, + struct slab_parent_pool *parent_transfer_pool, + tc_replace_buffer_storage_func replace_buffer, + struct threaded_context **out); + +static inline struct threaded_context * +threaded_context(struct pipe_context *pipe) +{ + return (struct threaded_context*)pipe; +} + +static inline struct threaded_resource * +threaded_resource(struct pipe_resource *res) +{ + return (struct threaded_resource*)res; +} + +static inline struct threaded_query * +threaded_query(struct pipe_query *q) +{ + return (struct threaded_query*)q; +} + +static inline struct threaded_transfer * +threaded_transfer(struct pipe_transfer *transfer) +{ + return (struct threaded_transfer*)transfer; +} + +#endif diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h new file mode 100644 index 00000000000..7dfccb0f0de --- /dev/null +++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h @@ -0,0 +1,66 @@ +CALL(destroy_query) +CALL(begin_query) +CALL(end_query) +CALL(get_query_result_resource) +CALL(render_condition) +CALL(bind_sampler_states) +CALL(set_framebuffer_state) +CALL(set_tess_state) +CALL(set_constant_buffer) +CALL(set_scissor_states) +CALL(set_viewport_states) +CALL(set_window_rectangles) +CALL(set_sampler_views) +CALL(set_shader_images) +CALL(set_shader_buffers) +CALL(set_vertex_buffers) +CALL(set_stream_output_targets) +CALL(replace_buffer_storage) +CALL(transfer_flush_region) +CALL(transfer_unmap) +CALL(buffer_subdata) 
+CALL(texture_subdata) +CALL(emit_string_marker) +CALL(draw_vbo) +CALL(launch_grid) +CALL(resource_copy_region) +CALL(blit) +CALL(generate_mipmap) +CALL(flush_resource) +CALL(invalidate_resource) +CALL(clear) +CALL(clear_buffer) +CALL(clear_texture) +CALL(resource_commit) +CALL(set_active_query_state) +CALL(set_blend_color) +CALL(set_stencil_ref) +CALL(set_clip_state) +CALL(set_sample_mask) +CALL(set_min_samples) +CALL(set_polygon_stipple) +CALL(texture_barrier) +CALL(memory_barrier) + +CALL(bind_blend_state) +CALL(bind_rasterizer_state) +CALL(bind_depth_stencil_alpha_state) +CALL(bind_compute_state) +CALL(bind_fs_state) +CALL(bind_vs_state) +CALL(bind_gs_state) +CALL(bind_tcs_state) +CALL(bind_tes_state) +CALL(bind_vertex_elements_state) + +CALL(delete_blend_state) +CALL(delete_rasterizer_state) +CALL(delete_depth_stencil_alpha_state) +CALL(delete_compute_state) +CALL(delete_fs_state) +CALL(delete_vs_state) +CALL(delete_gs_state) +CALL(delete_tcs_state) +CALL(delete_tes_state) +CALL(delete_vertex_elements_state) +CALL(delete_sampler_state) |