diff options
author | Marek Olšák <[email protected]> | 2017-08-19 18:56:36 +0200 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2017-08-22 13:29:47 +0200 |
commit | 113278ee79a6366ad88a4f584aa1c0310d71b479 (patch) | |
tree | 4924237f2e22c383472f8cef09138a02183ac50b /src/gallium/drivers/radeonsi | |
parent | 166823bfd26ff7e9b88099598305967336525716 (diff) |
radeonsi: remove Constant Engine support
We have come to the conclusion that it doesn't improve performance.
Tested-by: Dieter Nützel <[email protected]>
Reviewed-by: Samuel Pitoiset <[email protected]>
Reviewed-by: Nicolai Hähnle <[email protected]>
Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 4 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_debug.c | 32 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_descriptors.c | 295 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_hw_context.c | 51 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 43 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 10 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 24 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 39 |
8 files changed, 33 insertions, 465 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index d0e481a3f15..3ebd22c3c16 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -844,12 +844,8 @@ static void si_launch_grid( if (program->ir_type == PIPE_SHADER_IR_TGSI) si_setup_tgsi_grid(sctx, info); - si_ce_pre_draw_synchronization(sctx); - si_emit_dispatch_packets(sctx, info); - si_ce_post_draw_synchronization(sctx); - if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index b6bddc52160..c2242a6deab 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -274,7 +274,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned ce_begin, ce_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -331,7 +330,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_ce_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -341,10 +339,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) NULL, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_ce_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -372,21 +368,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->ce_end != chunk->ce_begin) { - assert(ctx->ce_ib); - - if (scs->flushed) { - ac_parse_ib(f, scs->ce.ib + chunk->ce_begin, - chunk->ce_end - chunk->ce_begin, - last_ce_trace_id, "CE IB", ctx->b.chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->ce_ib, chunk->ce_begin, - chunk->ce_end, last_ce_trace_id, "CE IB", - ctx->b.chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing.\n\n"); si_dump_bo_list(ctx, &scs->gfx, f); @@ -405,14 +386,9 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->b.gfx.cs->prev_dw + ctx->b.gfx.cs->current.cdw; - unsigned ce_cur = 0; - - if (ctx->ce_ib) - ce_cur = ctx->ce_ib->prev_dw + ctx->ce_ib->current.cdw; if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw && - ce_cur == scs->ce_last_dw) + gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -425,10 +401,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->ce_begin = scs->ce_last_dw; - chunk->ce_end = ce_cur; - scs->ce_last_dw = ce_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index ab399a5fb0d..646a9ec2570 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -97,11 +97,6 @@ static uint32_t null_image_descriptor[8] = { * descriptor */ }; -static uint16_t si_ce_ram_size(struct si_context *sctx) -{ - return sctx->b.chip_class >= GFX9 ? 4096 : 32768; -} - static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size, unsigned num_elements, @@ -117,32 +112,15 @@ static void si_init_descriptor_list(uint32_t *desc_list, } } -static void si_init_descriptors(struct si_context *sctx, - struct si_descriptors *desc, +static void si_init_descriptors(struct si_descriptors *desc, unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements, - unsigned first_ce_slot, - unsigned num_ce_slots, - unsigned *ce_offset) + unsigned num_elements) { desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; - desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0; - desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0; - desc->dirty_mask = 0; desc->shader_userdata_offset = shader_userdata_index * 4; - - if (desc->num_ce_slots) { - assert(num_elements <= sizeof(desc->dirty_mask)*8); - - desc->uses_ce = true; - desc->ce_offset = *ce_offset; - desc->dirty_mask = u_bit_consecutive64(0, num_elements); - - *ce_offset += element_dw_size * desc->num_ce_slots * 4; - } } static void si_release_descriptors(struct si_descriptors *desc) @@ -151,80 +129,6 @@ static void si_release_descriptors(struct si_descriptors *desc) FREE(desc->list); } -static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, - unsigned *out_offset, struct r600_resource **out_buf) -{ - uint64_t va; - unsigned cache_line_size = sctx->screen->b.info.tcc_cache_line_size; - - /* The base and size should be aligned to the L2 cache line size - * for optimal performance. (all dumps should rewrite whole lines) - */ - size = align(size, cache_line_size); - - (void)si_ce_ram_size; /* silence an "unused" warning */ - assert(ce_offset + size <= si_ce_ram_size(sctx)); - - u_suballocator_alloc(sctx->ce_suballocator, size, cache_line_size, - out_offset, (struct pipe_resource**)out_buf); - if (!out_buf) - return false; - - va = (*out_buf)->gpu_address + *out_offset; - - radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); - radeon_emit(sctx->ce_ib, ce_offset); - radeon_emit(sctx->ce_ib, size / 4); - radeon_emit(sctx->ce_ib, va); - radeon_emit(sctx->ce_ib, va >> 32); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); - - sctx->ce_need_synchronization = true; - return true; -} - -void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx) -{ - bool success = si_ce_upload(sctx, 0, sctx->total_ce_ram_allocated, - &sctx->ce_ram_saved_offset, - &sctx->ce_ram_saved_buffer); - (void)success; - assert(success); -} - -void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx) -{ - if (!sctx->ce_ram_saved_buffer) - return; - - struct radeon_winsys_cs *ib = sctx->ce_preamble_ib; - if (!ib) - ib = sctx->ce_ib; - - uint64_t va = sctx->ce_ram_saved_buffer->gpu_address + - sctx->ce_ram_saved_offset; - - radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0)); - radeon_emit(ib, va); - radeon_emit(ib, va >> 32); - radeon_emit(ib, sctx->total_ce_ram_allocated / 4); - radeon_emit(ib, 0); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - sctx->ce_ram_saved_buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); -} - -void si_ce_enable_loads(struct radeon_winsys_cs *ib) -{ - radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | - CONTEXT_CONTROL_LOAD_CE_RAM(1)); - radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); -} - static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) @@ -240,56 +144,25 @@ static bool si_upload_descriptors(struct si_context *sctx, if (!upload_size) return true; - if (desc->uses_ce) { - const uint32_t *list = desc->list + - desc->first_ce_slot * desc->element_dw_size; - uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) & - u_bit_consecutive64(0, desc->num_ce_slots); - - - while (mask) { - int begin, count; - u_bit_scan_consecutive_range64(&mask, &begin, &count); - - begin *= desc->element_dw_size; - count *= desc->element_dw_size; - - radeon_emit(sctx->ce_ib, - PKT3(PKT3_WRITE_CONST_RAM, count, 0)); - radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); - radeon_emit_array(sctx->ce_ib, list + begin, count); - } - - if (!si_ce_upload(sctx, - desc->ce_offset + - (first_slot_offset - desc->first_ce_slot * slot_size), - upload_size, (unsigned*)&desc->buffer_offset, - &desc->buffer)) - return false; - } else { - uint32_t *ptr; - - u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, - si_optimal_tcc_alignment(sctx, upload_size), - (unsigned*)&desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, - (void**)&ptr); - if (!desc->buffer) - return false; /* skip the draw call */ + uint32_t *ptr; + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + (unsigned*)&desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, + (void**)&ptr); + if (!desc->buffer) + return false; /* skip the draw call */ - util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, - upload_size); - desc->gpu_list = ptr - first_slot_offset / 4; + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); - } + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); /* The shader pointer should point to slot 0. */ desc->buffer_offset -= first_slot_offset; - desc->dirty_mask = 0; - if (atom) si_mark_atom_dirty(sctx, atom); @@ -598,7 +471,6 @@ static void si_set_sampler_view(struct si_context *sctx, views->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } @@ -750,8 +622,6 @@ si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); images->enabled_mask &= ~(1u << slot); - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } @@ -887,8 +757,6 @@ static void si_set_shader_image(struct si_context *ctx, } images->enabled_mask |= 1u << slot; - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); /* Since this can flush, it must be done after enabled_mask is updated. */ @@ -984,25 +852,20 @@ static void si_bind_sampler_states(struct pipe_context *ctx, continue; memcpy(desc->list + desc_slot * 16 + 12, sstates[i]->val, 4*4); - desc->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } /* BUFFER RESOURCES */ -static void si_init_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, - unsigned first_ce_slot, - unsigned num_ce_slots, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_usage shader_usage_constbuf, enum radeon_bo_priority priority, - enum radeon_bo_priority priority_constbuf, - unsigned *ce_offset) + enum radeon_bo_priority priority_constbuf) { buffers->shader_usage = shader_usage; buffers->shader_usage_constbuf = shader_usage_constbuf; @@ -1010,8 +873,7 @@ static void si_init_buffer_resources(struct si_context *sctx, buffers->priority_constbuf = priority_constbuf; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers, - first_ce_slot, num_ce_slots, ce_offset); + si_init_descriptors(descs, shader_userdata_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -1277,7 +1139,6 @@ static void si_set_constant_buffer(struct si_context *sctx, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << descriptors_idx; } @@ -1339,7 +1200,6 @@ static void si_set_shader_buffers(struct pipe_context *ctx, pipe_resource_reference(&buffers->buffers[slot], NULL); memset(desc, 0, sizeof(uint32_t) * 4); buffers->enabled_mask &= ~(1u << slot); - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << si_const_and_shader_buffer_descriptors_idx(shader); continue; @@ -1366,7 +1226,6 @@ static void si_set_shader_buffers(struct pipe_context *ctx, buf->bind_history |= PIPE_BIND_SHADER_BUFFER; buffers->enabled_mask |= 1u << slot; - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << si_const_and_shader_buffer_descriptors_idx(shader); @@ -1486,7 +1345,6 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint slot, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } @@ -1591,7 +1449,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, NULL); buffers->enabled_mask &= ~(1u << bufidx); } - descs->dirty_mask |= 1u << bufidx; } for (; i < old_num_targets; i++) { bufidx = SI_VS_STREAMOUT_BUF0 + i; @@ -1599,7 +1456,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->enabled_mask &= ~(1u << bufidx); - descs->dirty_mask |= 1u << bufidx; } sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; @@ -1718,7 +1574,6 @@ static void si_reset_buffer_resources(struct si_context *sctx, si_desc_reset_buffer_offset(&sctx->b.b, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << descriptors_idx; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, @@ -1772,7 +1627,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf si_desc_reset_buffer_offset(ctx, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, @@ -1827,7 +1681,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf descs->list + desc_slot * 16 + 4, old_va, buf); - descs->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); @@ -1860,8 +1713,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf si_desc_reset_buffer_offset( ctx, descs->list + desc_slot * 8 + 4, old_va, buf); - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); @@ -2328,8 +2179,7 @@ static void si_init_bindless_descriptors(struct si_context *sctx, unsigned shader_userdata_index, unsigned num_elements) { - si_init_descriptors(sctx, desc, shader_userdata_index, 16, num_elements, - 0, 0, NULL); + si_init_descriptors(desc, shader_userdata_index, 16, num_elements); sctx->bindless_descriptors.num_active_slots = num_elements; /* The first bindless descriptor is stored at slot 1, because 0 is not @@ -2753,52 +2603,9 @@ void si_all_resident_buffers_begin_new_cs(struct si_context *sctx) /* INIT/DEINIT/UPLOAD */ -/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order - * to make CE RAM as useful as possible, this defines limits - * for the number slots that can be in CE RAM on GFX9. If a shader - * is using more, descriptors will be uploaded to memory directly and - * CE won't be used. - * - * These numbers are based on shader-db. - */ -static unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 1, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 24, - [PIPE_SHADER_COMPUTE] = 16, -}; -static unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = { - /* these must be even due to slot alignment */ - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 0, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 2, - [PIPE_SHADER_COMPUTE] = 8, -}; -static unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 9, - [PIPE_SHADER_TESS_CTRL] = 3, - [PIPE_SHADER_TESS_EVAL] = 5, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 8, - [PIPE_SHADER_COMPUTE] = 6, -}; -static unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 0, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 12, - [PIPE_SHADER_COMPUTE] = 13, -}; - void si_init_all_descriptors(struct si_context *sctx) { int i; - unsigned ce_offset = 0; STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0); STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0); @@ -2809,49 +2616,28 @@ void si_init_all_descriptors(struct si_context *sctx) unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; - unsigned first_sampler_ce_slot = 0; - unsigned num_sampler_ce_slots = num_sampler_slots; - - unsigned first_buffer_ce_slot = 0; - unsigned num_buffer_ce_slots = num_buffer_slots; - - /* Adjust CE slot ranges based on GFX9 CE RAM limits. */ if (sctx->b.chip_class >= GFX9) { gfx9_tcs = i == PIPE_SHADER_TESS_CTRL; gfx9_gs = i == PIPE_SHADER_GEOMETRY; - - first_sampler_ce_slot = - si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2; - num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 + - gfx9_max_ce_samplers[i]; - - first_buffer_ce_slot = - si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1); - num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] + - gfx9_max_ce_const_buffers[i]; } - si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i], + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], si_const_and_shader_buffer_descriptors(sctx, i), num_buffer_slots, - first_buffer_ce_slot, num_buffer_ce_slots, gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS : gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS : SI_SGPR_CONST_AND_SHADER_BUFFERS, RADEON_USAGE_READWRITE, RADEON_USAGE_READ, RADEON_PRIO_SHADER_RW_BUFFER, - RADEON_PRIO_CONST_BUFFER, - &ce_offset); + RADEON_PRIO_CONST_BUFFER); struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(sctx, desc, + si_init_descriptors(desc, gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES : gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES : SI_SGPR_SAMPLERS_AND_IMAGES, - 16, num_sampler_slots, - first_sampler_ce_slot, num_sampler_ce_slots, - &ce_offset); + 16, num_sampler_slots); int j; for (j = 0; j < SI_NUM_IMAGES; j++) @@ -2860,19 +2646,17 @@ void si_init_all_descriptors(struct si_context *sctx) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } - si_init_buffer_resources(sctx, &sctx->rw_buffers, + si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS, - SI_SGPR_RW_BUFFERS, + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, /* The second set of usage/priority is used by * const buffers in RW buffer slots. */ RADEON_USAGE_READWRITE, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER, - &ce_offset); + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, - 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL); + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, + 4, SI_NUM_VERTEX_BUFFERS); FREE(sctx->vertex_buffers.list); /* not used */ sctx->vertex_buffers.list = NULL; @@ -2884,9 +2668,6 @@ void si_init_all_descriptors(struct si_context *sctx) 1024); sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->total_ce_ram_allocated = ce_offset; - - assert(ce_offset <= si_ce_ram_size(sctx)); /* Set pipe_context functions. */ sctx->b.b.bind_sampler_states = si_bind_sampler_states; @@ -3026,26 +2807,6 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, first + count > desc->first_active_slot + desc->num_active_slots) sctx->descriptors_dirty |= 1u << desc_idx; - /* Enable or disable CE for this descriptor array. */ - bool used_ce = desc->uses_ce; - desc->uses_ce = desc->first_ce_slot <= first && - desc->first_ce_slot + desc->num_ce_slots >= first + count; - - if (desc->uses_ce != used_ce) { - /* Upload or dump descriptors if we're disabling or enabling CE, - * respectively. */ - sctx->descriptors_dirty |= 1u << desc_idx; - - /* If we're enabling CE, re-upload all descriptors to CE RAM. - * When CE was disabled, uploads to CE RAM stopped. - */ - if (desc->uses_ce) { - desc->dirty_mask |= - u_bit_consecutive64(desc->first_ce_slot, - desc->num_ce_slots); - } - } - desc->first_active_slot = first; desc->num_active_slots = count; } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 7481d013f3c..f3af8dcb446 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -27,40 +27,9 @@ #include "si_pipe.h" #include "radeon/r600_cs.h" -static unsigned si_descriptor_list_cs_space(unsigned count, unsigned element_size) -{ - /* Ensure we have enough space to start a new range in a hole */ - assert(element_size >= 3); - - /* 5 dwords for write to L2 + 3 bytes for the packet header of - * every disjoint range written to CE RAM. - */ - return 5 + (3 * count / 2) + count * element_size; -} - -static unsigned si_ce_needed_cs_space(void) -{ - unsigned space = 0; - - space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS + - SI_NUM_CONST_BUFFERS, 4); - /* two 8-byte images share one 16-byte slot */ - space += si_descriptor_list_cs_space(SI_NUM_IMAGES / 2 + - SI_NUM_SAMPLERS, 16); - space *= SI_NUM_SHADERS; - - space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4); - - /* Increment CE counter packet */ - space += 2; - - return space; -} - void si_destroy_saved_cs(struct si_saved_cs *scs) { radeon_clear_saved_cs(&scs->gfx); - radeon_clear_saved_cs(&scs->ce); r600_resource_reference(&scs->trace_buf, NULL); free(scs); } @@ -69,7 +38,6 @@ void si_destroy_saved_cs(struct si_saved_cs *scs) void si_need_cs_space(struct si_context *ctx) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; - struct radeon_winsys_cs *ce_ib = ctx->ce_ib; /* There is no need to flush the DMA IB here, because * r600_need_dma_space always flushes the GFX IB if there is @@ -95,8 +63,7 @@ void si_need_cs_space(struct si_context *ctx) /* If the CS is sufficiently large, don't count the space needed * and just flush if there is not enough space left. */ - if (!ctx->b.ws->cs_check_space(cs, 2048) || - (ce_ib && !ctx->b.ws->cs_check_space(ce_ib, si_ce_needed_cs_space()))) + if (!ctx->b.ws->cs_check_space(cs, 2048)) ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } @@ -131,10 +98,6 @@ void si_context_gfx_flush(void *context, unsigned flags, ctx->gfx_flush_in_progress = true; - /* This CE dump should be done in parallel with the last draw. */ - if (ctx->ce_ib) - si_ce_save_all_descriptors_at_ib_end(ctx); - r600_preflush_suspend_features(&ctx->b); ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | @@ -153,8 +116,6 @@ void si_context_gfx_flush(void *context, unsigned flags, /* Save the IB for debug contexts. */ radeon_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); - if (ctx->ce_ib) - radeon_save_cs(ws, ctx->ce_ib, &ctx->current_saved_cs->ce, false); ctx->current_saved_cs->flushed = true; } @@ -183,7 +144,7 @@ void si_context_gfx_flush(void *context, unsigned flags, static void si_begin_cs_debug(struct si_context *ctx) { - static const uint32_t zeros[2]; + static const uint32_t zeros[1]; assert(!ctx->current_saved_cs); ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); @@ -233,14 +194,6 @@ void si_begin_new_cs(struct si_context *ctx) if (ctx->init_config_gs_rings) si_pm4_emit(ctx, ctx->init_config_gs_rings); - if (ctx->ce_preamble_ib) - si_ce_enable_loads(ctx->ce_preamble_ib); - else if (ctx->ce_ib) - si_ce_enable_loads(ctx->ce_ib); - - if (ctx->ce_ib) - si_ce_restore_all_descriptors_at_ib_start(ctx); - if (ctx->queued.named.ls) ctx->prefetch_L2_mask |= SI_PREFETCH_LS; if (ctx->queued.named.hs) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1523eaed941..3ceaaac165a 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -55,10 +55,6 @@ static void si_destroy_context(struct pipe_context *context) si_release_all_descriptors(sctx); - if (sctx->ce_suballocator) - u_suballocator_destroy(sctx->ce_suballocator); - - r600_resource_reference(&sctx->ce_ram_saved_buffer, NULL); pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); pipe_resource_reference(&sctx->tf_ring, NULL); @@ -210,45 +206,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, sctx); - - bool enable_ce = sscreen->b.chip_class != SI && /* SI hangs */ - /* These can't use CE due to a power gating bug in the kernel. */ - sscreen->b.family != CHIP_CARRIZO && - sscreen->b.family != CHIP_STONEY; - - /* CE is currently disabled by default, because it makes s_load latency - * worse, because CE IB doesn't run in lockstep with DE. - * Remove this line after that performance issue has been resolved. - */ - enable_ce = false; - - /* Apply CE overrides. */ - if (sscreen->b.debug_flags & DBG_NO_CE) - enable_ce = false; - else if (sscreen->b.debug_flags & DBG_CE) - enable_ce = true; - - if (ws->cs_add_const_ib && enable_ce) { - sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs); - if (!sctx->ce_ib) - goto fail; - - if (ws->cs_add_const_preamble_ib) { - sctx->ce_preamble_ib = - ws->cs_add_const_preamble_ib(sctx->b.gfx.cs); - - if (!sctx->ce_preamble_ib) - goto fail; - } - - sctx->ce_suballocator = - u_suballocator_create(&sctx->b.b, 1024 * 1024, 0, - PIPE_USAGE_DEFAULT, - R600_RESOURCE_FLAG_UNMAPPABLE, false); - if (!sctx->ce_suballocator) - goto fail; - } - sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index fd99c975ad4..69a35ea1945 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -266,12 +266,10 @@ struct si_saved_cs { struct pipe_reference reference; struct si_context *ctx; struct radeon_saved_cs gfx; - struct radeon_saved_cs ce; struct r600_resource *trace_buf; unsigned trace_id; unsigned gfx_last_dw; - unsigned ce_last_dw; bool flushed; }; @@ -288,15 +286,7 @@ struct si_context { struct si_shader_ctx_state fixed_func_tcs_shader; struct r600_resource *wait_mem_scratch; unsigned wait_mem_number; - - struct radeon_winsys_cs *ce_ib; - struct radeon_winsys_cs *ce_preamble_ib; - struct r600_resource *ce_ram_saved_buffer; - struct u_suballocator *ce_suballocator; - unsigned ce_ram_saved_offset; - uint16_t total_ce_ram_allocated; uint16_t prefetch_L2_mask; - bool ce_need_synchronization:1; bool gfx_flush_in_progress:1; bool compute_is_busy:1; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2b3c37fa16d..ca701658d0b 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -224,8 +224,6 @@ struct si_descriptors { uint32_t *list; /* The list in mapped GPU memory. */ uint32_t *gpu_list; - /* Slots that have been changed and need to be uploaded. */ - uint64_t dirty_mask; /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; @@ -236,27 +234,12 @@ struct si_descriptors { /* The maximum number of descriptors. */ uint32_t num_elements; - /* Offset in CE RAM */ - uint16_t ce_offset; - - /* Slots allocated in CE RAM. If we get active slots outside of this - * range, direct uploads to memory will be used instead. This basically - * governs switching between onchip (CE) and offchip (upload) modes. - */ - uint32_t first_ce_slot; - uint32_t num_ce_slots; - /* Slots that are used by currently-bound shaders. - * With CE: It determines which slots are dumped to L2. - * It doesn't skip uploads to CE RAM. - * Without CE: It determines which slots are uploaded. + * It determines which slots are uploaded. */ uint32_t first_active_slot; uint32_t num_active_slots; - /* Whether CE is used to upload this descriptor array. */ - bool uses_ce; - /* The SGPR index where the 64-bit pointer to the descriptor array will * be stored. */ ubyte shader_userdata_offset; @@ -307,9 +290,6 @@ struct si_buffer_resources { } while(0) /* si_descriptors.c */ -void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx); -void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx); -void si_ce_enable_loads(struct radeon_winsys_cs *ib); void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct r600_texture *tex, const struct legacy_surf_level *base_level_info, @@ -401,8 +381,6 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info, /* si_state_draw.c */ void si_init_ia_multi_vgt_param_table(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); -void si_ce_pre_draw_synchronization(struct si_context *sctx); -void si_ce_post_draw_synchronization(struct si_context *sctx); void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo); void si_trace_emit(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cb9bbd20805..f2b889677a5 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1145,27 +1145,6 @@ static void si_get_draw_start_count(struct si_context *sctx, } } -void si_ce_pre_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0)); - radeon_emit(sctx->ce_ib, 1); /* 1 = increment CE counter */ - - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); /* 0 = don't flush sL1 conditionally */ - } -} - -void si_ce_post_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); /* unused */ - - sctx->ce_need_synchronization = false; - } -} - static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, unsigned skip_atom_mask) { @@ -1413,7 +1392,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->dirty_atoms = 0; } - si_ce_pre_draw_synchronization(sctx); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* <-- CUs are busy here. */ @@ -1436,12 +1414,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) return; si_emit_all_states(sctx, info, 0); - si_ce_pre_draw_synchronization(sctx); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); } - si_ce_post_draw_synchronization(sctx); - if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); @@ -1485,20 +1460,6 @@ void si_trace_emit(struct si_context *sctx) radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); - if (sctx->ce_ib) { - struct radeon_winsys_cs *ce = sctx->ce_ib; - - radeon_emit(ce, PKT3(PKT3_WRITE_DATA, 3, 0)); - radeon_emit(ce, S_370_DST_SEL(V_370_MEM_ASYNC) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(V_370_CE)); - radeon_emit(ce, va + 4); - radeon_emit(ce, (va + 4) >> 32); - radeon_emit(ce, trace_id); - radeon_emit(ce, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(ce, AC_ENCODE_TRACE_POINT(trace_id)); - } - if (sctx->b.log) u_log_flush(sctx->b.log); } |