diff options
Diffstat (limited to 'src/gallium')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 32 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 22 |
2 files changed, 49 insertions, 5 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index cbcd8e79c7b..1f003cd36f2 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -797,11 +797,6 @@ static void si_emit_dispatch_packets(struct si_context *sctx, radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, compute_resource_limits); - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); - unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) | @@ -809,6 +804,33 @@ static void si_emit_dispatch_packets(struct si_context *sctx, * allow launching waves out-of-order. (same as Vulkan) */ S_00B800_ORDER_MODE(sctx->chip_class >= CIK); + uint *last_block = sctx->compute_last_block; + bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + + if (partial_block_en) { + unsigned partial[3]; + + /* If no partial_block, these should be an entire block size, not 0. */ + partial[0] = last_block[0] ? last_block[0] : info->block[0]; + partial[1] = last_block[1] ? last_block[1] : info->block[1]; + partial[2] = last_block[2] ? last_block[2] : info->block[2]; + + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | + S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | + S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | + S_00B824_NUM_THREAD_PARTIAL(partial[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } else { + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + } + if (info->indirect) { uint64_t base_va = r600_resource(info->indirect)->gpu_address; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 89a93182ed3..37eb15f539e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -896,6 +896,28 @@ struct si_context { uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; uint32_t cs_user_data[4]; + /** + * last_block allows disabling threads at the farthermost grid boundary. + * Full blocks as specified by "block" are launched, but the threads + * outside of "last_block" dimensions are disabled. + * + * If a block touches the grid boundary in the i-th axis, threads with + * THREAD_ID[i] >= last_block[i] are disabled. + * + * If last_block[i] is 0, it has the same behavior as last_block[i] = block[i], + * meaning no effect. + * + * It's equivalent to doing this at the beginning of the compute shader: + * + * for (i = 0; i < 3; i++) { + * if (block_id[i] == grid[i] - 1 && + * last_block[i] && last_block[i] >= thread_id[i]) + * return; + * } + * (this could be moved into pipe_grid_info) + */ + uint compute_last_block[3]; + /* Vertex and index buffers. */ bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; |