diff options
-rw-r--r-- | src/amd/vulkan/radv_cmd_buffer.c | 22 | ||||
-rw-r--r-- | src/amd/vulkan/radv_device.c | 22 | ||||
-rw-r--r-- | src/amd/vulkan/radv_pipeline.c | 75 | ||||
-rw-r--r-- | src/amd/vulkan/radv_private.h | 8 |
4 files changed, 119 insertions, 8 deletions
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c62d275fd95..eebfac5fbf5 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -627,6 +627,13 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, pipeline->graphics.prim_restart_enable); + cmd_buffer->scratch_size_needed = + MAX2(cmd_buffer->scratch_size_needed, + pipeline->max_waves * pipeline->scratch_bytes_per_wave); + + radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE, + S_0286E8_WAVES(pipeline->max_waves) | + S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); cmd_buffer->state.emitted_pipeline = pipeline; } @@ -1402,6 +1409,8 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) free(up); } + cmd_buffer->scratch_size_needed = 0; + cmd_buffer->compute_scratch_size_needed = 0; if (cmd_buffer->upload.upload_bo) cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->upload.upload_bo, 8); @@ -1629,9 +1638,15 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit(cmd_buffer->cs, compute_shader->rsrc1); radeon_emit(cmd_buffer->cs, compute_shader->rsrc2); + + cmd_buffer->compute_scratch_size_needed = + MAX2(cmd_buffer->compute_scratch_size_needed, + pipeline->max_waves * pipeline->scratch_bytes_per_wave); + /* change these once we have scratch support */ radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(32) | S_00B860_WAVESIZE(0)); + S_00B860_WAVES(pipeline->max_waves) | + S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); radeon_emit(cmd_buffer->cs, @@ -1821,6 +1836,11 @@ void radv_CmdExecuteCommands( for (uint32_t i = 0; i < commandBufferCount; i++) { RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); + primary->scratch_size_needed = 
MAX2(primary->scratch_size_needed, + secondary->scratch_size_needed); + primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed, + secondary->compute_scratch_size_needed); + primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); } diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index ad83f9f4eb1..da65511cf15 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -813,6 +813,28 @@ VkResult radv_CreateDevice( } } +#if HAVE_LLVM < 0x0400 + device->llvm_supports_spill = false; +#else + device->llvm_supports_spill = true; +#endif + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum possible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. 
+ */ + uint32_t max_threads_per_block = 2048; + device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, + max_threads_per_block / 64); + result = radv_device_init_meta(device); if (result != VK_SUCCESS) goto fail; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 4d88ed77f93..e332877e2ba 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -104,6 +104,19 @@ void radv_DestroyShaderModule( vk_free2(&device->alloc, pAllocator, module); } + +static void +radv_pipeline_destroy(struct radv_device *device, + struct radv_pipeline *pipeline, + const VkAllocationCallbacks* allocator) +{ + for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) + if (pipeline->shaders[i]) + radv_shader_variant_destroy(device, pipeline->shaders[i]); + + vk_free2(&device->alloc, allocator, pipeline); +} + void radv_DestroyPipeline( VkDevice _device, VkPipeline _pipeline, @@ -115,11 +128,7 @@ void radv_DestroyPipeline( if (!_pipeline) return; - for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) - if (pipeline->shaders[i]) - radv_shader_variant_destroy(device, pipeline->shaders[i]); - - vk_free2(&device->alloc, pAllocator, pipeline); + radv_pipeline_destroy(device, pipeline, pAllocator); } @@ -499,6 +508,48 @@ radv_pipeline_compile(struct radv_pipeline *pipeline, return variant; } +static VkResult +radv_pipeline_scratch_init(struct radv_device *device, + struct radv_pipeline *pipeline) +{ + unsigned scratch_bytes_per_wave = 0; + unsigned max_waves = 0; + unsigned min_waves = 1; + + for (int i = 0; i < MESA_SHADER_STAGES; ++i) { + if (pipeline->shaders[i]) { + unsigned max_stage_waves = device->scratch_waves; + + scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave, + pipeline->shaders[i]->config.scratch_bytes_per_wave); + + max_stage_waves = MIN2(max_stage_waves, + 4 * device->physical_device->rad_info.num_good_compute_units * + (256 / pipeline->shaders[i]->config.num_vgprs)); + max_waves = MAX2(max_waves, 
max_stage_waves); + } + } + + if (pipeline->shaders[MESA_SHADER_COMPUTE]) { + unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] * + pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] * + pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2]; + min_waves = MAX2(min_waves, round_up_u32(group_size, 64)); + } + + if (scratch_bytes_per_wave) + max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave); + + if (scratch_bytes_per_wave && max_waves < min_waves) { + /* Not really true at this moment, but will be true on first + * execution. Avoid having hanging shaders. */ + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave; + pipeline->max_waves = max_waves; + return VK_SUCCESS; +} + static uint32_t si_translate_blend_function(VkBlendOp op) { switch (op) { @@ -1313,6 +1364,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline, const VkAllocationCallbacks *alloc) { struct radv_shader_module fs_m = {0}; + VkResult result; if (alloc == NULL) alloc = &device->alloc; @@ -1421,7 +1473,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline, radv_dump_pipeline_stats(device, pipeline); } - return VK_SUCCESS; + result = radv_pipeline_scratch_init(device, pipeline); + return result; } VkResult @@ -1447,7 +1500,7 @@ radv_graphics_pipeline_create( result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, extra, pAllocator); if (result != VK_SUCCESS) { - vk_free2(&device->alloc, pAllocator, pipeline); + radv_pipeline_destroy(device, pipeline, pAllocator); return result; } @@ -1493,6 +1546,7 @@ static VkResult radv_compute_pipeline_create( RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module); struct radv_pipeline *pipeline; + VkResult result; pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -1510,6 +1564,13 @@ static VkResult 
radv_compute_pipeline_create( pCreateInfo->stage.pSpecializationInfo, pipeline->layout, NULL); + + result = radv_pipeline_scratch_init(device, pipeline); + if (result != VK_SUCCESS) { + radv_pipeline_destroy(device, pipeline, pAllocator); + return result; + } + *pPipeline = radv_pipeline_to_handle(pipeline); if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) { diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 0b8f50a5d6d..88e05595380 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -485,6 +485,8 @@ struct radv_device { uint64_t debug_flags; + bool llvm_supports_spill; + uint32_t scratch_waves; /* MSAA sample locations. * The first index is the sample index. * The second index is the coordinate: X, Y. */ @@ -726,6 +728,9 @@ struct radv_cmd_buffer { struct radv_cmd_buffer_upload upload; bool record_fail; + + uint32_t scratch_size_needed; + uint32_t compute_scratch_size_needed; }; struct radv_image; @@ -923,6 +928,9 @@ struct radv_pipeline { bool prim_restart_enable; } graphics; }; + + unsigned max_waves; + unsigned scratch_bytes_per_wave; }; struct radv_graphics_pipeline_create_info { |