From a639d40f1330351924d736ca260de764734f9ef7 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 25 Oct 2017 07:12:13 +0100 Subject: radv: add support for local bos. (v3) This uses the new kernel interfaces for reduced cs overhead, We only set the local flag for memory allocations that don't have a dedicated allocation and ones that aren't imports. v2: add to all the internal buffer creation paths. v3: missed some command submission paths, handle 0/empty bo lists. Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_cmd_buffer.c | 3 ++- src/amd/vulkan/radv_debug.c | 3 ++- src/amd/vulkan/radv_descriptor_set.c | 2 +- src/amd/vulkan/radv_device.c | 20 ++++++++++------- src/amd/vulkan/radv_query.c | 2 +- src/amd/vulkan/radv_radeon_winsys.h | 1 + src/amd/vulkan/radv_shader.c | 2 +- src/amd/vulkan/si_cmd_buffer.c | 3 ++- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c | 4 ++++ src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h | 1 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 32 ++++++++++++++++++++------- 11 files changed, 51 insertions(+), 22 deletions(-) (limited to 'src/amd') diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 08a05277fa5..1c276168f83 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -313,7 +313,8 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, bo = device->ws->buffer_create(device->ws, new_size, 4096, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!bo) { cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c index b69c05b64f3..cdf8e7a1145 100644 --- a/src/amd/vulkan/radv_debug.c +++ b/src/amd/vulkan/radv_debug.c @@ -61,7 +61,8 @@ radv_init_trace(struct radv_device *device) device->trace_bo = ws->buffer_create(ws, TRACE_BO_SIZE, 8, RADEON_DOMAIN_VRAM, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!device->trace_bo) return false; diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c index c6b736bb689..167944f4e2f 100644 --- a/src/amd/vulkan/radv_descriptor_set.c +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -431,7 +431,7 @@ VkResult radv_CreateDescriptorPool( if (bo_size) { pool->bo = device->ws->buffer_create(device->ws, bo_size, - 32, RADEON_DOMAIN_VRAM, 0); + 32, RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING); pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo); } pool->size = bo_size; diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 19ff8fec647..d25e9c97ba8 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1394,6 +1394,7 @@ radv_get_preamble_cs(struct radv_queue *queue, unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; unsigned hs_offchip_param = 0; + uint32_t ring_bo_flags = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING; if (!queue->has_tess_rings) { if (needs_tess_rings) add_tess_rings = true; @@ -1427,7 +1428,7 @@ radv_get_preamble_cs(struct radv_queue *queue, scratch_size, 4096, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!scratch_bo) goto fail; } else @@ -1438,7 +1439,7 @@ radv_get_preamble_cs(struct radv_queue *queue, compute_scratch_size, 4096, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!compute_scratch_bo) goto fail; @@ -1450,7 +1451,7 @@ radv_get_preamble_cs(struct radv_queue *queue, esgs_ring_size, 4096, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!esgs_ring_bo) goto fail; } else { @@ -1463,7 +1464,7 @@ radv_get_preamble_cs(struct radv_queue *queue, gsvs_ring_size, 4096, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!gsvs_ring_bo) goto fail; } else { @@ -1476,14 +1477,14 @@ radv_get_preamble_cs(struct radv_queue *queue, tess_factor_ring_size, 256, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!tess_factor_ring_bo) goto fail; tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws, tess_offchip_ring_size, 256, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_CPU_ACCESS); + ring_bo_flags); if (!tess_offchip_ring_bo) goto fail; } else { @@ -1510,7 +1511,7 @@ radv_get_preamble_cs(struct radv_queue *queue, size, 4096, RADEON_DOMAIN_VRAM, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS|RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!descriptor_bo) goto fail; } else @@ -2119,6 +2120,9 @@ VkResult radv_alloc_memory(VkDevice _device, if (mem_flags & RADV_MEM_IMPLICIT_SYNC) flags |= RADEON_FLAG_IMPLICIT_SYNC; + if (!dedicate_info && !import_info) + flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; + mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment, domain, flags); @@ -2682,7 +2686,7 @@ VkResult radv_CreateEvent( event->bo = device->ws->buffer_create(device->ws, 8, 8, RADEON_DOMAIN_GTT, - RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!event->bo) { vk_free2(&device->alloc, pAllocator, event); return VK_ERROR_OUT_OF_DEVICE_MEMORY; diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 06045d6b41b..06bf14ab680 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -780,7 +780,7 @@ VkResult radv_CreateQueryPool( size += 4 * pCreateInfo->queryCount; pool->bo = device->ws->buffer_create(device->ws, size, - 64, RADEON_DOMAIN_GTT, 0); + 64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!pool->bo) { vk_free2(&device->alloc, pAllocator, pool); diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index cf5a9e8f069..395c8499b3d 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -54,6 +54,7 @@ enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_VIRTUAL = (1 << 3), RADEON_FLAG_VA_UNCACHED = (1 << 4), RADEON_FLAG_IMPLICIT_SYNC = (1 << 5), + RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 6), }; enum radeon_bo_usage { /* bitfield */ diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 59039170687..7f10798fdf4 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -325,7 +325,7 @@ radv_alloc_shader_memory(struct radv_device *device, slab->size = 256 * 1024; slab->bo = device->ws->buffer_create(device->ws, slab->size, 256, - RADEON_DOMAIN_VRAM, 0); + RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING); slab->ptr = (char*)device->ws->buffer_map(slab->bo); list_inithead(&slab->shaders); diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index 20144d39ea3..89ee399817d 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -571,7 +571,8 @@ cik_create_gfx_config(struct radv_device *device) device->gfx_init = device->ws->buffer_create(device->ws, cs->cdw * 4, 4096, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!device->gfx_init) goto fail; diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c index 15099b318e7..dac549a20ad 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c @@ -332,6 +332,10 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; if (!(flags & RADEON_FLAG_IMPLICIT_SYNC) && ws->info.drm_minor >= 22) request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC; + if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && ws->info.drm_minor >= 20) { + bo->is_local = true; + request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID; + } /* this won't do anything on pre 4.9 kernels */ if (ws->zero_all_vram_allocs && (initial_domain & RADEON_DOMAIN_VRAM)) diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h index f32e4308386..f9aac9451c0 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h @@ -45,6 +45,7 @@ struct radv_amdgpu_winsys_bo { uint64_t size; struct radv_amdgpu_winsys *ws; bool is_virtual; + bool is_local; int ref_count; union { diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index 46e5b767033..939c221e0c8 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -202,7 +202,8 @@ radv_amdgpu_cs_create(struct radeon_winsys *ws, if (cs->ws->use_ib_bos) { cs->ib_buffer = ws->buffer_create(ws, ib_size, 0, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!cs->ib_buffer) { free(cs); return NULL; @@ -287,7 +288,8 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size) cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (!cs->ib_buffer) { cs->base.cdw = 0; @@ -471,6 +473,9 @@ static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs, return; } + if (bo->is_local) + return; + radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority); } @@ -541,6 +546,10 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, } else if (count == 1 && !extra_bo && !extra_cs && !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) { struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0]; + if (cs->num_buffers == 0) { + *bo_list = 0; + return 0; + } r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles, cs->priorities, bo_list); } else { @@ -556,7 +565,10 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, if (extra_cs) { total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers; } - + if (total_buffer_count == 0) { + *bo_list = 0; + return 0; + } amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count); uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count); if (!handles || !priorities) { @@ -721,7 +733,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx, "see dmesg for more information.\n"); } - amdgpu_bo_list_destroy(bo_list); + if (bo_list) + amdgpu_bo_list_destroy(bo_list); if (fence) radv_amdgpu_request_to_fence(ctx, fence, &request); @@ -795,7 +808,8 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx, "see dmesg for more information.\n"); } - amdgpu_bo_list_destroy(bo_list); + if (bo_list) + amdgpu_bo_list_destroy(bo_list); if (r) return r; @@ -856,7 +870,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, } assert(cnt); - bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS); + bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS|RADEON_FLAG_NO_INTERPROCESS_SHARING); ptr = ws->buffer_map(bo); if (preamble_cs) { @@ -905,7 +919,8 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, "see dmesg for more information.\n"); } - amdgpu_bo_list_destroy(bo_list); + if (bo_list) + amdgpu_bo_list_destroy(bo_list); ws->buffer_destroy(bo); if (r) @@ -1038,7 +1053,8 @@ static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_w assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096); ctx->fence_bo = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_CPU_ACCESS| + RADEON_FLAG_NO_INTERPROCESS_SHARING); if (ctx->fence_bo) ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo); if (ctx->fence_map) -- cgit v1.2.3