Diffstat (limited to 'src/amd/vulkan/radv_query.c')
-rw-r--r--   src/amd/vulkan/radv_query.c   415
1 file changed, 415 insertions, 0 deletions
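
The pool memory layout that the new code establishes is: one result slot of pool->stride bytes per query, followed by one 4-byte availability word per query. As a minimal sketch of that arithmetic, assuming an 8-render-backend (DB) part and a 4-query occlusion pool (NUM_DB and QUERY_COUNT are illustrative stand-ins for what get_max_db() and radv_CreateQueryPool() below compute):

#include <assert.h>
#include <stdint.h>

/* Illustrative constants only: 8 DBs, as get_max_db() would return on an
 * 8-RB chip, and a pool of 4 occlusion queries. */
#define NUM_DB      8
#define QUERY_COUNT 4

int main(void)
{
        /* Per-query slot: a 16-byte {begin, end} ZPASS pair per DB, plus
         * 16 bytes of scratch at the end for the accumulated result. */
        uint64_t stride = 16 * NUM_DB + 16;
        /* Availability words start right after the last result slot. */
        uint64_t availability_offset = stride * QUERY_COUNT;
        uint64_t bo_size = availability_offset + 4 * QUERY_COUNT;

        /* Query i: results at i * stride, availability word at
         * availability_offset + 4 * i. */
        assert(bo_size == 144 * 4 + 16); /* 592 bytes for this pool */
        return 0;
}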
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
new file mode 100644
index 00000000000..f60c10a3a8c
--- /dev/null
+++ b/src/amd/vulkan/radv_query.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2016 Red Hat Inc.
+ * Based on anv:
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "radv_private.h"
+#include "radv_cs.h"
+#include "sid.h"
+
+static unsigned get_max_db(struct radv_device *device)
+{
+        unsigned num_db = device->instance->physicalDevice.rad_info.num_render_backends;
+        unsigned rb_mask = device->instance->physicalDevice.rad_info.enabled_rb_mask;
+
+        if (device->instance->physicalDevice.rad_info.chip_class == SI)
+                num_db = 8;
+        else
+                num_db = MAX2(8, num_db);
+
+        /* Otherwise we would need to change the query reset procedure. */
+        assert(rb_mask == ((1ull << num_db) - 1));
+
+        return num_db;
+}
+
+VkResult radv_CreateQueryPool(
+        VkDevice _device,
+        const VkQueryPoolCreateInfo* pCreateInfo,
+        const VkAllocationCallbacks* pAllocator,
+        VkQueryPool* pQueryPool)
+{
+        RADV_FROM_HANDLE(radv_device, device, _device);
+        uint64_t size;
+        struct radv_query_pool *pool = radv_alloc2(&device->alloc, pAllocator,
+                                                   sizeof(*pool), 8,
+                                                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+        if (!pool)
+                return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+        switch(pCreateInfo->queryType) {
+        case VK_QUERY_TYPE_OCCLUSION:
+                /* 16 bytes of temporary buffer at the end, as the compute
+                 * packet writes 64 bits, but the app may only have 32 bits
+                 * of space. */
+                pool->stride = 16 * get_max_db(device) + 16;
+                break;
+        case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+                /* 11 pipeline statistics counters, 16 bytes each. */
+                pool->stride = 16 * 11;
+                break;
+        case VK_QUERY_TYPE_TIMESTAMP:
+                pool->stride = 8;
+                break;
+        default:
+                unreachable("creating unhandled query type");
+        }
+
+        pool->type = pCreateInfo->queryType;
+        pool->availability_offset = pool->stride * pCreateInfo->queryCount;
+        size = pool->availability_offset + 4 * pCreateInfo->queryCount;
+
+        pool->bo = device->ws->buffer_create(device->ws, size,
+                                             64, RADEON_DOMAIN_GTT, 0);
+
+        if (!pool->bo) {
+                radv_free2(&device->alloc, pAllocator, pool);
+                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+        }
+
+        pool->ptr = device->ws->buffer_map(pool->bo);
+
+        if (!pool->ptr) {
+                device->ws->buffer_destroy(pool->bo);
+                radv_free2(&device->alloc, pAllocator, pool);
+                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+        }
+        memset(pool->ptr, 0, size);
+
+        *pQueryPool = radv_query_pool_to_handle(pool);
+        return VK_SUCCESS;
+}
+
+void radv_DestroyQueryPool(
+        VkDevice _device,
+        VkQueryPool _pool,
+        const VkAllocationCallbacks* pAllocator)
+{
+        RADV_FROM_HANDLE(radv_device, device, _device);
+        RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
+
+        if (!pool)
+                return;
+
+        device->ws->buffer_destroy(pool->bo);
+        radv_free2(&device->alloc, pAllocator, pool);
+}
+
+VkResult radv_GetQueryPoolResults(
+        VkDevice _device,
+        VkQueryPool queryPool,
+        uint32_t firstQuery,
+        uint32_t queryCount,
+        size_t dataSize,
+        void* pData,
+        VkDeviceSize stride,
+        VkQueryResultFlags flags)
+{
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        char *data = pData;
+        VkResult result = VK_SUCCESS;
+
+        for(unsigned i = 0; i < queryCount; ++i, data += stride) {
+                char *dest = data;
+                unsigned query = firstQuery + i;
+                char *src = pool->ptr + query * pool->stride;
+                uint32_t available;
+
+                if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+                        while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
+                                ;
+                }
+
+                if (!*(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query) &&
+                    !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
+                        if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+                                *(uint32_t*)dest = 0;
+                        result = VK_NOT_READY;
+                        continue;
+                }
+
+                available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
+                switch (pool->type) {
+                case VK_QUERY_TYPE_TIMESTAMP:
+                        if (flags & VK_QUERY_RESULT_64_BIT) {
+                                *(uint64_t*)dest = *(uint64_t*)src;
+                                dest += 8;
+                        } else {
+                                *(uint32_t*)dest = *(uint32_t*)src;
+                                dest += 4;
+                        }
+                        break;
+                case VK_QUERY_TYPE_OCCLUSION: {
+                        /* The accumulated result lives in the last 16 bytes
+                         * of the slot. */
+                        uint64_t value = *(uint64_t*)(src + pool->stride - 16);
+
+                        if (flags & VK_QUERY_RESULT_64_BIT) {
+                                *(uint64_t*)dest = value;
+                                dest += 8;
+                        } else {
+                                *(uint32_t*)dest = (uint32_t)value;
+                                dest += 4;
+                        }
+                        break;
+                }
+                default:
+                        unreachable("trying to get results of unhandled query type");
+                }
+
+                if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
+                        *(uint32_t*)dest = available;
+                        dest += 4;
+                }
+        }
+
+        return result;
+}
+
+void radv_CmdCopyQueryPoolResults(
+        VkCommandBuffer commandBuffer,
+        VkQueryPool queryPool,
+        uint32_t firstQuery,
+        uint32_t queryCount,
+        VkBuffer dstBuffer,
+        VkDeviceSize dstOffset,
+        VkDeviceSize stride,
+        VkQueryResultFlags flags)
+{
+        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
+        struct radeon_winsys_cs *cs = cmd_buffer->cs;
+        uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+        uint64_t dest_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
+        dest_va += dst_buffer->offset + dstOffset;
+
+        cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
+        cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8);
+
+        for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
+                unsigned query = firstQuery + i;
+                uint64_t local_src_va = va + query * pool->stride;
+                unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
+
+                unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 26);
+
+                if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+                        /* TODO: not sure there is any case where the result
+                         * would not already be available here. */
+                        uint64_t avail_va = va + pool->availability_offset + 4 * query;
+
+                        /* This waits on the ME. All copies below are done on
+                         * the ME as well. */
+                        radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+                        radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
+                        radeon_emit(cs, avail_va);
+                        radeon_emit(cs, avail_va >> 32);
+                        radeon_emit(cs, 1); /* reference value */
+                        radeon_emit(cs, 0xffffffff); /* mask */
+                        radeon_emit(cs, 4); /* poll interval */
+                }
+
+                switch (pool->type) {
+                case VK_QUERY_TYPE_OCCLUSION:
+                        /* Point at the accumulated result in the last 16
+                         * bytes of the slot, then reuse the copy path below. */
+                        local_src_va += pool->stride - 16;
+                        /* fall through */
+                case VK_QUERY_TYPE_TIMESTAMP:
+                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                        radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+                                        COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+                                        ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0));
+                        radeon_emit(cs, local_src_va);
+                        radeon_emit(cs, local_src_va >> 32);
+                        radeon_emit(cs, dest_va);
+                        radeon_emit(cs, dest_va >> 32);
+                        break;
+                default:
+                        unreachable("trying to get results of unhandled query type");
+                }
+
+                /* The availability flag could still change while the data
+                 * copy is busy, which would leave us with valid-looking but
+                 * stale data. However, the availability writes happen on the
+                 * ME too, so they should be ordered with the copies. Might
+                 * need to revisit this with multiple queues. */
+                if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
+                        uint64_t avail_va = va + pool->availability_offset + 4 * query;
+                        uint64_t avail_dest_va = dest_va;
+                        if (pool->type != VK_QUERY_TYPE_PIPELINE_STATISTICS)
+                                avail_dest_va += elem_size;
+                        else
+                                abort();
+
+                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                        radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+                                        COPY_DATA_DST_SEL(COPY_DATA_MEM));
+                        radeon_emit(cs, avail_va);
+                        radeon_emit(cs, avail_va >> 32);
+                        radeon_emit(cs, avail_dest_va);
+                        radeon_emit(cs, avail_dest_va >> 32);
+                }
+
+                assert(cs->cdw <= cdw_max);
+        }
+}
+
+void radv_CmdResetQueryPool(
+        VkCommandBuffer commandBuffer,
+        VkQueryPool queryPool,
+        uint32_t firstQuery,
+        uint32_t queryCount)
+{
+        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+
+        cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
+
+        si_cp_dma_clear_buffer(cmd_buffer, va + firstQuery * pool->stride,
+                               queryCount * pool->stride, 0);
+        si_cp_dma_clear_buffer(cmd_buffer, va + pool->availability_offset + firstQuery * 4,
+                               queryCount * 4, 0);
+}
+
+void radv_CmdBeginQuery(
+        VkCommandBuffer commandBuffer,
+        VkQueryPool queryPool,
+        uint32_t query,
+        VkQueryControlFlags flags)
+{
+        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        struct radeon_winsys_cs *cs = cmd_buffer->cs;
+        uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+        va += pool->stride * query;
+
+        cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
+
+        switch (pool->type) {
+        case VK_QUERY_TYPE_OCCLUSION:
+                radeon_check_space(cmd_buffer->device->ws, cs, 7);
+
+                ++cmd_buffer->state.active_occlusion_queries;
+                if (cmd_buffer->state.active_occlusion_queries == 1)
+                        radv_set_db_count_control(cmd_buffer);
+
+                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+                radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+                radeon_emit(cs, va);
+                radeon_emit(cs, va >> 32);
+                break;
+        default:
+                unreachable("beginning unhandled query type");
+        }
+}
+
+void radv_CmdEndQuery(
+        VkCommandBuffer commandBuffer,
+        VkQueryPool queryPool,
+        uint32_t query)
+{
+        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        struct radeon_winsys_cs *cs = cmd_buffer->cs;
+        uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+        uint64_t avail_va = va + pool->availability_offset + 4 * query;
+        va += pool->stride * query;
+
+        cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
+
+        switch (pool->type) {
+        case VK_QUERY_TYPE_OCCLUSION:
+                radeon_check_space(cmd_buffer->device->ws, cs, 14);
+
+                cmd_buffer->state.active_occlusion_queries--;
+                if (cmd_buffer->state.active_occlusion_queries == 0)
+                        radv_set_db_count_control(cmd_buffer);
+
+                /* The "end" ZPASS counters go 8 bytes into each DB pair. */
+                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+                radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+                radeon_emit(cs, va + 8);
+                radeon_emit(cs, (va + 8) >> 32);
+
+                /* Accumulate the per-DB results into the slot's last 16 bytes. */
+                radeon_emit(cs, PKT3(PKT3_OCCLUSION_QUERY, 3, 0));
+                radeon_emit(cs, va);
+                radeon_emit(cs, va >> 32);
+                radeon_emit(cs, va + pool->stride - 16);
+                radeon_emit(cs, (va + pool->stride - 16) >> 32);
+                break;
+        default:
+                unreachable("ending unhandled query type");
+        }
+
+        radeon_check_space(cmd_buffer->device->ws, cs, 5);
+
+        /* Mark the query as available, ordered behind the ME work above. */
+        radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+        radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
+                        S_370_WR_CONFIRM(1) |
+                        S_370_ENGINE_SEL(V_370_ME));
+        radeon_emit(cs, avail_va);
+        radeon_emit(cs, avail_va >> 32);
+        radeon_emit(cs, 1);
+}
+
+void radv_CmdWriteTimestamp(
+        VkCommandBuffer commandBuffer,
+        VkPipelineStageFlagBits pipelineStage,
+        VkQueryPool queryPool,
+        uint32_t query)
+{
+        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+        RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+        struct radeon_winsys_cs *cs = cmd_buffer->cs;
+        uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+        uint64_t avail_va = va + pool->availability_offset + 4 * query;
+        uint64_t query_va = va + pool->stride * query;
+
+        cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5);
+
+        unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 11);
+
+        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+        radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
+        radeon_emit(cs, query_va);
+        radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF)); /* data_sel 3: write 64-bit GPU timestamp */
+        radeon_emit(cs, 0);
+        radeon_emit(cs, 0);
+
+        radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+        radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
+                        S_370_WR_CONFIRM(1) |
+                        S_370_ENGINE_SEL(V_370_ME));
+        radeon_emit(cs, avail_va);
+        radeon_emit(cs, avail_va >> 32);
+        radeon_emit(cs, 1);
+
+        assert(cmd_buffer->cs->cdw <= cdw_max);
+}
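
For reference, the begin/end pairs that radv_CmdBeginQuery and radv_CmdEndQuery write amount to the following accumulation, here mimicked on the CPU as a hedged sketch (the driver lets the PKT3_OCCLUSION_QUERY packet above do this on the GPU; slot and num_db are hypothetical stand-ins for one query's result area and get_max_db()):

#include <stdint.h>

/* Sketch: each DB writes a {begin, end} pair of 64-bit ZPASS counters into
 * its 16-byte portion of the slot; the sample count is the summed deltas,
 * which the hardware stores in the slot's final 16 bytes. */
static uint64_t accumulate_occlusion(const char *slot, unsigned num_db)
{
        uint64_t count = 0;
        for (unsigned db = 0; db < num_db; ++db) {
                const uint64_t *pair = (const uint64_t *)(slot + 16 * db);
                count += pair[1] - pair[0]; /* end - begin for this DB */
        }
        return count;
}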
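And the application-side sequence these entry points back, as a minimal sketch (device, cmd_buf, and the submit/wait plumbing are assumed to already exist; error handling omitted):

VkQueryPoolCreateInfo info = {
        .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
        .queryType = VK_QUERY_TYPE_OCCLUSION,
        .queryCount = 1,
};
VkQueryPool pool;
vkCreateQueryPool(device, &info, NULL, &pool);   /* radv_CreateQueryPool */

vkCmdResetQueryPool(cmd_buf, pool, 0, 1);        /* radv_CmdResetQueryPool */
vkCmdBeginQuery(cmd_buf, pool, 0, 0);            /* radv_CmdBeginQuery */
/* ... draws whose passing samples should be counted ... */
vkCmdEndQuery(cmd_buf, pool, 0);                 /* radv_CmdEndQuery */

/* After submitting cmd_buf: WAIT_BIT makes the driver spin on the
 * availability word that radv_CmdEndQuery writes. */
uint64_t samples;
vkGetQueryPoolResults(device, pool, 0, 1, sizeof(samples), &samples,
                      sizeof(samples),
                      VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
vkDestroyQueryPool(device, pool, NULL);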