summaryrefslogtreecommitdiffstats
path: root/src/intel/vulkan/anv_device.c
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2017-03-27 16:03:57 -0700
committerJason Ekstrand <[email protected]>2017-04-04 18:33:52 -0700
commitc964f0e485dc8cb901a54bf40f1d69f503ac0a0a (patch)
tree95064276060d6e889ada789ce187c59111cb658e /src/intel/vulkan/anv_device.c
parent82573d0f752f20f76cef773de01efda8a7cc0a31 (diff)
anv: Query the kernel for reset status
When a client causes a GPU hang (or experiences issues due to a hang in another client) we want to let it know as soon as possible. In particular, if it submits work with a fence and calls vkWaitForFences or vkQueueQaitIdle and it returns VK_SUCCESS, then the client should be able to trust the results of that rendering. In order to provide this guarantee, we have to ask the kernel for context status in a few key locations. Reviewed-by: Kenneth Graunke <[email protected]>
Diffstat (limited to 'src/intel/vulkan/anv_device.c')
-rw-r--r--src/intel/vulkan/anv_device.c114
1 files changed, 82 insertions, 32 deletions
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 45a6e333685..9e860d51896 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -888,8 +888,6 @@ anv_device_submit_simple_batch(struct anv_device *device,
struct anv_bo bo, *exec_bos[1];
VkResult result = VK_SUCCESS;
uint32_t size;
- int64_t timeout;
- int ret;
/* Kernel driver requires 8 byte aligned batch length */
size = align_u32(batch->next - batch->start, 8);
@@ -929,14 +927,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
if (result != VK_SUCCESS)
goto fail;
- timeout = INT64_MAX;
- ret = anv_gem_wait(device, bo.gem_handle, &timeout);
- if (ret != 0) {
- /* We don't know the real error. */
- device->lost = true;
- result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
- goto fail;
- }
+ result = anv_device_wait(device, &bo, INT64_MAX);
fail:
anv_bo_pool_free(&device->batch_bo_pool, &bo);
@@ -1268,6 +1259,58 @@ anv_device_execbuf(struct anv_device *device,
return VK_SUCCESS;
}
+VkResult
+anv_device_query_status(struct anv_device *device)
+{
+ /* This isn't likely as most of the callers of this function already check
+ * for it. However, it doesn't hurt to check and it potentially lets us
+ * avoid an ioctl.
+ */
+ if (unlikely(device->lost))
+ return VK_ERROR_DEVICE_LOST;
+
+ uint32_t active, pending;
+ int ret = anv_gem_gpu_get_reset_stats(device, &active, &pending);
+ if (ret == -1) {
+ /* We don't know the real error. */
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST, "get_reset_stats failed: %m");
+ }
+
+ if (active) {
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST,
+ "GPU hung on one of our command buffers");
+ } else if (pending) {
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST,
+ "GPU hung with commands in-flight");
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+ int64_t timeout)
+{
+ int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
+ if (ret == -1 && errno == ETIME) {
+ return VK_TIMEOUT;
+ } else if (ret == -1) {
+ /* We don't know the real error. */
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
+ }
+
+ /* Query for device status after the wait. If the BO we're waiting on got
+ * caught in a GPU hang we don't want to return VK_SUCCESS to the client
+ * because it clearly doesn't have valid data. Yes, this most likely means
+ * an ioctl, but we just did an ioctl to wait so it's no great loss.
+ */
+ return anv_device_query_status(device);
+}
+
VkResult anv_QueueSubmit(
VkQueue _queue,
uint32_t submitCount,
@@ -1277,10 +1320,17 @@ VkResult anv_QueueSubmit(
ANV_FROM_HANDLE(anv_queue, queue, _queue);
ANV_FROM_HANDLE(anv_fence, fence, _fence);
struct anv_device *device = queue->device;
- if (unlikely(device->lost))
- return VK_ERROR_DEVICE_LOST;
- VkResult result = VK_SUCCESS;
+ /* Query for device status prior to submitting. Technically, we don't need
+ * to do this. However, if we have a client that's submitting piles of
+ * garbage, we would rather break as early as possible to keep the GPU
+ * hanging contained. If we don't check here, we'll either be waiting for
+ * the kernel to kick us or we'll have to wait until the client waits on a
+ * fence before we actually know whether or not we've hung.
+ */
+ VkResult result = anv_device_query_status(device);
+ if (result != VK_SUCCESS)
+ return result;
/* We lock around QueueSubmit for three main reasons:
*
@@ -1806,9 +1856,6 @@ VkResult anv_GetFenceStatus(
if (unlikely(device->lost))
return VK_ERROR_DEVICE_LOST;
- int64_t t = 0;
- int ret;
-
switch (fence->state) {
case ANV_FENCE_STATE_RESET:
/* If it hasn't even been sent off to the GPU yet, it's not ready */
@@ -1818,15 +1865,18 @@ VkResult anv_GetFenceStatus(
/* It's been signaled, return success */
return VK_SUCCESS;
- case ANV_FENCE_STATE_SUBMITTED:
- /* It's been submitted to the GPU but we don't know if it's done yet. */
- ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
- if (ret == 0) {
+ case ANV_FENCE_STATE_SUBMITTED: {
+ VkResult result = anv_device_wait(device, &fence->bo, 0);
+ switch (result) {
+ case VK_SUCCESS:
fence->state = ANV_FENCE_STATE_SIGNALED;
return VK_SUCCESS;
- } else {
+ case VK_TIMEOUT:
return VK_NOT_READY;
+ default:
+ return result;
}
+ }
default:
unreachable("Invalid fence status");
}
@@ -1888,20 +1938,20 @@ VkResult anv_WaitForFences(
/* These are the fences we really care about. Go ahead and wait
* on it until we hit a timeout.
*/
- ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout);
- if (ret == -1 && errno == ETIME) {
- result = VK_TIMEOUT;
- goto done;
- } else if (ret == -1) {
- /* We don't know the real error. */
- device->lost = true;
- return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
- } else {
+ result = anv_device_wait(device, &fence->bo, timeout);
+ switch (result) {
+ case VK_SUCCESS:
fence->state = ANV_FENCE_STATE_SIGNALED;
signaled_fences = true;
if (!waitAll)
- return VK_SUCCESS;
- continue;
+ goto done;
+ break;
+
+ case VK_TIMEOUT:
+ goto done;
+
+ default:
+ return result;
}
}
}