3 files changed, 516 insertions, 484 deletions
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 0d446614c62..e9a39a659ac 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -202,6 +202,7 @@ VULKAN_FILES := \
 	vulkan/anv_pipeline.c \
 	vulkan/anv_pipeline_cache.c \
 	vulkan/anv_private.h \
+	vulkan/anv_queue.c \
 	vulkan/anv_util.c \
 	vulkan/anv_wsi.c \
 	vulkan/vk_format_info.h
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 62974f19d80..cf32df66d2d 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -981,62 +981,6 @@ anv_device_init_border_colors(struct anv_device *device)
                                                     border_colors);
 }
 
-VkResult
-anv_device_submit_simple_batch(struct anv_device *device,
-                               struct anv_batch *batch)
-{
-   struct drm_i915_gem_execbuffer2 execbuf;
-   struct drm_i915_gem_exec_object2 exec2_objects[1];
-   struct anv_bo bo, *exec_bos[1];
-   VkResult result = VK_SUCCESS;
-   uint32_t size;
-
-   /* Kernel driver requires 8 byte aligned batch length */
-   size = align_u32(batch->next - batch->start, 8);
-   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
-   if (result != VK_SUCCESS)
-      return result;
-
-   memcpy(bo.map, batch->start, size);
-   if (!device->info.has_llc)
-      anv_flush_range(bo.map, size);
-
-   exec_bos[0] = &bo;
-   exec2_objects[0].handle = bo.gem_handle;
-   exec2_objects[0].relocation_count = 0;
-   exec2_objects[0].relocs_ptr = 0;
-   exec2_objects[0].alignment = 0;
-   exec2_objects[0].offset = bo.offset;
-   exec2_objects[0].flags = 0;
-   exec2_objects[0].rsvd1 = 0;
-   exec2_objects[0].rsvd2 = 0;
-
-   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
-   execbuf.buffer_count = 1;
-   execbuf.batch_start_offset = 0;
-   execbuf.batch_len = size;
-   execbuf.cliprects_ptr = 0;
-   execbuf.num_cliprects = 0;
-   execbuf.DR1 = 0;
-   execbuf.DR4 = 0;
-
-   execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   execbuf.rsvd1 = device->context_id;
-   execbuf.rsvd2 = 0;
-
-   result = anv_device_execbuf(device, &execbuf, exec_bos);
-   if (result != VK_SUCCESS)
-      goto fail;
-
-   result = anv_device_wait(device, &bo, INT64_MAX);
-
- fail:
-   anv_bo_pool_free(&device->batch_bo_pool, &bo);
-
-   return result;
-}
-
 VkResult anv_CreateDevice(
     VkPhysicalDevice                            physicalDevice,
     const VkDeviceCreateInfo*                   pCreateInfo,
@@ -1350,26 +1294,6 @@ void anv_GetDeviceQueue(
 }
 
 VkResult
-anv_device_execbuf(struct anv_device *device,
-                   struct drm_i915_gem_execbuffer2 *execbuf,
-                   struct anv_bo **execbuf_bos)
-{
-   int ret = anv_gem_execbuffer(device, execbuf);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      device->lost = true;
-      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
-   }
-
-   struct drm_i915_gem_exec_object2 *objects =
-      (void *)(uintptr_t)execbuf->buffers_ptr;
-   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
-      execbuf_bos[k]->offset = objects[k].offset;
-
-   return VK_SUCCESS;
-}
-
-VkResult
 anv_device_query_status(struct anv_device *device)
 {
    /* This isn't likely as most of the callers of this function already check
@@ -1446,119 +1370,6 @@ anv_device_wait(struct anv_device *device, struct anv_bo *bo,
    return anv_device_query_status(device);
 }
 
-VkResult anv_QueueSubmit(
-    VkQueue                                     _queue,
-    uint32_t                                    submitCount,
-    const VkSubmitInfo*                         pSubmits,
-    VkFence                                     _fence)
-{
-   ANV_FROM_HANDLE(anv_queue, queue, _queue);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-   struct anv_device *device = queue->device;
-
-   /* Query for device status prior to submitting.  Technically, we don't need
-    * to do this.  However, if we have a client that's submitting piles of
-    * garbage, we would rather break as early as possible to keep the GPU
-    * hanging contained.  If we don't check here, we'll either be waiting for
-    * the kernel to kick us or we'll have to wait until the client waits on a
-    * fence before we actually know whether or not we've hung.
-    */
-   VkResult result = anv_device_query_status(device);
-   if (result != VK_SUCCESS)
-      return result;
-
-   /* We lock around QueueSubmit for three main reasons:
-    *
-    *  1) When a block pool is resized, we create a new gem handle with a
-    *     different size and, in the case of surface states, possibly a
-    *     different center offset but we re-use the same anv_bo struct when
-    *     we do so.  If this happens in the middle of setting up an execbuf,
-    *     we could end up with our list of BOs out of sync with our list of
-    *     gem handles.
-    *
-    *  2) The algorithm we use for building the list of unique buffers isn't
-    *     thread-safe.  While the client is supposed to syncronize around
-    *     QueueSubmit, this would be extremely difficult to debug if it ever
-    *     came up in the wild due to a broken app.  It's better to play it
-    *     safe and just lock around QueueSubmit.
-    *
-    *  3)  The anv_cmd_buffer_execbuf function may perform relocations in
-    *      userspace.  Due to the fact that the surface state buffer is shared
-    *      between batches, we can't afford to have that happen from multiple
-    *      threads at the same time.  Even though the user is supposed to
-    *      ensure this doesn't happen, we play it safe as in (2) above.
-    *
-    * Since the only other things that ever take the device lock such as block
-    * pool resize only rarely happen, this will almost never be contended so
-    * taking a lock isn't really an expensive operation in this case.
-    */
-   pthread_mutex_lock(&device->mutex);
-
-   for (uint32_t i = 0; i < submitCount; i++) {
-      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
-         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
-                         pSubmits[i].pCommandBuffers[j]);
-         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-         assert(!anv_batch_has_error(&cmd_buffer->batch));
-
-         result = anv_cmd_buffer_execbuf(device, cmd_buffer);
-         if (result != VK_SUCCESS)
-            goto out;
-      }
-   }
-
-   if (fence) {
-      struct anv_bo *fence_bo = &fence->bo;
-      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
-      if (result != VK_SUCCESS)
-         goto out;
-
-      /* Update the fence and wake up any waiters */
-      assert(fence->state == ANV_FENCE_STATE_RESET);
-      fence->state = ANV_FENCE_STATE_SUBMITTED;
-      pthread_cond_broadcast(&device->queue_submit);
-   }
-
-out:
-   if (result != VK_SUCCESS) {
-      /* In the case that something has gone wrong we may end up with an
-       * inconsistent state from which it may not be trivial to recover.
-       * For example, we might have computed address relocations and
-       * any future attempt to re-submit this job will need to know about
-       * this and avoid computing relocation addresses again.
-       *
-       * To avoid this sort of issues, we assume that if something was
-       * wrong during submission we must already be in a really bad situation
-       * anyway (such us being out of memory) and return
-       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
-       * submit the same job again to this device.
-       */
-      result = VK_ERROR_DEVICE_LOST;
-      device->lost = true;
-
-      /* If we return VK_ERROR_DEVICE LOST here, we need to ensure that
-       * vkWaitForFences() and vkGetFenceStatus() return a valid result
-       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
-       * Setting the fence status to SIGNALED ensures this will happen in
-       * any case.
-       */
-      if (fence)
-         fence->state = ANV_FENCE_STATE_SIGNALED;
-   }
-
-   pthread_mutex_unlock(&device->mutex);
-
-   return result;
-}
-
-VkResult anv_QueueWaitIdle(
-    VkQueue                                     _queue)
-{
-   ANV_FROM_HANDLE(anv_queue, queue, _queue);
-
-   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
-}
-
 VkResult anv_DeviceWaitIdle(
     VkDevice                                    _device)
 {
@@ -1953,301 +1764,6 @@ VkResult anv_QueueBindSparse(
    return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
 }
 
-VkResult anv_CreateFence(
-    VkDevice                                    _device,
-    const VkFenceCreateInfo*                    pCreateInfo,
-    const VkAllocationCallbacks*                pAllocator,
-    VkFence*                                    pFence)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   struct anv_bo fence_bo;
-   struct anv_fence *fence;
-   struct anv_batch batch;
-   VkResult result;
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
-
-   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
-   if (result != VK_SUCCESS)
-      return result;
-
-   /* Fences are small.  Just store the CPU data structure in the BO. */
-   fence = fence_bo.map;
-   fence->bo = fence_bo;
-
-   /* Place the batch after the CPU data but on its own cache line. */
-   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
-   batch.next = batch.start = fence->bo.map + batch_offset;
-   batch.end = fence->bo.map + fence->bo.size;
-   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
-   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);
-
-   if (!device->info.has_llc) {
-      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
-      assert(batch.next - batch.start <= CACHELINE_SIZE);
-      __builtin_ia32_mfence();
-      __builtin_ia32_clflush(batch.start);
-   }
-
-   fence->exec2_objects[0].handle = fence->bo.gem_handle;
-   fence->exec2_objects[0].relocation_count = 0;
-   fence->exec2_objects[0].relocs_ptr = 0;
-   fence->exec2_objects[0].alignment = 0;
-   fence->exec2_objects[0].offset = fence->bo.offset;
-   fence->exec2_objects[0].flags = 0;
-   fence->exec2_objects[0].rsvd1 = 0;
-   fence->exec2_objects[0].rsvd2 = 0;
-
-   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
-   fence->execbuf.buffer_count = 1;
-   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
-   fence->execbuf.batch_len = batch.next - batch.start;
-   fence->execbuf.cliprects_ptr = 0;
-   fence->execbuf.num_cliprects = 0;
-   fence->execbuf.DR1 = 0;
-   fence->execbuf.DR4 = 0;
-
-   fence->execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   fence->execbuf.rsvd1 = device->context_id;
-   fence->execbuf.rsvd2 = 0;
-
-   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
-      fence->state = ANV_FENCE_STATE_SIGNALED;
-   } else {
-      fence->state = ANV_FENCE_STATE_RESET;
-   }
-
-   *pFence = anv_fence_to_handle(fence);
-
-   return VK_SUCCESS;
-}
-
-void anv_DestroyFence(
-    VkDevice                                    _device,
-    VkFence                                     _fence,
-    const VkAllocationCallbacks*                pAllocator)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
-   if (!fence)
-      return;
-
-   assert(fence->bo.map == fence);
-   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
-}
-
-VkResult anv_ResetFences(
-    VkDevice                                    _device,
-    uint32_t                                    fenceCount,
-    const VkFence*                              pFences)
-{
-   for (uint32_t i = 0; i < fenceCount; i++) {
-      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-      fence->state = ANV_FENCE_STATE_RESET;
-   }
-
-   return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceStatus(
-    VkDevice                                    _device,
-    VkFence                                     _fence)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   switch (fence->state) {
-   case ANV_FENCE_STATE_RESET:
-      /* If it hasn't even been sent off to the GPU yet, it's not ready */
-      return VK_NOT_READY;
-
-   case ANV_FENCE_STATE_SIGNALED:
-      /* It's been signaled, return success */
-      return VK_SUCCESS;
-
-   case ANV_FENCE_STATE_SUBMITTED: {
-      VkResult result = anv_device_bo_busy(device, &fence->bo);
-      if (result == VK_SUCCESS) {
-         fence->state = ANV_FENCE_STATE_SIGNALED;
-         return VK_SUCCESS;
-      } else {
-         return result;
-      }
-   }
-   default:
-      unreachable("Invalid fence status");
-   }
-}
-
-#define NSEC_PER_SEC 1000000000
-#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
-
-VkResult anv_WaitForFences(
-    VkDevice                                    _device,
-    uint32_t                                    fenceCount,
-    const VkFence*                              pFences,
-    VkBool32                                    waitAll,
-    uint64_t                                    _timeout)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   int ret;
-
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
-    * to block indefinitely timeouts <= 0.  Unfortunately, this was broken
-    * for a couple of kernel releases.  Since there's no way to know
-    * whether or not the kernel we're using is one of the broken ones, the
-    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
-    * maximum timeout from 584 years to 292 years - likely not a big deal.
-    */
-   int64_t timeout = MIN2(_timeout, INT64_MAX);
-
-   VkResult result = VK_SUCCESS;
-   uint32_t pending_fences = fenceCount;
-   while (pending_fences) {
-      pending_fences = 0;
-      bool signaled_fences = false;
-      for (uint32_t i = 0; i < fenceCount; i++) {
-         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-         switch (fence->state) {
-         case ANV_FENCE_STATE_RESET:
-            /* This fence hasn't been submitted yet, we'll catch it the next
-             * time around.  Yes, this may mean we dead-loop but, short of
-             * lots of locking and a condition variable, there's not much that
-             * we can do about that.
-             */
-            pending_fences++;
-            continue;
-
-         case ANV_FENCE_STATE_SIGNALED:
-            /* This fence is not pending.  If waitAll isn't set, we can return
-             * early.  Otherwise, we have to keep going.
-             */
-            if (!waitAll) {
-               result = VK_SUCCESS;
-               goto done;
-            }
-            continue;
-
-         case ANV_FENCE_STATE_SUBMITTED:
-            /* These are the fences we really care about.  Go ahead and wait
-             * on it until we hit a timeout.
-             */
-            result = anv_device_wait(device, &fence->bo, timeout);
-            switch (result) {
-            case VK_SUCCESS:
-               fence->state = ANV_FENCE_STATE_SIGNALED;
-               signaled_fences = true;
-               if (!waitAll)
-                  goto done;
-               break;
-
-            case VK_TIMEOUT:
-               goto done;
-
-            default:
-               return result;
-            }
-         }
-      }
-
-      if (pending_fences && !signaled_fences) {
-         /* If we've hit this then someone decided to vkWaitForFences before
-          * they've actually submitted any of them to a queue.  This is a
-          * fairly pessimal case, so it's ok to lock here and use a standard
-          * pthreads condition variable.
-          */
-         pthread_mutex_lock(&device->mutex);
-
-         /* It's possible that some of the fences have changed state since the
-          * last time we checked.  Now that we have the lock, check for
-          * pending fences again and don't wait if it's changed.
-          */
-         uint32_t now_pending_fences = 0;
-         for (uint32_t i = 0; i < fenceCount; i++) {
-            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-            if (fence->state == ANV_FENCE_STATE_RESET)
-               now_pending_fences++;
-         }
-         assert(now_pending_fences <= pending_fences);
-
-         if (now_pending_fences == pending_fences) {
-            struct timespec before;
-            clock_gettime(CLOCK_MONOTONIC, &before);
-
-            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
-            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
-                               (timeout / NSEC_PER_SEC);
-            abs_nsec %= NSEC_PER_SEC;
-
-            /* Avoid roll-over in tv_sec on 32-bit systems if the user
-             * provided timeout is UINT64_MAX
-             */
-            struct timespec abstime;
-            abstime.tv_nsec = abs_nsec;
-            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
-
-            ret = pthread_cond_timedwait(&device->queue_submit,
-                                         &device->mutex, &abstime);
-            assert(ret != EINVAL);
-
-            struct timespec after;
-            clock_gettime(CLOCK_MONOTONIC, &after);
-            uint64_t time_elapsed =
-               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
-               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);
-
-            if (time_elapsed >= timeout) {
-               pthread_mutex_unlock(&device->mutex);
-               result = VK_TIMEOUT;
-               goto done;
-            }
-
-            timeout -= time_elapsed;
-         }
-
-         pthread_mutex_unlock(&device->mutex);
-      }
-   }
-
-done:
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   return result;
-}
-
-// Queue semaphore functions
-
-VkResult anv_CreateSemaphore(
-    VkDevice                                    device,
-    const VkSemaphoreCreateInfo*                pCreateInfo,
-    const VkAllocationCallbacks*                pAllocator,
-    VkSemaphore*                                pSemaphore)
-{
-   /* The DRM execbuffer ioctl always execute in-oder, even between different
-    * rings. As such, there's nothing to do for the user space semaphore.
-    */
-
-   *pSemaphore = (VkSemaphore)1;
-
-   return VK_SUCCESS;
-}
-
-void anv_DestroySemaphore(
-    VkDevice                                    device,
-    VkSemaphore                                 semaphore,
-    const VkAllocationCallbacks*                pAllocator)
-{
-}
-
 // Event functions
 
 VkResult anv_CreateEvent(
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
new file mode 100644
index 00000000000..5a22ff7fe60
--- /dev/null
+++ b/src/intel/vulkan/anv_queue.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file implements VkQueue, VkFence, and VkSemaphore
+ */
+
+#include "anv_private.h"
+#include "util/vk_util.h"
+
+#include "genxml/gen7_pack.h"
+
+VkResult
+anv_device_execbuf(struct anv_device *device,
+                   struct drm_i915_gem_execbuffer2 *execbuf,
+                   struct anv_bo **execbuf_bos)
+{
+   int ret = anv_gem_execbuffer(device, execbuf);
+   if (ret != 0) {
+      /* We don't know the real error. */
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
+   }
+
+   struct drm_i915_gem_exec_object2 *objects =
+      (void *)(uintptr_t)execbuf->buffers_ptr;
+   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
+      execbuf_bos[k]->offset = objects[k].offset;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+anv_device_submit_simple_batch(struct anv_device *device,
+                               struct anv_batch *batch)
+{
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 exec2_objects[1];
+   struct anv_bo bo, *exec_bos[1];
+   VkResult result = VK_SUCCESS;
+   uint32_t size;
+
+   /* Kernel driver requires 8 byte aligned batch length */
+   size = align_u32(batch->next - batch->start, 8);
+   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
+   if (result != VK_SUCCESS)
+      return result;
+
+   memcpy(bo.map, batch->start, size);
+   if (!device->info.has_llc)
+      anv_flush_range(bo.map, size);
+
+   exec_bos[0] = &bo;
+   exec2_objects[0].handle = bo.gem_handle;
+   exec2_objects[0].relocation_count = 0;
+   exec2_objects[0].relocs_ptr = 0;
+   exec2_objects[0].alignment = 0;
+   exec2_objects[0].offset = bo.offset;
+   exec2_objects[0].flags = 0;
+   exec2_objects[0].rsvd1 = 0;
+   exec2_objects[0].rsvd2 = 0;
+
+   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
+   execbuf.buffer_count = 1;
+   execbuf.batch_start_offset = 0;
+   execbuf.batch_len = size;
+   execbuf.cliprects_ptr = 0;
+   execbuf.num_cliprects = 0;
+   execbuf.DR1 = 0;
+   execbuf.DR4 = 0;
+
+   execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   execbuf.rsvd1 = device->context_id;
+   execbuf.rsvd2 = 0;
+
+   result = anv_device_execbuf(device, &execbuf, exec_bos);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   result = anv_device_wait(device, &bo, INT64_MAX);
+
+ fail:
+   anv_bo_pool_free(&device->batch_bo_pool, &bo);
+
+   return result;
+}
+
+VkResult anv_QueueSubmit(
+    VkQueue                                     _queue,
+    uint32_t                                    submitCount,
+    const VkSubmitInfo*                         pSubmits,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   struct anv_device *device = queue->device;
+
+   /* Query for device status prior to submitting.  Technically, we don't need
+    * to do this.  However, if we have a client that's submitting piles of
+    * garbage, we would rather break as early as possible to keep the GPU
+    * hanging contained.  If we don't check here, we'll either be waiting for
+    * the kernel to kick us or we'll have to wait until the client waits on a
+    * fence before we actually know whether or not we've hung.
+    */
+   VkResult result = anv_device_query_status(device);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* We lock around QueueSubmit for three main reasons:
+    *
+    *  1) When a block pool is resized, we create a new gem handle with a
+    *     different size and, in the case of surface states, possibly a
+    *     different center offset but we re-use the same anv_bo struct when
+    *     we do so.  If this happens in the middle of setting up an execbuf,
+    *     we could end up with our list of BOs out of sync with our list of
+    *     gem handles.
+    *
+    *  2) The algorithm we use for building the list of unique buffers isn't
+    *     thread-safe.  While the client is supposed to syncronize around
+    *     QueueSubmit, this would be extremely difficult to debug if it ever
+    *     came up in the wild due to a broken app.  It's better to play it
+    *     safe and just lock around QueueSubmit.
+    *
+    *  3)  The anv_cmd_buffer_execbuf function may perform relocations in
+    *      userspace.  Due to the fact that the surface state buffer is shared
+    *      between batches, we can't afford to have that happen from multiple
+    *      threads at the same time.  Even though the user is supposed to
+    *      ensure this doesn't happen, we play it safe as in (2) above.
+    *
+    * Since the only other things that ever take the device lock such as block
+    * pool resize only rarely happen, this will almost never be contended so
+    * taking a lock isn't really an expensive operation in this case.
+    */
+   pthread_mutex_lock(&device->mutex);
+
+   for (uint32_t i = 0; i < submitCount; i++) {
+      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
+                         pSubmits[i].pCommandBuffers[j]);
+         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+         assert(!anv_batch_has_error(&cmd_buffer->batch));
+
+         result = anv_cmd_buffer_execbuf(device, cmd_buffer);
+         if (result != VK_SUCCESS)
+            goto out;
+      }
+   }
+
+   if (fence) {
+      struct anv_bo *fence_bo = &fence->bo;
+      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
+      if (result != VK_SUCCESS)
+         goto out;
+
+      /* Update the fence and wake up any waiters */
+      assert(fence->state == ANV_FENCE_STATE_RESET);
+      fence->state = ANV_FENCE_STATE_SUBMITTED;
+      pthread_cond_broadcast(&device->queue_submit);
+   }
+
+out:
+   if (result != VK_SUCCESS) {
+      /* In the case that something has gone wrong we may end up with an
+       * inconsistent state from which it may not be trivial to recover.
+       * For example, we might have computed address relocations and
+       * any future attempt to re-submit this job will need to know about
+       * this and avoid computing relocation addresses again.
+       *
+       * To avoid this sort of issues, we assume that if something was
+       * wrong during submission we must already be in a really bad situation
+       * anyway (such us being out of memory) and return
+       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
+       * submit the same job again to this device.
+       */
+      result = VK_ERROR_DEVICE_LOST;
+      device->lost = true;
+
+      /* If we return VK_ERROR_DEVICE LOST here, we need to ensure that
+       * vkWaitForFences() and vkGetFenceStatus() return a valid result
+       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
+       * Setting the fence status to SIGNALED ensures this will happen in
+       * any case.
+       */
+      if (fence)
+         fence->state = ANV_FENCE_STATE_SIGNALED;
+   }
+
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
+}
+
+VkResult anv_QueueWaitIdle(
+    VkQueue                                     _queue)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+
+   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
+}
+
+VkResult anv_CreateFence(
+    VkDevice                                    _device,
+    const VkFenceCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFence*                                    pFence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_bo fence_bo;
+   struct anv_fence *fence;
+   struct anv_batch batch;
+   VkResult result;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
+
+   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* Fences are small.  Just store the CPU data structure in the BO. */
+   fence = fence_bo.map;
+   fence->bo = fence_bo;
+
+   /* Place the batch after the CPU data but on its own cache line. */
+   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
+   batch.next = batch.start = fence->bo.map + batch_offset;
+   batch.end = fence->bo.map + fence->bo.size;
+   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
+   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);
+
+   if (!device->info.has_llc) {
+      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
+      assert(batch.next - batch.start <= CACHELINE_SIZE);
+      __builtin_ia32_mfence();
+      __builtin_ia32_clflush(batch.start);
+   }
+
+   fence->exec2_objects[0].handle = fence->bo.gem_handle;
+   fence->exec2_objects[0].relocation_count = 0;
+   fence->exec2_objects[0].relocs_ptr = 0;
+   fence->exec2_objects[0].alignment = 0;
+   fence->exec2_objects[0].offset = fence->bo.offset;
+   fence->exec2_objects[0].flags = 0;
+   fence->exec2_objects[0].rsvd1 = 0;
+   fence->exec2_objects[0].rsvd2 = 0;
+
+   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
+   fence->execbuf.buffer_count = 1;
+   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
+   fence->execbuf.batch_len = batch.next - batch.start;
+   fence->execbuf.cliprects_ptr = 0;
+   fence->execbuf.num_cliprects = 0;
+   fence->execbuf.DR1 = 0;
+   fence->execbuf.DR4 = 0;
+
+   fence->execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   fence->execbuf.rsvd1 = device->context_id;
+   fence->execbuf.rsvd2 = 0;
+
+   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
+      fence->state = ANV_FENCE_STATE_SIGNALED;
+   } else {
+      fence->state = ANV_FENCE_STATE_RESET;
+   }
+
+   *pFence = anv_fence_to_handle(fence);
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroyFence(
+    VkDevice                                    _device,
+    VkFence                                     _fence,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+
+   if (!fence)
+      return;
+
+   assert(fence->bo.map == fence);
+   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
+}
+
+VkResult anv_ResetFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences)
+{
+   for (uint32_t i = 0; i < fenceCount; i++) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+      fence->state = ANV_FENCE_STATE_RESET;
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_GetFenceStatus(
+    VkDevice                                    _device,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   switch (fence->state) {
+   case ANV_FENCE_STATE_RESET:
+      /* If it hasn't even been sent off to the GPU yet, it's not ready */
+      return VK_NOT_READY;
+
+   case ANV_FENCE_STATE_SIGNALED:
+      /* It's been signaled, return success */
+      return VK_SUCCESS;
+
+   case ANV_FENCE_STATE_SUBMITTED: {
+      VkResult result = anv_device_bo_busy(device, &fence->bo);
+      if (result == VK_SUCCESS) {
+         fence->state = ANV_FENCE_STATE_SIGNALED;
+         return VK_SUCCESS;
+      } else {
+         return result;
+      }
+   }
+   default:
+      unreachable("Invalid fence status");
+   }
+}
+
+#define NSEC_PER_SEC 1000000000
+#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
+
+VkResult anv_WaitForFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences,
+    VkBool32                                    waitAll,
+    uint64_t                                    _timeout)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   int ret;
+
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
+    * to block indefinitely timeouts <= 0.  Unfortunately, this was broken
+    * for a couple of kernel releases.  Since there's no way to know
+    * whether or not the kernel we're using is one of the broken ones, the
+    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
+    * maximum timeout from 584 years to 292 years - likely not a big deal.
+    */
+   int64_t timeout = MIN2(_timeout, INT64_MAX);
+
+   VkResult result = VK_SUCCESS;
+   uint32_t pending_fences = fenceCount;
+   while (pending_fences) {
+      pending_fences = 0;
+      bool signaled_fences = false;
+      for (uint32_t i = 0; i < fenceCount; i++) {
+         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+         switch (fence->state) {
+         case ANV_FENCE_STATE_RESET:
+            /* This fence hasn't been submitted yet, we'll catch it the next
+             * time around.  Yes, this may mean we dead-loop but, short of
+             * lots of locking and a condition variable, there's not much that
+             * we can do about that.
+             */
+            pending_fences++;
+            continue;
+
+         case ANV_FENCE_STATE_SIGNALED:
+            /* This fence is not pending.  If waitAll isn't set, we can return
+             * early.  Otherwise, we have to keep going.
+             */
+            if (!waitAll) {
+               result = VK_SUCCESS;
+               goto done;
+            }
+            continue;
+
+         case ANV_FENCE_STATE_SUBMITTED:
+            /* These are the fences we really care about.  Go ahead and wait
+             * on it until we hit a timeout.
+             */
+            result = anv_device_wait(device, &fence->bo, timeout);
+            switch (result) {
+            case VK_SUCCESS:
+               fence->state = ANV_FENCE_STATE_SIGNALED;
+               signaled_fences = true;
+               if (!waitAll)
+                  goto done;
+               break;
+
+            case VK_TIMEOUT:
+               goto done;
+
+            default:
+               return result;
+            }
+         }
+      }
+
+      if (pending_fences && !signaled_fences) {
+         /* If we've hit this then someone decided to vkWaitForFences before
+          * they've actually submitted any of them to a queue.  This is a
+          * fairly pessimal case, so it's ok to lock here and use a standard
+          * pthreads condition variable.
+          */
+         pthread_mutex_lock(&device->mutex);
+
+         /* It's possible that some of the fences have changed state since the
+          * last time we checked.  Now that we have the lock, check for
+          * pending fences again and don't wait if it's changed.
+          */
+         uint32_t now_pending_fences = 0;
+         for (uint32_t i = 0; i < fenceCount; i++) {
+            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+            if (fence->state == ANV_FENCE_STATE_RESET)
+               now_pending_fences++;
+         }
+         assert(now_pending_fences <= pending_fences);
+
+         if (now_pending_fences == pending_fences) {
+            struct timespec before;
+            clock_gettime(CLOCK_MONOTONIC, &before);
+
+            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
+            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
+                               (timeout / NSEC_PER_SEC);
+            abs_nsec %= NSEC_PER_SEC;
+
+            /* Avoid roll-over in tv_sec on 32-bit systems if the user
+             * provided timeout is UINT64_MAX
+             */
+            struct timespec abstime;
+            abstime.tv_nsec = abs_nsec;
+            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
+
+            ret = pthread_cond_timedwait(&device->queue_submit,
+                                         &device->mutex, &abstime);
+            assert(ret != EINVAL);
+
+            struct timespec after;
+            clock_gettime(CLOCK_MONOTONIC, &after);
+            uint64_t time_elapsed =
+               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
+               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);
+
+            if (time_elapsed >= timeout) {
+               pthread_mutex_unlock(&device->mutex);
+               result = VK_TIMEOUT;
+               goto done;
+            }
+
+            timeout -= time_elapsed;
+         }
+
+         pthread_mutex_unlock(&device->mutex);
+      }
+   }
+
+done:
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   return result;
+}
+
+// Queue semaphore functions
+
+VkResult anv_CreateSemaphore(
+    VkDevice                                    device,
+    const VkSemaphoreCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSemaphore*                                pSemaphore)
+{
+   /* The DRM execbuffer ioctl always execute in-oder, even between different
+    * rings. As such, there's nothing to do for the user space semaphore.
+    */
+
+   *pSemaphore = (VkSemaphore)1;
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroySemaphore(
+    VkDevice                                    device,
+    VkSemaphore                                 semaphore,
+    const VkAllocationCallbacks*                pAllocator)
+{
+}