/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * This file implements VkQueue, VkFence, and VkSemaphore
 */

#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

#include "anv_private.h"
#include "vk_util.h"

#include "genxml/gen7_pack.h"

VkResult
anv_device_execbuf(struct anv_device *device,
                   struct drm_i915_gem_execbuffer2 *execbuf,
                   struct anv_bo **execbuf_bos)
{
   int ret = anv_gem_execbuffer(device, execbuf);
   if (ret != 0) {
      /* We don't know the real error. */
      device->lost = true;
      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
   }

   struct drm_i915_gem_exec_object2 *objects =
      (void *)(uintptr_t)execbuf->buffers_ptr;
   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
      execbuf_bos[k]->offset = objects[k].offset;

   return VK_SUCCESS;
}

VkResult
anv_device_submit_simple_batch(struct anv_device *device,
                               struct anv_batch *batch)
{
   struct drm_i915_gem_execbuffer2 execbuf;
   struct drm_i915_gem_exec_object2 exec2_objects[1];
   struct anv_bo bo, *exec_bos[1];
   VkResult result = VK_SUCCESS;
   uint32_t size;

   /* Kernel driver requires 8 byte aligned batch length */
   size = align_u32(batch->next - batch->start, 8);
   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
   if (result != VK_SUCCESS)
      return result;

   memcpy(bo.map, batch->start, size);
   if (!device->info.has_llc)
      gen_flush_range(bo.map, size);

   exec_bos[0] = &bo;
   exec2_objects[0].handle = bo.gem_handle;
   exec2_objects[0].relocation_count = 0;
   exec2_objects[0].relocs_ptr = 0;
   exec2_objects[0].alignment = 0;
   exec2_objects[0].offset = bo.offset;
   exec2_objects[0].flags = 0;
   exec2_objects[0].rsvd1 = 0;
   exec2_objects[0].rsvd2 = 0;

   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
   execbuf.buffer_count = 1;
   execbuf.batch_start_offset = 0;
   execbuf.batch_len = size;
   execbuf.cliprects_ptr = 0;
   execbuf.num_cliprects = 0;
   execbuf.DR1 = 0;
   execbuf.DR4 = 0;

   execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   execbuf.rsvd1 = device->context_id;
   execbuf.rsvd2 = 0;

   result = anv_device_execbuf(device, &execbuf, exec_bos);
   if (result != VK_SUCCESS)
      goto fail;

   result = anv_device_wait(device, &bo, INT64_MAX);

 fail:
   anv_bo_pool_free(&device->batch_bo_pool, &bo);

   return result;
}

VkResult anv_QueueSubmit(
    VkQueue                                     _queue,
    uint32_t                                    submitCount,
    const VkSubmitInfo*                         pSubmits,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);
   struct anv_device *device = queue->device;

   /* Query for device status prior to submitting.  Technically, we don't need
    * to do this.  However, if we have a client that's submitting piles of
    * garbage, we would rather break as early as possible to keep the GPU
    * hanging contained.  If we don't check here, we'll either be waiting for
    * the kernel to kick us or we'll have to wait until the client waits on a
    * fence before we actually know whether or not we've hung.
    */
   VkResult result = anv_device_query_status(device);
   if (result != VK_SUCCESS)
      return result;

   /* We lock around QueueSubmit for three main reasons:
    *
    *  1) When a block pool is resized, we create a new gem handle with a
    *     different size and, in the case of surface states, possibly a
    *     different center offset but we re-use the same anv_bo struct when
    *     we do so.  If this happens in the middle of setting up an execbuf,
    *     we could end up with our list of BOs out of sync with our list of
    *     gem handles.
    *
    *  2) The algorithm we use for building the list of unique buffers isn't
    *     thread-safe.  While the client is supposed to syncronize around
    *     QueueSubmit, this would be extremely difficult to debug if it ever
    *     came up in the wild due to a broken app.  It's better to play it
    *     safe and just lock around QueueSubmit.
    *
    *  3)  The anv_cmd_buffer_execbuf function may perform relocations in
    *      userspace.  Due to the fact that the surface state buffer is shared
    *      between batches, we can't afford to have that happen from multiple
    *      threads at the same time.  Even though the user is supposed to
    *      ensure this doesn't happen, we play it safe as in (2) above.
    *
    * Since the only other things that ever take the device lock such as block
    * pool resize only rarely happen, this will almost never be contended so
    * taking a lock isn't really an expensive operation in this case.
    */
   pthread_mutex_lock(&device->mutex);

   for (uint32_t i = 0; i < submitCount; i++) {
      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
                         pSubmits[i].pCommandBuffers[j]);
         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
         assert(!anv_batch_has_error(&cmd_buffer->batch));

         const VkSemaphore *in_semaphores = NULL, *out_semaphores = NULL;
         uint32_t num_in_semaphores = 0, num_out_semaphores = 0;
         if (j == 0) {
            /* Only the first batch gets the in semaphores */
            in_semaphores = pSubmits[i].pWaitSemaphores;
            num_in_semaphores = pSubmits[i].waitSemaphoreCount;
         }

         if (j == pSubmits[i].commandBufferCount - 1) {
            /* Only the last batch gets the out semaphores */
            out_semaphores = pSubmits[i].pSignalSemaphores;
            num_out_semaphores = pSubmits[i].signalSemaphoreCount;
         }

         result = anv_cmd_buffer_execbuf(device, cmd_buffer,
                                         in_semaphores, num_in_semaphores,
                                         out_semaphores, num_out_semaphores);
         if (result != VK_SUCCESS)
            goto out;
      }
   }

   if (fence) {
      struct anv_bo *fence_bo = &fence->bo;
      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
      if (result != VK_SUCCESS)
         goto out;

      /* Update the fence and wake up any waiters */
      assert(fence->state == ANV_FENCE_STATE_RESET);
      fence->state = ANV_FENCE_STATE_SUBMITTED;
      pthread_cond_broadcast(&device->queue_submit);
   }

out:
   if (result != VK_SUCCESS) {
      /* In the case that something has gone wrong we may end up with an
       * inconsistent state from which it may not be trivial to recover.
       * For example, we might have computed address relocations and
       * any future attempt to re-submit this job will need to know about
       * this and avoid computing relocation addresses again.
       *
       * To avoid this sort of issues, we assume that if something was
       * wrong during submission we must already be in a really bad situation
       * anyway (such us being out of memory) and return
       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
       * submit the same job again to this device.
       */
      result = vk_errorf(VK_ERROR_DEVICE_LOST, "vkQueueSubmit() failed");
      device->lost = true;

      /* If we return VK_ERROR_DEVICE LOST here, we need to ensure that
       * vkWaitForFences() and vkGetFenceStatus() return a valid result
       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
       * Setting the fence status to SIGNALED ensures this will happen in
       * any case.
       */
      if (fence)
         fence->state = ANV_FENCE_STATE_SIGNALED;
   }

   pthread_mutex_unlock(&device->mutex);

   return result;
}

VkResult anv_QueueWaitIdle(
    VkQueue                                     _queue)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);

   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
}

VkResult anv_CreateFence(
    VkDevice                                    _device,
    const VkFenceCreateInfo*                    pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkFence*                                    pFence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_bo fence_bo;
   struct anv_fence *fence;
   struct anv_batch batch;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
   if (result != VK_SUCCESS)
      return result;

   /* Fences are small.  Just store the CPU data structure in the BO. */
   fence = fence_bo.map;
   fence->bo = fence_bo;

   /* Place the batch after the CPU data but on its own cache line. */
   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
   batch.next = batch.start = fence->bo.map + batch_offset;
   batch.end = fence->bo.map + fence->bo.size;
   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);

   if (!device->info.has_llc) {
      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
      assert(batch.next - batch.start <= CACHELINE_SIZE);
      __builtin_ia32_mfence();
      __builtin_ia32_clflush(batch.start);
   }

   fence->exec2_objects[0].handle = fence->bo.gem_handle;
   fence->exec2_objects[0].relocation_count = 0;
   fence->exec2_objects[0].relocs_ptr = 0;
   fence->exec2_objects[0].alignment = 0;
   fence->exec2_objects[0].offset = fence->bo.offset;
   fence->exec2_objects[0].flags = 0;
   fence->exec2_objects[0].rsvd1 = 0;
   fence->exec2_objects[0].rsvd2 = 0;

   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
   fence->execbuf.buffer_count = 1;
   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
   fence->execbuf.batch_len = batch.next - batch.start;
   fence->execbuf.cliprects_ptr = 0;
   fence->execbuf.num_cliprects = 0;
   fence->execbuf.DR1 = 0;
   fence->execbuf.DR4 = 0;

   fence->execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   fence->execbuf.rsvd1 = device->context_id;
   fence->execbuf.rsvd2 = 0;

   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
      fence->state = ANV_FENCE_STATE_SIGNALED;
   } else {
      fence->state = ANV_FENCE_STATE_RESET;
   }

   *pFence = anv_fence_to_handle(fence);

   return VK_SUCCESS;
}

void anv_DestroyFence(
    VkDevice                                    _device,
    VkFence                                     _fence,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (!fence)
      return;

   assert(fence->bo.map == fence);
   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
}

VkResult anv_ResetFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences)
{
   for (uint32_t i = 0; i < fenceCount; i++) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
      fence->state = ANV_FENCE_STATE_RESET;
   }

   return VK_SUCCESS;
}

VkResult anv_GetFenceStatus(
    VkDevice                                    _device,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   switch (fence->state) {
   case ANV_FENCE_STATE_RESET:
      /* If it hasn't even been sent off to the GPU yet, it's not ready */
      return VK_NOT_READY;

   case ANV_FENCE_STATE_SIGNALED:
      /* It's been signaled, return success */
      return VK_SUCCESS;

   case ANV_FENCE_STATE_SUBMITTED: {
      VkResult result = anv_device_bo_busy(device, &fence->bo);
      if (result == VK_SUCCESS) {
         fence->state = ANV_FENCE_STATE_SIGNALED;
         return VK_SUCCESS;
      } else {
         return result;
      }
   }
   default:
      unreachable("Invalid fence status");
   }
}

#define NSEC_PER_SEC 1000000000
#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)

VkResult anv_WaitForFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences,
    VkBool32                                    waitAll,
    uint64_t                                    _timeout)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   int ret;

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
    * to block indefinitely timeouts <= 0.  Unfortunately, this was broken
    * for a couple of kernel releases.  Since there's no way to know
    * whether or not the kernel we're using is one of the broken ones, the
    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
    * maximum timeout from 584 years to 292 years - likely not a big deal.
    */
   int64_t timeout = MIN2(_timeout, INT64_MAX);

   VkResult result = VK_SUCCESS;
   uint32_t pending_fences = fenceCount;
   while (pending_fences) {
      pending_fences = 0;
      bool signaled_fences = false;
      for (uint32_t i = 0; i < fenceCount; i++) {
         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
         switch (fence->state) {
         case ANV_FENCE_STATE_RESET:
            /* This fence hasn't been submitted yet, we'll catch it the next
             * time around.  Yes, this may mean we dead-loop but, short of
             * lots of locking and a condition variable, there's not much that
             * we can do about that.
             */
            pending_fences++;
            continue;

         case ANV_FENCE_STATE_SIGNALED:
            /* This fence is not pending.  If waitAll isn't set, we can return
             * early.  Otherwise, we have to keep going.
             */
            if (!waitAll) {
               result = VK_SUCCESS;
               goto done;
            }
            continue;

         case ANV_FENCE_STATE_SUBMITTED:
            /* These are the fences we really care about.  Go ahead and wait
             * on it until we hit a timeout.
             */
            result = anv_device_wait(device, &fence->bo, timeout);
            switch (result) {
            case VK_SUCCESS:
               fence->state = ANV_FENCE_STATE_SIGNALED;
               signaled_fences = true;
               if (!waitAll)
                  goto done;
               break;

            case VK_TIMEOUT:
               goto done;

            default:
               return result;
            }
         }
      }

      if (pending_fences && !signaled_fences) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue.  This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&device->mutex);

         /* It's possible that some of the fences have changed state since the
          * last time we checked.  Now that we have the lock, check for
          * pending fences again and don't wait if it's changed.
          */
         uint32_t now_pending_fences = 0;
         for (uint32_t i = 0; i < fenceCount; i++) {
            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
            if (fence->state == ANV_FENCE_STATE_RESET)
               now_pending_fences++;
         }
         assert(now_pending_fences <= pending_fences);

         if (now_pending_fences == pending_fences) {
            struct timespec before;
            clock_gettime(CLOCK_MONOTONIC, &before);

            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
                               (timeout / NSEC_PER_SEC);
            abs_nsec %= NSEC_PER_SEC;

            /* Avoid roll-over in tv_sec on 32-bit systems if the user
             * provided timeout is UINT64_MAX
             */
            struct timespec abstime;
            abstime.tv_nsec = abs_nsec;
            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));

            ret = pthread_cond_timedwait(&device->queue_submit,
                                         &device->mutex, &abstime);
            assert(ret != EINVAL);

            struct timespec after;
            clock_gettime(CLOCK_MONOTONIC, &after);
            uint64_t time_elapsed =
               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);

            if (time_elapsed >= timeout) {
               pthread_mutex_unlock(&device->mutex);
               result = VK_TIMEOUT;
               goto done;
            }

            timeout -= time_elapsed;
         }

         pthread_mutex_unlock(&device->mutex);
      }
   }

done:
   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   return result;
}

// Queue semaphore functions

VkResult anv_CreateSemaphore(
    VkDevice                                    _device,
    const VkSemaphoreCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSemaphore*                                pSemaphore)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_semaphore *semaphore;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (semaphore == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* The DRM execbuffer ioctl always execute in-oder so long as you stay
    * on the same ring.  Since we don't expose the blit engine as a DMA
    * queue, a dummy no-op semaphore is a perfectly valid implementation.
    */
   semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY;
   semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;

   *pSemaphore = anv_semaphore_to_handle(semaphore);

   return VK_SUCCESS;
}

static void
anv_semaphore_impl_cleanup(struct anv_device *device,
                           struct anv_semaphore_impl *impl)
{
   switch (impl->type) {
   case ANV_SEMAPHORE_TYPE_NONE:
   case ANV_SEMAPHORE_TYPE_DUMMY:
      /* Dummy.  Nothing to do */
      return;

   case ANV_SEMAPHORE_TYPE_BO:
      anv_bo_cache_release(device, &device->bo_cache, impl->bo);
      return;
   }

   unreachable("Invalid semaphore type");
}

void anv_DestroySemaphore(
    VkDevice                                    _device,
    VkSemaphore                                 _semaphore,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);

   if (semaphore == NULL)
      return;

   anv_semaphore_impl_cleanup(device, &semaphore->temporary);
   anv_semaphore_impl_cleanup(device, &semaphore->permanent);

   vk_free2(&device->alloc, pAllocator, semaphore);
}