/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};
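
/* For reference, the resulting layout of one entry (derived from the struct
 * above, 256 bytes total):
 *
 *   bytes   0..127 : stream[0..3], 32 bytes each (two dummy "start" qwords,
 *                    then generated_primitives and emitted_primitives)
 *   bytes 128..131 : fence
 *   bytes 132..255 : padding, so that consecutive entries stay
 *                    cacheline-aligned
 */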

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
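   /* Each entry is 32 qwords when viewed through the uint64_t mapping below:
    * qwords 0..15 hold the four counters of the four streams and get bit 63
    * set, and qword 16 overlaps the fence dword (plus the first pad dword)
    * and is cleared so the fence starts out unsignaled.
    */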
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
        i < e; ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers,
                                  struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers,
                                 struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                        EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT,
                        query->last->buf, fence_va, 0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer.
       */
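      /* (gfx10_alloc_query_buffer early-outs while this atom is still marked
       * dirty, so clearing it here is what makes the next begin go through
       * the full setup path again.)
       */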
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   /* Strip the high bit that was pre-set for SET_PREDICATION compatibility. */
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
                                      bool wait, union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
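   /* These feed the result compute shader (gfx10_create_sh_query_result_cs).
    * Roughly: the low bits of config select the mode (sum counters, check
    * availability only, or evaluate the overflow predicates) and bit 3
    * requests a 64-bit result; offset selects the counter/stream to read;
    * chain marks whether a previous partial result exists and whether a
    * later buffer follows; result_count is the number of entries to process.
    */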
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
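         /* The fence dword is written as 0xffffffff (see the release_mem in
          * gfx10_sh_query_end), so testing a single bit is enough here.
          */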
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}