/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stddef.h>

#include "si_pipe.h"
#include "si_query.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include "sid.h"

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
	/* Link in sctx->shader_query_buffers; new buffers are appended at the tail. */
	struct list_head list;
	/* Backing buffer holding an array of struct gfx10_sh_query_buffer_mem entries. */
	struct si_resource *buf;
	/* Reference count: seeded with the number of active shader queries when
	 * the buffer is (re)initialized, incremented by query begin and
	 * decremented when a query releases its buffers. */
	unsigned refcount;

	/* Offset into the buffer in bytes; points at the first un-emitted entry. */
	unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
	/* One group of counters per stream. The CPU initializes all of them
	 * with only the high bit set and masks that bit off when reading
	 * results back. */
	struct {
		/* NOTE(review): the *_start_dummy slots are presumably the "begin"
		 * values expected by SET_PREDICATION; the CPU side never reads
		 * them -- confirm against the shaders. */
		uint64_t generated_primitives_start_dummy;
		uint64_t emitted_primitives_start_dummy;
		uint64_t generated_primitives;
		uint64_t emitted_primitives;
	} stream[4];
	uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
	uint32_t pad[31]; /* rounds the entry up to 256 bytes (128 counter bytes + 4 + 124) */
};

/* Shader-based queries.
 *
 * A query covers the entries from byte offset first_begin in buffer `first`
 * up to (but excluding) byte offset last_end in buffer `last`, including
 * every buffer linked between the two in the context's buffer list.
 */
struct gfx10_sh_query {
	/* Base object; must stay first because the code casts between
	 * struct si_query * and struct gfx10_sh_query *. */
	struct si_query b;

	/* Buffer that was current at begin; NULL if begin failed. */
	struct gfx10_sh_query_buffer *first;
	/* Buffer that was current at end; NULL until the query has ended. */
	struct gfx10_sh_query_buffer *last;
	/* Head of `first` at begin time, in bytes. */
	unsigned first_begin;
	/* Head of `last` at end time, in bytes. */
	unsigned last_end;

	/* Stream index given at creation; selects the stream[] slot to read. */
	unsigned stream;
};

/* Emit callback of the shader_query atom: advance the head of the most
 * recently added query buffer by one entry.
 *
 * The buffer list must not be empty when this is called.
 */
static void emit_shader_query(struct si_context *sctx)
{
	struct gfx10_sh_query_buffer *newest;

	assert(!list_is_empty(&sctx->shader_query_buffers));

	newest = list_last_entry(&sctx->shader_query_buffers,
				 struct gfx10_sh_query_buffer, list);
	newest->head = newest->head + sizeof(struct gfx10_sh_query_buffer_mem);
}

/* Drop one reference from every buffer in the inclusive list range
 * [first, last] and free the buffers that become unreferenced.
 *
 * The newest and the oldest buffer of the context's list are never freed
 * here, even when their reference count reaches zero. Passing first == NULL
 * is a no-op.
 */
static void gfx10_release_query_buffers(struct si_context *sctx,
					struct gfx10_sh_query_buffer *first,
					struct gfx10_sh_query_buffer *last)
{
	struct gfx10_sh_query_buffer *next;

	for (struct gfx10_sh_query_buffer *cur = first; cur; cur = next) {
		/* Look up the successor first: cur may be freed below. */
		next = cur == last ? NULL :
		       LIST_ENTRY(struct gfx10_sh_query_buffer, cur->list.next, list);

		cur->refcount--;
		if (cur->refcount != 0)
			continue; /* still in use by another query */

		/* The most recent buffer may not be full yet, and the oldest
		 * one is kept around for recycling. */
		bool is_newest = cur->list.next == &sctx->shader_query_buffers;
		bool is_oldest = cur->list.prev == &sctx->shader_query_buffers;
		if (is_newest || is_oldest)
			continue;

		list_del(&cur->list);
		si_resource_reference(&cur->buf, NULL);
		FREE(cur);
	}
}

/**
 * Make sure a query buffer entry is bound for subsequent draws.
 *
 * Nothing is done if the shader_query atom is already dirty. Otherwise the
 * entry at the head of the newest buffer is used if that buffer still has
 * room for one more entry. If not, the oldest buffer is recycled when it is
 * unreferenced and idle, or else a new buffer is allocated; either way the
 * buffer is re-initialized and appended to the context's buffer list.
 *
 * On success the selected entry is bound as GFX10_GS_QUERY_BUF, the
 * streamout-query bit is set in current_vs_state and the atom is marked
 * dirty.
 *
 * \return false if allocating or CPU-mapping a buffer failed, true otherwise.
 */
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
	if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
		return true;

	struct gfx10_sh_query_buffer *qbuf = NULL;

	if (!list_is_empty(&sctx->shader_query_buffers)) {
		qbuf = list_last_entry(&sctx->shader_query_buffers,
				       struct gfx10_sh_query_buffer, list);
		if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
			goto success;

		qbuf = list_first_entry(&sctx->shader_query_buffers,
				        struct gfx10_sh_query_buffer, list);
		if (!qbuf->refcount &&
		    !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
		    sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
			/* Can immediately re-use the oldest buffer */
			list_del(&qbuf->list);
		} else {
			qbuf = NULL;
		}
	}

	if (!qbuf) {
		qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
		if (unlikely(!qbuf))
			return false;

		struct si_screen *screen = sctx->screen;
		unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
					 screen->info.min_alloc_size);
		qbuf->buf = si_resource(
			pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
		if (unlikely(!qbuf->buf)) {
			FREE(qbuf);
			return false;
		}
	}

	/* The buffer is currently unused by the GPU. Initialize it.
	 *
	 * We need to set the high bit of all the primitive counters for
	 * compatibility with the SET_PREDICATION packet.
	 */
	uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
						 PIPE_TRANSFER_WRITE |
						 PIPE_TRANSFER_UNSYNCHRONIZED);
	if (unlikely(!results)) {
		/* At this point qbuf is not linked into the list (it is either
		 * brand new or was unlinked for recycling) and no query holds a
		 * reference to it, so it can simply be destroyed.
		 */
		si_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
		return false;
	}

	/* Entry layout expressed in 64-bit words: the counters occupy every
	 * word in front of the fence.
	 */
	const unsigned qwords_per_entry =
		sizeof(struct gfx10_sh_query_buffer_mem) / sizeof(uint64_t);
	const unsigned fence_qword =
		offsetof(struct gfx10_sh_query_buffer_mem, fence) / sizeof(uint64_t);
	const unsigned num_entries =
		qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);

	for (unsigned i = 0; i < num_entries; ++i) {
		uint64_t *entry = results + qwords_per_entry * i;

		for (unsigned j = 0; j < fence_qword; ++j)
			entry[j] = (uint64_t)1 << 63;
		/* Clears the 32-bit fence together with the first padding dword. */
		entry[fence_qword] = 0;
	}

	list_addtail(&qbuf->list, &sctx->shader_query_buffers);
	qbuf->head = 0;
	/* Every query that is currently active will span this buffer. */
	qbuf->refcount = sctx->num_active_shader_queries;

success:;
	struct pipe_shader_buffer sbuf;
	sbuf.buffer = &qbuf->buf->b.b;
	sbuf.buffer_offset = qbuf->head;
	sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
	si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
	sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
	return true;
}

/* si_query_ops::destroy: release the buffers referenced by the query and
 * free the query object itself.
 */
static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *sh_query = (struct gfx10_sh_query *)rquery;

	gfx10_release_query_buffers(sctx, sh_query->first, sh_query->last);
	FREE(sh_query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

	gfx10_release_query_buffers(sctx, query->first, query->last);
	query->first = query->last = NULL;

	if (unlikely(!gfx10_alloc_query_buffer(sctx)))
		return false;

	query->first = list_last_entry(&sctx->shader_query_buffers,
				       struct gfx10_sh_query_buffer, list);
	query->first_begin = query->first->head;

	sctx->num_active_shader_queries++;
	query->first->refcount++;

	return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

	if (unlikely(!query->first))
		return false; /* earlier out of memory error */

	query->last = list_last_entry(&sctx->shader_query_buffers,
				      struct gfx10_sh_query_buffer, list);
	query->last_end = query->last->head;

	/* Signal the fence of the previous chunk */
	if (query->last_end != 0) {
		uint64_t fence_va = query->last->buf->gpu_address;
		fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
		fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
		si_cp_release_mem(sctx, sctx->gfx_cs,
				  V_028A90_BOTTOM_OF_PIPE_TS, 0,
				  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
				  EOP_DATA_SEL_VALUE_32BIT,
				  query->last->buf, fence_va, 0xffffffff,
				  PIPE_QUERY_GPU_FINISHED);
	}

	sctx->num_active_shader_queries--;

	if (sctx->num_active_shader_queries > 0) {
		gfx10_alloc_query_buffer(sctx);
	} else {
		si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
		sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

		/* If a query_begin is followed by a query_end without a draw
		 * in-between, we need to clear the atom to ensure that the
		 * next query_begin will re-initialize the shader buffer. */
		si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
	}

	return true;
}

/**
 * Accumulate the counters of one query buffer entry into \p result.
 *
 * \param query   query whose type and stream select what is accumulated
 * \param qmem    CPU-visible entry to read
 * \param result  accumulator; counters are added, predicate results are OR'ed
 *
 * The high bit that is set on every counter at initialization is masked off
 * before counters are added. The overflow predicates compare the raw values,
 * which is equivalent since both operands carry that bit.
 */
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
				      struct gfx10_sh_query_buffer_mem *qmem,
				      union pipe_query_result *result)
{
	static const uint64_t mask = ((uint64_t)1 << 63) - 1;

	switch (query->b.type) {
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		result->u64 += qmem->stream[query->stream].generated_primitives & mask;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		result->so_statistics.num_primitives_written +=
			qmem->stream[query->stream].emitted_primitives & mask;
		result->so_statistics.primitives_storage_needed +=
			qmem->stream[query->stream].generated_primitives & mask;
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		result->b |= qmem->stream[query->stream].emitted_primitives !=
			     qmem->stream[query->stream].generated_primitives;
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		/* "Any" means every stream has to be checked, so index with the
		 * loop variable rather than with the query's own stream. */
		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
			result->b |= qmem->stream[stream].emitted_primitives !=
				     qmem->stream[stream].generated_primitives;
		}
		break;
	default:
		assert(0);
	}
}

/* si_query_ops::get_result: read the query result back on the CPU.
 *
 * Walks the query's buffers from last to first, maps each one for reading
 * and accumulates every entry inside the query's range into *result, which
 * is cleared first. When wait is false the maps are requested with
 * PIPE_TRANSFER_DONTBLOCK.
 *
 * Returns false if the query never began successfully or if a buffer could
 * not be mapped; *result may then hold a partial sum.
 */
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
				      bool wait, union pipe_query_result *result)
{
	struct gfx10_sh_query *sh_query = (struct gfx10_sh_query *)rquery;
	const unsigned map_flags = wait ? PIPE_TRANSFER_READ
					: (PIPE_TRANSFER_READ | PIPE_TRANSFER_DONTBLOCK);
	struct gfx10_sh_query_buffer *qbuf;

	util_query_clear_result(result, sh_query->b.type);

	if (unlikely(!sh_query->first))
		return false; /* earlier out of memory error */
	assert(sh_query->last);

	qbuf = sh_query->last;
	for (;;) {
		uint8_t *base;

		if (rquery->b.flushed)
			base = sctx->ws->buffer_map(qbuf->buf->buf, NULL, map_flags);
		else
			base = si_buffer_map_sync_with_rings(sctx, qbuf->buf, map_flags);

		if (!base)
			return false;

		/* Buffers strictly between first and last contribute every entry
		 * in front of their head; the end buffers are clipped to the
		 * offsets recorded at begin/end. */
		unsigned offset = qbuf == sh_query->first ? sh_query->first_begin : 0;
		unsigned limit = qbuf == sh_query->last ? sh_query->last_end : qbuf->head;

		for (; offset != limit; offset += sizeof(struct gfx10_sh_query_buffer_mem)) {
			gfx10_sh_query_add_result(sh_query,
						  (struct gfx10_sh_query_buffer_mem *)(base + offset),
						  result);
		}

		if (qbuf == sh_query->first)
			break;

		qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list);
	}

	return true;
}

/* si_query_ops::get_result_resource: write the query result (or, for a
 * negative index, its availability) into a buffer resource on the GPU.
 *
 * One single-thread compute dispatch of sh_query_result_shader is launched
 * per query buffer, walking from query->first to query->last. When the
 * query spans more than one buffer, a 16-byte scratch allocation is bound
 * as SSBO 1 and, until the last buffer is reached, also as SSBO 2; for the
 * last buffer SSBO 2 points at `resource` + `offset`.
 *
 * The compute (QBO) state is saved before and restored after the
 * dispatches. The function returns silently if the result shader or the
 * scratch buffer cannot be created.
 */
static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
					       struct si_query *rquery,
					       bool wait,
					       enum pipe_query_value_type result_type,
					       int index,
					       struct pipe_resource *resource,
					       unsigned offset)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
	struct si_qbo_state saved_state = {};
	struct pipe_resource *tmp_buffer = NULL;
	unsigned tmp_buffer_offset = 0;

	/* The result shader is created lazily on first use. */
	if (!sctx->sh_query_result_shader) {
		sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
		if (!sctx->sh_query_result_shader)
			return;
	}

	/* A scratch buffer is only needed when several buffers are chained. */
	if (query->first != query->last) {
		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
				     &tmp_buffer_offset, &tmp_buffer);
		if (!tmp_buffer)
			return;
	}

	si_save_qbo_state(sctx, &saved_state);

	/* Pre-fill the constants configuring the shader behavior. */
	struct {
		uint32_t config;
		uint32_t offset;
		uint32_t chain;
		uint32_t result_count;
	} consts;
	struct pipe_constant_buffer constant_buffer = {};

	/* NOTE(review): the meaning of the config values and of the dword
	 * offsets below is defined by the result shader, which is not visible
	 * in this file -- keep them in sync with gfx10_create_sh_query_result_cs.
	 */
	if (index >= 0) {
		switch (query->b.type) {
		case PIPE_QUERY_PRIMITIVES_GENERATED:
			consts.offset = sizeof(uint32_t) * query->stream;
			consts.config = 0;
			break;
		case PIPE_QUERY_PRIMITIVES_EMITTED:
			consts.offset = sizeof(uint32_t) * (4 + query->stream);
			consts.config = 0;
			break;
		case PIPE_QUERY_SO_STATISTICS:
			consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
			consts.config = 0;
			break;
		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
			consts.offset = sizeof(uint32_t) * query->stream;
			consts.config = 2;
			break;
		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
			consts.offset = 0;
			consts.config = 3;
			break;
		default: unreachable("bad query type");
		}
	} else {
		/* Check result availability. */
		consts.offset = 0;
		consts.config = 1;
	}

	/* Config value 8 is OR'ed in when a 64-bit result type is requested. */
	if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
		consts.config |= 8;

	/* The constants are passed by pointer, so the per-buffer updates of
	 * consts.chain / consts.result_count in the loop below are picked up by
	 * each set_constant_buffer call. */
	constant_buffer.buffer_size = sizeof(consts);
	constant_buffer.user_buffer = &consts;

	/* Pre-fill the SSBOs and grid. */
	struct pipe_shader_buffer ssbo[3];
	struct pipe_grid_info grid = {};

	ssbo[1].buffer = tmp_buffer;
	ssbo[1].buffer_offset = tmp_buffer_offset;
	ssbo[1].buffer_size = 16;

	/* Until the last buffer is reached, SSBO 2 aliases the scratch buffer. */
	ssbo[2] = ssbo[1];

	sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

	/* A single thread in a single workgroup. */
	grid.block[0] = 1;
	grid.block[1] = 1;
	grid.block[2] = 1;
	grid.grid[0] = 1;
	grid.grid[1] = 1;
	grid.grid[2] = 1;

	struct gfx10_sh_query_buffer *qbuf = query->first;
	for (;;) {
		unsigned begin = qbuf == query->first ? query->first_begin : 0;
		unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
		/* NOTE(review): `continue` in this for (;;) neither advances qbuf
		 * nor checks for the last buffer, so end == 0 (only possible for
		 * query->last with last_end == 0) would spin forever. Skipping
		 * the last buffer would also skip the write to `resource`. The
		 * correct handling depends on how the result shader treats
		 * result_count == 0 -- confirm and fix.
		 */
		if (!end)
			continue;

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = begin;
		ssbo[0].buffer_size = end - begin;

		consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
		/* chain bit 0: this is not the first buffer;
		 * chain bit 1: this is not the last buffer. */
		consts.chain = 0;
		if (qbuf != query->first)
			consts.chain |= 1;
		if (qbuf != query->last)
			consts.chain |= 2;

		/* The final dispatch writes to the caller's resource. */
		if (qbuf == query->last) {
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;
		}

		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

		if (wait) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address;
			va += end - sizeof(struct gfx10_sh_query_buffer_mem);
			va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

			si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
		}

		sctx->b.launch_grid(&sctx->b, &grid);
		sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

		if (qbuf == query->last)
			break;
		qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
	}

	si_restore_qbo_state(sctx, &saved_state);
	/* Drops the scratch buffer reference; no-op when none was allocated. */
	pipe_resource_reference(&tmp_buffer, NULL);
}

/* Operations table installed on every query made by gfx10_sh_query_create. */
static const struct si_query_ops gfx10_sh_query_ops = {
	.destroy = gfx10_sh_query_destroy,
	.begin = gfx10_sh_query_begin,
	.end = gfx10_sh_query_end,
	.get_result = gfx10_sh_query_get_result,
	.get_result_resource = gfx10_sh_query_get_result_resource,
};

/**
 * Create a shader-based query object.
 *
 * \param screen      not used by this function
 * \param query_type  query type stored in the object
 * \param index       stream index stored in the object
 * \return the new zero-initialized query, or NULL if allocation failed
 */
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
					 enum pipe_query_type query_type,
					 unsigned index)
{
	struct gfx10_sh_query *sh_query;

	sh_query = CALLOC_STRUCT(gfx10_sh_query);
	if (unlikely(!sh_query))
		return NULL;

	sh_query->stream = index;
	sh_query->b.type = query_type;
	sh_query->b.ops = &gfx10_sh_query_ops;

	return (struct pipe_query *)sh_query;
}

/* Set up the per-context shader query state: install the emit callback of
 * the shader_query atom and start with an empty buffer list.
 */
void gfx10_init_query(struct si_context *sctx)
{
	sctx->atoms.s.shader_query.emit = emit_shader_query;
	list_inithead(&sctx->shader_query_buffers);
}

/* Free every query buffer still owned by the context.
 *
 * All queries are expected to have released their references by now; this
 * is checked with an assertion.
 */
void gfx10_destroy_query(struct si_context *sctx)
{
	struct list_head *buffers = &sctx->shader_query_buffers;

	while (!list_is_empty(buffers)) {
		struct gfx10_sh_query_buffer *oldest =
			list_first_entry(buffers, struct gfx10_sh_query_buffer, list);

		assert(!oldest->refcount);

		list_del(&oldest->list);
		si_resource_reference(&oldest->buf, NULL);
		FREE(oldest);
	}
}