/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "r600_pipe_common.h"
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "radeon/radeon_video.h"

/*
 * pipe_context
 */

/**
 * Write an EOP event.
 *
 * \param event		EVENT_TYPE_*
 * \param event_flags	Optional cache flush flags (TC)
 * \param data_sel	1 = fence, 3 = timestamp
 * \param buf		Buffer
 * \param va		GPU address
 * \param old_value	Previous fence value (for a bug workaround)
 * \param new_value	Fence value to write for this event.
 */
void si_gfx_write_event_eop(struct r600_common_context *ctx,
			    unsigned event, unsigned event_flags,
			    unsigned data_sel,
			    struct r600_resource *buf, uint64_t va,
			    uint32_t new_fence, unsigned query_type)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned op = EVENT_TYPE(event) |
		      EVENT_INDEX(5) |
		      event_flags;
	unsigned sel = EOP_DATA_SEL(data_sel);

	/* Wait for write confirmation before writing data, but don't send
	 * an interrupt. */
	if (data_sel != EOP_DATA_SEL_DISCARD)
		sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);

	if (ctx->chip_class >= GFX9) {
		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
		 * counters) must immediately precede every timestamp event to
		 * prevent a GPU hang on GFX9.
		 *
		 * Occlusion queries don't need to do it here, because they
		 * always do ZPASS_DONE before the timestamp.
		 */
		if (ctx->chip_class == GFX9 &&
		    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
			struct r600_resource *scratch = ctx->eop_bug_scratch;

			assert(16 * ctx->screen->info.num_render_backends <=
			       scratch->b.b.width0);
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
			radeon_emit(cs, scratch->gpu_address);
			radeon_emit(cs, scratch->gpu_address >> 32);

			radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
		}

		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
		radeon_emit(cs, op);
		radeon_emit(cs, sel);
		radeon_emit(cs, va);		/* address lo */
		radeon_emit(cs, va >> 32);	/* address hi */
		radeon_emit(cs, new_fence);	/* immediate data lo */
		radeon_emit(cs, 0); /* immediate data hi */
		radeon_emit(cs, 0); /* unused */
	} else {
		if (ctx->chip_class == CIK ||
		    ctx->chip_class == VI) {
			struct r600_resource *scratch = ctx->eop_bug_scratch;
			uint64_t va = scratch->gpu_address;

			/* Two EOP events are required to make all engines go idle
			 * (and optional cache flushes executed) before the timestamp
			 * is written.
			 */
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
			radeon_emit(cs, op);
			radeon_emit(cs, va);
			radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
			radeon_emit(cs, 0); /* immediate data */
			radeon_emit(cs, 0); /* unused */

			radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
		}

		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
		radeon_emit(cs, op);
		radeon_emit(cs, va);
		radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
		radeon_emit(cs, new_fence); /* immediate data */
		radeon_emit(cs, 0); /* unused */
	}

	if (buf) {
		radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
					  RADEON_PRIO_QUERY);
	}
}

unsigned si_gfx_write_fence_dwords(struct si_screen *screen)
{
	unsigned dwords = 6;

	if (screen->info.chip_class == CIK ||
	    screen->info.chip_class == VI)
		dwords *= 2;

	if (!screen->info.has_virtual_memory)
		dwords += 2;

	return dwords;
}

void si_gfx_wait_fence(struct r600_common_context *ctx,
		       uint64_t va, uint32_t ref, uint32_t mask)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, ref); /* reference value */
	radeon_emit(cs, mask); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}

static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
	struct radeon_winsys_cs *cs = rctx->dma.cs;

	/* NOP waits for idle on Evergreen and later. */
	if (rctx->chip_class >= CIK)
		radeon_emit(cs, 0x00000000); /* NOP */
	else
		radeon_emit(cs, 0xf0000000); /* NOP */
}

void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
		       struct r600_resource *dst, struct r600_resource *src)
{
	uint64_t vram = ctx->dma.cs->used_vram;
	uint64_t gtt = ctx->dma.cs->used_gart;

	if (dst) {
		vram += dst->vram_usage;
		gtt += dst->gart_usage;
	}
	if (src) {
		vram += src->vram_usage;
		gtt += src->gart_usage;
	}

	/* Flush the GFX IB if DMA depends on it. */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ((dst &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
					       RADEON_USAGE_READWRITE)) ||
	     (src &&
	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
					       RADEON_USAGE_WRITE))))
		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

	/* Flush if there's not enough space, or if the memory usage per IB
	 * is too large.
	 *
	 * IBs using too little memory are limited by the IB submission overhead.
	 * IBs using too much memory are limited by the kernel/TTM overhead.
	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
	 *
	 * This heuristic makes sure that DMA requests are executed
	 * very soon after the call is made and lowers memory usage.
	 * It improves texture upload performance by keeping the DMA
	 * engine busy while uploads are being submitted.
	 */
	num_dw++; /* for emit_wait_idle below */
	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
	}

	/* Wait for idle if either buffer has been used in the IB before to
	 * prevent read-after-write hazards.
	 */
	if ((dst &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
					      RADEON_USAGE_READWRITE)) ||
	    (src &&
	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
					      RADEON_USAGE_WRITE)))
		r600_dma_emit_wait_idle(ctx);

	/* If GPUVM is not supported, the CS checker needs 2 entries
	 * in the buffer list per packet, which has to be done manually.
	 */
	if (ctx->screen->info.has_virtual_memory) {
		if (dst)
			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
						  RADEON_USAGE_WRITE,
						  RADEON_PRIO_SDMA_BUFFER);
		if (src)
			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SDMA_BUFFER);
	}

	/* this function is called before all DMA calls, so increment this. */
	ctx->num_dma_calls++;
}

static void r600_flush_dma_ring(void *ctx, unsigned flags,
				struct pipe_fence_handle **fence)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct radeon_winsys_cs *cs = rctx->dma.cs;
	struct radeon_saved_cs saved;
	bool check_vm =
		(rctx->screen->debug_flags & DBG(CHECK_VM)) &&
		rctx->check_vm_faults;

	if (!radeon_emitted(cs, 0)) {
		if (fence)
			rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
		return;
	}

	if (check_vm)
		si_save_cs(rctx->ws, cs, &saved, true);

	rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
	if (fence)
		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);

	if (check_vm) {
		/* Use conservative timeout 800ms, after which we won't wait any
		 * longer and assume the GPU is hung.
		 */
		rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);

		rctx->check_vm_faults(rctx, &saved, RING_DMA);
		si_clear_saved_cs(&saved);
	}
}

/**
 * Store a linearized copy of all chunks of \p cs together with the buffer
 * list in \p saved.
 */
void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
		struct radeon_saved_cs *saved, bool get_buffer_list)
{
	uint32_t *buf;
	unsigned i;

	/* Save the IB chunks. */
	saved->num_dw = cs->prev_dw + cs->current.cdw;
	saved->ib = MALLOC(4 * saved->num_dw);
	if (!saved->ib)
		goto oom;

	buf = saved->ib;
	for (i = 0; i < cs->num_prev; ++i) {
		memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
		buf += cs->prev[i].cdw;
	}
	memcpy(buf, cs->current.buf, cs->current.cdw * 4);

	if (!get_buffer_list)
		return;

	/* Save the buffer list. */
	saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
	saved->bo_list = CALLOC(saved->bo_count,
				sizeof(saved->bo_list[0]));
	if (!saved->bo_list) {
		FREE(saved->ib);
		goto oom;
	}
	ws->cs_get_buffer_list(cs, saved->bo_list);

	return;

oom:
	fprintf(stderr, "%s: out of memory\n", __func__);
	memset(saved, 0, sizeof(*saved));
}

void si_clear_saved_cs(struct radeon_saved_cs *saved)
{
	FREE(saved->ib);
	FREE(saved->bo_list);

	memset(saved, 0, sizeof(*saved));
}

static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	unsigned latest = rctx->ws->query_value(rctx->ws,
						RADEON_GPU_RESET_COUNTER);

	if (rctx->gpu_reset_counter == latest)
		return PIPE_NO_RESET;

	rctx->gpu_reset_counter = latest;
	return PIPE_UNKNOWN_CONTEXT_RESET;
}

static void r600_set_device_reset_callback(struct pipe_context *ctx,
					   const struct pipe_device_reset_callback *cb)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;

	if (cb)
		rctx->device_reset_callback = *cb;
	else
		memset(&rctx->device_reset_callback, 0,
		       sizeof(rctx->device_reset_callback));
}

bool si_check_device_reset(struct r600_common_context *rctx)
{
	enum pipe_reset_status status;

	if (!rctx->device_reset_callback.reset)
		return false;

	if (!rctx->b.get_device_reset_status)
		return false;

	status = rctx->b.get_device_reset_status(&rctx->b);
	if (status == PIPE_NO_RESET)
		return false;

	rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
	return true;
}

static bool r600_resource_commit(struct pipe_context *pctx,
				 struct pipe_resource *resource,
				 unsigned level, struct pipe_box *box,
				 bool commit)
{
	struct r600_common_context *ctx = (struct r600_common_context *)pctx;
	struct r600_resource *res = r600_resource(resource);

	/*
	 * Since buffer commitment changes cannot be pipelined, we need to
	 * (a) flush any pending commands that refer to the buffer we're about
	 *     to change, and
	 * (b) wait for threaded submit to finish, including those that were
	 *     triggered by some other, earlier operation.
	 */
	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
					     res->buf, RADEON_USAGE_READWRITE)) {
		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
	}
	if (radeon_emitted(ctx->dma.cs, 0) &&
	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
					     res->buf, RADEON_USAGE_READWRITE)) {
		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
	}

	ctx->ws->cs_sync_flush(ctx->dma.cs);
	ctx->ws->cs_sync_flush(ctx->gfx.cs);

	assert(resource->target == PIPE_BUFFER);

	return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
}

bool si_common_context_init(struct r600_common_context *rctx,
			    struct si_screen *sscreen,
			    unsigned context_flags)
{
	slab_create_child(&rctx->pool_transfers, &sscreen->pool_transfers);
	slab_create_child(&rctx->pool_transfers_unsync, &sscreen->pool_transfers);

	rctx->screen = sscreen;
	rctx->ws = sscreen->ws;
	rctx->family = sscreen->info.family;
	rctx->chip_class = sscreen->info.chip_class;

	rctx->b.resource_commit = r600_resource_commit;

	if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) {
		rctx->b.get_device_reset_status = r600_get_reset_status;
		rctx->gpu_reset_counter =
			rctx->ws->query_value(rctx->ws,
					      RADEON_GPU_RESET_COUNTER);
	}

	rctx->b.set_device_reset_callback = r600_set_device_reset_callback;

	si_init_context_texture_functions(rctx);
	si_init_query_functions(rctx);

	if (rctx->chip_class == CIK ||
	    rctx->chip_class == VI ||
	    rctx->chip_class == GFX9) {
		rctx->eop_bug_scratch = (struct r600_resource*)
			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
					   16 * sscreen->info.num_render_backends);
		if (!rctx->eop_bug_scratch)
			return false;
	}

	rctx->allocator_zeroed_memory =
		u_suballocator_create(&rctx->b, sscreen->info.gart_page_size,
				      0, PIPE_USAGE_DEFAULT, 0, true);
	if (!rctx->allocator_zeroed_memory)
		return false;

	rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
						  0, PIPE_USAGE_STREAM);
	if (!rctx->b.stream_uploader)
		return false;

	rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
						 0, PIPE_USAGE_DEFAULT);
	if (!rctx->b.const_uploader)
		return false;

	rctx->ctx = rctx->ws->ctx_create(rctx->ws);
	if (!rctx->ctx)
		return false;

	if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
						   r600_flush_dma_ring,
						   rctx);
		rctx->dma.flush = r600_flush_dma_ring;
	}

	return true;
}

void si_common_context_cleanup(struct r600_common_context *rctx)
{
	unsigned i,j;

	/* Release DCC stats. */
	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
		assert(!rctx->dcc_stats[i].query_active);

		for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
			if (rctx->dcc_stats[i].ps_stats[j])
				rctx->b.destroy_query(&rctx->b,
						      rctx->dcc_stats[i].ps_stats[j]);

		r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
	}

	if (rctx->query_result_shader)
		rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);

	if (rctx->gfx.cs)
		rctx->ws->cs_destroy(rctx->gfx.cs);
	if (rctx->dma.cs)
		rctx->ws->cs_destroy(rctx->dma.cs);
	if (rctx->ctx)
		rctx->ws->ctx_destroy(rctx->ctx);

	if (rctx->b.stream_uploader)
		u_upload_destroy(rctx->b.stream_uploader);
	if (rctx->b.const_uploader)
		u_upload_destroy(rctx->b.const_uploader);

	slab_destroy_child(&rctx->pool_transfers);
	slab_destroy_child(&rctx->pool_transfers_unsync);

	if (rctx->allocator_zeroed_memory) {
		u_suballocator_destroy(rctx->allocator_zeroed_memory);
	}
	rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
	r600_resource_reference(&rctx->eop_bug_scratch, NULL);
}


void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
			    uint64_t offset, uint64_t size, unsigned value)
{
	struct r600_common_context *rctx = (struct r600_common_context*)sscreen->aux_context;

	mtx_lock(&sscreen->aux_context_lock);
	rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
	sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
	mtx_unlock(&sscreen->aux_context_lock);
}