/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/* This file implements a benchmark of buffer clears and copies done with
 * CP DMA, SDMA, and compute shaders, and prints the measured rates.
 */

#include "si_pipe.h"
#include "si_query.h"

/* Smallest and largest buffer size in bytes that is benchmarked. */
#define MIN_SIZE	512
#define MAX_SIZE	(128 * 1024 * 1024)
/* Each benchmarked size is the previous size shifted left by this amount. */
#define SIZE_SHIFT	1
/* Number of timed iterations per (placement, method, size) combination;
 * the printed rate is computed from the average time of all runs.
 */
#define NUM_RUNS	128

/* Convert "num_bytes bytes processed in ns nanoseconds" into a rate in MB/s,
 * where 1 MB is 1024 * 1024 bytes.
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
	const double bytes_per_mb = 1024.0 * 1024.0;
	const double ns_per_second = 1000000000.0;
	double megabytes = num_bytes / bytes_per_mb;
	double seconds = ns / ns_per_second;

	return megabytes / seconds;
}

/**
 * Benchmark buffer clears and copies and print the results to stdout.
 *
 * A temporary context is created on \p sscreen. For every placement
 * (fill->VRAM, fill->GTT, VRAM->VRAM, VRAM->GTT, GTT->VRAM), every method
 * (CP DMA with 3 cache policies, SDMA, and compute shaders with various
 * dwords-per-thread, cache policy and waves-per-SH settings) and every size
 * from MIN_SIZE to MAX_SIZE, the operation is executed NUM_RUNS times and
 * the average rate in MB/s is printed as one cell of a CSV table.
 *
 * Afterwards the results are analyzed and the C source of
 * get_best_clear_for_<chip>() and get_best_copy_for_<chip>() is printed,
 * which returns the fastest method for each size range.
 *
 * On success this function doesn't return: it terminates the process with
 * exit(0). If the context can't be created, an error is printed to stderr
 * and the function returns without running anything.
 */
void si_test_dma_perf(struct si_screen *sscreen)
{
	struct pipe_screen *screen = &sscreen->b;
	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);

	/* Everything below dereferences the context. */
	if (!ctx) {
		fprintf(stderr, "si_test_dma_perf: failed to create a context\n");
		return;
	}

	struct si_context *sctx = (struct si_context*)ctx;
	const uint32_t clear_value = 0x12345678;
	static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
	/* 0 means that the number of waves per SH is not limited. */
	static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};

	/* Methods 0-2 are CP DMA (one per cache policy), method 3 is SDMA, and
	 * the rest are compute shaders: for each waves_per_sh value, there are
	 * NUM_SHADERS shaders for L2_STREAM followed by NUM_SHADERS shaders
	 * for L2_LRU.
	 */
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

	static const char *method_str[] = {
		"CP MC   ",
		"CP L2   ",
		"CP L2   ",
		"SDMA    ",
	};
	static const char *placement_str[] = {
		/* Clear */
		"fill->VRAM",
		"fill->GTT ",
		/* Copy */
		"VRAM->VRAM",
		"VRAM->GTT ",
		"GTT ->VRAM",
	};

	/* Print the table header: one column per benchmarked size. */
	printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
	printf("Heap       ,Method  ,L2p,Wa,");
	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
		if (size >= 1024)
			printf("%6uKB,", size / 1024);
		else
			printf(" %6uB,", size);
	}
	printf("\n");

	/* results[log2(size)][placement][method] */
	struct si_result {
		bool is_valid;
		bool is_cp;
		bool is_sdma;
		bool is_cs;
		unsigned cache_policy;
		unsigned dwords_per_thread;
		unsigned waves_per_sh;
		unsigned score;
		unsigned index; /* index in results[x][y][index] */
	} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

	/* Run benchmarks. */
	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
		/* The first 2 placements are clears, the rest are copies. */
		bool is_copy = placement >= 2;

		printf("-----------,--------,---,--,");
		for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
			printf("--------,");
		printf("\n");

		for (unsigned method = 0; method < NUM_METHODS; method++) {
			bool test_cp = method <= 2;
			bool test_sdma = method == 3;
			bool test_cs = method >= 4;
			/* Decode the compute shader parameters from the method index.
			 * These values are only used if test_cs is true.
			 */
			unsigned cs_method = method - 4;
			STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
			unsigned cs_waves_per_sh =
				test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
			cs_method %= 2*NUM_SHADERS;
			unsigned cache_policy = test_cp ? method % 3 :
						test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
			unsigned cs_dwords_per_thread =
				test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

			if (sctx->chip_class == SI) {
				/* SI doesn't support CP DMA operations through L2. */
				if (test_cp && cache_policy != L2_BYPASS)
					continue;
				/* WAVES_PER_SH is in multiples of 16 on SI. */
				if (test_cs && cs_waves_per_sh % 16 != 0)
					continue;
			}

			/* Print the row header. */
			printf("%s ,", placement_str[placement]);
			if (test_cs) {
				printf("CS x%-4u,%3s,", cs_dwords_per_thread,
				       cache_policy == L2_LRU ? "LRU" :
				       cache_policy == L2_STREAM ? "Str" : "");
			} else {
				printf("%s,%3s,", method_str[method],
				       method == L2_LRU ? "LRU" :
				       method == L2_STREAM ? "Str" : "");
			}
			if (test_cs && cs_waves_per_sh)
				printf("%2u,", cs_waves_per_sh);
			else
				printf("  ,");

			/* The rate of the last size that was measured in this row. */
			double score = 0;
			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
				/* Don't test bigger sizes if it's too slow. Print 0. */
				if (size >= 512*1024 &&
				    score < 400 * (size / (4*1024*1024))) {
					printf("%7.0f ,", 0.0);
					continue;
				}

				enum pipe_resource_usage dst_usage, src_usage;
				struct pipe_resource *dst, *src;
				struct pipe_query *q[NUM_RUNS];
				unsigned query_type = PIPE_QUERY_TIME_ELAPSED;

				if (test_sdma) {
					if (sctx->chip_class == SI)
						query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
					else
						query_type = SI_QUERY_TIME_ELAPSED_SDMA;
				}

				/* PIPE_USAGE_DEFAULT is used for the VRAM side of a placement
				 * and PIPE_USAGE_STREAM for the GTT side.
				 */
				if (placement == 0 || placement == 2 || placement == 4)
					dst_usage = PIPE_USAGE_DEFAULT;
				else
					dst_usage = PIPE_USAGE_STREAM;

				if (placement == 2 || placement == 3)
					src_usage = PIPE_USAGE_DEFAULT;
				else
					src_usage = PIPE_USAGE_STREAM;

				dst = pipe_buffer_create(screen, 0, dst_usage, size);
				src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;

				/* Don't use NULL buffers if an allocation failed.
				 * Skip this size and print 0.
				 */
				if (!dst || (is_copy && !src)) {
					pipe_resource_reference(&dst, NULL);
					pipe_resource_reference(&src, NULL);
					printf("%7.0f ,", 0.0);
					continue;
				}

				/* Run tests. */
				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
					q[iter] = ctx->create_query(ctx, query_type, 0);
					ctx->begin_query(ctx, q[iter]);

					if (test_cp) {
						/* CP DMA */
						if (is_copy) {
							si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
									      SI_COHERENCY_NONE, cache_policy);
						} else {
							si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
									       clear_value, 0,
									       SI_COHERENCY_NONE, cache_policy);
						}
					} else if (test_sdma) {
						/* SDMA */
						if (is_copy) {
							struct pipe_box box;
							u_box_1d(0, size, &box);
							sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
						} else {
							si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
						}
					} else {
						/* Compute */
						/* The memory accesses are coalesced, meaning that the 1st instruction writes
						 * the 1st contiguous block of data for the whole wave, the 2nd instruction
						 * writes the 2nd contiguous block of data, etc.
						 */
						unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
						unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
						unsigned dwords_per_wave = cs_dwords_per_thread * 64;

						unsigned num_dwords = size / 4;
						unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

						void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
											cache_policy == L2_STREAM, is_copy);

						struct pipe_grid_info info = {};
						info.block[0] = MIN2(64, num_instructions);
						info.block[1] = 1;
						info.block[2] = 1;
						info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
						info.grid[1] = 1;
						info.grid[2] = 1;

						/* Slot 0 is the destination, slot 1 is the source of a copy.
						 * Clears get the clear value via user data instead.
						 */
						struct pipe_shader_buffer sb[2] = {};
						sb[0].buffer = dst;
						sb[0].buffer_size = size;

						if (is_copy) {
							sb[1].buffer = src;
							sb[1].buffer_size = size;
						} else {
							for (unsigned i = 0; i < 4; i++)
								sctx->cs_user_data[i] = clear_value;
						}

						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
							       SI_CONTEXT_INV_SMEM_L1;

						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
						ctx->bind_compute_state(ctx, cs);
						sctx->cs_max_waves_per_sh = cs_waves_per_sh;

						ctx->launch_grid(ctx, &info);

						ctx->bind_compute_state(ctx, NULL);
						ctx->delete_compute_state(ctx, cs);
						sctx->cs_max_waves_per_sh = 0; /* disable the limit */

						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
					}

					/* Flush L2, so that we don't just test L2 cache performance. */
					if (!test_sdma) {
						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
						si_emit_cache_flush(sctx);
					}

					ctx->end_query(ctx, q[iter]);
					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
				}
				pipe_resource_reference(&dst, NULL);
				pipe_resource_reference(&src, NULL);

				/* Get results: sum up the elapsed time of all runs. */
				uint64_t total = 0;

				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
					union pipe_query_result result;

					ctx->get_query_result(ctx, q[iter], true, &result);
					ctx->destroy_query(ctx, q[iter]);

					total += result.u64;
				}

				/* The score is the rate computed from the average time. */
				score = get_MBps_rate(size, total / (double)NUM_RUNS);
				printf("%7.0f ,", score);
				fflush(stdout);

				struct si_result *r = &results[util_logbase2(size)][placement][method];
				r->is_valid = true;
				r->is_cp = test_cp;
				r->is_sdma = test_sdma;
				r->is_cs = test_cs;
				r->cache_policy = cache_policy;
				r->dwords_per_thread = cs_dwords_per_thread;
				r->waves_per_sh = cs_waves_per_sh;
				r->score = score;
				r->index = method;
			}
			puts("");
		}
	}

	/* Print the generated source. The braces that are printed here and
	 * below must be balanced, so that the output is valid C:
	 *
	 *    function {
	 *       if (placement) {
	 *          if (async) {
	 *          } else if (cached) {
	 *          } else {
	 *          }
	 *       } else ... {
	 *          ...
	 *       }
	 *    }
	 */
	puts("");
	puts("static struct si_method");
	printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
	       sctx->screen->info.name);
	puts("{");
	puts("   unsigned size = MIN2(size64, UINT_MAX);\n");

	/* Analyze results and find the best methods. */
	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
		if (placement == 0)
			puts("   if (dst == RADEON_DOMAIN_VRAM) {");
		else if (placement == 1)
			puts("   } else { /* GTT */");
		else if (placement == 2) {
			/* Close the placement branch and the clear function,
			 * and begin the copy function.
			 */
			puts("   }");
			puts("}");
			puts("");
			puts("static struct si_method");
			printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
			       sctx->screen->info.name);
			printf("                     uint64_t size64, bool async, bool cached)\n");
			puts("{");
			puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
			puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
		} else if (placement == 3)
			puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
		else
			puts("   } else { /* GTT -> VRAM */");

		/* mode 0: async, mode 1: gfx ring cached, mode 2: gfx ring uncached */
		for (unsigned mode = 0; mode < 3; mode++) {
			bool async = mode == 0;
			bool cached = mode == 1;

			if (async)
				puts("      if (async) { /* SDMA or async compute */");
			else if (cached)
				puts("      } else if (cached) { /* gfx ring */");
			else
				puts("      } else { /* gfx ring - uncached */");

			/* The list of best chosen methods. */
			struct si_result *methods[32];
			unsigned method_max_size[32];
			unsigned num_methods = 0;

			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
				/* Find the best method. */
				struct si_result *best = NULL;

				for (unsigned i = 0; i < NUM_METHODS; i++) {
					struct si_result *r = &results[util_logbase2(size)][placement][i];

					if (!r->is_valid)
						continue;

					/* Ban CP DMA clears via MC on <= VI. They are super slow
					 * on GTT, which we can get due to BO evictions.
					 */
					if (sctx->chip_class <= VI && placement == 1 &&
					    r->is_cp && r->cache_policy == L2_BYPASS)
						continue;

					if (async) {
						/* The following constraints for compute IBs try to limit
						 * resource usage so as not to decrease the performance
						 * of gfx IBs too much.
						 */

						/* Don't use CP DMA on asynchronous rings, because
						 * the engine is shared with gfx IBs.
						 */
						if (r->is_cp)
							continue;

						/* Don't use L2 caching on asynchronous rings to minimize
						 * L2 usage.
						 */
						if (r->cache_policy == L2_LRU)
							continue;

						/* Asynchronous compute recommends waves_per_sh != 0
						 * to limit CU usage. */
						if (r->is_cs && r->waves_per_sh == 0)
							continue;
					} else {
						/* SDMA is always asynchronous */
						if (r->is_sdma)
							continue;

						if (cached && r->cache_policy == L2_BYPASS)
							continue;
						if (!cached && r->cache_policy == L2_LRU)
							continue;
					}

					if (!best) {
						best = r;
						continue;
					}

					/* Assume some measurement error. Earlier methods occupy fewer
					 * resources, so the next method is always more greedy, and we
					 * don't want to select it due to a measurement error.
					 */
					double min_improvement = 1.03;

					if (best->score * min_improvement < r->score)
						best = r;
				}

				if (num_methods > 0) {
					unsigned prev_index = num_methods - 1;
					struct si_result *prev = methods[prev_index];
					struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];

					/* If the best one is also the best for the previous size,
					 * just bump the size for the previous one.
					 *
					 * If there is no best, it means all methods were too slow
					 * for this size and were not tested. Use the best one for
					 * the previous size.
					 */
					if (!best ||
					    /* If it's the same method as for the previous size: */
					    (prev->is_cp == best->is_cp &&
					     prev->is_sdma == best->is_sdma &&
					     prev->is_cs == best->is_cs &&
					     prev->cache_policy == best->cache_policy &&
					     prev->dwords_per_thread == best->dwords_per_thread &&
					     prev->waves_per_sh == best->waves_per_sh) ||
					    /* If the method for the previous size is also the best
					     * for this size: */
					    (prev_this_size->is_valid &&
					     prev_this_size->score * 1.03 > best->score)) {
						method_max_size[prev_index] = size;
						continue;
					}
				}

				/* There is no usable result for this size and no previous
				 * method to extend. Don't add NULL to the list, because
				 * the entries are dereferenced below.
				 */
				if (!best)
					continue;

				/* Add it to the list. */
				assert(num_methods < ARRAY_SIZE(methods));
				methods[num_methods] = best;
				method_max_size[num_methods] = size;
				num_methods++;
			}

			/* Print one "return" statement per chosen method. */
			for (unsigned i = 0; i < num_methods; i++) {
				struct si_result *best = methods[i];
				unsigned size = method_max_size[i];

				/* The size threshold is between the current benchmarked
				 * size and the next benchmarked size. */
				if (i < num_methods - 1)
					printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
				else if (i > 0)
					printf("         else                   ");
				else
					printf("         ");
				printf("return ");

				assert(best);
				if (best->is_cp) {
					printf("CP_DMA(%s);\n",
					       best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
					       best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
				}
				if (best->is_sdma)
					printf("SDMA;\n");
				if (best->is_cs) {
					printf("COMPUTE(%s, %u, %u);\n",
					       best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
					       best->dwords_per_thread,
					       best->waves_per_sh);
				}
			}
		}
		/* Close the async/cached/uncached chain of this placement. */
		puts("      }");
	}
	/* Close the last placement branch and the copy function. */
	puts("   }");
	puts("}");

	ctx->destroy(ctx);
	exit(0);
}