/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "si_build_pm4.h"
#include "ac_llvm_cull.h"

#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"
#include "util/fast_idiv_by_const.h"

/* Based on:
 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
 */

/* This file implements primitive culling using asynchronous compute.
 * It's written to be GL conformant.
 *
 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
 * in a compute shader. The shader processes 1 primitive/thread by invoking
 * the VS for each vertex to get the positions, decomposes strips and fans
 * into triangles (if needed), eliminates primitive restart (if needed),
 * does (W<0) culling, face culling, view XY culling, zero-area and
 * small-primitive culling, and generates a new index buffer that doesn't
 * contain culled primitives.
 *
 * The index buffer is generated using the Ordered Count feature of GDS,
 * which is an atomic counter that is incremented in the wavefront launch
 * order, so that the original primitive order is preserved.
 *
 * Another GDS ordered counter is used to eliminate primitive restart indices.
 * If a restart index lands on an even thread ID, the compute shader has to flip
 * the primitive orientation of the whole following triangle strip. The primitive
 * orientation has to be correct after strip and fan decomposition for two-sided
 * shading to behave correctly. The decomposition also needs to be aware of
 * which vertex is the provoking vertex for flat shading to behave correctly.
 *
 * IB = a GPU command buffer
 *
 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
 * doesn't continue if its word isn't 0x80000000. Once compute shaders are
 * finished culling, the last wave will write the final primitive count from
 * GDS directly into the count word of the draw packet in the gfx IB, and
 * a CS_DONE event will signal the REWIND packet to continue. It's really
 * a direct draw with command buffer patching from the compute queue.
 *
 * The compute IB doesn't have to start when its corresponding gfx IB starts,
 * but can start sooner. The compute IB is signaled to start after the last
 * execution barrier in the *previous* gfx IB. This is handled as follows.
 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
 * represents the barrier in the previous gfx IB.
 *
 * Features:
 * - Triangle strips and fans are decomposed into an indexed triangle list.
 *   The decomposition differs based on the provoking vertex state.
 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
 *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
 * - Primitive restart is fully supported with triangle strips, including
 *   correct primitive orientation across multiple waves. (restart indices
 *   reset primitive orientation)
 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
 * - Back face culling, incl. culling zero-area / degenerate primitives.
 * - View XY culling.
 * - View Z culling (disabled due to limited impact with perspective projection).
 * - Small primitive culling for all MSAA modes and all quant modes.
 *
 * The following are not implemented:
 * - ClipVertex/ClipDistance/CullDistance-based culling.
 * - Scissor culling.
 * - HiZ culling.
 *
 * Limitations (and unimplemented features that may be possible to implement):
 * - Only triangles, triangle strips, and triangle fans are supported.
 * - Primitive restart is only supported with triangle strips.
 * - Instancing and primitive restart can't be used together.
 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
 * - The instance divisor buffer is unavailable, so all divisors must be
 *   either 0 or 1.
 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
 * - No support for tessellation and geometry shaders.
 *   (patch elimination where tess factors are 0 would be possible to implement)
 * - The vertex shader must not contain memory stores.
 * - All VS resources must not have a write usage in the command buffer.
 *   (TODO: all shader buffers currently set the write usage)
 * - Bindless textures and images must not occur in the vertex shader.
 *
 * User data SGPR layout:
 *   INDEX_BUFFERS: pointer to constants
 *     0..3: input index buffer - typed buffer view
 *     4..7: output index buffer - typed buffer view
 *     8..11: viewport state - scale.xy, translate.xy
 *   VERTEX_COUNTER: counter address or first primitive ID
 *     - If unordered memory counter: address of "count" in the draw packet
 *       and is incremented atomically by the shader.
 *     - If unordered GDS counter: address of "count" in GDS starting from 0,
 *       must be initialized to 0 before the dispatch.
 *     - If ordered GDS counter: the primitive ID that should reset the vertex
 *       counter to 0 in GDS
 *   LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
 *       count to memory if using GDS ordered append
 *   VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
 *       using GDS ordered append
 *   VS.VERTEX_BUFFERS:           same value as VS
 *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
 *   VS.SAMPLERS_AND_IMAGES:      same value as VS
 *   VS.BASE_VERTEX:              same value as VS
 *   VS.START_INSTANCE:           same value as VS
 *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
 *       per instance for instancing.
 *   NUM_PRIMS_UDIV_TERMS:
 *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
 *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
 *   PRIMITIVE_RESTART_INDEX
 *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
 *
 *
 * The code contains 3 codepaths:
 * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
 * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
 * - Ordered GDS counter (it preserves the primitive order)
 *
 * How to test primitive restart (the most complicated part because it needs
 * to get the primitive orientation right):
 *   Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
 *   primitive orientation flips with small draw calls, which is what most tests use.
 *   You can also enable draw call splitting into draw calls with just 2 primitives.
 */

/* Compile-time tunables of the primitive discard compute shader. */

/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE		256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU		1 /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH		0 /* no limit */
#define INDEX_STORES_USE_SLC		1 /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
#define CULL_Z				0
/* Selects which of the 3 counter codepaths is compiled in:
 * 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
#define VERTEX_COUNTER_GDS_MODE		2
#define GDS_SIZE_UNORDERED		(4 * 1024) /* only for the unordered GDS counter */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance. */
#define PRIMS_PER_BATCH			(512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
 * for big draw calls sooner, but doesn't allow context flushes between packets.
 * Primitive restart is supported. Only implemented for ordered append. */
#define SPLIT_PRIMS_PACKET_LEVEL_VALUE	PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
 * this number of primitives, so that we can flush the context and get free ring space. */
#define SPLIT_PRIMS_DRAW_LEVEL		PRIMS_PER_BATCH

/* Derived values. */

/* Threadgroup size in units of 64 threads, rounded up.
 * (64 is presumably the wave size; the shader masks the thread ID with 63.) */
#define WAVES_PER_TG			DIV_ROUND_UP(THREADGROUP_SIZE, 64)
/* The packet-level split size that is actually used. It equals
 * SPLIT_PRIMS_PACKET_LEVEL_VALUE only with the ordered GDS counter (mode 2).
 * In the other modes it is UINT_MAX with the low bits cleared, i.e. rounded
 * down to a multiple of THREADGROUP_SIZE. The mask only works if
 * THREADGROUP_SIZE is a power of two. */
#define SPLIT_PRIMS_PACKET_LEVEL	(VERTEX_COUNTER_GDS_MODE == 2 ? \
					 SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
					 UINT_MAX & ~(THREADGROUP_SIZE - 1))

/* The value that the REWIND packet word must contain for the gfx IB to
 * continue (REWIND doesn't continue if its word isn't 0x80000000). */
#define REWIND_SIGNAL_BIT		0x80000000
/* For emulating the rewind packet on CI. */
#define FORCE_REWIND_EMULATION		0

/* Initialize the primitive discard tunables of a context.
 *
 * prim_discard_vertex_count_threshold is first set to UINT_MAX, which
 * disables the feature. It is only lowered if the chip is supported and
 * a debug flag (or the currently disabled pro-graphics default) requests it.
 * In that case index_ring_size_per_ib is chosen from the VRAM size as well.
 */
void si_initialize_prim_discard_tunables(struct si_context *sctx)
{
	sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */

	/* SI support is not implemented */
	if (sctx->chip_class == GFX6)
		return;
	if (!sctx->screen->info.has_gds_ordered_append)
		return;
	if (sctx->screen->debug_flags & DBG(NO_PD))
		return;
	/* If aux_context == NULL, we are initializing aux_context right now. */
	if (!sctx->screen->aux_context)
		return;

	const bool always_pd = (sctx->screen->debug_flags & DBG(ALWAYS_PD)) != 0;
	const bool debug_pd = (sctx->screen->debug_flags & DBG(PD)) != 0;

	/* TODO: enable this after the GDS kernel memory management is fixed */
	const bool enable_on_pro_graphics_by_default = false;
	bool pro_graphics_default = false;

	if (enable_on_pro_graphics_by_default &&
	    sctx->screen->info.is_pro_graphics) {
		/* Chips that get primitive discard by default on pro graphics. */
		switch (sctx->family) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_VEGA10:
		case CHIP_VEGA20:
			pro_graphics_default = true;
			break;
		default:
			break;
		}
	}

	/* Nobody asked for primitive discard: leave it disabled. */
	if (!always_pd && !debug_pd && !pro_graphics_default)
		return;

	/* 6K triangles normally; ALWAYS_PD removes the threshold entirely. */
	sctx->prim_discard_vertex_count_threshold = always_pd ? 0 : 6000 * 3;

	const uint32_t MB = 1024 * 1024;
	const uint64_t GB = 1024 * 1024 * 1024;
	const uint64_t vram_size = sctx->screen->info.vram_size;
	unsigned ring_size_in_mb;

	/* The total size is double this per context.
	 * Greater numbers allow bigger gfx IBs.
	 */
	if (vram_size <= 2 * GB)
		ring_size_in_mb = 64;
	else if (vram_size <= 4 * GB)
		ring_size_in_mb = 128;
	else
		ring_size_in_mb = 256;

	sctx->index_ring_size_per_ib = ring_size_in_mb * MB;
}

/* Emit a call to the "llvm.amdgcn.ds.ordered.<opcode>" intrinsic and return
 * its i32 result.
 *
 * opcode:              "add" or "swap"
 * m0:                  integer that is converted to a GDS pointer and passed
 *                      as the first operand
 * value:               the operand of the add/swap
 * ordered_count_index: passed through as an i32 constant
 * release, done:       passed through as i1 constants
 */
static LLVMValueRef
si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
		       LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
		       bool release, bool done)
{
	LLVMValueRef operands[8];
	unsigned num_operands = 0;

	operands[num_operands++] =
		LLVMBuildIntToPtr(ctx->ac.builder, m0,
				  LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
	operands[num_operands++] = value;
	/* ordering */
	operands[num_operands++] = LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0);
	/* scope */
	operands[num_operands++] = ctx->i32_0;
	/* volatile */
	operands[num_operands++] = ctx->i1false;
	operands[num_operands++] = LLVMConstInt(ctx->i32, ordered_count_index, 0);
	operands[num_operands++] = LLVMConstInt(ctx->i1, release, 0);
	operands[num_operands++] = LLVMConstInt(ctx->i1, done, 0);

	/* The opcode selects the variant of the intrinsic by name. */
	char name[64];
	snprintf(name, sizeof(name), "llvm.amdgcn.ds.ordered.%s", opcode);

	return ac_build_intrinsic(&ctx->ac, name, ctx->i32, operands, num_operands, 0);
}

/* Turn a 32-bit address into a global-address-space pointer to i32.
 * The value is zero-extended to 64 bits and the upper 32 bits are filled
 * in from info.address32_hi.
 */
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
	const uint64_t high_bits = (uint64_t)ctx->screen->info.address32_hi << 32;

	LLVMValueRef low_addr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, "");
	LLVMValueRef full_addr = LLVMBuildOr(ctx->ac.builder, low_addr,
					     LLVMConstInt(ctx->i64, high_bits, 0), "");

	return LLVMBuildIntToPtr(ctx->ac.builder, full_addr,
				 LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), "");
}

/* State of a code section that only executes on thread 0.
 * It is filled in by si_enter_thread0_section and consumed by
 * si_exit_thread0_section.
 */
struct si_thread0_section {
	struct si_shader_context *ctx; /* the shader context the section was entered with */
	LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
	LLVMValueRef saved_exec; /* NOTE(review): not written by the enter/exit helpers below - confirm it is used */
};

/* Begin a section that only thread 0 executes.
 *
 * An i32 temporary ("result0") is allocated for the value that
 * si_exit_thread0_section later broadcasts, and an IF on (thread_id == 0)
 * is opened. The section must be closed with si_exit_thread0_section.
 */
static void si_enter_thread0_section(struct si_shader_context *ctx,
				     struct si_thread0_section *section,
				     LLVMValueRef thread_id)
{
	section->ctx = ctx;
	section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0");

	/* This IF has 4 instructions:
	 *   v_and_b32_e32 v, 63, v         ; get the thread ID
	 *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
	 *   s_and_saveexec_b64 s, vcc
	 *   s_cbranch_execz BB0_4
	 *
	 * It could just be s_and_saveexec_b64 s, 1.
	 */
	LLVMValueRef is_thread0 = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
						thread_id, ctx->i32_0, "");

	ac_build_ifcc(&ctx->ac, is_thread0, 12601);
}

/* End a section that only thread 0 executes.
 *
 * On entry *result is the value computed inside the section. It is stored
 * to the section's temporary before the IF is closed, then reloaded and
 * broadcast from thread 0, so that on return *result holds the thread 0
 * value on all threads.
 */
static void si_exit_thread0_section(struct si_thread0_section *section,
				    LLVMValueRef *result)
{
	struct si_shader_context *sctx = section->ctx;

	/* Still inside the IF: only thread 0 writes the temporary. */
	LLVMBuildStore(sctx->ac.builder, *result, section->vgpr_result);
	ac_build_endif(&sctx->ac, 12601);

	/* Broadcast the result from thread 0 to all threads. */
	LLVMValueRef thread0_value =
		LLVMBuildLoad(sctx->ac.builder, section->vgpr_result, "");

	*result = ac_build_readlane(&sctx->ac, thread0_value, NULL);
}

void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
	struct si_shader_key *key = &ctx->shader->key;
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef vs = ctx->main_fn;

	/* Always inline the VS function. */
	ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
	LLVMSetLinkage(vs, LLVMPrivateLinkage);

	LLVMTypeRef const_desc_type;
	if (ctx->shader->selector->info.const_buffers_declared == 1 &&
	    ctx->shader->selector->info.shader_buffers_declared == 0)
		const_desc_type = ctx->f32;
	else
		const_desc_type = ctx->v4i32;

	struct si_function_info fninfo;
	si_init_function_info(&fninfo);

	LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc;
	LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id;
	LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision;
	LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc;
	LLVMValueRef last_wave_prim_id, vertex_count_addr;

	add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
		       &index_buffers_and_constants);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr);
	add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
		       &vb_desc);
	add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type),
		       &const_desc);
	add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32),
		       &sampler_desc);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index);
	add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision);

	/* Block ID and thread ID inputs. */
	add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id);
	if (VERTEX_COUNTER_GDS_MODE == 2)
		add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id);
	add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id);

	/* Create the compute shader function. */
	unsigned old_type = ctx->type;
	ctx->type = PIPE_SHADER_COMPUTE;
	si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE);
	ctx->type = old_type;

	if (VERTEX_COUNTER_GDS_MODE == 1) {
		ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
						     GDS_SIZE_UNORDERED);
	}

	/* Assemble parameters for VS. */
	LLVMValueRef vs_params[16];
	unsigned num_vs_params = 0;
	unsigned param_vertex_id, param_instance_id;

	vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
	vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
	vs_params[num_vs_params++] = const_desc;
	vs_params[num_vs_params++] = sampler_desc;
	vs_params[num_vs_params++] = LLVMConstInt(ctx->i32,
					S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
	vs_params[num_vs_params++] = base_vertex;
	vs_params[num_vs_params++] = start_instance;
	vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */
	vs_params[num_vs_params++] = vb_desc;

	vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
	vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
	vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */
	vs_params[num_vs_params++] = ctx->i32_0; /* unused */

	assert(num_vs_params <= ARRAY_SIZE(vs_params));
	assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));

	/* Load descriptors. (load 8 dwords at once) */
	LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];

	tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
				   ac_array_in_const32_addr_space(ctx->v8i32), "");
	tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0);

	for (unsigned i = 0; i < 8; i++)
		desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);

	input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
	output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);

	/* Compute PrimID and InstanceID. */
	LLVMValueRef global_thread_id =
		ac_build_imad(&ctx->ac, block_id,
			      LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id);
	LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
	LLVMValueRef instance_id = ctx->i32_0;

	if (key->opt.cs_instancing) {
		/* Unpack num_prims_udiv_terms. */
		LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
						       LLVMConstInt(ctx->i32, 0x1f, 0), "");
		LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
								LLVMConstInt(ctx->i32, 5, 0), "");
		/* Divide the total prim_id by the number of prims per instance. */
		instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
							       num_prims_udiv_multiplier,
							       post_shift);
		/* Compute the remainder. */
		prim_id = LLVMBuildSub(builder, prim_id,
				       LLVMBuildMul(builder, instance_id,
						    prims_per_instance, ""), "");
	}

	/* Generate indices (like a non-indexed draw call). */
	LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)};
	unsigned vertices_per_prim = 3;

	switch (key->opt.cs_prim_type) {
	case PIPE_PRIM_TRIANGLES:
		for (unsigned i = 0; i < 3; i++) {
			index[i] = ac_build_imad(&ctx->ac, prim_id,
						 LLVMConstInt(ctx->i32, 3, 0),
						 LLVMConstInt(ctx->i32, i, 0));
		}
		break;
	case PIPE_PRIM_TRIANGLE_STRIP:
		for (unsigned i = 0; i < 3; i++) {
			index[i] = LLVMBuildAdd(builder, prim_id,
						LLVMConstInt(ctx->i32, i, 0), "");
		}
		break;
	case PIPE_PRIM_TRIANGLE_FAN:
		/* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
		 * and rasterizer as a normal triangle, so we need to put the provoking
		 * vertex into the correct index variable and preserve orientation at the same time.
		 * gl_VertexID is preserved, because it's equal to the index.
		 */
		if (key->opt.cs_provoking_vertex_first) {
			index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
			index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
			index[2] = ctx->i32_0;
		} else {
			index[0] = ctx->i32_0;
			index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
			index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
		}
		break;
	default:
		unreachable("unexpected primitive type");
	}

	/* Fetch indices. */
	if (key->opt.cs_indexed) {
		for (unsigned i = 0; i < 3; i++) {
			index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
							       index[i], ctx->i32_0, 1,
							       0, true);
			index[i] = ac_to_integer(&ctx->ac, index[i]);
		}
	}

	/* Extract the ordered wave ID. */
	if (VERTEX_COUNTER_GDS_MODE == 2) {
		ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
						LLVMConstInt(ctx->i32, 6, 0), "");
		ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
					       LLVMConstInt(ctx->i32, 0xfff, 0), "");
	}
	LLVMValueRef thread_id =
		LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), "");

	/* Every other triangle in a strip has a reversed vertex order, so we
	 * need to swap vertices of odd primitives to get the correct primitive
	 * orientation when converting triangle strips to triangles. Primitive
	 * restart complicates it, because a strip can start anywhere.
	 */
	LLVMValueRef prim_restart_accepted = ctx->i1true;

	if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
		/* Without primitive restart, odd primitives have reversed orientation.
		 * Only primitive restart can flip it with respect to the first vertex
		 * of the draw call.
		 */
		LLVMValueRef first_is_odd = ctx->i1false;

		/* Handle primitive restart. */
		if (key->opt.cs_primitive_restart) {
			/* Get the GDS primitive restart continue flag and clear
			 * the flag in vertex_counter. This flag is used when the draw
			 * call was split and we need to load the primitive orientation
			 * flag from GDS for the first wave too.
			 */
			LLVMValueRef gds_prim_restart_continue =
				LLVMBuildLShr(builder, vertex_counter,
					      LLVMConstInt(ctx->i32, 31, 0), "");
			gds_prim_restart_continue =
				LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, "");
			vertex_counter = LLVMBuildAnd(builder, vertex_counter,
						      LLVMConstInt(ctx->i32, 0x7fffffff, 0), "");

			LLVMValueRef index0_is_reset;

			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
								       restart_index, "");
				if (i == 0)
					index0_is_reset = LLVMBuildNot(builder, not_reset, "");
				prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
								     not_reset, "");
			}

			/* If the previous waves flip the primitive orientation
			 * of the current triangle strip, it will be stored in GDS.
			 *
			 * Sometimes the correct orientation is not needed, in which case
			 * we don't need to execute this.
			 */
			if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
				/* If there are reset indices in this wave, get the thread index
				 * where the most recent strip starts relative to each thread.
				 */
				LLVMValueRef preceding_threads_mask =
					LLVMBuildSub(builder,
						     LLVMBuildShl(builder, ctx->ac.i64_1,
								  LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
						     ctx->ac.i64_1, "");

				LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
				LLVMValueRef preceding_reset_threadmask =
					LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
				LLVMValueRef strip_start =
					ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
				strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");

				/* This flips the orientation based on reset indices within this wave only. */
				first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");

				LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
				LLVMValueRef is_first_wave, current_wave_resets_index;

				/* Get the thread index where the last strip starts in this wave.
				 *
				 * If the last strip doesn't start in this wave, the thread index
				 * will be 0.
				 *
				 * If the last strip starts in the next wave, the thread index will
				 * be 64.
				 */
				last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
				last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");

				struct si_thread0_section section;
				si_enter_thread0_section(ctx, &section, thread_id);

				/* This must be done in the thread 0 section, because
				 * we expect PrimID to be 0 for the whole first wave
				 * in this expression.
				 *
				 * NOTE: This will need to be different if we wanna support
				 * instancing with primitive restart.
				 */
				is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
				is_first_wave = LLVMBuildAnd(builder, is_first_wave,
							     LLVMBuildNot(builder,
									  gds_prim_restart_continue, ""), "");
				current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
									  last_strip_start, ctx->i32_0, "");

				ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");

				/* Save the last strip start primitive index in GDS and read
				 * the value that previous waves stored.
				 *
				 * if (is_first_wave || current_wave_resets_strip)
				 *    // Read the value that previous waves stored and store a new one.
				 *    first_is_odd = ds.ordered.swap(last_strip_start);
				 * else
				 *    // Just read the value that previous waves stored.
				 *    first_is_odd = ds.ordered.add(0);
				 */
				ac_build_ifcc(&ctx->ac,
					      LLVMBuildOr(builder, is_first_wave,
							  current_wave_resets_index, ""), 12602);
				{
					/* The GDS address is always 0 with ordered append. */
					tmp = si_build_ds_ordered_op(ctx, "swap",
								     ordered_wave_id, last_strip_start,
								     1, true, false);
					LLVMBuildStore(builder, tmp, ret);
				}
				ac_build_else(&ctx->ac, 12603);
				{
					/* Just read the value from GDS. */
					tmp = si_build_ds_ordered_op(ctx, "add",
								     ordered_wave_id, ctx->i32_0,
								     1, true, false);
					LLVMBuildStore(builder, tmp, ret);
				}
				ac_build_endif(&ctx->ac, 12602);

				prev_wave_state = LLVMBuildLoad(builder, ret, "");
				/* Ignore the return value if this is the first wave. */
				prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
								  ctx->i32_0, prev_wave_state, "");
				si_exit_thread0_section(&section, &prev_wave_state);
				prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");

				/* If the strip start appears to be on thread 0 for the current primitive
				 * (meaning the reset index is not present in this wave and might have
				 * appeared in previous waves), use the value from GDS to determine
				 * primitive orientation.
				 *
				 * If the strip start is in this wave for the current primitive, use
				 * the value from the current wave to determine primitive orientation.
				 */
				LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
									     strip_start, ctx->i32_0, "");
				first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
							       first_is_odd, "");
			}
		}
		/* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
		LLVMValueRef prim_is_odd =
			LLVMBuildXor(builder, first_is_odd,
				     LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), "");

		/* Determine the primitive orientation.
		 * Only swap the vertices that are not the provoking vertex. We need to keep
		 * the provoking vertex in place.
		 */
		if (key->opt.cs_provoking_vertex_first) {
			LLVMValueRef index1 = index[1];
			LLVMValueRef index2 = index[2];
			index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, "");
			index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, "");
		} else {
			LLVMValueRef index0 = index[0];
			LLVMValueRef index1 = index[1];
			index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, "");
			index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, "");
		}
	}

	/* Execute the vertex shader for each vertex to get vertex positions. */
	LLVMValueRef pos[3][4];
	for (unsigned i = 0; i < vertices_per_prim; i++) {
		vs_params[param_vertex_id] = index[i];
		vs_params[param_instance_id] = instance_id;

		LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
		for (unsigned chan = 0; chan < 4; chan++)
			pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
	}

	/* Divide XYZ by W. */
	for (unsigned i = 0; i < vertices_per_prim; i++) {
		for (unsigned chan = 0; chan < 3; chan++)
			pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
	}

	/* Load the viewport state. */
	LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
						  LLVMConstInt(ctx->i32, 2, 0));
	vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
	vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
	vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
	vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
	vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

	/* Do culling. */
	struct ac_cull_options options = {};
	options.cull_front = key->opt.cs_cull_front;
	options.cull_back = key->opt.cs_cull_back;
	options.cull_view_xy = true;
	options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
	options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
	options.cull_small_prims = true;
	options.cull_zero_area = true;
	options.cull_w = true;
	options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;

	LLVMValueRef accepted =
		ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
				 vp_scale, vp_translate, smallprim_precision,
				 &options);

	LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

	/* Count the number of active threads by doing bitcount(accepted). */
	LLVMValueRef num_prims_accepted =
		ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64,
				   &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
	num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, "");

	LLVMValueRef start;

	/* Execute atomic_add on the vertex count. */
	struct si_thread0_section section;
	si_enter_thread0_section(ctx, &section, thread_id);
	{
		if (VERTEX_COUNTER_GDS_MODE == 0) {
			LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
						LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
			vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
			start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
						   vertex_counter, num_indices,
						   LLVMAtomicOrderingMonotonic, false);
		} else if (VERTEX_COUNTER_GDS_MODE == 1) {
			LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
						LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
			vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
							   LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
			start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
						   vertex_counter, num_indices,
						   LLVMAtomicOrderingMonotonic, false);
		} else if (VERTEX_COUNTER_GDS_MODE == 2) {
			LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

			/* If the draw call was split into multiple subdraws, each using
			 * a separate draw packet, we need to start counting from 0 for
			 * the first compute wave of the subdraw.
			 *
			 * vertex_counter contains the primitive ID of the first thread
			 * in the first wave.
			 *
			 * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
			 */
			LLVMValueRef is_first_wave =
				LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
					      vertex_counter, "");

			/* Store the primitive count for ordered append, not vertex count.
			 * The idea is to avoid GDS initialization via CP DMA. The shader
			 * effectively stores the first count using "swap".
			 *
			 * if (first_wave) {
			 *    ds.ordered.swap(num_prims_accepted); // store the first primitive count
			 *    previous = 0;
			 * } else {
			 *    previous = ds.ordered.add(num_prims_accepted) // add the primitive count
			 * }
			 */
			ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
			{
				/* The GDS address is always 0 with ordered append. */
				si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
						       num_prims_accepted, 0, true, true);
				LLVMBuildStore(builder, ctx->i32_0, tmp_store);
			}
			ac_build_else(&ctx->ac, 12605);
			{
				LLVMBuildStore(builder,
					       si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
								      num_prims_accepted, 0,
								      true, true),
					       tmp_store);
			}
			ac_build_endif(&ctx->ac, 12604);

			start = LLVMBuildLoad(builder, tmp_store, "");
		}
	}
	si_exit_thread0_section(&section, &start);

	/* Write the final vertex count to memory. An EOS/EOP event could do this,
	 * but those events are super slow and should be avoided if performance
	 * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
	 * event like this.
	 */
	if (VERTEX_COUNTER_GDS_MODE == 2) {
		ac_build_ifcc(&ctx->ac,
			      LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
					    last_wave_prim_id, ""), 12606);
		LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
		count = LLVMBuildMul(builder, count,
				     LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");

		/* GFX8 needs to disable caching, so that the CP can see the stored value.
		 * MTYPE=3 bypasses TC L2.
		 */
		if (ctx->screen->info.chip_class <= GFX8) {
			LLVMValueRef desc[] = {
				vertex_count_addr,
				LLVMConstInt(ctx->i32,
					S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
				LLVMConstInt(ctx->i32, 4, 0),
				LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
						       S_008F0C_MTYPE(3 /* uncached */), 0),
			};
			LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
			ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
						    ctx->i32_0, 0, ac_glc | ac_slc, false);
		} else {
			LLVMBuildStore(builder, count,
				       si_expand_32bit_pointer(ctx, vertex_count_addr));
		}
		ac_build_endif(&ctx->ac, 12606);
	} else {
		/* For unordered modes that increment a vertex count instead of
		 * primitive count, convert it into the primitive index.
		 */
		start = LLVMBuildUDiv(builder, start,
				      LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
	}

	/* Now we need to store the indices of accepted primitives into
	 * the output index buffer.
	 */
	ac_build_ifcc(&ctx->ac, accepted, 16607);
	{
		/* Get the number of bits set before the index of this thread. */
		LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);

		/* We have lowered instancing. Pack the instance ID into vertex ID. */
		if (key->opt.cs_instancing) {
			instance_id = LLVMBuildShl(builder, instance_id,
						   LLVMConstInt(ctx->i32, 16, 0), "");

			for (unsigned i = 0; i < vertices_per_prim; i++)
				index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
		}

		if (VERTEX_COUNTER_GDS_MODE == 2) {
			/* vertex_counter contains the first primitive ID
			 * for this dispatch. If the draw call was split into
			 * multiple subdraws, the first primitive ID is > 0
			 * for subsequent subdraws. Each subdraw uses a different
			 * portion of the output index buffer. Offset the store
			 * vindex by the first primitive ID to get the correct
			 * store address for the subdraw.
			 */
			start = LLVMBuildAdd(builder, start, vertex_counter, "");
		}

		/* Write indices for accepted primitives. */
		LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
		LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);

		if (!ac_has_vec3_support(ctx->ac.chip_class, true))
			vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);

		ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
					     vindex, ctx->i32_0, 3,
					     ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
	}
	ac_build_endif(&ctx->ac, 16607);

	LLVMBuildRetVoid(builder);
}

/**
 * Select the primitive discard compute shader variant of the bound vertex
 * shader that matches the current draw and rasterizer state.
 *
 * The shader key starts from what si_shader_selector_key_vs fills in and is
 * then extended with the draw parameters (primitive type, indexing,
 * instancing, primitive restart) and the queued rasterizer state (provoking
 * vertex, face culling, depth clamp, clip space).
 *
 * \param sctx               context holding the bound shaders and states
 * \param info               draw parameters used to fill the key
 * \param primitive_restart  whether the variant has to handle primitive restart
 * \return false if no variant could be selected (e.g. the shader isn't ready)
 *         or if the selected variant uses the scratch buffer; true otherwise
 */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
					     const struct pipe_draw_info *info,
					     bool primitive_restart)
{
	struct si_state_rasterizer *rast = sctx->queued.named.rasterizer;
	struct si_shader_key key;

	/* Primitive restart needs ordered counters and is only expected
	 * for non-instanced draws.
	 */
	assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
	assert(!primitive_restart || info->instance_count == 1);

	/* Start from a zeroed key and let the VS code fill in its part. */
	memset(&key, 0, sizeof(key));
	si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
	assert(!key.part.vs.prolog.instance_divisor_is_fetched);
	key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;

	/* Key bits that follow directly from the draw. */
	key.opt.vs_as_prim_discard_cs = 1;
	key.opt.cs_prim_type = info->mode;
	key.opt.cs_indexed = info->index_size != 0;
	key.opt.cs_instancing = info->instance_count > 1;
	key.opt.cs_primitive_restart = primitive_restart;
	key.opt.cs_provoking_vertex_first = rast->provoking_vertex_first;

	if (primitive_restart) {
		/* Triangle strips with primitive restart must keep the
		 * primitive orientation whenever front and back faces are
		 * treated differently.
		 */
		struct si_shader_selector *ps = sctx->ps_shader.cso;

		key.opt.cs_need_correct_orientation =
			rast->cull_front != rast->cull_back ||
			ps->info.uses_frontface ||
			(rast->two_side && ps->info.colors_read);
	}

	if (rast->rasterizer_discard) {
		/* Cull everything. This is only meant for performance testing
		 * and analysis of trivial bottlenecks; it should result in
		 * a very short compute shader.
		 */
		key.opt.cs_cull_front = 1;
		key.opt.cs_cull_back = 1;
	} else if (sctx->viewports.y_inverted) {
		/* An inverted Y axis swaps the meaning of front and back. */
		key.opt.cs_cull_front = rast->cull_back;
		key.opt.cs_cull_back = rast->cull_front;
	} else {
		key.opt.cs_cull_front = rast->cull_front;
		key.opt.cs_cull_back = rast->cull_back;
	}

	/* Z culling is only enabled when depth clamping is off. */
	if (CULL_Z && !rast->depth_clamp_any) {
		key.opt.cs_cull_z = 1;
		key.opt.cs_halfz_clip_space = rast->clip_halfz;
	}

	/* The compute shader state shares the selector with the bound VS.
	 * The current variant is reset before every selection.
	 */
	sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
	sctx->cs_prim_discard_state.current = NULL;

	struct si_compiler_ctx_state compiler_state;
	compiler_state.compiler = &sctx->compiler;
	compiler_state.debug = sctx->debug;
	compiler_state.is_debug_context = sctx->is_debug;

	if (si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
				      &compiler_state, &key, -1, true) != 0)
		return false;

	/* Disallow compute shaders using the scratch buffer. */
	return sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}

/**
 * Create the objects used by the primitive discard path on first use:
 * the GDS and OA buffers (depending on VERTEX_COUNTER_GDS_MODE), the parallel
 * compute IB and the index ring.
 *
 * \param sctx  context that owns the created objects
 * \return true if everything exists after the call, false if any creation
 *         failed
 */
static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
	/* The index ring is created last, so nothing is left to do once
	 * it exists.
	 */
	if (sctx->index_ring)
		return true;

	if (!sctx->prim_discard_compute_cs) {
		struct radeon_winsys *ws = sctx->ws;
		unsigned gds_size = 0;
		unsigned num_oa_counters = 0;

		if (VERTEX_COUNTER_GDS_MODE == 1) {
			gds_size = GDS_SIZE_UNORDERED;
		} else if (VERTEX_COUNTER_GDS_MODE == 2) {
			gds_size = 8;
			num_oa_counters = 2;
		}

		/* NOTE(review): if a later step in this block fails, the next
		 * call gets here again and overwrites sctx->gds (and possibly
		 * sctx->gds_oa) without releasing the previous buffer - looks
		 * like a potential leak on the retry path; confirm.
		 */
		if (gds_size != 0) {
			sctx->gds = ws->buffer_create(ws, gds_size, 4,
						      RADEON_DOMAIN_GDS, 0);
			if (!sctx->gds)
				return false;

			ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
					  RADEON_USAGE_READWRITE, 0, 0);
		}

		if (num_oa_counters != 0) {
			/* OA counters are only used together with GDS. */
			assert(gds_size);

			sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
							 1, RADEON_DOMAIN_OA, 0);
			if (!sctx->gds_oa)
				return false;

			ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
					  RADEON_USAGE_READWRITE, 0, 0);
		}

		sctx->prim_discard_compute_cs =
			ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
						       num_oa_counters > 0);
		if (!sctx->prim_discard_compute_cs)
			return false;
	}

	/* The ring is allocated with twice the per-IB size. */
	sctx->index_ring =
		si_aligned_buffer_create(sctx->b.screen,
					 SI_RESOURCE_FLAG_UNMAPPABLE,
					 PIPE_USAGE_DEFAULT,
					 sctx->index_ring_size_per_ib * 2,
					 sctx->screen->info.pte_fragment_size);

	return sctx->index_ring != NULL;
}

/**
 * Check whether an output index buffer of the given size still fits into
 * the part of the index ring that the current IB may use.
 *
 * \param sctx               context tracking the index ring usage
 * \param out_indexbuf_size  requested size; it is rounded up to the TCC cache
 *                           line size before the comparison
 * \return true if the aligned size fits behind the current ring offset
 */
static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
	const unsigned cache_line_size = sctx->screen->info.tcc_cache_line_size;

	return sctx->index_ring_size_per_ib >=
	       sctx->index_ring_offset + align(out_indexbuf_size, cache_line_size);
}

enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
				      const struct pipe_draw_info *info,
				      bool primitive_restart)
{
	/* If the compute shader compilation isn't finished, this returns false. */
	if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
		return SI_PRIM_DISCARD_DISABLED;

	if (!si_initialize_prim_discard_cmdbuf(sctx))
		return SI_PRIM_DISCARD_DISABLED;

	struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
	unsigned prim = info->mode;
	unsigned count = info->count;
	unsigned instance_count = info->instance_count;
	unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
	unsigned num_prims = num_prims_per_instance * instance_count;
	unsigned out_indexbuf_size = num_prims * 12;
	bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
	const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;

	/* Split draws at the draw call level if the ring is full. This makes
	 * better use of the ring space.
	 */
	if (ring_full &&
	    num_prims > split_prims_draw_level &&
	    instance_count == 1 && /* TODO: support splitting instanced draws */
	    (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
			   (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
		/* Split draws. */
		struct pipe_draw_info split_draw = *info;
		split_draw.primitive_restart = primitive_restart;

		unsigned base_start = split_draw.start;

		if (prim == PIPE_PRIM_TRIANGLES) {
			unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
			assert(vert_count_per_subdraw < count);

			for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
				split_draw.start = base_start + start;
				split_draw.count = MIN2(count - start, vert_count_per_subdraw);

				sctx->b.draw_vbo(&sctx->b, &split_draw);
			}
		} else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
			/* No primitive pair can be split, because strips reverse orientation
			 * for odd primitives. */
			STATIC_ASSERT(split_prims_draw_level % 2 == 0);

			unsigned vert_count_per_subdraw = split_prims_draw_level;

			for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
				split_draw.start = base_start + start;
				split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);

				sctx->b.draw_vbo(&sctx->b, &split_draw);

				if (start == 0 &&
				    primitive_restart &&
				    sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
					sctx->preserve_prim_restart_gds_at_flush = true;
			}
			sctx->preserve_prim_restart_gds_at_flush = false;
		} else {
			assert(0);
		}

		return SI_PRIM_DISCARD_DRAW_SPLIT;
	}

	/* Just quit if the draw call doesn't fit into the ring and can't be split. */
	if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
		if (SI_PRIM_DISCARD_DEBUG)
			puts("PD failed: draw call too big, can't be split");
		return SI_PRIM_DISCARD_DISABLED;
	}

	unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
	unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
				   24 * (num_subdraws - 1) + /* subdraws */
				   20; /* leave some space at the end */
	unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);

	if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
		need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
	else
		need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */

	if (ring_full ||
	    (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
	    !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
		/* If the current IB is empty but the size is too small, add a NOP
		 * packet to force a flush and get a bigger IB.
		 */
		if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
		    gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
			radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
			radeon_emit(gfx_cs, 0);
		}

		si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
	}

	/* The compute IB is always chained, but we need to call cs_check_space to add more space. */
	struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
	ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
	assert(compute_has_space);
	assert(si_check_ring_space(sctx, out_indexbuf_size));
	return SI_PRIM_DISCARD_ENABLED;
}

/**
 * Close the current batch of primitive discard dispatches.
 *
 * If primitives are queued in the batch, a CS_DONE release_mem event is
 * emitted into the compute IB with REWIND_SIGNAL_BIT as its 32-bit value and
 * the address recorded in compute_rewind_va as its destination. The batch
 * bookkeeping is then reset. Nothing happens if the batch is empty.
 *
 * \param sctx  context
 */
void si_compute_signal_gfx(struct si_context *sctx)
{
	if (sctx->compute_num_prims_in_batch == 0)
		return;

	assert(sctx->compute_rewind_va);

	/* The writeback L2 flags vary with each chip generation.
	 * CI needs to flush vertex indices to memory.
	 */
	unsigned l2_writeback = 0;

	if (sctx->chip_class <= GFX7) {
		l2_writeback = EVENT_TC_WB_ACTION_ENA;
	} else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) {
		l2_writeback = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
	}

	const unsigned dst_sel = sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM :
							    EOP_DST_SEL_TC_L2;
	const unsigned int_sel = l2_writeback ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
						EOP_INT_SEL_NONE;
	/* compute_rewind_va supplies the low bits; address32_hi is placed
	 * in the upper 32 bits.
	 */
	const uint64_t signal_va = sctx->compute_rewind_va |
				   ((uint64_t)sctx->screen->info.address32_hi << 32);

	/* After the queued dispatches are done and vertex counts are written to
	 * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
	 * the dispatches to finish, it only adds the CS_DONE event into the event
	 * queue.
	 */
	si_cp_release_mem(sctx, sctx->prim_discard_compute_cs,
			  V_028A90_CS_DONE, l2_writeback,
			  dst_sel, int_sel,
			  EOP_DATA_SEL_VALUE_32BIT,
			  NULL,
			  signal_va,
			  REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
			  SI_NOT_QUERY);

	/* Start a new, empty batch. */
	sctx->compute_rewind_va = 0;
	sctx->compute_num_prims_in_batch = 0;
}

/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
					  const struct pipe_draw_info *info,
					  unsigned index_size,
					  unsigned base_vertex,
					  uint64_t input_indexbuf_va,
					  unsigned input_indexbuf_num_elements)
{
	struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
	struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
	unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
	if (!num_prims_per_instance)
		return;

	unsigned num_prims = num_prims_per_instance * info->instance_count;
	unsigned vertices_per_prim, output_indexbuf_format;

	switch (info->mode) {
	case PIPE_PRIM_TRIANGLES:
	case PIPE_PRIM_TRIANGLE_STRIP:
	case PIPE_PRIM_TRIANGLE_FAN:
		vertices_per_prim = 3;
		output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
		break;
	default:
		unreachable("unsupported primitive type");
		return;
	}

	unsigned out_indexbuf_offset;
	uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
	bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;

	/* Initialize the compute IB if it's empty. */
	if (!sctx->prim_discard_compute_ib_initialized) {
		/* 1) State initialization. */
		sctx->compute_gds_offset = 0;
		sctx->compute_ib_last_shader = NULL;

		if (sctx->last_ib_barrier_fence) {
			assert(!sctx->last_ib_barrier_buf);
			sctx->ws->cs_add_fence_dependency(gfx_cs,
							  sctx->last_ib_barrier_fence,
							  RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
		}

		/* 2) IB initialization. */

		/* This needs to be done at the beginning of IBs due to possible
		 * TTM buffer moves in the kernel.
		 *
		 * TODO: update for GFX10
		 */
		si_emit_surface_sync(sctx, cs,
				     S_0085F0_TC_ACTION_ENA(1) |
				     S_0085F0_TCL1_ACTION_ENA(1) |
				     S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
				     S_0085F0_SH_ICACHE_ACTION_ENA(1) |
				     S_0085F0_SH_KCACHE_ACTION_ENA(1));

		/* Restore the GDS prim restart counter if needed. */
		if (sctx->preserve_prim_restart_gds_at_flush) {
			si_cp_copy_data(sctx, cs,
					COPY_DATA_GDS, NULL, 4,
					COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
		}

		si_emit_initial_compute_regs(sctx, cs);

		radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
				  S_00B860_WAVES(sctx->scratch_waves) |
				  S_00B860_WAVESIZE(0)); /* no scratch */

		/* Only 1D grids are launched. */
		radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
		radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
				S_00B820_NUM_THREAD_PARTIAL(1));
		radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
				S_00B824_NUM_THREAD_PARTIAL(1));

		radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
		radeon_emit(cs, 0);
		radeon_emit(cs, 0);

		/* Disable ordered alloc for OA resources. */
		for (unsigned i = 0; i < 2; i++) {
			radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
			radeon_emit(cs, S_031074_INDEX(i));
			radeon_emit(cs, 0);
			radeon_emit(cs, S_03107C_ENABLE(0));
		}

		if (sctx->last_ib_barrier_buf) {
			assert(!sctx->last_ib_barrier_fence);
			radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
						  RADEON_USAGE_READ, RADEON_PRIO_FENCE);
			si_cp_wait_mem(sctx, cs,
				       sctx->last_ib_barrier_buf->gpu_address +
				       sctx->last_ib_barrier_buf_offset, 1, 1,
				       WAIT_REG_MEM_EQUAL);
		}

		sctx->prim_discard_compute_ib_initialized = true;
	}

	/* Allocate the output index buffer. */
	output_indexbuf_size = align(output_indexbuf_size,
				     sctx->screen->info.tcc_cache_line_size);
	assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
	out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
	sctx->index_ring_offset += output_indexbuf_size;

	radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
				  RADEON_PRIO_SHADER_RW_BUFFER);
	uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;

	/* Prepare index buffer descriptors. */
	struct si_resource *indexbuf_desc = NULL;
	unsigned indexbuf_desc_offset;
	unsigned desc_size = 12 * 4;
	uint32_t *desc;

	u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
		       si_optimal_tcc_alignment(sctx, desc_size),
		       &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
		       (void**)&desc);
	radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
				  RADEON_PRIO_DESCRIPTORS);

	/* Input index buffer. */
	desc[0] = input_indexbuf_va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
		  S_008F04_STRIDE(index_size);
	desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
		  S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
				       index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
							 V_008F0C_BUF_DATA_FORMAT_32);

	/* Output index buffer. */
	desc[4] = out_indexbuf_va;
	desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
		  S_008F04_STRIDE(vertices_per_prim * 4);
	desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
	desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
		  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
		  S_008F0C_DATA_FORMAT(output_indexbuf_format);

	/* Viewport state.
	 * This is needed by the small primitive culling, because it's done
	 * in screen space.
	 */
	float scale[2], translate[2];

	scale[0] = sctx->viewports.states[0].scale[0];
	scale[1] = sctx->viewports.states[0].scale[1];
	translate[0] = sctx->viewports.states[0].translate[0];
	translate[1] = sctx->viewports.states[0].translate[1];

	/* The viewport shouldn't flip the X axis for the small prim culling to work. */
	assert(-scale[0] + translate[0] <= scale[0] + translate[0]);

	/* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
	 * This is because the viewport transformation inverts the clip space
	 * bounding box, so min becomes max, which breaks small primitive
	 * culling.
	 */
	if (sctx->viewports.y_inverted) {
		scale[1] = -scale[1];
		translate[1] = -translate[1];
	}

	/* Scale the framebuffer up, so that samples become pixels and small
	 * primitive culling is the same for all sample counts.
	 * This only works with the standard DX sample positions, because
	 * the samples are evenly spaced on both X and Y axes.
	 */
	unsigned num_samples = sctx->framebuffer.nr_samples;
	assert(num_samples >= 1);

	for (unsigned i = 0; i < 2; i++) {
		scale[i] *= num_samples;
		translate[i] *= num_samples;
	}

	desc[8] = fui(scale[0]);
	desc[9] = fui(scale[1]);
	desc[10] = fui(translate[0]);
	desc[11] = fui(translate[1]);

	/* Better subpixel precision increases the efficiency of small
	 * primitive culling. */
	unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
	float small_prim_cull_precision;

	if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
		small_prim_cull_precision = num_samples / 4096.0;
	else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
		small_prim_cull_precision = num_samples / 1024.0;
	else
		small_prim_cull_precision = num_samples / 256.0;

	/* Set user data SGPRs. */
	/* This can't be greater than 14 if we want the fastest launch rate. */
	unsigned user_sgprs = 13;

	uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
	unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
	unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
	uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
	uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
	uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
				      sctx->vb_descriptors_buffer->gpu_address +
				      sctx->vb_descriptors_offset : 0;
	unsigned gds_offset, gds_size;
	struct si_fast_udiv_info32 num_prims_udiv = {};

	if (info->instance_count > 1)
		num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);

	/* Limitations on how these two are packed in the user SGPR. */
	assert(num_prims_udiv.post_shift < 32);
	assert(num_prims_per_instance < 1 << 27);

	si_resource_reference(&indexbuf_desc, NULL);

	bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;

	if (VERTEX_COUNTER_GDS_MODE == 1) {
		gds_offset = sctx->compute_gds_offset;
		gds_size = primitive_restart ? 8 : 4;
		sctx->compute_gds_offset += gds_size;

		/* Reset the counters in GDS for the first dispatch using WRITE_DATA.
		 * The remainder of the GDS will be cleared after the dispatch packet
		 * in parallel with compute shaders.
		 */
		if (first_dispatch) {
			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
			radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
			radeon_emit(cs, gds_offset);
			radeon_emit(cs, 0);
			radeon_emit(cs, 0); /* value to write */
			if (gds_size == 8)
				radeon_emit(cs, 0);
		}
	}

	/* Set shader registers. */
	struct si_shader *shader = sctx->cs_prim_discard_state.current;

	if (shader != sctx->compute_ib_last_shader) {
		radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
					  RADEON_PRIO_SHADER_BINARY);
		uint64_t shader_va = shader->bo->gpu_address;

		assert(shader->config.scratch_bytes_per_wave == 0);
		assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);

		radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
		radeon_emit(cs, shader_va >> 8);
		radeon_emit(cs, S_00B834_DATA(shader_va >> 40));

		radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
		radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
				S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
				S_00B848_FLOAT_MODE(shader->config.float_mode) |
				S_00B848_DX10_CLAMP(1));
		radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
				S_00B84C_USER_SGPR(user_sgprs) |
				S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
				S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
				S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
				S_00B84C_LDS_SIZE(shader->config.lds_size));

		radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
			ac_get_compute_resource_limits(&sctx->screen->info,
						       WAVES_PER_TG,
						       MAX_WAVES_PER_SH,
						       THREADGROUPS_PER_CU));
		sctx->compute_ib_last_shader = shader;
	}

	STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);

	/* Big draw calls are split into smaller dispatches and draw packets. */
	for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
		unsigned num_subdraw_prims;

		if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
			num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
		else
			num_subdraw_prims = num_prims - start_prim;

		/* Small dispatches are executed back to back until a specific primitive
		 * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
		 * to start drawing the batch. This batching adds latency to the gfx IB,
		 * but CS_DONE and REWIND are too slow.
		 */
		if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
			si_compute_signal_gfx(sctx);

		if (sctx->compute_num_prims_in_batch == 0) {
			assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
			sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;

			if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
				radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
				radeon_emit(gfx_cs, 0);

				si_cp_wait_mem(sctx, gfx_cs,
					       sctx->compute_rewind_va |
					       (uint64_t)sctx->screen->info.address32_hi << 32,
					       REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
					       WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);

				/* Use INDIRECT_BUFFER to chain to a different buffer
				 * to discard the CP prefetch cache.
				 */
				sctx->ws->cs_check_space(gfx_cs, 0, true);
			} else {
				radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
				radeon_emit(gfx_cs, 0);
			}
		}

		sctx->compute_num_prims_in_batch += num_subdraw_prims;

		uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
		uint64_t index_va = out_indexbuf_va + start_prim * 12;

		/* Emit the draw packet into the gfx IB. */
		radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
		radeon_emit(gfx_cs, num_prims * vertices_per_prim);
		radeon_emit(gfx_cs, index_va);
		radeon_emit(gfx_cs, index_va >> 32);
		radeon_emit(gfx_cs, 0);
		radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);

		/* Continue with the compute IB. */
		if (start_prim == 0) {
			uint32_t gds_prim_restart_continue_bit = 0;

			if (sctx->preserve_prim_restart_gds_at_flush) {
				assert(primitive_restart &&
				       info->mode == PIPE_PRIM_TRIANGLE_STRIP);
				assert(start_prim < 1 << 31);
				gds_prim_restart_continue_bit = 1 << 31;
			}

			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
			radeon_emit(cs, index_buffers_va);
			radeon_emit(cs,
				    VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
				    VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
								   start_prim |
								   gds_prim_restart_continue_bit);
			radeon_emit(cs, start_prim + num_subdraw_prims - 1);
			radeon_emit(cs, count_va);
			radeon_emit(cs, vb_desc_va);
			radeon_emit(cs, vs_const_desc_va);
			radeon_emit(cs, vs_sampler_desc_va);
			radeon_emit(cs, base_vertex);
			radeon_emit(cs, info->start_instance);
			radeon_emit(cs, num_prims_udiv.multiplier);
			radeon_emit(cs, num_prims_udiv.post_shift |
					(num_prims_per_instance << 5));
			radeon_emit(cs, info->restart_index);
			/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
			radeon_emit(cs, fui(small_prim_cull_precision));
		} else {
			assert(VERTEX_COUNTER_GDS_MODE == 2);
			/* Only update the SGPRs that changed. */
			radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
			radeon_emit(cs, start_prim);
			radeon_emit(cs, start_prim + num_subdraw_prims - 1);
			radeon_emit(cs, count_va);
		}

		/* Set grid dimensions. */
		unsigned start_block = start_prim / THREADGROUP_SIZE;
		unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
		unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

		radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
		radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
				  S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
				  S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));

		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
		radeon_emit(cs, 1);
		radeon_emit(cs, 1);
		radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
				S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
				S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
				S_00B800_ORDER_MODE(0 /* launch in order */));

		/* This is only for unordered append. Ordered append writes this from
		 * the shader.
		 *
		 * Note that EOP and EOS events are super slow, so emulating the event
		 * in a shader is an important optimization.
		 */
		if (VERTEX_COUNTER_GDS_MODE == 1) {
			si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
					  sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
					  EOP_INT_SEL_NONE,
					  EOP_DATA_SEL_GDS,
					  NULL,
					  count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
					  EOP_DATA_GDS(gds_offset / 4, 1),
					  SI_NOT_QUERY);

			/* Now that compute shaders are running, clear the remainder of GDS. */
			if (first_dispatch) {
				unsigned offset = gds_offset + gds_size;
				si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
						       GDS_SIZE_UNORDERED - offset,
						       0,
						       SI_CPDMA_SKIP_CHECK_CS_SPACE |
						       SI_CPDMA_SKIP_GFX_SYNC |
						       SI_CPDMA_SKIP_SYNC_BEFORE,
						       SI_COHERENCY_NONE, L2_BYPASS);
			}
		}
		first_dispatch = false;

		assert(cs->current.cdw <= cs->current.max_dw);
		assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
	}
}